Ejemplo n.º 1
0
        def tweet_download_and_lang_detect(df_tweets, user_id,
                                           update_to_invalid_list):
            """
            Download one page of a user's timeline, run language detection on it
            and decide whether the surrounding loop should keep loading tweets.

            The freshly downloaded tweets either become the working DataFrame
            (first run) or are appended to the DataFrame collected so far
            (later runs). The length of the result decides what happens next:

            * 50 or fewer rows: the user is added to ``update_to_invalid_list``
              and the loop is aborted (too few usable tweets).
            * 200 or more rows: the loop is aborted (enough tweets collected).
            * anything in between: the loop continues to load more tweets.

            :param df_tweets: an int (0) during the first run, the DataFrame with the
                              tweets collected so far during later runs
            :param user_id: Twitter user id used for the tweet download
            :param update_to_invalid_list: list of users that can not be used;
                                           ``user_id`` is appended when applicable
            :return: df_tweets, update_to_invalid_list, abort_loop, len_df
            """
            # The download and the language detection are identical for the first
            # and for later runs, so they are done once up front.
            # NOTE(review): every call requests pages=1 with the same arguments;
            # confirm that API_tweet_multitool really returns *new* tweets on
            # repeated calls, otherwise later runs append duplicates.
            new_tweets = TwitterAPI.API_tweet_multitool(
                user_id,
                'temp',
                pages=1,
                method='user_timeline',
                append=False,
                write_to_db=False)  # one page (200 tweets per the original note)
            new_tweets = helper_functions.lang_detect(new_tweets)

            if isinstance(df_tweets, int):
                # First run: nothing collected yet, take the result as is
                # (it may be a non-DataFrame value if lang_detect returned one).
                df_tweets = new_tweets
            elif isinstance(new_tweets, pd.DataFrame):
                # Later run: append and renumber the rows 0..n-1.
                # ignore_index avoids the reset_index()/del 'index' round trip,
                # which fails for a named index or an existing 'index' column.
                df_tweets = pd.concat([df_tweets, new_tweets],
                                      ignore_index=True)

            # NOTE(review): df_tweets may not be a DataFrame here (e.g. when the
            # download or the language detection failed); dataframe_length is
            # presumably expected to cope with that - confirm.
            len_df = helper_functions.dataframe_length(df_tweets)
            if len_df <= 50:
                # Almost no usable tweets: do not try to load more for this user,
                # it would take too many page loads.
                update_to_invalid_list.append(user_id)
                abort_loop = True
            elif len_df >= 200:
                # Enough tweets collected.
                abort_loop = True
            else:
                # Too few tweets so far: load more to get a better result.
                abort_loop = False
            gc.collect()
            return df_tweets, update_to_invalid_list, abort_loop, len_df
Ejemplo n.º 2
0
def download_user_timelines(political_moderate_list: list,
                            right_wing_populists_list: list):
    """
    Download the user timelines of all accounts in the two given lists.

    Every account is requested through TwitterAPI.API_tweet_multitool with
    10 pages, append=True and write_to_db=True. The name of the list the
    account belongs to is passed along as the second argument. According to
    the original author the downloads serve as training material for the AI
    model.

    :param political_moderate_list: user names of politically moderate accounts
    :param right_wing_populists_list: user names of right-wing populist accounts
    :return: None
    """

    # Example inputs (incomplete, for illustration only):
    #
    # political_moderate_list = [
    #     '_pik_dame_', 'Leekleinkunst', 'MartinaKraus7', 'KAFVKA', 'Volksverpetzer', 'insideX',
    #     'FranziLucke', 'leonie_stella9', 'Ute631', 'justMPO', 'anouk_avf', 'Komisaar',
    #     'MenschBernd', 'von_ems', 'lies_das', 'seewanda', 'Rene_Bacher', 'Prasanita93',
    #     'IgorUllrich', 'AJSalzgitter', 'Bussi72', 'HuWutze', 'strahlgewitter', 'PhilKupi',
    #     'BaldusEla', 'LarsKlingenberg', 'MichaelSchfer71', 'EddyAges', 'veripot', 'JoernPL',
    #     'ondreka', 'kleinerJedi', 'DanielKinski', 'wfh7175', 'Sister_records1', 'TinaJergerkamp']
    #
    # right_wing_populists_list = [
    #     'Junge_Freiheit', 'zaferflocken', 'HelmutWachler', 'M_Briefing', 'TinVonWo', 'mcwd12',
    #     'EBlume3', 'h_hendrich']
    #
    # The political / unpolitical stance is currently not used. The lists
    # below are kept as examples; a subset of them might be used for a model.
    #
    # unpolitical_list = [
    #     'Podolski10', 'fckoeln', 'FCBayern', 'BVB', 'rtl2', 'DMAX_TV', 'tim_kocht', 'grandcheflafer',
    #     'bildderfrau', 'gala', 'BUNTE', 'promiflash', 'funny_catvideos', 'BibisBeauty', 'dagibee',
    #     'siggismallz', 'Gronkh', 'CHIP_online', 'COMPUTERWOCHE', 'SkySportNewsHD', 'MOpdenhoevel',
    #     'kayefofficial', 'VOGUE_Germany', 'lucycatofficial', 'RealLexyRoxx', 'AnselmSchindler',
    #     'pentru_tine', 'KaJa80028344']
    #
    # unpolitical_list = ['Podolski10']  # for testing
    #
    # political_list = [
    #     'Thomas_Ehrhorn', 'HilseMdb', 'DirkSpaniel', 'MdB_Lucassen', 'RolandTichy', 'UllmannMdB',
    #     'c_jung77', 'michael_g_link', 'theliberalfrank', 'IreneMihalic', 'KaiGehring', 'RenateKuenast',
    #     'GoeringEckardt', 'MdB_Freihold', 'ZaklinNastic', 'PetraPauMaHe', 'lgbeutin', 'arnoklare',
    #     'zierke', 'Timon_Gremmels', 'Johann_Saathoff', 'uhl_markus', 'AnjaKarliczek', 'KLeikert',
    #     'Junge_Gruppe']

    # Pair every list with the label that is handed to the download call.
    # NOTE(review): the label is presumably the target table name - confirm
    # against TwitterAPI.API_tweet_multitool.
    labelled_accounts = (
        ('political_moderate_list', political_moderate_list),
        ('right_wing_populists_list', right_wing_populists_list),
    )

    # Download the timeline of each account, one list after the other.
    for label, accounts in labelled_accounts:
        for account in accounts:
            TwitterAPI.API_tweet_multitool(
                account,
                label,
                pages=10,
                method='user_timeline',
                append=True,
                write_to_db=True)
Ejemplo n.º 3
0
def eval_bert(model_path) -> None:
    """
    Run a BERT evaluation against the accounts stored in table "eval_table"
    and print the results.

    For every distinct username in the table one page of the user timeline is
    downloaded, language detection is run, and the tweets are handed to
    bert_predictions(). Accounts whose download returned an error string or
    whose language check returned False are skipped. At the end one
    comma-separated line per evaluated account is printed.

    :param model_path: passed unchanged to init() to load the model
    :return: None, the results are printed to stdout
    """
    # Only 'screen_name', 'left', 'right' and 'lr_time' are filled below; the
    # remaining columns stay empty and are printed as such.
    df_pred_data = pd.DataFrame([],
                                columns=[
                                    'screen_name', 'pol', 'unpol', 'pol_time',
                                    'left', 'right', 'lr_time'
                                ])
    sql = "select distinct username from eval_table"
    df = db_functions.select_from_db(sql)

    print("Loading BERT")
    model = init(model_path)
    print("Querying BERT")

    for index, row in tqdm(df.iterrows(), total=df.shape[0]):

        # First (and only selected) column of the query result. .iloc makes
        # the positional access explicit; row[0] on a label-indexed Series is
        # deprecated in recent pandas versions.
        screen_name = row.iloc[0]

        df_tweets = TwitterAPI.API_tweet_multitool(
            screen_name,
            'temp',
            pages=1,
            method='user_timeline',
            append=False,
            write_to_db=False)  # keeps the tweets in a DataFrame
        if isinstance(df_tweets, str):
            # A string result carries an error message instead of tweets.
            continue

        start_time = time.time()
        german_language = helper_functions.lang_detect(df_tweets)
        runtime = time.time() - start_time
        print(f"Runtime Lang Detect: {runtime}")
        # NOTE(review): only an explicit False skips the account, and the
        # prediction below uses df_tweets, not the value returned by
        # lang_detect - confirm that this is intended.
        if german_language is False:
            continue

        start_time = time.time()
        result = bert_predictions(df_tweets['tweet'], model)
        runtime = time.time() - start_time
        print(f"Runtime Bert: {runtime}")

        df_pred_data.at[index, 'screen_name'] = screen_name
        try:
            df_pred_data.at[index, 'left'] = result[0]
            df_pred_data.at[index, 'right'] = result[1]
        except Exception as err:
            # Best effort: an unusable prediction is recorded as 0/0. Catching
            # Exception (not a bare except) keeps Ctrl-C / SystemExit working
            # inside this long-running loop.
            print(f"Could not read prediction for {screen_name}: {err}")
            df_pred_data.at[index, 'left'] = 0
            df_pred_data.at[index, 'right'] = 0
        df_pred_data.at[index, 'lr_time'] = runtime

    print("screen_name,Pol,Unpol,Pol_Time,Left,Right,LR_Time")
    for _, result_row in df_pred_data.iterrows():
        print(
            f"{result_row['screen_name']},{result_row['pol']},{result_row['unpol']},{result_row['pol_time']},{result_row['left']},"
            f"{result_row['right']},{int(result_row['lr_time'])}")