def tweet_download_and_lang_detect(df_tweets, user_id, update_to_invalid_list):
    """
    Download one timeline page, run language detection on it and decide whether to keep loading.

    A single page of the user's timeline is downloaded and passed through
    ``helper_functions.lang_detect``. On the first run (``df_tweets`` is an int placeholder) the
    result of the language detection becomes the new ``df_tweets``; on later runs it is appended
    to the tweets collected so far, provided it is a DataFrame.

    The length reported by ``helper_functions.dataframe_length`` then decides how the caller's
    loop continues:

    * 50 or fewer: almost no German tweets were found. The user is appended to
      ``update_to_invalid_list`` and the loop is aborted, because collecting enough tweets would
      take too many page loads.
    * 200 or more: enough tweets were found, the loop is aborted.
    * anything in between: the loop is not aborted, so the caller can load more tweets.

    :param df_tweets: 0 during first run, dataframe with tweets during later runs
    :param user_id: Twitter User_ID for tweet download and language check
    :param update_to_invalid_list: List of users that can not be downloaded from.
                                   Appended to in place if applicable.
    :return: df_tweets, update_to_invalid_list, abort_loop, len_df
    """
    # One page of the user's timeline (per the original author's note: 200 tweets per page).
    # The first and all later runs perform exactly the same download + language filtering.
    df_page = TwitterAPI.API_tweet_multitool(
        user_id, 'temp', pages=1, method='user_timeline', append=False, write_to_db=False)
    df_page = helper_functions.lang_detect(df_page)

    if isinstance(df_tweets, int):
        # First run: nothing has been collected yet, the detection result is taken as is.
        df_tweets = df_page
    elif isinstance(df_page, pd.DataFrame):
        # Later runs: append the new page. ignore_index=True yields a fresh 0..n-1 index
        # directly, without the temporary 'index' column that reset_index() would insert
        # (and which fails if such a column already exists).
        df_tweets = pd.concat([df_tweets, df_page], ignore_index=True)

    # NOTE(review): after a first run df_tweets is whatever lang_detect returned, which is not
    # necessarily a DataFrame (e.g. no tweets found or all tweets removed as non-German).
    # dataframe_length is presumably able to handle that - confirm.
    len_df = helper_functions.dataframe_length(df_tweets)

    if len_df <= 50:
        # If almost no tweets are German don't try to get more German tweets from this user,
        # it would take too many page loads.
        update_to_invalid_list.append(user_id)
        abort_loop = True
    elif len_df >= 200:
        abort_loop = True
    else:
        # Too few German tweets so far: load more tweets to get a better result.
        abort_loop = False

    gc.collect()
    return df_tweets, update_to_invalid_list, abort_loop, len_df
def download_user_timelines(political_moderate_list: list, right_wing_populists_list: list):
    """
    Request the user timelines of all accounts contained in the two given lists.

    For every account ``TwitterAPI.API_tweet_multitool`` is called with 10 pages,
    ``append=True`` and ``write_to_db=True``; the name of the list the account came from is
    passed along as second argument. The downloads are used as training material for AI
    training. The example lists in the comments below are incomplete examples only.

    :param political_moderate_list: account names passed under the label 'political_moderate_list'
    :param right_wing_populists_list: account names passed under the label 'right_wing_populists_list'
    :return: nothing is returned
    """
    # List examples
    # political_moderate_list = ['_pik_dame_', 'Leekleinkunst', 'MartinaKraus7', 'KAFVKA', 'Volksverpetzer', 'insideX',
    #                            'FranziLucke', 'leonie_stella9', 'Ute631', 'justMPO', 'anouk_avf', 'Komisaar',
    #                            'MenschBernd', 'von_ems', 'lies_das', 'seewanda', 'Rene_Bacher', 'Prasanita93',
    #                            'IgorUllrich', 'AJSalzgitter', 'Bussi72', 'HuWutze', 'strahlgewitter', 'PhilKupi',
    #                            'BaldusEla', 'LarsKlingenberg', 'MichaelSchfer71', 'EddyAges', 'veripot', 'JoernPL',
    #                            'ondreka', 'kleinerJedi', 'DanielKinski', 'wfh7175', 'Sister_records1', 'TinaJergerkamp']
    # right_wing_populists_list = ['Junge_Freiheit', 'zaferflocken', 'HelmutWachler', 'M_Briefing', 'TinVonWo', 'mcwd12',
    #                              'EBlume3', 'h_hendrich']

    # The political / unpolitical stance is currently not used.
    # Tweets of the accounts below would be downloaded from Twitter; during modelling a subset of
    # these accounts might be used.
    # unpolitical_list = ['Podolski10', 'fckoeln', 'FCBayern', 'BVB', 'rtl2', 'DMAX_TV', 'tim_kocht', 'grandcheflafer',
    #                     'bildderfrau', 'gala', 'BUNTE', 'promiflash', 'funny_catvideos', 'BibisBeauty', 'dagibee',
    #                     'siggismallz', 'Gronkh', 'CHIP_online', 'COMPUTERWOCHE', 'SkySportNewsHD', 'MOpdenhoevel',
    #                     'kayefofficial', 'VOGUE_Germany', 'lucycatofficial', 'RealLexyRoxx', 'AnselmSchindler',
    #                     'pentru_tine', 'KaJa80028344']
    # unpolitical_list = ['Podolski10']  # for testing
    # political_list = ['Thomas_Ehrhorn', 'HilseMdb', 'DirkSpaniel', 'MdB_Lucassen', 'RolandTichy', 'UllmannMdB',
    #                   'c_jung77', 'michael_g_link', 'theliberalfrank', 'IreneMihalic', 'KaiGehring', 'RenateKuenast',
    #                   'GoeringEckardt', 'MdB_Freihold', 'ZaklinNastic', 'PetraPauMaHe', 'lgbeutin', 'arnoklare',
    #                   'zierke', 'Timon_Gremmels', 'Johann_Saathoff', 'uhl_markus', 'AnjaKarliczek', 'KLeikert',
    #                   'Junge_Gruppe']

    # Each label is paired with its accounts; the moderate list is processed first.
    labelled_lists = (
        ('political_moderate_list', political_moderate_list),
        ('right_wing_populists_list', right_wing_populists_list),
    )

    # List download
    for list_label, accounts in labelled_lists:
        for account in accounts:
            TwitterAPI.API_tweet_multitool(
                account, list_label, pages=10, method='user_timeline', append=True, write_to_db=True)
def eval_bert(model_path) -> None:
    """
    Runs evaluation against evaluation accounts in table "eval_table" and prints the results.

    For every distinct username returned by the query one timeline page is downloaded, passed
    through language detection and handed to ``bert_predictions``. The first two entries of the
    prediction result are recorded as 'left' and 'right' together with the prediction runtime.
    Finally all collected rows are printed as comma separated lines. The columns 'pol', 'unpol'
    and 'pol_time' are part of the printout but are never filled by this function.

    :param model_path: path that is handed to ``init`` to load the BERT model
    :return: none
    """
    df_pred_data = pd.DataFrame([], columns=[
        'screen_name', 'pol', 'unpol', 'pol_time', 'left', 'right', 'lr_time'
    ])

    sql = "select distinct username from eval_table"
    # Debug variant restricted to a single account:
    # sql = "select distinct screen_name as username from n_users where id = 805308596"
    df = db_functions.select_from_db(sql)

    print("Loading BERT")
    model = init(model_path)

    print("Querying BERT")
    for index, element in tqdm(df.iterrows(), total=df.shape[0]):
        # The query selects a single column; take it by position explicitly
        # (plain element[0] relies on deprecated positional [] access of a Series).
        screen_name = element.iloc[0]

        # Stores the tweets of one timeline page in a DataFrame.
        df_tweets = TwitterAPI.API_tweet_multitool(
            screen_name, 'temp', pages=1, method='user_timeline', append=False, write_to_db=False)
        if isinstance(df_tweets, str):
            # If df_tweets is a string it contains an error message.
            continue

        start_time = time.perf_counter()
        german_language = helper_functions.lang_detect(df_tweets)
        runtime = time.perf_counter() - start_time
        print(f"Runtime Lang Detect: {runtime}")
        if german_language is False:
            continue
        # NOTE(review): the prediction below runs on df_tweets, not on the value returned by
        # lang_detect. Confirm that lang_detect filters df_tweets in place, otherwise
        # non-German tweets are part of the prediction input.

        start_time = time.perf_counter()
        result = bert_predictions(df_tweets['tweet'], model)
        runtime = time.perf_counter() - start_time
        print(f"Runtime Bert: {runtime}")

        df_pred_data.at[index, 'screen_name'] = screen_name
        try:
            df_pred_data.at[index, 'left'] = result[0]
            df_pred_data.at[index, 'right'] = result[1]
        except Exception:
            # Deliberate best effort: an unusable prediction result is recorded as 0/0 instead
            # of aborting the whole evaluation. Unlike a bare except this no longer swallows
            # KeyboardInterrupt / SystemExit.
            df_pred_data.at[index, 'left'] = 0
            df_pred_data.at[index, 'right'] = 0
        df_pred_data.at[index, 'lr_time'] = runtime

    print("screen_name,Pol,Unpol,Pol_Time,Left,Right,LR_Time")
    for _, row in df_pred_data.iterrows():
        print(
            f"{row['screen_name']},{row['pol']},{row['unpol']},{row['pol_time']},{row['left']},"
            f"{row['right']},{int(row['lr_time'])}")