def get_BERT_friends_scores_from_friends( sql: str, min_required_bert_friend_opinions: int): """get_followers Counts how many left or right friends a user has. Unlike seemingly similar functions (which download and analyse tweets), this one gets user profiles from DB as DF. For users that need special attention because they did not get a combined score due to bad BERT_friend score. Works only if a users friends are available in n_friends. Afterwards get_combined_scores_from_followers(sql) needs to run again, to give the accounts handled here a combined score :param sql: sql statement that provides delivers users and their friends, which have a bert_friend or a combined score """ df = db_functions.select_from_db(sql) # Accounts in this list can only rated via bert_friend_rating bert_friend_rating_list = df[ df['combined_conf'].isnull()]['id'].drop_duplicates().to_list() all_ids = df['id'].drop_duplicates().to_list() # Account in this list can be rated via (the better) combined_rating combined_rating_list = [ item for item in all_ids if item not in bert_friend_rating_list ] # count left/right friends based on BERT_friend rating (widely available) df_result_BERT_friends = count_friend_stances( df, friend_lst=bert_friend_rating_list, column_to_count='result_bert_friends', min_required_bert_friend_opinions=min_required_bert_friend_opinions) # count left/right friends based on combined rating (better results) df_result_combined_ratings = count_friend_stances( df, friend_lst=combined_rating_list, column_to_count='combined_rating', min_required_bert_friend_opinions=min_required_bert_friend_opinions) df_combined = pd.concat( [df_result_BERT_friends, df_result_combined_ratings]) df_combined.dropna(inplace=True) del df del df_result_combined_ratings del df_result_BERT_friends db_functions.df_to_sql(df_combined, 'temp_scores_table', drop='replace') update_sql = 'update n_users set result_bert_friends = t."1", bert_friends_conf = cast (t."2" as numeric), ' \ 'bf_left_number = t."3", bf_right_number = t."4", bert_friends_last_seen = t."5" from ' \ 'temp_scores_table t where n_users.id = t."0"' db_functions.update_table(update_sql) db_functions.drop_table('temp_scores_table')
def SN_db_operations(hashtag: str, since: str, until: str) -> tuple: """ 1. Creates staging table for new hashtag 2. Calls tweet ID downloader 3. Saves tweet IDs to DB Downloads TweetIDs of Hashtags via snScrapeTweet :param hashtag: hashtag, to be downloaded :param since: start date for tweet download (string) :param until: until date for tweet download (string) :return: table_name, hashtag, dataframe length """ table_name = db_functions.get_staging_table_name( hashtag) #adds prefix (s_h_) and suffix (dateand time) to tablename db_functions.drop_table(table_name) db_functions.create_empty_staging_table(table_name) tweet_ids = SN_get_tweet_ids_for_hashtag(since, until, hashtag) #downloads tweets if len(tweet_ids) == 0: print("####################################################") print("####Warning! No Tweets fetched! Process aborted!####") print("####################################################") sys.exit() df = pd.DataFrame(tweet_ids) write_to_table = "update_temp" db_functions.df_to_sql(df, write_to_table, drop="replace") #Write Tweet ID to temp table #db_functions.update_table('insert into ' + str(table_name) + ' (id) select cast ("0" as bigint) from update_temp') #insert temp table content into staging table db_functions.update_table( f'insert into {table_name} (id) select cast ("0" as bigint) from {write_to_table}' ) # insert temp table content into staging table try: print(f"Table {table_name} created with {len(df)} tweets.") except: print("Unhandled Error") db_functions.drop_table(write_to_table) return table_name, hashtag, df.shape[0]
def inference_bert_friends(classifier, column_list: list, sql: str, min_matches: int): """ Performs inference based on users an account follows. Stores result to n_users :param classifier: Classifier :param friend_column_list_path: Column list to be used. Any Users friends are matched against this column list :param sql: Sql with combination of User to be inferendes, their label (combined rating) and their friend ID :param min_matches: Minimum friends that must be found in friend_column for the user to get a prediction. More connections = more accurate prediction result :return: """ start = time.time() friends = db_functions.select_from_db(sql) input_dataset_length = len(friends) print(f"SQL fetching time: {time.time() - start}") if len(friends) == 0: rows_processed = 0 return rows_processed friend_set = set(friends['follows_ids'].values.tolist()) friend_list = friends['follows_ids'].values.tolist() user_list = friends['user_id'].values.tolist() rating_list = friends['combined_rating'].values.tolist() del friends #Transforms DataFrame into DefaultDict relationship_dict = defaultdict(lambda: defaultdict(list)) for i, element in enumerate(friend_list): relationship_dict[element][0].append(user_list[i]) relationship_dict[element][1].append(rating_list[i]) conditions_not_met_list = [ ] # Ids in this list will still get a last seen date in DB to ignore them during next loop result_dict = {} #Iteration though friend_set and prediction of faction for element in tqdm(friend_set): common_friends = set(relationship_dict[element][0]) & set(column_list) number_of_common_friends = len(common_friends) if number_of_common_friends >= min_matches: df = pd.DataFrame(index=column_list).T df = df.append(pd.Series(), ignore_index=True).fillna(0) for friend in relationship_dict[element][0]: df.loc[:, friend] = 1 df = df.iloc[:, :len(column_list)] prediction_proba = classifier.predict_proba( df.values.tolist()) # pure predict text, conf = helper_functions.conf_value("LR", prediction_proba, min_boundary=0.5, max_boundary=1) result_dict[element] = [text, conf, number_of_common_friends] else: conditions_not_met_list.append(element) del friend_set del friend_list del user_list del rating_list timestamp = db_functions.staging_timestamp() result_df = pd.DataFrame(result_dict).T result_df['last_seen'] = timestamp rows_processed = len(result_df) if rows_processed > 0: #checks if data has been written to DF db_functions.df_to_sql(result_df, "temp_table", "replace") update_sql = """update n_users set bert_friends_ml_result = "0", bert_friends_ml_conf = cast("1" as numeric), bert_friends_ml_count = cast ("2" as integer), bert_friends_ml_last_seen = temp_table.last_seen from temp_table where cast (id as text) = temp_table."index" """ start = time.time() db_functions.update_table(update_sql) db_functions.drop_table("temp_table") print(f"Update Time: {time.time() - start}") else: print( f"WARNING: 0 new ratings generated despite an input dataset of {input_dataset_length} rows." ) if len(conditions_not_met_list) > 0: stamps = [timestamp for elm in conditions_not_met_list] ziped = list(zip(conditions_not_met_list, stamps)) db_functions.df_to_sql(pd.DataFrame(ziped), "temp_table", drop='replace') sql = 'update n_users set bert_friends_ml_last_seen = temp_table."1" from temp_table where n_users.id::text = temp_table."0"' db_functions.update_table(sql) #db_functions.drop_table('temp_table') return rows_processed
def prediction_launcher(table_name: str, BERT_model, sql: str, write_to_db: bool = True, TFIDF_pol_unpol_conv=0, Algo_pol_unpol=0): """ Loads 200 Tweets (one page call) per users and sends them to BERT for inference :param table_name: Name of temp table used to store results :param BERT_model: BERT Model :param sql: Statement providing Users to be inferenced :param write_to_db: True or False :param TFIDF_pol_unpol_conv: tfidf converter (optional) :param Algo_pol_unpol: Random Forrest classifier (optional) :return: """ start_time_overal = time.time() cur_date = str(date.today()) # date for last seen columns #methods = ['pol', 'LR'] methods = ['LR'] data = [] update_to_invalid_list = [] # This DF will store all precditions results df_pred_data = pd.DataFrame(data, columns=[ 'user_id', 'screen_name', 'pol', 'unpol', 'pol_text', 'pol_conf', 'pol_time', 'left', 'right', 'lr_text', 'lr_conf', 'lr_time', 'analyse_date' ]) sql_time_start = time.time() df = db_functions.select_from_db(sql) print( f"############## --- SQL Select time : {sql_time_start - time.time()} --- #############" ) if df.shape[1] != 2: print("ERROR: DF must ONLY have columns user_id and username") gc.collect() sys.exit() gc.collect() for index, element in tqdm(df.iterrows(), total=df.shape[0]): start_time = time.time() user_id = element[0] screen_name = element[1] def tweet_download_and_lang_detect(df_tweets, user_id, update_to_invalid_list): """ Calls language detection and checks if enough german tweets remain. If it found almost enough german Tweets it will load more. If it found almost none it will abort. :param df_tweets: 0 during first run, dataframe with tweets during later runs :param user_id: Twitter User_ID for tweet download and language check :param update_to_invalid_list: List of user that can not be downloaded from. Will append to if applicable. :return: df_tweets, update_to_invalid_list, abort_loop, len_df """ if isinstance(df_tweets, int): df_tweets = TwitterAPI.API_tweet_multitool( user_id, 'temp', pages=1, method='user_timeline', append=False, write_to_db=False) # fills DF with 200 tweets of 1 page df_tweets = helper_functions.lang_detect(df_tweets) else: df_tweets_additions = TwitterAPI.API_tweet_multitool( user_id, 'temp', pages=1, method='user_timeline', append=False, write_to_db=False) # fills DF with 200 tweets of 1 page df_tweets_additions = helper_functions.lang_detect( df_tweets_additions) if isinstance(df_tweets_additions, pd.DataFrame): df_tweets = pd.concat([df_tweets, df_tweets_additions]) df_tweets.reset_index(inplace=True) del df_tweets['index'] # if df_tweets is None: #no tweets found or all tweets deleted (non german) # abort_loop = True # return df_tweets, update_to_invalid_list, abort_loop' len_df = helper_functions.dataframe_length(df_tweets) if len_df <= 50: # if almost no tweets are german don't try to get more german tweets from this users. # would take to many page loads update_to_invalid_list.append(user_id) abort_loop = True elif len_df >= 200: abort_loop = True else: # if to few tweets are german load more tweets to get a better result abort_loop = False gc.collect() return df_tweets, update_to_invalid_list, abort_loop, len_df df_tweets = 0 # tries two times to get at least 200 german tweets, if first attempt returns less than 150 german tweets for i in range(2): df_tweets, update_to_invalid_list, abort_loop, len_df = tweet_download_and_lang_detect( df_tweets, user_id, update_to_invalid_list) if abort_loop == True: break if len_df > 0: for method in methods: prediction_result = [] if method == 'pol': if TFIDF_pol_unpol_conv == 0 or Algo_pol_unpol == 0: print( "Warning: No Political/Unpolitical classifier given. Check function parameters." ) else: prediction_result.append( TFIDF_inference.TFIDF_inference( df_tweets['tweet'], TFIDF_pol_unpol_conv, Algo_pol_unpol)) if method == 'LR': #prediction_result.append(inference_political_bert.bert_predictions(df_tweets['tweet'], BERT_model)) prediction_result.append( bert_predictions(df_tweets['tweet'], BERT_model)) runtime = int(time.time() - start_time) # returns text interpretation of inference text, conf = helper_functions.conf_value( method, prediction_result, max_boundary=len(df_tweets)) # result and confidence score df_pred_data.at[index, 'user_id'] = user_id df_pred_data.at[index, 'screen_name'] = screen_name df_pred_data.at[index, 'analyse_date'] = cur_date # TODO: If you store the column names in variables that update depending on the method, you only need # one block pred_result_zero = 'left' if method == "LR" else 'pol' pred_result_one = 'right' if method == "LR" else 'unpol' df_pred_data.at[index, pred_result_zero] = prediction_result[0][0] df_pred_data.at[index, pred_result_one] = prediction_result[0][1] if method == "LR": df_pred_data.at[index, 'left'] = prediction_result[0][0] df_pred_data.at[index, 'right'] = prediction_result[0][1] df_pred_data.at[index, 'lr_text'] = text df_pred_data.at[index, 'lr_conf'] = conf df_pred_data.at[index, 'lr_time'] = runtime else: df_pred_data.at[index, 'pol'] = prediction_result[0][0] df_pred_data.at[index, 'unpol'] = prediction_result[0][1] df_pred_data.at[index, 'pol_text'] = text df_pred_data.at[index, 'pol_conf'] = conf df_pred_data.at[index, 'pol_time'] = runtime # print("screen_name,Pol,Unpol,Pol_Time,Left,Right,LR_Time") # for index, element in df_pred_data.iterrows(): # print( # f"{element['user_id']},{element['screen_name']},{element['pol']}" # f",{element['unpol']},{element['pol_time']},{element['left']},{element['right']},{element['lr_time']}") # print("\n") # if index == 6: # print ("Stopp") if (write_to_db is True and index != 0 and index % batch_size == 0) or ( write_to_db is True and (df.shape[0]) == (index + 1) ): #saved data x iterations OR when df has no further rows if len(update_to_invalid_list) > 0: invalids = pd.DataFrame(update_to_invalid_list) invalids['cur_date'] = cur_date db_functions.df_to_sql(invalids, "temp_invalids", drop='replace') update_sql = """update n_users set lr = 'invalid', pol= 'invalid', lr_pol_last_analysed = temp_invalids.cur_Date from temp_invalids where id = temp_invalids."0" """ db_functions.update_table(update_sql) db_functions.drop_table("temp_invalids") if helper_functions.dataframe_length(df_pred_data) > 0: db_functions.df_to_sql(df_pred_data, table_name, drop='replace') update_sql = f""" update n_users set lr = lr_text, lr_conf = cast (a.lr_conf as numeric), pol = pol_text, pol_conf = cast (a.pol_conf as numeric), lr_pol_last_analysed = analyse_date from {table_name} a where id = cast(user_id as bigint)""" db_functions.update_table( update_sql) # update n_users table with new resulsts print(f"Data written to table: {table_name}.") gc.collect()
def combined_scores_calc_launcher(sql: str, bert_friends_high_confidence_capp_off, self_conf_high_conf_capp_off, min_required_bert_friend_opinions): """ Calculates combined score from users self-LR score and users bert_friend score :return: """ # limit = 1000 # sql = f"select id, screen_name, lr, lr_conf, result_bert_friends, bert_friends_conf, bf_left_number, # bf_right_number from n_users where lr is not null limit {limit}" # sql = f"select id, screen_name, lr, lr_conf, result_bert_friends, bert_friends_conf, bf_left_number, # bf_right_number from n_users where lr is not null or result_bert_friends is not null" df = db_functions.select_from_db(sql) df.fillna(0, inplace=True) count_rated_accounts = 0 count_uncategorized_accounts = 0 count_rating_less_accounts = 0 count_to_few_bert_friends_to_rate_and_LRself_is_invalid = 0 count_bert_friends_result_is_mediocre = 0 id_list = df['id'].to_list() id_dict = {i: 0 for i in id_list} # ToDo: Runtime 30 minutes. Changes to Dict for index, element in tqdm(df.iterrows(), total=df.shape[0]): result, rated_accounts, rating_less_accounts, to_few_bert_friends_to_rate_and_LRself_is_invalid, bert_friends_result_is_mediocre, uncategorized_accounts = calculate_combined_score( bert_friends_high_confidence_cap_off= bert_friends_high_confidence_capp_off, self_conf_high_conf_cap_off=self_conf_high_conf_capp_off, min_required_bert_friend_opinions=min_required_bert_friend_opinions, user_id=element[0], self_lr=element[2], self_lr_conf=element[3], bert_friends_lr=element[4], bert_friends_lr_conf=element[5], number_of_bert_friends_L=element[6], number_of_bert_friends_R=element[7], BERT_ML_rating=element[8], BERT_ML_conf=element[9]) id_dict[element[0]] = result count_rated_accounts += rated_accounts count_rating_less_accounts += rating_less_accounts count_to_few_bert_friends_to_rate_and_LRself_is_invalid += to_few_bert_friends_to_rate_and_LRself_is_invalid count_bert_friends_result_is_mediocre += bert_friends_result_is_mediocre count_uncategorized_accounts += uncategorized_accounts print("\n\n") print(f"ratingless_accounts: {count_rating_less_accounts}") print( f"to_few_bert_friends_to_rate_and_LRself_is_invalid_or_unknown_or_of_low_conf: " f"{count_to_few_bert_friends_to_rate_and_LRself_is_invalid}") print( f"bert_friends_result_is_medicore: {count_bert_friends_result_is_mediocre}" ) print(f"uncategorized_accounts: {count_uncategorized_accounts}") total_rating_less_accounts = count_rating_less_accounts + \ count_to_few_bert_friends_to_rate_and_LRself_is_invalid + \ count_bert_friends_result_is_mediocre + \ count_uncategorized_accounts print(f"\nAccounts without rating: {total_rating_less_accounts}") print(f"Rated accounts: {count_rated_accounts}") print("\n\n") print("Calculation done. Writing results to DB.") del df id_dict = {k: v for k, v in id_dict.items() if v != 0} df_result = pd.DataFrame(id_dict).transpose() df_result = df_result.replace('null', np.NaN) if len(df_result) == 0: print("Now new data.") else: print(f"{len(df_result)} new results found.") db_functions.df_to_sql(df_result, "temp_table", drop='replace') update_sql = 'update n_users set combined_rating = t."0", combined_conf = cast(t."1" as numeric) from temp_table t where n_users.id = t.index' db_functions.update_table(update_sql) # runtime 8 minutes db_functions.drop_table("temp_table")
def friend_rating_launcher(sql: str, get_data_from_DB: bool) -> None: # def bert_friends_score(get_data_from_DB): """Refreshes score for all users in DB who... 1) have a Bert LR rating and 2) follow someone in n_followers Writes result to table n_users (total runtime 87 min) """ timestamp = db_functions.staging_timestamp() start_time = time.time() if get_data_from_DB is True: # Runtime 18 min # --Bert_Friends: Zu bewertende User und die Scores ihrer Freunde df = db_functions.select_from_db(sql) db_functions.save_pickle(df, "bert_friends.pkl") else: df = db_functions.load_pickle("bert_friends.pkl") # df = df.iloc[:50000,:] df_sub0 = df.groupby(['follows_ids', 'bert_self']).size().unstack(fill_value=0) df_sub1 = df.groupby(['follows_ids', 'bert_friends']).size().unstack(fill_value=0) del df result = df_sub1.join(df_sub0, lsuffix='_friend_Bert', rsuffix='_self_Bert') del df_sub0 del df_sub1 user_list = result.index.to_list() try: left_friend_Bert_list = result['links_friend_Bert'].to_list() except: print( "Warning: 0 Results. Staging results possibly not copied to facts_hashtags." ) right_friend_Bert_list = result['rechts_friend_Bert'].to_list() del result user_dict = {} for i, user in enumerate(tqdm(user_list)): if user not in user_dict: user_dict[user] = {} right = right_friend_Bert_list[i] left = left_friend_Bert_list[i] text, conf = helper_functions.conf_value( method='LR', prediction_result=[[left, right]], min_boundary=0, max_boundary=left + right) user_dict[user]["text"] = text user_dict[user]["confidence"] = conf user_dict[user]["last_seen"] = timestamp user_dict[user]["bf_left_number"] = left user_dict[user]["bf_right_number"] = right print("User dict erstellt.") print(len(user_dict)) result = pd.DataFrame(user_dict).T print("DF transponiert.") db_functions.df_to_sql(result, "temp_result", drop='replace') print("Insert into temp done.") sql = "update n_users set result_bert_friends = text, bert_friends_conf = cast(confidence as numeric), " \ "bert_friends_last_seen = temp_result.last_seen, bf_left_number = temp_result.bf_left_number, " \ "bf_right_number = temp_result.bf_right_number from temp_result where id = cast (temp_result.index as bigint)" db_functions.update_table(sql) db_functions.drop_table("temp_result") print(f"Runtime in min: {(time.time() - start_time) / 60} ")
def tweet_details_download_launcher(table_name: str, hashtag: str, bulk_size: int = 1000, download_parent_tweets=True): """ 1. Calls Tweet downloader 2. Adds details to Tweet IDs in staging table via update 3. If Tweets are are reply to another tweet, those parent tweets can be downloaded via option download_parent_tweets :param table_name: Staging table name which will be updated :param hashtag: hashtag scraped :param bulk_size: number of tweets to processed in one function call :param download_parent_tweets: If True: Loops through all tweets until no further father elements can be found :return: none """ parent_sql = f"""select * from {table_name} where tweet is null or (retweet is not null and retweet not in (select id from {table_name})) limit {bulk_size}""" df_parent = db_functions.select_from_db(parent_sql) df_parent['id'] = df_parent[ 'retweet'] #replaces tweet ID with parent id. Otherwise previously downloaded tweets would be downloaded again sql = f"select * from {table_name} where tweet is null limit {bulk_size}" df = db_functions.select_from_db(sql) download_parents = helper_functions.dataframe_length( df) == 0 and download_parent_tweets == True if download_parents == True: df = df_parent for index, element in df.iterrows(): error = False # downloads details for Tweets from staging table result = API_get_tweet_details(element[1], sleep=True) if result == 'Error: Not authorized': #Tweet is set to private error = True if result == 'Error: No status found with that ID.' or result == "Undefined Error": #sql = f"update {table_name} set tweet = 'deleted' where id = {element[1]}" #db_functions.update_table(sql) df.tweet[df.id == element[1]] = 'deleted' error = True if error is False: df.iloc[index:index + 1, 4:5] = result[1] # date df.iloc[index:index + 1, 5:6] = result[2] # tweet df.iloc[index:index + 1, 6:7] = hashtag # hashtag df.iloc[index:index + 1, 7:8] = result[0] # user_id df.iloc[index:index + 1, 8:9] = result[4] # screen_name df.iloc[index:index + 1, 9:10] = result[3] # name df.iloc[index:index + 1, 11:12] = result[5] # in reply to tweet id df.iloc[index:index + 1, 18:19] = table_name #print ("Fetched Tweet: {}".format(element[1])) if len(df) == 0: return 0 db_functions.df_to_sql(df, 'temp_df', 'replace') new_tweets_fetched = helper_functions.dataframe_length(df) # try: # print (new_tweets_fetched) # except UnboundLocalError: # print ("STOPP") #update staging table with values form temp_df if download_parents == False: sql = f"""update {table_name} a set date = b.date, tweet = b.tweet, hashtags = b.hashtags, user_id = cast (b.user_id as bigint), username = b.username, name = b.name, retweet = cast (b.retweet as bigint), staging_name = b.staging_name from (select * from temp_df) b where a.id = b.id""" else: sql = f"""INSERT INTO {table_name} SELECT index::bigint, id, conversation_id::bigint, created_at, date, tweet, hashtags, user_id, username, name, link::bigint, retweet::bigint, nlikes::bigint, nreplies::bigint, nretweets::bigint, quote_url::bigint, user_rt_id::bigint, user_rt::bigint, staging_name FROM temp_df""" print(f"{new_tweets_fetched} parent tweets added.") db_functions.update_table(sql) db_functions.drop_table('temp_df') #new_tweets_fetched = helper_functions.dataframe_length(df) return new_tweets_fetched