Example #1
def run_rnd_forrest_training(sql_political, sql_unpolitical, pkl_filename_converter, pkl_filename_classifier) -> None:
    """
    Runs Random Forest training and stores the TFIDF converter and the classifier in pickle files.
    :param sql_political: political tweets selected from DB
    :param sql_unpolitical: unpolitical tweets selected from DB
    :param pkl_filename_converter: name of TFIDF converter pickle file
    :param pkl_filename_classifier: name of classifier pickle file
    :return: none
    """
    start_time = time.time()
    df_unpolitical = db_functions.select_from_db(sql_unpolitical)
    df_unpolitical = df_unpolitical.assign(label=1)  # label 1 = unpolitical
    df_unpolitical = delete_non_german_tweets_from_df(df_unpolitical)
    
    df_political = db_functions.select_from_db(sql_political)
    df_political = df_political.assign(label=0)  # label 0 = political
    df_political = delete_non_german_tweets_from_df(df_political)
    
    df = pd.concat([df_unpolitical, df_political])
    df['tweet'] = df['tweet'].str.replace('\r', "")
    df = df.sample(frac=1)
    X = df['tweet'].tolist()
    y = df['label'].tolist()
    processed_tweets = helper_functions.scrub_tweets(X)
    
    tfidfconverter = TfidfVectorizer(max_features=5000, min_df=5, max_df=1.0)
    X = tfidfconverter.fit_transform(processed_tweets).toarray()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
    
    # Random Forest Classifier
    text_classifier = RandomForestClassifier(n_estimators=100, random_state=0)
    text_classifier.fit(X_train, y_train)
    
    # Save to file in the current working directory
    with open(pkl_filename_converter, 'wb') as file:
        pickle.dump(tfidfconverter, file)
    
    with open(pkl_filename_classifier, 'wb') as file:
        pickle.dump(text_classifier, file)
    
    predictions = text_classifier.predict(X_test)
    print(confusion_matrix(y_test, predictions))
    print(classification_report(y_test, predictions))
    print(accuracy_score(y_test, predictions))
    print("\n")
    print("--- %s seconds ---" % (time.time() - start_time))
Example #2
def check_DB_users_existance(id):
    """
    Checks if a user is already present in table n_users.
    :param id: user ID as number or string (string is preferable)
    :return: True if the user exists, False if not
    """
    df = db_functions.select_from_db(f"select * from n_users where id = {id}")
    # assert (len(df) <= 1), f"ERROR: Duplicate row in n_users found. Check User ID {id}"
    return len(df) > 0  # also covers duplicate rows, which previously fell through and returned None
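# The f-string above interpolates id straight into SQL, which is open to injection. A
# parameterized sketch, assuming direct access to a psycopg2 connection (db_functions'
# internals are not shown here):
import psycopg2

def check_db_user_exists(conn, user_id) -> bool:
    with conn.cursor() as cur:
        # The driver escapes the parameter; no string interpolation needed
        cur.execute("select 1 from n_users where id = %s limit 1", (str(user_id),))
        return cur.fetchone() is not None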
Example #3
def get_BERT_friends_scores_from_friends(
        sql: str, min_required_bert_friend_opinions: int):
    """
    Counts how many left or right friends a user has. Unlike seemingly similar functions (which download and analyse
    tweets), this one gets user profiles from the DB as a DataFrame.

    For users that need special attention because they did not get a combined score due to a bad BERT_friend score.
    Works only if a user's friends are available in n_friends.
    Afterwards get_combined_scores_from_followers(sql) needs to run again, to give the accounts handled here a
    combined score.
    :param sql: SQL statement that delivers users and their friends which have a bert_friend or a combined score
    """
    df = db_functions.select_from_db(sql)

    # Accounts in this list can only be rated via bert_friend_rating
    bert_friend_rating_list = df[
        df['combined_conf'].isnull()]['id'].drop_duplicates().to_list()
    all_ids = df['id'].drop_duplicates().to_list()
    # Accounts in this list can be rated via the (better) combined_rating
    bert_friend_rating_set = set(bert_friend_rating_list)  # set lookup keeps this filter O(n)
    combined_rating_list = [
        item for item in all_ids if item not in bert_friend_rating_set
    ]

    # count left/right friends based on BERT_friend rating (widely available)
    df_result_BERT_friends = count_friend_stances(
        df,
        friend_lst=bert_friend_rating_list,
        column_to_count='result_bert_friends',
        min_required_bert_friend_opinions=min_required_bert_friend_opinions)

    # count left/right friends based on combined rating (better results)
    df_result_combined_ratings = count_friend_stances(
        df,
        friend_lst=combined_rating_list,
        column_to_count='combined_rating',
        min_required_bert_friend_opinions=min_required_bert_friend_opinions)

    df_combined = pd.concat(
        [df_result_BERT_friends, df_result_combined_ratings])
    df_combined.dropna(inplace=True)
    del df
    del df_result_combined_ratings
    del df_result_BERT_friends

    db_functions.df_to_sql(df_combined, 'temp_scores_table', drop='replace')
    update_sql = 'update n_users set result_bert_friends = t."1", bert_friends_conf = cast (t."2" as numeric), ' \
                 'bf_left_number = t."3", bf_right_number = t."4", bert_friends_last_seen = t."5" from ' \
                 'temp_scores_table t where n_users.id = t."0"'
    db_functions.update_table(update_sql)
    db_functions.drop_table('temp_scores_table')
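# The pattern above (stage a DataFrame, then one set-based UPDATE ... FROM) recurs in most
# functions below. A self-contained sketch with pandas and SQLAlchemy; the DSN and the
# column mapping are assumptions for illustration:
from sqlalchemy import create_engine, text

def update_scores_via_temp_table(df_scores) -> None:
    engine = create_engine("postgresql://user:password@localhost/twitter")  # assumed DSN
    df_scores.to_sql("temp_scores_table", engine, if_exists="replace")
    with engine.begin() as conn:
        conn.execute(text('update n_users set result_bert_friends = t."1" '
                          'from temp_scores_table t where n_users.id = cast(t."0" as bigint)'))
        conn.execute(text("drop table temp_scores_table"))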
Example #4
def prepare_training_data(sql_right, sql_left, sql_right_eval, sql_left_eval):
    """
    Loads labelled training and evaluation tweets from the DB, shuffles them and splits off a test set.
    :param sql_right: SQL statement delivering right-wing training tweets
    :param sql_left: SQL statement delivering left-wing training tweets
    :param sql_right_eval: SQL statement delivering right-wing evaluation tweets
    :param sql_left_eval: SQL statement delivering left-wing evaluation tweets
    :return: train_df, test_df, df_eval
    """
    eval_set_limit = 10000
    # Define data sources in DB
    df_right = db_functions.select_from_db(sql_right)
    df_left = db_functions.select_from_db(sql_left)

    df_right_eval = db_functions.select_from_db(sql_right_eval)
    df_left_eval = db_functions.select_from_db(sql_left_eval)
    df_right_eval = df_right_eval.sample(frac=1)
    df_left_eval = df_left_eval.sample(frac=1)
    df_right_eval = df_right_eval.iloc[0:eval_set_limit]
    df_left_eval = df_left_eval.iloc[0:eval_set_limit]

    df_right['pred_class'] = 1
    df_left['pred_class'] = 0
    df_right_eval['pred_class'] = 1
    df_left_eval['pred_class'] = 0

    df = pd.concat([df_right, df_left])
    df['tweet'] = df['tweet'].str.replace('\r', "")
    df = df.sample(frac=1)

    df_eval = pd.concat([df_right_eval, df_left_eval])
    df_eval['tweet'] = df_eval['tweet'].str.replace('\r', "")
    df_eval = df_eval.sample(frac=1)

    train_df, test_df = train_test_split(df, test_size=0.10)
    print('train shape: ', train_df.shape)
    print('test shape: ', test_df.shape)
    return train_df, test_df, df_eval
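# Usage sketch, assuming the four sql_* statements each select a 'tweet' column for their class:
train_df, test_df, df_eval = prepare_training_data(sql_right, sql_left,
                                                   sql_right_eval, sql_left_eval)
print(train_df['pred_class'].value_counts(normalize=True))  # quick class-balance check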
Example #5
def get_friends(sql: str):
    """
    Downloads the accounts a user follows. User IDs are given in the form of an SQL statement.
    :param sql: SQL statement that delivers list of users
        Example:
        Finds distinct users of a hashtag and loads friends of all users to n_friends
        sql = "select distinct user_id from s_h_umweltsau_20201104_1540 where user_id is not null except select
        distinct user_id from n_friends"
        Example 2:
        Finds friends for users with an LR rating and a high confidence rating
        select distinct id from n_users u where lr in ('links','rechts') and lr_conf > 0.8 except select distinct
        user_id from n_friends
    :return: nothing
    """
    df = db_functions.select_from_db(sql)
    for index, element in tqdm(df.iterrows(), total=df.shape[0]):
        number_written = TwitterAPI.API_Friends(element[0], "unknown")
        print(f"{number_written} friends written to table n_friends for id {element[0]}")
        time.sleep(60)  # Avoids exceeding Twitter API rate limit
        gc.collect()  # API calls seem to be memory leaky
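# The fixed 60-second sleep above keeps the loop under Twitter's 15-requests-per-15-minutes
# limit for friend lookups. A more general sketch with exponential backoff; the exception
# type raised by TwitterAPI on a rate limit is an assumption, hence the broad catch:
def call_with_backoff(func, *args, max_retries=3, base_delay=60, **kwargs):
    for attempt in range(max_retries):
        try:
            return func(*args, **kwargs)
        except Exception as exc:
            wait = base_delay * 2 ** attempt
            print(f"API error: {exc}. Retrying in {wait}s")
            time.sleep(wait)
    raise RuntimeError(f"API call failed after {max_retries} retries")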
Example #6
def inference_bert_friends(classifier, column_list: list, sql: str,
                           min_matches: int):
    """
    Performs inference based on the users an account follows. Stores the result to n_users.
    :param classifier: classifier
    :param column_list: column list to be used. Any user's friends are matched against this list
    :param sql: SQL delivering the users to be inferred, their labels (combined rating) and their friend IDs
    :param min_matches: minimum number of friends that must be found in column_list for the user to get a
    prediction. More connections = more accurate prediction result
    :return: rows_processed
    """
    start = time.time()
    friends = db_functions.select_from_db(sql)
    input_dataset_length = len(friends)
    print(f"SQL fetching time: {time.time() - start}")

    if len(friends) == 0:
        rows_processed = 0
        return rows_processed

    friend_set = set(friends['follows_ids'].values.tolist())
    friend_list = friends['follows_ids'].values.tolist()
    user_list = friends['user_id'].values.tolist()
    rating_list = friends['combined_rating'].values.tolist()
    del friends

    # Transforms the DataFrame into a DefaultDict: friend_id -> {0: [user_ids], 1: [ratings]}
    relationship_dict = defaultdict(lambda: defaultdict(list))
    for i, element in enumerate(friend_list):
        relationship_dict[element][0].append(user_list[i])
        relationship_dict[element][1].append(rating_list[i])

    # IDs in this list will still get a last_seen date in the DB so they are skipped during the next loop
    conditions_not_met_list = []
    result_dict = {}
    # Iteration through friend_set and prediction of faction
    for element in tqdm(friend_set):
        common_friends = set(relationship_dict[element][0]) & set(column_list)
        number_of_common_friends = len(common_friends)
        if number_of_common_friends >= min_matches:
            # Build a one-hot row over the big-user columns
            # (df.append(pd.Series()) was deprecated and removed in pandas 2.x)
            df = pd.DataFrame(0, index=[0], columns=column_list)
            df.loc[:, list(common_friends)] = 1
            prediction_proba = classifier.predict_proba(
                df.values.tolist())  # pure predict

            text, conf = helper_functions.conf_value("LR",
                                                     prediction_proba,
                                                     min_boundary=0.5,
                                                     max_boundary=1)
            result_dict[element] = [text, conf, number_of_common_friends]
        else:
            conditions_not_met_list.append(element)

    del friend_set
    del friend_list
    del user_list
    del rating_list

    timestamp = db_functions.staging_timestamp()
    result_df = pd.DataFrame(result_dict).T
    result_df['last_seen'] = timestamp
    rows_processed = len(result_df)
    if rows_processed > 0:  #checks if data has been written to DF
        db_functions.df_to_sql(result_df, "temp_table", "replace")
        update_sql = """update n_users
        set bert_friends_ml_result = "0",
        bert_friends_ml_conf = cast("1" as numeric),
        bert_friends_ml_count = cast ("2" as integer),
        bert_friends_ml_last_seen = temp_table.last_seen
        from temp_table where
        cast (id as text) = temp_table."index"
        """
        start = time.time()
        db_functions.update_table(update_sql)
        db_functions.drop_table("temp_table")
        print(f"Update Time: {time.time() - start}")
    else:
        print(
            f"WARNING: 0 new ratings generated despite an input dataset of {input_dataset_length} rows."
        )

    if len(conditions_not_met_list) > 0:
        stamps = [timestamp] * len(conditions_not_met_list)
        zipped = list(zip(conditions_not_met_list, stamps))
        db_functions.df_to_sql(pd.DataFrame(zipped),
                               "temp_table",
                               drop='replace')
        sql = 'update n_users set bert_friends_ml_last_seen = temp_table."1" from temp_table where n_users.id::text = temp_table."0"'
        db_functions.update_table(sql)
        #db_functions.drop_table('temp_table')

    return rows_processed
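# Building a one-row DataFrame per friend_set entry dominates the runtime above. A faster
# sketch: precompute a column index once and fill a numpy row directly (assumes column_list
# order matches the matrix the classifier was trained on):
import numpy as np

# column_index = {col: i for i, col in enumerate(column_list)}  # build once, before the loop
def one_hot_row(friends, column_index, n_cols):
    row = np.zeros((1, n_cols))
    for friend in friends:
        idx = column_index.get(friend)
        if idx is not None:  # ignore friends outside the training columns
            row[0, idx] = 1
    return row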
Example #7
def create_training_matrix(load_from_db, sql_left, sql_right,
                           clf_pure_predict_path, column_list_path,
                           pickle_name_left, pickle_name_right,
                           classifier_pkl):
    """
    Creates training matrix for Bert_Friend_ML training, runs training and saves the model as pickle file.
    The trained model tries to answer this question: If you follow 1 communists, 5 moderate lefties, 4 conservatives and 2 Nazis. What does that make you?
    SQL Input by columns:
        u.id: Id of an average Jow type Twitter user like you and me
        u.combined_rating: Left/Right rating of above average users
        u.combined_conf: combined confidence of above user
        f.user_id: Id of a "big user" with many followers (e.g. Bernie Sanders) the average Joe follows
    Column List:
        All big users will end up as columns (with their IDs) in a matrix. The IDs ares stored in order to match users
        during inference against them
    Pure predict:
        Pure predict is light weight version of sk learn that performs the inference much faster at the cost a tiny
        bit of accuracy (> 0,5%)
    :param load_from_db: False to use data from pickle file
    :param sql_left: left wing users
    :param sql_right: right wing users
    :param clf_pure_predict_path: Will save pure predict model at this location
    :param column_list_path: Column List save path
    :param classifier_pkl: Classifier path
    :return: None
    """
    if load_from_db:
        # sql_left= """
        # select distinct u.id, u.combined_rating, u.combined_conf, f.user_id from n_followers f, n_users u
        # where cast (f.follows_ids as numeric) = u.id
        # and u.combined_conf >= 0.9
        # and u.combined_rating = 'links'
        # order by u.id
        # limit 750000
        # """
        #
        # sql_right = """
        # select distinct u.id, u.combined_rating, u.combined_conf, f.user_id from n_followers f, n_users u
        # where cast (f.follows_ids as numeric) = u.id
        # and u.combined_conf >= 0.7
        # and u.combined_rating = 'rechts'
        # order by u.id
        # limit 750000
        # """
        df_left = db_functions.select_from_db(sql_left)
        df_right = db_functions.select_from_db(sql_right)

        db_functions.save_pickle(df_left, pickle_name_left)
        db_functions.save_pickle(df_right, pickle_name_right)
    else:
        df_left = db_functions.load_pickle(pickle_name_left)
        df_right = db_functions.load_pickle(pickle_name_right)

    features = pd.concat([df_left, df_right])
    labels = features[['id', 'combined_rating']].drop_duplicates()
    del df_left
    del df_right
    features = features.pivot(index='id',
                              columns='user_id',
                              values='combined_rating')
    features.replace(['links', 'rechts'], 1, inplace=True)
    features.fillna(0, inplace=True)

    column_list = features.columns.values.tolist()
    X_train, X_test, y_train, y_test = train_test_split(features,
                                                        labels,
                                                        test_size=0.2,
                                                        random_state=42)

    print(f"Len Features: {len(features)}")
    # classifier = build_model()
    classifier = RandomForestClassifier(n_estimators=500,
                                        criterion='entropy',
                                        min_samples_leaf=1,
                                        min_samples_split=2)
    classifier.fit(X_train, y_train['combined_rating'])

    clf_pure_predict = convert_estimator(classifier)
    db_functions.save_pickle(clf_pure_predict, clf_pure_predict_path)
    db_functions.save_pickle(classifier, classifier_pkl)
    db_functions.save_pickle(column_list, column_list_path)

    predictions = classifier.predict(X_test)
    print(confusion_matrix(y_test['combined_rating'], predictions))
    print(classification_report(y_test['combined_rating'], predictions))
    print(accuracy_score(y_test['combined_rating'], predictions))
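# pure-predict usage sketch: the converted estimator is plain Python, so it unpickles and
# predicts without scikit-learn installed; input must be a list of lists:
from pure_sklearn.map import convert_estimator

clf_pure = convert_estimator(classifier)
print(clf_pure.predict(X_test.values.tolist()[:1]))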
Example #8
def eval_bert(model_path) -> None:
    """
    Runs evaluation against the evaluation accounts in table "eval_table" and prints the results.
    :param model_path: path to the BERT checkpoint to evaluate
    :return: none
    """
    data = []
    df_pred_data = pd.DataFrame(data,
                                columns=[
                                    'screen_name', 'pol', 'unpol', 'pol_time',
                                    'left', 'right', 'lr_time'
                                ])
    sql = "select distinct username from eval_table"
    #sql = "select distinct screen_name as username from n_users where id = 805308596"
    df = db_functions.select_from_db(sql)

    print("Loading BERT")
    # older version
    # model_path = r"C:\Users\Admin\PycharmProjects\untitled\outputs\political_bert_1605094936.6519241\checkpoint-15000"
    # model_path = r"F:\AI\outputs\political_bert_1605652513.149895\checkpoint-480000"
    model = init(model_path)
    print("Querying BERT")

    for index, element in tqdm(df.iterrows(), total=df.shape[0]):

        screen_name = element[0]

        df_tweets = TwitterAPI.API_tweet_multitool(
            screen_name,
            'temp',
            pages=1,
            method='user_timeline',
            append=False,
            write_to_db=False)  # stores tweets in a DF
        if isinstance(
                df_tweets,
                str):  # if df_tweets is a string it contains an error message
            continue
        start_time = time.time()
        german_language = helper_functions.lang_detect(df_tweets)
        runtime = time.time() - start_time
        print(f"Runtime Lang Detect: {runtime}")
        if german_language is False:
            continue
        start_time = time.time()
        prediction_result = [bert_predictions(df_tweets['tweet'], model)]
        runtime = time.time() - start_time
        print(f"Runtime Bert: {runtime}")

        result = prediction_result[0]
        df_pred_data.at[index, 'screen_name'] = screen_name
        try:
            df_pred_data.at[index, 'left'] = result[0]
            df_pred_data.at[index, 'right'] = result[1]
        except (IndexError, TypeError):  # no usable prediction returned for this account
            df_pred_data.at[index, 'left'] = 0
            df_pred_data.at[index, 'right'] = 0
        df_pred_data.at[index, 'lr_time'] = runtime

    print("screen_name,Pol,Unpol,Pol_Time,Left,Right,LR_Time")
    for index, element in df_pred_data.iterrows():
        print(
            f"{element['screen_name']},{element['pol']},{element['unpol']},{element['pol_time']},{element['left']},"
            f"{element['right']},{int(element['lr_time'])}")
Example #9
def prediction_launcher(table_name: str,
                        BERT_model,
                        sql: str,
                        write_to_db: bool = True,
                        TFIDF_pol_unpol_conv=0,
                        Algo_pol_unpol=0):
    """
    Loads 200 tweets (one page call) per user and sends them to BERT for inference.
    :param table_name: name of the temp table used to store results
    :param BERT_model: BERT model
    :param sql: statement providing the users to be inferred
    :param write_to_db: True or False
    :param TFIDF_pol_unpol_conv: TFIDF converter (optional)
    :param Algo_pol_unpol: Random Forest classifier (optional)
    :return: None
    """
    start_time_overal = time.time()
    cur_date = str(date.today())  # date for last seen columns
    #methods = ['pol', 'LR']
    methods = ['LR']
    data = []
    update_to_invalid_list = []

    # This DF will store all prediction results
    df_pred_data = pd.DataFrame(data,
                                columns=[
                                    'user_id', 'screen_name', 'pol', 'unpol',
                                    'pol_text', 'pol_conf', 'pol_time', 'left',
                                    'right', 'lr_text', 'lr_conf', 'lr_time',
                                    'analyse_date'
                                ])

    sql_time_start = time.time()
    df = db_functions.select_from_db(sql)
    print(
        f"##############  --- SQL Select time : {time.time() - sql_time_start} --- #############"
    )
    if df.shape[1] != 2:
        print("ERROR: DF must ONLY have columns user_id and username")
        gc.collect()
        sys.exit()
    gc.collect()

    for index, element in tqdm(df.iterrows(), total=df.shape[0]):
        start_time = time.time()
        user_id = element[0]
        screen_name = element[1]

        def tweet_download_and_lang_detect(df_tweets, user_id,
                                           update_to_invalid_list):
            """
            Calls language detection and checks whether enough German tweets remain.
            If it found almost enough German tweets, it loads more.
            If it found almost none, it aborts.
            :param df_tweets: 0 during the first run, DataFrame with tweets during later runs
            :param user_id: Twitter user ID for tweet download and language check
            :param update_to_invalid_list: list of users that cannot be downloaded from. Appended to if applicable.
            :return: df_tweets, update_to_invalid_list, abort_loop, len_df
            """
            if isinstance(df_tweets, int):
                df_tweets = TwitterAPI.API_tweet_multitool(
                    user_id,
                    'temp',
                    pages=1,
                    method='user_timeline',
                    append=False,
                    write_to_db=False)  # fills DF with 200 tweets of 1 page
                df_tweets = helper_functions.lang_detect(df_tweets)
            else:
                df_tweets_additions = TwitterAPI.API_tweet_multitool(
                    user_id,
                    'temp',
                    pages=1,
                    method='user_timeline',
                    append=False,
                    write_to_db=False)  # fills DF with 200 tweets of 1 page
                df_tweets_additions = helper_functions.lang_detect(
                    df_tweets_additions)
                if isinstance(df_tweets_additions, pd.DataFrame):
                    df_tweets = pd.concat([df_tweets, df_tweets_additions])
                    df_tweets.reset_index(inplace=True)
                    del df_tweets['index']

            # if df_tweets is None:  # no tweets found or all tweets deleted (non-German)
            #     abort_loop = True
            #     return df_tweets, update_to_invalid_list, abort_loop

            len_df = helper_functions.dataframe_length(df_tweets)
            if len_df <= 50:
                # if almost no tweets are German, don't try to get more German tweets from
                # this user; it would take too many page loads
                update_to_invalid_list.append(user_id)
                abort_loop = True
            elif len_df >= 200:
                abort_loop = True
            else:
                # if too few tweets are German, load more tweets to get a better result
                abort_loop = False
            gc.collect()
            return df_tweets, update_to_invalid_list, abort_loop, len_df

        df_tweets = 0
        # tries two times to get at least 200 German tweets, if the first attempt returns fewer than 150 German tweets
        for i in range(2):
            df_tweets, update_to_invalid_list, abort_loop, len_df = tweet_download_and_lang_detect(
                df_tweets, user_id, update_to_invalid_list)
            if abort_loop:
                break

        if len_df > 0:
            for method in methods:
                prediction_result = []
                if method == 'pol':
                    if TFIDF_pol_unpol_conv == 0 or Algo_pol_unpol == 0:
                        print(
                            "Warning: No Political/Unpolitical classifier given. Check function parameters."
                        )
                    else:
                        prediction_result.append(
                            TFIDF_inference.TFIDF_inference(
                                df_tweets['tweet'], TFIDF_pol_unpol_conv,
                                Algo_pol_unpol))
                if method == 'LR':
                    #prediction_result.append(inference_political_bert.bert_predictions(df_tweets['tweet'], BERT_model))
                    prediction_result.append(
                        bert_predictions(df_tweets['tweet'], BERT_model))
                runtime = int(time.time() - start_time)

                # returns text interpretation of inference
                text, conf = helper_functions.conf_value(
                    method, prediction_result, max_boundary=len(df_tweets))

                # result and confidence score
                df_pred_data.at[index, 'user_id'] = user_id
                df_pred_data.at[index, 'screen_name'] = screen_name
                df_pred_data.at[index, 'analyse_date'] = cur_date

                # TODO: If you store the column names in variables that update depending on the method, you only need
                #  one block
                pred_result_zero = 'left' if method == "LR" else 'pol'
                pred_result_one = 'right' if method == "LR" else 'unpol'
                df_pred_data.at[index,
                                pred_result_zero] = prediction_result[0][0]
                df_pred_data.at[index,
                                pred_result_one] = prediction_result[0][1]

                if method == "LR":
                    df_pred_data.at[index, 'left'] = prediction_result[0][0]
                    df_pred_data.at[index, 'right'] = prediction_result[0][1]
                    df_pred_data.at[index, 'lr_text'] = text
                    df_pred_data.at[index, 'lr_conf'] = conf
                    df_pred_data.at[index, 'lr_time'] = runtime
                else:
                    df_pred_data.at[index, 'pol'] = prediction_result[0][0]
                    df_pred_data.at[index, 'unpol'] = prediction_result[0][1]
                    df_pred_data.at[index, 'pol_text'] = text
                    df_pred_data.at[index, 'pol_conf'] = conf
                    df_pred_data.at[index, 'pol_time'] = runtime

            # print("screen_name,Pol,Unpol,Pol_Time,Left,Right,LR_Time")
            # for index, element in df_pred_data.iterrows():
            #     print(
            #         f"{element['user_id']},{element['screen_name']},{element['pol']}"
            #         f",{element['unpol']},{element['pol_time']},{element['left']},{element['right']},{element['lr_time']}")
            # print("\n")
            # if index == 6:
            #     print ("Stopp")
            if write_to_db and ((index != 0 and index % batch_size == 0)
                                or df.shape[0] == index + 1):
                # saves data every batch_size iterations OR when df has no further rows
                if len(update_to_invalid_list) > 0:
                    invalids = pd.DataFrame(update_to_invalid_list)
                    invalids['cur_date'] = cur_date
                    db_functions.df_to_sql(invalids,
                                           "temp_invalids",
                                           drop='replace')
                    update_sql = """update n_users 
                    set lr = 'invalid', pol= 'invalid', lr_pol_last_analysed = temp_invalids.cur_Date
                    from temp_invalids 
                    where id = temp_invalids."0"
                    """
                    db_functions.update_table(update_sql)
                    db_functions.drop_table("temp_invalids")

                if helper_functions.dataframe_length(df_pred_data) > 0:
                    db_functions.df_to_sql(df_pred_data,
                                           table_name,
                                           drop='replace')
                    update_sql = f"""
                                 update n_users set lr = lr_text, lr_conf = cast (a.lr_conf as numeric), pol = pol_text,
                                 pol_conf = cast (a.pol_conf as numeric), lr_pol_last_analysed = analyse_date from {table_name} 
                                 a where id = cast(user_id as bigint)"""
                    db_functions.update_table(
                        update_sql)  # update n_users table with new results

                    print(f"Data written to table: {table_name}.")
        gc.collect()
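# Invocation sketch; the checkpoint path and SQL are assumptions. The SQL must return exactly
# two columns (user_id, screen_name), as checked at the top of the function:
model = init(r"outputs/political_bert/checkpoint-480000")
sql = "select id, screen_name from n_users where lr is null limit 100"
prediction_launcher("temp_pred_results", model, sql, write_to_db=True)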
Example #10
def combined_scores_calc_launcher(sql: str,
                                  bert_friends_high_confidence_capp_off,
                                  self_conf_high_conf_capp_off,
                                  min_required_bert_friend_opinions):
    """
    Calculates a combined score from a user's self-LR score and the user's bert_friend score.
    :return: None
    """
    # limit = 1000
    # sql = f"select id, screen_name, lr, lr_conf, result_bert_friends, bert_friends_conf, bf_left_number,
    # bf_right_number from n_users where lr is not null limit {limit}"
    # sql = f"select id, screen_name, lr, lr_conf, result_bert_friends, bert_friends_conf, bf_left_number,
    # bf_right_number from n_users where lr is not null or result_bert_friends is not null"
    df = db_functions.select_from_db(sql)
    df.fillna(0, inplace=True)

    count_rated_accounts = 0
    count_uncategorized_accounts = 0
    count_rating_less_accounts = 0
    count_to_few_bert_friends_to_rate_and_LRself_is_invalid = 0
    count_bert_friends_result_is_mediocre = 0

    id_list = df['id'].to_list()
    id_dict = {i: 0 for i in id_list}

    # TODO: Runtime 30 minutes. Change this to a dict-based loop (see the sketch after this function)
    for index, element in tqdm(df.iterrows(), total=df.shape[0]):
        result, rated_accounts, rating_less_accounts, to_few_bert_friends_to_rate_and_LRself_is_invalid, bert_friends_result_is_mediocre, uncategorized_accounts = calculate_combined_score(
            bert_friends_high_confidence_cap_off=
            bert_friends_high_confidence_capp_off,
            self_conf_high_conf_cap_off=self_conf_high_conf_capp_off,
            min_required_bert_friend_opinions=min_required_bert_friend_opinions,
            user_id=element[0],
            self_lr=element[2],
            self_lr_conf=element[3],
            bert_friends_lr=element[4],
            bert_friends_lr_conf=element[5],
            number_of_bert_friends_L=element[6],
            number_of_bert_friends_R=element[7],
            BERT_ML_rating=element[8],
            BERT_ML_conf=element[9])

        id_dict[element[0]] = result
        count_rated_accounts += rated_accounts
        count_rating_less_accounts += rating_less_accounts
        count_to_few_bert_friends_to_rate_and_LRself_is_invalid += to_few_bert_friends_to_rate_and_LRself_is_invalid
        count_bert_friends_result_is_mediocre += bert_friends_result_is_mediocre
        count_uncategorized_accounts += uncategorized_accounts

    print("\n\n")
    print(f"ratingless_accounts: {count_rating_less_accounts}")
    print(
        f"to_few_bert_friends_to_rate_and_LRself_is_invalid_or_unknown_or_of_low_conf: "
        f"{count_to_few_bert_friends_to_rate_and_LRself_is_invalid}")
    print(
        f"bert_friends_result_is_medicore: {count_bert_friends_result_is_mediocre}"
    )
    print(f"uncategorized_accounts: {count_uncategorized_accounts}")
    total_rating_less_accounts = count_rating_less_accounts + \
                                 count_to_few_bert_friends_to_rate_and_LRself_is_invalid + \
                                 count_bert_friends_result_is_mediocre + \
                                 count_uncategorized_accounts
    print(f"\nAccounts without rating: {total_rating_less_accounts}")
    print(f"Rated accounts: {count_rated_accounts}")
    print("\n\n")
    print("Calculation done. Writing results to DB.")

    del df

    id_dict = {k: v for k, v in id_dict.items() if v != 0}
    df_result = pd.DataFrame(id_dict).transpose()
    df_result = df_result.replace('null', np.NaN)
    if len(df_result) == 0:
        print("Now new data.")
    else:
        print(f"{len(df_result)} new results found.")
        db_functions.df_to_sql(df_result, "temp_table", drop='replace')
        update_sql = 'update n_users set combined_rating = t."0", combined_conf = cast(t."1" as numeric) from temp_table t where n_users.id = t.index'
        db_functions.update_table(update_sql)  # runtime 8 minutes
        db_functions.drop_table("temp_table")
Example #11
def friend_rating_launcher(sql: str, get_data_from_DB: bool) -> None:
    """Refreshes the score for all users in the DB who...
     1) have a BERT LR rating and
     2) follow someone in n_followers
    Writes the result to table n_users (total runtime 87 min).
    """
    timestamp = db_functions.staging_timestamp()
    start_time = time.time()

    if get_data_from_DB is True:
        # Runtime 18 min
        # Bert_Friends: users to be rated and the scores of their friends
        df = db_functions.select_from_db(sql)
        db_functions.save_pickle(df, "bert_friends.pkl")
    else:
        df = db_functions.load_pickle("bert_friends.pkl")
    # df = df.iloc[:50000,:]
    df_sub0 = df.groupby(['follows_ids',
                          'bert_self']).size().unstack(fill_value=0)
    df_sub1 = df.groupby(['follows_ids',
                          'bert_friends']).size().unstack(fill_value=0)
    del df
    result = df_sub1.join(df_sub0,
                          lsuffix='_friend_Bert',
                          rsuffix='_self_Bert')
    del df_sub0
    del df_sub1
    user_list = result.index.to_list()
    try:
        left_friend_Bert_list = result['links_friend_Bert'].to_list()
        right_friend_Bert_list = result['rechts_friend_Bert'].to_list()
    except KeyError:
        # both lists were used below even when the lookup failed, which raised a NameError
        print(
            "Warning: 0 results. Staging results possibly not copied to facts_hashtags."
        )
        return
    del result

    user_dict = {}
    for i, user in enumerate(tqdm(user_list)):
        if user not in user_dict:
            user_dict[user] = {}
        right = right_friend_Bert_list[i]
        left = left_friend_Bert_list[i]
        text, conf = helper_functions.conf_value(
            method='LR',
            prediction_result=[[left, right]],
            min_boundary=0,
            max_boundary=left + right)
        user_dict[user]["text"] = text
        user_dict[user]["confidence"] = conf
        user_dict[user]["last_seen"] = timestamp
        user_dict[user]["bf_left_number"] = left
        user_dict[user]["bf_right_number"] = right

    print("User dict erstellt.")
    print(len(user_dict))
    result = pd.DataFrame(user_dict).T
    print("DF transponiert.")
    db_functions.df_to_sql(result, "temp_result", drop='replace')
    print("Insert into temp done.")
    sql = "update n_users set result_bert_friends = text, bert_friends_conf = cast(confidence as numeric), " \
          "bert_friends_last_seen = temp_result.last_seen, bf_left_number = temp_result.bf_left_number, " \
          "bf_right_number = temp_result.bf_right_number from temp_result where id = cast (temp_result.index as bigint)"
    db_functions.update_table(sql)
    db_functions.drop_table("temp_result")
    print(f"Runtime in  min: {(time.time() - start_time) / 60} ")
Example #12
def get_followers(sql,
                  sql_insert_new_followers,
                  download_limit=12500000,
                  time_limit=False) -> None:
    """
    Takes user IDs in the form of an SQL statement OR as a list and retrieves their followers from Twitter.
    :param sql: SQL statement that delivers the users OR a list of user IDs
    :param sql_insert_new_followers: SQL statement used to write results from temp_table to n_followers
    :param download_limit: max follower download for each account
    :return: none
    """
    # Block 1: Check Twitter limits
    start_time = time.time()
    # timeout = 86400
    timeout = 3600  # run one iteration or one hour
    limit = TwitterAPI.api_limit()
    ts = limit['resources']['followers']['/followers/ids']['reset']
    limit = limit['resources']['followers']['/followers/ids'][
        'remaining']  # gets the remaining limit for follower retrieval
    print("Reset Time: " +
          str(datetime.utcfromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S')))
    if limit == 0:
        print('Twitter API limit used up. Aborting query.')
        print("Reset Time: " +
              str(datetime.utcfromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S')))
        sys.exit()
    else:
        print("Current API Follower Limit: " + str(limit))

    # Block 2: Get users whose followers we want from DB or list
    if isinstance(sql, str):
        df = db_functions.select_from_db(sql)
    elif isinstance(sql, list):
        df = pd.DataFrame(sql, columns=['id'])
    else:
        print("Error: Must either use SQL statement or List")
        sys.exit()
    # Block 3: Get followers for each of the users retrieved in Block 2
    for index, element in tqdm(df.iterrows(), total=df.shape[0]):
        try:
            # this option works if we load followers for cores
            id = element['user_id']
            screen_name = element['screen_name']
        except KeyError:
            # this setting is used if we load follower for anything but cores
            id = element['id']
            screen_name = 0

        #Query Twitter API to find out how many followers a user has
        try:
            scr_name, followers_count = TwitterAPI.API_get_single_user_object(
                id)
        # except TypeError:  # happened once; did not reoccur when trying to investigate
        #     print("STOPP!")
        except ValueError:
            print(f"User {id} not found. Skipping.")
            continue
        if followers_count >= download_limit:
            continue
        print("Getting Followers of " + str(id) + " | Element " +
              str(index + 1) + " of " + str(len(df)))
        TwitterAPI.API_Followers(
            screen_name, id,
            download_limit=download_limit)  # <== API follower retrieval
        db_functions.update_table(sql_insert_new_followers)

        # Block 4: Write the follower retrieval date back to n_cores
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        if screen_name != 0:
            sql = f"update n_cores set followers_retrieved = '{timestamp}' where user_id = {id}"
            db_functions.update_table(sql)
        if time.time() - start_time > timeout:  # the original subtraction was inverted and could never fire
            print("###Timeout reached!###")
            break
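# Invocation sketch; both statements are assumptions modeled on the docstring examples:
sql = ("select distinct id from n_users where combined_rating is not null "
       "except select distinct user_id from n_followers")
sql_insert_new_followers = "insert into n_followers select * from temp_followers"  # assumed
get_followers(sql, sql_insert_new_followers, download_limit=100000)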
Example #13
        bert_friends_high_confidence_cap_off = config[
            'CALCULATE_BERT_FRIEND_RATING'].getfloat(
                'bert_friends_high_confidence_cap_off')
        self_conf_high_conf_cap_off = config[
            'CALCULATE_BERT_FRIEND_RATING'].getfloat(
                'self_conf_high_conf_cap_off')
        sql_combined_scores = config['CALCULATE_BERT_FRIEND_RATING'].get(
            'sql_combined_scores')
        combined_scores_calc_launcher(sql_combined_scores,
                                      bert_friends_high_confidence_cap_off,
                                      self_conf_high_conf_cap_off,
                                      int(min_required_bert_friend_opinions))

        sql_new_results = config['CALCULATE_BERT_FRIEND_RATING'].get(
            'sql_new_results')
        sql_new_results = sql_new_results.replace("INSERT_HASHTAG",
                                                  "%" + hashtag + "%")
        print(db_functions.select_from_db(sql_new_results))

    if config['TASKS'].getboolean('calculate_ML_friend_rating'):
        bulk_size = config['CALCULATE_ML_FRIEND_RATING'].get('bulk_size')
        cool_down = config['CALCULATE_ML_FRIEND_RATING'].get('cool_down')
        combined_conf_cap_off = config['CALCULATE_ML_FRIEND_RATING'].get(
            'combined_conf_cap_off')
        sql = config['CALCULATE_ML_FRIEND_RATING'].get('sql')
        sql = sql.replace("INSERT_bulk_size", bulk_size)
        sql = sql.replace("INSERT_HASHTAG", "%" + hashtag + "%")
        sql = sql.replace("INSERT_combined_conf_cap_off",
                          combined_conf_cap_off)
        clf_path = config['CALCULATE_ML_FRIEND_RATING'].get('clf_path')
        column_list_path = config['CALCULATE_ML_FRIEND_RATING'].get(
            'column_list_path')
        min_matches = config['CALCULATE_ML_FRIEND_RATING'].getint(
            'min_matches')
Example #14
def tweet_details_download_launcher(table_name: str,
                                    hashtag: str,
                                    bulk_size: int = 1000,
                                    download_parent_tweets=True):
    """
    1. Calls the tweet downloader
    2. Adds details to tweet IDs in the staging table via update
    3. If tweets are replies to other tweets, those parent tweets can be downloaded via option download_parent_tweets
    :param table_name: staging table name which will be updated
    :param hashtag: hashtag scraped
    :param bulk_size: number of tweets to be processed in one function call
    :param download_parent_tweets: if True, loops through all tweets until no further parent tweets can be found
    :return: number of new tweets fetched (0 if none)
    """

    parent_sql = f"""select * from {table_name} where tweet is null 
    or (retweet is not null and retweet not in (select id from {table_name})) limit {bulk_size}"""
    df_parent = db_functions.select_from_db(parent_sql)
    df_parent['id'] = df_parent[
        'retweet']  # replaces the tweet ID with the parent ID; otherwise previously downloaded tweets would be downloaded again

    sql = f"select * from {table_name} where tweet is null limit {bulk_size}"
    df = db_functions.select_from_db(sql)

    download_parents = (helper_functions.dataframe_length(df) == 0
                        and download_parent_tweets is True)
    if download_parents:
        df = df_parent
    for index, element in df.iterrows():
        error = False
        # downloads details for Tweets from staging table
        result = API_get_tweet_details(element[1], sleep=True)
        if result == 'Error: Not authorized':  #Tweet is set to private
            error = True
        if result == 'Error: No status found with that ID.' or result == "Undefined Error":
            #sql = f"update {table_name} set tweet = 'deleted' where id = {element[1]}"
            #db_functions.update_table(sql)
            df.loc[df.id == element[1], 'tweet'] = 'deleted'  # .loc avoids pandas' chained-assignment pitfall
            error = True
        if error is False:
            df.iloc[index:index + 1, 4:5] = result[1]  # date
            df.iloc[index:index + 1, 5:6] = result[2]  # tweet
            df.iloc[index:index + 1, 6:7] = hashtag  # hashtag
            df.iloc[index:index + 1, 7:8] = result[0]  # user_id
            df.iloc[index:index + 1, 8:9] = result[4]  # screen_name
            df.iloc[index:index + 1, 9:10] = result[3]  # name
            df.iloc[index:index + 1, 11:12] = result[5]  # in reply to tweet id
            df.iloc[index:index + 1, 18:19] = table_name
            #print ("Fetched Tweet: {}".format(element[1]))

    if len(df) == 0:
        return 0
    db_functions.df_to_sql(df, 'temp_df', 'replace')
    new_tweets_fetched = helper_functions.dataframe_length(df)
    # update staging table with values from temp_df
    if not download_parents:
        sql = f"""update {table_name} a set date = b.date, tweet = b.tweet, hashtags = b.hashtags,
              user_id = cast (b.user_id as bigint), username = b.username, name = b.name,
              retweet = cast (b.retweet as bigint), staging_name = b.staging_name from (select * from temp_df) b
              where a.id = b.id"""
    else:
        sql = f"""INSERT INTO {table_name} 
        SELECT index::bigint, id, conversation_id::bigint, created_at, date, tweet, hashtags, user_id, username, name,
        link::bigint, retweet::bigint, nlikes::bigint, nreplies::bigint, nretweets::bigint, quote_url::bigint, user_rt_id::bigint,
        user_rt::bigint, staging_name FROM temp_df"""
        print(f"{new_tweets_fetched} parent tweets added.")
    db_functions.update_table(sql)
    db_functions.drop_table('temp_df')
    return new_tweets_fetched
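# Usage sketch: call in a loop until every staged tweet is hydrated (table and hashtag names
# are assumptions):
while True:
    fetched = tweet_details_download_launcher("s_h_umweltsau_20201104_1540", "umweltsau",
                                              bulk_size=1000)
    if fetched == 0:
        break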
Example #15
def sql_timer(sql):
    """Runs the given SQL statement and prints its runtime in seconds."""
    start = time.time()
    db_functions.select_from_db(sql)
    print(f"Runtime: {time.time() - start}")
Example #16
def topic_model_wordcloud(sql_raw):
    """
    Builds a one-topic LDA model per stance ('links'/'rechts') and plots a word cloud for each.
    :param sql_raw: SQL statement containing the placeholder STANCE_REPLACE
    """
    plt.figure()
    stance = ['links', 'rechts']
    for stance_index, stance_element in enumerate(stance):
        sql = sql_raw.replace('STANCE_REPLACE', stance_element)
        df = db_functions.select_from_db(sql)
        df.dropna(inplace=True)

        # vectorizer = CountVectorizer()
        # X = vectorizer.fit_transform(df['tweet'].tolist())
        # vectorizer.get_feature_names()

        tokens_input = df['tweet'].tolist()
        tokens_output = []
        for element in tokens_input:
            element = element.replace('"', "")
            element = element.replace('#', "")
            tokens_output.append(element.split())
        tokens = tokens_output
        #tokens = preprocess_documents(df['tweet'].tolist())
        stop_words = set(stopwords.words('german'))

        stop_words.update([
            'http', 'Der', 'Die', 'Das', 'mal', 'Was', 'Und', 'Wir', 'Aber',
            'Ja', 'ja', 'Sie', 'schon', 'Ich'
        ])

        # Add upper- and title-case variants (the original list comprehensions discarded their results)
        stop_words |= {x.upper() for x in stop_words}
        stop_words |= {x.title() for x in stop_words}

        # Stopword removal
        for index, element in enumerate(tokens):
            tokens[index] = [
                word for word in element if word not in stop_words
            ]

        dictionary = corpora.Dictionary(tokens)
        corpus = [dictionary.doc2bow(text) for text in tokens]

        tfidf = models.TfidfModel(corpus)
        transformed_tfidf = tfidf[corpus]
        LDA_model = models.LdaMulticore(transformed_tfidf,
                                        num_topics=1,
                                        id2word=dictionary)

        plt.subplot(1, 2, stance_index + 1)
        plt.title(stance_element)
        for t in range(LDA_model.num_topics):
            plt.imshow(
                WordCloud(random_state=42, min_word_length=3).fit_words(
                    dict(LDA_model.show_topic(t, 20))))
            #plt.axis("off")
    # pyLDAvis.enable_notebook()
    # vis = pyLDAvis.gensim.prepare(LDA_model, corpus, dictionary)
    # pyLDAvis.show(vis)
    plt.show()
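# Usage sketch; table and column names are assumptions. STANCE_REPLACE is substituted with
# 'links' and 'rechts' inside the function:
sql_raw = "select tweet from eval_table where lr = 'STANCE_REPLACE' limit 10000"
topic_model_wordcloud(sql_raw)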