Ejemplo n.º 1
0
def SN_db_operations(hashtag: str, since: str, until: str) -> tuple:
    """
    1. Creates staging table for new hashtag
    2. Calls tweet ID downloader
    3. Saves tweet IDs to DB
    Downloads TweetIDs of Hashtags via snScrapeTweet
    :param hashtag: hashtag, to be downloaded
    :param since: start date for tweet download (string)
    :param until: until date for tweet download (string)
    :return: table_name, hashtag,  dataframe length
    """

    table_name = db_functions.get_staging_table_name(
        hashtag)  #adds prefix (s_h_) and suffix (dateand time) to tablename
    db_functions.drop_table(table_name)
    db_functions.create_empty_staging_table(table_name)
    tweet_ids = SN_get_tweet_ids_for_hashtag(since, until,
                                             hashtag)  #downloads tweets
    if len(tweet_ids) == 0:
        print("####################################################")
        print("####Warning! No Tweets fetched! Process aborted!####")
        print("####################################################")
        sys.exit()
    df = pd.DataFrame(tweet_ids)
    write_to_table = "update_temp"

    db_functions.df_to_sql(df, write_to_table,
                           drop="replace")  #Write Tweet ID to temp table
    #db_functions.update_table('insert into ' + str(table_name) + ' (id) select cast ("0" as bigint) from update_temp') #insert temp table content into staging table
    db_functions.update_table(
        f'insert into {table_name} (id) select cast ("0" as bigint) from {write_to_table}'
    )  # insert temp table content into staging table

    try:
        print(f"Table {table_name} created with {len(df)} tweets.")
    except:
        print("Unhandled Error")
    db_functions.drop_table(write_to_table)
    return table_name, hashtag, df.shape[0]
Ejemplo n.º 2
0
def hashtag_download_launcher(hashtag, since: str, until: str,
                              download_parent_tweets: bool):
    """
    Manages whole hashtag download process.
    Step1: Calls procedures for SQL table creation, Tweet ID download
    Step2: Calls Tweet Details download launcher to add tweet details. Does this in iterations to minimize data loss risk
    Result is stored in staging table s_h_HASHTAG_TIMESTAMP
    :param hashtag: hashtag, to be downloaded
    :param since: start date for tweet download (string)
    :param until: until date for tweet download (string)
    :return: table_name
    """
    table_name, hashtag, len_df = SN_db_operations(hashtag, since, until)
    print("Hashtag Twitter ID download complete. Starting detail download.")
    bulk_size = 1000

    new_tweets_fetched = 1
    loop_counter = 1
    while new_tweets_fetched != 0:
        start_time = time.time()
        print(
            f"Iteration {loop_counter} running. Estimated Iterations {int (len_df / bulk_size) +1+5}"
        )
        #+1 to avoid 0 itteratios, +5 is a good estimate for number of iterations to get all parent tweets
        new_tweets_fetched = TwitterAPI.tweet_details_download_launcher(
            table_name, hashtag, bulk_size, download_parent_tweets)
        loop_counter += 1
        print(f"Iteration {loop_counter} runtime: {time.time() - start_time} ")
    print("Hashtag downloaded successfully.")

    #insert new users into table n_users
    new_users = f"""insert into n_users (id)
    select f.user_id from {table_name} f  left join n_users u on f.user_id = u.id
    where u.id is null and f.user_id is not null"""
    db_functions.update_table(new_users)
    return table_name
def inference_bert_friends(classifier, column_list: list, sql: str,
                           min_matches: int):
    """
    Performs inference based on users an account follows. Stores result to n_users
    :param classifier: Classifier
    :param friend_column_list_path: Column list to be used. Any Users friends are matched against this column list
    :param sql: Sql with combination of User to be inferendes, their label (combined rating) and their friend ID
    :param min_matches: Minimum friends that must be found in friend_column for the user to get a prediction.
    More connections = more accurate prediction result
    :return:
    """
    start = time.time()
    friends = db_functions.select_from_db(sql)
    input_dataset_length = len(friends)
    print(f"SQL fetching time: {time.time() - start}")

    if len(friends) == 0:
        rows_processed = 0
        return rows_processed

    friend_set = set(friends['follows_ids'].values.tolist())
    friend_list = friends['follows_ids'].values.tolist()
    user_list = friends['user_id'].values.tolist()
    rating_list = friends['combined_rating'].values.tolist()
    del friends

    #Transforms DataFrame into DefaultDict
    relationship_dict = defaultdict(lambda: defaultdict(list))
    for i, element in enumerate(friend_list):
        relationship_dict[element][0].append(user_list[i])
        relationship_dict[element][1].append(rating_list[i])

    conditions_not_met_list = [
    ]  # Ids in this list will still get a last seen date in DB to ignore them during next loop
    result_dict = {}
    #Iteration though friend_set and prediction of faction
    for element in tqdm(friend_set):
        common_friends = set(relationship_dict[element][0]) & set(column_list)
        number_of_common_friends = len(common_friends)
        if number_of_common_friends >= min_matches:
            df = pd.DataFrame(index=column_list).T
            df = df.append(pd.Series(), ignore_index=True).fillna(0)
            for friend in relationship_dict[element][0]:
                df.loc[:, friend] = 1
            df = df.iloc[:, :len(column_list)]
            prediction_proba = classifier.predict_proba(
                df.values.tolist())  # pure predict

            text, conf = helper_functions.conf_value("LR",
                                                     prediction_proba,
                                                     min_boundary=0.5,
                                                     max_boundary=1)
            result_dict[element] = [text, conf, number_of_common_friends]
        else:
            conditions_not_met_list.append(element)

    del friend_set
    del friend_list
    del user_list
    del rating_list

    timestamp = db_functions.staging_timestamp()
    result_df = pd.DataFrame(result_dict).T
    result_df['last_seen'] = timestamp
    rows_processed = len(result_df)
    if rows_processed > 0:  #checks if data has been written to DF
        db_functions.df_to_sql(result_df, "temp_table", "replace")
        update_sql = """update n_users
        set bert_friends_ml_result = "0",
        bert_friends_ml_conf = cast("1" as numeric),
        bert_friends_ml_count = cast ("2" as integer),
        bert_friends_ml_last_seen = temp_table.last_seen
        from temp_table where
        cast (id as text) = temp_table."index"
        """
        start = time.time()
        db_functions.update_table(update_sql)
        db_functions.drop_table("temp_table")
        print(f"Update Time: {time.time() - start}")
    else:
        print(
            f"WARNING: 0 new ratings generated despite an input dataset of {input_dataset_length} rows."
        )

    if len(conditions_not_met_list) > 0:
        stamps = [timestamp for elm in conditions_not_met_list]
        ziped = list(zip(conditions_not_met_list, stamps))
        db_functions.df_to_sql(pd.DataFrame(ziped),
                               "temp_table",
                               drop='replace')
        sql = 'update n_users set bert_friends_ml_last_seen = temp_table."1" from temp_table where n_users.id::text = temp_table."0"'
        db_functions.update_table(sql)
        #db_functions.drop_table('temp_table')

    return rows_processed
Ejemplo n.º 4
0
    lr_pol_last_analysed text COLLATE pg_catalog."default",
    result_bert_friends text COLLATE pg_catalog."default",
    bert_friends_conf numeric,
    bert_friends_last_seen text COLLATE pg_catalog."default",
    bf_left_number integer,
    bf_right_number integer,
    combined_rating text COLLATE pg_catalog."default",
    combined_conf numeric,
    private_profile boolean,
    batch integer,
    bert_friends_ml_result text COLLATE pg_catalog."default",
    bert_friends_ml_conf numeric,
    bert_friends_ml_count integer,
    bert_friends_ml_last_seen text COLLATE pg_catalog."default"
)"""
db_functions.update_table(sql_table)

# Create table n_followers
table_name = "n_followers"
sql_table = f"""
CREATE TABLE public.{table_name}
(
    index integer NOT NULL DEFAULT nextval('n_likes_index_seq'::regclass),
    username text COLLATE pg_catalog."default",
    user_id bigint,
    follows_users text COLLATE pg_catalog."default",
    follows_ids text COLLATE pg_catalog."default",
    retrieve_date text COLLATE pg_catalog."default"
)
"""
db_functions.update_table(sql_table)
Ejemplo n.º 5
0
def prediction_launcher(table_name: str,
                        BERT_model,
                        sql: str,
                        write_to_db: bool = True,
                        TFIDF_pol_unpol_conv=0,
                        Algo_pol_unpol=0):
    """
    Loads 200 Tweets (one page call) per users and sends them to BERT for inference
    :param table_name: Name of temp table used to store results
    :param BERT_model: BERT Model
    :param sql: Statement providing Users to be inferenced
    :param write_to_db: True or False
    :param TFIDF_pol_unpol_conv: tfidf converter (optional)
    :param Algo_pol_unpol: Random Forrest classifier (optional)
    :return:
    """
    start_time_overal = time.time()
    cur_date = str(date.today())  # date for last seen columns
    #methods = ['pol', 'LR']
    methods = ['LR']
    data = []
    update_to_invalid_list = []

    # This DF will store all precditions results
    df_pred_data = pd.DataFrame(data,
                                columns=[
                                    'user_id', 'screen_name', 'pol', 'unpol',
                                    'pol_text', 'pol_conf', 'pol_time', 'left',
                                    'right', 'lr_text', 'lr_conf', 'lr_time',
                                    'analyse_date'
                                ])

    sql_time_start = time.time()
    df = db_functions.select_from_db(sql)
    print(
        f"##############  --- SQL Select time : {sql_time_start - time.time()} --- #############"
    )
    if df.shape[1] != 2:
        print("ERROR: DF must ONLY have columns user_id and username")
        gc.collect()
        sys.exit()
    gc.collect()

    for index, element in tqdm(df.iterrows(), total=df.shape[0]):
        start_time = time.time()
        user_id = element[0]
        screen_name = element[1]

        def tweet_download_and_lang_detect(df_tweets, user_id,
                                           update_to_invalid_list):
            """
            Calls language detection and checks if enough german tweets remain.
            If it found almost enough german Tweets it will load more.
            If it found almost none it will abort.
            :param df_tweets: 0 during first run, dataframe with tweets during later runs
            :param user_id: Twitter User_ID for tweet download and language check
            :param update_to_invalid_list: List of user that can not be downloaded from. Will append to if applicable.
            :return: df_tweets, update_to_invalid_list, abort_loop, len_df
            """
            if isinstance(df_tweets, int):
                df_tweets = TwitterAPI.API_tweet_multitool(
                    user_id,
                    'temp',
                    pages=1,
                    method='user_timeline',
                    append=False,
                    write_to_db=False)  # fills DF with 200 tweets of 1 page
                df_tweets = helper_functions.lang_detect(df_tweets)
            else:
                df_tweets_additions = TwitterAPI.API_tweet_multitool(
                    user_id,
                    'temp',
                    pages=1,
                    method='user_timeline',
                    append=False,
                    write_to_db=False)  # fills DF with 200 tweets of 1 page
                df_tweets_additions = helper_functions.lang_detect(
                    df_tweets_additions)
                if isinstance(df_tweets_additions, pd.DataFrame):
                    df_tweets = pd.concat([df_tweets, df_tweets_additions])
                    df_tweets.reset_index(inplace=True)
                    del df_tweets['index']

            # if df_tweets is None: #no tweets found or all tweets deleted (non german)
            #     abort_loop = True
            #     return df_tweets, update_to_invalid_list, abort_loop'

            len_df = helper_functions.dataframe_length(df_tweets)
            if len_df <= 50:
                # if almost no tweets are german don't try to get more german tweets from this users.
                # would take to many page loads
                update_to_invalid_list.append(user_id)
                abort_loop = True
            elif len_df >= 200:
                abort_loop = True
            else:
                # if to few tweets are german load more tweets to get a better result
                abort_loop = False
            gc.collect()
            return df_tweets, update_to_invalid_list, abort_loop, len_df

        df_tweets = 0
        # tries two times to get at least 200 german tweets, if first attempt returns less than 150 german tweets
        for i in range(2):
            df_tweets, update_to_invalid_list, abort_loop, len_df = tweet_download_and_lang_detect(
                df_tweets, user_id, update_to_invalid_list)
            if abort_loop == True:
                break

        if len_df > 0:
            for method in methods:
                prediction_result = []
                if method == 'pol':
                    if TFIDF_pol_unpol_conv == 0 or Algo_pol_unpol == 0:
                        print(
                            "Warning: No Political/Unpolitical classifier given. Check function parameters."
                        )
                    else:
                        prediction_result.append(
                            TFIDF_inference.TFIDF_inference(
                                df_tweets['tweet'], TFIDF_pol_unpol_conv,
                                Algo_pol_unpol))
                if method == 'LR':
                    #prediction_result.append(inference_political_bert.bert_predictions(df_tweets['tweet'], BERT_model))
                    prediction_result.append(
                        bert_predictions(df_tweets['tweet'], BERT_model))
                runtime = int(time.time() - start_time)

                # returns text interpretation of inference
                text, conf = helper_functions.conf_value(
                    method, prediction_result, max_boundary=len(df_tweets))

                # result and confidence score
                df_pred_data.at[index, 'user_id'] = user_id
                df_pred_data.at[index, 'screen_name'] = screen_name
                df_pred_data.at[index, 'analyse_date'] = cur_date

                # TODO: If you store the column names in variables that update depending on the method, you only need
                #  one block
                pred_result_zero = 'left' if method == "LR" else 'pol'
                pred_result_one = 'right' if method == "LR" else 'unpol'
                df_pred_data.at[index,
                                pred_result_zero] = prediction_result[0][0]
                df_pred_data.at[index,
                                pred_result_one] = prediction_result[0][1]

                if method == "LR":
                    df_pred_data.at[index, 'left'] = prediction_result[0][0]
                    df_pred_data.at[index, 'right'] = prediction_result[0][1]
                    df_pred_data.at[index, 'lr_text'] = text
                    df_pred_data.at[index, 'lr_conf'] = conf
                    df_pred_data.at[index, 'lr_time'] = runtime
                else:
                    df_pred_data.at[index, 'pol'] = prediction_result[0][0]
                    df_pred_data.at[index, 'unpol'] = prediction_result[0][1]
                    df_pred_data.at[index, 'pol_text'] = text
                    df_pred_data.at[index, 'pol_conf'] = conf
                    df_pred_data.at[index, 'pol_time'] = runtime

            # print("screen_name,Pol,Unpol,Pol_Time,Left,Right,LR_Time")
            # for index, element in df_pred_data.iterrows():
            #     print(
            #         f"{element['user_id']},{element['screen_name']},{element['pol']}"
            #         f",{element['unpol']},{element['pol_time']},{element['left']},{element['right']},{element['lr_time']}")
            # print("\n")
            # if index == 6:
            #     print ("Stopp")
            if (write_to_db is True and index != 0
                    and index % batch_size == 0) or (
                        write_to_db is True and (df.shape[0]) == (index + 1)
                    ):  #saved data x iterations OR when df has no further rows
                if len(update_to_invalid_list) > 0:
                    invalids = pd.DataFrame(update_to_invalid_list)
                    invalids['cur_date'] = cur_date
                    db_functions.df_to_sql(invalids,
                                           "temp_invalids",
                                           drop='replace')
                    update_sql = """update n_users 
                    set lr = 'invalid', pol= 'invalid', lr_pol_last_analysed = temp_invalids.cur_Date
                    from temp_invalids 
                    where id = temp_invalids."0"
                    """
                    db_functions.update_table(update_sql)
                    db_functions.drop_table("temp_invalids")

                if helper_functions.dataframe_length(df_pred_data) > 0:
                    db_functions.df_to_sql(df_pred_data,
                                           table_name,
                                           drop='replace')
                    update_sql = f"""
                                 update n_users set lr = lr_text, lr_conf = cast (a.lr_conf as numeric), pol = pol_text,
                                 pol_conf = cast (a.pol_conf as numeric), lr_pol_last_analysed = analyse_date from {table_name} 
                                 a where id = cast(user_id as bigint)"""
                    db_functions.update_table(
                        update_sql)  # update n_users table with new resulsts

                    print(f"Data written to table: {table_name}.")
        gc.collect()
Ejemplo n.º 6
0
def combined_scores_calc_launcher(sql: str,
                                  bert_friends_high_confidence_capp_off,
                                  self_conf_high_conf_capp_off,
                                  min_required_bert_friend_opinions):
    """
    Calculates combined score from users self-LR score and users bert_friend score
    :return:
    """
    # limit = 1000
    # sql = f"select id, screen_name, lr, lr_conf, result_bert_friends, bert_friends_conf, bf_left_number,
    # bf_right_number from n_users where lr is not null limit {limit}"
    # sql = f"select id, screen_name, lr, lr_conf, result_bert_friends, bert_friends_conf, bf_left_number,
    # bf_right_number from n_users where lr is not null or result_bert_friends is not null"
    df = db_functions.select_from_db(sql)
    df.fillna(0, inplace=True)

    count_rated_accounts = 0
    count_uncategorized_accounts = 0
    count_rating_less_accounts = 0
    count_to_few_bert_friends_to_rate_and_LRself_is_invalid = 0
    count_bert_friends_result_is_mediocre = 0

    id_list = df['id'].to_list()
    id_dict = {i: 0 for i in id_list}

    # ToDo: Runtime 30 minutes. Changes to Dict
    for index, element in tqdm(df.iterrows(), total=df.shape[0]):
        result, rated_accounts, rating_less_accounts, to_few_bert_friends_to_rate_and_LRself_is_invalid, bert_friends_result_is_mediocre, uncategorized_accounts = calculate_combined_score(
            bert_friends_high_confidence_cap_off=
            bert_friends_high_confidence_capp_off,
            self_conf_high_conf_cap_off=self_conf_high_conf_capp_off,
            min_required_bert_friend_opinions=min_required_bert_friend_opinions,
            user_id=element[0],
            self_lr=element[2],
            self_lr_conf=element[3],
            bert_friends_lr=element[4],
            bert_friends_lr_conf=element[5],
            number_of_bert_friends_L=element[6],
            number_of_bert_friends_R=element[7],
            BERT_ML_rating=element[8],
            BERT_ML_conf=element[9])

        id_dict[element[0]] = result
        count_rated_accounts += rated_accounts
        count_rating_less_accounts += rating_less_accounts
        count_to_few_bert_friends_to_rate_and_LRself_is_invalid += to_few_bert_friends_to_rate_and_LRself_is_invalid
        count_bert_friends_result_is_mediocre += bert_friends_result_is_mediocre
        count_uncategorized_accounts += uncategorized_accounts

    print("\n\n")
    print(f"ratingless_accounts: {count_rating_less_accounts}")
    print(
        f"to_few_bert_friends_to_rate_and_LRself_is_invalid_or_unknown_or_of_low_conf: "
        f"{count_to_few_bert_friends_to_rate_and_LRself_is_invalid}")
    print(
        f"bert_friends_result_is_medicore: {count_bert_friends_result_is_mediocre}"
    )
    print(f"uncategorized_accounts: {count_uncategorized_accounts}")
    total_rating_less_accounts = count_rating_less_accounts + \
                                 count_to_few_bert_friends_to_rate_and_LRself_is_invalid + \
                                 count_bert_friends_result_is_mediocre + \
                                 count_uncategorized_accounts
    print(f"\nAccounts without rating: {total_rating_less_accounts}")
    print(f"Rated accounts: {count_rated_accounts}")
    print("\n\n")
    print("Calculation done. Writing results to DB.")

    del df

    id_dict = {k: v for k, v in id_dict.items() if v != 0}
    df_result = pd.DataFrame(id_dict).transpose()
    df_result = df_result.replace('null', np.NaN)
    if len(df_result) == 0:
        print("Now new data.")
    else:
        print(f"{len(df_result)} new results found.")
        db_functions.df_to_sql(df_result, "temp_table", drop='replace')
        update_sql = 'update n_users set combined_rating = t."0", combined_conf = cast(t."1" as numeric) from temp_table t where n_users.id = t.index'
        db_functions.update_table(update_sql)  # runtime 8 minutes
        db_functions.drop_table("temp_table")
Ejemplo n.º 7
0
def friend_rating_launcher(sql: str, get_data_from_DB: bool) -> None:
    # def bert_friends_score(get_data_from_DB):
    """Refreshes score for all users in DB who...
     1) have a Bert LR rating and
     2) follow someone in n_followers
    Writes result to table n_users (total runtime 87 min)
    """
    timestamp = db_functions.staging_timestamp()
    start_time = time.time()

    if get_data_from_DB is True:
        # Runtime 18 min
        # --Bert_Friends: Zu bewertende User und die Scores ihrer Freunde
        df = db_functions.select_from_db(sql)
        db_functions.save_pickle(df, "bert_friends.pkl")
    else:
        df = db_functions.load_pickle("bert_friends.pkl")
    # df = df.iloc[:50000,:]
    df_sub0 = df.groupby(['follows_ids',
                          'bert_self']).size().unstack(fill_value=0)
    df_sub1 = df.groupby(['follows_ids',
                          'bert_friends']).size().unstack(fill_value=0)
    del df
    result = df_sub1.join(df_sub0,
                          lsuffix='_friend_Bert',
                          rsuffix='_self_Bert')
    del df_sub0
    del df_sub1
    user_list = result.index.to_list()
    try:
        left_friend_Bert_list = result['links_friend_Bert'].to_list()
    except:
        print(
            "Warning: 0 Results. Staging results possibly not copied to facts_hashtags."
        )
    right_friend_Bert_list = result['rechts_friend_Bert'].to_list()
    del result

    user_dict = {}
    for i, user in enumerate(tqdm(user_list)):
        if user not in user_dict:
            user_dict[user] = {}
        right = right_friend_Bert_list[i]
        left = left_friend_Bert_list[i]
        text, conf = helper_functions.conf_value(
            method='LR',
            prediction_result=[[left, right]],
            min_boundary=0,
            max_boundary=left + right)
        user_dict[user]["text"] = text
        user_dict[user]["confidence"] = conf
        user_dict[user]["last_seen"] = timestamp
        user_dict[user]["bf_left_number"] = left
        user_dict[user]["bf_right_number"] = right

    print("User dict erstellt.")
    print(len(user_dict))
    result = pd.DataFrame(user_dict).T
    print("DF transponiert.")
    db_functions.df_to_sql(result, "temp_result", drop='replace')
    print("Insert into temp done.")
    sql = "update n_users set result_bert_friends = text, bert_friends_conf = cast(confidence as numeric), " \
          "bert_friends_last_seen = temp_result.last_seen, bf_left_number = temp_result.bf_left_number, " \
          "bf_right_number = temp_result.bf_right_number from temp_result where id = cast (temp_result.index as bigint)"
    db_functions.update_table(sql)
    db_functions.drop_table("temp_result")
    print(f"Runtime in  min: {(time.time() - start_time) / 60} ")
Ejemplo n.º 8
0
def get_followers(sql,
                  sql_insert_new_followers,
                  download_limit=12500000,
                  time_limit=False) -> None:
    """
    Is given user ids in form of SQL statement OR as List. Will retrieve followers for this user from Twitter.
    :param sql: SQL statement containing followers OR list of users IDs
    :param download_limit: max follower download for each accounts.
    :param sql_insert_new_followers: Contains SQL statement used to write results from temp_table to n_followers
    :return: none
    """
    # Block 1: Check Twitter Limits
    startime = time.time()
    #timeout = 86400
    timeout = 3600  #run one iteration or one hour
    limit = TwitterAPI.api_limit()
    ts = limit['resources']['followers']['/followers/ids']['reset']
    limit = limit['resources']['followers']['/followers/ids'][
        'remaining']  # gets the remaining limit for follower retrival
    print("Reset Time: " +
          str(datetime.utcfromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S')))
    if limit == 0:
        print('Twitter API limit used up. Aborting query.')
        print("Reset Time: " +
              str(datetime.utcfromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S')))
        sys.exit()
    else:
        print("Current API Follower Limit: " + str(limit))

    # Block 2: Get users whose followers we want from DB or list
    if isinstance(sql, str):
        df = db_functions.select_from_db(sql)
    elif isinstance(sql, list):
        df = pd.DataFrame(sql, columns=['id'])
    else:
        print("Error: Must either use SQL statement or List")
        sys.exit()
    # Block 3: Get followers for each of the users retrieved in Block 2
    for index, element in tqdm(df.iterrows(), total=df.shape[0]):
        try:
            # this option works if we load followers for cores
            id = element['user_id']
            screen_name = element['screen_name']
        except KeyError:
            # this setting is used if we load follower for anything but cores
            id = element['id']
            screen_name = 0

        #Query Twitter API to find out how many followers a user has
        try:
            scr_name, followers_count = TwitterAPI.API_get_single_user_object(
                id)
        #except TypeError:
        #print ("STOPP!")
        #Did happen once. Did not happen again when trying to investigate
        except ValueError:
            print(f"User {id} not found. Skipping.")
            continue
        if followers_count >= download_limit:
            continue
        print("Getting Followers of " + str(id) + " | Element " +
              str(index + 1) + " of " + str(len(df)))
        TwitterAPI.API_Followers(
            screen_name, id,
            download_limit=download_limit)  # <== API follower retrieval
        db_functions.update_table(sql_insert_new_followers)

        # Block 4: Write follower retrieve date back to n_cores
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        if screen_name != 0:
            sql = "update n_cores set followers_retrieved =' " + str(
                timestamp) + "' where user_id = " + str(id)
            db_functions.update_table(sql)
        if startime - time.time() > timeout:
            print("###Timeout reached!###")
    return id
Ejemplo n.º 9
0
    if config['TASKS'].getboolean('download_hashtag'):
        since = config['DOWNLOAD_HASHTAG'].get('since')
        until = config['DOWNLOAD_HASHTAG'].get('until')
        download_parent_tweets = config['DOWNLOAD_HASHTAG'].getboolean(
            'download_parent_tweets')
        table_name = sn_scrape.hashtag_download_launcher(
            hashtag=hashtag,
            since=since,
            until=until,
            download_parent_tweets=download_parent_tweets)

        copy_sql = config['DOWNLOAD_HASHTAG'].get('copy_sql')
        copy_sql = copy_sql.replace("INSERT_HASHTAG", "s_h" + hashtag)
        copy_sql = copy_sql.replace("INSERT_TABLE", table_name)
        db_functions.update_table(copy_sql)

    if config['TASKS'].getboolean('download_friends'):
        sql = config['DOWNLOAD_FRIENDS'].get('sql')
        sql = sql.replace("INSERT_HASHTAG", "%" + hashtag + "%")
        get_friends(sql)

        #copy friends to n_followers table
        #This process makes an update of the downloaded followers (many millions) unnecessary since it gradealy adds
        #followers to Bog accounts using the followership lists of small accounts

        #Insert all friends of a hashtag into n_followers
        #Same effect as donwloading all the followers of many big accounts but much much gasters
        sql = """INSERT INTO n_followers (username, user_id, follows_users, follows_ids, retrieve_date)
        select follows_users, follows_ids::bigint , follows_users, f.user_id, retrieve_date 
        FROM n_friends f, facts_hashtags fh
Ejemplo n.º 10
0
def tweet_details_download_launcher(table_name: str,
                                    hashtag: str,
                                    bulk_size: int = 1000,
                                    download_parent_tweets=True):
    """
    1. Calls Tweet downloader
    2. Adds details to Tweet IDs in staging table via update
    3. If Tweets are are reply to another tweet, those parent tweets can be downloaded via option download_parent_tweets
    :param table_name: Staging table name which will be updated
    :param hashtag: hashtag scraped
    :param bulk_size: number of tweets to processed in one function call
    :param download_parent_tweets: If True: Loops through all tweets until no further father elements can be found
    :return: none
    """

    parent_sql = f"""select * from {table_name} where tweet is null 
    or (retweet is not null and retweet not in (select id from {table_name})) limit {bulk_size}"""
    df_parent = db_functions.select_from_db(parent_sql)
    df_parent['id'] = df_parent[
        'retweet']  #replaces tweet ID with parent id. Otherwise previously downloaded tweets would be downloaded again

    sql = f"select * from {table_name} where tweet is null limit {bulk_size}"
    df = db_functions.select_from_db(sql)

    download_parents = helper_functions.dataframe_length(
        df) == 0 and download_parent_tweets == True
    if download_parents == True:
        df = df_parent
    for index, element in df.iterrows():
        error = False
        # downloads details for Tweets from staging table
        result = API_get_tweet_details(element[1], sleep=True)
        if result == 'Error: Not authorized':  #Tweet is set to private
            error = True
        if result == 'Error: No status found with that ID.' or result == "Undefined Error":
            #sql = f"update {table_name} set tweet = 'deleted' where id = {element[1]}"
            #db_functions.update_table(sql)
            df.tweet[df.id == element[1]] = 'deleted'
            error = True
        if error is False:
            df.iloc[index:index + 1, 4:5] = result[1]  # date
            df.iloc[index:index + 1, 5:6] = result[2]  # tweet
            df.iloc[index:index + 1, 6:7] = hashtag  # hashtag
            df.iloc[index:index + 1, 7:8] = result[0]  # user_id
            df.iloc[index:index + 1, 8:9] = result[4]  # screen_name
            df.iloc[index:index + 1, 9:10] = result[3]  # name
            df.iloc[index:index + 1, 11:12] = result[5]  # in reply to tweet id
            df.iloc[index:index + 1, 18:19] = table_name
            #print ("Fetched Tweet: {}".format(element[1]))

    if len(df) == 0:
        return 0
    db_functions.df_to_sql(df, 'temp_df', 'replace')
    new_tweets_fetched = helper_functions.dataframe_length(df)
    # try:
    #     print (new_tweets_fetched)
    # except UnboundLocalError:
    #     print ("STOPP")
    #update staging table with values form temp_df
    if download_parents == False:
        sql = f"""update {table_name} a set date = b.date, tweet = b.tweet, hashtags = b.hashtags,
              user_id = cast (b.user_id as bigint), username = b.username, name = b.name,
              retweet = cast (b.retweet as bigint), staging_name = b.staging_name from (select * from temp_df) b
              where a.id = b.id"""
    else:
        sql = f"""INSERT INTO {table_name} 
        SELECT index::bigint, id, conversation_id::bigint, created_at, date, tweet, hashtags, user_id, username, name,
        link::bigint, retweet::bigint, nlikes::bigint, nreplies::bigint, nretweets::bigint, quote_url::bigint, user_rt_id::bigint,
        user_rt::bigint, staging_name FROM temp_df"""
        print(f"{new_tweets_fetched} parent tweets added.")
    db_functions.update_table(sql)
    db_functions.drop_table('temp_df')
    #new_tweets_fetched = helper_functions.dataframe_length(df)
    return new_tweets_fetched