Ejemplo n.º 1
0
def get_BERT_friends_scores_from_friends(
        sql: str, min_required_bert_friend_opinions: int):
    """get_followers
    Counts how many left or right friends a user has. Unlike seemingly similar functions (which download and analyse
    tweets), this one gets user profiles from DB as DF.

    For users that need special attention because they did not get a combined score due to bad BERT_friend score.
    Works only if a users friends are available in n_friends.
    Afterwards get_combined_scores_from_followers(sql) needs to run again, to give the accounts handled here a
    combined score
    :param sql: sql statement that provides delivers users and their friends, which have a bert_friend or a combined
    score
    """
    df = db_functions.select_from_db(sql)

    # Accounts in this list can only rated via bert_friend_rating
    bert_friend_rating_list = df[
        df['combined_conf'].isnull()]['id'].drop_duplicates().to_list()
    all_ids = df['id'].drop_duplicates().to_list()
    # Account in this list can be rated via (the better) combined_rating
    combined_rating_list = [
        item for item in all_ids if item not in bert_friend_rating_list
    ]

    # count left/right friends based on BERT_friend rating (widely available)
    df_result_BERT_friends = count_friend_stances(
        df,
        friend_lst=bert_friend_rating_list,
        column_to_count='result_bert_friends',
        min_required_bert_friend_opinions=min_required_bert_friend_opinions)

    # count left/right friends based on combined rating (better results)
    df_result_combined_ratings = count_friend_stances(
        df,
        friend_lst=combined_rating_list,
        column_to_count='combined_rating',
        min_required_bert_friend_opinions=min_required_bert_friend_opinions)

    df_combined = pd.concat(
        [df_result_BERT_friends, df_result_combined_ratings])
    df_combined.dropna(inplace=True)
    del df
    del df_result_combined_ratings
    del df_result_BERT_friends

    db_functions.df_to_sql(df_combined, 'temp_scores_table', drop='replace')
    update_sql = 'update n_users set result_bert_friends = t."1", bert_friends_conf = cast (t."2" as numeric), ' \
                 'bf_left_number = t."3", bf_right_number = t."4", bert_friends_last_seen = t."5" from ' \
                 'temp_scores_table t where n_users.id = t."0"'
    db_functions.update_table(update_sql)
    db_functions.drop_table('temp_scores_table')
Ejemplo n.º 2
0
def API_Followers(screen_name: str, user_id: str, download_limit=12500000):
    """
    Downloads Follower using API. Stops after 1000 pages (5Mio Followers), no matter how many there are.
    :param screen_name: Twitter screen_name. Technically not required but added to DB for convenient lockup
    :param user_id: Twitter User_ID
    :param limit: #prevents download of more than 12,5 Million Followers of a user. Accounts with that
    many Followers are mostly non german and their followers not relevant enough to justify the download time.
    :return: none
    """

    ids = []
    followers = []
    count = 0

    try:
        for page in tweepy.Cursor(api.followers_ids, user_id=user_id).pages():
            ids.extend(page)
            for index in enumerate(page["ids"]):
                followers.append(page["ids"][index[0]])
                count += 1
            time.sleep(60)
            print("Scraping " + str(screen_name) + "Current Follower Count:" +
                  str(len(followers)))
            if count >= download_limit:
                break
    except TweepError as e:
        if "Not authorized" in str(e):
            print("Error: Not authorized. | Followers possibly set to private")
            time.sleep(60)
            return "Error: Not authorized"

    username_list = [screen_name for element in followers]
    user_id_list = [user_id for element in followers]
    result_user = ["" for element in followers]
    timestamp = [
        datetime.now().strftime("%Y%m%d_%H%M%S") for element in followers
    ]
    df = pd.DataFrame({
        'username': pd.Series(username_list),
        'user_id': pd.Series(user_id_list),
        'fav_users': pd.Series(result_user),
        'fav_ids': pd.Series(followers),
        'retrieve_date': pd.Series(timestamp)
    })

    db_functions.df_to_sql(df, 'temp_followers', drop='replace')
Ejemplo n.º 3
0
def SN_db_operations(hashtag: str, since: str, until: str) -> tuple:
    """
    1. Creates staging table for new hashtag
    2. Calls tweet ID downloader
    3. Saves tweet IDs to DB
    Downloads TweetIDs of Hashtags via snScrapeTweet
    :param hashtag: hashtag, to be downloaded
    :param since: start date for tweet download (string)
    :param until: until date for tweet download (string)
    :return: table_name, hashtag,  dataframe length
    """

    table_name = db_functions.get_staging_table_name(
        hashtag)  #adds prefix (s_h_) and suffix (dateand time) to tablename
    db_functions.drop_table(table_name)
    db_functions.create_empty_staging_table(table_name)
    tweet_ids = SN_get_tweet_ids_for_hashtag(since, until,
                                             hashtag)  #downloads tweets
    if len(tweet_ids) == 0:
        print("####################################################")
        print("####Warning! No Tweets fetched! Process aborted!####")
        print("####################################################")
        sys.exit()
    df = pd.DataFrame(tweet_ids)
    write_to_table = "update_temp"

    db_functions.df_to_sql(df, write_to_table,
                           drop="replace")  #Write Tweet ID to temp table
    #db_functions.update_table('insert into ' + str(table_name) + ' (id) select cast ("0" as bigint) from update_temp') #insert temp table content into staging table
    db_functions.update_table(
        f'insert into {table_name} (id) select cast ("0" as bigint) from {write_to_table}'
    )  # insert temp table content into staging table

    try:
        print(f"Table {table_name} created with {len(df)} tweets.")
    except:
        print("Unhandled Error")
    db_functions.drop_table(write_to_table)
    return table_name, hashtag, df.shape[0]
def inference_bert_friends(classifier, column_list: list, sql: str,
                           min_matches: int):
    """
    Performs inference based on users an account follows. Stores result to n_users
    :param classifier: Classifier
    :param friend_column_list_path: Column list to be used. Any Users friends are matched against this column list
    :param sql: Sql with combination of User to be inferendes, their label (combined rating) and their friend ID
    :param min_matches: Minimum friends that must be found in friend_column for the user to get a prediction.
    More connections = more accurate prediction result
    :return:
    """
    start = time.time()
    friends = db_functions.select_from_db(sql)
    input_dataset_length = len(friends)
    print(f"SQL fetching time: {time.time() - start}")

    if len(friends) == 0:
        rows_processed = 0
        return rows_processed

    friend_set = set(friends['follows_ids'].values.tolist())
    friend_list = friends['follows_ids'].values.tolist()
    user_list = friends['user_id'].values.tolist()
    rating_list = friends['combined_rating'].values.tolist()
    del friends

    #Transforms DataFrame into DefaultDict
    relationship_dict = defaultdict(lambda: defaultdict(list))
    for i, element in enumerate(friend_list):
        relationship_dict[element][0].append(user_list[i])
        relationship_dict[element][1].append(rating_list[i])

    conditions_not_met_list = [
    ]  # Ids in this list will still get a last seen date in DB to ignore them during next loop
    result_dict = {}
    #Iteration though friend_set and prediction of faction
    for element in tqdm(friend_set):
        common_friends = set(relationship_dict[element][0]) & set(column_list)
        number_of_common_friends = len(common_friends)
        if number_of_common_friends >= min_matches:
            df = pd.DataFrame(index=column_list).T
            df = df.append(pd.Series(), ignore_index=True).fillna(0)
            for friend in relationship_dict[element][0]:
                df.loc[:, friend] = 1
            df = df.iloc[:, :len(column_list)]
            prediction_proba = classifier.predict_proba(
                df.values.tolist())  # pure predict

            text, conf = helper_functions.conf_value("LR",
                                                     prediction_proba,
                                                     min_boundary=0.5,
                                                     max_boundary=1)
            result_dict[element] = [text, conf, number_of_common_friends]
        else:
            conditions_not_met_list.append(element)

    del friend_set
    del friend_list
    del user_list
    del rating_list

    timestamp = db_functions.staging_timestamp()
    result_df = pd.DataFrame(result_dict).T
    result_df['last_seen'] = timestamp
    rows_processed = len(result_df)
    if rows_processed > 0:  #checks if data has been written to DF
        db_functions.df_to_sql(result_df, "temp_table", "replace")
        update_sql = """update n_users
        set bert_friends_ml_result = "0",
        bert_friends_ml_conf = cast("1" as numeric),
        bert_friends_ml_count = cast ("2" as integer),
        bert_friends_ml_last_seen = temp_table.last_seen
        from temp_table where
        cast (id as text) = temp_table."index"
        """
        start = time.time()
        db_functions.update_table(update_sql)
        db_functions.drop_table("temp_table")
        print(f"Update Time: {time.time() - start}")
    else:
        print(
            f"WARNING: 0 new ratings generated despite an input dataset of {input_dataset_length} rows."
        )

    if len(conditions_not_met_list) > 0:
        stamps = [timestamp for elm in conditions_not_met_list]
        ziped = list(zip(conditions_not_met_list, stamps))
        db_functions.df_to_sql(pd.DataFrame(ziped),
                               "temp_table",
                               drop='replace')
        sql = 'update n_users set bert_friends_ml_last_seen = temp_table."1" from temp_table where n_users.id::text = temp_table."0"'
        db_functions.update_table(sql)
        #db_functions.drop_table('temp_table')

    return rows_processed
Ejemplo n.º 5
0
def prediction_launcher(table_name: str,
                        BERT_model,
                        sql: str,
                        write_to_db: bool = True,
                        TFIDF_pol_unpol_conv=0,
                        Algo_pol_unpol=0):
    """
    Loads 200 Tweets (one page call) per users and sends them to BERT for inference
    :param table_name: Name of temp table used to store results
    :param BERT_model: BERT Model
    :param sql: Statement providing Users to be inferenced
    :param write_to_db: True or False
    :param TFIDF_pol_unpol_conv: tfidf converter (optional)
    :param Algo_pol_unpol: Random Forrest classifier (optional)
    :return:
    """
    start_time_overal = time.time()
    cur_date = str(date.today())  # date for last seen columns
    #methods = ['pol', 'LR']
    methods = ['LR']
    data = []
    update_to_invalid_list = []

    # This DF will store all precditions results
    df_pred_data = pd.DataFrame(data,
                                columns=[
                                    'user_id', 'screen_name', 'pol', 'unpol',
                                    'pol_text', 'pol_conf', 'pol_time', 'left',
                                    'right', 'lr_text', 'lr_conf', 'lr_time',
                                    'analyse_date'
                                ])

    sql_time_start = time.time()
    df = db_functions.select_from_db(sql)
    print(
        f"##############  --- SQL Select time : {sql_time_start - time.time()} --- #############"
    )
    if df.shape[1] != 2:
        print("ERROR: DF must ONLY have columns user_id and username")
        gc.collect()
        sys.exit()
    gc.collect()

    for index, element in tqdm(df.iterrows(), total=df.shape[0]):
        start_time = time.time()
        user_id = element[0]
        screen_name = element[1]

        def tweet_download_and_lang_detect(df_tweets, user_id,
                                           update_to_invalid_list):
            """
            Calls language detection and checks if enough german tweets remain.
            If it found almost enough german Tweets it will load more.
            If it found almost none it will abort.
            :param df_tweets: 0 during first run, dataframe with tweets during later runs
            :param user_id: Twitter User_ID for tweet download and language check
            :param update_to_invalid_list: List of user that can not be downloaded from. Will append to if applicable.
            :return: df_tweets, update_to_invalid_list, abort_loop, len_df
            """
            if isinstance(df_tweets, int):
                df_tweets = TwitterAPI.API_tweet_multitool(
                    user_id,
                    'temp',
                    pages=1,
                    method='user_timeline',
                    append=False,
                    write_to_db=False)  # fills DF with 200 tweets of 1 page
                df_tweets = helper_functions.lang_detect(df_tweets)
            else:
                df_tweets_additions = TwitterAPI.API_tweet_multitool(
                    user_id,
                    'temp',
                    pages=1,
                    method='user_timeline',
                    append=False,
                    write_to_db=False)  # fills DF with 200 tweets of 1 page
                df_tweets_additions = helper_functions.lang_detect(
                    df_tweets_additions)
                if isinstance(df_tweets_additions, pd.DataFrame):
                    df_tweets = pd.concat([df_tweets, df_tweets_additions])
                    df_tweets.reset_index(inplace=True)
                    del df_tweets['index']

            # if df_tweets is None: #no tweets found or all tweets deleted (non german)
            #     abort_loop = True
            #     return df_tweets, update_to_invalid_list, abort_loop'

            len_df = helper_functions.dataframe_length(df_tweets)
            if len_df <= 50:
                # if almost no tweets are german don't try to get more german tweets from this users.
                # would take to many page loads
                update_to_invalid_list.append(user_id)
                abort_loop = True
            elif len_df >= 200:
                abort_loop = True
            else:
                # if to few tweets are german load more tweets to get a better result
                abort_loop = False
            gc.collect()
            return df_tweets, update_to_invalid_list, abort_loop, len_df

        df_tweets = 0
        # tries two times to get at least 200 german tweets, if first attempt returns less than 150 german tweets
        for i in range(2):
            df_tweets, update_to_invalid_list, abort_loop, len_df = tweet_download_and_lang_detect(
                df_tweets, user_id, update_to_invalid_list)
            if abort_loop == True:
                break

        if len_df > 0:
            for method in methods:
                prediction_result = []
                if method == 'pol':
                    if TFIDF_pol_unpol_conv == 0 or Algo_pol_unpol == 0:
                        print(
                            "Warning: No Political/Unpolitical classifier given. Check function parameters."
                        )
                    else:
                        prediction_result.append(
                            TFIDF_inference.TFIDF_inference(
                                df_tweets['tweet'], TFIDF_pol_unpol_conv,
                                Algo_pol_unpol))
                if method == 'LR':
                    #prediction_result.append(inference_political_bert.bert_predictions(df_tweets['tweet'], BERT_model))
                    prediction_result.append(
                        bert_predictions(df_tweets['tweet'], BERT_model))
                runtime = int(time.time() - start_time)

                # returns text interpretation of inference
                text, conf = helper_functions.conf_value(
                    method, prediction_result, max_boundary=len(df_tweets))

                # result and confidence score
                df_pred_data.at[index, 'user_id'] = user_id
                df_pred_data.at[index, 'screen_name'] = screen_name
                df_pred_data.at[index, 'analyse_date'] = cur_date

                # TODO: If you store the column names in variables that update depending on the method, you only need
                #  one block
                pred_result_zero = 'left' if method == "LR" else 'pol'
                pred_result_one = 'right' if method == "LR" else 'unpol'
                df_pred_data.at[index,
                                pred_result_zero] = prediction_result[0][0]
                df_pred_data.at[index,
                                pred_result_one] = prediction_result[0][1]

                if method == "LR":
                    df_pred_data.at[index, 'left'] = prediction_result[0][0]
                    df_pred_data.at[index, 'right'] = prediction_result[0][1]
                    df_pred_data.at[index, 'lr_text'] = text
                    df_pred_data.at[index, 'lr_conf'] = conf
                    df_pred_data.at[index, 'lr_time'] = runtime
                else:
                    df_pred_data.at[index, 'pol'] = prediction_result[0][0]
                    df_pred_data.at[index, 'unpol'] = prediction_result[0][1]
                    df_pred_data.at[index, 'pol_text'] = text
                    df_pred_data.at[index, 'pol_conf'] = conf
                    df_pred_data.at[index, 'pol_time'] = runtime

            # print("screen_name,Pol,Unpol,Pol_Time,Left,Right,LR_Time")
            # for index, element in df_pred_data.iterrows():
            #     print(
            #         f"{element['user_id']},{element['screen_name']},{element['pol']}"
            #         f",{element['unpol']},{element['pol_time']},{element['left']},{element['right']},{element['lr_time']}")
            # print("\n")
            # if index == 6:
            #     print ("Stopp")
            if (write_to_db is True and index != 0
                    and index % batch_size == 0) or (
                        write_to_db is True and (df.shape[0]) == (index + 1)
                    ):  #saved data x iterations OR when df has no further rows
                if len(update_to_invalid_list) > 0:
                    invalids = pd.DataFrame(update_to_invalid_list)
                    invalids['cur_date'] = cur_date
                    db_functions.df_to_sql(invalids,
                                           "temp_invalids",
                                           drop='replace')
                    update_sql = """update n_users 
                    set lr = 'invalid', pol= 'invalid', lr_pol_last_analysed = temp_invalids.cur_Date
                    from temp_invalids 
                    where id = temp_invalids."0"
                    """
                    db_functions.update_table(update_sql)
                    db_functions.drop_table("temp_invalids")

                if helper_functions.dataframe_length(df_pred_data) > 0:
                    db_functions.df_to_sql(df_pred_data,
                                           table_name,
                                           drop='replace')
                    update_sql = f"""
                                 update n_users set lr = lr_text, lr_conf = cast (a.lr_conf as numeric), pol = pol_text,
                                 pol_conf = cast (a.pol_conf as numeric), lr_pol_last_analysed = analyse_date from {table_name} 
                                 a where id = cast(user_id as bigint)"""
                    db_functions.update_table(
                        update_sql)  # update n_users table with new resulsts

                    print(f"Data written to table: {table_name}.")
        gc.collect()
Ejemplo n.º 6
0
def combined_scores_calc_launcher(sql: str,
                                  bert_friends_high_confidence_capp_off,
                                  self_conf_high_conf_capp_off,
                                  min_required_bert_friend_opinions):
    """
    Calculates combined score from users self-LR score and users bert_friend score
    :return:
    """
    # limit = 1000
    # sql = f"select id, screen_name, lr, lr_conf, result_bert_friends, bert_friends_conf, bf_left_number,
    # bf_right_number from n_users where lr is not null limit {limit}"
    # sql = f"select id, screen_name, lr, lr_conf, result_bert_friends, bert_friends_conf, bf_left_number,
    # bf_right_number from n_users where lr is not null or result_bert_friends is not null"
    df = db_functions.select_from_db(sql)
    df.fillna(0, inplace=True)

    count_rated_accounts = 0
    count_uncategorized_accounts = 0
    count_rating_less_accounts = 0
    count_to_few_bert_friends_to_rate_and_LRself_is_invalid = 0
    count_bert_friends_result_is_mediocre = 0

    id_list = df['id'].to_list()
    id_dict = {i: 0 for i in id_list}

    # ToDo: Runtime 30 minutes. Changes to Dict
    for index, element in tqdm(df.iterrows(), total=df.shape[0]):
        result, rated_accounts, rating_less_accounts, to_few_bert_friends_to_rate_and_LRself_is_invalid, bert_friends_result_is_mediocre, uncategorized_accounts = calculate_combined_score(
            bert_friends_high_confidence_cap_off=
            bert_friends_high_confidence_capp_off,
            self_conf_high_conf_cap_off=self_conf_high_conf_capp_off,
            min_required_bert_friend_opinions=min_required_bert_friend_opinions,
            user_id=element[0],
            self_lr=element[2],
            self_lr_conf=element[3],
            bert_friends_lr=element[4],
            bert_friends_lr_conf=element[5],
            number_of_bert_friends_L=element[6],
            number_of_bert_friends_R=element[7],
            BERT_ML_rating=element[8],
            BERT_ML_conf=element[9])

        id_dict[element[0]] = result
        count_rated_accounts += rated_accounts
        count_rating_less_accounts += rating_less_accounts
        count_to_few_bert_friends_to_rate_and_LRself_is_invalid += to_few_bert_friends_to_rate_and_LRself_is_invalid
        count_bert_friends_result_is_mediocre += bert_friends_result_is_mediocre
        count_uncategorized_accounts += uncategorized_accounts

    print("\n\n")
    print(f"ratingless_accounts: {count_rating_less_accounts}")
    print(
        f"to_few_bert_friends_to_rate_and_LRself_is_invalid_or_unknown_or_of_low_conf: "
        f"{count_to_few_bert_friends_to_rate_and_LRself_is_invalid}")
    print(
        f"bert_friends_result_is_medicore: {count_bert_friends_result_is_mediocre}"
    )
    print(f"uncategorized_accounts: {count_uncategorized_accounts}")
    total_rating_less_accounts = count_rating_less_accounts + \
                                 count_to_few_bert_friends_to_rate_and_LRself_is_invalid + \
                                 count_bert_friends_result_is_mediocre + \
                                 count_uncategorized_accounts
    print(f"\nAccounts without rating: {total_rating_less_accounts}")
    print(f"Rated accounts: {count_rated_accounts}")
    print("\n\n")
    print("Calculation done. Writing results to DB.")

    del df

    id_dict = {k: v for k, v in id_dict.items() if v != 0}
    df_result = pd.DataFrame(id_dict).transpose()
    df_result = df_result.replace('null', np.NaN)
    if len(df_result) == 0:
        print("Now new data.")
    else:
        print(f"{len(df_result)} new results found.")
        db_functions.df_to_sql(df_result, "temp_table", drop='replace')
        update_sql = 'update n_users set combined_rating = t."0", combined_conf = cast(t."1" as numeric) from temp_table t where n_users.id = t.index'
        db_functions.update_table(update_sql)  # runtime 8 minutes
        db_functions.drop_table("temp_table")
Ejemplo n.º 7
0
def friend_rating_launcher(sql: str, get_data_from_DB: bool) -> None:
    # def bert_friends_score(get_data_from_DB):
    """Refreshes score for all users in DB who...
     1) have a Bert LR rating and
     2) follow someone in n_followers
    Writes result to table n_users (total runtime 87 min)
    """
    timestamp = db_functions.staging_timestamp()
    start_time = time.time()

    if get_data_from_DB is True:
        # Runtime 18 min
        # --Bert_Friends: Zu bewertende User und die Scores ihrer Freunde
        df = db_functions.select_from_db(sql)
        db_functions.save_pickle(df, "bert_friends.pkl")
    else:
        df = db_functions.load_pickle("bert_friends.pkl")
    # df = df.iloc[:50000,:]
    df_sub0 = df.groupby(['follows_ids',
                          'bert_self']).size().unstack(fill_value=0)
    df_sub1 = df.groupby(['follows_ids',
                          'bert_friends']).size().unstack(fill_value=0)
    del df
    result = df_sub1.join(df_sub0,
                          lsuffix='_friend_Bert',
                          rsuffix='_self_Bert')
    del df_sub0
    del df_sub1
    user_list = result.index.to_list()
    try:
        left_friend_Bert_list = result['links_friend_Bert'].to_list()
    except:
        print(
            "Warning: 0 Results. Staging results possibly not copied to facts_hashtags."
        )
    right_friend_Bert_list = result['rechts_friend_Bert'].to_list()
    del result

    user_dict = {}
    for i, user in enumerate(tqdm(user_list)):
        if user not in user_dict:
            user_dict[user] = {}
        right = right_friend_Bert_list[i]
        left = left_friend_Bert_list[i]
        text, conf = helper_functions.conf_value(
            method='LR',
            prediction_result=[[left, right]],
            min_boundary=0,
            max_boundary=left + right)
        user_dict[user]["text"] = text
        user_dict[user]["confidence"] = conf
        user_dict[user]["last_seen"] = timestamp
        user_dict[user]["bf_left_number"] = left
        user_dict[user]["bf_right_number"] = right

    print("User dict erstellt.")
    print(len(user_dict))
    result = pd.DataFrame(user_dict).T
    print("DF transponiert.")
    db_functions.df_to_sql(result, "temp_result", drop='replace')
    print("Insert into temp done.")
    sql = "update n_users set result_bert_friends = text, bert_friends_conf = cast(confidence as numeric), " \
          "bert_friends_last_seen = temp_result.last_seen, bf_left_number = temp_result.bf_left_number, " \
          "bf_right_number = temp_result.bf_right_number from temp_result where id = cast (temp_result.index as bigint)"
    db_functions.update_table(sql)
    db_functions.drop_table("temp_result")
    print(f"Runtime in  min: {(time.time() - start_time) / 60} ")
Ejemplo n.º 8
0
def API_tweet_multitool(query: str,
                        table_name: str,
                        pages: int,
                        method: str,
                        append: bool = False,
                        write_to_db: bool = True,
                        count: int = 1000):
    """
    Can run twitter API calls search OR user_timeline and store result in DB table specified in input paramater
    :param query: search item e.g. hashtag
    :param table_name: name of staging table. With get a 's_h_' (for Staging_table_Hashtag) as prefix and date as suffix
    :param pages: number of cursor pages to retrieve
    :param method: API method to be used: 1.search, 2. user_timeline
    :param append: If True: Adds data to existing table. If False: replaces table table_name
    :param write_to_db: If True (default), writes data to DB, if false returns data as Dataframe
    :param count (default 1000). Number of tweets to be retrieved per user
    :return:
    """

    if append is False:
        table_name = 's_h_' + table_name + '_' + str(
            db_functions.staging_timestamp())
        table_name = table_name.lower()

    #deactivates JSON which is required for api search cursor call
    api = tweepy.API(auth)

    id = []
    conversation_id = []
    created_at = []
    date = []
    tweet = []
    hashtags = []
    user_id = []
    username = []
    name = []
    link = []
    retweet = []
    nlikes = []
    nreplies = []
    nretweets = []
    quote_url = []
    user_rt_id = []
    user_rt = []

    #api_method = api.search if method == "search" else api.user_timeline
    if method == 'search':
        api_method = api.search
        wait = True
    elif method == 'user_timeline':
        api_method = api.user_timeline
        wait = False
    else:
        assert (method == 'search'
                or method == 'user_timeline'), "Error: No known method given!"
    try:
        for fetched_tweets in tweepy.Cursor(
                api_method, query, count=count,
                tweet_mode='extended').pages(pages):
            for index in range(len(fetched_tweets)):
                id.append(fetched_tweets[index].id)
                conversation_id.append(0)
                created_at.append(str(fetched_tweets[index].created_at))
                date.append(str(fetched_tweets[index].created_at))
                try:  #Checks if Tweet returns a retweet.
                    at_user = getattr(
                        getattr(
                            getattr(fetched_tweets[index], 'retweeted_status'),
                            'user'), 'screen_name')
                    tweet.append("RT @" + at_user + ": " + getattr(
                        getattr(fetched_tweets[index], 'retweeted_status'),
                        'full_text'))
                except AttributeError:  #no retweet found
                    tweet.append(fetched_tweets[index].full_text)
                hashtags_sub = []
                for i, element in enumerate(
                        fetched_tweets[index].entities['hashtags']):
                    hashtags_sub.append(
                        fetched_tweets[index].entities['hashtags'][i]['text'])
                hashtags.append(hashtags_sub)
                user_id.append(fetched_tweets[index].user.id)
                username.append(fetched_tweets[index].user.screen_name)
                name.append(fetched_tweets[index].user.name)
                link.append(0)
                retweet.append(0)
                nlikes.append(0)
                nreplies.append(0)
                nretweets.append(fetched_tweets[index].retweet_count)
                quote_url.append(0)
                user_rt_id.append(0)
                user_rt.append(0)
            if wait is True:
                time.sleep(20)
    # TODO: In case the error message is saved in variable e, print and return e?
    except TweepError as e:
        if "User not found" in str(e):
            time.sleep(
                20
            )  #error are handled so fast that sleep is required to avoid rate limit exhaustian
            return "Error: User not found"
        elif "status code = 401" in str(e):
            time.sleep(20)
            print("Twitter error response: status code = 401")
            return "Twitter error response: status code = 401"
        elif "status code = 404" in str(e):
            time.sleep(20)
            print("Error 404: Account does not exist (anymore?)")
            return "Error 404: Account does not exist (anymore)"
        elif "User has been suspended" in str(e):
            time.sleep(20)
            print("Error: User has been suspended")
            return "Error: User has been suspended"
        else:
            time.sleep(20)
            print(e)
            return ("Error")

    # Build Dataframe
    id = pd.Series(id, name='id')
    conversation_id = pd.Series(conversation_id, name='conversation_id')
    created_at = pd.Series(created_at, name='created_at')
    date = pd.Series(date, name='date')
    tweet = pd.Series(tweet, name='tweet')
    hashtags = pd.Series(hashtags, name='hashtags')
    user_id = pd.Series(user_id, name='user_id')
    username = pd.Series(username, name='username')
    name = pd.Series(name, name='name')
    link = pd.Series(link, name='link')
    retweet = pd.Series(retweet, name='retweet')
    nlikes = pd.Series(nlikes, name='nlikes')
    nreplies = pd.Series(nreplies, name='nreplies')
    nretweets = pd.Series(nretweets, name='nretweets')
    quote_url = pd.Series(quote_url, name='quote_url')
    user_rt_id = pd.Series(user_rt_id, name='user_rt_id')
    user_rt = pd.Series(user_rt, name='user_rt')

    df = pd.concat([
        id, conversation_id, created_at, date, tweet, hashtags, user_id,
        username, name, link, retweet, nlikes, nreplies, nretweets, quote_url,
        user_rt_id, user_rt
    ],
                   axis=1)

    if df.shape[0] == 0:
        print("Error: 0 Tweets")
        return "Error: 0 Tweets"

    #Write to DB or return result
    if write_to_db is True:
        df = df.rename(
            {
                'data-item-id': 'id',
                'data-conversation-id': 'conversation_id',
                'avatar': 'link'
            },
            axis=1)
        # adds empty column to df
        df['user_id'] = ''
        # remove @ sign in username
        df['username'] = df['username'].str.replace("@", "")

        #engine, metadata = sql_alchemy_engine()  # gets SQL alchemy connection

        # creates list that is as long das the DF and contains tablename in every entry.
        staging_column_list = [table_name for index in range(len(df))]

        # appends staging_column_list to df
        df = df.assign(staging_name=pd.Series(staging_column_list).values)
        #db_functions.tweet_multitool_results_to_DB(df, table_name, append)
        db_functions.df_to_sql(df, table_name, drop='append')
        print("Staging Table " + str(table_name) + " created with " +
              str(len(df)) + " entries.")
    else:
        return df
Ejemplo n.º 9
0
def tweet_details_download_launcher(table_name: str,
                                    hashtag: str,
                                    bulk_size: int = 1000,
                                    download_parent_tweets=True):
    """
    1. Calls Tweet downloader
    2. Adds details to Tweet IDs in staging table via update
    3. If Tweets are are reply to another tweet, those parent tweets can be downloaded via option download_parent_tweets
    :param table_name: Staging table name which will be updated
    :param hashtag: hashtag scraped
    :param bulk_size: number of tweets to processed in one function call
    :param download_parent_tweets: If True: Loops through all tweets until no further father elements can be found
    :return: none
    """

    parent_sql = f"""select * from {table_name} where tweet is null 
    or (retweet is not null and retweet not in (select id from {table_name})) limit {bulk_size}"""
    df_parent = db_functions.select_from_db(parent_sql)
    df_parent['id'] = df_parent[
        'retweet']  #replaces tweet ID with parent id. Otherwise previously downloaded tweets would be downloaded again

    sql = f"select * from {table_name} where tweet is null limit {bulk_size}"
    df = db_functions.select_from_db(sql)

    download_parents = helper_functions.dataframe_length(
        df) == 0 and download_parent_tweets == True
    if download_parents == True:
        df = df_parent
    for index, element in df.iterrows():
        error = False
        # downloads details for Tweets from staging table
        result = API_get_tweet_details(element[1], sleep=True)
        if result == 'Error: Not authorized':  #Tweet is set to private
            error = True
        if result == 'Error: No status found with that ID.' or result == "Undefined Error":
            #sql = f"update {table_name} set tweet = 'deleted' where id = {element[1]}"
            #db_functions.update_table(sql)
            df.tweet[df.id == element[1]] = 'deleted'
            error = True
        if error is False:
            df.iloc[index:index + 1, 4:5] = result[1]  # date
            df.iloc[index:index + 1, 5:6] = result[2]  # tweet
            df.iloc[index:index + 1, 6:7] = hashtag  # hashtag
            df.iloc[index:index + 1, 7:8] = result[0]  # user_id
            df.iloc[index:index + 1, 8:9] = result[4]  # screen_name
            df.iloc[index:index + 1, 9:10] = result[3]  # name
            df.iloc[index:index + 1, 11:12] = result[5]  # in reply to tweet id
            df.iloc[index:index + 1, 18:19] = table_name
            #print ("Fetched Tweet: {}".format(element[1]))

    if len(df) == 0:
        return 0
    db_functions.df_to_sql(df, 'temp_df', 'replace')
    new_tweets_fetched = helper_functions.dataframe_length(df)
    # try:
    #     print (new_tweets_fetched)
    # except UnboundLocalError:
    #     print ("STOPP")
    #update staging table with values form temp_df
    if download_parents == False:
        sql = f"""update {table_name} a set date = b.date, tweet = b.tweet, hashtags = b.hashtags,
              user_id = cast (b.user_id as bigint), username = b.username, name = b.name,
              retweet = cast (b.retweet as bigint), staging_name = b.staging_name from (select * from temp_df) b
              where a.id = b.id"""
    else:
        sql = f"""INSERT INTO {table_name} 
        SELECT index::bigint, id, conversation_id::bigint, created_at, date, tweet, hashtags, user_id, username, name,
        link::bigint, retweet::bigint, nlikes::bigint, nreplies::bigint, nretweets::bigint, quote_url::bigint, user_rt_id::bigint,
        user_rt::bigint, staging_name FROM temp_df"""
        print(f"{new_tweets_fetched} parent tweets added.")
    db_functions.update_table(sql)
    db_functions.drop_table('temp_df')
    #new_tweets_fetched = helper_functions.dataframe_length(df)
    return new_tweets_fetched