コード例 #1
0
def transform_entities_data():
    """Process every entities data file and mark it as transformed.

    Iterates over the paths returned by
    ``get_data_filepaths(ENTITIES_DATA_DIR, "entities", ".txt")``. For each
    one, the content loaded by ``load_tweets`` is handed to
    ``separate_entities``, ``transform_entities`` is invoked, and the path is
    passed to ``prefix_file`` with the ``"transformed"`` prefix.
    """
    for entities_path in get_data_filepaths(
            ENTITIES_DATA_DIR, "entities", ".txt"):
        separate_entities(load_tweets(entities_path))
        # NOTE(review): transform_entities() receives no arguments, so it
        # presumably works on state filled in by separate_entities() --
        # confirm against their definitions.
        transform_entities()
        prefix_file(entities_path, "transformed")
コード例 #2
0
def transform_users_data():
    """Export every users data file through ``df_to_csv``.

    For each path returned by
    ``get_data_filepaths(USER_DATA_DIR, "users", ".txt")``:

    1. build a dataframe from the file with ``build_dataframe``;
    2. rename its columns in place according to ``USER_COLS_DICT``;
    3. replace ``collection_time`` with the result of applying
       ``timestamp_to_datetime`` to each value;
    4. pass the frame to ``df_to_csv`` with ``USER_CSV_DIR`` and ``"users"``;
    5. pass the source path to ``prefix_file`` with ``"transformed"``.
    """
    for users_path in get_data_filepaths(USER_DATA_DIR, "users", ".txt"):
        users_frame = build_dataframe(users_path)
        users_frame.rename(columns=USER_COLS_DICT, inplace=True)

        # The column is read after the rename, so "collection_time" must be
        # the post-rename column name.
        collected_at = users_frame.collection_time.apply(timestamp_to_datetime)
        users_frame["collection_time"] = collected_at

        df_to_csv(users_frame, USER_CSV_DIR, "users")
        prefix_file(users_path, "transformed")
コード例 #3
0
def transform_tweets_data():
    """Export every tweets data file through ``df_to_csv``.

    For each path returned by
    ``get_data_filepaths(TWEET_DATA_DIR, "tweets", ".txt")`` a dataframe is
    built, its ``id`` column is renamed to ``twitter_id``, and both
    ``collection_time`` and ``timestamp_ms`` are replaced by the result of
    applying ``timestamp_to_datetime`` to each value. The frame is then passed
    to ``df_to_csv`` with ``TWEET_CSV_DIR`` and ``"tweets"``, and the source
    path is passed to ``prefix_file`` with ``"transformed"``.
    """
    for tweets_path in get_data_filepaths(TWEET_DATA_DIR, "tweets", ".txt"):
        tweets_frame = build_dataframe(tweets_path)
        tweets_frame.rename(columns={"id": "twitter_id"}, inplace=True)

        # Convert the two time columns one after the other, each written back
        # before the next is read.
        collected_at = tweets_frame.collection_time.apply(
            timestamp_to_datetime)
        tweets_frame["collection_time"] = collected_at
        created_at = tweets_frame.timestamp_ms.apply(timestamp_to_datetime)
        tweets_frame["timestamp_ms"] = created_at

        df_to_csv(tweets_frame, TWEET_CSV_DIR, "tweets")
        prefix_file(tweets_path, "transformed")
コード例 #4
0
def score_users():
    """Query scores for the users found in each scoring data file.

    For every path returned by
    ``get_data_filepaths(SCORING_DATA_DIR, "score", ".txt")`` the function:

    * derives a screen name from the file name with
      ``get_mentioned_from_filename``;
    * loads the tweets and collects the distinct
      ``tweet["user_core"]["user_screen_name"]`` values;
    * sends that set to ``query_botometer``;
    * hands the tweets, the returned scores and the file-name-derived screen
      name to ``parse_scores``;
    * passes the path to ``prefix_file`` with ``"parsed"``.

    After all files have been handled, ``save_scored_tweets`` is called once.
    """
    score_paths = get_data_filepaths(SCORING_DATA_DIR, "score", ".txt")
    for score_path in score_paths:
        mentioned = get_mentioned_from_filename(score_path)
        tweets = load_tweets(score_path)

        # NOTE(review): `tweets` is iterated here and passed to parse_scores()
        # below, so load_tweets() presumably returns a re-iterable collection
        # rather than a one-shot generator -- confirm.
        # .get() yields None when "user_screen_name" is absent, so None can
        # end up in the set that is sent to query_botometer().
        screen_names = {
            tweet["user_core"].get("user_screen_name") for tweet in tweets
        }

        scores = query_botometer(screen_names)
        parse_scores(tweets, scores, mentioned)
        prefix_file(score_path, "parsed")

    save_scored_tweets()
コード例 #5
0
def transform_scores_data():
    """Export every scored data file through ``df_to_csv``.

    For each path returned by
    ``get_data_filepaths(SCORED_DATA_DIR, "scored", ".txt")`` a dataframe is
    built, ``user_profile_image_url_https`` is renamed to
    ``user_profile_image_url``, ``scoring_time`` is converted with
    ``datetime.fromtimestamp``, and the frame is run through
    ``normalize_scores``. The normalized result is passed to ``df_to_csv`` with
    ``SCORES_CSV_DIR`` and ``"scores"``, and the source path is passed to
    ``prefix_file`` with ``"transformed"``.
    """
    image_url_rename = {
        "user_profile_image_url_https": "user_profile_image_url",
    }
    for scored_path in get_data_filepaths(SCORED_DATA_DIR, "scored", ".txt"):
        scores_frame = build_dataframe(scored_path)
        scores_frame.rename(columns=image_url_rename, inplace=True)

        # NOTE(review): assuming `datetime` is the stdlib datetime class,
        # fromtimestamp() without a tz argument produces naive local-time
        # values -- confirm that is what downstream consumers expect.
        scored_at = scores_frame.scoring_time.apply(datetime.fromtimestamp)
        scores_frame["scoring_time"] = scored_at

        normalized = normalize_scores(scores_frame)
        df_to_csv(normalized, SCORES_CSV_DIR, "scores")
        prefix_file(scored_path, "transformed")
コード例 #6
0
def parse_tweets_data():
    """Parse every raw tweet file, then save the results and mark the files.

    Each tweet loaded from the paths returned by
    ``get_data_filepaths(RAW_DIR, "stf", ".txt")`` is passed to
    ``parse_tweet``. Once all files have been read, ``separate_users_tweets``,
    ``save_parsed_tweets`` and ``save_chosen_for_scoring`` are called in that
    order, and finally ``prefix_files(RAW_DIR, "parsed")`` is invoked.
    """
    for raw_path in get_data_filepaths(RAW_DIR, "stf", ".txt"):
        for raw_tweet in load_tweets(raw_path):
            parse_tweet(raw_tweet)

    # NOTE(review): these helpers take no arguments, so they presumably work
    # on state accumulated by parse_tweet() -- confirm against their
    # definitions.
    separate_users_tweets()
    save_parsed_tweets()
    save_chosen_for_scoring()

    # Prefixing happens only after everything above has completed.
    prefix_files(RAW_DIR, "parsed")