def transform_entities_data():
    """Transform every pending entities data file.

    Iterates over the paths returned by
    ``get_data_filepaths(ENTITIES_DATA_DIR, "entities", ".txt")``. For
    each path the file is loaded with ``load_tweets``, the loaded records
    are handed to ``separate_entities``, ``transform_entities`` is run,
    and finally ``prefix_file`` is called on the path with the
    ``"transformed"`` prefix.
    """
    entity_paths = get_data_filepaths(ENTITIES_DATA_DIR, "entities", ".txt")
    for entity_path in entity_paths:
        separate_entities(load_tweets(entity_path))
        # transform_entities() takes no arguments; presumably it works on
        # state populated by separate_entities() -- verify in its definition.
        transform_entities()
        # Presumably marks the input as processed so it is not picked up
        # again -- confirm against prefix_file().
        prefix_file(entity_path, "transformed")
def transform_users_data():
    """Convert every pending users data file to CSV.

    For each path returned by
    ``get_data_filepaths(USER_DATA_DIR, "users", ".txt")`` a dataframe is
    built with ``build_dataframe``, its columns are renamed according to
    ``USER_COLS_DICT``, the ``collection_time`` column is converted with
    ``timestamp_to_datetime``, and the result is written through
    ``df_to_csv(..., USER_CSV_DIR, "users")``. The source path is then
    passed to ``prefix_file`` with the ``"transformed"`` prefix.
    """
    for users_path in get_data_filepaths(USER_DATA_DIR, "users", ".txt"):
        users_df = build_dataframe(users_path)
        users_df.rename(columns=USER_COLS_DICT, inplace=True)

        collected_at = users_df.collection_time.apply(timestamp_to_datetime)
        users_df["collection_time"] = collected_at

        df_to_csv(users_df, USER_CSV_DIR, "users")
        prefix_file(users_path, "transformed")
def transform_tweets_data():
    """Convert every pending tweets data file to CSV.

    For each path returned by
    ``get_data_filepaths(TWEET_DATA_DIR, "tweets", ".txt")`` a dataframe
    is built with ``build_dataframe``, the ``id`` column is renamed to
    ``twitter_id``, and the ``collection_time`` and ``timestamp_ms``
    columns are converted with ``timestamp_to_datetime``. The dataframe
    is written through ``df_to_csv(..., TWEET_CSV_DIR, "tweets")`` and
    the source path is passed to ``prefix_file`` with the
    ``"transformed"`` prefix.
    """
    for tweets_path in get_data_filepaths(TWEET_DATA_DIR, "tweets", ".txt"):
        tweets_df = build_dataframe(tweets_path)
        tweets_df.rename(columns={"id": "twitter_id"}, inplace=True)

        # Convert the two timestamp columns, in the same order as before.
        collected_at = tweets_df.collection_time.apply(timestamp_to_datetime)
        tweets_df["collection_time"] = collected_at
        created_at = tweets_df.timestamp_ms.apply(timestamp_to_datetime)
        tweets_df["timestamp_ms"] = created_at

        df_to_csv(tweets_df, TWEET_CSV_DIR, "tweets")
        prefix_file(tweets_path, "transformed")
def score_users():
    """Score the users found in every pending scoring file.

    For each path returned by
    ``get_data_filepaths(SCORING_DATA_DIR, "score", ".txt")``:

    1. the mentioned screen name is derived from the file name via
       ``get_mentioned_from_filename``;
    2. the tweets are loaded with ``load_tweets``;
    3. the distinct ``tweet["user_core"]["user_screen_name"]`` values are
       collected and passed to ``query_botometer``;
    4. ``parse_scores`` is called with the tweets, the returned scores and
       the mentioned screen name;
    5. the path is passed to ``prefix_file`` with the ``"parsed"`` prefix.

    ``save_scored_tweets`` is called once the loop has finished.
    """
    scoring_paths = get_data_filepaths(SCORING_DATA_DIR, "score", ".txt")
    for scoring_path in scoring_paths:
        mentioned = get_mentioned_from_filename(scoring_path)
        loaded_tweets = load_tweets(scoring_path)

        # .get() yields None for a tweet without "user_screen_name", so
        # None can end up as a member of this set.
        screen_names = {
            item["user_core"].get("user_screen_name")
            for item in loaded_tweets
        }

        scores = query_botometer(screen_names)
        parse_scores(loaded_tweets, scores, mentioned)
        prefix_file(scoring_path, "parsed")

    # NOTE(review): confirm that saving is meant to happen once after all
    # files have been processed rather than once per file.
    save_scored_tweets()
def transform_scores_data():
    """Convert every pending scored data file to CSV.

    For each path returned by
    ``get_data_filepaths(SCORED_DATA_DIR, "scored", ".txt")`` a dataframe
    is built with ``build_dataframe``, the
    ``user_profile_image_url_https`` column is renamed to
    ``user_profile_image_url``, ``scoring_time`` is converted with
    ``datetime.fromtimestamp``, and the frame is passed through
    ``normalize_scores``. The result is written through
    ``df_to_csv(..., SCORES_CSV_DIR, "scores")`` and the source path is
    passed to ``prefix_file`` with the ``"transformed"`` prefix.
    """
    column_renames = {
        "user_profile_image_url_https": "user_profile_image_url",
    }
    for scored_path in get_data_filepaths(SCORED_DATA_DIR, "scored", ".txt"):
        scores_df = build_dataframe(scored_path)
        scores_df.rename(columns=column_renames, inplace=True)

        # NOTE(review): fromtimestamp() without a tz argument produces
        # naive local-time datetimes -- confirm that is intended.
        scored_at = scores_df.scoring_time.apply(datetime.fromtimestamp)
        scores_df["scoring_time"] = scored_at

        normalized_df = normalize_scores(scores_df)
        df_to_csv(normalized_df, SCORES_CSV_DIR, "scores")
        prefix_file(scored_path, "transformed")
def parse_tweets_data(): tweet_files = get_data_filepaths(RAW_DIR, "stf", ".txt") for _file in tweet_files: for tweet in load_tweets(_file): parse_tweet(tweet) separate_users_tweets() save_parsed_tweets() save_chosen_for_scoring() prefix_files(RAW_DIR, "parsed")