def transform_subs_data():
    """Transform the raw reddit subs data and export it as a CSV.

    Calls ``load_dataframes`` on ``SUBS_RAW_DIR`` (passing ``"reddit"`` and
    ``".txt"`` -- presumably the file-name prefix and extension -- plus
    ``SUBS_HEADERS``), runs the result through ``transform_data``, writes it
    with ``df_to_csv`` to ``SUBS_TRANSFORMED_DIR`` as ``"reddit_subs"`` and
    then calls ``prefix_files`` on the raw directory with ``"transformed"``.

    When ``load_dataframes`` returns ``None`` nothing is exported and the raw
    files are left untouched.
    """
    raw_subs = load_dataframes(SUBS_RAW_DIR, "reddit", ".txt", SUBS_HEADERS)
    if raw_subs is not None:
        subs = transform_data(raw_subs)
        df_to_csv(subs, SUBS_TRANSFORMED_DIR, "reddit_subs")
        # Only tag the raw files once the export call has returned.
        prefix_files(SUBS_RAW_DIR, "transformed")
def transform_users_data():
    """Transform every raw users file and export each one as a CSV.

    For each path returned by ``get_data_filepaths(USER_DATA_DIR, "users",
    ".txt")`` the file is read with ``build_dataframe``, its columns are
    renamed according to ``USER_COLS_DICT``, ``collection_time`` is converted
    with ``timestamp_to_datetime``, the frame is written with ``df_to_csv``
    to ``USER_CSV_DIR`` as ``"users"`` and the source file is passed to
    ``prefix_file`` with ``"transformed"``.
    """
    for path in get_data_filepaths(USER_DATA_DIR, "users", ".txt"):
        users = build_dataframe(path)
        # The rename runs first, so "collection_time" is read under its
        # post-rename name.
        users.rename(columns=USER_COLS_DICT, inplace=True)
        collected_at = users.collection_time.apply(timestamp_to_datetime)
        users["collection_time"] = collected_at
        df_to_csv(users, USER_CSV_DIR, "users")
        prefix_file(path, "transformed")
def transform_tweets_data():
    """Transform every raw tweets file and export each one as a CSV.

    For each path returned by ``get_data_filepaths(TWEET_DATA_DIR, "tweets",
    ".txt")`` the file is read with ``build_dataframe``, the ``id`` column is
    renamed to ``twitter_id``, and both ``collection_time`` and
    ``timestamp_ms`` are converted with ``timestamp_to_datetime``. The frame
    is then written with ``df_to_csv`` to ``TWEET_CSV_DIR`` as ``"tweets"``
    and the source file is passed to ``prefix_file`` with ``"transformed"``.
    """
    for path in get_data_filepaths(TWEET_DATA_DIR, "tweets", ".txt"):
        tweets = build_dataframe(path)
        tweets.rename(columns={"id": "twitter_id"}, inplace=True)

        collected_at = tweets.collection_time.apply(timestamp_to_datetime)
        tweets["collection_time"] = collected_at

        created_at = tweets.timestamp_ms.apply(timestamp_to_datetime)
        tweets["timestamp_ms"] = created_at

        df_to_csv(tweets, TWEET_CSV_DIR, "tweets")
        prefix_file(path, "transformed")
def transform_posts_data():
    """Transform and filter the raw reddit posts data, then export a CSV.

    Calls ``load_dataframes`` on ``POSTS_RAW_DIR`` (with ``"reddit"``,
    ``".txt"`` and ``POSTS_HEADERS``), applies ``transform_data`` followed by
    ``filter_data``, writes the result with ``df_to_csv`` to
    ``POSTS_TRANSFORMED_DIR`` as ``"reddit_posts"`` and then calls
    ``prefix_files`` on the raw directory with ``"transformed"``.

    When ``load_dataframes`` returns ``None`` the function does nothing.
    """
    raw_posts = load_dataframes(POSTS_RAW_DIR, "reddit", ".txt", POSTS_HEADERS)
    if raw_posts is not None:
        # Transform first, then filter -- same order as the pipeline expects.
        posts = filter_data(transform_data(raw_posts))
        df_to_csv(posts, POSTS_TRANSFORMED_DIR, "reddit_posts")
        prefix_files(POSTS_RAW_DIR, "transformed")
def transform_scores_data():
    """Transform every raw scored file and export each one as a CSV.

    For each path returned by ``get_data_filepaths(SCORED_DATA_DIR, "scored",
    ".txt")`` the file is read with ``build_dataframe``, the column
    ``user_profile_image_url_https`` is renamed to ``user_profile_image_url``,
    ``scoring_time`` is converted with ``datetime.fromtimestamp`` and the
    frame is passed through ``normalize_scores``. The result is written with
    ``df_to_csv`` to ``SCORES_CSV_DIR`` as ``"scores"`` and the source file is
    passed to ``prefix_file`` with ``"transformed"``.
    """
    column_renames = {"user_profile_image_url_https": "user_profile_image_url"}
    for path in get_data_filepaths(SCORED_DATA_DIR, "scored", ".txt"):
        scores = build_dataframe(path)
        scores.rename(columns=column_renames, inplace=True)
        # fromtimestamp is called without a tz, so values come back as naive
        # datetimes in the machine's local time zone.
        scored_at = scores.scoring_time.apply(datetime.fromtimestamp)
        scores["scoring_time"] = scored_at
        normalized = normalize_scores(scores)
        df_to_csv(normalized, SCORES_CSV_DIR, "scores")
        prefix_file(path, "transformed")
# NOTE(review): a function with this same name is defined earlier in this
# source; if both definitions live in one module, this one replaces it.
def transform_users_data():
    """Transform the raw reddit user data and export user stats as a CSV.

    Calls ``load_dataframes`` on ``USERS_RAW_DIR`` (with ``"reddit"``,
    ``".txt"`` and ``USERS_HEADERS``), applies ``transform_data`` followed by
    ``filter_users_stats``, writes the result with ``df_to_csv`` to
    ``USERS_TRANSFORMED_DIR`` as ``"reddit_user_stats"``, calls
    ``prefix_files`` on the raw directory with ``"transformed"`` and finally
    passes ``ERRORS`` to ``log_errors`` under the label
    ``"reddit.users_transform"``.

    When ``load_dataframes`` returns ``None`` none of these steps run,
    including the error logging.
    """
    raw_users = load_dataframes(USERS_RAW_DIR, "reddit", ".txt", USERS_HEADERS)
    if raw_users is not None:
        user_stats = filter_users_stats(transform_data(raw_users))
        df_to_csv(user_stats, USERS_TRANSFORMED_DIR, "reddit_user_stats")
        prefix_files(USERS_RAW_DIR, "transformed")
        log_errors(ERROR_LOG_PATH, "reddit.users_transform", ERRORS)
def transform_g1_data():
    """Filter and transform the raw g1 data, then export it as a CSV.

    Calls ``load_dataframes`` on ``RAW_DIR`` (with ``"g1"``, ``"txt"`` and
    ``HEADERS``) and returns immediately when it yields ``None``. Otherwise
    the frame goes through ``filter_dataframe``; if any rows remain it is
    passed to ``transform_data`` and written with ``df_to_csv`` to
    ``TRANSFORMED_DIR`` as ``"g1"``. ``prefix_files`` is then called on the
    raw directory with ``"transformed"``.
    """
    # NOTE(review): the extension is given as "txt" without a leading dot,
    # unlike the ".txt" used by the other loaders here -- confirm this is
    # what load_dataframes expects.
    loaded = load_dataframes(RAW_DIR, "g1", "txt", HEADERS)
    if loaded is None:
        return

    kept = filter_dataframe(loaded)
    has_rows = not kept.empty
    if has_rows:
        df_to_csv(transform_data(kept), TRANSFORMED_DIR, "g1")

    # NOTE(review): the raw files are tagged even when filtering leaves no
    # rows to export -- confirm that is the intended behaviour.
    prefix_files(RAW_DIR, "transformed")
def transform_entity(entity, name):
    """Convert one group of collected entities into a CSV file.

    Builds a DataFrame from ``entity``, drops its ``type`` column, converts
    ``collection_time`` with ``timestamp_to_datetime``, renames columns using
    the mapping stored in ``COLUMNS`` under ``name`` and writes the result
    with ``df_to_csv`` to ``ENTITIES_CSV_DIR`` under ``name``.

    Args:
        entity: Data accepted by ``pd.DataFrame`` (presumably a list of
            records carrying ``type`` and ``collection_time`` fields --
            confirm against the caller).
        name: Key looked up in ``COLUMNS`` and name handed to ``df_to_csv``.

    Returns:
        None. Nothing is written when ``entity`` produces a frame without
        any columns (e.g. an empty list or dict).
    """
    df = pd.DataFrame(entity)
    if df.columns.empty:
        # No records were supplied: there is no "type" column to drop and
        # nothing to export, so skip instead of raising KeyError below.
        return

    df = df.drop(columns=["type"])
    # Converted before the rename, so the pre-rename column name is used.
    df["collection_time"] = df["collection_time"].apply(timestamp_to_datetime)
    # NOTE(review): COLUMNS.get(name) is None for an unknown name, and
    # rename() raises TypeError when given no mapping -- confirm every
    # caller passes a name that has an entry in COLUMNS.
    df = df.rename(columns=COLUMNS.get(name))
    df_to_csv(df, ENTITIES_CSV_DIR, name)