Example #1
0
def transform_subs_data():
    """Run the transform step over the raw reddit subs data.

    The raw data is loaded with ``load_dataframes``. When that call yields
    ``None`` nothing else happens. Otherwise the frame goes through
    ``transform_data``, is handed to ``df_to_csv`` under the name
    "reddit_subs", and ``prefix_files`` is called on the raw directory with
    "transformed" (presumably to mark the inputs as processed -- confirm
    against the helper).
    """
    raw_subs = load_dataframes(SUBS_RAW_DIR, "reddit", ".txt", SUBS_HEADERS)
    if raw_subs is not None:
        transformed = transform_data(raw_subs)
        df_to_csv(transformed, SUBS_TRANSFORMED_DIR, "reddit_subs")
        prefix_files(SUBS_RAW_DIR, "transformed")
Example #2
0
def transform_users_data():
    """Transform every "users" ``.txt`` file found under ``USER_DATA_DIR``.

    For each path returned by ``get_data_filepaths`` a dataframe is built,
    its columns are renamed according to ``USER_COLS_DICT``, the
    ``collection_time`` column is converted with ``timestamp_to_datetime``,
    and the result is passed to ``df_to_csv``. The source file is then
    handed to ``prefix_file`` with "transformed".
    """
    for path in get_data_filepaths(USER_DATA_DIR, "users", ".txt"):
        users = build_dataframe(path).rename(columns=USER_COLS_DICT)
        converted = users.collection_time.apply(timestamp_to_datetime)
        users["collection_time"] = converted
        df_to_csv(users, USER_CSV_DIR, "users")
        prefix_file(path, "transformed")
Example #3
0
def transform_tweets_data():
    """Transform every "tweets" ``.txt`` file found under ``TWEET_DATA_DIR``.

    For each path returned by ``get_data_filepaths`` a dataframe is built,
    the ``id`` column is renamed to ``twitter_id``, and both
    ``collection_time`` and ``timestamp_ms`` are converted with
    ``timestamp_to_datetime``. The frame is passed to ``df_to_csv`` and the
    source file is handed to ``prefix_file`` with "transformed".
    """
    for path in get_data_filepaths(TWEET_DATA_DIR, "tweets", ".txt"):
        tweets = build_dataframe(path).rename(columns={"id": "twitter_id"})

        # Convert the two timestamp columns one after the other.
        collection_times = tweets.collection_time.apply(timestamp_to_datetime)
        tweets["collection_time"] = collection_times
        tweet_times = tweets.timestamp_ms.apply(timestamp_to_datetime)
        tweets["timestamp_ms"] = tweet_times

        df_to_csv(tweets, TWEET_CSV_DIR, "tweets")
        prefix_file(path, "transformed")
Example #4
0
def transform_posts_data():
    """Run the transform step over the raw reddit posts data.

    The raw data is loaded with ``load_dataframes``. When that call yields
    ``None`` nothing else happens. Otherwise the frame goes through
    ``transform_data`` and then ``filter_data``, is handed to ``df_to_csv``
    under the name "reddit_posts", and ``prefix_files`` is called on the
    raw directory with "transformed".
    """
    raw_posts = load_dataframes(POSTS_RAW_DIR, "reddit", ".txt", POSTS_HEADERS)
    if raw_posts is not None:
        transformed = transform_data(raw_posts)
        filtered = filter_data(transformed)
        df_to_csv(filtered, POSTS_TRANSFORMED_DIR, "reddit_posts")
        prefix_files(POSTS_RAW_DIR, "transformed")
Example #5
0
def transform_scores_data():
    """Transform every "scored" ``.txt`` file found under ``SCORED_DATA_DIR``.

    For each path returned by ``get_data_filepaths`` a dataframe is built,
    ``user_profile_image_url_https`` is renamed to
    ``user_profile_image_url``, ``scoring_time`` is converted with
    ``datetime.fromtimestamp``, and the frame is passed through
    ``normalize_scores``. The result goes to ``df_to_csv`` and the source
    file is handed to ``prefix_file`` with "transformed".
    """
    image_url_rename = {
        "user_profile_image_url_https": "user_profile_image_url",
    }
    for path in get_data_filepaths(SCORED_DATA_DIR, "scored", ".txt"):
        scores = build_dataframe(path).rename(columns=image_url_rename)
        # NOTE(review): if `datetime` is the stdlib class, fromtimestamp
        # without a tz yields naive local-time values -- confirm intended.
        scored_at = scores.scoring_time.apply(datetime.fromtimestamp)
        scores["scoring_time"] = scored_at
        normalized = normalize_scores(scores)
        df_to_csv(normalized, SCORES_CSV_DIR, "scores")
        prefix_file(path, "transformed")
Example #6
0
def transform_users_data():
    """Run the transform step over the raw reddit users data.

    The raw data is loaded with ``load_dataframes``. When that call yields
    ``None`` nothing else happens, and in particular ``log_errors`` is not
    called. Otherwise the frame goes through ``transform_data`` and
    ``filter_users_stats``, is handed to ``df_to_csv`` under the name
    "reddit_user_stats", ``prefix_files`` is called on the raw directory
    with "transformed", and ``ERRORS`` is passed to ``log_errors``.
    """
    raw_users = load_dataframes(USERS_RAW_DIR, "reddit", ".txt", USERS_HEADERS)
    if raw_users is not None:
        transformed = transform_data(raw_users)
        user_stats = filter_users_stats(transformed)
        df_to_csv(user_stats, USERS_TRANSFORMED_DIR, "reddit_user_stats")
        prefix_files(USERS_RAW_DIR, "transformed")
        log_errors(ERROR_LOG_PATH, "reddit.users_transform", ERRORS)
Example #7
0
def transform_g1_data():
    """Run the filter and transform steps over the raw "g1" data.

    The raw data is loaded with ``load_dataframes``. When that call yields
    ``None`` nothing else happens. Otherwise the frame is filtered with
    ``filter_dataframe``; only a non-empty result is passed through
    ``transform_data`` and handed to ``df_to_csv``. ``prefix_files`` is
    called on the raw directory with "transformed" whether or not any rows
    survived the filter.
    """
    # NOTE(review): the extension is passed as "txt" with no leading dot --
    # confirm that this is the form load_dataframes expects.
    loaded = load_dataframes(RAW_DIR, "g1", "txt", HEADERS)
    if loaded is None:
        return

    kept = filter_dataframe(loaded)
    has_rows = not kept.empty
    if has_rows:
        transformed = transform_data(kept)
        df_to_csv(transformed, TRANSFORMED_DIR, "g1")

    prefix_files(RAW_DIR, "transformed")
Example #8
0
def transform_entity(entity, name):
    """Build a dataframe from ``entity`` and hand it to ``df_to_csv``.

    The ``type`` column is dropped and ``collection_time`` is converted
    with ``timestamp_to_datetime``. Columns are renamed with the mapping
    stored in ``COLUMNS`` under ``name``; when ``COLUMNS`` has no (or an
    empty) mapping for ``name`` the columns are left as they are.

    Args:
        entity: Data accepted by the ``pd.DataFrame`` constructor.
        name: Key used to look up the column mapping in ``COLUMNS``; also
            passed to ``df_to_csv`` as its third argument.
    """
    df = pd.DataFrame(entity)
    df = df.drop(columns=["type"])
    df["collection_time"] = df.collection_time.apply(timestamp_to_datetime)

    # COLUMNS.get() returns None for an unmapped name, and
    # DataFrame.rename(columns=None) raises TypeError, so only rename when
    # there is a mapping to apply.
    column_map = COLUMNS.get(name)
    if column_map:
        df = df.rename(columns=column_map)

    df_to_csv(df, ENTITIES_CSV_DIR, name)