コード例 #1
0
sc.addPyFile("/home/ubuntu/data_utilities/data_cleaner.py")
from data_cleaner import DataCleaner

# Wrap the SQL context in a DataCleaner and load the questions CSV through it.
dc = DataCleaner(sqlCtx)
df = dc.read_csv("/home/ubuntu/csv/so_bq_questions.csv")

# Fields handed to drop_na_values: records lacking any of them are removed.
required_fields = [
    "question_id",
    "questioner_id",
    "question_body_length",
    "questioner_reputation",
    "questioner_up_votes",
]
df = dc.drop_na_values(dataframe=df, field_names=required_fields)

# Fields passed to fix_data_type with data_type='int'.
int_fields = [
    "question_body_length",
    "question_codeblock_count",
    "answer_count",
    "question_comment_count",
    "questioner_id",
    "questioner_up_votes",
    "questioner_down_votes",
    "accepted_answer_id",
    "questioner_reputation",
    "questioner_views",
    "max_answer_score",
]
df = dc.fix_data_type(dataframe=df, field_names=int_fields, data_type='int')

df = dc.fix_data_type(dataframe=df,
                      field_names=[
                          "questioner_account_creation_date",
コード例 #2
0
sc.addPyFile("/home/ubuntu/data_utilities/data_cleaner.py")
from data_cleaner import DataCleaner

# Wrap the SQL context in a DataCleaner and load the questions CSV through it.
dc = DataCleaner(sqlCtx)
df = dc.read_csv("/home/ubuntu/csv/so_bq_questions.csv")

# Fields handed to drop_na_values: records lacking any of them are removed.
required_fields = [
    "question_id",
    "questioner_id",
    "question_body_length",
    "questioner_reputation",
    "questioner_up_votes",
]
df = dc.drop_na_values(dataframe=df, field_names=required_fields)

# Fields passed to fix_data_type with data_type='int'.
int_fields = [
    "question_body_length",
    "question_codeblock_count",
    "answer_count",
    "question_comment_count",
    "questioner_id",
    "questioner_up_votes",
    "questioner_down_votes",
    "accepted_answer_id",
    "questioner_reputation",
    "questioner_views",
    "max_answer_score",
]
df = dc.fix_data_type(dataframe=df, field_names=int_fields, data_type='int')

# Fields passed to fix_data_type with data_type="timestamp".
timestamp_fields = [
    "questioner_account_creation_date",
    "min_answer_creation_date",
]
df = dc.fix_data_type(
    dataframe=df,
    field_names=timestamp_fields,
    data_type="timestamp",
)

# Derived column question_tags_count, based on question_tags.
df = dc.set_tag_count(
    dataframe=df,
    base_field="question_tags",
    count_field="question_tags_count",
)

# Derived column questioner_years_since_joining, from the account creation
# date (start) and the question creation date (end).
df = dc.set_years_between_dates(
    dataframe=df,
    start_date="questioner_account_creation_date",
    end_date="question_creation_date",
    years_between_field="questioner_years_since_joining",
)

# Missing question_favorite_count values are filled with 0.
df = dc.fill_na(
    dataframe=df,
    field_name="question_favorite_count",
    fill_value=0,
)

# Categorical column question_view_quantile built from question_view_count
# (10 levels, increment 0).
df = dc.create_categorical_feature(
    dataframe=df,
    base_field="question_view_count",
    categorical_field="question_view_quantile",
    levels=10,
    increment=0,
)

# Binary columns: (base field, new binary field), created in this order.
binary_features = [
    ("question_favorite_count", "question_favorited"),
    ("answer_count", "has_answer"),
]
for base_name, binary_name in binary_features:
    df = dc.create_binary_feature(
        dataframe=df,
        base_field=base_name,
        binary_field=binary_name,
    )

# Display the first 20 rows of answer_count next to the derived has_answer.
df.select("answer_count", "has_answer").show(20)
コード例 #3
0
sc.addPyFile("/home/ubuntu/data_utilities/data_cleaner.py")
from data_cleaner import DataCleaner

# Create DataCleaner object and read the users input file
dc = DataCleaner(sqlCtx)
df = dc.read_csv("/home/ubuntu/csv/so_bq_users.csv")

# Remove records that lack a user_id, user_display_name, user_reputation,
# questions_count, answers_count, or comments_count
df = dc.drop_na_values(dataframe=df,
                       field_names=[
                           "user_id", "user_display_name", "user_reputation",
                           "questions_count", "answers_count", "comments_count"
                       ])

# printSchema() writes the schema to stdout itself and returns None, so it is
# called directly; wrapping it in print() would emit an extra "None" line.
df.printSchema()

# Create categorical feature user_reputation_quantile from user_reputation
df = dc.create_categorical_feature(
    dataframe=df,
    base_field="user_reputation",
    categorical_field="user_reputation_quantile",
    levels=5,
    increment=1)

# A single formatted string prints the same text whether print is a function
# or a statement.
print("Number of records: {}".format(df.count()))

# Show count, min, max, etc. for up to 4 columns at a time
dc.show_stats(dataframe=df, batch_size=4)

# Export output to file (currently disabled)
#dc.write_output("file:///home/ubuntu/csv/BigQueryUserOutputCleaner.csv")