sc.addPyFile("/home/ubuntu/data_utilities/data_cleaner.py") from data_cleaner import DataCleaner # Read input file and create DataCleaner object dc = DataCleaner(sqlCtx) df = dc.read_csv("/home/ubuntu/csv/so_bq_questions.csv") # Remove records that lack a question_id, questioner_id, question_body_length, questioner_reputation, or questioner_up_votes df = dc.drop_na_values(dataframe=df, field_names=[ "question_id", "questioner_id", "question_body_length", "questioner_reputation", "questioner_up_votes" ]) # Fix data types df = dc.fix_data_type(dataframe=df, field_names=[ "question_body_length", "question_codeblock_count", "answer_count", "question_comment_count", "questioner_id", "questioner_up_votes", "questioner_down_votes", "accepted_answer_id", "questioner_reputation", "questioner_views", "max_answer_score" ], data_type='int') df = dc.fix_data_type(dataframe=df, field_names=[ "questioner_account_creation_date",
sc.addPyFile("/home/ubuntu/data_utilities/data_cleaner.py") from data_cleaner import DataCleaner # Read input file and create DataCleaner object dc = DataCleaner(sqlCtx) df = dc.read_csv("/home/ubuntu/csv/so_bq_questions.csv") # Remove records that lack a question_id, questioner_id, question_body_length, questioner_reputation, or questioner_up_votes df = dc.drop_na_values(dataframe=df,field_names=["question_id","questioner_id","question_body_length","questioner_reputation","questioner_up_votes"]) # Fix data types df = dc.fix_data_type(dataframe=df, field_names=["question_body_length","question_codeblock_count","answer_count","question_comment_count","questioner_id","questioner_up_votes", "questioner_down_votes","accepted_answer_id","questioner_reputation","questioner_views","max_answer_score"], data_type='int') df = dc.fix_data_type(dataframe=df, field_names=["questioner_account_creation_date","min_answer_creation_date"], data_type="timestamp") df = dc.set_tag_count(dataframe=df, base_field="question_tags", count_field="question_tags_count") df = dc.set_years_between_dates(dataframe=df, start_date="questioner_account_creation_date", end_date="question_creation_date", years_between_field="questioner_years_since_joining") df = dc.fill_na(dataframe=df, field_name="question_favorite_count", fill_value=0) # Create categorical feature question_view_quantile from question_view_count df = dc.create_categorical_feature(dataframe=df, base_field="question_view_count", categorical_field="question_view_quantile", levels=10, increment=0) df = dc.create_binary_feature(dataframe=df, base_field="question_favorite_count", binary_field="question_favorited") df = dc.create_binary_feature(dataframe=df, base_field="answer_count", binary_field="has_answer") df.select("answer_count","has_answer").show(20)
sc.addPyFile("/home/ubuntu/data_utilities/data_cleaner.py") from data_cleaner import DataCleaner # Create DataCleaner object and read input file dc = DataCleaner(sqlCtx) df = dc.read_csv("/home/ubuntu/csv/so_bq_users.csv") # Remove records that lack a user_id, user_display_name, user_reputation, questions_count, answers_count, or comments_count df = dc.drop_na_values(dataframe=df, field_names=[ "user_id", "user_display_name", "user_reputation", "questions_count", "answers_count", "comments_count" ]) print(df.printSchema()) # Create categorical feature user_reputation_quantile from user_reputation df = dc.create_categorical_feature( dataframe=df, base_field="user_reputation", categorical_field="user_reputation_quantile", levels=5, increment=1) print("Number of records:", df.count()) # Show count, min, max, etc. for up to 4 columns at a time dc.show_stats(dataframe=df, batch_size=4) # Export output to file #dc.write_output("file:///home/ubuntu/csv/BigQueryUserOutputCleaner.csv")