command = """select did, gender_new_dev as gender, forecast_age_dev as age from {}""".format(persona_table) df_persona = hive_context.sql(command) df_persona = clean_persona(df_persona) # Use keywords to clean the clicklog and showlog which do not have any keyword association. # Create ad keywords table if does not exist, else load the keywords. if create_keywords: df_keywords = generate_add_keywords(keywords_table) else: df_keywords = load_df(hive_context, keywords_table) #[Row(keyword=u'education', keyword_index=1, spread_app_id=u'C100203741')] log_table_names = (showlog_table, showlog_new_table, clicklog_table, clicklog_new_table) clean_logs(cfg, df_persona, df_keywords, log_table_names) write_to_table(df_persona, persona_new_table, mode='overwrite') if __name__ == "__main__": """ main_clean is a process to generate cleaned persona, clicklog and showlog. """ sc, hive_context, cfg = load_config( description="clean data of persona, clicklog and showlog.") run(hive_context=hive_context, cfg=cfg) sc.stop()
    tfrecords_statistics = {}
    tfrecords_statistics['distinct_records_count'] = df.count()
    save_pickle_file(tfrecords_statistics, tf_statis_path)


def save_tfrecords(hive_context, trainready_table, tfrecords_hdfs_path, tf_statis_path):
    command = """select uckey_index, media_index, media_category_index, net_type_index, gender_index,
    age_index, region_id_index, interval_starting_time, keyword_indexes as keywords,
    keyword_indexes_click_counts as click_counts, keyword_indexes_show_counts as show_counts
    from {}""".format(trainready_table)
    df = hive_context.sql(command)
    generate_tf_statistics(df, tf_statis_path)
    df.write.format("tfrecords").option(
        "recordType", "Example").mode('overwrite').save(tfrecords_hdfs_path)


if __name__ == "__main__":
    sc, hive_context, cfg = load_config(description="generate tf records")
    cfgp = cfg['pipeline']
    trainready_table = cfgp['main_trainready']['trainready_output_table']
    tfrecords_hdfs_path = cfgp['tfrecords']['tfrecords_hdfs_path']
    tf_statis_path = cfgp['tfrecords']['tfrecords_statistics_path']

    # Save selected columns of the trainready table as tfrecords.
    save_tfrecords(hive_context, trainready_table, tfrecords_hdfs_path, tf_statis_path)
    sc.stop()
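# The "tfrecords" data source used above is provided by the spark-tensorflow-connector
# package, which must be on the Spark classpath when this job is submitted. As a usage
# example (not part of the pipeline), the written records can be read back for a quick
# sanity check:
def inspect_tfrecords(hive_context, tfrecords_hdfs_path):
    # Read the Example records back into a DataFrame and show a few rows.
    df_check = hive_context.read.format("tfrecords").option(
        "recordType", "Example").load(tfrecords_hdfs_path)
    df_check.show(5)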
        batched += 1
        starting_time = batched_time_end

    # Use the temp table to hold all the batched logs with the region id added, then
    # drop the logs table and rename the temp table to the logs table name.
    drop_table(hive_context, logs_table)
    command = """alter table {} rename to {}""".format(logs_table_temp_name, logs_table)
    hive_context.sql(command)

    timer_end = timeit.default_timer()
    print('Total batching seconds: ' + str(timer_end - timer_start))


def run(hive_context, cfg):
    batch_config = load_batch_config(cfg)
    cfg_logs = cfg['pipeline']['main_logs']
    logs_table = cfg_logs['logs_output_table_name']

    # Add region ids to the logs.
    add_region_to_logs(hive_context, batch_config, logs_table)


if __name__ == "__main__":
    """
    This is an optional step, needed only when the logs data has no region information.
    If the original logs already have geo info or a region field (ipl or r), skip this step.
    """
    sc, hive_context, cfg = load_config(description="main logs with regions")
    run(hive_context=hive_context, cfg=cfg)
    sc.stop()
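# A minimal sketch of the drop_table helper used above, assuming it simply issues a
# DROP TABLE IF EXISTS statement so the batched temp table can take over the original
# name via ALTER TABLE ... RENAME (the project's actual helper may differ):
def drop_table_sketch(hive_context, table_name):
    hive_context.sql("drop table if exists {}".format(table_name))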
    write_to_table(df_trainready, trainready_table, mode='overwrite')

    timer_end = timeit.default_timer()
    print('Total batching seconds: ' + str(timer_end - timer_start))

    return df_trainready


def run(hive_context, cfg):
    cfg_logs = cfg['pipeline']['main_logs']
    logs_table_name = cfg_logs['logs_output_table_name']
    interval_time_in_seconds = cfg_logs['interval_time_in_seconds']
    cfg_train = cfg['pipeline']['main_trainready']
    trainready_table = cfg_train['trainready_output_table']
    batch_config = load_batch_config(cfg)

    generate_trainready(hive_context, batch_config, interval_time_in_seconds,
                        logs_table_name, trainready_table)


if __name__ == "__main__":
    """
    This program performs the following:
    adds normalized data by adding an index for each feature
    groups the data into time intervals and ucdocs (labeled by uckey)
    """
    sc, hive_context, cfg = load_config(
        description="pre-processing train ready data")
    run(hive_context=hive_context, cfg=cfg)
    sc.stop()
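# The *_index feature columns produced by this step (e.g. gender_index, age_index) map
# each categorical value to an integer id. A hedged sketch of one way to build such an
# index with pyspark.ml's StringIndexer, assuming an input column named 'gender'; the
# project's actual indexing logic may use its own value-to-id mapping instead:
from pyspark.ml.feature import StringIndexer


def add_gender_index_sketch(df):
    # Fit a value-to-index mapping on the column and append it as 'gender_index'.
    indexer = StringIndexer(inputCol='gender', outputCol='gender_index')
    return indexer.fit(df).transform(df)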
    # Prepare parameters for processing the batched logs.
    cfg_clean = cfg['pipeline']['main_clean']
    cfg_clean_output = cfg_clean['data_output']
    batch_config = load_batch_config(cfg)
    clicklog_table_name = cfg_clean_output['clicklog_output_table']
    showlog_table_name = cfg_clean_output['showlog_output_table']
    cfg_logs = cfg['pipeline']['main_logs']
    logs_table_name = cfg_logs['logs_output_table_name']
    interval_time_in_seconds = cfg_logs['interval_time_in_seconds']

    log_table_names = (showlog_table_name, clicklog_table_name, logs_table_name)
    join_logs(hive_context, batch_config, interval_time_in_seconds, log_table_names)


if __name__ == "__main__":
    """
    This program performs the following:
    unions the show and click logs
    adds time_interval related data for batch processing
    adds the uckey
    """
    sc, hive_context, cfg = load_config(description="pre-processing logs data")
    run(hive_context=hive_context, cfg=cfg)
    sc.stop()
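# The time_interval data added by this step typically means bucketing each log timestamp
# into fixed-size intervals of interval_time_in_seconds. A hedged sketch, assuming a
# unix-timestamp column named 'action_time' (hypothetical name); the project's actual
# interval logic may differ:
from pyspark.sql.functions import col, floor


def add_interval_starting_time_sketch(df, interval_time_in_seconds):
    # Round each timestamp down to the start of its interval.
    return df.withColumn(
        'interval_starting_time',
        floor(col('action_time') / interval_time_in_seconds) * interval_time_in_seconds)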