def clean_logs(cfg, df_persona, df_keywords, log_table_names):
    sc = SparkContext.getOrCreate()
    sc.setLogLevel(cfg['log']['level'])
    hive_context = HiveContext(sc)
    cfg_clean = cfg['pipeline']['main_clean']
    conditions = cfg_clean['conditions']
    start_date, end_date, load_minutes = load_batch_config(cfg)
    timer_start = timeit.default_timer()

    showlog_table, showlog_output_table, clicklog_table, clicklog_output_table = log_table_names
    starting_time = datetime.strptime(start_date, "%Y-%m-%d")
    ending_time = datetime.strptime(end_date, "%Y-%m-%d")

    batched_round = 1
    while starting_time < ending_time:
        # Bound this batch to [time_start, time_end), clamped to the overall end date.
        time_start = starting_time.strftime("%Y-%m-%d %H:%M:%S")
        batch_time_end = starting_time + timedelta(minutes=load_minutes)
        batch_time_end = min(batch_time_end, ending_time)
        time_end = batch_time_end.strftime("%Y-%m-%d %H:%M:%S")
        print_batching_info("Main clean", batched_round, time_start, time_end)

        command = """select did, adv_id, adv_type as media, slot_id, spread_app_id,
                     device_name, net_type, adv_bill_mode_cd as price_model,
                     {time} as action_time
                     from {table}
                     where {time} >= '{time_start}' and {time} < '{time_end}'"""
        df_clicklog_batched = hive_context.sql(
            command.format(time='click_time', table=clicklog_table,
                           time_start=time_start, time_end=time_end))
        df_showlog_batched = hive_context.sql(
            command.format(time='show_time', table=showlog_table,
                           time_start=time_start, time_end=time_end))

        # Overwrite the output tables on the first batch, append on later ones.
        mode = 'overwrite' if batched_round == 1 else 'append'

        is_empty_showlog_batched = df_showlog_batched.rdd.isEmpty()
        if not is_empty_showlog_batched:
            df_showlog_batched = clean_batched_log(
                df_showlog_batched, df_persona, conditions, df_keywords)
            write_to_table(df_showlog_batched, showlog_output_table, mode=mode)

        is_empty_clicklog_batched = df_clicklog_batched.rdd.isEmpty()
        if not is_empty_clicklog_batched:
            df_clicklog_batched = clean_batched_log(
                df_clicklog_batched, df_persona, conditions, df_keywords)
            write_to_table(df_clicklog_batched, clicklog_output_table, mode=mode)

        batched_round += 1
        starting_time = batch_time_end

    timer_end = timeit.default_timer()
    print('Total batching seconds: ' + str(timer_end - timer_start))
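
# Minimal sketch, separate from the pipeline code, of the time-window batching
# used by clean_logs (and, minus the clamping, by the other batched loops in
# this module). The helper name is illustrative only; it relies on the module's
# existing datetime/timedelta imports.
def _iter_batch_windows(start_date, end_date, load_minutes):
    fmt = "%Y-%m-%d %H:%M:%S"
    window_start = datetime.strptime(start_date, "%Y-%m-%d")
    window_stop = datetime.strptime(end_date, "%Y-%m-%d")
    while window_start < window_stop:
        window_end = min(window_start + timedelta(minutes=load_minutes), window_stop)
        yield window_start.strftime(fmt), window_end.strftime(fmt)
        window_start = window_end

# Example: list(_iter_batch_windows('2020-01-01', '2020-01-02', 720)) yields the
# two 12-hour windows covering that day.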
def add_region_to_logs(hive_context, batch_config, logs_table):
    start_date, end_date, load_minutes = batch_config
    timer_start = timeit.default_timer()
    batched = 1
    starting_time = datetime.strptime(start_date, "%Y-%m-%d")
    ending_time = datetime.strptime(end_date, "%Y-%m-%d")
    logs_table_temp_name = logs_table + '_temp'

    while starting_time < ending_time:
        # Select this batch's window from the logs table and assign region ids.
        time_start_str = starting_time.strftime("%Y-%m-%d %H:%M:%S")
        batched_time_end = starting_time + timedelta(minutes=load_minutes)
        time_end_str = batched_time_end.strftime("%Y-%m-%d %H:%M:%S")
        print_batching_info("Main regions", batched, time_start_str, time_end_str)

        command = """select * from {} where action_time >= '{}' and action_time < '{}'"""
        logs = hive_context.sql(
            command.format(logs_table, time_start_str, time_end_str))

        # fit_distribution assigns a region index in an 'index' column; rename it
        # to region_id and rebuild the uckey so it includes the region.
        logs = logs.drop(col('region_id'))
        logs = fit_distribution(logs)
        logs = logs.withColumnRenamed('index', 'region_id')
        logs = logs.withColumn(
            'uckey',
            concat_ws(",", col('media'), col('media_category'), col('net_type'),
                      col('gender'), col('age'), col('region_id')))

        mode = 'overwrite' if batched == 1 else 'append'
        write_to_table(logs, logs_table_temp_name, mode=mode)
        batched += 1
        starting_time = batched_time_end

    # The temp table now holds all the batched logs with region ids.
    # Drop the original logs table and rename the temp table to the logs table.
    drop_table(hive_context, logs_table)
    command = """alter table {} rename to {}""".format(logs_table_temp_name, logs_table)
    hive_context.sql(command)

    timer_end = timeit.default_timer()
    print('Total batching seconds: ' + str(timer_end - timer_start))
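
# Minimal, self-contained sketch (not part of the pipeline) of the uckey built
# in add_region_to_logs: the user-context key is the comma-joined attribute
# columns, in a fixed order that collect_trainready later relies on when it
# splits the string back apart. The helper name and all values are made up.
def _uckey_sketch():
    from pyspark.sql import SparkSession
    spark = SparkSession.builder.master('local[1]').appName('uckey_sketch').getOrCreate()
    columns = ['media', 'media_category', 'net_type', 'gender', 'age', 'region_id']
    df = spark.createDataFrame([('native', 'game-avg', '4g', '1', '3', '71')], columns)
    df = df.withColumn('uckey', concat_ws(",", *[col(c) for c in columns]))
    df.show(truncate=False)  # uckey -> 'native,game-avg,4g,1,3,71'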
def generate_trainready(hive_context, batch_config, interval_time_in_seconds,
                        logs_table_name, trainready_table):

    def index_df_trainready(df):
        # Add integer index columns for the categorical columns; the original
        # string columns are kept alongside them.
        df = add_index(df, "uckey", "uckey_index", drop_column=False)
        df = add_index(df, "media", "media_index", drop_column=False)
        df = add_index(df, "media_category", "media_category_index", drop_column=False)
        df = add_index(df, "net_type", "net_type_index", drop_column=False)
        df = add_index(df, "gender", "gender_index", drop_column=False)
        df = add_index(df, "age", "age_index", drop_column=False)
        df = add_index(df, "region_id", "region_id_index", drop_column=False)
        return df

    def group_batched_logs(logs):
        # Group logs by uckey + interval_starting_time + keyword.
        # group 1: group by uckey + interval_starting_time + keyword
        df = logs.groupBy('uckey', 'interval_starting_time', 'keyword_index').agg(
            first('keyword').alias('keyword'),
            fn.sum(col('is_click')).alias('kw_clicks_count'),
            # Count only the show rows (is_click == 0).
            fn.sum(fn.when(col('is_click') == 0, 1).otherwise(0)).alias('kw_shows_count'))
        df = df.withColumn(
            'kwi_clicks_count', concat_ws(":", col('keyword_index'), col('kw_clicks_count')))
        df = df.withColumn(
            'kwi_shows_count', concat_ws(":", col('keyword_index'), col('kw_shows_count')))
        df = df.withColumn(
            'kw_clicks_count', concat_ws(":", col('keyword'), col('kw_clicks_count')))
        df = df.withColumn(
            'kw_shows_count', concat_ws(":", col('keyword'), col('kw_shows_count')))

        # group 2: group by uckey + interval_starting_time
        df = df.groupBy('uckey', 'interval_starting_time').agg(
            concat_ws(",", collect_list('keyword_index')).alias('kwi'),
            concat_ws(",", collect_list('kwi_clicks_count')).alias('kwi_click_counts'),
            concat_ws(",", collect_list('kwi_shows_count')).alias('kwi_show_counts'),
            concat_ws(",", collect_list('keyword')).alias('interval_keywords'),
            concat_ws(",", collect_list('kw_clicks_count')).alias('kw_click_counts'),
            concat_ws(",", collect_list('kw_shows_count')).alias('kw_show_counts'))
        return df

    def collect_trainready(df_trainready_batched_temp):
        # group 3: group by uckey over the batched uckey-interval rows.
        df = df_trainready_batched_temp
        # TODO: for performance, the global sort could be dropped and replaced by
        # sorting the collected intervals within each uckey.
        df = df.orderBy([col('uckey'), col('interval_starting_time').desc()])
        df = df.groupBy('uckey').agg(
            collect_list('interval_starting_time').alias('interval_starting_time'),
            collect_list('kwi').alias('keyword_indexes'),
            collect_list('kwi_click_counts').alias('keyword_indexes_click_counts'),
            collect_list('kwi_show_counts').alias('keyword_indexes_show_counts'),
            collect_list('interval_keywords').alias('keywords'),
            collect_list('kw_click_counts').alias('keywords_click_counts'),
            collect_list('kw_show_counts').alias('keywords_show_counts'))

        # Split the uckey back into its attribute columns.
        uckey_split_col = fn.split(df['uckey'], ',')
        df = df.withColumn('media', uckey_split_col.getItem(0))
        df = df.withColumn('media_category', uckey_split_col.getItem(1))
        df = df.withColumn('net_type', uckey_split_col.getItem(2))
        df = df.withColumn('gender', uckey_split_col.getItem(3))
        df = df.withColumn('age', uckey_split_col.getItem(4))
        df = df.withColumn('region_id', uckey_split_col.getItem(5))
        df = df.withColumn('gender', df['gender'].cast(IntegerType()))
        df = df.withColumn('age', df['age'].cast(IntegerType()))
        df = df.withColumn('region_id', df['region_id'].cast(IntegerType()))
        return df

    trainready_table_temp = trainready_table + '_temp'
    timer_start = timeit.default_timer()
    start_date, end_date, load_minutes = batch_config
    starting_time_sec = int(
        datetime.strptime(start_date, "%Y-%m-%d").strftime("%s"))
    ending_time_sec = int(
        datetime.strptime(end_date, "%Y-%m-%d").strftime("%s"))

    batched_round = 1
    while starting_time_sec < ending_time_sec:
        batched_time_end_sec = starting_time_sec + \
            timedelta(minutes=load_minutes).total_seconds()

        # Find the intervals that have any activity inside this batch window.
        command = """select distinct interval_starting_time from {}
                     where action_time_seconds between {} and {}"""
        intervals = hive_context.sql(
            command.format(logs_table_name, starting_time_sec, batched_time_end_sec)).collect()
        intervals = [_['interval_starting_time'] for _ in intervals]
        if not intervals:
            # No logs fall inside this window; move on to the next one.
            starting_time_sec = int(batched_time_end_sec)
            continue
        intervals.sort()

        command = """select * from {} where interval_starting_time between {} and {}"""
        start_time = intervals[0]
        end_time = intervals[-1]
        logs = hive_context.sql(
            command.format(logs_table_name, start_time, end_time))
        print_batching_info("Train ready", batched_round, str(start_time), str(end_time))

        df_trainready = group_batched_logs(logs)
        mode = 'overwrite' if batched_round == 1 else 'append'
        write_to_table(df_trainready, trainready_table_temp, mode=mode)
        batched_round += 1

        # Advance the start to just past the last action time seen in the final
        # interval of this batch, so the next batch does not re-read those rows.
        starting_time_sec = logs.filter('interval_starting_time == {}'.format(
            intervals[-1])).agg(fn.max('action_time_seconds')).take(1)[0][0]
        starting_time_sec += 1

    # Load the batched trainready data and merge the rows that share a uckey.
    df_trainready_batched_temp = load_df(hive_context, trainready_table_temp)
    df_trainready = collect_trainready(df_trainready_batched_temp)
    df_trainready = index_df_trainready(df_trainready)
    write_to_table(df_trainready, trainready_table, mode='overwrite')

    timer_end = timeit.default_timer()
    print('Total batching seconds: ' + str(timer_end - timer_start))
    return df_trainready
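
# Minimal sketch (not part of the pipeline) of the two-level aggregation done by
# group_batched_logs, on a few made-up rows. It shows the string format stored
# per (uckey, interval): comma-joined 'keyword_index:count' pairs. The helper
# name and values are illustrative only; it needs a local Spark session.
def _group_batched_logs_sketch():
    from pyspark.sql import SparkSession
    spark = SparkSession.builder.master('local[1]').appName('group_sketch').getOrCreate()
    rows = [
        # (uckey, interval_starting_time, keyword, keyword_index, is_click)
        ('native,game-avg,4g,1,3,71', 1577836800, 'shopping', 5, 0),
        ('native,game-avg,4g,1,3,71', 1577836800, 'shopping', 5, 1),
        ('native,game-avg,4g,1,3,71', 1577836800, 'travel', 9, 0),
    ]
    df = spark.createDataFrame(
        rows, ['uckey', 'interval_starting_time', 'keyword', 'keyword_index', 'is_click'])
    # group 1: per-keyword click/show counts inside each (uckey, interval).
    df = df.groupBy('uckey', 'interval_starting_time', 'keyword_index').agg(
        fn.sum('is_click').alias('clicks'),
        fn.sum(fn.when(col('is_click') == 0, 1).otherwise(0)).alias('shows'))
    df = df.withColumn('kwi_clicks', concat_ws(":", col('keyword_index'), col('clicks')))
    df = df.withColumn('kwi_shows', concat_ws(":", col('keyword_index'), col('shows')))
    # group 2: collapse the keywords of each (uckey, interval) into one row.
    df = df.groupBy('uckey', 'interval_starting_time').agg(
        concat_ws(",", collect_list('kwi_clicks')).alias('kwi_click_counts'),
        concat_ws(",", collect_list('kwi_shows')).alias('kwi_show_counts'))
    df.show(truncate=False)
    # kwi_click_counts -> '5:1,9:0' and kwi_show_counts -> '5:1,9:1'
    # (keyword order within the string may vary).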
def join_logs(hive_context, batch_config, interval_time_in_seconds, log_table_names):

    def union_logs(df_clicklog, df_showlog):
        # union click log and show log.
        columns = ['did', 'is_click', 'action_time', 'keyword', 'keyword_index',
                   'media', 'media_category', 'net_type', 'gender', 'age', 'adv_id']
        df_clicklog = df_clicklog.withColumn('is_click', lit(1))
        df_clicklog = df_clicklog.select(columns)
        df_showlog = df_showlog.withColumn('is_click', lit(0))
        df_showlog = df_showlog.select(columns)
        df_unionlog = df_showlog.union(df_clicklog)
        return df_unionlog

    def transform_action_time(df_logs, interval_time_in_seconds):
        _udf_time = udf(
            lambda x: int(
                datetime.strptime(x, '%Y-%m-%d %H:%M:%S.%f').strftime("%s")),
            IntegerType())
        df_logs = df_logs.withColumn('action_time_seconds', _udf_time(col('action_time')))

        _udf_interval_time = udf(
            lambda x: x - x % interval_time_in_seconds, IntegerType())
        df_logs = df_logs.withColumn(
            'interval_starting_time', _udf_interval_time(col('action_time_seconds')))
        return df_logs

    timer_start = timeit.default_timer()
    start_date, end_date, load_minutes = batch_config
    starting_time = datetime.strptime(start_date, "%Y-%m-%d")
    ending_time = datetime.strptime(end_date, "%Y-%m-%d")
    showlog_table_name, clicklog_table_name, logs_table_name = log_table_names

    batched_round = 1
    while starting_time < ending_time:
        batched_time_start_str = starting_time.strftime("%Y-%m-%d %H:%M:%S")
        batched_time_end = starting_time + \
            timedelta(minutes=load_minutes)
        batched_time_end_str = batched_time_end.strftime("%Y-%m-%d %H:%M:%S")
        print_batching_info("Main logs", batched_round,
                            batched_time_start_str, batched_time_end_str)

        command = """select did, action_time, keyword, keyword_index, media,
                     media_category, net_type, gender, age, adv_id from {}
                     where action_time >= '{}' and action_time < '{}'"""
        df_clicklog_batched = hive_context.sql(
            command.format(clicklog_table_name,
                           batched_time_start_str, batched_time_end_str))
        df_showlog_batched = hive_context.sql(
            command.format(showlog_table_name,
                           batched_time_start_str, batched_time_end_str))

        df_logs_batched = union_logs(df_clicklog_batched, df_showlog_batched)
        df_logs_batched = transform_action_time(df_logs_batched, interval_time_in_seconds)
        df_logs_batched = df_logs_batched.withColumn(
            'uckey',
            concat_ws(",", col('media'), col('media_category'), col('net_type'),
                      col('gender'), col('age')))

        mode = 'overwrite' if batched_round == 1 else 'append'
        write_to_table(df_logs_batched, logs_table_name, mode=mode)
        batched_round += 1
        starting_time = batched_time_end

    timer_end = timeit.default_timer()
    print('Total batching seconds: ' + str(timer_end - timer_start))
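
# Minimal sketch, separate from the pipeline, of the interval bucketing that
# transform_action_time performs: epoch seconds are floored to the start of the
# interval they fall into. The helper name and timestamps are illustrative only.
def _interval_start(action_time_seconds, interval_time_in_seconds):
    # Same arithmetic as the _udf_interval_time lambda in join_logs.
    return action_time_seconds - action_time_seconds % interval_time_in_seconds

# Example: with one-day intervals (86400 seconds),
#   _interval_start(1577923199, 86400) == 1577836800
# i.e. 2020-01-01 23:59:59 UTC is bucketed into the interval starting at
# 2020-01-01 00:00:00 UTC.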