Example 1
import timeit
from datetime import datetime, timedelta

from pyspark import SparkContext
from pyspark.sql import HiveContext
from pyspark.sql import functions as fn
from pyspark.sql.functions import (col, collect_list, concat_ws, first, lit,
                                   udf)
from pyspark.sql.types import IntegerType


# Imports above are shared by all four examples. Helpers such as
# load_batch_config, clean_batched_log, fit_distribution, add_index and
# load_df come from the surrounding module.
def clean_logs(cfg, df_persona, df_keywords, log_table_names):
    sc = SparkContext.getOrCreate()
    sc.setLogLevel(cfg['log']['level'])
    hive_context = HiveContext(sc)
    cfg_clean = cfg['pipeline']['main_clean']
    conditions = cfg_clean['conditions']
    start_date, end_date, load_minutes = load_batch_config(cfg)

    timer_start = timeit.default_timer()
    showlog_table, showlog_output_table, clicklog_table, clicklog_output_table = log_table_names
    starting_time = datetime.strptime(start_date, "%Y-%m-%d")
    ending_time = datetime.strptime(end_date, "%Y-%m-%d")

    batched_round = 1
    while starting_time < ending_time:
        time_start = starting_time.strftime("%Y-%m-%d %H:%M:%S")
        batch_time_end = starting_time + timedelta(minutes=load_minutes)
        batch_time_end = min(batch_time_end, ending_time)
        time_end = batch_time_end.strftime("%Y-%m-%d %H:%M:%S")
        print_batching_info("Main clean", batched_round, time_start, time_end)

        command = """select did, adv_id, adv_type as media, slot_id, 
                    spread_app_id, device_name, net_type, 
                    adv_bill_mode_cd as price_model, {time} as action_time 
                    from {table} where {time} >= '{time_start}' and {time} < '{time_end}'"""

        df_clicklog_batched = hive_context.sql(
            command.format(time='click_time',
                           table=clicklog_table,
                           time_start=time_start,
                           time_end=time_end))

        df_showlog_batched = hive_context.sql(
            command.format(time='show_time',
                           table=showlog_table,
                           time_start=time_start,
                           time_end=time_end))

        mode = 'overwrite' if batched_round == 1 else 'append'
        is_empty_showlog_batched = df_showlog_batched.rdd.isEmpty()
        if not is_empty_showlog_batched:
            df_showlog_batched = clean_batched_log(df_showlog_batched,
                                                   df_persona, conditions,
                                                   df_keywords)
            write_to_table(df_showlog_batched, showlog_output_table, mode=mode)
        is_empty_clicklog_batched = df_clicklog_batched.rdd.isEmpty()
        if not is_empty_clicklog_batched:
            df_clicklog_batched = clean_batched_log(df_clicklog_batched,
                                                    df_persona, conditions,
                                                    df_keywords)
            write_to_table(df_clicklog_batched,
                           clicklog_output_table,
                           mode=mode)

        batched_round += 1
        starting_time = batch_time_end

    timer_end = timeit.default_timer()
    print('Total batching seconds: ' + str(timer_end - timer_start))
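
A hypothetical call, inferred from the keys clean_logs reads; the
conditions payload and all table names are placeholders, and df_persona /
df_keywords are assumed to be DataFrames loaded elsewhere:

cfg = {
    'log': {'level': 'WARN'},
    # the shape of 'conditions' is whatever clean_batched_log consumes;
    # load_batch_config(cfg) presumably reads the start/end dates and the
    # batch size in minutes from this dict as well.
    'pipeline': {'main_clean': {'conditions': {}}},
}
log_table_names = ('ads_showlog', 'showlog_cleaned',
                   'ads_clicklog', 'clicklog_cleaned')
clean_logs(cfg, df_persona, df_keywords, log_table_names)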
Example 2
def add_region_to_logs(hive_context, batch_config, logs_table):
    start_date, end_date, load_minutes = batch_config
    timer_start = timeit.default_timer()
    batched = 1
    starting_time = datetime.strptime(start_date, "%Y-%m-%d")
    ending_time = datetime.strptime(end_date, "%Y-%m-%d")
    logs_table_temp_name = logs_table + '_temp'
    while starting_time < ending_time:
        # add region ids to one batch window of the joined logs table.
        time_start_str = starting_time.strftime("%Y-%m-%d %H:%M:%S")
        batched_time_end = starting_time + timedelta(minutes=load_minutes)
        time_end_str = batched_time_end.strftime("%Y-%m-%d %H:%M:%S")
        print_batching_info("Main regions", batched, time_start_str,
                            time_end_str)
        command = """select * from {} where action_time >= '{}' and action_time < '{}'"""
        logs = hive_context.sql(
            command.format(logs_table, time_start_str, time_end_str))
        logs = logs.drop('region_id')
        logs = fit_distribution(logs)
        logs = logs.withColumnRenamed('index', 'region_id')
        logs = logs.withColumn(
            'uckey',
            concat_ws(",", col('media'), col('media_category'),
                      col('net_type'), col('gender'), col('age'),
                      col('region_id')))

        mode = 'overwrite' if batched == 1 else 'append'
        write_to_table(logs, logs_table_temp_name, mode=mode)
        batched += 1
        starting_time = batched_time_end

    # all batches, now carrying region ids, sit in the temp table;
    # drop the original logs table and rename the temp table over it.
    drop_table(hive_context, logs_table)
    command = """alter table {} rename to {}""".format(logs_table_temp_name,
                                                       logs_table)
    hive_context.sql(command)

    timer_end = timeit.default_timer()
    print('Total batching seconds: ' + str(timer_end - timer_start))
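
drop_table, print_batching_info and write_to_table are called throughout
but not defined in this listing; minimal stand-ins consistent with the
call sites (the real implementations may differ):

def drop_table(hive_context, table_name):
    # remove the table if it exists so the rename above cannot collide.
    hive_context.sql('drop table if exists {}'.format(table_name))


def print_batching_info(job_name, batch_round, time_start, time_end):
    # one progress line per batch window.
    print('{} batch {}: [{}, {})'.format(job_name, batch_round,
                                         time_start, time_end))


def write_to_table(df, table_name, mode='overwrite'):
    # persist a batch; callers pass 'overwrite' on round 1, 'append' after.
    df.write.mode(mode).saveAsTable(table_name)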
Example 3
def generate_trainready(hive_context, batch_config, interval_time_in_seconds,
                        logs_table_name, trainready_table):
    def index_df_trainready(df):
        # map each categorical column to a corresponding *_index column.
        df = add_index(df, "uckey", "uckey_index", drop_column=False)
        df = add_index(df, "media", "media_index", drop_column=False)
        df = add_index(df,
                       "media_category",
                       "media_category_index",
                       drop_column=False)
        df = add_index(df, "net_type", "net_type_index", drop_column=False)
        df = add_index(df, "gender", "gender_index", drop_column=False)
        df = add_index(df, "age", "age_index", drop_column=False)
        df = add_index(df, "region_id", "region_id_index", drop_column=False)
        return df

    def group_batched_logs(logs):
        # aggregate click and show counts per uckey + interval + keyword.
        # group 1: group by uckey + interval_starting_time + keyword_index
        df = logs.groupBy(
            'uckey', 'interval_starting_time', 'keyword_index').agg(
                first('keyword').alias('keyword'),
                fn.sum(col('is_click')).alias('kw_clicks_count'),
                # count() skips nulls, so only show rows (is_click == 0)
                # are counted; with .otherwise(0) every row would count.
                fn.count(fn.when(col('is_click') == 0,
                                 1)).alias('kw_shows_count'))
        df = df.withColumn(
            'kwi_clicks_count',
            concat_ws(":", col('keyword_index'), col('kw_clicks_count')))
        df = df.withColumn(
            'kwi_shows_count',
            concat_ws(":", col('keyword_index'), col('kw_shows_count')))
        df = df.withColumn(
            'kw_clicks_count',
            concat_ws(":", col('keyword'), col('kw_clicks_count')))
        df = df.withColumn(
            'kw_shows_count',
            concat_ws(":", col('keyword'), col('kw_shows_count')))

        # group 2: group by uckey + interval_starting_time
        df = df.groupBy('uckey', 'interval_starting_time').agg(
            concat_ws(",", collect_list('keyword_index')).alias('kwi'),
            concat_ws(
                ",",
                collect_list('kwi_clicks_count')).alias('kwi_click_counts'),
            concat_ws(
                ",", collect_list('kwi_shows_count')).alias('kwi_show_counts'),
            concat_ws(",", collect_list('keyword')).alias('interval_keywords'),
            concat_ws(
                ",", collect_list('kw_clicks_count')).alias('kw_click_counts'),
            concat_ws(",",
                      collect_list('kw_shows_count')).alias('kw_show_counts'))
        return df
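
    # After group 2 each row is one (uckey, interval) pair; e.g. keyword
    # indexes 3 and 7 with 2 and 5 clicks give kwi "3,7" and
    # kwi_click_counts "3:2,7:5". collect_trainready below then gathers
    # these strings into per-uckey lists.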

    def collect_trainready(df_trainready_batched_temp):
        # group 3: group by uckey with the temp batched uckey-interval rows.

        df = df_trainready_batched_temp

        # sort so each uckey's rows arrive newest-interval-first for the
        # collect_list aggregations below.
        # TODO: a global sort is expensive; sorting within each uckey
        # would scale better.
        df = df.orderBy([col('uckey'), col('interval_starting_time').desc()])

        df = df.groupBy('uckey').agg(
            collect_list('interval_starting_time').alias(
                'interval_starting_time'),
            collect_list('kwi').alias('keyword_indexes'),
            collect_list('kwi_click_counts').alias(
                'keyword_indexes_click_counts'),
            collect_list('kwi_show_counts').alias(
                'keyword_indexes_show_counts'),
            collect_list('interval_keywords').alias('keywords'),
            collect_list('kw_click_counts').alias('keywords_click_counts'),
            collect_list('kw_show_counts').alias('keywords_show_counts'))

        uckey_split_col = fn.split(df['uckey'], ',')

        df = df.withColumn('media', uckey_split_col.getItem(0))
        df = df.withColumn('media_category', uckey_split_col.getItem(1))
        df = df.withColumn('net_type', uckey_split_col.getItem(2))
        df = df.withColumn('gender', uckey_split_col.getItem(3))
        df = df.withColumn('age', uckey_split_col.getItem(4))
        df = df.withColumn('region_id', uckey_split_col.getItem(5))

        df = df.withColumn('gender', df['gender'].cast(IntegerType()))
        df = df.withColumn('age', df['age'].cast(IntegerType()))
        df = df.withColumn('region_id', df['region_id'].cast(IntegerType()))
        return df

    trainready_table_temp = trainready_table + '_temp'
    timer_start = timeit.default_timer()
    start_date, end_date, load_minutes = batch_config

    # epoch seconds; naive datetimes are interpreted in local time.
    starting_time_sec = int(
        datetime.strptime(start_date, "%Y-%m-%d").timestamp())
    ending_time_sec = int(
        datetime.strptime(end_date, "%Y-%m-%d").timestamp())

    batched_round = 1
    while starting_time_sec < ending_time_sec:
        batched_time_end_sec = starting_time_sec + int(
            timedelta(minutes=load_minutes).total_seconds())

        command = """select distinct interval_starting_time from {} 
                     where action_time_seconds between {} and {}"""
        intervals = hive_context.sql(
            command.format(logs_table_name, starting_time_sec,
                           batched_time_end_sec)).collect()
        intervals = sorted(r['interval_starting_time'] for r in intervals)
        if not intervals:
            # nothing logged in this window; move on to the next one.
            starting_time_sec = batched_time_end_sec
            continue
        command = """select * from {} where interval_starting_time between {} and {}"""
        start_time = intervals[0]
        end_time = intervals[-1]
        logs = hive_context.sql(
            command.format(logs_table_name, start_time, end_time))
        print_batching_info("Train ready", batched_round, str(start_time),
                            str(end_time))

        df_trainready = group_batched_logs(logs)
        mode = 'overwrite' if batched_round == 1 else 'append'
        write_to_table(df_trainready, trainready_table_temp, mode=mode)
        batched_round += 1

        # advance the window: resume one second after the newest action
        # time inside the last interval that was just processed.
        starting_time_sec = logs.filter('interval_starting_time == {}'.format(
            intervals[-1])).agg(fn.max('action_time_seconds')).first()[0]
        starting_time_sec += 1

    # load the batched trainready data and merge them with the same uckey.
    df_trainready_batched_temp = load_df(hive_context, trainready_table_temp)
    df_trainready = collect_trainready(df_trainready_batched_temp)
    df_trainready = index_df_trainready(df_trainready)
    write_to_table(df_trainready, trainready_table, mode='overwrite')
    timer_end = timeit.default_timer()
    print('Total batching seconds: ' + str(timer_end - timer_start))
    return df_trainready
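
add_index is used by index_df_trainready but not defined here; a minimal
stand-in consistent with the call sites, assigning each distinct value a
dense integer (the real implementation may differ, e.g. a StringIndexer):

from pyspark.sql.window import Window

def add_index(df, column, index_column, drop_column=False):
    # map every distinct value of `column` to a 0-based index and join
    # the mapping back onto the DataFrame.
    mapping = (df.select(column).distinct().withColumn(
        index_column, fn.row_number().over(Window.orderBy(column)) - 1))
    df = df.join(mapping, on=column, how='left')
    if drop_column:
        df = df.drop(column)
    return df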
Example 4
def join_logs(hive_context, batch_config, interval_time_in_seconds,
              log_table_names):
    def union_logs(df_clicklog, df_showlog):
        # union click log and show log.
        columns = [
            'did', 'is_click', 'action_time', 'keyword', 'keyword_index',
            'media', 'media_category', 'net_type', 'gender', 'age', 'adv_id'
        ]

        df_clicklog = df_clicklog.withColumn('is_click', lit(1))
        df_clicklog = df_clicklog.select(columns)

        df_showlog = df_showlog.withColumn('is_click', lit(0))
        df_showlog = df_showlog.select(columns)

        df_unionlog = df_showlog.union(df_clicklog)
        return df_unionlog

    def transform_action_time(df_logs, interval_time_in_seconds):
        # parse the fractional-second timestamp string to epoch seconds
        # (naive datetimes are interpreted in local time).
        _udf_time = udf(
            lambda x: int(
                datetime.strptime(x, '%Y-%m-%d %H:%M:%S.%f').timestamp()),
            IntegerType())
        df_logs = df_logs.withColumn('action_time_seconds',
                                     _udf_time(col('action_time')))

        _udf_interval_time = udf(lambda x: x - x % interval_time_in_seconds,
                                 IntegerType())
        df_logs = df_logs.withColumn(
            'interval_starting_time',
            _udf_interval_time(col('action_time_seconds')))

        return df_logs
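
    # Example: with interval_time_in_seconds = 3600, an action_time of
    # '2020-01-01 10:45:30.000' gets an interval_starting_time equal to
    # the epoch seconds of 2020-01-01 10:00:00 (x - x % 3600).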

    timer_start = timeit.default_timer()
    start_date, end_date, load_minutes = batch_config
    starting_time = datetime.strptime(start_date, "%Y-%m-%d")
    ending_time = datetime.strptime(end_date, "%Y-%m-%d")
    showlog_table_name, clicklog_table_name, logs_table_name = log_table_names

    batched_round = 1
    while starting_time < ending_time:
        batched_time_start_str = starting_time.strftime("%Y-%m-%d %H:%M:%S")
        batched_time_end = starting_time + timedelta(minutes=load_minutes)
        batched_time_end_str = batched_time_end.strftime("%Y-%m-%d %H:%M:%S")
        print_batching_info("Main logs", batched_round, batched_time_start_str,
                            batched_time_end_str)
        command = """select did, action_time, keyword, keyword_index, 
                     media, media_category, net_type, gender, 
                     age, adv_id from {} where action_time >= '{}' 
                     and action_time < '{}'"""
        df_clicklog_batched = hive_context.sql(
            command.format(clicklog_table_name, batched_time_start_str,
                           batched_time_end_str))
        df_showlog_batched = hive_context.sql(
            command.format(showlog_table_name, batched_time_start_str,
                           batched_time_end_str))
        df_logs_batched = union_logs(df_clicklog_batched, df_showlog_batched)
        df_logs_batched = transform_action_time(df_logs_batched,
                                                interval_time_in_seconds)
        df_logs_batched = df_logs_batched.withColumn(
            'uckey',
            concat_ws(",", col('media'), col('media_category'),
                      col('net_type'), col('gender'), col('age')))
        mode = 'overwrite' if batched_round == 1 else 'append'
        write_to_table(df_logs_batched, logs_table_name, mode=mode)
        batched_round += 1
        starting_time = batched_time_end

    timer_end = timeit.default_timer()
    print('Total batching seconds: ' + str(timer_end - timer_start))
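
The four examples form one pipeline: clean_logs writes cleaned show/click
tables, join_logs unions them into a single logs table, add_region_to_logs
rewrites that table with region ids, and generate_trainready aggregates it
into the final training table. A hypothetical driver, with every table
name and config value a placeholder (cfg, df_persona and df_keywords as
sketched under Example 1):

sc = SparkContext.getOrCreate()
hive_context = HiveContext(sc)

batch_config = ('2020-01-01', '2020-01-31', 60)  # start date, end date, minutes per batch
interval_time_in_seconds = 3600                  # aggregate on one-hour intervals

clean_logs(cfg, df_persona, df_keywords,
           ('ads_showlog', 'showlog_cleaned', 'ads_clicklog', 'clicklog_cleaned'))
join_logs(hive_context, batch_config, interval_time_in_seconds,
          ('showlog_cleaned', 'clicklog_cleaned', 'ads_logs'))
add_region_to_logs(hive_context, batch_config, 'ads_logs')
df_trainready = generate_trainready(hive_context, batch_config,
                                    interval_time_in_seconds, 'ads_logs',
                                    'ads_trainready')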