Code example #1
File: main_clean.py  Project: xu-weiyuan/blue-marlin
def clean_logs(cfg, df_persona, df_keywords, log_table_names):
    sc = SparkContext.getOrCreate()
    sc.setLogLevel(cfg['log']['level'])
    hive_context = HiveContext(sc)
    cfg_clean = cfg['pipeline']['main_clean']
    conditions = cfg_clean['conditions']
    start_date, end_date, load_minutes = load_batch_config(cfg)

    timer_start = timeit.default_timer()
    showlog_table, showlog_output_table, clicklog_table, clicklog_output_table = log_table_names
    starting_time = datetime.strptime(start_date, "%Y-%m-%d")
    ending_time = datetime.strptime(end_date, "%Y-%m-%d")

    batched_round = 1
    while starting_time < ending_time:
        time_start = starting_time.strftime("%Y-%m-%d %H:%M:%S")
        batch_time_end = starting_time + timedelta(minutes=load_minutes)
        batch_time_end = min(batch_time_end, ending_time)
        time_end = batch_time_end.strftime("%Y-%m-%d %H:%M:%S")
        print_batching_info("Main clean", batched_round, time_start, time_end)

        command = """select did, adv_id, adv_type as media, slot_id, 
                    spread_app_id, device_name, net_type, 
                    adv_bill_mode_cd as price_model, {time} as action_time 
                    from {table} where {time} >= '{time_start}' and {time} < '{time_end}'"""

        df_clicklog_batched = hive_context.sql(
            command.format(time='click_time',
                           table=clicklog_table,
                           time_start=time_start,
                           time_end=time_end))

        df_showlog_batched = hive_context.sql(
            command.format(time='show_time',
                           table=showlog_table,
                           time_start=time_start,
                           time_end=time_end))

        mode = 'overwrite' if batched_round == 1 else 'append'
        is_empty_showlog_batched = df_showlog_batched.rdd.isEmpty()
        if not is_empty_showlog_batched:
            df_showlog_batched = clean_batched_log(df_showlog_batched,
                                                   df_persona, conditions,
                                                   df_keywords)
            write_to_table(df_showlog_batched, showlog_output_table, mode=mode)
        is_empty_clicklog_batched = df_clicklog_batched.rdd.isEmpty()
        if not is_empty_clicklog_batched:
            df_clicklog_batched = clean_batched_log(df_clicklog_batched,
                                                    df_persona, conditions,
                                                    df_keywords)
            write_to_table(df_clicklog_batched,
                           clicklog_output_table,
                           mode=mode)

        batched_round += 1
        starting_time = batch_time_end

    timer_end = timeit.default_timer()
    print('Total batching seconds: ' + str(timer_end - timer_start))
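
The batching in clean_logs relies on a few project helpers that are outside this excerpt. The sketch below shows what load_batch_config and print_batching_info might look like, inferred from the call sites above and from the test config in code example #5 (conditions['starting_date'], conditions['ending_date'], data_input['load_logs_in_minutes']); the actual blue-marlin helpers may differ.

# Illustrative sketches only -- not the project's actual helpers.
def load_batch_config(cfg):
    # Assumed to read the batching window from the pipeline config
    # (keys taken from the test config in code example #5).
    cfg_clean = cfg['pipeline']['main_clean']
    start_date = cfg_clean['conditions']['starting_date']
    end_date = cfg_clean['conditions']['ending_date']
    load_minutes = cfg_clean['data_input']['load_logs_in_minutes']
    return start_date, end_date, load_minutes

def print_batching_info(name, batched_round, time_start, time_end):
    # One progress line per batch.
    print('{} batch {}: {} -> {}'.format(name, batched_round, time_start, time_end))
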
Code example #2
File: main_clean.py  Project: xu-weiyuan/blue-marlin
def run(hive_context, cfg):
    """
    # This cleans persona, clicklog and showlog tables,
    # by having persona table with distinct (did,gender,age) and
    # by removing unassociated slot-id and did in the log tables.
    """
    cfg_clean = cfg['pipeline']['main_clean']
    cfg_input = cfg_clean['data_input']
    cfg_output = cfg_clean['data_output']

    persona_table = cfg_input['persona_table_name']
    clicklog_table = cfg_input['clicklog_table_name']
    showlog_table = cfg_input['showlog_table_name']
    keywords_table = cfg_input['keywords_table']
    create_keywords = cfg_input['create_keywords']

    persona_new_table = cfg_output['persona_output_table']
    clicklog_new_table = cfg_output['clicklog_output_table']
    showlog_new_table = cfg_output['showlog_output_table']

    command = """select did, gender_new_dev as gender, 
                 forecast_age_dev as age from {}""".format(persona_table)
    df_persona = hive_context.sql(command)

    df_persona = clean_persona(df_persona)

    # Use keywords to drop clicklog and showlog rows that have no keyword association.
    # Create the ad keywords table if it does not exist; otherwise load the existing keywords.
    if create_keywords:
        df_keywords = generate_add_keywords(keywords_table)
    else:
        df_keywords = load_df(hive_context, keywords_table)
    # Example row: Row(keyword=u'education', keyword_index=1, spread_app_id=u'C100203741')

    log_table_names = (showlog_table, showlog_new_table, clicklog_table,
                       clicklog_new_table)

    clean_logs(cfg, df_persona, df_keywords, log_table_names)

    write_to_table(df_persona, persona_new_table, mode='overwrite')
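
run() reads only the config keys shown above. As a point of reference, a minimal, hypothetical config dict covering just those keys could look like the following (the table names are placeholders, not the project's real values). Since run() also passes the full cfg on to clean_logs, a working config additionally needs the 'log' and batching keys seen in code examples #1 and #5.

# Hypothetical config dict -- only the keys referenced by run() above.
cfg_example = {
    'pipeline': {
        'main_clean': {
            'data_input': {
                'persona_table_name': 'persona_table',
                'clicklog_table_name': 'clicklog_table',
                'showlog_table_name': 'showlog_table',
                'keywords_table': 'keywords_table',
                'create_keywords': False,
            },
            'data_output': {
                'persona_output_table': 'persona_cleaned',
                'clicklog_output_table': 'clicklog_cleaned',
                'showlog_output_table': 'showlog_cleaned',
            },
        },
    },
}
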
Code example #3
def add_region_to_logs(hive_context, batch_config, logs_table):
    start_date, end_date, load_minutes = batch_config
    timer_start = timeit.default_timer()
    batched = 1
    starting_time = datetime.strptime(start_date, "%Y-%m-%d")
    ending_time = datetime.strptime(end_date, "%Y-%m-%d")
    logs_table_temp_name = logs_table + '_temp'
    while starting_time < ending_time:
        # Process one batch of the logs table: recompute region_id and rebuild uckey.
        time_start_str = starting_time.strftime("%Y-%m-%d %H:%M:%S")
        batched_time_end = starting_time + timedelta(minutes=load_minutes)
        time_end_str = batched_time_end.strftime("%Y-%m-%d %H:%M:%S")
        print_batching_info("Main regions", batched, time_start_str,
                            time_end_str)
        command = """select * from {} where action_time >= '{}' and action_time < '{}'"""
        logs = hive_context.sql(
            command.format(logs_table, time_start_str, time_end_str))
        logs = logs.drop(col('region_id'))
        logs = fit_distribution(logs)
        logs = logs.withColumnRenamed('index', 'region_id')
        logs = logs.withColumn(
            'uckey',
            concat_ws(",", col('media'), col('media_category'),
                      col('net_type'), col('gender'), col('age'),
                      col('region_id')))

        mode = 'overwrite' if batched == 1 else 'append'
        write_to_table(logs, logs_table_temp_name, mode=mode)
        batched += 1
        starting_time = batched_time_end

    # The temp table now holds all batched logs with region_id filled in.
    # Drop the original logs table and rename the temp table to the logs table name.
    drop_table(hive_context, logs_table)
    command = """alter table {} rename to {}""".format(logs_table_temp_name,
                                                       logs_table)
    hive_context.sql(command)

    timer_end = timeit.default_timer()
    print('Total batching seconds: ' + str(timer_end - timer_start))
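
write_to_table and drop_table are project utilities that do not appear in these excerpts. Judging only from how they are called, they could be as small as the following sketch; this is an assumption based on standard Spark SQL APIs, not the blue-marlin implementation.

# Minimal sketches, assuming standard Spark SQL APIs.
def write_to_table(df, table_name, mode='overwrite'):
    # 'overwrite' replaces the table; 'append' adds batched rows to it.
    df.write.mode(mode).saveAsTable(table_name)

def drop_table(hive_context, table_name):
    hive_context.sql('DROP TABLE IF EXISTS {}'.format(table_name))
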
Code example #4
def generate_trainready(hive_context, batch_config, interval_time_in_seconds,
                        logs_table_name, trainready_table):
    def index_df_trainready(df):
        # Map each categorical column to a numeric *_index column (e.g. uckey -> uckey_index).
        df = add_index(df, "uckey", "uckey_index", drop_column=False)
        df = add_index(df, "media", "media_index", drop_column=False)
        df = add_index(df,
                       "media_category",
                       "media_category_index",
                       drop_column=False)
        df = add_index(df, "net_type", "net_type_index", drop_column=False)
        df = add_index(df, "gender", "gender_index", drop_column=False)
        df = add_index(df, "age", "age_index", drop_column=False)
        df = add_index(df, "region_id", "region_id_index", drop_column=False)
        return df

    def group_batched_logs(logs):
        # Group the batched logs in two passes.
        # group 1: group by uckey + interval_starting_time + keyword_index
        df = logs.groupBy(
            'uckey', 'interval_starting_time', 'keyword_index').agg(
                first('keyword').alias('keyword'),
                fn.sum(col('is_click')).alias('kw_clicks_count'),
                # Count only the show rows (is_click == 0); count() over
                # when(...).otherwise(0) would count every row in the group.
                fn.sum(fn.when(col('is_click') == 0,
                               1).otherwise(0)).alias('kw_shows_count'))
        df = df.withColumn(
            'kwi_clicks_count',
            concat_ws(":", col('keyword_index'), col('kw_clicks_count')))
        df = df.withColumn(
            'kwi_shows_count',
            concat_ws(":", col('keyword_index'), col('kw_shows_count')))
        df = df.withColumn(
            'kw_clicks_count',
            concat_ws(":", col('keyword'), col('kw_clicks_count')))
        df = df.withColumn(
            'kw_shows_count',
            concat_ws(":", col('keyword'), col('kw_shows_count')))

        # group 2: group by uckey + interval_starting_time
        df = df.groupBy('uckey', 'interval_starting_time').agg(
            concat_ws(",", collect_list('keyword_index')).alias('kwi'),
            concat_ws(
                ",",
                collect_list('kwi_clicks_count')).alias('kwi_click_counts'),
            concat_ws(
                ",", collect_list('kwi_shows_count')).alias('kwi_show_counts'),
            concat_ws(",", collect_list('keyword')).alias('interval_keywords'),
            concat_ws(
                ",", collect_list('kw_clicks_count')).alias('kw_click_counts'),
            concat_ws(",",
                      collect_list('kw_shows_count')).alias('kw_show_counts'))
        return df

    def collect_trainready(df_trainready_batched_temp):
        # group 3: group the temporary batched uckey-interval rows by uckey.

        df = df_trainready_batched_temp

        # Performance note: this global sort could be removed and the ordering done per uckey instead.
        df = df.orderBy([col('uckey'), col('interval_starting_time').desc()])

        df = df.groupBy('uckey').agg(
            collect_list('interval_starting_time').alias(
                'interval_starting_time'),
            collect_list('kwi').alias('keyword_indexes'),
            collect_list('kwi_click_counts').alias(
                'keyword_indexes_click_counts'),
            collect_list('kwi_show_counts').alias(
                'keyword_indexes_show_counts'),
            collect_list('interval_keywords').alias('keywords'),
            collect_list('kw_click_counts').alias('keywords_click_counts'),
            collect_list('kw_show_counts').alias('keywords_show_counts'))

        uckey_split_col = fn.split(df['uckey'], ',')

        df = df.withColumn('media', uckey_split_col.getItem(0))
        df = df.withColumn('media_category', uckey_split_col.getItem(1))
        df = df.withColumn('net_type', uckey_split_col.getItem(2))
        df = df.withColumn('gender', uckey_split_col.getItem(3))
        df = df.withColumn('age', uckey_split_col.getItem(4))
        df = df.withColumn('region_id', uckey_split_col.getItem(5))

        df = df.withColumn('gender', df['gender'].cast(IntegerType()))
        df = df.withColumn('age', df['age'].cast(IntegerType()))
        df = df.withColumn('region_id', df['region_id'].cast(IntegerType()))
        return df

    trainready_table_temp = trainready_table + '_temp'
    timer_start = timeit.default_timer()
    start_date, end_date, load_minutes = batch_config

    starting_time_sec = int(
        datetime.strptime(start_date, "%Y-%m-%d").strftime("%s"))
    ending_time_sec = int(
        datetime.strptime(end_date, "%Y-%m-%d").strftime("%s"))

    batched_round = 1
    while starting_time_sec < ending_time_sec:
        batched_time_end_sec = starting_time_sec + \
            timedelta(minutes=load_minutes).total_seconds()

        command = """select distinct interval_starting_time from {} 
                     where action_time_seconds between {} and {}"""
        intervals = hive_context.sql(
            command.format(logs_table_name, starting_time_sec,
                           batched_time_end_sec)).collect()
        intervals = [_['interval_starting_time'] for _ in intervals]
        intervals.sort()
        command = """select * from {} where interval_starting_time between {} and {}"""
        start_time = intervals[0]
        end_time = intervals[-1]
        logs = hive_context.sql(
            command.format(logs_table_name, start_time, end_time))
        print_batching_info("Train ready", batched_round, str(start_time),
                            str(end_time))

        df_trainready = group_batched_logs(logs)
        mode = 'overwrite' if batched_round == 1 else 'append'
        write_to_table(df_trainready, trainready_table_temp, mode=mode)
        batched_round += 1

        # Advance starting_time_sec to one second past the last action in the final processed interval.
        starting_time_sec = logs.filter('interval_starting_time == {}'.format(
            intervals[-1])).agg(fn.max('action_time_seconds')).take(1)[0][0]
        starting_time_sec += 1

    # Load the batched trainready data and merge the rows that share the same uckey.
    df_trainready_batched_temp = load_df(hive_context, trainready_table_temp)
    df_trainready = collect_trainready(df_trainready_batched_temp)
    df_trainready = index_df_trainready(df_trainready)
    write_to_table(df_trainready, trainready_table, mode='overwrite')
    timer_end = timeit.default_timer()
    print('Total batching seconds: ' + str(timer_end - timer_start))
    return df_trainready
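
add_index is another helper that is not part of this excerpt. One plausible way to get the same effect with stock PySpark is StringIndexer; the sketch below only illustrates the pattern and is not the project's implementation.

# Illustrative add_index sketch built on pyspark.ml's StringIndexer.
from pyspark.ml.feature import StringIndexer
from pyspark.sql.functions import col
from pyspark.sql.types import IntegerType

def add_index(df, input_column, index_column, drop_column=False):
    # Map each distinct value of input_column to a numeric index column.
    indexer = StringIndexer(inputCol=input_column, outputCol=index_column)
    df = indexer.fit(df).transform(df)
    # StringIndexer produces doubles; cast to int to match the *_index columns above.
    df = df.withColumn(index_column, col(index_column).cast(IntegerType()))
    if drop_column:
        df = df.drop(input_column)
    return df
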
Code example #5
    def test_clean_logs_1(self):
        """
        Refer to clean-log method docstring

        1. check if the tables are exist and drop them
        2. take the table and two dataframe (keyword and persona)
        3. clean up based on the dids and add persona's feature to the new dataframe
        4. check if the expected data frame is the same as the dataframe which created by clean_logs or not

        """
        self.hive_context.sql("DROP TABLE if exists show_test_0000098989")
        self.hive_context.sql(
            "DROP TABLE if exists show_output_test_0000098989")
        self.hive_context.sql("DROP TABLE if exists click_test_0000098989")
        self.hive_context.sql(
            "DROP TABLE if exists click_output_test_0000098989")

        cfg_test = {
            'log': {
                'level': 'WARN'
            },
            'pipeline': {
                'main_clean': {
                    'conditions': {
                        'new_slot_id_list': ['a47eavw7ex'],
                        'new_slot_id_app_name_list': [
                            'Huawei Reading', 'Honor Reading', 'Video 1.0',
                            'Video 2.0', 'HiSkytone'
                        ],
                        'starting_date': '2019-12-19',
                        'ending_date': '2020-04-15'
                    },
                    'data_input': {
                        'load_logs_in_minutes': 144000
                    }
                }
            }
        }

        df_showlog_test = self.hive_context.createDataFrame(
            [('1001', '103364', 'native', 'a47eavw7ex', 'C10499309',
              'LLD-AL20', 'WIFI', 'CPC', '2020-04-02 08:03:46.113'),
             ('1002', '45022046', 'native', 'a47eavw7ex', 'C10295678',
              'LLD-AL20', 'WIFI', 'CPC', '2020-04-02 08:03:46.413'),
             ('1001', '45022046', 'native', 'a47eavw7ex', 'C10295678',
              'LLD-AL20', 'WIFI', 'CPC', '2020-04-03 08:04:06.213'),
             ('1001', '103364', 'native', 'a47eavw7ey', 'C10499309',
              'LLD-AL20', 'WIFI', 'CPC', '2020-04-02 08:05:00.613'),
             ('1003', '103364', 'native', 'a47eavw7ex', 'C10499309',
              'LLD-AL20', 'WIFI', 'CPC', '2020-04-02 08:05:00.613')], [
                  'did', 'adv_id', 'adv_type', 'slot_id', 'spread_app_id',
                  'device_name', 'net_type', 'adv_bill_mode_cd', 'show_time'
              ])
        util.write_to_table(df_showlog_test,
                            'show_test_0000098989',
                            mode='overwrite')

        df_clicklog_test = self.hive_context.createDataFrame(
            [('1001', '103364', 'native', 'a47eavw7ex', 'C10499309',
              'LLD-AL20', 'WIFI', 'CPC', '2020-04-02 08:03:46.113'),
             ('1002', '45022046', 'native', 'a47eavw7ex', 'C10295678',
              'LLD-AL20', 'WIFI', 'CPC', '2020-04-02 08:03:46.413'),
             ('1001', '45022046', 'native', 'a47eavw7ex', 'C10295678',
              'LLD-AL20', 'WIFI', 'CPC', '2020-04-03 08:04:06.213'),
             ('1001', '103364', 'native', 'a47eavw7ey', 'C10499309',
              'LLD-AL20', 'WIFI', 'CPC', '2020-04-02 08:05:00.613'),
             ('1003', '103364', 'native', 'a47eavw7ex', 'C10499309',
              'LLD-AL20', 'WIFI', 'CPC', '2020-04-02 08:05:00.613')], [
                  'did', 'adv_id', 'adv_type', 'slot_id', 'spread_app_id',
                  'device_name', 'net_type', 'adv_bill_mode_cd', 'click_time'
              ])
        util.write_to_table(df_clicklog_test,
                            'click_test_0000098989',
                            mode='overwrite')

        df_persona = self.hive_context.createDataFrame(
            [('1001', 1, 2), ('1002', 0, 3),
             ('1004', 1, 3)], ['did', 'gender', 'age'])
        df_keywords = self.hive_context.createDataFrame(
            [('C10499309', 'video'),
             ('C10295678', 'info')], ['spread_app_id', 'keyword'])

        df_expected = self.hive_context.createDataFrame([
            ('1001', '103364', 'native', 'a47eavw7ex', 'C10499309', 'LLD-AL20',
             'WIFI', 'CPC', '2020-04-02 08:03:46.113', 1, 2, 'video'),
            ('1002', '45022046', 'native', 'a47eavw7ex', 'C10295678',
             'LLD-AL20', 'WIFI', 'CPC', '2020-04-02 08:03:46.413', 0, 3,
             'info'),
            ('1001', '45022046', 'native', 'a47eavw7ex', 'C10295678',
             'LLD-AL20', 'WIFI', 'CPC', '2020-04-03 08:04:06.213', 1, 2,
             'info'),
        ], [
            'did', 'adv_id', 'media', 'slot_id', 'spread_app_id',
            'device_name', 'net_type', 'price_model', 'action_time', 'gender',
            'age', 'keyword'
        ])

        columns = [
            'did', 'adv_id', 'media', 'slot_id', 'spread_app_id',
            'device_name', 'net_type', 'price_model', 'action_time', 'gender',
            'age', 'keyword'
        ]

        # Order: (showlog_table, showlog_output_table, clicklog_table, clicklog_output_table)
        table_name = ("show_test_0000098989", "show_output_test_0000098989",
                      "click_test_0000098989", "click_output_test_0000098989")
        main_clean.clean_logs(cfg=cfg_test,
                              df_persona=df_persona,
                              df_keywords=df_keywords,
                              log_table_names=table_name)
        command = "select * from show_output_test_0000098989"
        df = self.hive_context.sql(command)

        self.assertTrue(
            self.compare_dfs(df.select(columns), df_expected.select(columns)))
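
The assertion above depends on a compare_dfs helper from the test base class, which is not shown here. An order-insensitive comparison along the following lines would be enough for this test; it is a sketch of the idea, not the actual helper.

# Hypothetical compare_dfs: same columns and the same multiset of rows.
def compare_dfs(self, df1, df2):
    if df1.columns != df2.columns:
        return False
    return sorted(df1.collect()) == sorted(df2.collect())
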
Code example #6
def join_logs(hive_context, batch_config, interval_time_in_seconds,
              log_table_names):
    def union_logs(df_clicklog, df_showlog):
        # union click log and show log.
        columns = [
            'did', 'is_click', 'action_time', 'keyword', 'keyword_index',
            'media', 'media_category', 'net_type', 'gender', 'age', 'adv_id'
        ]

        df_clicklog = df_clicklog.withColumn('is_click', lit(1))
        df_clicklog = df_clicklog.select(columns)

        df_showlog = df_showlog.withColumn('is_click', lit(0))
        df_showlog = df_showlog.select(columns)

        df_unionlog = df_showlog.union(df_clicklog)
        return df_unionlog

    def transform_action_time(df_logs, interval_time_in_seconds):
        _udf_time = udf(
            lambda x: int(
                datetime.strptime(x, '%Y-%m-%d %H:%M:%S.%f').strftime("%s")),
            IntegerType())
        df_logs = df_logs.withColumn('action_time_seconds',
                                     _udf_time(col('action_time')))

        _udf_interval_time = udf(lambda x: x - x % interval_time_in_seconds,
                                 IntegerType())
        df_logs = df_logs.withColumn(
            'interval_starting_time',
            _udf_interval_time(col('action_time_seconds')))

        return df_logs

    timer_start = timeit.default_timer()
    start_date, end_date, load_minutes = batch_config
    starting_time = datetime.strptime(start_date, "%Y-%m-%d")
    ending_time = datetime.strptime(end_date, "%Y-%m-%d")
    showlog_table_name, clicklog_table_name, logs_table_name = log_table_names

    batched_round = 1
    while starting_time < ending_time:
        batched_time_start_str = starting_time.strftime("%Y-%m-%d %H:%M:%S")
        batched_time_end = starting_time + \
            timedelta(minutes=load_minutes)
        batched_time_end_str = batched_time_end.strftime("%Y-%m-%d %H:%M:%S")
        print_batching_info("Main logs", batched_round, batched_time_start_str,
                            batched_time_end_str)
        command = """select did, action_time, keyword, keyword_index, 
                     media, media_category, net_type, gender, 
                     age, adv_id from {} where action_time >= '{}' 
                     and action_time < '{}'"""
        df_clicklog_batched = hive_context.sql(
            command.format(clicklog_table_name, batched_time_start_str,
                           batched_time_end_str))
        df_showlog_batched = hive_context.sql(
            command.format(showlog_table_name, batched_time_start_str,
                           batched_time_end_str))
        df_logs_batched = union_logs(df_clicklog_batched, df_showlog_batched)
        df_logs_batched = transform_action_time(df_logs_batched,
                                                interval_time_in_seconds)
        df_logs_batched = df_logs_batched.withColumn(
            'uckey',
            concat_ws(",", col('media'), col('media_category'),
                      col('net_type'), col('gender'), col('age')))
        mode = 'overwrite' if batched_round == 1 else 'append'
        write_to_table(df_logs_batched, logs_table_name, mode=mode)
        batched_round += 1
        starting_time = batched_time_end

    timer_end = timeit.default_timer()
    print('Total batching seconds: ' + str(timer_end - timer_start))
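
transform_action_time converts action_time to epoch seconds with a UDF and then floors it to the start of its interval (x - x % interval_time_in_seconds). A quick standalone check of that arithmetic, assuming a one-hour interval:

# Standalone illustration of the interval bucketing used above.
interval_time_in_seconds = 3600  # assumed value: one-hour intervals
action_time_seconds = 1585814626  # 2020-04-02 08:03:46 UTC
interval_starting_time = action_time_seconds - action_time_seconds % interval_time_in_seconds
assert interval_starting_time == 1585814400  # 2020-04-02 08:00:00 UTC

As an aside, the '%s' strftime directive used in the UDF is a platform-specific extension that converts in the local time zone; if portability matters, pyspark.sql.functions.unix_timestamp is an alternative worth considering, keeping the fractional seconds in action_time in mind.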