Example #1
import timeit
from datetime import datetime, timedelta

from pyspark import SparkContext
from pyspark.sql import HiveContext

# load_batch_config, print_batching_info, clean_batched_log and
# write_to_table are project helpers assumed to be defined elsewhere.

def clean_logs(cfg, df_persona, df_keywords, log_table_names):
    sc = SparkContext.getOrCreate()
    sc.setLogLevel(cfg['log']['level'])
    hive_context = HiveContext(sc)
    cfg_clean = cfg['pipeline']['main_clean']
    conditions = cfg_clean['conditions']
    start_date, end_date, load_minutes = load_batch_config(cfg)

    timer_start = timeit.default_timer()
    showlog_table, showlog_output_table, clicklog_table, clicklog_output_table = log_table_names
    starting_time = datetime.strptime(start_date, "%Y-%m-%d")
    ending_time = datetime.strptime(end_date, "%Y-%m-%d")

    # Walk [start_date, end_date) in load_minutes-sized windows.
    batched_round = 1
    while starting_time < ending_time:
        time_start = starting_time.strftime("%Y-%m-%d %H:%M:%S")
        batch_time_end = starting_time + timedelta(minutes=load_minutes)
        batch_time_end = min(batch_time_end, ending_time)
        time_end = batch_time_end.strftime("%Y-%m-%d %H:%M:%S")
        print_batching_info("Main clean", batched_round, time_start, time_end)

        # One query template serves both log tables; {time} is substituted
        # with the table's timestamp column (click_time or show_time).
        command = """select did, adv_id, adv_type as media, slot_id, 
                    spread_app_id, device_name, net_type, 
                    adv_bill_mode_cd as price_model, {time} as action_time 
                    from {table} where {time} >= '{time_start}' and {time} < '{time_end}'"""

        df_clicklog_batched = hive_context.sql(
            command.format(time='click_time',
                           table=clicklog_table,
                           time_start=time_start,
                           time_end=time_end))

        df_showlog_batched = hive_context.sql(
            command.format(time='show_time',
                           table=showlog_table,
                           time_start=time_start,
                           time_end=time_end))

        # The first batch overwrites any existing output; later batches append.
        mode = 'overwrite' if batched_round == 1 else 'append'
        # Skip the Hive write when a batch has no rows.
        is_empty_showlog_batched = df_showlog_batched.rdd.isEmpty()
        if not is_empty_showlog_batched:
            df_showlog_batched = clean_batched_log(df_showlog_batched,
                                                   df_persona, conditions,
                                                   df_keywords)
            write_to_table(df_showlog_batched, showlog_output_table, mode=mode)
        is_empty_clicklog_batched = df_clicklog_batched.rdd.isEmpty()
        if not is_empty_clicklog_batched:
            df_clicklog_batched = clean_batched_log(df_clicklog_batched,
                                                    df_persona, conditions,
                                                    df_keywords)
            write_to_table(df_clicklog_batched,
                           clicklog_output_table,
                           mode=mode)

        batched_round += 1
        starting_time = batch_time_end

    timer_end = timeit.default_timer()
    print('Total batching seconds: ' + str(timer_end - timer_start))
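Example #1 interleaves the window arithmetic with the Hive reads and writes. The same windowing can be factored into a stand-alone generator; the sketch below is extracted from the loop above as an illustration, not the project's actual helper:

from datetime import datetime, timedelta

def iter_batch_windows(start_date, end_date, load_minutes):
    # Yield (time_start, time_end) string pairs covering
    # [start_date, end_date) in load_minutes-sized steps,
    # mirroring the loop in clean_logs above.
    fmt = "%Y-%m-%d %H:%M:%S"
    current = datetime.strptime(start_date, "%Y-%m-%d")
    ending = datetime.strptime(end_date, "%Y-%m-%d")
    while current < ending:
        window_end = min(current + timedelta(minutes=load_minutes), ending)
        yield current.strftime(fmt), window_end.strftime(fmt)
        current = window_end

Each yielded pair can be substituted into the query template exactly as time_start and time_end are above.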
Example #2
def run(hive_context, cfg):
    cfg_logs = cfg['pipeline']['main_logs']
    logs_table_name = cfg_logs['logs_output_table_name']
    interval_time_in_seconds = cfg_logs['interval_time_in_seconds']

    cfg_train = cfg['pipeline']['main_trainready']
    trainready_table = cfg_train['trainready_output_table']

    batch_config = load_batch_config(cfg)

    generate_trainready(hive_context, batch_config, interval_time_in_seconds,
                        logs_table_name, trainready_table)
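Entry points shaped like run(hive_context, cfg) are typically called from a small driver. A minimal sketch, assuming the config lives in a YAML file (the path and format are assumptions, not taken from the examples):

import yaml
from pyspark import SparkContext
from pyspark.sql import HiveContext

if __name__ == '__main__':
    # Load the pipeline config and hand a HiveContext to the entry point.
    with open('config.yml') as f:
        cfg = yaml.safe_load(f)
    sc = SparkContext.getOrCreate()
    sc.setLogLevel(cfg['log']['level'])
    run(HiveContext(sc), cfg)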
Example #3
def run(hive_context, cfg):
    # prepare parameters for processing batched logs.
    cfg_clean = cfg['pipeline']['main_clean']
    cfg_clean_output = cfg_clean['data_output']

    batch_config = load_batch_config(cfg)

    clicklog_table_name = cfg_clean_output['clicklog_output_table']
    showlog_table_name = cfg_clean_output['showlog_output_table']

    cfg_logs = cfg['pipeline']['main_logs']
    logs_table_name = cfg_logs['logs_output_table_name']
    interval_time_in_seconds = cfg_logs['interval_time_in_seconds']

    log_table_names = (showlog_table_name, clicklog_table_name,
                       logs_table_name)

    join_logs(hive_context, batch_config, interval_time_in_seconds,
              log_table_names)
Example #4
def run(hive_context, cfg):
    batch_config = load_batch_config(cfg)
    cfg_logs = cfg['pipeline']['main_logs']
    logs_table = cfg_logs['logs_output_table_name']
    # add region ids to logs.
    add_region_to_logs(hive_context, batch_config, logs_table)
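Taken together, the examples read a consistent set of config keys. The sketch below reconstructs a plausible cfg layout from those keys, plus a minimal load_batch_config matching how Example #1 unpacks its result; every concrete value, and the 'batch' section itself, is an illustrative assumption:

# Key paths are taken from the examples above; values are placeholders.
cfg = {
    'log': {'level': 'WARN'},
    'pipeline': {
        'batch': {                      # assumed home of the batching window
            'start_date': '2020-01-01',
            'end_date': '2020-02-01',
            'load_minutes': 1440,
        },
        'main_clean': {
            'conditions': {},           # filters applied by clean_batched_log
            'data_output': {
                'showlog_output_table': 'cleaned_showlog',
                'clicklog_output_table': 'cleaned_clicklog',
            },
        },
        'main_logs': {
            'logs_output_table_name': 'unified_logs',
            'interval_time_in_seconds': 600,
        },
        'main_trainready': {
            'trainready_output_table': 'trainready',
        },
    },
}

def load_batch_config(cfg):
    # Returns the 3-tuple Example #1 unpacks:
    # (start_date, end_date, load_minutes).
    batch = cfg['pipeline']['batch']
    return batch['start_date'], batch['end_date'], batch['load_minutes']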