Code example #1
File: main_clean.py Project: xu-weiyuan/blue-marlin
    command = """select did, gender_new_dev as gender, 
                 forecast_age_dev as age from {}""".format(persona_table)
    df_persona = hive_context.sql(command)

    df_persona = clean_persona(df_persona)

    # Use keywords to filter out clicklog and showlog entries that have no keyword association.
    # Create the ad keywords table if it does not exist; otherwise load the existing keywords.
    if create_keywords:
        df_keywords = generate_add_keywords(keywords_table)
    else:
        df_keywords = load_df(hive_context, keywords_table)
    # Example keyword row: [Row(keyword=u'education', keyword_index=1, spread_app_id=u'C100203741')]

    log_table_names = (showlog_table, showlog_new_table, clicklog_table,
                       clicklog_new_table)

    clean_logs(cfg, df_persona, df_keywords, log_table_names)

    write_to_table(df_persona, persona_new_table, mode='overwrite')


if __name__ == "__main__":
    """
    main_clean is a process to generate cleaned persona, clicklog and showlog.
    """
    sc, hive_context, cfg = load_config(
        description="clean data of persona, clicklog and showlog.")
    run(hive_context=hive_context, cfg=cfg)
    sc.stop()
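
The helpers used above (load_df, write_to_table) are not shown in this excerpt. A minimal sketch of what they might look like, assuming they are thin wrappers over the HiveContext and DataFrameWriter APIs (assumed signatures, not the project's actual implementation):

def load_df(hive_context, table_name):
    # Load an entire Hive table into a Spark DataFrame.
    return hive_context.sql('select * from {}'.format(table_name))


def write_to_table(df, table_name, mode='overwrite'):
    # Persist the DataFrame as a Hive table; mode='overwrite' replaces
    # any existing table with the same name.
    df.write.mode(mode).saveAsTable(table_name)
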
Code example #2
    tfrecords_statistics = {}
    tfrecords_statistics['distinct_records_count'] = df.count()
    save_pickle_file(tfrecords_statistics, tf_statis_path)


def save_tfrecords(hive_context, trainready_table, tfrecords_hdfs_path,
                   tf_statis_path):
    command = """select uckey_index, media_index, media_category_index, 
                 net_type_index, gender_index, age_index, region_id_index, 
                 interval_starting_time, keyword_indexes as keywords, 
                 keyword_indexes_click_counts as click_counts, 
                 keyword_indexes_show_counts as show_counts from {}""".format(
        trainready_table)
    df = hive_context.sql(command)
    generate_tf_statistics(df, tf_statis_path)
    df.write.format("tfrecords").option(
        "recordType", "Example").mode('overwrite').save(tfrecords_hdfs_path)


if __name__ == "__main__":

    sc, hive_context, cfg = load_config(description="generate tf records")
    cfgp = cfg['pipeline']
    trainready_table = cfgp['main_trainready']['trainready_output_table']
    tfrecords_hdfs_path = cfgp['tfrecords']['tfrecords_hdfs_path']
    tf_statis_path = cfgp['tfrecords']['tfrecords_statistics_path']
    # Save the selected columns of the train-ready table as TFRecords.
    save_tfrecords(hive_context, trainready_table, tfrecords_hdfs_path,
                   tf_statis_path)
    sc.stop()
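
The "tfrecords" format used above is provided by the spark-tensorflow-connector package, which must be on the Spark classpath; it is not built into Spark itself. Assuming that connector, the written records can be read back for a quick sanity check in the same session (before sc.stop()):

df_check = hive_context.read.format("tfrecords").option(
    "recordType", "Example").load(tfrecords_hdfs_path)
df_check.show(5)
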
Code example #3
        batched += 1
        starting_time = batched_time_end

    # Save all the batched logs, now carrying region ids, in the temp table,
    # then drop the old logs table and rename the temp table to the logs table name.
    drop_table(hive_context, logs_table)
    command = """alter table {} rename to {}""".format(logs_table_temp_name,
                                                       logs_table)
    hive_context.sql(command)

    timer_end = timeit.default_timer()
    print('Total batching seconds: ' + str(timer_end - timer_start))


def run(hive_context, cfg):
    batch_config = load_batch_config(cfg)
    cfg_logs = cfg['pipeline']['main_logs']
    logs_table = cfg_logs['logs_output_table_name']
    # add region ids to logs.
    add_region_to_logs(hive_context, batch_config, logs_table)


if __name__ == "__main__":
    """
    This is an optional step only for the logs data without regions.
    If original logs have the geo info or region(ipl or  r), ignore this.    
    """
    sc, hive_context, cfg = load_config(description="main logs with regions")
    run(hive_context=hive_context, cfg=cfg)
    sc.stop()
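
drop_table is not shown in this excerpt. A minimal sketch, assuming it simply wraps a DROP TABLE IF EXISTS statement so the ALTER TABLE ... RENAME above cannot hit a name collision:

def drop_table(hive_context, table_name):
    # Drop the table if it exists so the subsequent rename cannot collide.
    hive_context.sql('drop table if exists {}'.format(table_name))
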
Code example #4
    write_to_table(df_trainready, trainready_table, mode='overwrite')
    timer_end = timeit.default_timer()
    print('Total batching seconds: ' + str(timer_end - timer_start))
    return df_trainready


def run(hive_context, cfg):
    cfg_logs = cfg['pipeline']['main_logs']
    logs_table_name = cfg_logs['logs_output_table_name']
    interval_time_in_seconds = cfg_logs['interval_time_in_seconds']

    cfg_train = cfg['pipeline']['main_trainready']
    trainready_table = cfg_train['trainready_output_table']

    batch_config = load_batch_config(cfg)

    generate_trainready(hive_context, batch_config, interval_time_in_seconds,
                        logs_table_name, trainready_table)


if __name__ == "__main__":
    """
    This program performs the followings:
    adds normalized data by adding index of features
    groups data into time_intervals and ucdocs (labeled by uckey)
    """
    sc, hive_context, cfg = load_config(
        description="pre-processing train ready data")
    run(hive_context=hive_context, cfg=cfg)
    sc.stop()
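
The time-interval grouping mentioned in the docstring amounts to fixed-width bucketing: interval_starting_time is the left edge of the bucket an event falls into. A minimal sketch of that bucketing, using a hypothetical timestamp column name rather than the project's actual schema:

from pyspark.sql import functions as fn

def add_interval_starting_time(df, interval_time_in_seconds):
    # Map each event to the left edge (in epoch seconds) of its interval.
    # 'action_time_seconds' is a hypothetical epoch-seconds column.
    return df.withColumn(
        'interval_starting_time',
        (fn.col('action_time_seconds') / interval_time_in_seconds)
        .cast('int') * interval_time_in_seconds)
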
Code example #5
    # prepare parameters for processing batched logs.
    cfg_clean = cfg['pipeline']['main_clean']
    cfg_clean_output = cfg_clean['data_output']

    batch_config = load_batch_config(cfg)

    clicklog_table_name = cfg_clean_output['clicklog_output_table']
    showlog_table_name = cfg_clean_output['showlog_output_table']

    cfg_logs = cfg['pipeline']['main_logs']
    logs_table_name = cfg_logs['logs_output_table_name']
    interval_time_in_seconds = cfg_logs['interval_time_in_seconds']

    log_table_names = (showlog_table_name, clicklog_table_name,
                       logs_table_name)

    join_logs(hive_context, batch_config, interval_time_in_seconds,
              log_table_names)


if __name__ == "__main__":
    """
    This program performs the followings:
    unions show and click logs
    adds time_interval related data for batch processing
    adds uckey
    """
    sc, hive_context, cfg = load_config(description="pre-processing logs data")
    run(hive_context=hive_context, cfg=cfg)
    sc.stop()
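
A rough sketch of the union step named in the docstring, assuming the show and click logs share one schema after cleaning; the is_click flag and the uckey construction are illustrative, not the project's exact code:

from pyspark.sql import functions as fn

def union_logs(df_show, df_click):
    # Tag each row's origin, then union the two logs into a single
    # DataFrame with identical schemas.
    df_show = df_show.withColumn('is_click', fn.lit(0))
    df_click = df_click.withColumn('is_click', fn.lit(1))
    return df_show.union(df_click)

# The uckey could then be built by joining feature columns into one string:
# df = df.withColumn('uckey', fn.concat_ws(',', 'media', 'gender', 'age'))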