Example #1
def run(cfg, model_name, model_version, serving_url):

    sc = SparkContext()
    hive_context = HiveContext(sc)
    forecaster = Forecaster(cfg)
    sc.setLogLevel('WARN')

    bucket_size = cfg['bucket_size']
    bucket_step = cfg['bucket_step']
    factdata = cfg['factdata']

    model_stats = get_model_stats(cfg, model_name, model_version)
    print(model_stats)

    start_bucket = 0
    while True:

        end_bucket = min(bucket_size, start_bucket + bucket_step)

        if start_bucket > end_bucket:
            break

        # Read factdata table
        command = """
        select count_array,day,hour,uckey from {} where bucket_id between {} and {}
        """.format(factdata, str(start_bucket), str(end_bucket))

        start_bucket = end_bucket + 1

        df = hive_context.sql(command)

        df = df.withColumn('hour_count', expr("map(hour, count_array)"))

        df = df.groupBy('uckey', 'day').agg(
            collect_list('hour_count').alias('hour_counts'))

        df = df.withColumn('day_hour_counts', expr("map(day, hour_counts)"))

        df = df.groupBy('uckey').agg(
            collect_list('day_hour_counts').alias('prediction_input'))

        # Debug: run the predictor on the first row only, then stop;
        # the UDF-based path below is unreachable while this break is in place.
        l = df.take(1)
        row = l[0]
        predict_counts_for_uckey(serving_url, forecaster, model_stats,
                                 cfg)(row['uckey'], row['prediction_input'])
        break

        predictor_udf = udf(
            predict_counts_for_uckey(serving_url, forecaster, model_stats,
                                     cfg), StringType())

        df = df.withColumn('prediction_output',
                           predictor_udf(df.uckey, df.prediction_input))

        l = df.take(1)
        row = l[0]
        print(row['prediction_output'])
        break
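
The examples on this page are shown without their import blocks. A minimal sketch of the imports Example #1 appears to rely on is below; Forecaster, get_model_stats, and predict_counts_for_uckey are project-specific helpers whose module path is not shown on this page, so the commented import is only a placeholder.

# Minimal sketch of the imports the example above appears to assume.
# The pyspark imports are real PySpark APIs; the project helpers
# (Forecaster, get_model_stats, predict_counts_for_uckey) are not shown
# here, so the commented import below is a placeholder.
from pyspark import SparkContext
from pyspark.sql import HiveContext
from pyspark.sql.functions import collect_list, expr, udf
from pyspark.sql.types import StringType

# from <project_package> import Forecaster, get_model_stats, predict_counts_for_uckey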
Example #2
def run(cfg, model_name, model_version):

    sc = SparkContext()
    hive_context = HiveContext(sc)
    forecaster = Forecaster(cfg)
    sc.setLogLevel('WARN')

    bucket_size = cfg['bucket_size']
    bucket_step = cfg['bucket_step']
    factdata = cfg['factdata']

    model_stats = get_model_stats(cfg, model_name, model_version)
    print(model_stats)

    # feeder = None
    # with open("data/vars/feeder_meta.pkl", mode='rb') as file:
    #     feeder = pickle.load(file)
    # print(type(feeder))
    # print(feeder)

    # # inp = VarFeeder.read_vars("data/vars")
    # # print(type(inp))
    # # Writing a JSON file
    # # with open('data.json', 'w') as f:
    # #     json.dump(inp, f)

    # feeder_brc = sc.broadcast(feeder)

    start_bucket = 0
    while True:

        end_bucket = min(bucket_size, start_bucket + bucket_step)

        if start_bucket > end_bucket:
            break

        # Read factdata table
        command = """
        select count_array,day,hour,uckey from {} where day between '2018-01-01' and '2018-01-05' and bucket_id between {} and {}
        """.format(factdata, str(start_bucket), str(end_bucket))

        start_bucket = end_bucket + 1

        df = hive_context.sql(command)

        df = df.withColumn('hour_count', expr("map(hour, count_array)"))

        df = df.groupBy('uckey', 'day').agg(
            collect_list('hour_count').alias('hour_counts'))

        df = df.withColumn('day_hour_counts', expr("map(day, hour_counts)"))

        df = df.groupBy('uckey').agg(collect_list(
            'day_hour_counts').alias('prediction_input'))

        # Debug: run the predictor on the first row only, then stop;
        # the UDF-based path below is unreachable while this break is in place.
        l = df.take(1)
        row = l[0]
        predict_counts_for_uckey(forecaster, model_stats, cfg)(row['uckey'], row['prediction_input'])
        break

        predictor_udf = udf(predict_counts_for_uckey(forecaster, model_stats, cfg), StringType())

        df = df.withColumn('prediction_output', predictor_udf(df.uckey, df.prediction_input))

        l = df.take(1)
        row = l[0]
        print(row['prediction_output'])
        break
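
The chain of map(hour, count_array), collect_list, and map(day, hour_counts) in Examples #1 and #2 builds, per uckey, a list of {day: [{hour: count_array}, ...]} maps that becomes prediction_input. A self-contained sketch of that aggregation on invented toy data, just to show the resulting shape (the SparkSession setup and the rows are assumptions for illustration):

# Self-contained sketch of the prediction_input aggregation on toy data.
# The rows and values are invented purely to show the nested shape.
from pyspark.sql import SparkSession
from pyspark.sql.functions import collect_list, expr

spark = SparkSession.builder.master('local[1]').getOrCreate()
rows = [('uc1', '2018-01-01', 0, ['1:5']),
        ('uc1', '2018-01-01', 1, ['1:7']),
        ('uc1', '2018-01-02', 0, ['1:2'])]
df = spark.createDataFrame(rows, ['uckey', 'day', 'hour', 'count_array'])
df = df.withColumn('hour_count', expr('map(hour, count_array)'))
df = df.groupBy('uckey', 'day').agg(collect_list('hour_count').alias('hour_counts'))
df = df.withColumn('day_hour_counts', expr('map(day, hour_counts)'))
df = df.groupBy('uckey').agg(collect_list('day_hour_counts').alias('prediction_input'))
df.show(truncate=False)
# prediction_input for 'uc1' is a list with one {day: [{hour: count_array}, ...]}
# map per day, matching the structure the predictor UDF receives above.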
Example #3
def run(cfg, yesterday, serving_url):

    # os.environ[
    #     'PYSPARK_SUBMIT_ARGS'] = '--jars /home/reza/eshadoop/elasticsearch-hadoop-6.5.2/dist/elasticsearch-hadoop-6.5.2.jar pyspark-shell'

    es_write_conf = {"es.nodes": cfg['es_host'],
                     "es.port": cfg['es_port'],
                     "es.resource": cfg['es_predictions_index']+'/'+cfg['es_predictions_type'],
                     "es.batch.size.bytes": "1000000",
                     "es.batch.size.entries": "100",
                     "es.input.json": "yes",
                     "es.mapping.id": "uckey",
                     "es.nodes.wan.only": "true",
                     "es.write.operation": "upsert"}

    sc = SparkContext()
    hive_context = HiveContext(sc)
    forecaster = Forecaster(cfg)
    sc.setLogLevel(cfg['log_level'])

    # Reading the max bucket_id
    bucket_size = cfg['bucket_size']
    bucket_step = cfg['bucket_step']
    factdata = cfg['factdata_table']
    distribution_table = cfg['distribution_table']
    norm_table = cfg['norm_table']
    traffic_dist = cfg['traffic_dist']
    model_stat_table = cfg['model_stat_table']

    model_stats = get_model_stats(hive_context, model_stat_table)

    # Read dist
    command = """
        SELECT
        DIST.uckey,
        DIST.ratio,
        DIST.cluster_uckey,
        DIST.price_cat
        FROM {} AS DIST
        """.format(distribution_table)
    df_dist = hive_context.sql(command)

    # Read norm table
    # DataFrame[uckey: string, ts: array<int>, p: float, a__n: float, a_1_n: float, a_2_n: float, a_3_n: float, a_4_n: float, a_5_n: float, a_6_n: float, t_UNKNOWN_n: float, t_3G_n: float, t_4G_n: float, t_WIFI_n: float, t_2G_n: float, g__n: float, g_g_f_n: float, g_g_m_n: float, g_g_x_n: float, price_cat_1_n: float, price_cat_2_n: float, price_cat_3_n: float, si_vec_n: array<float>, r_vec_n: array<float>, p_n: float, ts_n: array<float>]
    command = """
        SELECT
        uckey AS cluster_uckey,
        price_cat,
        a__n,a_1_n,a_2_n,a_3_n,a_4_n,a_5_n,a_6_n,
        t_UNKNOWN_n,t_3G_n,t_4G_n,t_WIFI_n,t_2G_n,
        g__n, g_g_f_n, g_g_m_n, g_g_x_n,
        price_cat_1_n, price_cat_2_n, price_cat_3_n,
        si_vec_n,
        r_vec_n,
        ipl_vec_n
        FROM {}
        """.format(norm_table)
    df_norm = hive_context.sql(command)

    # create day_list from yesterday for train_window
    duration = model_stats['model']['duration']
    day = datetime.strptime(yesterday, '%Y-%m-%d')
    day_list = []
    for _ in range(0, duration):
        day_list.append(datetime.strftime(day, '%Y-%m-%d'))
        day = day + timedelta(days=-1)
    day_list.sort()

    df_prediction_ready = None
    df_uckey_cluster = None
    start_bucket = 0

    while True:

        end_bucket = min(bucket_size, start_bucket + bucket_step)

        if start_bucket > end_bucket:
            break

        # Read factdata table
        command = """
        SELECT
        FACTDATA.count_array,
        FACTDATA.day,
        FACTDATA.hour,
        FACTDATA.uckey
        FROM {} AS FACTDATA
        WHERE FACTDATA.bucket_id BETWEEN {} AND {}
        """.format(factdata, str(start_bucket), str(end_bucket))

        start_bucket = end_bucket + 1

        df = hive_context.sql(command)

        # [Row(count_array=[u'1:504'], day=u'2019-11-02', hour=2, uckey=u'magazinelock,04,WIFI,g_m,1,CPM,78', hour_price_imp_map={2: [u'1:504']})]
        df = df.withColumn('hour_price_imp_map',
                           expr("map(hour, count_array)"))

        # [Row(uckey=u'native,68bcd2720e5011e79bc8fa163e05184e,4G,g_m,2,CPM,19', day=u'2019-11-02', hour_price_imp_map_list=[{15: [u'3:3']}, {7: [u'3:5']}, {10: [u'3:3']}, {9: [u'3:1']}, {16: [u'3:2']}, {22: [u'3:11']}, {23: [u'3:3']}, {18: [u'3:7']}, {0: [u'3:4']}, {1: [u'3:2']}, {19: [u'3:10']}, {8: [u'3:4']}, {21: [u'3:2']}, {6: [u'3:1']}])]
        df = df.groupBy('uckey', 'day').agg(
            collect_list('hour_price_imp_map').alias('hour_price_imp_map_list'))

        # [Row(uckey=u'native,68bcd2720e5011e79bc8fa163e05184e,4G,g_m,2,CPM,19', day=u'2019-11-02', day_price_imp=[u'3:58'])]
        df = df.withColumn('day_price_imp', udf(
            sum_count_array, ArrayType(StringType()))(df.hour_price_imp_map_list)).drop('hour_price_imp_map_list')

        # [Row(uckey=u'native,68bcd2720e5011e79bc8fa163e05184e,4G,g_m,2,CPM,19', day=u'2019-11-02', day_price_imp=[u'3:58'], day_price_imp_map={u'2019-11-02': [u'3:58']})]
        df = df.withColumn('day_price_imp_map', expr(
            "map(day, day_price_imp)"))

        # [Row(uckey=u'native,z041bf6g4s,WIFI,g_f,1,CPM,71', day_price_imp_map_list=[{u'2019-11-02': [u'1:2', u'2:261']}, {u'2019-11-03': [u'2:515']}])])
        df = df.groupBy('uckey').agg(collect_list(
            'day_price_imp_map').alias('day_price_imp_map_list'))

        # [Row(uckey=u'native,z041bf6g4s,WIFI,g_f,1,CPM,71', day_price_imp_map_list=[{u'2019-11-02': [u'1:2', u'2:261']}, {u'2019-11-03': [u'2:515']}], ratio=0.09467455744743347, cluster_uckey=u'892', price_cat=u'1')]
        df = df.join(df_dist, on=['uckey'], how='inner')

        # df_uckey_cluster keeps the ratio and cluster_key for only uckeys that are being processed
        if not df_uckey_cluster:
            df_uckey_cluster = df.select(
                'uckey', 'cluster_uckey', 'ratio', 'price_cat')
            df_uckey_cluster.cache()
        else:
            df_uckey_cluster = df.select(
                'uckey', 'cluster_uckey', 'ratio', 'price_cat').union(df_uckey_cluster)
            df_uckey_cluster.cache()

        # [Row(cluster_uckey=u'2469', price_cat=u'2', cluster_day_price_imp_list=[[{u'2019-11-02': [u'2:90']}, {u'2019-11-03': [u'2:172']}]])])
        df = df.groupBy('cluster_uckey', 'price_cat').agg(
            collect_list('day_price_imp_map_list').alias('cluster_day_price_imp_list'))

        df = df.withColumn('ts', udf(sum_day_count_array,
                                     ArrayType(MapType(StringType(), ArrayType(StringType()))))(df.cluster_day_price_imp_list))

        # [Row(cluster_uckey=u'2469', price_cat=u'2', ts=[{u'2019-11-02': [u'2:90'], u'2019-11-03': [u'2:172']}])]
        df = df.drop('cluster_day_price_imp_list')

        if not df_prediction_ready:
            df_prediction_ready = df
            df_prediction_ready.cache()
        else:
            df = df_prediction_ready.union(df)
            df = df.groupBy('cluster_uckey', 'price_cat').agg(
                collect_list('ts').alias('ts_list'))
            df = df.withColumn('ts', udf(sum_day_count_array,
                                         ArrayType(MapType(StringType(), ArrayType(StringType()))))(df.ts_list))
            df = df.drop('ts_list')

            # [Row(cluster_uckey=u'magazinelock,03,WIFI,g_f,1,CPM,60', ts=[{u'2019-11-02': [u'1:2']}])]
            df_prediction_ready = df
            df_prediction_ready.cache()

    # [Row(cluster_uckey=u'1119', price_cat=u'2', ts=[{u'2019-11-02': [u'1:862', u'3:49', u'2:1154'], u'2019-11-03': [u'1:596', u'3:67', u'2:1024']}])]
    df = df_prediction_ready

    df = df.join(df_norm, on=['cluster_uckey', 'price_cat'], how='inner')

    # [Row(cluster_uckey=u'1119', price_cat=u'2', ts=[{u'2019-11-02': [u'1:862', u'3:49', u'2:1154'], u'2019-11-03': [u'1:596', u'3:67', u'2:1024']}], a__n=-0.005224577616900206, a_1_n=0.6089736819267273, a_2_n=-0.21013110876083374, a_3_n=0.16884993016719818, a_4_n=-0.3416250944137573, a_5_n=0.15184317529201508, a_6_n=-0.16529197990894318, t_UNKNOWN_n=-0.4828081429004669, t_3G_n=1.2522615194320679, t_4G_n=-0.15080969035625458, t_WIFI_n=-0.35078370571136475, t_2G_n=1.991615653038025, g__n=-0.08197031915187836, g_g_f_n=0.010901159606873989, g_g_m_n=-0.21557298302650452, g_g_x_n=1.4449801445007324, price_cat_1_n=-1.2043436765670776, price_cat_2_n=1.885549783706665, price_cat_3_n=-0.48205748200416565, si_vec_n=[-0.20294927060604095, -0.27017056941986084, -0.16821187734603882, -0.20294314622879028, -0.11777336895465851, 0.9738097786903381, 0.23326143622398376, -0.16500996053218842, -0.19148004055023193, -0.15753313899040222, -0.149298757314682, -0.19954630732536316, -0.15968738496303558, 0.12466698884963989, -0.15369804203510284, 0.04789407551288605, -0.22501590847969055, 0.14411255717277527, -0.209896981716156, -0.17969290912151337, 0.06794296950101852, -0.12367484718561172, 0.5581679344177246, 0.8108972311019897, -0.20487570762634277, 2.597964286804199, -0.2720063328742981, 0.1152268648147583, 0.27174681425094604, -0.20653237402439117, -0.2899857461452484, -0.15441325306892395, -0.17766059935092926, -0.11622612923383713, 0.3738412857055664, 1.0858312845230103, 0.6114567518234253], r_vec_n=[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], uckey=u'native,66bcd2720e5011e79bc8fa163e05184e,WIFI,g_m,5,CPC,5', ratio=0.11989551782608032)]
    df = df.join(df_uckey_cluster, on=[
                 'cluster_uckey', 'price_cat'], how='inner')

    predictor_udf = udf(transform.predict_daily_uckey(days=day_list,
                                                      serving_url=serving_url, forecaster=forecaster, model_stats=model_stats, columns=df.columns), MapType(StringType(), FloatType()))

    df = df.withColumn('day_prediction_map',
                       predictor_udf(struct([df[name] for name in df.columns])))

    # [Row(cluster_uckey=u'1119', price_cat=u'2', day_prediction_map={u'2019-11-02': 220.0, u'2019-11-03': 305.0}, ratio=0.11989551782608032, uckey=u'native,66bcd2720e5011e79bc8fa163e05184e,WIFI,g_m,5,CPC,5')]
    df = df.select('cluster_uckey', 'price_cat',
                   'day_prediction_map', 'ratio', 'uckey')

    # [Row(ucdoc_elements=Row(price_cat=u'2', ratio=0.11989551782608032, day_prediction_map={u'2019-11-02': 220.0, u'2019-11-03': 305.0}), uckey=u'native,66bcd2720e5011e79bc8fa163e05184e,WIFI,g_m,5,CPC,5')]
    ucdoc_elements_type = StructType([StructField('price_cat', StringType(), False), StructField(
        'ratio', FloatType(), False), StructField('day_prediction_map', MapType(StringType(), FloatType()), False)])
    df = df.withColumn('ucdoc_elements_pre_price_cat', udf(lambda price_cat, ratio, day_prediction_map:
                                                           (price_cat, ratio, day_prediction_map), ucdoc_elements_type)(df.price_cat, df.ratio, df.day_prediction_map)).select('ucdoc_elements_pre_price_cat', 'uckey')

    # [Row(uckey=u'splash,d971z9825e,WIFI,g_m,1,CPT,74', ucdoc_elements=[Row(price_cat=u'1', ratio=0.5007790923118591, day_prediction_map={u'2019-11-02': 220.0, u'2019-11-03': 305.0})])]
    df = df.groupBy('uckey').agg(collect_list('ucdoc_elements_pre_price_cat').alias('ucdoc_elements'))

    df = df.withColumn('prediction_output', udf(transform.generate_ucdoc(traffic_dist), StringType())(
        df.uckey, df.ucdoc_elements))

    df_predictions_doc = df.select('uckey', 'prediction_output')
    rdd = df_predictions_doc.rdd.map(lambda x: transform.format_data(x, 'ucdoc'))
    rdd.saveAsNewAPIHadoopFile(
        path='-',
        outputFormatClass="org.elasticsearch.hadoop.mr.EsOutputFormat",
        keyClass="org.apache.hadoop.io.NullWritable",
        valueClass="org.elasticsearch.hadoop.mr.LinkedMapWritable",
        conf=es_write_conf)

    sc.stop()
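
sum_count_array and sum_day_count_array are project helpers that are not shown on this page. Judging from the inline Row comments, sum_count_array collapses a list of per-hour {hour: ['price_cat:count', ...]} maps into a single per-day ['price_cat:total', ...] list. A hedged, standalone sketch of that behaviour (not the project's actual implementation):

# Hedged sketch of a helper with the behaviour the inline Row comments suggest
# for sum_count_array: collapse [{hour: ['price_cat:count', ...]}, ...] into
# ['price_cat:total', ...]. This is an illustration, not the project's code.
from collections import defaultdict

def sum_count_array_sketch(hour_price_imp_map_list):
    totals = defaultdict(int)
    for hour_map in hour_price_imp_map_list:
        for count_array in hour_map.values():
            for item in count_array:
                price_cat, count = item.split(':')
                totals[price_cat] += int(count)
    return ['{}:{}'.format(k, v) for k, v in sorted(totals.items())]

# sum_count_array_sketch([{15: ['3:3']}, {7: ['3:5']}])  ->  ['3:8']

sum_day_count_array appears to do the analogous merge keyed by day, returning a single day-to-counts map wrapped in a list, as the ts values in the inline comments suggest.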
Example #4
def run(cfg, yesterday, serving_url):
    sc = SparkContext()
    hive_context = HiveContext(sc)
    sqlcontext = SQLContext(sc)
    forecaster = Forecaster(cfg)
    sc.setLogLevel(cfg['log_level'])

    dl_data_path = cfg['dl_predict_ready_path']
    dl_uckey_cluster_path = cfg['dl_uckey_cluster_path']
    distribution_table = cfg['distribution_table']
    norm_table = cfg['norm_table']

    # Reza
    # model_stats = get_model_stats_using_pickel(cfg)
    model_stat_table = cfg['model_stat_table']
    model_stats = get_model_stats(hive_context, model_stat_table)
    if not model_stats:
        sys.exit("dl_spark_cmd: " + "null model stats")

    # Read dist
    command = "SELECT DIST.uckey, DIST.ratio, DIST.cluster_uckey, DIST.price_cat FROM {} AS DIST ".format(
        distribution_table)

    df_dist = hive_context.sql(command)
    df_dist = df_dist.repartition("uckey")
    df_dist.cache()

    # Read norm table
    command = "SELECT uckey AS cluster_uckey, price_cat, a__n,a_1_n,a_2_n,a_3_n,a_4_n,a_5_n,a_6_n, t_UNKNOWN_n,t_3G_n,t_4G_n,t_WIFI_n,t_2G_n, g__n, g_g_f_n, g_g_m_n, g_g_x_n, price_cat_1_n, price_cat_2_n, price_cat_3_n, si_vec_n FROM {} ".format(
        norm_table)
    df_norm = hive_context.sql(command)

    # create day_list from yesterday for train_window
    duration = model_stats['model']['duration']
    day = datetime.strptime(yesterday, '%Y-%m-%d')
    day_list = []
    for _ in range(0, duration):
        day_list.append(datetime.strftime(day, '%Y-%m-%d'))
        day = day + timedelta(days=-1)
    day_list.sort()

    df = sqlcontext.read.parquet(dl_data_path)
    df_uckey_cluster = sqlcontext.read.parquet(dl_uckey_cluster_path)

    # TODO: where is sum_day_count_array?
    df = df.groupBy('cluster_uckey',
                    'price_cat').agg(collect_list('ts').alias('ts_list'))
    df = df.withColumn(
        'ts',
        udf(sum_day_count_array,
            ArrayType(MapType(StringType(),
                              ArrayType(StringType()))))(df.ts_list))
    df = df.drop('ts_list')

    df = df.join(df_norm, on=['cluster_uckey', 'price_cat'], how='inner')
    df = df.join(df_uckey_cluster,
                 on=['cluster_uckey', 'price_cat'],
                 how='inner')

    # df = df.where(df.uckey.like('%native,b6le0s4qo8,4G,g_f,5,CPC,,1156320000%'))
    predictor_udf = udf(
        transform.predict_daily_uckey(days=day_list,
                                      serving_url=serving_url,
                                      forecaster=forecaster,
                                      model_stats=model_stats,
                                      columns=df.columns),
        MapType(StringType(), FloatType()))

    df = df.withColumn(
        'day_prediction_map',
        predictor_udf(struct([df[name] for name in df.columns])))

    df = df.select('cluster_uckey', 'price_cat', 'day_prediction_map', 'ratio',
                   'uckey')

    mul_udf = udf(multiply_each_value_of_map_with_ratio,
                  MapType(StringType(), FloatType()))
    df = df.withColumn('day_prediction_map',
                       mul_udf(df.day_prediction_map, df.ratio))

    df = df.groupBy('uckey').agg(
        collect_list('day_prediction_map').alias('map_list'))

    count_map_udf = udf(
        get_day_count_map,
        MapType(StringType(), ArrayType(MapType(StringType(), LongType()))))
    df = df.withColumn('day_count_map', count_map_udf(df.map_list))
    df = df.select(df.uckey, df.day_count_map)

    df.cache()
    hdfs_df = df

    # The 'hits' JSON column built here is not persisted anywhere in this
    # snippet; only hdfs_df below is written out.
    df = df.withColumn(
        'hits',
        udf(lambda uckey, maps: add_uckey_to_json(uckey, maps),
            StringType())(df.uckey, df.day_count_map)).select("hits")

    hdfs_df = get_preditction_in_hdfs_formate(hdfs_df)
    hdfs_df.show()

    #hdfs_df.coalesce(hdfs_write_threads).write.mode('overwrite').partitionBy("day").parquet(cfg["hdfs_prefix_path"])
    hdfs_df.write.option('header',
                         'true').mode('overwrite').format('hive').saveAsTable(
                             cfg["es_predictions_index"])

    sc.stop()
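
multiply_each_value_of_map_with_ratio, like get_day_count_map, add_uckey_to_json, and get_preditction_in_hdfs_formate, is a project helper not shown here. From its name, the MapType(StringType(), FloatType()) return type, and the columns it is applied to, it presumably scales each predicted daily value by the uckey's traffic ratio. A hedged sketch of that behaviour:

# Hedged sketch of multiply_each_value_of_map_with_ratio, inferred from its
# name and the MapType(StringType(), FloatType()) return type; not the
# project's actual code.
def multiply_each_value_of_map_with_ratio_sketch(day_prediction_map, ratio):
    if day_prediction_map is None or ratio is None:
        return None
    return {day: float(value) * float(ratio)
            for day, value in day_prediction_map.items()}

# multiply_each_value_of_map_with_ratio_sketch({'2019-11-02': 220.0}, 0.12)
# -> {'2019-11-02': 26.4}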
Example #5
def run(cfg, yesterday):
    sc = SparkContext()
    hive_context = HiveContext(sc)
    forecaster = Forecaster(cfg)
    sc.setLogLevel(cfg['log_level'])

    # Reading the max bucket_id
    dl_data_path = cfg['dl_predict_ready_path']
    bucket_size = cfg['bucket_size']
    bucket_step = cfg['bucket_step']
    factdata_area_map = cfg['area_map_table']
    distribution_table = cfg['distribution_table']
    norm_table = cfg['norm_table']
    dl_uckey_cluster_path = cfg['dl_uckey_cluster_path']

    # Reza
    # model_stats = get_model_stats_using_pickel(cfg)
    model_stat_table = cfg['model_stat_table']
    model_stats = get_model_stats(hive_context, model_stat_table)
    if not model_stats:
        sys.exit("dl_spark_cmd: " + "null model stats")

    # Read dist
    command = "SELECT DIST.uckey, DIST.ratio, DIST.cluster_uckey, DIST.price_cat FROM {} AS DIST ".format(
        distribution_table)

    df_dist = hive_context.sql(command)
    df_dist = df_dist.repartition("uckey")
    df_dist.cache()

    # create day_list from yesterday for train_window
    duration = model_stats['model']['duration']
    day = datetime.strptime(yesterday, '%Y-%m-%d')
    day_list = []
    for _ in range(0, duration):
        day_list.append(datetime.strftime(day, '%Y-%m-%d'))
        day = day + timedelta(days=-1)
    day_list.sort()

    df_prediction_ready = None
    df_uckey_cluster = None
    start_bucket = 0
    global i
    i = sc.accumulator(0)

    while True:

        end_bucket = min(bucket_size, start_bucket + bucket_step)

        if start_bucket > end_bucket:
            break

        # Read factdata table
        command = " SELECT FACTDATA.count_array, FACTDATA.day, FACTDATA.hour, FACTDATA.uckey FROM {} AS FACTDATA WHERE FACTDATA.bucket_id BETWEEN {} AND {}  and FACTDATA.day in {}".format(
            factdata_area_map, str(start_bucket), str(end_bucket),
            tuple(day_list))

        start_bucket = end_bucket + 1

        df = hive_context.sql(command)
        # decrease partitions
        df = df.coalesce(200)

        # eligble_slot_ids is assumed to be a module-level list of slot ids
        # (not shown in this snippet); uckeys outside it are filtered out.
        if len(eligble_slot_ids) > 0:
            df = df.filter(
                udf(lambda x: eligble_slot_ids.__contains__(x.split(",")[1]),
                    BooleanType())(df.uckey))
        df = df.withColumn('hour_price_imp_map',
                           expr("map(hour, count_array)"))

        df = df.groupBy('uckey', 'day').agg(
            collect_list('hour_price_imp_map').alias(
                'hour_price_imp_map_list'))

        df = df.withColumn(
            'day_price_imp',
            udf(sum_count_array, ArrayType(StringType()))(
                df.hour_price_imp_map_list)).drop('hour_price_imp_map_list')

        df = df.withColumn('day_price_imp_map',
                           expr("map(day, day_price_imp)"))

        df = df.groupBy('uckey').agg(
            collect_list('day_price_imp_map').alias('day_price_imp_map_list'))

        df = df.join(df_dist, on=['uckey'], how='inner')
        df.cache()

        # df_uckey_cluster keeps the ratio and cluster_key for only uckeys that are being processed

        df_uckey_cluster = df.select('uckey', 'cluster_uckey', 'ratio',
                                     'price_cat')

        df = df.groupBy('cluster_uckey', 'price_cat').agg(
            collect_list('day_price_imp_map_list').alias(
                'cluster_day_price_imp_list'))
        df = df.withColumn(
            'ts',
            udf(sum_day_count_array,
                ArrayType(MapType(StringType(), ArrayType(StringType()))))(
                    df.cluster_day_price_imp_list))

        df = df.drop('cluster_day_price_imp_list')
        #dl_data_path = 'dl_prediction_ready'

        # First pass overwrites the parquet outputs; later bucket ranges append.
        if i.value == 0:
            df.coalesce(100).write.mode('overwrite').parquet(dl_data_path)
            df_uckey_cluster.coalesce(100).write.mode('overwrite').parquet(
                dl_uckey_cluster_path)

        else:
            df.coalesce(100).write.mode('append').parquet(dl_data_path)
            df_uckey_cluster.coalesce(100).write.mode('append').parquet(
                dl_uckey_cluster_path)

        i += 1
        df.unpersist()

    sc.stop()
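
The day_list construction used in Examples #3 through #5 walks back duration days from yesterday and sorts the dates ascending; a minimal standalone illustration with arbitrary inputs is below. Note that the tuple(day_list) interpolation in Example #5's factdata query relies on Python's tuple repr producing a parenthesized, quoted list that HiveQL accepts for IN; with a single-day list the trailing comma in the repr would break the query.

# Minimal standalone illustration of the day_list construction used above,
# with duration and yesterday chosen arbitrarily for the example.
from datetime import datetime, timedelta

yesterday, duration = '2018-01-05', 3
day = datetime.strptime(yesterday, '%Y-%m-%d')
day_list = []
for _ in range(duration):
    day_list.append(datetime.strftime(day, '%Y-%m-%d'))
    day += timedelta(days=-1)
day_list.sort()
# day_list == ['2018-01-03', '2018-01-04', '2018-01-05']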
Example #6
def run(cfg, model_name, model_version, serving_url):

    # os.environ[
    #     'PYSPARK_SUBMIT_ARGS'] = '--jars /home/reza/eshadoop/elasticsearch-hadoop-6.5.2/dist/elasticsearch-hadoop-6.5.2.jar pyspark-shell'

    es_write_conf = {
        "es.nodes":
        cfg['es_host'],
        "es.port":
        cfg['es_port'],
        "es.resource":
        cfg['es_predictions_index'] + '/' + cfg['es_predictions_type'],
        "es.batch.size.bytes":
        "1000000",
        "es.batch.size.entries":
        "100",
        "es.input.json":
        "yes",
        "es.mapping.id":
        "uckey",
        "es.nodes.wan.only":
        "true",
        "es.write.operation":
        "upsert"
    }

    sc = SparkContext()
    hive_context = HiveContext(sc)
    forecaster = Forecaster(cfg)
    sc.setLogLevel('WARN')

    # Reading the max bucket_id
    bucket_size = cfg['bucket_size']
    bucket_step = cfg['bucket_step']
    factdata = cfg['factdata']

    model_stats = get_model_stats(cfg, model_name, model_version)

    start_bucket = 0
    while True:

        end_bucket = min(bucket_size, start_bucket + bucket_step)

        if start_bucket > end_bucket:
            break

        # Read factdata table
        command = """
        select count_array,day,hour,uckey from {} where bucket_id between {} and {} and not day='2018-03-29' and not day='2018-03-30' and not day='2018-03-31'
        """.format(factdata, str(start_bucket), str(end_bucket))

        start_bucket = end_bucket + 1

        df = hive_context.sql(command)

        df = df.withColumn('hour_count', expr("map(hour, count_array)"))

        df = df.groupBy('uckey', 'day').agg(
            collect_list('hour_count').alias('hour_counts'))

        df = df.withColumn('day_hour_counts', expr("map(day, hour_counts)"))

        df = df.groupBy('uckey').agg(
            collect_list('day_hour_counts').alias('prediction_input'))

        predictor_udf = udf(
            predict_counts_for_uckey(serving_url, forecaster, model_stats,
                                     cfg), StringType())

        df = df.withColumn('prediction_output',
                           predictor_udf(df.uckey, df.prediction_input))

        df_predictions_doc = df.select('uckey', 'prediction_output')
        rdd = df_predictions_doc.rdd.map(lambda x: format_data(x, 'ucdoc'))
        rdd.saveAsNewAPIHadoopFile(
            path='-',
            outputFormatClass="org.elasticsearch.hadoop.mr.EsOutputFormat",
            keyClass="org.apache.hadoop.io.NullWritable",
            valueClass="org.elasticsearch.hadoop.mr.LinkedMapWritable",
            conf=es_write_conf)
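
Writing through org.elasticsearch.hadoop.mr.EsOutputFormat requires the elasticsearch-hadoop connector jar on the classpath, as the commented-out PYSPARK_SUBMIT_ARGS lines in Examples #3 and #6 hint. A hedged sketch of wiring that up from Python before the SparkContext is created (the jar path is a placeholder):

# Hedged sketch: supply the elasticsearch-hadoop connector that EsOutputFormat
# needs, mirroring the commented-out PYSPARK_SUBMIT_ARGS above. Must run before
# SparkContext() is constructed; the jar path is a placeholder.
import os

os.environ['PYSPARK_SUBMIT_ARGS'] = (
    '--jars /path/to/elasticsearch-hadoop-6.5.2.jar pyspark-shell')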