Example #1
def get_predict_post_body(model_stats, day_list, day_list_cut, page_ix, pf_age,
                          pf_si, pf_network, pf_gender, full_record, hits,
                          pf_price_cat, predict_day_list, forward_offset):

    train_window = model_stats['model']['train_window']  # comes from cfg
    predict_window = model_stats['model']['predict_window']  # comes from cfg
    x_hits = np.log(np.add(hits, 1)).tolist()  # log1p transform: ln(hits + 1)
    full_record_exp = np.log(np.add(full_record, 1)).tolist()

    if len(day_list_cut) != train_window + predict_window:
        raise Exception(
            'len(day_list_cut) does not match train_window + predict_window. {} {} {}'
            .format(len(day_list_cut), train_window, predict_window))

    dow = get_dow(day_list_cut)
    dow = [[dow[0][i], dow[1][i]]
           for i in range(train_window + predict_window)]
    # append any prediction days that are missing from day_list
    for x in predict_day_list:
        if x not in day_list:
            day_list.append(x)
    lagged_indx = np.stack(lag_indexes(day_list), axis=-1)

    # not used in the model (but we should keep it)
    page_popularity = np.median(full_record_exp)
    page_popularity = ((page_popularity - model_stats['stats']['page_popularity'][0])
                       / model_stats['stats']['page_popularity'][1])
    quarter_autocorr = 1

    duration = model_stats['model']['duration']

    # x_hits, x_features, norm_x_hits, x_lagged, y_features, mean, std, flat_ucdoc_features, page_ix
    truex, timex, normx, laggedx, timey, normmean, normstd, pgfeatures, pageix = make_pred_input(
        duration, train_window, predict_window, full_record_exp, x_hits, dow,
        lagged_indx, pf_age, pf_si, pf_network, pf_gender, page_ix,
        pf_price_cat, page_popularity, quarter_autocorr, forward_offset)

    # ys are not important; placeholders only
    truey = [1 for _ in range(predict_window)]
    normy = [1 for _ in range(predict_window)]

    instance = {
        "truex": truex,
        "timex": timex.tolist(),
        "normx": normx,
        "laggedx": laggedx.tolist(),
        "truey": truey,
        "timey": timey.tolist(),
        "normy": normy,
        "normmean": normmean,
        "normstd": normstd,
        "page_features": pgfeatures.tolist(),
        "pageix": pageix
    }
    # print(instance)
    return instance  # , stat
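
Note: get_predict_post_body only reads a handful of keys from model_stats. A minimal sketch of the expected structure, assuming illustrative placeholder values (the real dict is produced elsewhere in the pipeline), could look like this:

# Hypothetical model_stats layout; only the keys read above are shown, values are placeholders.
model_stats = {
    'model': {
        'train_window': 60,     # history days fed to the model
        'predict_window': 10,   # days to forecast
        'duration': 90,         # total length of the day axis
    },
    'stats': {
        # (mean, std) pairs used for z-score normalization
        'page_popularity': (4.2, 1.1),
    },
}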
Example #2
def run(cfg):

    with open(cfg['tf_statistics_path'], 'rb') as f:
        tf_stat = pickle.load(f)

    names = []
    tfrecord_location = cfg['tfrecords_local_path']
    for file in os.listdir(tfrecord_location):
        if file.startswith("part"):
            names.append(file)
    file_paths = [os.path.join(tfrecord_location, name) for name in names]

    # read and make the dataset from tfrecord
    dataset = tf.data.TFRecordDataset(file_paths)
    dataset = dataset.map(__data_parser)

    batch_size = cfg['batch_size']
    duration = cfg['duration']

    # note: shuffling after batching reorders whole batches, not individual records
    dataset = dataset.batch(batch_size).shuffle(SHUFFLE_BUFFER)
    iterator = dataset.make_one_shot_iterator()
    next_el = iterator.get_next()

    # lagged_ix = numpy.ones((duration, 4), dtype=float)
    # lagged_ix = np.where(lagged_ix == 1, -1, lagged_ix)
    lagged_ix = np.stack(lag_indexes(tf_stat), axis=-1)
    # quarter_autocorr = numpy.ones((batch_size,), dtype=float)

    date_list = tf_stat['days']
    dow = get_dow(date_list)

    holiday_list = cfg['holidays']

    holidays = [1 if day in holiday_list else 0 for day in date_list]
    a_list = []
    b_list = []
    for h in holidays:
        a, b = holiday_norm(h)
        a_list.append(a)
        b_list.append(b)
    holiday = (a_list, b_list)

    with tf.Session() as sess:

        x = sess.run(next_el)
        quarter_autocorr = np.ones((x[0].size, ), dtype=float)
        page_indx = list(x[0])

        fill_isolated_zeros(x[21])
        tensors = dict(
            hits=pd.DataFrame(x[21], index=page_indx, columns=date_list),
            lagged_ix=lagged_ix,
            page_ix=page_indx,
            pf_age=pd.DataFrame(x[8:15],
                                columns=page_indx,
                                index=(1, 2, 3, 4, 5, 6, 7)).T,
            pf_si=pd.DataFrame(x[20], index=page_indx),
            pf_network=pd.DataFrame(x[15:20],
                                    columns=page_indx,
                                    index=('2G', '3G', '4G', 'UNKNOWN',
                                           'WIFI')).T,
            pf_price_cat=pd.DataFrame(x[1:4],
                                      columns=page_indx,
                                      index=('pc1', 'pc2', 'pc3')).T,
            pf_gender=pd.DataFrame(x[4:8],
                                   columns=page_indx,
                                   index=('none', 'f', 'm', 'x')).T,
            page_popularity=x[22],
            # page_popularity = quarter_autocorr,
            quarter_autocorr=quarter_autocorr,
            dow=pd.DataFrame(dow).T,
            holiday=pd.DataFrame(holiday).T)

        data_len = tensors['hits'].shape[1]
        plain = dict(data_days=data_len - cfg['add_days'],
                     features_days=data_len,
                     data_start=date_list[0],
                     data_end=date_list[-1],
                     features_end=date_list[-1],
                     n_pages=batch_size)
        VarFeeder(cfg['data_dir'], tensors, plain)
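
Note: run() pulls everything it needs from a single cfg dict. A minimal sketch assuming only the keys accessed above (paths and values are illustrative placeholders):

# Hypothetical cfg for run(); keys match the accesses above, values are illustrative.
cfg = {
    'tf_statistics_path': 'tf_statistics.pkl',  # pickle with at least a 'days' list
    'tfrecords_local_path': 'tfrecords',        # directory containing part-* TFRecord files
    'batch_size': 1000,
    'duration': 90,
    'holidays': ['2019-11-09', '2019-11-11'],   # same date format as tf_stat['days']
    'add_days': 10,                             # feature-only days beyond the data window
    'data_dir': 'data',                         # output directory for VarFeeder
}
run(cfg)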
Example #3
def get_predict_post_body(model_stats, day_list, day_list_cut, uckey, age, si,
                          network, gender, media, ip_location, full_record,
                          hits, hour, price_cat):

    price_cat = str(price_cat)
    hour = str(hour)

    train_window = model_stats['model']['train_window']  # comes from cfg
    predict_window = model_stats['model']['predict_window']  # comes from cfg
    x_hits = np.log(np.add(hits, 1)).tolist()  # log1p transform: ln(hits + 1)
    full_record_exp = np.log(np.add(full_record, 1)).tolist()

    if len(day_list_cut) != train_window + predict_window:
        raise Exception(
            'len(day_list_cut) does not match train_window + predict_window. {} {} {}'
            .format(len(day_list_cut), train_window, predict_window))

    dow = get_dow(day_list_cut)
    dow = [[dow[0][i], dow[1][i]]
           for i in range(train_window + predict_window)]

    lagged_indx = np.stack(lag_indexes(day_list), axis=-1)
    # lagged_hits = [0 for i in range(2)]
    # lagged_hits = [lagged_hits for _ in range(train_window+predict_window)]

    m = model_stats['stats']
    pf_age = [(int(age == '1') - m['a_1'][0]) / m['a_1'][1],
              (int(age == '2') - m['a_2'][0]) / m['a_2'][1],
              (int(age == '3') - m['a_3'][0]) / m['a_3'][1],
              (int(age == '4') - m['a_4'][0]) / m['a_4'][1]]

    pf_si = [(int(si == '1') - m['si_1'][0]) / m['si_1'][1],
             (int(si == '2') - m['si_2'][0]) / m['si_2'][1],
             (int(si == '3') - m['si_3'][0]) / m['si_3'][1]]

    pf_network = [(int(network == '3G') - m['t_3G'][0]) / m['t_3G'][1],
                  (int(network == '4G') - m['t_4G'][0]) / m['t_4G'][1],
                  (int(network == '5G') - m['t_5G'][0]) / m['t_5G'][1]]

    pf_gender = [(int(gender == 'g_f') - m['g_g_f'][0]) / m['g_g_f'][1],
                 (int(gender == 'g_m') - m['g_g_m'][0]) / m['g_g_m'][1],
                 (int(gender == 'g_x') - m['g_g_x'][0]) / m['g_g_x'][1]]

    pf_price_cat = [
        (int(price_cat == '0') - m['price_cat_0'][0]) / m['price_cat_0'][1],
        (int(price_cat == '1') - m['price_cat_1'][0]) / m['price_cat_1'][1],
        (int(price_cat == '2') - m['price_cat_2'][0]) / m['price_cat_2'][1],
        (int(price_cat == '3') - m['price_cat_3'][0]) / m['price_cat_3'][1]
    ]

    page_ix = ','.join([uckey, price_cat, hour])

    # not used
    page_popularity = np.median(full_record_exp)
    page_popularity = (page_popularity -
                       model_stats['stats']['page_popularity'][0]
                       ) / model_stats['stats']['page_popularity'][1]
    quarter_autocorr = 1

    duration = model_stats['model']['duration']
    # x_hits, x_features, norm_x_hits, x_lagged, y_features, mean, std, flat_ucdoc_features, page_ix
    truex, timex, normx, laggedx, timey, normmean, normstd, pgfeatures, pageix = make_pred_input(
        duration, train_window, predict_window, full_record_exp, x_hits, dow,
        lagged_indx, pf_age, pf_si, pf_network, pf_gender, page_ix,
        pf_price_cat, page_popularity, quarter_autocorr)

    # ys are not important; placeholders only
    truey = [1 for _ in range(predict_window)]
    normy = [1 for _ in range(predict_window)]

    instance = {
        "truex": truex,
        "timex": timex.tolist(),
        "normx": normx,
        "laggedx": laggedx.tolist(),
        "truey": truey,
        "timey": timey.tolist(),
        "normy": normy,
        "normmean": normmean,
        "normstd": normstd,
        "page_features": pgfeatures.tolist(),
        "pageix": pageix
    }
    # print(instance)
    return instance  # , stat
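
Note: every pf_* block above applies the same transformation: a 0/1 indicator for the category, z-scored with the (mean, std) pair stored in model_stats['stats']. A small hypothetical helper (not part of the original module) makes that pattern explicit:

def normalize_indicator(value, expected, mean_std):
    # Hypothetical helper: 0/1 indicator for value == expected,
    # z-scored with the (mean, std) pair from model_stats['stats'].
    mean, std = mean_std
    return (int(value == expected) - mean) / std

# For example, pf_gender could equivalently be written as:
# pf_gender = [normalize_indicator(gender, g, m['g_' + g]) for g in ('g_f', 'g_m', 'g_x')]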