def run():
    parser = argparse.ArgumentParser(description='Prepare data')
    parser.add_argument(
        '--data_dir',
        default='/home/oniculaescu/train/features',
        help=
        'Directory where the features for constructing the NN will be saved'
    )
    parser.add_argument(
        '--threshold',
        default=0.0,
        type=float,
        help='Series minimal length threshold (pct of data length)')
    parser.add_argument(
        '--add_days',
        default=64,
        type=int,
        help='Number of days to be added in the future for prediction')
    parser.add_argument('--start', help='Effective start date')
    parser.add_argument('--end', help="Effective end date")
    parser.add_argument('--attr',
                        default='download',
                        help="tell what pkl file to use for feature creation")
    parser.add_argument('--corr_backoffset',
                        default=0,
                        type=int,
                        help="Offset for correlation computation")

    args = parser.parse_args()

    # get the data
    df, nans, starts, ends = prepare_data(args.attr, args.start, args.end,
                                          args.threshold)

    # find the working date range
    data_start, data_end = df.columns[0], df.columns[-1]

    # project date-dependent features (like day of week) to the future dates for prediction
    #features_end = data_end + " " + str(pd.Timedelta(args.add_days, unit='D'))
    print(data_end)
    data_end_1 = datetime.datetime.strptime(data_end, '%Y-%m-%d')
    #features_end = data_end_1 + pd.TimeDelta(args.add_days, unit='D')
    features_end = data_end_1 + datetime.timedelta(days=args.add_days)
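    # Note: the commented-out alternative above would fail because pandas spells the
    # class pd.Timedelta (not pd.TimeDelta); datetime.timedelta is equivalent here,
    # since data_end has already been parsed into a datetime and add_days is an int.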
    print("start: " + data_start + ", end: " + data_end + ", features_end: " +
          str(features_end))

    # Group unique ases by continent
    assert df.index.is_monotonic_increasing
    continent_map = uniq_continent_map(df.index.values)

    # Group unique ases by country
    country_map = uniq_country_map(df.index.values)

    # yearly autocorrelation
    raw_year_autocorr = batch_autocorrelation(df.values, 365, starts, ends,
                                              1.5, args.corr_backoffset)
    year_unknown_pct = np.sum(np.isnan(raw_year_autocorr)) / len(
        raw_year_autocorr)  # type: float

    # quarterly autocorrelation
    raw_quarter_autocorr = batch_autocorrelation(df.values,
                                                 int(round(365.25 / 4)),
                                                 starts, ends, 2,
                                                 args.corr_backoffset)
    quarter_unknown_pct = np.sum(np.isnan(raw_quarter_autocorr)) / len(
        raw_quarter_autocorr)  # type: float

    print("Percent of undefined autocorr = yearly:%.3f, quarterly:%.3f" %
          (year_unknown_pct, quarter_unknown_pct))

    # Normalise all the things
    year_autocorr = normalize(np.nan_to_num(raw_year_autocorr))
    quarter_autocorr = normalize(np.nan_to_num(raw_quarter_autocorr))

    # Make time-dependent features
    features_days = pd.date_range(data_start, features_end)
    #dow = normalize(features_days.dayofweek.values)
    week_period = 7 / (2 * np.pi)
    dow_norm = features_days.dayofweek / week_period
    print(dow_norm)
    dow = np.stack([np.cos(dow_norm), np.sin(dow_norm)], axis=-1)
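    # The day of week (0=Monday .. 6=Sunday) is mapped onto the unit circle so that
    # Sunday and Monday end up adjacent: angle = dayofweek * 2*pi / 7. For example,
    # Wednesday (dayofweek=2) gives an angle of about 1.795 rad, i.e.
    # (cos, sin) ~= (-0.22, 0.97). Stacking cos and sin yields a (n_days, 2) array.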

    # Assemble indices for quarterly lagged data
    lagged_ix = np.stack(lag_indexes(data_start, features_end), axis=-1)
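    # lag_indexes() is assumed to return, for every date in [data_start, features_end],
    # the positional index of the same series 3, 6, 9 and 12 months earlier, with -1
    # where the lagged date falls before data_start (a sample lagged_ix dump appears
    # in a later example); np.stack combines the four series into a (n_days, 4) array.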

    #page_popularity = df.median(axis=1)
    #page_popularity = (page_popularity - page_popularity.mean()) / page_popularity.std()

    # Put NaNs back
    df[nans] = np.NaN

    # Assemble final output
    tensors = dict(
        hits=df,
        lagged_ix=lagged_ix,
        continent_map=continent_map,
        #country_map=country_map,
        as_ix=df.index.values,
        year_autocorr=year_autocorr,
        quarter_autocorr=quarter_autocorr,
        dow=dow,
    )
    plain = dict(features_days=len(features_days),
                 data_days=len(df.columns),
                 n_ases=len(df),
                 data_start=data_start,
                 data_end=data_end,
                 features_end=features_end)

    print(args.data_dir)
    #print tensors
    # Store data to the disk
    VarFeeder(args.data_dir, tensors, plain)
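# Hypothetical command line for the run() above (script name, paths and dates are placeholders):
#   python make_features.py --data_dir=/tmp/features --attr=download \
#       --start=2018-01-01 --end=2018-12-31 --add_days=64
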
def run(args):    
    # Get the data
    df, nans, starts, ends = prepare_data(args['start'], args['end'], args['valid_threshold'])

    # Our working date range
    data_start, data_end = df.columns[0], df.columns[-1]

    # We have to project some date-dependent features (day of week, etc) to the future dates for prediction
    features_end = data_end + pd.Timedelta(args['add_days'], unit='D')
    print(f"start: {data_start}, end:{data_end}, features_end:{features_end}")

    # Group unique pages by agents
    assert df.index.is_monotonic_increasing  # check that the index is monotonically increasing
    page_map = uniq_page_map(df.index.values)
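    # uniq_page_map() is assumed to map every unique page name to the row indices of
    # its (up to four) agent variants, so that all variants of a page can be looked
    # up together; see the tensor description comments in a later example.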

    # Yearly(annual) autocorrelation
    raw_year_autocorr = batch_autocorr(df.values, 365, starts, ends, 1.5, args['corr_backoffset'])
    year_unknown_pct = np.sum(np.isnan(raw_year_autocorr))/len(raw_year_autocorr)  # type: float

    # Quarterly autocorrelation
    raw_quarter_autocorr = batch_autocorr(df.values, int(round(365.25/4)), starts, ends, 2, args['corr_backoffset'])
    quarter_unknown_pct = np.sum(np.isnan(raw_quarter_autocorr)) / len(raw_quarter_autocorr)  # type: float

    print("Percent of undefined autocorr = yearly:%.3f, quarterly:%.3f" % (year_unknown_pct, quarter_unknown_pct))

    # Normalize all the things
    year_autocorr = normalize(np.nan_to_num(raw_year_autocorr)) # replace NaN with 0 and infinity with large finite numbers
    quarter_autocorr = normalize(np.nan_to_num(raw_quarter_autocorr))

    # Calculate and encode page features (site, country, etc)
    page_features = make_page_features(df.index.values)
    encoded_page_features = encode_page_features(page_features)
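    # make_page_features() is assumed to parse agent, country and site out of each
    # page URL, and encode_page_features() to one-hot encode them into the
    # pf_agent / pf_country / pf_site tensors used below.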

    # Make time-dependent features
    features_days = pd.date_range(data_start, features_end)
    #dow = normalize(features_days.dayofweek.values)
    week_period = 7 / (2 * np.pi)
    dow_norm = features_days.dayofweek.values / week_period
    dow = np.stack([np.cos(dow_norm), np.sin(dow_norm)], axis=-1)  # stack into two columns (cos, sin)

    # Assemble indices for quarterly lagged data
    lagged_ix = np.stack(lag_indexes(data_start, features_end), axis=-1)

    page_popularity = df.median(axis=1)
    page_popularity = (page_popularity - page_popularity.mean()) / page_popularity.std()  # z-score standardization

    # Put NaNs back
    df[nans] = np.NaN

    # Assemble final output
    tensors = dict(
        hits=df,
        lagged_ix=lagged_ix,
        page_map=page_map,
        page_ix=df.index.values,
        pf_agent=encoded_page_features['agent'],
        pf_country=encoded_page_features['country'],
        pf_site=encoded_page_features['site'],
        page_popularity=page_popularity,
        year_autocorr=year_autocorr,
        quarter_autocorr=quarter_autocorr,
        dow=dow,
    )
    plain = dict(
        features_days=len(features_days),
        data_days=len(df.columns),
        n_pages=len(df),
        data_start=data_start,
        data_end=data_end,
        features_end=features_end
    )

    # Store data to the disk
    VarFeeder(args['data_dir'], tensors, plain)
def main(_):
    if len(sys.argv) < 3:
        print(
            'Usage: ucdoc_saved_model.py [--model_version=y] --data_dir=xxx --ckpt_dir=xxx --saved_dir=xxx'
        )
        sys.exit(-1)
    if FLAGS.training_iteration <= 0:
        print('Please specify a positive value for training iteration.')
        sys.exit(-1)
    if FLAGS.model_version <= 0:
        print('Please specify a positive value for version number.')
        sys.exit(-1)

    # create deploy model first
    with tf.variable_scope('input') as inp_scope:
        with tf.device("/cpu:0"):
            #inp = VarFeeder.read_vars("data/vars")
            inp = VarFeeder.read_vars(FLAGS.data_dir)
            pipe = InputPipe(inp,
                             ucdoc_features(inp),
                             inp.hits.shape[0],
                             mode=ModelMode.PREDICT,
                             batch_size=FLAGS.batch_size,
                             n_epoch=1,
                             verbose=False,
                             train_completeness_threshold=0.01,
                             predict_window=FLAGS.predict_window,
                             predict_completeness_threshold=0.0,
                             train_window=FLAGS.train_window,
                             back_offset=FLAGS.predict_window + 1)

    asgd_decay = 0.99 if FLAGS.asgd else None

    if FLAGS.n_models == 1:
        model = Model(pipe,
                      build_from_set(FLAGS.hparam_set),
                      is_train=False,
                      seed=1,
                      asgd_decay=asgd_decay)
    else:
        models = []
        for i in range(FLAGS.n_models):
            prefix = f"m_{i}"
            with tf.variable_scope(prefix) as scope:
                models.append(
                    Model(pipe,
                          build_from_set(FLAGS.hparam_set),
                          is_train=False,
                          seed=1,
                          asgd_decay=asgd_decay,
                          graph_prefix=prefix))
        model = models[FLAGS.target_model]

    # load checkpoint model from training
    #ckpt_path = FLAGS.ckpt_dir
    print('loading checkpoint model...')
    ckpt_file = tf.train.latest_checkpoint(FLAGS.ckpt_dir)
    #graph = tf.Graph()
    graph = model.predictions.graph

    saver = tf.train.Saver(name='deploy_saver', var_list=None)
    with tf.Session(config=tf.ConfigProto(gpu_options=tf.GPUOptions(
            allow_growth=True))) as sess:
        pipe.load_vars(sess)
        pipe.init_iterator(sess)
        saver.restore(sess, ckpt_file)
        print('Done loading checkpoint model')
        export_path_base = FLAGS.saved_dir
        export_path = os.path.join(
            tf.compat.as_bytes(export_path_base),
            tf.compat.as_bytes(str(FLAGS.model_version)))
        print('Exporting trained model to', export_path)
        if os.path.isdir(export_path):
            shutil.rmtree(export_path)
        builder = tf.saved_model.builder.SavedModelBuilder(export_path)

        true_x = tf.saved_model.utils.build_tensor_info(model.inp.true_x)
        time_x = tf.saved_model.utils.build_tensor_info(model.inp.time_x)
        norm_x = tf.saved_model.utils.build_tensor_info(model.inp.norm_x)
        lagged_x = tf.saved_model.utils.build_tensor_info(model.inp.lagged_x)
        true_y = tf.saved_model.utils.build_tensor_info(model.inp.true_y)
        time_y = tf.saved_model.utils.build_tensor_info(model.inp.time_y)
        norm_y = tf.saved_model.utils.build_tensor_info(model.inp.norm_y)
        norm_mean = tf.saved_model.utils.build_tensor_info(model.inp.norm_mean)
        norm_std = tf.saved_model.utils.build_tensor_info(model.inp.norm_std)
        pg_features = tf.saved_model.utils.build_tensor_info(
            model.inp.ucdoc_features)
        page_ix = tf.saved_model.utils.build_tensor_info(model.inp.page_ix)

        pred = tf.saved_model.utils.build_tensor_info(model.predictions)

        labeling_signature = (
            tf.saved_model.signature_def_utils.build_signature_def(
                inputs={
                    "truex": true_x,
                    "timex": time_x,
                    "normx": norm_x,
                    "laggedx": lagged_x,
                    "truey": true_y,
                    "timey": time_y,
                    "normy": norm_y,
                    "normmean": norm_mean,
                    "normstd": norm_std,
                    "page_features": pg_features,
                    "pageix": page_ix,
                },
                outputs={"pred": pred},
                method_name="tensorflow/serving/predict"))

        legacy_init_op = tf.group(tf.tables_initializer(),
                                  name='legacy_init_op')

        builder.add_meta_graph_and_variables(
            sess, [tf.saved_model.tag_constants.SERVING],
            signature_def_map={
                tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
                labeling_signature
            },
            main_op=tf.tables_initializer(),
            strip_default_attrs=True)

        builder.save()
        print("Build Done")
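        # The exported model can be inspected with the standard SavedModel CLI, e.g.
        #   saved_model_cli show --dir <export_path> --all
        # which should list the serving_default signature with the inputs/outputs
        # declared above.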
Example #4
def predict(checkpoints, hparams, return_x=False, verbose=False, predict_window=6, back_offset=0, n_models=1,
            target_model=0, asgd=False, seed=1, batch_size=1024):
    with tf.variable_scope('input') as inp_scope:
        with tf.device("/cpu:0"):
            inp = VarFeeder.read_vars("data/vars")
            pipe = InputPipe(inp, page_features(inp), inp.n_pages, mode=ModelMode.PREDICT, batch_size=batch_size,
                             n_epoch=1, verbose=verbose,
                             train_completeness_threshold=0.01,
                             predict_window=predict_window,
                             predict_completeness_threshold=0.0, train_window=hparams.train_window,
                             back_offset=back_offset)
    asgd_decay = 0.99 if asgd else None
    if n_models == 1:
        model = Model(pipe, hparams, is_train=False, seed=seed, asgd_decay=asgd_decay)
    else:
        models = []
        for i in range(n_models):
            prefix = f"m_{i}"
            with tf.variable_scope(prefix) as scope:
                models.append(Model(pipe, hparams, is_train=False, seed=seed, asgd_decay=asgd_decay, graph_prefix=prefix))
        model = models[target_model]

    if asgd:
        var_list = model.ema.variables_to_restore()
        prefix = f"m_{target_model}"
        for var in list(var_list.keys()):
            if var.endswith('ExponentialMovingAverage') and not var.startswith(prefix):
                del var_list[var]
    else:
        var_list = None
    saver = tf.train.Saver(name='eval_saver', var_list=var_list)
    x_buffer = []
    predictions = None
    with tf.Session(config=tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))) as sess:
        pipe.load_vars(sess)
        for checkpoint in checkpoints:
            pred_buffer = []
            pipe.init_iterator(sess)
            saver.restore(sess, checkpoint)
            cnt = 0
            while True:
                try:
                    if return_x:
                        pred, x, pname = sess.run([model.predictions, model.inp.true_x, model.inp.page_ix])
                    else:
                        pred, pname = sess.run([model.predictions, model.inp.page_ix])
                    utf_names = [str(name, 'utf-8') for name in pname]
                    pred_df = pd.DataFrame(index=utf_names, data=np.expm1(pred))
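                    # Predictions come out in log1p space (the hits tensor is
                    # log-transformed when features are built), so np.expm1 maps
                    # them back to raw counts.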
                    pred_buffer.append(pred_df)
                    if return_x:
                        # noinspection PyUnboundLocalVariable
                        x_values = pd.DataFrame(index=utf_names, data=np.round(np.expm1(x)).astype(np.int64))
                        x_buffer.append(x_values)
                    newline = cnt % 80 == 0
                    if cnt > 0:
                        print('.', end='\n' if newline else '', flush=True)
                    if newline:
                        print(cnt, end='')
                    cnt += 1
                except tf.errors.OutOfRangeError:
                    print('🎉')
                    break
            cp_predictions = pd.concat(pred_buffer)
            if predictions is None:
                predictions = cp_predictions
            else:
                predictions += cp_predictions
    predictions /= len(checkpoints)
    offset = pd.Timedelta(back_offset, 'D')
    start_prediction = inp.data_end + pd.Timedelta('1D') - offset
    end_prediction = start_prediction + pd.Timedelta(predict_window - 1, 'D')
    predictions.columns = pd.date_range(start_prediction, end_prediction)
    if return_x:
        x = pd.concat(x_buffer)
        start_data = inp.data_end - pd.Timedelta(hparams.train_window - 1, 'D') - back_offset
        end_data = inp.data_end - back_offset
        x.columns = pd.date_range(start_data, end_data)
        return predictions, x
    else:
        return predictions
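# Hypothetical usage of predict() above (checkpoint paths and hparams are placeholders):
#   preds = predict(['ckpt/s32-1', 'ckpt/s32-2'], hparams, n_models=3, target_model=0,
#                   asgd=True, predict_window=63)
# Predictions from the listed checkpoints are averaged; with n_models > 1, target_model
# selects which sub-graph (m_0, m_1, ...) to restore.
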
def run():
    parser = argparse.ArgumentParser(description='Prepare data')
    parser.add_argument('data_dir')
    parser.add_argument(
        '--valid_threshold',
        default=0.04,
        type=float,
        help="Series minimal length threshold (pct of data length)")
    parser.add_argument('--add_timestamp',
                        default=288,
                        type=int,
                        help="Add N timestamp in a future for prediction")
    parser.add_argument(
        '--start',
        default=0,
        type=int,
        help="Effective start date. Data before the start is dropped")
    parser.add_argument(
        '--end',
        default=-288,
        type=int,
        help="Effective end date. Data past the end is dropped")
    parser.add_argument('--corr_backoffset',
                        default=0,
                        type=int,
                        help='Offset for correlation calculation')
    parser.add_argument('--split_df',
                        default=0,
                        type=int,
                        help="Whether to split vms w.r.t. abnormal behaviour")
    args = parser.parse_args()

    # Get the data
    df, starts, ends = prepare_data(args.split_df, args.start, args.end,
                                    args.valid_threshold)

    # Our working date range
    data_start, data_end = df.columns[0], df.columns[-1]

    # We have to project some date-dependent features (day of week, etc) to the future dates for prediction
    features_end = data_end + args.add_timestamp
    print(f"start: {data_start}, end:{data_end}, features_end:{features_end}")
    features_time = features_end - data_start

    assert df.index.is_monotonic_increasing

    # daily autocorrelation
    day_autocorr = batch_autocorr(df.values, 288, starts, ends, 1.5,
                                  args.corr_backoffset)
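    # A lag of 288 samples spans one day (presumably 5-minute resolution:
    # 288 * 5 min = 24 h), so 288 captures daily and 288 * 7 weekly autocorrelation.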

    # weekly autocorrelation
    week_autocorr = batch_autocorr(df.values, 288 * 7, starts, ends, 2,
                                   args.corr_backoffset)

    # Normalise all the things
    day_autocorr = normalize(np.nan_to_num(day_autocorr))
    week_autocorr = normalize(np.nan_to_num(week_autocorr))

    # Make time-dependent features
    feature_time = np.arange(data_start, features_end + 1) % 288
    day_period = 288 / (2 * np.pi)
    dow_norm = feature_time / day_period
    dow = np.stack([np.cos(dow_norm), np.sin(dow_norm)], axis=-1)

    # Assemble indices for quarterly lagged data
    lagged_ix = np.stack(lag_indexes(data_start, features_end), axis=-1)

    # Assemble final output
    tensors = dict(
        usage=df,
        lagged_ix=lagged_ix,
        vm_ix=df.index.values,
        day_autocorr=day_autocorr,
        week_autocorr=week_autocorr,
        starts=starts,
        ends=ends,
        dow=dow,
    )
    plain = dict(features_time=features_time,
                 data_time=len(df.columns),
                 n_vm=len(df),
                 data_start=data_start,
                 data_end=data_end,
                 features_end=features_end)

    # Store data to the disk
    VarFeeder(args.data_dir, tensors, plain)
Example #6
def predict(checkpoints, hparams, return_x=False, verbose=False, predict_window=6, back_offset=0, n_models=1,
            target_model=0, asgd=False, seed=1, batch_size=1024):
    with tf.variable_scope('input') as inp_scope:
        with tf.device("/cpu:0"):
            inp = VarFeeder.read_vars("data/vars")
            pipe = InputPipe(inp, page_features(inp), inp.n_pages, mode=ModelMode.PREDICT, batch_size=batch_size,
                             n_epoch=1, verbose=verbose,
                             train_completeness_threshold=0.01,
                             predict_window=predict_window,
                             predict_completeness_threshold=0.0, train_window=hparams.train_window,
                             back_offset=back_offset)
    asgd_decay = 0.99 if asgd else None
    if n_models == 1:
        model = Model(pipe, hparams, is_train=False, seed=seed, asgd_decay=asgd_decay)
    else:
        models = []
        for i in range(n_models):
            prefix = f"m_{i}"
            with tf.variable_scope(prefix) as scope:
                models.append(Model(pipe, hparams, is_train=False, seed=seed, asgd_decay=asgd_decay, graph_prefix=prefix))
        model = models[target_model]

    if asgd:
        var_list = model.ema.variables_to_restore()
        print("EMA variables to restore:", var_list)
        prefix = f"m_{target_model}"
        for var in list(var_list.keys()):
            if var.endswith('ExponentialMovingAverage') and not var.startswith(prefix):
                del var_list[var]
    else:
        var_list = None
    
    saver = tf.train.Saver(name='eval_saver', var_list=var_list)
    x_buffer = []
    predictions = None
    with tf.Session(config=tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))) as sess:
        pipe.load_vars(sess)
        for checkpoint in checkpoints:
            pred_buffer = []
            pipe.init_iterator(sess)
            saver.restore(sess, checkpoint)
            cnt = 0
            while True:
                try:
                    if return_x:
                        pred, x, pname = sess.run([model.predictions, model.inp.true_x, model.inp.page_ix])
                    else:
                        pred, pname = sess.run([model.predictions, model.inp.page_ix])
                    utf_names = [str(name, 'utf-8') for name in pname]
                    pred_df = pd.DataFrame(index=utf_names, data=np.expm1(pred))
                    pred_buffer.append(pred_df)
                    if return_x:
                        # noinspection PyUnboundLocalVariable
                        x_values = pd.DataFrame(index=utf_names, data=np.round(np.expm1(x)).astype(np.int64))
                        x_buffer.append(x_values)
                    newline = cnt % 80 == 0
                    if cnt > 0:
                        print('.', end='\n' if newline else '', flush=True)
                    if newline:
                        print(cnt, end='')
                    cnt += 1
                except tf.errors.OutOfRangeError:
                    print('🎉')
                    break
            cp_predictions = pd.concat(pred_buffer)
            if predictions is None:
                predictions = cp_predictions
            else:
                predictions += cp_predictions
    predictions /= len(checkpoints)
    offset = pd.Timedelta(back_offset, 'D')
    start_prediction = inp.data_end + pd.Timedelta('1D') - offset
    end_prediction = start_prediction + pd.Timedelta(predict_window - 1, 'D')
    predictions.columns = pd.date_range(start_prediction, end_prediction)
    if return_x:
        x = pd.concat(x_buffer)
        start_data = inp.data_end - pd.Timedelta(hparams.train_window - 1, 'D') - back_offset
        end_data = inp.data_end - back_offset
        x.columns = pd.date_range(start_data, end_data)
        return predictions, x
    else:
        return predictions
Example #7
def run():
    parser = argparse.ArgumentParser(description='Prepare data')
    parser.add_argument('data_dir')
    parser.add_argument(
        '--valid_threshold',
        default=0.0,
        type=float,
        help="Series minimal length threshold (pct of data length)")
    parser.add_argument('--add_days',
                        default=63,
                        type=int,
                        help="Add N days in a future for prediction")
    parser.add_argument(
        '--start',
        help="Effective start date. Data before the start is dropped")
    parser.add_argument(
        '--end', help="Effective end date. Data past the end is dropped")
    parser.add_argument('--corr_backoffset',
                        default=0,
                        type=int,
                        help='Offset for correlation calculation')
    args = parser.parse_args()
    # print("args",args.start)
    # todo python make_features.py data/vars --add_days=63

    # Get the data
    df, nans, starts, ends = prepare_data(args.start, args.end,
                                          args.valid_threshold)
    # print(f"starts={starts},ends={ends}; df.head()={df.head()}")

    # Our working date range
    # todo earliest and latest page access dates
    # print("df", df.head(), df.shape)
    data_start, data_end = df.columns[0], df.columns[-1]

    # We have to project some date-dependent features (day of week, etc) to the future dates for prediction
    features_end = data_end + pd.Timedelta(args.add_days, unit='D')
    # todo start: 2015-07-01, end:2017-09-11 00:00:00, features_end:2017-11-13 00:00:00
    # print(f"start: {data_start}, end:{data_end}, features_end:{features_end}")

    # Group unique pages by agents
    # print("df.index.is_monotonic_increasing",df.index.is_monotonic_increasing)
    assert df.index.is_monotonic_increasing
    page_map = uniq_page_map(df.index.values)
    # print(f"page_map={page_map}")

    # Yearly(annual) autocorrelation
    raw_year_autocorr = batch_autocorr(df.values, 365, starts, ends, 1.5,
                                       args.corr_backoffset)
    # todo fraction of autocorrelation values that are NaN
    year_unknown_pct = np.sum(np.isnan(raw_year_autocorr)) / len(
        raw_year_autocorr)  # type: float

    # Quarterly autocorrelation
    raw_quarter_autocorr = batch_autocorr(df.values, int(round(365.25 / 4)),
                                          starts, ends, 2,
                                          args.corr_backoffset)
    quarter_unknown_pct = np.sum(np.isnan(raw_quarter_autocorr)) / len(
        raw_quarter_autocorr)  # type: float

    print("Percent of undefined autocorr = yearly:%.3f, quarterly:%.3f" %
          (year_unknown_pct, quarter_unknown_pct))

    # Normalise all the things: np.nan_to_num replaces NaN with 0 and inf with large finite numbers
    # todo then the yearly and quarterly autocorrelations are standardized (mean/std)
    year_autocorr = normalize(np.nan_to_num(raw_year_autocorr))
    quarter_autocorr = normalize(np.nan_to_num(raw_quarter_autocorr))

    # Calculate and encode page features
    page_features = make_page_features(df.index.values)
    encoded_page_features = encode_page_features(page_features)
    # todo encoded_page_features one-hot encodes the page's agent and country
    # print("encoded_page_features",type(encoded_page_features))

    # Make time-dependent features
    features_days = pd.date_range(data_start, features_end)
    #dow = normalize(features_days.dayofweek.values)
    week_period = 7 / (2 * np.pi)
    dow_norm = features_days.dayofweek.values / week_period
    dow = np.stack([np.cos(dow_norm), np.sin(dow_norm)], axis=-1)

    # Assemble indices for quarterly lagged data
    # todo date indices lagged by 3, 6, 9 and 12 months respectively
    lagged_ix = np.stack(lag_indexes(data_start, features_end), axis=-1)
    # todo data_start=2015-07-01;
    #  lagged_ix=[[ -1  -1  -1  -1]
    #  [ -1  -1  -1  -1]
    #  ...
    #  [773 681 592 500]
    #  [774 682 593 501]]
    # print(f"data_start={data_start}; lagged_ix={lagged_ix}")

    page_popularity = df.median(axis=1)
    # todo page_popularity (145036,)
    # todo the median of each page's traffic is used as that page's popularity
    # print("page_popularity",page_popularity.shape,page_popularity)
    page_popularity = (page_popularity -
                       page_popularity.mean()) / page_popularity.std()

    # Put NaNs back
    df[nans] = np.NaN

    # Assemble final output
    # todo
    #  hits             : pages with mostly-zero traffic filtered out, sorted by page name, log-transformed
    #  lagged_ix        : lagged date indices; positions earlier than the first date are filled with -1
    #  page_map         : each page has at most 4 agents; each row is a page name, each column an agent
    #  page_ix          : full page names (page title, agent, country, etc.)
    #  pf_agent         : one-hot encoding of the page's agent
    #  pf_country       : one-hot encoding of the page's country
    #  pf_site          : encoding of the page's site
    #  page_popularity  : median page traffic used as a popularity measure
    #  year_autocorr    : autocorrelation of traffic at a one-year lag
    #  quarter_autocorr : autocorrelation of traffic at a one-quarter lag
    #  dow              : day of week encoded as sin/cos
    tensors = dict(
        hits=df,
        lagged_ix=lagged_ix,
        page_map=page_map,
        page_ix=df.index.values,
        pf_agent=encoded_page_features['agent'],
        pf_country=encoded_page_features['country'],
        pf_site=encoded_page_features['site'],
        page_popularity=page_popularity,
        year_autocorr=year_autocorr,
        quarter_autocorr=quarter_autocorr,
        dow=dow,
    )
    # todo
    #  data_start    : first date of the traffic data
    #  data_end      : last date of the traffic data
    #  features_end  : last date of the features, 63 days after data_end
    #  features_days : number of days the features span
    #  data_days     : number of days of data
    #  n_pages       : number of pages
    plain = dict(features_days=len(features_days),
                 data_days=len(df.columns),
                 n_pages=len(df),
                 data_start=data_start,
                 data_end=data_end,
                 features_end=features_end)
    # todo len(features_days)=867,len(df.columns)=805,len(df)=145036,data_start=2015-07-01,
    #  data_end=2017-09-11 00:00:00,features_end=2017-11-13 00:00:00
    # print(f"len(features_days)={len(features_days)},len(df.columns)={len(df.columns)},len(df)={len(df)},data_start={data_start},data_end={data_end},features_end={features_end}")
    # Store data to the disk
    VarFeeder(args.data_dir, tensors, plain)
Example #8
def run():
    parser = argparse.ArgumentParser(description='Prepare data')
    parser.add_argument('data_dir')
    parser.add_argument(
        '--valid_threshold',
        default=0.0,
        type=float,
        help="Series minimal length threshold (pct of data length)")
    parser.add_argument('--add_days',
                        default=64,
                        type=int,
                        help="Add N days in a future for prediction")
    parser.add_argument(
        '--start',
        help="Effective start date. Data before the start is dropped")
    parser.add_argument(
        '--end', help="Effective end date. Data past the end is dropped")
    parser.add_argument('--corr_backoffset',
                        default=0,
                        type=int,
                        help='Offset for correlation calculation')
    args = parser.parse_args()

    # Get the data
    df, nans, starts, ends = prepare_data(args.start, args.end,
                                          args.valid_threshold)

    # Our working date range
    data_start, data_end = df.columns[0], df.columns[-1]

    # We have to project some date-dependent features (day of week, etc) to the future dates for prediction
    features_end = data_end + pd.Timedelta(args.add_days, unit='D')
    print(f"start: {data_start}, end:{data_end}, features_end:{features_end}")

    # Group unique pages by agents
    assert df.index.is_monotonic_increasing
    page_map = uniq_page_map(df.index.values)

    # Yearly(annual) autocorrelation of each page. The return is a list of auto_correl of each page.
    raw_year_autocorr = batch_autocorr(df.values, 365, starts, ends, 1.5,
                                       args.corr_backoffset)

    year_unknown_pct = np.sum(np.isnan(raw_year_autocorr)) / len(
        raw_year_autocorr)  # type: float

    # Quarterly autocorrelation of each page. The return is a list of auto_correl of each page.
    raw_quarter_autocorr = batch_autocorr(df.values, int(round(365.25 / 4)),
                                          starts, ends, 2,
                                          args.corr_backoffset)
    quarter_unknown_pct = np.sum(np.isnan(raw_quarter_autocorr)) / len(
        raw_quarter_autocorr)  # type: float

    print("Percent of undefined autocorr = yearly:%.3f, quarterly:%.3f" %
          (year_unknown_pct, quarter_unknown_pct))

    # Normalise all the things
    year_autocorr = normalize(np.nan_to_num(raw_year_autocorr))
    quarter_autocorr = normalize(np.nan_to_num(raw_quarter_autocorr))

    # Calculate and encode page features. To create the following page features, only pages are passed to the function. Date columns and hits are not passed.
    # Features are extracted from the page urls.
    page_features = make_page_features(df.index.values)
    encoded_page_features = encode_page_features(page_features)

    # Make time-dependent features
    # The following gives an array of all the days in the given data range. Eg: ['2015-07-01', '2015-07-02', .... '2017-10-01']
    features_days = pd.date_range(data_start, features_end)
    #dow = normalize(features_days.dayofweek.values)
    week_period = 7 / (2 * np.pi)

    # features_days.dayofweek.values below gives which day of the week does the corresponding date belongs to.
    #  Eg: '2015-07-01' belongs to 2nd day of week i.e Tue. So value will be 2.
    dow_norm = features_days.dayofweek.values / week_period
    dow = np.stack([np.cos(dow_norm), np.sin(dow_norm)], axis=-1)

    # Assemble indices for quarterly lagged data
    lagged_ix = np.stack(lag_indexes(data_start, features_end), axis=-1)

    # This is median of each page hits. So the output is a Df with just one column that has median of all hits of page.
    page_popularity = df.median(axis=1)
    page_popularity = (page_popularity -
                       page_popularity.mean()) / page_popularity.std()
    median_7 = traffic_median(df, 7)
    median_30 = traffic_median(df, 30)
    median_90 = traffic_median(df, 90)
    median_180 = traffic_median(df, 180)

    # Put NaNs back
    df[nans] = np.NaN

    # Assemble final output
    tensors = dict(
        hits=df,
        lagged_ix=lagged_ix,
        page_map=page_map,
        page_ix=df.index.values,
        pf_agent=encoded_page_features['agent'],
        pf_country=encoded_page_features['country'],
        pf_site=encoded_page_features['site'],
        page_popularity=page_popularity,
        median_7=median_7,
        median_30=median_30,
        median_90=median_90,
        median_180=median_180,
        year_autocorr=year_autocorr,
        quarter_autocorr=quarter_autocorr,
        dow=dow,
    )
    plain = dict(features_days=len(features_days),
                 data_days=len(df.columns),
                 n_pages=len(df),
                 data_start=data_start,
                 data_end=data_end,
                 features_end=features_end)

    # Store data to the disk
    VarFeeder(args.data_dir, tensors, plain)
    eval_pct = 0.1
    batch_size = batch_size
    train_window = train_window
    train_completeness_threshold = 1.0
    predict_window = 63
    verbose = False
    train_skip_first = 0
    tf.reset_default_graph()
    forward_split = False
    train_sampling = 1.0
    if seed:
        tf.set_random_seed(seed)
    sess = tf.Session()
    with tf.device("/cpu:0"):
        inp = VarFeeder.read_vars("data/vars")
        # restore the variables
        inp.restore(sess)
        # splitter = FakeSplitter(page_features(inp), 3, seed=seed, test_sampling=eval_sampling)
        splitter = Splitter(page_features(inp),
                            inp.page_map,
                            3,
                            train_sampling=train_sampling,
                            test_sampling=eval_sampling,
                            seed=seed)

    real_train_pages = splitter.splits[0].train_size
    real_eval_pages = splitter.splits[0].test_size
    # 14503.6
    items_per_eval = real_eval_pages * eval_pct
    # 30
Example #10
def run(city_path='/nfs/isolation_project/intern/project/lihaocheng/city_forcast/city_day_features_to_yesterday.gbk.csv',
        weafor_path='/nfs/isolation_project/intern/project/lihaocheng/city_forcast/weather_forecast.csv',
        datadir='data',
        city_list=None,
        **args):

    start_time = time.time()
    # Get the data
    [train_x, train_embed_weekday, train_embed_month,
     train_embed_city, train_real_city, train_y_origin],\
    [val_x, val_embed_weekday, val_embed_month,
     val_embed_city, val_real_city, val_y_origin],\
    [infer_x, infer_embed_weekday, infer_embed_month,
     infer_embed_city, infer_city_map, infer_y_origin],\
    city_map, city_max, city_min, train_mean, train_std = read_all(city_list, city_path, weafor_path)

    log.debug(
        "completed generating df_cpu_max and df_cpu_num, time elapsed = %s",
        time.time() - start_time)

    train_total = pd.concat([train_x, train_y_origin], axis=1)
    train_features = [pd.DataFrame() for city in city_list]
    train_y = [pd.DataFrame() for city in city_list]
    y_mean = list()
    y_std = list()
    month_autocorr = dict()
    week_autocorr = dict()

    # Make train features
    attrs = [
        'online_time', 'total_finish_order_cnt', 'total_gmv',
        'strive_order_cnt', 'total_no_call_order_cnt'
    ]
    dfs = [pd.DataFrame() for i in range(5)]
    for i, city in enumerate(city_list):
        for idx, attr in enumerate(attrs):
            per_city = train_total[train_total['city_id'] == city]
            train_features[i] = per_city[train_x.columns].values
            train_y[i] = per_city[train_y_origin.columns].apply(
                lambda x: np.log(x + 1))
            series = train_y[i].loc[:, [attr]]
            series.columns = [city]
            series = series.reset_index(drop=True)
            dfs[idx] = pd.concat([dfs[idx], series], axis=1)
    for idx, attr in enumerate(attrs):
        df = dfs[idx].T
        # monthly autocorrelation
        month = batch_autocorr(df.values, 30)

        # weekly autocorrelation
        week = batch_autocorr(df.values, 7)

        # Normalise all the things
        month_autocorr[attr] = normalize(np.nan_to_num(month))
        week_autocorr[attr] = normalize(np.nan_to_num(week))

    # Find train_y mean & std
    for i, per_city in enumerate(train_y):
        y_mean.append(per_city.mean())
        y_std.append(per_city.std())

    # Make val features
    val_total = pd.concat([val_x, val_y_origin], axis=1)
    val_features = list()
    val_y = list()
    for city in city_list:
        per_city = val_total[val_total['city_id'] == city]
        val_features.append(per_city[val_x.columns].values)
        val_y.append(
            per_city[val_y_origin.columns].apply(lambda x: np.log(x + 1)))

    # Make infer features
    infer_x = infer_x.drop(['city_id'], axis=1)
    infer_total = pd.concat([infer_x, infer_y_origin], axis=1)
    infer_features = list()
    infer_y = list()
    for city in city_list:
        per_city = infer_total[infer_total['city_id'] == city]
        infer_features.append(per_city[infer_x.columns].values)
        infer_y.append(
            per_city[infer_y_origin.columns].apply(lambda x: np.log(x + 1)))

    # Make time-dependent features
    time_period = 7 / (2 * np.pi)
    train_dow_norm = train_embed_weekday / time_period
    val_dow_norm = val_embed_weekday / time_period
    infer_dow_norm = infer_embed_weekday / time_period
    time_period = 12 / (2 * np.pi)
    train_dom_norm = train_embed_month / time_period
    val_dom_norm = val_embed_month / time_period
    infer_dom_norm = infer_embed_month / time_period
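    # Both weekday (period 7) and month (period 12) are projected onto the unit
    # circle below; assuming months are encoded 1..12, December and January end up
    # adjacent, just like Sunday and Monday for the weekday feature.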

    train_dow = np.stack([
        np.cos(train_dow_norm),
        np.sin(train_dow_norm),
        np.cos(train_dom_norm),
        np.sin(train_dom_norm)
    ],
                         axis=-1)
    val_dow = np.stack([
        np.cos(val_dow_norm),
        np.sin(val_dow_norm),
        np.cos(val_dom_norm),
        np.sin(val_dom_norm)
    ],
                       axis=-1)
    infer_dow = np.stack([
        np.cos(infer_dow_norm),
        np.sin(infer_dow_norm),
        np.cos(infer_dom_norm),
        np.sin(infer_dom_norm)
    ],
                         axis=-1)

    # Assemble final output
    tensors = dict(train_x=train_features,
                   val_x=val_features,
                   infer_x=infer_features,
                   train_dow=train_dow,
                   val_dow=val_dow,
                   infer_dow=infer_dow,
                   train_y=[df['total_no_call_order_cnt'] for df in train_y],
                   val_y=[df['total_no_call_order_cnt'] for df in val_y],
                   infer_y=[df['total_no_call_order_cnt'] for df in infer_y],
                   train_time=train_features[0].shape[0],
                   val_time=val_features[0].shape[0],
                   infer_time=infer_features[0].shape[0],
                   month_autocorr=month_autocorr['total_no_call_order_cnt'],
                   week_autocorr=week_autocorr['total_no_call_order_cnt'],
                   cities=np.array([city_map[city] for city in city_list]),
                   mean=[per['total_no_call_order_cnt'] for per in y_mean],
                   std=[per['total_no_call_order_cnt'] for per in y_std])
    plain = dict()

    # Store data to the disk
    VarFeeder(os.path.join(datadir, 'vars'), tensors, plain)
    with open(os.path.join(datadir, 'city_map.pickle'), 'wb') as handle:
        pkl.dump(city_map, handle, protocol=pkl.HIGHEST_PROTOCOL)

    infer_y_origin.to_pickle(os.path.join(datadir, 'infer_y.pickle'))
Example #11
def main(_):
    if not FLAGS.server:
        print('please specify server host:port')
        return

    channel = grpc.insecure_channel(FLAGS.server)
    stub = prediction_service_pb2_grpc.PredictionServiceStub(channel)
    request = predict_pb2.PredictRequest()

    request.model_spec.name = "ucdoc"
    request.model_spec.signature_name = "serving_default"

    with tf.variable_scope('input') as inp_scope:
        with tf.device("/cpu:0"):
            inp = VarFeeder.read_vars("data/vars")
            pipe = InputPipe(inp,
                             ucdoc_features(inp),
                             inp.n_pages,
                             mode=ModelMode.PREDICT,
                             batch_size=FLAGS.batch_size,
                             n_epoch=1,
                             verbose=FLAGS.verbose,
                             train_completeness_threshold=0.01,
                             predict_window=FLAGS.predict_window,
                             predict_completeness_threshold=0.0,
                             train_window=FLAGS.train_window,
                             back_offset=FLAGS.predict_window + 1)
    with tf.Session(config=tf.ConfigProto(gpu_options=tf.GPUOptions(
            allow_growth=True))) as sess:
        pipe.load_vars(sess)
        pipe.init_iterator(sess)

        while True:
            try:
                truex, timex, normx, laggedx, truey, timey, normy, normmean, normstd, pgfeatures, pageix = \
                    sess.run([pipe.true_x, pipe.time_x, pipe.norm_x, pipe.lagged_x, pipe.true_y, pipe.time_y,
                              pipe.norm_y, pipe.norm_mean, pipe.norm_std, pipe.ucdoc_features, pipe.page_ix])

                request.inputs["truex"].CopyFrom(tf.make_tensor_proto(truex))
                request.inputs["timex"].CopyFrom(tf.make_tensor_proto(timex))
                request.inputs["normx"].CopyFrom(tf.make_tensor_proto(normx))
                request.inputs["laggedx"].CopyFrom(
                    tf.make_tensor_proto(laggedx))
                request.inputs["truey"].CopyFrom(tf.make_tensor_proto(truey))
                request.inputs["timey"].CopyFrom(tf.make_tensor_proto(timey))
                request.inputs["normy"].CopyFrom(tf.make_tensor_proto(normy))
                request.inputs["normmean"].CopyFrom(
                    tf.make_tensor_proto(normmean))
                request.inputs["normstd"].CopyFrom(
                    tf.make_tensor_proto(normstd))
                request.inputs["page_features"].CopyFrom(
                    tf.make_tensor_proto(pgfeatures))
                request.inputs["pageix"].CopyFrom(tf.make_tensor_proto(pageix))

                response = stub.Predict(request, 10)
                tensor_proto = response.outputs['pred']
                if 'pred_result' not in locals():
                    pred_result = tf.contrib.util.make_ndarray(tensor_proto)
                else:
                    pred_result = np.concatenate([
                        pred_result,
                        tf.contrib.util.make_ndarray(tensor_proto)
                    ])
            except tf.errors.OutOfRangeError:
                print('done with prediction')
                break
        pred_result = np.expm1(pred_result) + 0.5
        pred_result = pred_result.astype(int)
        if not os.path.exists(FLAGS.result_dir):
            os.mkdir(FLAGS.result_dir)
        result_file = os.path.join(FLAGS.result_dir, "predict.pkl")
        pickle.dump(pred_result, open(result_file, "wb"))
        print('finished prediction')
Example #12
def run_local():
    data_dir = 'data/vars'
    add_days = 63
    valid_threshold = 0.0
    corr_backoffset = 0
    start = None
    end = None

    # Get the data. df holds log1p-transformed floats: 145036 series remain after filtering out 27 that were too short
    df, nans, starts, ends = prepare_data(start, end, valid_threshold)

    # Our working date range
    data_start, data_end = df.columns[0], df.columns[-1]

    # We have to project some date-dependent features (day of week, etc) to the future dates for prediction
    features_end = data_end + pd.Timedelta(add_days, unit='D')
    print(f"start: {data_start}, end:{data_end}, features_end:{features_end}")

    # Group unique pages by agents: each unique page maps to at most four agents (0-3); the matching slots hold the page's original row number
    assert df.index.is_monotonic_increasing
    page_map = uniq_page_map(df.index.values)

    # Yearly (annual) autocorrelation at a 365-day lag; series that do not qualify are set to np.nan
    raw_year_autocorr = batch_autocorr(df.values, 365, starts, ends, 1.5,
                                       corr_backoffset)
    year_unknown_pct = np.sum(np.isnan(raw_year_autocorr)) / len(
        raw_year_autocorr)  # type: float

    # Quarterly autocorrelation
    raw_quarter_autocorr = batch_autocorr(df.values, int(round(365.25 / 4)),
                                          starts, ends, 2, corr_backoffset)
    quarter_unknown_pct = np.sum(np.isnan(raw_quarter_autocorr)) / len(
        raw_quarter_autocorr)  # type: float

    print("Percent of undefined autocorr = yearly:%.3f, quarterly:%.3f" %
          (year_unknown_pct, quarter_unknown_pct))

    # Normalise all the things
    year_autocorr = normalize(np.nan_to_num(raw_year_autocorr))  # -3,+3
    quarter_autocorr = normalize(
        np.nan_to_num(raw_quarter_autocorr))  # -4.5 ,+4.5

    # Calculate and encode page features: the page URLs themselves are unchanged; three features are extracted: agent, country, site
    page_features = make_page_features(df.index.values)
    encoded_page_features = encode_page_features(
        page_features)  # 10555 + 127181 + 7300 =145036

    # Make time-dependent features (867 days)
    features_days = pd.date_range(data_start, features_end)
    # dow = normalize(features_days.dayofweek.values)
    week_period = 7 / (2 * np.pi)
    dow_norm = features_days.dayofweek.values / week_period
    dow = np.stack([np.cos(dow_norm), np.sin(dow_norm)], axis=-1)

    # Assemble indices for quarterly lagged data
    lagged_ix = np.stack(lag_indexes(data_start, features_end), axis=-1)
    # a single column
    page_popularity = df.median(axis=1)
    page_popularity = (page_popularity -
                       page_popularity.mean()) / page_popularity.std()

    # Put NaNs back: missing values are reset to np.NaN
    df[nans] = np.NaN

    # Assemble final output
    tensors = dict(
        hits=df,
        lagged_ix=lagged_ix,
        page_map=page_map,
        page_ix=df.index.values,
        pf_agent=encoded_page_features['agent'],
        pf_country=encoded_page_features['country'],
        pf_site=encoded_page_features['site'],
        page_popularity=page_popularity,
        year_autocorr=year_autocorr,
        quarter_autocorr=quarter_autocorr,
        dow=dow,
    )
    plain = dict(features_days=len(features_days),
                 data_days=len(df.columns),
                 n_pages=len(df),
                 data_start=data_start,
                 data_end=data_end,
                 features_end=features_end)

    # Store data to the disk
    VarFeeder(data_dir, tensors, plain)
Example #13
def extend_inp(data_path, predict_window, holiday_list):

    with tf.variable_scope('input', reuse=tf.AUTO_REUSE) as inp_scope:
        with tf.device("/cpu:0"):
            # inp = VarFeeder.read_vars("data/vars")
            inp = VarFeeder.read_vars(data_path)

            yesterday = pd.to_datetime(inp.data_end) + pd.Timedelta(
                predict_window, 'D')
            yesterday = yesterday.date().strftime('%Y-%m-%d')
            day = datetime.strptime(yesterday, '%Y-%m-%d')
            day_list = []
            for _ in range(0, inp.features_days + predict_window):
                day_list.append(datetime.strftime(day, '%Y-%m-%d'))
                day = day + timedelta(days=-1)
            day_list.sort()

            # computing lagged_ix
            date_range = pd.date_range(day_list[0], day_list[-1])
            base_index = pd.Series(np.arange(0, len(date_range)),
                                   index=date_range)

            def lag(offset):
                dates = date_range - offset
                return pd.Series(data=base_index[dates].fillna(-1).astype(
                    np.int16).values,
                                 index=date_range)

            lagged_ix = np.stack(
                [lag(pd.DateOffset(months=m)) for m in (1, 2)], axis=-1)
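            # base_index[dates] looks up the positional index of each shifted date;
            # dates that fall before the start of date_range are expected to come
            # back as NaN (in the pandas version this code targets) and are filled
            # with -1, the same "missing lag" convention used elsewhere.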

            with tf.Session() as sess:
                inp.restore(sess)
                # ts_log, page_ix_, pf_age_, pf_si_, pf_network_, pf_price_model_, \
                ts_log, page_ix_, pf_age_, pf_si_, pf_network_, pf_price_cat_, \
                pf_gender_, page_popularity_, quarter_autocorr_ = \
                    sess.run([inp.hits, inp.page_ix, inp.pf_age, inp.pf_si,
                              # inp.pf_network, inp.pf_price_model, inp.pf_gender,
                              inp.pf_network, inp.pf_price_cat, inp.pf_gender,
                              inp.page_popularity, inp.quarter_autocorr])
                print(
                    f'start: {inp.data_start}\tend: {inp.data_end}\tlength: {inp.features_days}'
                )

            df_ts = pd.DataFrame(np.append(ts_log,
                                           np.zeros((len(page_ix_),
                                                     predict_window)),
                                           axis=1),
                                 index=list(page_ix_),
                                 columns=day_list)
            df_age = pd.DataFrame(pf_age_, index=list(page_ix_))
            df_si = pd.DataFrame(pf_si_, index=list(page_ix_))
            df_network = pd.DataFrame(pf_network_, index=list(page_ix_))
            # df_price_model = pd.DataFrame(pf_price_model_, index=list(page_ix_))
            df_price_cat = pd.DataFrame(pf_price_cat_, index=list(page_ix_))
            df_gender = pd.DataFrame(pf_gender_, index=list(page_ix_))

            def get_dow(day_list):
                dow_list = []
                for day in day_list:
                    dow = datetime.strptime(day, '%Y-%m-%d').weekday()
                    dow_list.append(dow)

                week_period = 7.0 / (2 * math.pi)
                sin_list = [math.sin(x / week_period) for x in dow_list]
                cos_list = [math.cos(x / week_period) for x in dow_list]
                return (sin_list, cos_list)

            dow_ = get_dow(day_list)

            # holiday_list = cfg['pipeline']['normalization']['holidays']

            holidays = [1 if _ in holiday_list else 0 for _ in day_list]
            a_list = []
            b_list = []
            for _ in holidays:
                a, b = math.sin(_), math.cos(_)
                a_list.append(a)
                b_list.append(b)
            holiday = (a_list, b_list)
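            # The binary holiday flag is also pushed through sin/cos:
            # 0 -> (sin, cos) = (0.0, 1.0) and 1 -> (~0.841, ~0.540), giving two
            # distinct points instead of a plain 0/1 column.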

            tensors = dict(
                hits=df_ts,
                lagged_ix=lagged_ix,
                page_ix=list(page_ix_),
                pf_age=df_age,
                pf_si=df_si,
                pf_network=df_network,
                # pf_price_model=df_price_model,
                pf_price_cat=df_price_cat,
                pf_gender=df_gender,
                page_popularity=page_popularity_,
                quarter_autocorr=quarter_autocorr_,
                dow=pd.DataFrame(dow_).T,
                holiday=pd.DataFrame(holiday).T)

            batch_size = len(page_ix_)
            data_len = tensors['hits'].shape[1]
            plain = dict(data_days=data_len - 0,
                         features_days=data_len,
                         data_start=day_list[0],
                         data_end=day_list[-1],
                         n_pages=batch_size)
            # dump_path = 'data/vars/predict_future'
            dump_path = os.path.join(data_path, 'predict_future')
            if not os.path.exists(dump_path):
                os.mkdir(dump_path)
            VarFeeder(dump_path, tensors, plain)
    tf.reset_default_graph()
Example #14
def main(_):
    if len(sys.argv) < 3:
        print(
            'Usage: saved_model.py [--model_version=y] --data_dir=xxx --ckpt_dir=xxx --saved_dir=xxx'
        )
        sys.exit(-1)
    if FLAGS.training_iteration <= 0:
        print('Please specify a positive value for training iteration.')
        sys.exit(-1)
    if FLAGS.model_version <= 0:
        print('Please specify a positive value for version number.')
        sys.exit(-1)

    with open(FLAGS.config_file, 'r') as ymlfile:
        cfg = yaml.load(ymlfile)

    holiday_list = cfg['pipeline']['normalization']['holidays']
    if FLAGS.back_offset < FLAGS.predict_window:
        extend_inp(FLAGS.data_dir, FLAGS.predict_window, holiday_list)

    # create deploy model first
    back_offset_ = FLAGS.back_offset
    with tf.variable_scope('input') as inp_scope:
        with tf.device("/cpu:0"):
            if FLAGS.back_offset < FLAGS.predict_window:
                inp = VarFeeder.read_vars(
                    os.path.join(FLAGS.data_dir, 'predict_future'))
                back_offset_ += FLAGS.predict_window
            else:
                inp = VarFeeder.read_vars(FLAGS.data_dir)
            pipe = InputPipe(inp,
                             ucdoc_features(inp),
                             inp.hits.shape[0],
                             mode=ModelMode.PREDICT,
                             batch_size=FLAGS.batch_size,
                             n_epoch=1,
                             verbose=False,
                             train_completeness_threshold=0.01,
                             predict_window=FLAGS.predict_window,
                             predict_completeness_threshold=0.0,
                             train_window=FLAGS.train_window,
                             back_offset=back_offset_)

    asgd_decay = 0.99 if FLAGS.asgd else None

    if FLAGS.n_models == 1:
        model = Model(pipe,
                      build_from_set(FLAGS.hparam_set),
                      is_train=False,
                      seed=1,
                      asgd_decay=asgd_decay)
    else:
        models = []
        for i in range(FLAGS.n_models):
            prefix = f"m_{i}"
            with tf.variable_scope(prefix) as scope:
                models.append(
                    Model(pipe,
                          build_from_set(FLAGS.hparam_set),
                          is_train=False,
                          seed=1,
                          asgd_decay=asgd_decay,
                          graph_prefix=prefix))
        model = models[FLAGS.target_model]

    if FLAGS.asgd:
        var_list = model.ema.variables_to_restore()
        if FLAGS.n_models > 1:
            prefix = f"m_{FLAGS.target_model}"
            for var in list(var_list.keys()):
                if var.endswith('ExponentialMovingAverage'
                                ) and not var.startswith(prefix):
                    del var_list[var]
    else:
        var_list = None

    # load checkpoint model from training
    #ckpt_path = FLAGS.ckpt_dir
    print('loading checkpoint model...')
    ckpt_file = tf.train.latest_checkpoint(FLAGS.ckpt_dir)
    #graph = tf.Graph()
    graph = model.predictions.graph

    init = tf.global_variables_initializer()

    saver = tf.train.Saver(name='deploy_saver', var_list=var_list)
    with tf.Session(config=tf.ConfigProto(gpu_options=tf.GPUOptions(
            allow_growth=True))) as sess:
        sess.run(init)
        pipe.load_vars(sess)
        pipe.init_iterator(sess)
        saver.restore(sess, ckpt_file)
        print('Done loading checkpoint model')
        export_path_base = FLAGS.saved_dir
        export_path = os.path.join(
            tf.compat.as_bytes(export_path_base),
            tf.compat.as_bytes(str(FLAGS.model_version)))
        print('Exporting trained model to', export_path)
        if os.path.isdir(export_path):
            shutil.rmtree(export_path)
        builder = tf.saved_model.builder.SavedModelBuilder(export_path)

        true_x = tf.saved_model.utils.build_tensor_info(
            model.inp.true_x)  # pipe.true_x
        time_x = tf.saved_model.utils.build_tensor_info(
            model.inp.time_x)  # pipe.time_x
        norm_x = tf.saved_model.utils.build_tensor_info(
            model.inp.norm_x)  # pipe.norm_x
        lagged_x = tf.saved_model.utils.build_tensor_info(
            model.inp.lagged_x)  # pipe.lagged_x
        true_y = tf.saved_model.utils.build_tensor_info(
            model.inp.true_y)  # pipe.true_y
        time_y = tf.saved_model.utils.build_tensor_info(
            model.inp.time_y)  # pipe.time_y
        norm_y = tf.saved_model.utils.build_tensor_info(
            model.inp.norm_y)  # pipe.norm_y
        norm_mean = tf.saved_model.utils.build_tensor_info(
            model.inp.norm_mean)  # pipe.norm_mean
        norm_std = tf.saved_model.utils.build_tensor_info(
            model.inp.norm_std)  # pipe.norm_std
        pg_features = tf.saved_model.utils.build_tensor_info(
            model.inp.ucdoc_features)  # pipe.ucdoc_features
        page_ix = tf.saved_model.utils.build_tensor_info(
            model.inp.page_ix)  # pipe.page_ix

        #pred = tf.saved_model.utils.build_tensor_info(graph.get_operation_by_name('m_0/add').outputs[0])
        pred = tf.saved_model.utils.build_tensor_info(model.predictions)

        labeling_signature = (
            tf.saved_model.signature_def_utils.build_signature_def(
                inputs={
                    "truex": true_x,
                    "timex": time_x,
                    "normx": norm_x,
                    "laggedx": lagged_x,
                    "truey": true_y,
                    "timey": time_y,
                    "normy": norm_y,
                    "normmean": norm_mean,
                    "normstd": norm_std,
                    "page_features": pg_features,
                    "pageix": page_ix,
                },
                outputs={"predictions": pred},
                method_name="tensorflow/serving/predict"))

        legacy_init_op = tf.group(tf.tables_initializer(),
                                  name='legacy_init_op')

        builder.add_meta_graph_and_variables(
            sess, [tf.saved_model.tag_constants.SERVING],
            signature_def_map={
                tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
                labeling_signature
            },
            main_op=tf.tables_initializer(),
            strip_default_attrs=True)

        builder.save()
        print("Build Done")
def run():
    parser = argparse.ArgumentParser(description='Prepare data')
    parser.add_argument('data_dir')
    parser.add_argument(
        '--valid_threshold',
        default=0.0,
        type=float,
        help="Series minimal length threshold (pct of data length)")
    parser.add_argument('--add_days',
                        default=64,
                        type=int,
                        help="Add N days in a future for prediction")
    parser.add_argument(
        '--start',
        help="Effective start date. Data before the start is dropped")
    parser.add_argument(
        '--end', help="Effective end date. Data past the end is dropped")
    parser.add_argument('--corr_backoffset',
                        default=0,
                        type=int,
                        help='Offset for correlation calculation')
    args = parser.parse_args()

    # Get the data
    df, nans, starts, ends = prepare_data(args.start, args.end,
                                          args.valid_threshold)

    # Our working date range
    data_start, data_end = df.columns[0], df.columns[-1]

    # We have to project some date-dependent features (day of week, etc) to the future dates for prediction
    features_end = data_end + pd.Timedelta(args.add_days, unit='D')
    print(f"start: {data_start}, end:{data_end}, features_end:{features_end}")

    # Group unique pages by agents
    assert df.index.is_monotonic_increasing
    page_map = uniq_page_map(df.index.values)

    # Yearly(annual) autocorrelation
    raw_year_autocorr = batch_autocorr(df.values, 365, starts, ends, 1.5,
                                       args.corr_backoffset)
    year_unknown_pct = np.sum(np.isnan(raw_year_autocorr)) / len(
        raw_year_autocorr)  # type: float

    # Quarterly autocorrelation
    raw_quarter_autocorr = batch_autocorr(df.values, int(round(365.25 / 4)),
                                          starts, ends, 2,
                                          args.corr_backoffset)
    quarter_unknown_pct = np.sum(np.isnan(raw_quarter_autocorr)) / len(
        raw_quarter_autocorr)  # type: float

    print("Percent of undefined autocorr = yearly:%.3f, quarterly:%.3f" %
          (year_unknown_pct, quarter_unknown_pct))

    # Normalise all the things
    year_autocorr = normalize(np.nan_to_num(raw_year_autocorr))
    quarter_autocorr = normalize(np.nan_to_num(raw_quarter_autocorr))

    # Calculate and encode page features
    page_features = make_page_features(df.index.values)
    encoded_page_features = encode_page_features(page_features)

    # Make time-dependent features
    features_days = pd.date_range(data_start, features_end)
    # dow = normalize(features_days.dayofweek.values)
    week_period = 7 / (2 * np.pi)
    dow_norm = features_days.dayofweek.values / week_period
    dow = np.stack([np.cos(dow_norm), np.sin(dow_norm)], axis=-1)

    # Assemble indices for quarterly lagged data
    lagged_ix = np.stack(lag_indexes(data_start, features_end), axis=-1)

    page_popularity = df.median(axis=1)
    page_popularity = (page_popularity -
                       page_popularity.mean()) / page_popularity.std()

    # Put NaNs back
    df[nans] = np.NaN

    # Assemble final output
    tensors = dict(
        lagged_ix=lagged_ix,
        page_map=page_map,
        dow=dow,
        hits=df,
        pf_agent=encoded_page_features['agent'],
        pf_country=encoded_page_features['country'],
        pf_site=encoded_page_features['site'],
        page_ix=df.index.values,
        page_popularity=page_popularity,
        year_autocorr=year_autocorr,
        quarter_autocorr=quarter_autocorr,
    )
    plain = dict(features_days=len(features_days),
                 data_days=len(df.columns),
                 n_pages=len(df),
                 data_start=data_start,
                 data_end=data_end,
                 features_end=features_end)

    # Store data to the disk
    VarFeeder(args.data_dir, tensors, plain)
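
# A hedged usage sketch (not in the original file): the tensors and plain values written
# by VarFeeder above are read back elsewhere in this listing with VarFeeder.read_vars()
# and restored into a TF session. The helper name, the printed attributes, and the
# assumption that `tensorflow` is imported as `tf` in this module are all mine;
# data_dir should be the same directory passed to VarFeeder above.
def peek_stored_vars(data_dir):
    with tf.device("/cpu:0"):
        inp = VarFeeder.read_vars(data_dir)
    with tf.Session() as sess:
        inp.restore(sess)  # push the stored tensors into the session (see train() below)
        # plain (non-tensor) values are exposed as ordinary attributes
        print(inp.data_start, inp.data_end, inp.features_end)
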
Example #16
def predict(checkpoints,
            hparams,
            datadir="data",
            verbose=False,
            n_models=1,
            target_model=0,
            asgd=False,
            seed=1,
            batch_size=50):
    with tf.variable_scope('input') as inp_scope:
        with tf.device("/cpu:0"):
            inp = VarFeeder.read_vars(os.path.join(datadir, "vars"))
            pipe = InputPipe(datadir,
                             inp,
                             infer_features(inp),
                             mode=ModelMode.PREDICT,
                             batch_size=batch_size,
                             n_epoch=1,
                             verbose=verbose,
                             train_completeness_threshold=0.01,
                             train_window=hparams.train_window)
    asgd_decay = 0.99 if asgd else None
    if n_models == 1:
        model = Model(pipe,
                      hparams,
                      is_train=False,
                      seed=seed,
                      asgd_decay=asgd_decay)
    else:
        models = []
        for i in range(n_models):
            prefix = f"m_{i}"
            with tf.variable_scope(prefix) as scope:
                models.append(
                    Model(pipe,
                          hparams,
                          is_train=False,
                          seed=seed,
                          asgd_decay=asgd_decay,
                          graph_prefix=prefix))
        model = models[target_model]

    if asgd:
        var_list = model.ema.variables_to_restore()
        prefix = f"m_{target_model}"
        for var in list(var_list.keys()):
            if var.endswith(
                    'ExponentialMovingAverage') and not var.startswith(prefix):
                del var_list[var]
    else:
        var_list = None
    saver = tf.train.Saver(name='eval_saver', var_list=var_list)
    x_buffer = []
    predictions = None
    with tf.Session(config=tf.ConfigProto(gpu_options=tf.GPUOptions(
            allow_growth=True))) as sess:
        pipe.load_vars(sess)
        for checkpoint in checkpoints:
            pred_buffer = []
            pipe.init_iterator(sess)
            saver.restore(sess, checkpoint)
            cnt = 0
            while True:
                try:
                    pred, pname = sess.run([model.prediction, model.inp.vm_ix])
                    # utf_names = [str(name, 'utf-8') for name in pname]
                    utf_names = pname
                    pred_df = pd.DataFrame(index=utf_names,
                                           data=np.expm1(pred) - 1)
                    pred_buffer.append(pred_df)
                    newline = cnt % 80 == 0
                    if cnt > 0:
                        print('.', end='\n' if newline else '', flush=True)
                    if newline:
                        print(cnt, end='')
                    cnt += 1
                except tf.errors.OutOfRangeError:
                    print('Done!')
                    break
            cp_predictions = pd.concat(pred_buffer)
            if predictions is None:
                predictions = cp_predictions
            else:
                predictions += cp_predictions
    predictions /= len(checkpoints)
    return predictions.iloc[:, -1]
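
# A hedged usage sketch (not in the original file) of calling predict() above: average
# the forecasts of several checkpoints saved by train() for one ensemble member. The
# checkpoint location ('data/cpt/s32'), the hparam-set name 's32', build_from_set() and
# the output path are assumptions borrowed from other snippets in this listing.
def run_prediction_example():
    import glob
    # train() saves checkpoints as data/cpt/<name>/cpt-<step>.*; strip the .index suffix
    paths = sorted(glob.glob('data/cpt/s32/cpt-*.index'))
    checkpoints = [p[:-len('.index')] for p in paths]
    hparams = build_from_set('s32')
    preds = predict(checkpoints, hparams, datadir='data',
                    n_models=3, target_model=0, asgd=True)
    preds.to_csv('preds.csv')
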
Example #17
def train(name, hparams, multi_gpu=False, n_models=1, train_completeness_threshold=0.01,
          seed=None, logdir='data/logs', max_epoch=100, patience=2, train_sampling=1.0,
          eval_sampling=1.0, eval_memsize=5, gpu=0, gpu_allow_growth=False, save_best_model=False,
          forward_split=False, write_summaries=False, verbose=False, asgd_decay=None, tqdm=True,
          side_split=True, max_steps=None, save_from_step=None, do_eval=True, predict_window=63):

    eval_k = int(round(26214 * eval_memsize / n_models))
    eval_batch_size = int(
        eval_k / (hparams.rnn_depth * hparams.encoder_rnn_layers))  # 128 -> 1024, 256->512, 512->256
    eval_pct = 0.1
    batch_size = hparams.batch_size
    train_window = hparams.train_window
    tf.reset_default_graph()
    if seed:
        tf.set_random_seed(seed)

    with tf.device("/cpu:0"):
        inp = VarFeeder.read_vars("data/vars")
        if side_split:
            splitter = Splitter(page_features(inp), inp.page_map, 3, train_sampling=train_sampling,
                                test_sampling=eval_sampling, seed=seed)
        else:
            splitter = FakeSplitter(page_features(inp), 3, seed=seed, test_sampling=eval_sampling)

    real_train_pages = splitter.splits[0].train_size
    real_eval_pages = splitter.splits[0].test_size

    items_per_eval = real_eval_pages * eval_pct
    eval_batches = int(np.ceil(items_per_eval / eval_batch_size))
    steps_per_epoch = real_train_pages // batch_size
    eval_every_step = int(round(steps_per_epoch * eval_pct))
    # eval_every_step = int(round(items_per_eval * train_sampling / batch_size))

    global_step = tf.train.get_or_create_global_step()
    inc_step = tf.assign_add(global_step, 1)


    all_models: List[ModelTrainerV2] = []

    def create_model(scope, index, prefix, seed):

        with tf.variable_scope('input') as inp_scope:
            with tf.device("/cpu:0"):
                split = splitter.splits[index]
                pipe = InputPipe(inp, features=split.train_set, n_pages=split.train_size,
                                 mode=ModelMode.TRAIN, batch_size=batch_size, n_epoch=None, verbose=verbose,
                                 train_completeness_threshold=train_completeness_threshold,
                                 predict_completeness_threshold=train_completeness_threshold, train_window=train_window,
                                 predict_window=predict_window,
                                 rand_seed=seed, train_skip_first=hparams.train_skip_first,
                                 back_offset=predict_window if forward_split else 0)
                inp_scope.reuse_variables()
                if side_split:
                    side_eval_pipe = InputPipe(inp, features=split.test_set, n_pages=split.test_size,
                                               mode=ModelMode.EVAL, batch_size=eval_batch_size, n_epoch=None,
                                               verbose=verbose, predict_window=predict_window,
                                               train_completeness_threshold=0.01, predict_completeness_threshold=0,
                                               train_window=train_window, rand_seed=seed, runs_in_burst=eval_batches,
                                               back_offset=predict_window * (2 if forward_split else 1))
                else:
                    side_eval_pipe = None
                if forward_split:
                    forward_eval_pipe = InputPipe(inp, features=split.test_set, n_pages=split.test_size,
                                                  mode=ModelMode.EVAL, batch_size=eval_batch_size, n_epoch=None,
                                                  verbose=verbose, predict_window=predict_window,
                                                  train_completeness_threshold=0.01, predict_completeness_threshold=0,
                                                  train_window=train_window, rand_seed=seed, runs_in_burst=eval_batches,
                                                  back_offset=predict_window)
                else:
                    forward_eval_pipe = None
        avg_sgd = asgd_decay is not None
        #asgd_decay = 0.99 if avg_sgd else None
        train_model = Model(pipe, hparams, is_train=True, graph_prefix=prefix, asgd_decay=asgd_decay, seed=seed)
        scope.reuse_variables()

        eval_stages = []
        if side_split:
            side_eval_model = Model(side_eval_pipe, hparams, is_train=False,
                                    #loss_mask=np.concatenate([np.zeros(50, dtype=np.float32), np.ones(10, dtype=np.float32)]),
                                    seed=seed)
            eval_stages.append((Stage.EVAL_SIDE, side_eval_model))
            if avg_sgd:
                eval_stages.append((Stage.EVAL_SIDE_EMA, side_eval_model))
        if forward_split:
            forward_eval_model = Model(forward_eval_pipe, hparams, is_train=False, seed=seed)
            eval_stages.append((Stage.EVAL_FRWD, forward_eval_model))
            if avg_sgd:
                eval_stages.append((Stage.EVAL_FRWD_EMA, forward_eval_model))

        if write_summaries:
            summ_path = f"{logdir}/{name}_{index}"
            if os.path.exists(summ_path):
                shutil.rmtree(summ_path)
            summ_writer = tf.summary.FileWriter(summ_path)  # , graph=tf.get_default_graph()
        else:
            summ_writer = None
        if do_eval and forward_split:
            stop_metric = lambda metrics: metrics[Stage.EVAL_FRWD]['SMAPE'].avg_epoch
        else:
            stop_metric = None
        return ModelTrainerV2(train_model, eval_stages, index, patience=patience,
                              stop_metric=stop_metric,
                              summary_writer=summ_writer)


    if n_models == 1:
        with tf.device(f"/gpu:{gpu}"):
            scope = tf.get_variable_scope()
            all_models = [create_model(scope, 0, None, seed=seed)]
    else:
        for i in range(n_models):
            device = f"/gpu:{i}" if multi_gpu else f"/gpu:{gpu}"
            with tf.device(device):
                prefix = f"m_{i}"
                with tf.variable_scope(prefix) as scope:
                    all_models.append(create_model(scope, i, prefix=prefix, seed=seed + i))
    trainer = MultiModelTrainer(all_models, inc_step)
    if save_best_model or save_from_step:
        saver_path = f'data/cpt/{name}'
        if os.path.exists(saver_path):
            shutil.rmtree(saver_path)
        os.makedirs(saver_path)
        saver = tf.train.Saver(max_to_keep=10, name='train_saver')
    else:
        saver = None
    avg_sgd = asgd_decay is not None
    if avg_sgd:
        from itertools import chain
        def ema_vars(model):
            ema = model.train_model.ema
            return {ema.average_name(v):v for v in model.train_model.ema._averages}

        ema_names = dict(chain(*[ema_vars(model).items() for model in all_models]))
        #ema_names = all_models[0].train_model.ema.variables_to_restore()
        ema_loader = tf.train.Saver(var_list=ema_names,  max_to_keep=1, name='ema_loader')
        ema_saver = tf.train.Saver(max_to_keep=1, name='ema_saver')
    else:
        ema_loader = None

    init = tf.global_variables_initializer()

    if forward_split and do_eval:
        eval_smape = trainer.metric(Stage.EVAL_FRWD, 'SMAPE')
        eval_mae = trainer.metric(Stage.EVAL_FRWD, 'MAE')
    else:
        eval_smape = DummyMetric()
        eval_mae = DummyMetric()

    if side_split and do_eval:
        eval_mae_side = trainer.metric(Stage.EVAL_SIDE, 'MAE')
        eval_smape_side = trainer.metric(Stage.EVAL_SIDE, 'SMAPE')
    else:
        eval_mae_side = DummyMetric()
        eval_smape_side = DummyMetric()

    train_smape = trainer.metric(Stage.TRAIN, 'SMAPE')
    train_mae = trainer.metric(Stage.TRAIN, 'MAE')
    grad_norm = trainer.metric(Stage.TRAIN, 'GrNorm')
    eval_stages = []
    ema_eval_stages = []
    if forward_split and do_eval:
        eval_stages.append(Stage.EVAL_FRWD)
        ema_eval_stages.append(Stage.EVAL_FRWD_EMA)
    if side_split and do_eval:
        eval_stages.append(Stage.EVAL_SIDE)
        ema_eval_stages.append(Stage.EVAL_SIDE_EMA)

    # gpu_options=tf.GPUOptions(allow_growth=False),
    with tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
                                          gpu_options=tf.GPUOptions(allow_growth=gpu_allow_growth))) as sess:
        sess.run(init)
        # pipe.load_vars(sess)
        inp.restore(sess)
        for model in all_models:
            model.init(sess)
        # if beholder:
        #    visualizer = Beholder(session=sess, logdir=summ_path)
        step = 0
        prev_top = np.inf
        best_smape = np.inf
        # Contains best value (first item) and subsequent values
        best_epoch_smape = []

        for epoch in range(max_epoch):

            # n_steps = pusher.n_pages // batch_size
            if tqdm:
                tqr = trange(steps_per_epoch, desc="%2d" % (epoch + 1), leave=False)
            else:
                tqr = range(steps_per_epoch)

            for _ in tqr:
                try:
                    step = trainer.train_step(sess, epoch)
                except tf.errors.OutOfRangeError:
                    break
                    # if beholder:
                    #  if step % 5 == 0:
                    # noinspection PyUnboundLocalVariable
                    #  visualizer.update()
                if step % eval_every_step == 0:
                    if eval_stages:
                        trainer.eval_step(sess, epoch, step, eval_batches, stages=eval_stages)

                    if save_best_model and epoch > 0 and eval_smape.last < best_smape:
                        best_smape = eval_smape.last
                        saver.save(sess, f'data/cpt/{name}/cpt', global_step=step)
                    if save_from_step and step >= save_from_step:
                        saver.save(sess, f'data/cpt/{name}/cpt', global_step=step)

                    if avg_sgd and ema_eval_stages:
                        ema_saver.save(sess, 'data/cpt_tmp/ema',  write_meta_graph=False)
                        # restore ema-backed vars
                        ema_loader.restore(sess, 'data/cpt_tmp/ema')

                        trainer.eval_step(sess, epoch, step, eval_batches, stages=ema_eval_stages)
                        # restore normal vars
                        ema_saver.restore(sess, 'data/cpt_tmp/ema')

                MAE = "%.3f/%.3f/%.3f" % (eval_mae.last, eval_mae_side.last, train_mae.last)
                improvement = '↑' if eval_smape.improved else ' '
                SMAPE = "%s%.3f/%.3f/%.3f" % (improvement, eval_smape.last, eval_smape_side.last,  train_smape.last)
                if tqdm:
                    tqr.set_postfix(gr=grad_norm.last, MAE=MAE, SMAPE=SMAPE)
                if not trainer.has_active() or (max_steps and step > max_steps):
                    break

            if tqdm:
                tqr.close()
            trainer.end_epoch()
            if not best_epoch_smape or eval_smape.avg_epoch < best_epoch_smape[0]:
                best_epoch_smape = [eval_smape.avg_epoch]
            else:
                best_epoch_smape.append(eval_smape.avg_epoch)

            current_top = eval_smape.top
            if prev_top > current_top:
                prev_top = current_top
                has_best_indicator = '↑'
            else:
                has_best_indicator = ' '
            status = "%2d: Best top SMAPE=%.3f%s (%s)" % (
                epoch + 1, current_top, has_best_indicator,
                ",".join(["%.3f" % m.top for m in eval_smape.metrics]))

            if trainer.has_active():
                status += ", frwd/side best MAE=%.3f/%.3f, SMAPE=%.3f/%.3f; avg MAE=%.3f/%.3f, SMAPE=%.3f/%.3f, %d am" % \
                          (eval_mae.best_epoch, eval_mae_side.best_epoch, eval_smape.best_epoch, eval_smape_side.best_epoch,
                           eval_mae.avg_epoch,  eval_mae_side.avg_epoch,  eval_smape.avg_epoch,  eval_smape_side.avg_epoch,
                           trainer.has_active())
                print(status, file=sys.stderr)
            else:
                print(status, file=sys.stderr)
                print("Early stopping!", file=sys.stderr)
                break
            if max_steps and step > max_steps:
                print("Max steps calculated", file=sys.stderr)
                break
            sys.stderr.flush()

        # noinspection PyUnboundLocalVariable
        return np.mean(best_epoch_smape, dtype=np.float64)
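
# A hedged usage sketch (not in the original file) of driving train() above: a 3-model
# ensemble with averaged SGD, checkpointing from a given step, using the values hinted
# at in the commented debug output further down (name 's32', seed 5, asgd_decay 0.99,
# save_from_step 10500). build_from_set() and the max_epoch value are assumptions.
def run_training_example():
    hparams = build_from_set('s32')
    smape = train('s32', hparams, n_models=3, multi_gpu=False, seed=5,
                  forward_split=False, side_split=False, do_eval=False,
                  asgd_decay=0.99, max_epoch=20, save_from_step=10500,
                  predict_window=63)
    print('train() returned mean best-epoch SMAPE: %s' % smape)
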
Example #19
def run(cfg):

    with open(cfg['tf_statistics_path'], 'rb') as f:
        tf_stat = pickle.load(f)

    names = []
    tfrecord_location = cfg['tfrecords_local_path']
    for file in os.listdir(tfrecord_location):
        if file.startswith("part"):
            names.append(file)
    file_paths = [os.path.join(tfrecord_location, name) for name in names]

    # read and make the dataset from tfrecord
    dataset = tf.data.TFRecordDataset(file_paths)
    dataset = dataset.map(__data_parser)

    batch_size = cfg['batch_size']
    duration = cfg['duration']

    dataset = dataset.batch(batch_size).shuffle(SHUFFLE_BUFFER)
    iterator = dataset.make_one_shot_iterator()
    next_el = iterator.get_next()

    # lagged_ix = numpy.ones((duration, 4), dtype=float)
    # lagged_ix = np.where(lagged_ix == 1, -1, lagged_ix)
    lagged_ix = np.stack(lag_indexes(tf_stat), axis=-1)
    # quarter_autocorr = numpy.ones((batch_size,), dtype=float)

    date_list = tf_stat['days']
    dow = get_dow(date_list)

    holiday_list = cfg['holidays']

    holidays = [1 if _ in holiday_list else 0 for _ in date_list]
    a_list = []
    b_list = []
    for _ in holidays:
        a, b = holiday_norm(_)
        a_list.append(a)
        b_list.append(b)
    holiday = (a_list, b_list)

    with tf.Session() as sess:

        x = sess.run(next_el)
        quarter_autocorr = np.ones((x[0].size, ), dtype=float)
        page_indx = list(x[0])

        fill_isolated_zeros(x[21])
        tensors = dict(
            hits=pd.DataFrame(x[21], index=page_indx, columns=date_list),
            lagged_ix=lagged_ix,
            page_ix=page_indx,
            pf_age=pd.DataFrame(x[8:15],
                                columns=page_indx,
                                index=(1, 2, 3, 4, 5, 6, 7)).T,
            pf_si=pd.DataFrame(x[20], index=page_indx),
            pf_network=pd.DataFrame(x[15:20],
                                    columns=page_indx,
                                    index=('2G', '3G', '4G', 'UNKNOWN',
                                           'WIFI')).T,
            pf_price_cat=pd.DataFrame(x[1:4],
                                      columns=page_indx,
                                      index=('pc1', 'pc2', 'pc3')).T,
            pf_gender=pd.DataFrame(x[4:8],
                                   columns=page_indx,
                                   index=('none', 'f', 'm', 'x')).T,
            page_popularity=x[22],
            # page_popularity = quarter_autocorr,
            quarter_autocorr=quarter_autocorr,
            dow=pd.DataFrame(dow).T,
            holiday=pd.DataFrame(holiday).T)

        data_len = tensors['hits'].shape[1]
        plain = dict(data_days=data_len - cfg['add_days'],
                     features_days=data_len,
                     data_start=date_list[0],
                     data_end=date_list[-1],
                     features_end=date_list[-1],
                     n_pages=batch_size)
        VarFeeder(cfg['data_dir'], tensors, plain)
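
# A hedged illustration (not in the original file) of the circular encoding this pipeline
# uses for periodic calendar features: a day-of-week value d in [0, 6] is scaled by
# 2*pi/7 and stored as (cos, sin), so the week wraps around the unit circle and Sunday
# stays close to Monday. get_dow() / holiday_norm() above are assumed to follow the same
# idea; the helper name here is hypothetical.
def encode_dow_example(start_date, end_date):
    days = pd.date_range(start_date, end_date)
    week_period = 7 / (2 * np.pi)
    dow_norm = days.dayofweek.values / week_period
    return np.stack([np.cos(dow_norm), np.sin(dow_norm)], axis=-1)  # shape (n_days, 2)
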
Example #20
def train(name,
          hparams,
          multi_gpu=False,
          n_models=1,
          train_completeness_threshold=0.01,
          seed=None,
          logdir='data/logs',
          max_epoch=100,
          patience=2,
          train_sampling=1.0,
          eval_sampling=1.0,
          eval_memsize=5,
          gpu=0,
          gpu_allow_growth=False,
          save_best_model=False,
          forward_split=False,
          write_summaries=False,
          verbose=False,
          asgd_decay=None,
          tqdm=True,
          side_split=True,
          max_steps=None,
          save_from_step=None,
          do_eval=True,
          predict_window=63):

    eval_k = int(round(26214 * eval_memsize / n_models))
    eval_batch_size = int(
        eval_k /
        (hparams.rnn_depth *
         hparams.encoder_rnn_layers))  # 128 -> 1024, 256->512, 512->256
    eval_pct = 0.1
    batch_size = hparams.batch_size
    train_window = hparams.train_window
    # todo eval_k = 43690,eval_batch_size = 163,eval_pct = 0,batch_size = 128,train_window = 283
    # print("eval_k = %d,eval_batch_size = %d,eval_pct = %d,batch_size = %d,train_window = %d" %(eval_k,eval_batch_size,eval_pct,batch_size,train_window))
    tf.reset_default_graph()
    if seed:
        tf.set_random_seed(seed)

    with tf.device("/cpu:0"):
        inp = VarFeeder.read_vars("data/vars")
        # print("side_split = %d,train_sampling= %d,eval_sampling= %d,seed= %d" % (
        #     side_split,train_sampling,eval_sampling,seed),f"inp={inp}, side_split={side_split}; type(inp)={type(inp)}")
        # todo side_split = 0,train_sampling= 1,eval_sampling= 1,seed= 5
        #  inp={'hits': <tf.Variable 'hits:0' shape=(145036, 805) dtype=float32_ref>,
        #  'lagged_ix': <tf.Variable 'lagged_ix:0' shape=(867, 4) dtype=int16_ref>,
        #  'page_map': <tf.Variable 'page_map:0' shape=(52752, 4) dtype=int32_ref>,
        #  'page_ix': <tf.Variable 'page_ix:0' shape=(145036,) dtype=string_ref>,
        #  'pf_agent': <tf.Variable 'pf_agent:0' shape=(145036, 4) dtype=float32_ref>,
        #  'pf_country': <tf.Variable 'pf_country:0' shape=(145036, 7) dtype=float32_ref>,
        #  'pf_site': <tf.Variable 'pf_site:0' shape=(145036, 3) dtype=float32_ref>,
        #  'page_popularity': <tf.Variable 'page_popularity:0' shape=(145036,) dtype=float32_ref>,
        #  'year_autocorr': <tf.Variable 'year_autocorr:0' shape=(145036,) dtype=float32_ref>,
        #  'quarter_autocorr': <tf.Variable 'quarter_autocorr:0' shape=(145036,) dtype=float32_ref>,
        #  'dow': <tf.Variable 'dow:0' shape=(867, 2) dtype=float32_ref>,'features_days': 867,
        #  'data_days': 805, 'n_pages': 145036, 'data_start': '2015-07-01',
        #  'data_end': Timestamp('2017-09-11 00:00:00'), 'features_end': Timestamp('2017-11-13 00:00:00')}
        #  side_split=False;
        #  type(inp)=<class 'feeder.FeederVars'>;
        # if True:
        if side_split:
            splitter = Splitter(page_features(inp),
                                inp.page_map,
                                3,
                                train_sampling=train_sampling,
                                test_sampling=eval_sampling,
                                seed=seed)
        else:
            splitter = FakeSplitter(page_features(inp),
                                    3,
                                    seed=seed,
                                    test_sampling=eval_sampling)

    real_train_pages = splitter.splits[0].train_size
    real_eval_pages = splitter.splits[0].test_size

    items_per_eval = real_eval_pages * eval_pct
    eval_batches = int(np.ceil(items_per_eval / eval_batch_size))
    steps_per_epoch = real_train_pages // batch_size
    eval_every_step = int(round(steps_per_epoch * eval_pct))
    # todo real_train_pages = 145036,real_eval_pages= 145036,items_per_eval= 14503,eval_batches= 89,
    #  steps_per_epoch = 1133, eval_every_step = 113 -- each epoch has 1133 steps; the current model quality is evaluated/reported every 113 steps
    # print("real_train_pages = %d,real_eval_pages= %d,items_per_eval= %d,eval_batches= %d,steps_per_epoch= %d,eval_every_step= %d; eval_pct" % (
    #     real_train_pages, real_eval_pages, items_per_eval, eval_batches, steps_per_epoch, eval_every_step,eval_pct
    # ))
    # return
    # eval_every_step = int(round(items_per_eval * train_sampling / batch_size))
    # todo get_or_create_global_step returns the global-step tensor variable, creating it if necessary.
    global_step = tf.train.get_or_create_global_step()
    # todo tf.assign_add(ref, value, use_locking=None, name=None): updates ref by adding value, i.e. ref = ref + value;
    #  inc_step increments the global step by one.
    inc_step = tf.assign_add(global_step, 1)

    all_models: List[ModelTrainerV2] = []

    def create_model(scope, index, prefix, seed):
        # todo Mainly builds the model here and returns a few pieces that may be None.
        #  The data is used while the model is built; the model only uses the data's prediction-window length -- no, actually the data is fed in directly when the model is created.
        with tf.variable_scope('input') as inp_scope:
            with tf.device("/cpu:0"):
                split = splitter.splits[index]
                pipe = InputPipe(
                    inp,
                    features=split.train_set,
                    n_pages=split.train_size,
                    mode=ModelMode.TRAIN,
                    batch_size=batch_size,
                    n_epoch=None,
                    verbose=verbose,
                    train_completeness_threshold=train_completeness_threshold,
                    predict_completeness_threshold=train_completeness_threshold,
                    train_window=train_window,
                    predict_window=predict_window,
                    rand_seed=seed,
                    train_skip_first=hparams.train_skip_first,
                    back_offset=predict_window if forward_split else 0)
                inp_scope.reuse_variables()
                # todo side_split: False; forward_split:False; eval_stages: [];
                if side_split:
                    side_eval_pipe = InputPipe(
                        inp,
                        features=split.test_set,
                        n_pages=split.test_size,
                        mode=ModelMode.EVAL,
                        batch_size=eval_batch_size,
                        n_epoch=None,
                        verbose=verbose,
                        predict_window=predict_window,
                        train_completeness_threshold=0.01,
                        predict_completeness_threshold=0,
                        train_window=train_window,
                        rand_seed=seed,
                        runs_in_burst=eval_batches,
                        back_offset=predict_window *
                        (2 if forward_split else 1))
                else:
                    side_eval_pipe = None
                if forward_split:
                    forward_eval_pipe = InputPipe(
                        inp,
                        features=split.test_set,
                        n_pages=split.test_size,
                        mode=ModelMode.EVAL,
                        batch_size=eval_batch_size,
                        n_epoch=None,
                        verbose=verbose,
                        predict_window=predict_window,
                        train_completeness_threshold=0.01,
                        predict_completeness_threshold=0,
                        train_window=train_window,
                        rand_seed=seed,
                        runs_in_burst=eval_batches,
                        back_offset=predict_window)
                else:
                    forward_eval_pipe = None
        avg_sgd = asgd_decay is not None
        #asgd_decay = 0.99 if avg_sgd else None
        train_model = Model(pipe,
                            hparams,
                            is_train=True,
                            graph_prefix=prefix,
                            asgd_decay=asgd_decay,
                            seed=seed)
        scope.reuse_variables()

        eval_stages = []
        if side_split:
            # print('2 side_split side_eval_model')
            side_eval_model = Model(
                side_eval_pipe,
                hparams,
                is_train=False,
                #loss_mask=np.concatenate([np.zeros(50, dtype=np.float32), np.ones(10, dtype=np.float32)]),
                seed=seed)
            # print("2  side_eval_model -- 2")
            # todo TRAIN = 0; EVAL_SIDE = 1; EVAL_FRWD = 2; EVAL_SIDE_EMA = 3; EVAL_FRWD_EMA = 4
            eval_stages.append((Stage.EVAL_SIDE, side_eval_model))
            if avg_sgd:
                eval_stages.append((Stage.EVAL_SIDE_EMA, side_eval_model))
        if forward_split:
            # print("3 forward_split forward_eval_model")
            # tf.reset_default_graph()
            forward_eval_model = Model(forward_eval_pipe,
                                       hparams,
                                       is_train=False,
                                       seed=seed)
            # print("3 forward_split forward_eval_model -- 2")
            eval_stages.append((Stage.EVAL_FRWD, forward_eval_model))
            if avg_sgd:
                eval_stages.append((Stage.EVAL_FRWD_EMA, forward_eval_model))

        if write_summaries:
            summ_path = f"{logdir}/{name}_{index}"
            # print("write_summaries summ_path",summ_path)
            if os.path.exists(summ_path):
                shutil.rmtree(summ_path)
            summ_writer = tf.summary.FileWriter(
                summ_path)  # , graph=tf.get_default_graph()
        else:
            summ_writer = None
        if do_eval and forward_split:
            stop_metric = lambda metrics: metrics[Stage.EVAL_FRWD]['SMAPE'
                                                                   ].avg_epoch
        else:
            stop_metric = None
        # todo side_split: False; forward_split:False;
        #  summ_writer=<tensorflow.python.summary.writer.writer.FileWriter object at 0x7ff5dc176710>;
        #  eval_stages: []; stop_metric=None; patience=2; index=0
        # print(f"side_split: {side_split}; forward_split:{forward_split}; summ_writer={summ_writer};"
        #       f"eval_stages: {eval_stages}; stop_metric={stop_metric}; patience={patience}; index={index}")
        return ModelTrainerV2(train_model,
                              eval_stages,
                              index,
                              patience=patience,
                              stop_metric=stop_metric,
                              summary_writer=summ_writer)

    # todo n_models == 3
    if n_models == 1:
        with tf.device(f"/gpu:{gpu}"):
            scope = tf.get_variable_scope()
            all_models = [create_model(scope, 0, None, seed=seed)]
    else:
        for i in range(n_models):
            device = f"/gpu:{i}" if multi_gpu else f"/gpu:{gpu}"
            with tf.device(device):
                prefix = f"m_{i}"
                with tf.variable_scope(prefix) as scope:
                    all_models.append(
                        create_model(scope, i, prefix=prefix, seed=seed + i))
    # todo inc_step = tf.assign_add(global_step, 1)
    trainer = MultiModelTrainer(all_models, inc_step)
    # return
    # todo save_best_model or save_from_step: False 10500
    # print("save_best_model or save_from_step: ", save_best_model, save_from_step)
    if save_best_model or save_from_step:
        saver_path = f'data/cpt/{name}'
        # todo saver_path: data/cpt/s32
        # print("saver_path: ",saver_path)
        if os.path.exists(saver_path):
            shutil.rmtree(saver_path)
        os.makedirs(saver_path)
        # todo  The max_to_keep argument sets how many checkpoints to keep; the default is 5, i.e. keep the 5 most recent models.
        saver = tf.train.Saver(max_to_keep=10, name='train_saver')
    else:
        saver = None
    # todo EMA decay for averaged SGD. ASGD is not used if this is not set.
    avg_sgd = asgd_decay is not None
    # todo asgd_decay=0.99; avg_sgd=True
    # print(f"asgd_decay={asgd_decay}; avg_sgd={avg_sgd}")
    if avg_sgd:
        from itertools import chain

        def ema_vars(model):
            ema = model.train_model.ema
            # todo: the average_name() method gives access to the shadow variables and their names
            return {
                ema.average_name(v): v
                for v in model.train_model.ema._averages
            }

        ema_names = dict(
            chain(*[ema_vars(model).items() for model in all_models]))
        # todo ema_names maps each shadow-variable name, e.g.
        #  'm_0/m_0/cudnn_gru/opaque_kernel/ExponentialMovingAverage',
        #  to its underlying tf.Variable ('m_0/cudnn_gru/opaque_kernel:0'),
        #  covering every layer (convnet, fc_convnet, attn_focus, gru_cell,
        #  decoder_output_proj) of every model m_0 .. m_{n_models-1}.
        # print(f"ema_names={ema_names}")
        # print(f"chain={chain(*[ema_vars(model).items() for model in all_models])}")
        #ema_names = all_models[0].train_model.ema.variables_to_restore()
        ema_loader = tf.train.Saver(var_list=ema_names,
                                    max_to_keep=1,
                                    name='ema_loader')
        ema_saver = tf.train.Saver(max_to_keep=1, name='ema_saver')
    else:
        ema_loader = None

    init = tf.global_variables_initializer()

    # print(f"forward_split={forward_split}; do_eval={do_eval}; side_split={side_split}")
    if forward_split and do_eval:
        eval_smape = trainer.metric(Stage.EVAL_FRWD, 'SMAPE')
        eval_mae = trainer.metric(Stage.EVAL_FRWD, 'MAE')
    else:
        eval_smape = DummyMetric()
        eval_mae = DummyMetric()

    if side_split and do_eval:
        eval_mae_side = trainer.metric(Stage.EVAL_SIDE, 'MAE')
        eval_smape_side = trainer.metric(Stage.EVAL_SIDE, 'SMAPE')
    else:
        eval_mae_side = DummyMetric()
        eval_smape_side = DummyMetric()

    train_smape = trainer.metric(Stage.TRAIN, 'SMAPE')
    train_mae = trainer.metric(Stage.TRAIN, 'MAE')
    grad_norm = trainer.metric(Stage.TRAIN, 'GrNorm')
    eval_stages = []
    ema_eval_stages = []
    if forward_split and do_eval:
        eval_stages.append(Stage.EVAL_FRWD)
        ema_eval_stages.append(Stage.EVAL_FRWD_EMA)
    if side_split and do_eval:
        eval_stages.append(Stage.EVAL_SIDE)
        ema_eval_stages.append(Stage.EVAL_SIDE_EMA)
    # todo eval_stages=[]; ema_eval_stages=[]
    # print(f"eval_stages={eval_stages}; ema_eval_stages={ema_eval_stages}")

    # gpu_options=tf.GPUOptions(allow_growth=False),
    with tf.Session(
            config=tf.ConfigProto(allow_soft_placement=True,
                                  gpu_options=tf.GPUOptions(
                                      allow_growth=gpu_allow_growth))) as sess:
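        # allow_soft_placement lets ops without a kernel on the requested device fall
        # back to CPU; allow_growth (when enabled) makes TF claim GPU memory
        # incrementally instead of reserving almost all of it up front.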
        sess.run(init)
        # pipe.load_vars(sess)
        # todo: inp loaded this data object earlier; restore() loads the data tensors
        #  into the session, right? Where is the data loaded here actually used?
        inp.restore(sess)
        for model in all_models:
            # todo: what does this step mean, and what does it produce? Does it just run
            #  init_iterator? After the model graph is built above, where is the data fed in?
            model.init(sess)
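            # Presumably model.init() runs the per-model dataset iterator initializer, so
            # each trainer.train_step() below can pull the next batch directly from the
            # tf.data pipeline rather than via feed_dict.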
        # if beholder:
        #    visualizer = Beholder(session=sess, logdir=summ_path)
        step = 0
        prev_top = np.inf
        best_smape = np.inf
        # Contains best value (first item) and subsequent values
        best_epoch_smape = []

        for epoch in range(max_epoch):

            # n_steps = pusher.n_pages // batch_size
            if tqdm:
                # todo: tqdm is a fast, extensible progress bar for long-running Python
                #  loops; just wrap any iterator as tqdm(iterator). trange(i) is shorthand
                #  for tqdm(range(i)). desc= sets the text shown to the left of the bar;
                #  leave= keeps the finished bar on screen once done (True by default).
                tqr = trange(steps_per_epoch,
                             desc="%2d" % (epoch + 1),
                             leave=False)
            else:
                tqr = range(steps_per_epoch)
            for _ in tqr:
                try:
                    # print("PRINT step = trainer.train_step")
                    # todo: this single line is where the model actually gets trained, right?
                    step = trainer.train_step(sess, epoch)
                    # if epoch == 0:
                    #     print(f"step={step}, _={_}, epoch = {epoch}")
                except tf.errors.OutOfRangeError:
                    break
                    # if beholder:
                    #  if step % 5 == 0:
                    # noinspection PyUnboundLocalVariable
                    #  visualizer.update()
                # todo: presumably, each training epoch roughly 100 (eval_pct) of its
                #  batches get an evaluation pass; eval_every_step = 113
                if step % eval_every_step == 0:
                    # todo eval_stages=[];save_best_model=False; save_from_step=10500; avg_sgd=True; ema_eval_stages=[]
                    # print(f"eval_stages={eval_stages};save_best_model={save_best_model}; save_from_step={save_from_step}; avg_sgd={avg_sgd}; ema_eval_stages={ema_eval_stages}")
                    if eval_stages:
                        trainer.eval_step(sess,
                                          epoch,
                                          step,
                                          eval_batches,
                                          stages=eval_stages)
                    if save_best_model and epoch > 0 and eval_smape.last < best_smape:
                        best_smape = eval_smape.last
                        saver.save(sess,
                                   f'data/cpt/{name}/cpt',
                                   global_step=step)
                    if save_from_step and step >= save_from_step:
                        saver.save(sess,
                                   f'data/cpt/{name}/cpt',
                                   global_step=step)

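                    # EMA evaluation swap: back up the current (raw) weights to a temp
                    # checkpoint, load the EMA shadow values into the live variables via
                    # ema_loader, run the eval stages on the averaged weights, then
                    # restore the raw weights from the same checkpoint to resume training.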
                    if avg_sgd and ema_eval_stages:
                        ema_saver.save(sess,
                                       'data/cpt_tmp/ema',
                                       write_meta_graph=False)
                        # restore ema-backed vars
                        ema_loader.restore(sess, 'data/cpt_tmp/ema')

                        trainer.eval_step(sess,
                                          epoch,
                                          step,
                                          eval_batches,
                                          stages=ema_eval_stages)
                        # restore normal vars
                        ema_saver.restore(sess, 'data/cpt_tmp/ema')

                MAE = "%.3f/%.3f/%.3f" % (eval_mae.last, eval_mae_side.last,
                                          train_mae.last)
                improvement = '↑' if eval_smape.improved else ' '
                SMAPE = "%s%.3f/%.3f/%.3f" % (improvement, eval_smape.last,
                                              eval_smape_side.last,
                                              train_smape.last)
                if tqdm:
                    # todo: .set_description("GEN %i" % i) sets the info shown to the left
                    #  of the progress bar; .set_postfix(loss=random(), gen=randint(1, 999),
                    #  str="h", lst=[1, 2]) sets the info shown to the right of the bar.
                    tqr.set_postfix(gr=grad_norm.last, MAE=MAE, SMAPE=SMAPE)
                if not trainer.has_active() or (max_steps
                                                and step > max_steps):
                    break
            if tqdm:
                tqr.close()
            trainer.end_epoch()

            if not best_epoch_smape or eval_smape.avg_epoch < best_epoch_smape[
                    0]:
                best_epoch_smape = [eval_smape.avg_epoch]
            else:
                best_epoch_smape.append(eval_smape.avg_epoch)
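            # best_epoch_smape is reset to [avg] whenever a new best epoch-average SMAPE
            # appears and otherwise accumulates the later averages; np.mean over it is the
            # value returned below, presumably as the objective for hyper-parameter search.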

            current_top = eval_smape.top
            if prev_top > current_top:
                prev_top = current_top
                has_best_indicator = '↑'
            else:
                has_best_indicator = ' '
            status = "%2d: Best top SMAPE=%.3f%s (%s)" % (
                epoch + 1, current_top, has_best_indicator, ",".join(
                    ["%.3f" % m.top for m in eval_smape.metrics]))

            if trainer.has_active():
                status += ", frwd/side best MAE=%.3f/%.3f, SMAPE=%.3f/%.3f; avg MAE=%.3f/%.3f, SMAPE=%.3f/%.3f, %d am" % \
                          (eval_mae.best_epoch, eval_mae_side.best_epoch, eval_smape.best_epoch, eval_smape_side.best_epoch,
                           eval_mae.avg_epoch,  eval_mae_side.avg_epoch,  eval_smape.avg_epoch,  eval_smape_side.avg_epoch,
                           trainer.has_active())
            else:
                print("Early stopping!", file=sys.stderr)
                break
            if max_steps and step > max_steps:
                print("Max steps reached", file=sys.stderr)
                break
            sys.stderr.flush()
            # todo best_epoch_smape=[nan]; eval_smape.avg_epoch=nan; trainer.has_active()=3; prev_top=inf; current_top=nan
            # print(f"best_epoch_smape={best_epoch_smape}; eval_smape.avg_epoch={eval_smape.avg_epoch}; "
            #       f"trainer.has_active()={trainer.has_active()}; prev_top={prev_top}; current_top={current_top}")
        # noinspection PyUnboundLocalVariable
        return np.mean(best_epoch_smape, dtype=np.float64)
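

# A minimal reference sketch of SMAPE, for comparison with the values logged above.
# It assumes the common 0-200 scaling plus a small epsilon guard against empty
# denominators; the project's trainer.metric(..., 'SMAPE') implementation may differ,
# and smape_reference is a hypothetical helper not used anywhere in this script.
def smape_reference(true, pred, epsilon=0.1):
    # Symmetric MAPE: 200 * mean(|pred - true| / (|true| + |pred|)), with the
    # denominator clamped so all-zero pairs do not divide by zero.
    true = np.asarray(true, dtype=np.float64)
    pred = np.asarray(pred, dtype=np.float64)
    denom = np.maximum(np.abs(true) + np.abs(pred) + epsilon, 0.5 + epsilon)
    return 200.0 * np.mean(np.abs(pred - true) / denom)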