import argparse
import datetime

import numpy as np
import pandas as pd


def run():
    parser = argparse.ArgumentParser(description='Prepare data')
    parser.add_argument('--data_dir', default='/home/oniculaescu/train/features',
                        help='Location where the features for constructing the NN will be saved')
    parser.add_argument('--threshold', default=0.0, type=float,
                        help='Series minimal length threshold (pct of data length)')
    parser.add_argument('--add_days', default=64, type=int,
                        help='Number of days to be added in the future for prediction')
    parser.add_argument('--start', help='Effective start date')
    parser.add_argument('--end', help="Effective end date")
    parser.add_argument('--attr', default='download',
                        help="Which pkl file to use for feature creation")
    parser.add_argument('--corr_backoffset', default=0, type=int,
                        help="Offset for correlation computation")
    args = parser.parse_args()

    # Get the data
    df, nans, starts, ends = prepare_data(args.attr, args.start, args.end, args.threshold)

    # Find the working date range
    data_start, data_end = df.columns[0], df.columns[-1]

    # Project date-dependent features (like day of week) to the future dates for prediction
    print(data_end)
    data_end_1 = datetime.datetime.strptime(data_end, '%Y-%m-%d')
    features_end = data_end_1 + datetime.timedelta(days=args.add_days)
    print("start: " + data_start + ", end: " + data_end + ", features_end: " + str(features_end))

    # Group unique ASes by continent
    assert df.index.is_monotonic_increasing
    continent_map = uniq_continent_map(df.index.values)
    # Group unique ASes by country
    country_map = uniq_country_map(df.index.values)

    # Yearly autocorrelation
    raw_year_autocorr = batch_autocorrelation(df.values, 365, starts, ends, 1.5,
                                              args.corr_backoffset)
    year_unknown_pct = np.sum(np.isnan(raw_year_autocorr)) / len(raw_year_autocorr)  # type: float

    # Quarterly autocorrelation
    raw_quarter_autocorr = batch_autocorrelation(df.values, int(round(365.25 / 4)),
                                                 starts, ends, 2, args.corr_backoffset)
    quarter_unknown_pct = np.sum(np.isnan(raw_quarter_autocorr)) / len(raw_quarter_autocorr)  # type: float
    print("Percent of undefined autocorr = yearly:%.3f, quarterly:%.3f" %
          (year_unknown_pct, quarter_unknown_pct))

    # Normalise all the things
    year_autocorr = normalize(np.nan_to_num(raw_year_autocorr))
    quarter_autocorr = normalize(np.nan_to_num(raw_quarter_autocorr))

    # Make time-dependent features
    features_days = pd.date_range(data_start, features_end)
    week_period = 7 / (2 * np.pi)
    dow_norm = features_days.dayofweek / week_period
    print(dow_norm)
    dow = np.stack([np.cos(dow_norm), np.sin(dow_norm)], axis=-1)

    # Assemble indices for quarterly lagged data
    lagged_ix = np.stack(lag_indexes(data_start, features_end), axis=-1)

    # Put NaNs back
    df[nans] = np.NaN

    # Assemble final output
    tensors = dict(
        hits=df,
        lagged_ix=lagged_ix,
        continent_map=continent_map,
        # country_map=country_map,
        as_ix=df.index.values,
        year_autocorr=year_autocorr,
        quarter_autocorr=quarter_autocorr,
        dow=dow,
    )
    plain = dict(features_days=len(features_days),
                 data_days=len(df.columns),
                 n_ases=len(df),
                 data_start=data_start,
                 data_end=data_end,
                 features_end=features_end)
    print(args.data_dir)

    # Store data to the disk
    VarFeeder(args.data_dir, tensors, plain)
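# The normalize() helper is not shown in these snippets. A minimal sketch, assuming
# plain zero-mean/unit-std scaling, which is what the comments in the variants below
# ("mean/std-standardize the autocorrelations") describe:
import numpy as np

def normalize(values: np.ndarray) -> np.ndarray:
    # Standardize a 1-D feature vector to zero mean and unit variance.
    return (values - np.mean(values)) / np.std(values)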
def run(args):
    # Get the data
    df, nans, starts, ends = prepare_data(args['start'], args['end'], args['valid_threshold'])

    # Our working date range
    data_start, data_end = df.columns[0], df.columns[-1]

    # We have to project some date-dependent features (day of week, etc.) to the future dates for prediction
    features_end = data_end + pd.Timedelta(args['add_days'], unit='D')
    print(f"start: {data_start}, end: {data_end}, features_end: {features_end}")

    # Group unique pages by agents
    assert df.index.is_monotonic_increasing  # the index must be monotonically increasing
    page_map = uniq_page_map(df.index.values)

    # Yearly (annual) autocorrelation
    raw_year_autocorr = batch_autocorr(df.values, 365, starts, ends, 1.5,
                                       args['corr_backoffset'])
    year_unknown_pct = np.sum(np.isnan(raw_year_autocorr)) / len(raw_year_autocorr)  # type: float

    # Quarterly autocorrelation
    raw_quarter_autocorr = batch_autocorr(df.values, int(round(365.25 / 4)), starts, ends, 2,
                                          args['corr_backoffset'])
    quarter_unknown_pct = np.sum(np.isnan(raw_quarter_autocorr)) / len(raw_quarter_autocorr)  # type: float

    print("Percent of undefined autocorr = yearly:%.3f, quarterly:%.3f" %
          (year_unknown_pct, quarter_unknown_pct))

    # Normalize all the things
    year_autocorr = normalize(np.nan_to_num(raw_year_autocorr))  # nan_to_num replaces NaN with 0 and infinity with large finite numbers
    quarter_autocorr = normalize(np.nan_to_num(raw_quarter_autocorr))

    # Calculate and encode page features (site, country, etc.)
    page_features = make_page_features(df.index.values)
    encoded_page_features = encode_page_features(page_features)

    # Make time-dependent features
    features_days = pd.date_range(data_start, features_end)
    week_period = 7 / (2 * np.pi)
    dow_norm = features_days.dayofweek.values / week_period
    dow = np.stack([np.cos(dow_norm), np.sin(dow_norm)], axis=-1)  # stacked along the last axis: two columns

    # Assemble indices for quarterly lagged data
    lagged_ix = np.stack(lag_indexes(data_start, features_end), axis=-1)

    page_popularity = df.median(axis=1)
    page_popularity = (page_popularity - page_popularity.mean()) / page_popularity.std()  # standardize

    # Put NaNs back
    df[nans] = np.NaN

    # Assemble final output
    tensors = dict(
        hits=df,
        lagged_ix=lagged_ix,
        page_map=page_map,
        page_ix=df.index.values,
        pf_agent=encoded_page_features['agent'],
        pf_country=encoded_page_features['country'],
        pf_site=encoded_page_features['site'],
        page_popularity=page_popularity,
        year_autocorr=year_autocorr,
        quarter_autocorr=quarter_autocorr,
        dow=dow,
    )
    plain = dict(
        features_days=len(features_days),
        data_days=len(df.columns),
        n_pages=len(df),
        data_start=data_start,
        data_end=data_end,
        features_end=features_end
    )

    # Store data to the disk
    VarFeeder(args['data_dir'], tensors, plain)
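# batch_autocorr() is also external to these snippets. A sketch of what it plausibly
# computes, assuming Pearson autocorrelation at the given lag per series, with NaN
# for series whose valid span is shorter than threshold * lag (the variants below
# note exactly this NaN behaviour); treat the exact thresholding as an assumption.
import numpy as np

def batch_autocorr(data, lag, starts, ends, threshold, backoffset=0):
    # data: [n_series, n_days]; starts/ends: first/last valid day index per series.
    n_series = data.shape[0]
    result = np.full(n_series, np.nan, dtype=np.float64)
    for i in range(n_series):
        series = data[i, starts[i]:ends[i] - backoffset]
        if len(series) < lag * threshold:
            continue  # too short to estimate this lag reliably -> leave NaN
        s1, s2 = series[lag:], series[:-lag]
        ms1, ms2 = s1 - s1.mean(), s2 - s2.mean()
        denom = np.sqrt((ms1 * ms1).sum() * (ms2 * ms2).sum())
        if denom > 0:
            result[i] = (ms1 * ms2).sum() / denom
    return result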
def main(_):
    if len(sys.argv) < 3:
        print('Usage: ucdoc_saved_model.py [--model_version=y] --data_dir=xxx '
              '--ckpt_dir=xxx --saved_dir=xxx')
        sys.exit(-1)
    if FLAGS.training_iteration <= 0:
        print('Please specify a positive value for training iteration.')
        sys.exit(-1)
    if FLAGS.model_version <= 0:
        print('Please specify a positive value for version number.')
        sys.exit(-1)

    # Create the deploy model first
    with tf.variable_scope('input') as inp_scope:
        with tf.device("/cpu:0"):
            inp = VarFeeder.read_vars(FLAGS.data_dir)
            pipe = InputPipe(inp, ucdoc_features(inp), inp.hits.shape[0],
                             mode=ModelMode.PREDICT, batch_size=FLAGS.batch_size,
                             n_epoch=1, verbose=False,
                             train_completeness_threshold=0.01,
                             predict_window=FLAGS.predict_window,
                             predict_completeness_threshold=0.0,
                             train_window=FLAGS.train_window,
                             back_offset=FLAGS.predict_window + 1)
    asgd_decay = 0.99 if FLAGS.asgd else None

    if FLAGS.n_models == 1:
        model = Model(pipe, build_from_set(FLAGS.hparam_set), is_train=False,
                      seed=1, asgd_decay=asgd_decay)
    else:
        models = []
        for i in range(FLAGS.n_models):
            prefix = f"m_{i}"
            with tf.variable_scope(prefix) as scope:
                models.append(
                    Model(pipe, build_from_set(FLAGS.hparam_set), is_train=False,
                          seed=1, asgd_decay=asgd_decay, graph_prefix=prefix))
        model = models[FLAGS.target_model]

    # Load checkpoint model from training
    print('loading checkpoint model...')
    ckpt_file = tf.train.latest_checkpoint(FLAGS.ckpt_dir)
    graph = model.predictions.graph
    saver = tf.train.Saver(name='deploy_saver', var_list=None)
    with tf.Session(config=tf.ConfigProto(
            gpu_options=tf.GPUOptions(allow_growth=True))) as sess:
        pipe.load_vars(sess)
        pipe.init_iterator(sess)
        saver.restore(sess, ckpt_file)
        print('Done loading checkpoint model')

        export_path_base = FLAGS.saved_dir
        export_path = os.path.join(
            tf.compat.as_bytes(export_path_base),
            tf.compat.as_bytes(str(FLAGS.model_version)))
        print('Exporting trained model to', export_path)
        if os.path.isdir(export_path):
            shutil.rmtree(export_path)
        builder = tf.saved_model.builder.SavedModelBuilder(export_path)

        true_x = tf.saved_model.utils.build_tensor_info(model.inp.true_x)
        time_x = tf.saved_model.utils.build_tensor_info(model.inp.time_x)
        norm_x = tf.saved_model.utils.build_tensor_info(model.inp.norm_x)
        lagged_x = tf.saved_model.utils.build_tensor_info(model.inp.lagged_x)
        true_y = tf.saved_model.utils.build_tensor_info(model.inp.true_y)
        time_y = tf.saved_model.utils.build_tensor_info(model.inp.time_y)
        norm_y = tf.saved_model.utils.build_tensor_info(model.inp.norm_y)
        norm_mean = tf.saved_model.utils.build_tensor_info(model.inp.norm_mean)
        norm_std = tf.saved_model.utils.build_tensor_info(model.inp.norm_std)
        pg_features = tf.saved_model.utils.build_tensor_info(model.inp.ucdoc_features)
        page_ix = tf.saved_model.utils.build_tensor_info(model.inp.page_ix)
        pred = tf.saved_model.utils.build_tensor_info(model.predictions)

        labeling_signature = tf.saved_model.signature_def_utils.build_signature_def(
            inputs={
                "truex": true_x,
                "timex": time_x,
                "normx": norm_x,
                "laggedx": lagged_x,
                "truey": true_y,
                "timey": time_y,
                "normy": norm_y,
                "normmean": norm_mean,
                "normstd": norm_std,
                "page_features": pg_features,
                "pageix": page_ix,
            },
            outputs={"pred": pred},
            method_name="tensorflow/serving/predict")

        builder.add_meta_graph_and_variables(
            sess, [tf.saved_model.tag_constants.SERVING],
            signature_def_map={
                tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
                    labeling_signature
            },
            main_op=tf.tables_initializer(),
            strip_default_attrs=True)
        builder.save()
        print("Build Done")
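# For reference, a minimal sketch of loading the model that this exporter writes,
# using the TF 1.x SavedModel loader; export_path is the directory produced above,
# and 'pred' is the output key defined in the signature.
import tensorflow as tf

def load_exported_model(sess, export_path):
    meta_graph = tf.saved_model.loader.load(
        sess, [tf.saved_model.tag_constants.SERVING], export_path)
    sig = meta_graph.signature_def[
        tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY]
    # Resolve the prediction tensor by the name recorded in the signature
    return sess.graph.get_tensor_by_name(sig.outputs['pred'].name)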
def predict(checkpoints, hparams, return_x=False, verbose=False, predict_window=6,
            back_offset=0, n_models=1, target_model=0, asgd=False, seed=1,
            batch_size=1024):
    with tf.variable_scope('input') as inp_scope:
        with tf.device("/cpu:0"):
            inp = VarFeeder.read_vars("data/vars")
            pipe = InputPipe(inp, page_features(inp), inp.n_pages,
                             mode=ModelMode.PREDICT, batch_size=batch_size,
                             n_epoch=1, verbose=verbose,
                             train_completeness_threshold=0.01,
                             predict_window=predict_window,
                             predict_completeness_threshold=0.0,
                             train_window=hparams.train_window,
                             back_offset=back_offset)
    asgd_decay = 0.99 if asgd else None
    if n_models == 1:
        model = Model(pipe, hparams, is_train=False, seed=seed, asgd_decay=asgd_decay)
    else:
        models = []
        for i in range(n_models):
            prefix = f"m_{i}"
            with tf.variable_scope(prefix) as scope:
                models.append(Model(pipe, hparams, is_train=False, seed=seed,
                                    asgd_decay=asgd_decay, graph_prefix=prefix))
        model = models[target_model]

    if asgd:
        var_list = model.ema.variables_to_restore()
        prefix = f"m_{target_model}"
        for var in list(var_list.keys()):
            if var.endswith('ExponentialMovingAverage') and not var.startswith(prefix):
                del var_list[var]
    else:
        var_list = None
    saver = tf.train.Saver(name='eval_saver', var_list=var_list)

    x_buffer = []
    predictions = None
    with tf.Session(config=tf.ConfigProto(
            gpu_options=tf.GPUOptions(allow_growth=True))) as sess:
        pipe.load_vars(sess)
        for checkpoint in checkpoints:
            pred_buffer = []
            pipe.init_iterator(sess)
            saver.restore(sess, checkpoint)
            cnt = 0
            while True:
                try:
                    if return_x:
                        pred, x, pname = sess.run(
                            [model.predictions, model.inp.true_x, model.inp.page_ix])
                    else:
                        pred, pname = sess.run([model.predictions, model.inp.page_ix])
                    utf_names = [str(name, 'utf-8') for name in pname]
                    pred_df = pd.DataFrame(index=utf_names, data=np.expm1(pred))
                    pred_buffer.append(pred_df)
                    if return_x:
                        # noinspection PyUnboundLocalVariable
                        x_values = pd.DataFrame(index=utf_names,
                                                data=np.round(np.expm1(x)).astype(np.int64))
                        x_buffer.append(x_values)
                    newline = cnt % 80 == 0
                    if cnt > 0:
                        print('.', end='\n' if newline else '', flush=True)
                    if newline:
                        print(cnt, end='')
                    cnt += 1
                except tf.errors.OutOfRangeError:
                    print('🎉')
                    break
            cp_predictions = pd.concat(pred_buffer)
            if predictions is None:
                predictions = cp_predictions
            else:
                predictions += cp_predictions
    predictions /= len(checkpoints)
    offset = pd.Timedelta(back_offset, 'D')
    start_prediction = inp.data_end + pd.Timedelta('1D') - offset
    end_prediction = start_prediction + pd.Timedelta(predict_window - 1, 'D')
    predictions.columns = pd.date_range(start_prediction, end_prediction)
    if return_x:
        x = pd.concat(x_buffer)
        start_data = inp.data_end - pd.Timedelta(hparams.train_window - 1, 'D') - offset
        end_data = inp.data_end - offset
        x.columns = pd.date_range(start_data, end_data)
        return predictions, x
    return predictions
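# np.expm1 above inverts the log1p transform applied when the features were built
# (one of the feature-building variants notes that hits are stored as log1p).
# A quick round-trip check:
import numpy as np

hits = np.array([0.0, 1.0, 10.0, 1000.0])
assert np.allclose(np.expm1(np.log1p(hits)), hits)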
def run():
    parser = argparse.ArgumentParser(description='Prepare data')
    parser.add_argument('data_dir')
    parser.add_argument('--valid_threshold', default=0.04, type=float,
                        help="Series minimal length threshold (pct of data length)")
    parser.add_argument('--add_timestamp', default=288, type=int,
                        help="Add N timestamps in the future for prediction")
    parser.add_argument('--start', default=0, type=int,
                        help="Effective start date. Data before the start is dropped")
    parser.add_argument('--end', default=-288, type=int,
                        help="Effective end date. Data past the end is dropped")
    parser.add_argument('--corr_backoffset', default=0, type=int,
                        help='Offset for correlation calculation')
    parser.add_argument('--split_df', default=0, type=int,
                        help="Whether to split VMs w.r.t. abnormal behaviour")
    args = parser.parse_args()

    # Get the data
    df, starts, ends = prepare_data(args.split_df, args.start, args.end, args.valid_threshold)

    # Our working date range
    data_start, data_end = df.columns[0], df.columns[-1]

    # We have to project some date-dependent features (day of week, etc.) to the future dates for prediction
    features_end = data_end + args.add_timestamp
    print(f"start: {data_start}, end: {data_end}, features_end: {features_end}")
    features_time = features_end - data_start

    assert df.index.is_monotonic_increasing

    # Daily autocorrelation (288 timestamps per day)
    day_autocorr = batch_autocorr(df.values, 288, starts, ends, 1.5, args.corr_backoffset)

    # Weekly autocorrelation
    week_autocorr = batch_autocorr(df.values, 288 * 7, starts, ends, 2, args.corr_backoffset)

    # Normalise all the things
    day_autocorr = normalize(np.nan_to_num(day_autocorr))
    week_autocorr = normalize(np.nan_to_num(week_autocorr))

    # Make time-dependent features
    feature_time = np.arange(data_start, features_end + 1) % 288
    day_period = 288 / (2 * np.pi)
    dow_norm = feature_time / day_period
    dow = np.stack([np.cos(dow_norm), np.sin(dow_norm)], axis=-1)

    # Assemble indices for quarterly lagged data
    lagged_ix = np.stack(lag_indexes(data_start, features_end), axis=-1)

    # Assemble final output
    tensors = dict(
        usage=df,
        lagged_ix=lagged_ix,
        vm_ix=df.index.values,
        day_autocorr=day_autocorr,
        week_autocorr=week_autocorr,
        starts=starts,
        ends=ends,
        dow=dow,
    )
    plain = dict(features_time=features_time,
                 data_time=len(df.columns),
                 n_vm=len(df),
                 data_start=data_start,
                 data_end=data_end,
                 features_end=features_end)

    # Store data to the disk
    VarFeeder(args.data_dir, tensors, plain)
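# Both this 288-steps-per-day variant and the weekly variants encode cyclic time as
# a point on the unit circle, so the last slot of a period sits next to the first.
# A standalone check of that property (pure numpy, no project code assumed):
import numpy as np

def circular_encode(t, period):
    angle = t / (period / (2 * np.pi))  # same scaling as day_period/week_period above
    return np.stack([np.cos(angle), np.sin(angle)], axis=-1)

enc = circular_encode(np.arange(288), 288)
# Slot 287 is close to slot 0 (wraparound), while slot 144 is far from slot 0
assert np.linalg.norm(enc[287] - enc[0]) < np.linalg.norm(enc[144] - enc[0])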
def run():
    parser = argparse.ArgumentParser(description='Prepare data')
    parser.add_argument('data_dir')
    parser.add_argument('--valid_threshold', default=0.0, type=float,
                        help="Series minimal length threshold (pct of data length)")
    parser.add_argument('--add_days', default=63, type=int,
                        help="Add N days in a future for prediction")
    parser.add_argument('--start',
                        help="Effective start date. Data before the start is dropped")
    parser.add_argument('--end',
                        help="Effective end date. Data past the end is dropped")
    parser.add_argument('--corr_backoffset', default=0, type=int,
                        help='Offset for correlation calculation')
    args = parser.parse_args()
    # Usage: python make_features.py data/vars --add_days=63

    # Get the data
    df, nans, starts, ends = prepare_data(args.start, args.end, args.valid_threshold)

    # Our working date range: the earliest and latest visit dates in the data
    data_start, data_end = df.columns[0], df.columns[-1]

    # We have to project some date-dependent features (day of week, etc.) to the future dates for prediction
    # e.g. start: 2015-07-01, end: 2017-09-11, features_end: 2017-11-13
    features_end = data_end + pd.Timedelta(args.add_days, unit='D')

    # Group unique pages by agents
    assert df.index.is_monotonic_increasing
    page_map = uniq_page_map(df.index.values)

    # Yearly (annual) autocorrelation
    raw_year_autocorr = batch_autocorr(df.values, 365, starts, ends, 1.5,
                                       args.corr_backoffset)
    # Fraction of series whose autocorrelation is undefined (NaN)
    year_unknown_pct = np.sum(np.isnan(raw_year_autocorr)) / len(raw_year_autocorr)  # type: float

    # Quarterly autocorrelation
    raw_quarter_autocorr = batch_autocorr(df.values, int(round(365.25 / 4)),
                                          starts, ends, 2, args.corr_backoffset)
    quarter_unknown_pct = np.sum(np.isnan(raw_quarter_autocorr)) / len(raw_quarter_autocorr)  # type: float

    print("Percent of undefined autocorr = yearly:%.3f, quarterly:%.3f" %
          (year_unknown_pct, quarter_unknown_pct))

    # Normalise all the things: nan_to_num replaces NaN with 0 and inf with large
    # finite numbers, then the yearly and quarterly autocorrelations are
    # mean/std-standardized
    year_autocorr = normalize(np.nan_to_num(raw_year_autocorr))
    quarter_autocorr = normalize(np.nan_to_num(raw_quarter_autocorr))

    # Calculate and encode page features
    page_features = make_page_features(df.index.values)
    # encoded_page_features one-hot encodes the page's agent and country
    encoded_page_features = encode_page_features(page_features)

    # Make time-dependent features
    features_days = pd.date_range(data_start, features_end)
    week_period = 7 / (2 * np.pi)
    dow_norm = features_days.dayofweek.values / week_period
    dow = np.stack([np.cos(dow_norm), np.sin(dow_norm)], axis=-1)

    # Assemble indices for quarterly lagged data: date indices lagged by
    # 3, 6, 9, 12 months; lags falling before data_start are filled with -1, e.g.
    # for data_start=2015-07-01:
    # lagged_ix = [[ -1  -1  -1  -1]
    #              ...
    #              [773 681 592 500]
    #              [774 682 593 501]]
    lagged_ix = np.stack(lag_indexes(data_start, features_end), axis=-1)

    # The median of each page's hit counts represents that page's popularity;
    # shape (145036,)
    page_popularity = df.median(axis=1)
    page_popularity = (page_popularity - page_popularity.mean()) / page_popularity.std()

    # Put NaNs back
    df[nans] = np.NaN

    # Assemble final output:
    # hits             : pages with too little traffic filtered out, sorted by page name, log-transformed
    # lagged_ix        : lagged date indices, -1 where the lag falls before the earliest date
    # page_map         : each page has up to 4 agents; one row per page name, one column per agent
    # page_ix          : full name of each page (page title, agent, country, etc.)
    # pf_agent         : one-hot encoding of the page's agent
    # pf_country       : one-hot encoding of the page's country
    # pf_site          : encoding of the page's site
    # page_popularity  : median of page hits, used as popularity
    # year_autocorr    : autocorrelation of traffic at a one-year lag
    # quarter_autocorr : autocorrelation of traffic at a one-quarter lag
    # dow              : day of week encoded as sin/cos
    tensors = dict(
        hits=df,
        lagged_ix=lagged_ix,
        page_map=page_map,
        page_ix=df.index.values,
        pf_agent=encoded_page_features['agent'],
        pf_country=encoded_page_features['country'],
        pf_site=encoded_page_features['site'],
        page_popularity=page_popularity,
        year_autocorr=year_autocorr,
        quarter_autocorr=quarter_autocorr,
        dow=dow,
    )
    # data_start    : start date of the traffic data
    # data_end      : end date of the traffic data
    # features_end  : end date of the features, 63 days past data_end
    # features_days : how many days the features span (e.g. 867)
    # data_days     : how many days of data there are (e.g. 805)
    # n_pages       : how many pages there are (e.g. 145036)
    plain = dict(features_days=len(features_days),
                 data_days=len(df.columns),
                 n_pages=len(df),
                 data_start=data_start,
                 data_end=data_end,
                 features_end=features_end)

    # Store data to the disk
    VarFeeder(args.data_dir, tensors, plain)
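# lag_indexes() is used by every variant but only defined inline in extend_inp()
# further down. A sketch consistent with that inline version, here with the
# 3/6/9/12-month offsets the comments above describe:
import numpy as np
import pandas as pd

def lag_indexes(begin, end):
    # Maps each date to the positional index of the date N months earlier;
    # lags that fall before `begin` become -1 (matching the -1 fill noted above).
    date_range = pd.date_range(begin, end)
    base_index = pd.Series(np.arange(0, len(date_range)), index=date_range)

    def lag(offset):
        dates = date_range - offset
        return pd.Series(data=base_index.reindex(dates).fillna(-1).astype(np.int16).values,
                         index=date_range)

    return [lag(pd.DateOffset(months=m)) for m in (3, 6, 9, 12)]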
def run():
    parser = argparse.ArgumentParser(description='Prepare data')
    parser.add_argument('data_dir')
    parser.add_argument('--valid_threshold', default=0.0, type=float,
                        help="Series minimal length threshold (pct of data length)")
    parser.add_argument('--add_days', default=64, type=int,
                        help="Add N days in a future for prediction")
    parser.add_argument('--start',
                        help="Effective start date. Data before the start is dropped")
    parser.add_argument('--end',
                        help="Effective end date. Data past the end is dropped")
    parser.add_argument('--corr_backoffset', default=0, type=int,
                        help='Offset for correlation calculation')
    args = parser.parse_args()

    # Get the data
    df, nans, starts, ends = prepare_data(args.start, args.end, args.valid_threshold)

    # Our working date range
    data_start, data_end = df.columns[0], df.columns[-1]

    # We have to project some date-dependent features (day of week, etc.) to the future dates for prediction
    features_end = data_end + pd.Timedelta(args.add_days, unit='D')
    print(f"start: {data_start}, end: {data_end}, features_end: {features_end}")

    # Group unique pages by agents
    assert df.index.is_monotonic_increasing
    page_map = uniq_page_map(df.index.values)

    # Yearly (annual) autocorrelation of each page; returns one autocorrelation per page
    raw_year_autocorr = batch_autocorr(df.values, 365, starts, ends, 1.5,
                                       args.corr_backoffset)
    year_unknown_pct = np.sum(np.isnan(raw_year_autocorr)) / len(raw_year_autocorr)  # type: float

    # Quarterly autocorrelation of each page; returns one autocorrelation per page
    raw_quarter_autocorr = batch_autocorr(df.values, int(round(365.25 / 4)),
                                          starts, ends, 2, args.corr_backoffset)
    quarter_unknown_pct = np.sum(np.isnan(raw_quarter_autocorr)) / len(raw_quarter_autocorr)  # type: float

    print("Percent of undefined autocorr = yearly:%.3f, quarterly:%.3f" %
          (year_unknown_pct, quarter_unknown_pct))

    # Normalise all the things
    year_autocorr = normalize(np.nan_to_num(raw_year_autocorr))
    quarter_autocorr = normalize(np.nan_to_num(raw_quarter_autocorr))

    # Calculate and encode page features. Only the page names are passed to the
    # function; date columns and hits are not. The features are extracted from
    # the page URLs.
    page_features = make_page_features(df.index.values)
    encoded_page_features = encode_page_features(page_features)

    # Make time-dependent features.
    # features_days is an array of all days in the given range,
    # e.g. ['2015-07-01', '2015-07-02', ..., '2017-10-01']
    features_days = pd.date_range(data_start, features_end)
    week_period = 7 / (2 * np.pi)
    # features_days.dayofweek.values gives the day of the week each date falls on.
    # E.g. '2015-07-01' was a Wednesday, the 2nd day of the (0-indexed, Monday-first)
    # week, so its value is 2.
    dow_norm = features_days.dayofweek.values / week_period
    dow = np.stack([np.cos(dow_norm), np.sin(dow_norm)], axis=-1)

    # Assemble indices for quarterly lagged data
    lagged_ix = np.stack(lag_indexes(data_start, features_end), axis=-1)

    # Median of each page's hits: a Series with one value per page
    page_popularity = df.median(axis=1)
    page_popularity = (page_popularity - page_popularity.mean()) / page_popularity.std()
    median_7 = traffic_median(df, 7)
    median_30 = traffic_median(df, 30)
    median_90 = traffic_median(df, 90)
    median_180 = traffic_median(df, 180)

    # Put NaNs back
    df[nans] = np.NaN

    # Assemble final output
    tensors = dict(
        hits=df,
        lagged_ix=lagged_ix,
        page_map=page_map,
        page_ix=df.index.values,
        pf_agent=encoded_page_features['agent'],
        pf_country=encoded_page_features['country'],
        pf_site=encoded_page_features['site'],
        page_popularity=page_popularity,
        median_7=median_7,
        median_30=median_30,
        median_90=median_90,
        median_180=median_180,
        year_autocorr=year_autocorr,
        quarter_autocorr=quarter_autocorr,
        dow=dow,
    )
    plain = dict(features_days=len(features_days),
                 data_days=len(df.columns),
                 n_pages=len(df),
                 data_start=data_start,
                 data_end=data_end,
                 features_end=features_end)

    # Store data to the disk
    VarFeeder(args.data_dir, tensors, plain)
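# traffic_median() is not defined in these snippets. A plausible, purely hypothetical
# reading based only on its name and call sites: the per-page median over the trailing
# N days, standardized the same way page_popularity is above.
import numpy as np
import pandas as pd

def traffic_median(df: pd.DataFrame, days: int) -> pd.Series:
    med = df.iloc[:, -days:].median(axis=1)  # median of the last `days` columns
    return (med - med.mean()) / med.std()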
eval_pct = 0.1
batch_size = batch_size
train_window = train_window
train_completeness_threshold = 1.0
predict_window = 63
verbose = False
train_skip_first = 0
tf.reset_default_graph()
forward_split = False
train_sampling = 1.0
if seed:
    tf.set_random_seed(seed)
sess = tf.Session()
with tf.device("/cpu:0"):
    inp = VarFeeder.read_vars("data/vars")
    # Restore the variables
    inp.restore(sess)
# splitter = FakeSplitter(page_features(inp), 3, seed=seed, test_sampling=eval_sampling)
splitter = Splitter(page_features(inp), inp.page_map, 3, train_sampling=train_sampling,
                    test_sampling=eval_sampling, seed=seed)
real_train_pages = splitter.splits[0].train_size
real_eval_pages = splitter.splits[0].test_size  # 14503.6
items_per_eval = real_eval_pages * eval_pct  # 30
def run(city_path='/nfs/isolation_project/intern/project/lihaocheng/city_forcast/city_day_features_to_yesterday.gbk.csv',
        weafor_path='/nfs/isolation_project/intern/project/lihaocheng/city_forcast/weather_forecast.csv',
        datadir='data',
        city_list=None,
        **args):
    start_time = time.time()

    # Get the data
    [train_x, train_embed_weekday, train_embed_month, train_embed_city, train_real_city, train_y_origin], \
    [val_x, val_embed_weekday, val_embed_month, val_embed_city, val_real_city, val_y_origin], \
    [infer_x, infer_embed_weekday, infer_embed_month, infer_embed_city, infer_city_map, infer_y_origin], \
    city_map, city_max, city_min, train_mean, train_std = read_all(city_list, city_path, weafor_path)
    log.debug("complete generating df_cpu_max and df_cpu_num, time elapsed = %s",
              time.time() - start_time)

    train_total = pd.concat([train_x, train_y_origin], axis=1)
    train_features = [pd.DataFrame() for city in city_list]
    train_y = [pd.DataFrame() for city in city_list]
    y_mean = list()
    y_std = list()
    month_autocorr = dict()
    week_autocorr = dict()

    # Make train features
    attrs = ['online_time', 'total_finish_order_cnt', 'total_gmv',
             'strive_order_cnt', 'total_no_call_order_cnt']
    dfs = [pd.DataFrame() for i in range(5)]
    for i, city in enumerate(city_list):
        for idx, attr in enumerate(attrs):
            per_city = train_total[train_total['city_id'] == city]
            train_features[i] = per_city[train_x.columns].values
            train_y[i] = per_city[train_y_origin.columns].apply(lambda x: np.log(x + 1))
            series = train_y[i].loc[:, [attr]]
            series.columns = [city]
            series = series.reset_index(drop=True)
            dfs[idx] = pd.concat([dfs[idx], series], axis=1)

    for idx, attr in enumerate(attrs):
        df = dfs[idx].T
        # Monthly autocorrelation
        month = batch_autocorr(df.values, 30)
        # Weekly autocorrelation
        week = batch_autocorr(df.values, 7)
        # Normalise all the things
        month_autocorr[attr] = normalize(np.nan_to_num(month))
        week_autocorr[attr] = normalize(np.nan_to_num(week))

    # Find train_y mean & std
    for i, per_city in enumerate(train_y):
        y_mean.append(per_city.mean())
        y_std.append(per_city.std())

    # Make val features
    val_total = pd.concat([val_x, val_y_origin], axis=1)
    val_features = list()
    val_y = list()
    for city in city_list:
        per_city = val_total[val_total['city_id'] == city]
        val_features.append(per_city[val_x.columns].values)
        val_y.append(per_city[val_y_origin.columns].apply(lambda x: np.log(x + 1)))

    # Make infer features
    infer_x = infer_x.drop(['city_id'], axis=1)
    infer_total = pd.concat([infer_x, infer_y_origin], axis=1)
    infer_features = list()
    infer_y = list()
    for city in city_list:
        per_city = infer_total[infer_total['city_id'] == city]
        infer_features.append(per_city[infer_x.columns].values)
        infer_y.append(per_city[infer_y_origin.columns].apply(lambda x: np.log(x + 1)))

    # Make time-dependent features
    time_period = 7 / (2 * np.pi)
    train_dow_norm = train_embed_weekday / time_period
    val_dow_norm = val_embed_weekday / time_period
    infer_dow_norm = infer_embed_weekday / time_period
    time_period = 12 / (2 * np.pi)
    train_dom_norm = train_embed_month / time_period
    val_dom_norm = val_embed_month / time_period
    infer_dom_norm = infer_embed_month / time_period
    train_dow = np.stack([np.cos(train_dow_norm), np.sin(train_dow_norm),
                          np.cos(train_dom_norm), np.sin(train_dom_norm)], axis=-1)
    val_dow = np.stack([np.cos(val_dow_norm), np.sin(val_dow_norm),
                        np.cos(val_dom_norm), np.sin(val_dom_norm)], axis=-1)
    infer_dow = np.stack([np.cos(infer_dow_norm), np.sin(infer_dow_norm),
                          np.cos(infer_dom_norm), np.sin(infer_dom_norm)], axis=-1)

    # Assemble final output
    tensors = dict(train_x=train_features,
                   val_x=val_features,
                   infer_x=infer_features,
                   train_dow=train_dow,
                   val_dow=val_dow,
                   infer_dow=infer_dow,
                   train_y=[df['total_no_call_order_cnt'] for df in train_y],
                   val_y=[df['total_no_call_order_cnt'] for df in val_y],
                   infer_y=[df['total_no_call_order_cnt'] for df in infer_y],
                   train_time=train_features[0].shape[0],
                   val_time=val_features[0].shape[0],
                   infer_time=infer_features[0].shape[0],
                   month_autocorr=month_autocorr['total_no_call_order_cnt'],
                   week_autocorr=week_autocorr['total_no_call_order_cnt'],
                   cities=np.array([city_map[city] for city in city_list]),
                   mean=[per['total_no_call_order_cnt'] for per in y_mean],
                   std=[per['total_no_call_order_cnt'] for per in y_std])
    plain = dict()

    # Store data to the disk
    VarFeeder(os.path.join(datadir, 'vars'), tensors, plain)
    with open(os.path.join(datadir, 'city_map.pickle'), 'wb') as handle:
        pkl.dump(city_map, handle, protocol=pkl.HIGHEST_PROTOCOL)
    infer_y_origin.to_pickle(os.path.join(datadir, 'infer_y.pickle'))
def main(_):
    if not FLAGS.server:
        print('please specify server host:port')
        return
    channel = grpc.insecure_channel(FLAGS.server)
    stub = prediction_service_pb2_grpc.PredictionServiceStub(channel)
    request = predict_pb2.PredictRequest()
    request.model_spec.name = "ucdoc"
    request.model_spec.signature_name = "serving_default"

    with tf.variable_scope('input') as inp_scope:
        with tf.device("/cpu:0"):
            inp = VarFeeder.read_vars("data/vars")
            pipe = InputPipe(inp, ucdoc_features(inp), inp.n_pages,
                             mode=ModelMode.PREDICT, batch_size=FLAGS.batch_size,
                             n_epoch=1, verbose=FLAGS.verbose,
                             train_completeness_threshold=0.01,
                             predict_window=FLAGS.predict_window,
                             predict_completeness_threshold=0.0,
                             train_window=FLAGS.train_window,
                             back_offset=FLAGS.predict_window + 1)

    with tf.Session(config=tf.ConfigProto(
            gpu_options=tf.GPUOptions(allow_growth=True))) as sess:
        pipe.load_vars(sess)
        pipe.init_iterator(sess)
        while True:
            try:
                truex, timex, normx, laggedx, truey, timey, normy, normmean, \
                    normstd, pgfeatures, pageix = sess.run(
                        [pipe.true_x, pipe.time_x, pipe.norm_x, pipe.lagged_x,
                         pipe.true_y, pipe.time_y, pipe.norm_y, pipe.norm_mean,
                         pipe.norm_std, pipe.ucdoc_features, pipe.page_ix])
                request.inputs["truex"].CopyFrom(tf.make_tensor_proto(truex))
                request.inputs["timex"].CopyFrom(tf.make_tensor_proto(timex))
                request.inputs["normx"].CopyFrom(tf.make_tensor_proto(normx))
                request.inputs["laggedx"].CopyFrom(tf.make_tensor_proto(laggedx))
                request.inputs["truey"].CopyFrom(tf.make_tensor_proto(truey))
                request.inputs["timey"].CopyFrom(tf.make_tensor_proto(timey))
                request.inputs["normy"].CopyFrom(tf.make_tensor_proto(normy))
                request.inputs["normmean"].CopyFrom(tf.make_tensor_proto(normmean))
                request.inputs["normstd"].CopyFrom(tf.make_tensor_proto(normstd))
                request.inputs["page_features"].CopyFrom(tf.make_tensor_proto(pgfeatures))
                request.inputs["pageix"].CopyFrom(tf.make_tensor_proto(pageix))
                response = stub.Predict(request, 10)  # 10-second timeout
                tensor_proto = response.outputs['pred']
                if 'pred_result' not in locals():
                    pred_result = tf.contrib.util.make_ndarray(tensor_proto)
                else:
                    pred_result = np.concatenate(
                        [pred_result, tf.contrib.util.make_ndarray(tensor_proto)])
            except tf.errors.OutOfRangeError:
                print('done with prediction')
                break

    pred_result = np.expm1(pred_result) + 0.5
    pred_result = pred_result.astype(int)
    if not os.path.exists(FLAGS.result_dir):
        os.mkdir(FLAGS.result_dir)
    result_file = os.path.join(FLAGS.result_dir, "predict.pkl")
    pickle.dump(pred_result, open(result_file, "wb"))
    print('finished prediction')
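# The client above packs numpy arrays into TensorProtos for the request and unpacks
# the response the same way. A minimal round trip with the TF 1.x helpers it uses:
import numpy as np
import tensorflow as tf

arr = np.arange(6, dtype=np.float32).reshape(2, 3)
proto = tf.make_tensor_proto(arr)
assert np.array_equal(tf.contrib.util.make_ndarray(proto), arr)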
def run_local():
    data_dir = 'data/vars'
    add_days = 63
    valid_threshold = 0.0
    corr_backoffset = 0
    start = None
    end = None

    # Get the data. df holds log1p-transformed floats; 145036 series remain after
    # filtering out 27 that were too short.
    df, nans, starts, ends = prepare_data(start, end, valid_threshold)

    # Our working date range
    data_start, data_end = df.columns[0], df.columns[-1]

    # We have to project some date-dependent features (day of week, etc.) to the future dates for prediction
    features_end = data_end + pd.Timedelta(add_days, unit='D')
    print(f"start: {data_start}, end: {data_end}, features_end: {features_end}")

    # Group unique pages by agents: each unique page maps to at most four agents;
    # slots 0-3 hold the page's original row number.
    assert df.index.is_monotonic_increasing
    page_map = uniq_page_map(df.index.values)

    # Yearly (annual) autocorrelation: correlation at a 365-day lag,
    # np.nan for series that don't qualify
    raw_year_autocorr = batch_autocorr(df.values, 365, starts, ends, 1.5, corr_backoffset)
    year_unknown_pct = np.sum(np.isnan(raw_year_autocorr)) / len(raw_year_autocorr)  # type: float

    # Quarterly autocorrelation
    raw_quarter_autocorr = batch_autocorr(df.values, int(round(365.25 / 4)),
                                          starts, ends, 2, corr_backoffset)
    quarter_unknown_pct = np.sum(np.isnan(raw_quarter_autocorr)) / len(raw_quarter_autocorr)  # type: float

    print("Percent of undefined autocorr = yearly:%.3f, quarterly:%.3f" %
          (year_unknown_pct, quarter_unknown_pct))

    # Normalise all the things
    year_autocorr = normalize(np.nan_to_num(raw_year_autocorr))        # roughly -3 .. +3
    quarter_autocorr = normalize(np.nan_to_num(raw_quarter_autocorr))  # roughly -4.5 .. +4.5

    # Calculate and encode page features: the page URLs are unchanged; three
    # features are extracted from them: agent, country, site
    page_features = make_page_features(df.index.values)
    encoded_page_features = encode_page_features(page_features)  # 10555 + 127181 + 7300 = 145036

    # Make time-dependent features (867 days)
    features_days = pd.date_range(data_start, features_end)
    week_period = 7 / (2 * np.pi)
    dow_norm = features_days.dayofweek.values / week_period
    dow = np.stack([np.cos(dow_norm), np.sin(dow_norm)], axis=-1)

    # Assemble indices for quarterly lagged data
    lagged_ix = np.stack(lag_indexes(data_start, features_end), axis=-1)

    # One column: the per-page median
    page_popularity = df.median(axis=1)
    page_popularity = (page_popularity - page_popularity.mean()) / page_popularity.std()

    # Put the missing values back as np.NaN
    df[nans] = np.NaN

    # Assemble final output
    tensors = dict(
        hits=df,
        lagged_ix=lagged_ix,
        page_map=page_map,
        page_ix=df.index.values,
        pf_agent=encoded_page_features['agent'],
        pf_country=encoded_page_features['country'],
        pf_site=encoded_page_features['site'],
        page_popularity=page_popularity,
        year_autocorr=year_autocorr,
        quarter_autocorr=quarter_autocorr,
        dow=dow,
    )
    plain = dict(features_days=len(features_days),
                 data_days=len(df.columns),
                 n_pages=len(df),
                 data_start=data_start,
                 data_end=data_end,
                 features_end=features_end)

    # Store data to the disk
    VarFeeder(data_dir, tensors, plain)
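# uniq_page_map() is external; the comment above describes its output: one row per
# unique page, up to four agent variants per page, each cell holding that variant's
# original row number (or -1). A sketch under those assumptions; the name-splitting
# rule and in-order slot assignment are illustrative guesses, not the project's
# actual logic.
import numpy as np

def uniq_page_map(page_ix, n_agents=4):
    result = np.full([len(page_ix), n_agents], -1, dtype=np.int32)
    page_rows, next_slot = {}, {}
    for row, full_name in enumerate(page_ix):
        base = full_name.rsplit('_', 2)[0]  # drop an assumed 'access_agent' suffix
        group = page_rows.setdefault(base, len(page_rows))
        slot = next_slot.get(base, 0)  # assign variants to slots in order seen
        if slot < n_agents:
            result[group, slot] = row
            next_slot[base] = slot + 1
    return result[:len(page_rows)]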
def extend_inp(data_path, predict_window, holiday_list):
    with tf.variable_scope('input', reuse=tf.AUTO_REUSE) as inp_scope:
        with tf.device("/cpu:0"):
            inp = VarFeeder.read_vars(data_path)
            yesterday = pd.to_datetime(inp.data_end) + pd.Timedelta(predict_window, 'D')
            yesterday = yesterday.date().strftime('%Y-%m-%d')
            day = datetime.strptime(yesterday, '%Y-%m-%d')
            day_list = []
            for _ in range(0, inp.features_days + predict_window):
                day_list.append(datetime.strftime(day, '%Y-%m-%d'))
                day = day + timedelta(days=-1)
            day_list.sort()

            # Compute lagged_ix
            date_range = pd.date_range(day_list[0], day_list[-1])
            base_index = pd.Series(np.arange(0, len(date_range)), index=date_range)

            def lag(offset):
                dates = date_range - offset
                return pd.Series(data=base_index[dates].fillna(-1).astype(np.int16).values,
                                 index=date_range)

            lagged_ix = np.stack([lag(pd.DateOffset(months=m)) for m in (1, 2)], axis=-1)

    with tf.Session() as sess:
        inp.restore(sess)
        ts_log, page_ix_, pf_age_, pf_si_, pf_network_, pf_price_cat_, \
            pf_gender_, page_popularity_, quarter_autocorr_ = \
            sess.run([inp.hits, inp.page_ix, inp.pf_age, inp.pf_si,
                      inp.pf_network, inp.pf_price_cat, inp.pf_gender,
                      inp.page_popularity, inp.quarter_autocorr])

    print(f'start: {inp.data_start}\tend: {inp.data_end}\tlength: {inp.features_days}')
    df_ts = pd.DataFrame(np.append(ts_log, np.zeros((len(page_ix_), predict_window)), axis=1),
                         index=list(page_ix_), columns=day_list)
    df_age = pd.DataFrame(pf_age_, index=list(page_ix_))
    df_si = pd.DataFrame(pf_si_, index=list(page_ix_))
    df_network = pd.DataFrame(pf_network_, index=list(page_ix_))
    df_price_cat = pd.DataFrame(pf_price_cat_, index=list(page_ix_))
    df_gender = pd.DataFrame(pf_gender_, index=list(page_ix_))

    def get_dow(day_list):
        dow_list = []
        for day in day_list:
            dow = datetime.strptime(day, '%Y-%m-%d').weekday()
            dow_list.append(dow)
        week_period = 7.0 / (2 * math.pi)
        sin_list = [math.sin(x / week_period) for x in dow_list]
        cos_list = [math.cos(x / week_period) for x in dow_list]
        return (sin_list, cos_list)

    dow_ = get_dow(day_list)

    # holiday_list = cfg['pipeline']['normalization']['holidays']
    holidays = [1 if _ in holiday_list else 0 for _ in day_list]
    a_list = []
    b_list = []
    for _ in holidays:
        a, b = math.sin(_), math.cos(_)
        a_list.append(a)
        b_list.append(b)
    holiday = (a_list, b_list)

    tensors = dict(
        hits=df_ts,
        lagged_ix=lagged_ix,
        page_ix=list(page_ix_),
        pf_age=df_age,
        pf_si=df_si,
        pf_network=df_network,
        pf_price_cat=df_price_cat,
        pf_gender=df_gender,
        page_popularity=page_popularity_,
        quarter_autocorr=quarter_autocorr_,
        dow=pd.DataFrame(dow_).T,
        holiday=pd.DataFrame(holiday).T)
    batch_size = len(page_ix_)
    data_len = tensors['hits'].shape[1]
    plain = dict(data_days=data_len,
                 features_days=data_len,
                 data_start=day_list[0],
                 data_end=day_list[-1],
                 n_pages=batch_size)

    dump_path = os.path.join(data_path, 'predict_future')
    if not os.path.exists(dump_path):
        os.mkdir(dump_path)
    VarFeeder(dump_path, tensors, plain)
    tf.reset_default_graph()
def main(_):
    if len(sys.argv) < 3:
        print('Usage: saved_model.py [--model_version=y] --data_dir=xxx '
              '--ckpt_dir=xxx --saved_dir=xxx')
        sys.exit(-1)
    if FLAGS.training_iteration <= 0:
        print('Please specify a positive value for training iteration.')
        sys.exit(-1)
    if FLAGS.model_version <= 0:
        print('Please specify a positive value for version number.')
        sys.exit(-1)

    with open(FLAGS.config_file, 'r') as ymlfile:
        cfg = yaml.load(ymlfile)
    holiday_list = cfg['pipeline']['normalization']['holidays']
    if FLAGS.back_offset < FLAGS.predict_window:
        extend_inp(FLAGS.data_dir, FLAGS.predict_window, holiday_list)

    # Create the deploy model first
    back_offset_ = FLAGS.back_offset
    with tf.variable_scope('input') as inp_scope:
        with tf.device("/cpu:0"):
            if FLAGS.back_offset < FLAGS.predict_window:
                inp = VarFeeder.read_vars(os.path.join(FLAGS.data_dir, 'predict_future'))
                back_offset_ += FLAGS.predict_window
            else:
                inp = VarFeeder.read_vars(FLAGS.data_dir)
            pipe = InputPipe(inp, ucdoc_features(inp), inp.hits.shape[0],
                             mode=ModelMode.PREDICT, batch_size=FLAGS.batch_size,
                             n_epoch=1, verbose=False,
                             train_completeness_threshold=0.01,
                             predict_window=FLAGS.predict_window,
                             predict_completeness_threshold=0.0,
                             train_window=FLAGS.train_window,
                             back_offset=back_offset_)
    asgd_decay = 0.99 if FLAGS.asgd else None

    if FLAGS.n_models == 1:
        model = Model(pipe, build_from_set(FLAGS.hparam_set), is_train=False,
                      seed=1, asgd_decay=asgd_decay)
    else:
        models = []
        for i in range(FLAGS.n_models):
            prefix = f"m_{i}"
            with tf.variable_scope(prefix) as scope:
                models.append(
                    Model(pipe, build_from_set(FLAGS.hparam_set), is_train=False,
                          seed=1, asgd_decay=asgd_decay, graph_prefix=prefix))
        model = models[FLAGS.target_model]

    if FLAGS.asgd:
        var_list = model.ema.variables_to_restore()
        if FLAGS.n_models > 1:
            prefix = f"m_{FLAGS.target_model}"
            for var in list(var_list.keys()):
                if var.endswith('ExponentialMovingAverage') and not var.startswith(prefix):
                    del var_list[var]
    else:
        var_list = None

    # Load checkpoint model from training
    print('loading checkpoint model...')
    ckpt_file = tf.train.latest_checkpoint(FLAGS.ckpt_dir)
    graph = model.predictions.graph
    init = tf.global_variables_initializer()
    saver = tf.train.Saver(name='deploy_saver', var_list=var_list)
    with tf.Session(config=tf.ConfigProto(
            gpu_options=tf.GPUOptions(allow_growth=True))) as sess:
        sess.run(init)
        pipe.load_vars(sess)
        pipe.init_iterator(sess)
        saver.restore(sess, ckpt_file)
        print('Done loading checkpoint model')

        export_path_base = FLAGS.saved_dir
        export_path = os.path.join(
            tf.compat.as_bytes(export_path_base),
            tf.compat.as_bytes(str(FLAGS.model_version)))
        print('Exporting trained model to', export_path)
        if os.path.isdir(export_path):
            shutil.rmtree(export_path)
        builder = tf.saved_model.builder.SavedModelBuilder(export_path)

        true_x = tf.saved_model.utils.build_tensor_info(model.inp.true_x)
        time_x = tf.saved_model.utils.build_tensor_info(model.inp.time_x)
        norm_x = tf.saved_model.utils.build_tensor_info(model.inp.norm_x)
        lagged_x = tf.saved_model.utils.build_tensor_info(model.inp.lagged_x)
        true_y = tf.saved_model.utils.build_tensor_info(model.inp.true_y)
        time_y = tf.saved_model.utils.build_tensor_info(model.inp.time_y)
        norm_y = tf.saved_model.utils.build_tensor_info(model.inp.norm_y)
        norm_mean = tf.saved_model.utils.build_tensor_info(model.inp.norm_mean)
        norm_std = tf.saved_model.utils.build_tensor_info(model.inp.norm_std)
        pg_features = tf.saved_model.utils.build_tensor_info(model.inp.ucdoc_features)
        page_ix = tf.saved_model.utils.build_tensor_info(model.inp.page_ix)
        pred = tf.saved_model.utils.build_tensor_info(model.predictions)

        labeling_signature = tf.saved_model.signature_def_utils.build_signature_def(
            inputs={
                "truex": true_x,
                "timex": time_x,
                "normx": norm_x,
                "laggedx": lagged_x,
                "truey": true_y,
                "timey": time_y,
                "normy": norm_y,
                "normmean": norm_mean,
                "normstd": norm_std,
                "page_features": pg_features,
                "pageix": page_ix,
            },
            outputs={"predictions": pred},
            method_name="tensorflow/serving/predict")

        builder.add_meta_graph_and_variables(
            sess, [tf.saved_model.tag_constants.SERVING],
            signature_def_map={
                tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
                    labeling_signature
            },
            main_op=tf.tables_initializer(),
            strip_default_attrs=True)
        builder.save()
        print("Build Done")
def run():
    parser = argparse.ArgumentParser(description='Prepare data')
    parser.add_argument('data_dir')
    parser.add_argument('--valid_threshold', default=0.0, type=float,
                        help="Series minimal length threshold (pct of data length)")
    parser.add_argument('--add_days', default=64, type=int,
                        help="Add N days in a future for prediction")
    parser.add_argument('--start',
                        help="Effective start date. Data before the start is dropped")
    parser.add_argument('--end',
                        help="Effective end date. Data past the end is dropped")
    parser.add_argument('--corr_backoffset', default=0, type=int,
                        help='Offset for correlation calculation')
    args = parser.parse_args()

    # Get the data
    df, nans, starts, ends = prepare_data(args.start, args.end, args.valid_threshold)

    # Our working date range
    data_start, data_end = df.columns[0], df.columns[-1]

    # We have to project some date-dependent features (day of week, etc.) to the future dates for prediction
    features_end = data_end + pd.Timedelta(args.add_days, unit='D')
    print(f"start: {data_start}, end: {data_end}, features_end: {features_end}")

    # Group unique pages by agents
    assert df.index.is_monotonic_increasing
    page_map = uniq_page_map(df.index.values)

    # Yearly (annual) autocorrelation
    raw_year_autocorr = batch_autocorr(df.values, 365, starts, ends, 1.5,
                                       args.corr_backoffset)
    year_unknown_pct = np.sum(np.isnan(raw_year_autocorr)) / len(raw_year_autocorr)  # type: float

    # Quarterly autocorrelation
    raw_quarter_autocorr = batch_autocorr(df.values, int(round(365.25 / 4)),
                                          starts, ends, 2, args.corr_backoffset)
    quarter_unknown_pct = np.sum(np.isnan(raw_quarter_autocorr)) / len(raw_quarter_autocorr)  # type: float

    print("Percent of undefined autocorr = yearly:%.3f, quarterly:%.3f" %
          (year_unknown_pct, quarter_unknown_pct))

    # Normalise all the things
    year_autocorr = normalize(np.nan_to_num(raw_year_autocorr))
    quarter_autocorr = normalize(np.nan_to_num(raw_quarter_autocorr))

    # Calculate and encode page features
    page_features = make_page_features(df.index.values)
    encoded_page_features = encode_page_features(page_features)

    # Make time-dependent features
    features_days = pd.date_range(data_start, features_end)
    week_period = 7 / (2 * np.pi)
    dow_norm = features_days.dayofweek.values / week_period
    dow = np.stack([np.cos(dow_norm), np.sin(dow_norm)], axis=-1)

    # Assemble indices for quarterly lagged data
    lagged_ix = np.stack(lag_indexes(data_start, features_end), axis=-1)

    page_popularity = df.median(axis=1)
    page_popularity = (page_popularity - page_popularity.mean()) / page_popularity.std()

    # Put NaNs back
    df[nans] = np.NaN

    # Assemble final output
    tensors = dict(
        lagged_ix=lagged_ix,
        page_map=page_map,
        dow=dow,
        hits=df,
        pf_agent=encoded_page_features['agent'],
        pf_country=encoded_page_features['country'],
        pf_site=encoded_page_features['site'],
        page_ix=df.index.values,
        page_popularity=page_popularity,
        year_autocorr=year_autocorr,
        quarter_autocorr=quarter_autocorr,
    )
    plain = dict(features_days=len(features_days),
                 data_days=len(df.columns),
                 n_pages=len(df),
                 data_start=data_start,
                 data_end=data_end,
                 features_end=features_end)

    # Store data to the disk
    VarFeeder(args.data_dir, tensors, plain)
def predict(checkpoints, hparams, datadir="data", verbose=False, n_models=1,
            target_model=0, asgd=False, seed=1, batch_size=50):
    with tf.variable_scope('input') as inp_scope:
        with tf.device("/cpu:0"):
            inp = VarFeeder.read_vars(os.path.join(datadir, "vars"))
            pipe = InputPipe(datadir, inp, infer_features(inp),
                             mode=ModelMode.PREDICT, batch_size=batch_size,
                             n_epoch=1, verbose=verbose,
                             train_completeness_threshold=0.01,
                             train_window=hparams.train_window)
    asgd_decay = 0.99 if asgd else None
    if n_models == 1:
        model = Model(pipe, hparams, is_train=False, seed=seed, asgd_decay=asgd_decay)
    else:
        models = []
        for i in range(n_models):
            prefix = f"m_{i}"
            with tf.variable_scope(prefix) as scope:
                models.append(Model(pipe, hparams, is_train=False, seed=seed,
                                    asgd_decay=asgd_decay, graph_prefix=prefix))
        model = models[target_model]

    if asgd:
        var_list = model.ema.variables_to_restore()
        prefix = f"m_{target_model}"
        for var in list(var_list.keys()):
            if var.endswith('ExponentialMovingAverage') and not var.startswith(prefix):
                del var_list[var]
    else:
        var_list = None
    saver = tf.train.Saver(name='eval_saver', var_list=var_list)

    x_buffer = []
    predictions = None
    with tf.Session(config=tf.ConfigProto(
            gpu_options=tf.GPUOptions(allow_growth=True))) as sess:
        pipe.load_vars(sess)
        for checkpoint in checkpoints:
            pred_buffer = []
            pipe.init_iterator(sess)
            saver.restore(sess, checkpoint)
            cnt = 0
            while True:
                try:
                    pred, pname = sess.run([model.prediction, model.inp.vm_ix])
                    # utf_names = [str(name, 'utf-8') for name in pname]
                    utf_names = pname
                    pred_df = pd.DataFrame(index=utf_names, data=np.expm1(pred) - 1)
                    pred_buffer.append(pred_df)
                    newline = cnt % 80 == 0
                    if cnt > 0:
                        print('.', end='\n' if newline else '', flush=True)
                    if newline:
                        print(cnt, end='')
                    cnt += 1
                except tf.errors.OutOfRangeError:
                    print('Done!')
                    break
            cp_predictions = pd.concat(pred_buffer)
            if predictions is None:
                predictions = cp_predictions
            else:
                predictions += cp_predictions
    predictions /= len(checkpoints)
    return predictions.iloc[:, -1]
def train(name, hparams, multi_gpu=False, n_models=1, train_completeness_threshold=0.01, seed=None,
          logdir='data/logs', max_epoch=100, patience=2, train_sampling=1.0,
          eval_sampling=1.0, eval_memsize=5, gpu=0, gpu_allow_growth=False, save_best_model=False,
          forward_split=False, write_summaries=False, verbose=False, asgd_decay=None, tqdm=True,
          side_split=True, max_steps=None, save_from_step=None, do_eval=True, predict_window=63):
    eval_k = int(round(26214 * eval_memsize / n_models))
    eval_batch_size = int(eval_k / (hparams.rnn_depth * hparams.encoder_rnn_layers))  # 128 -> 1024, 256 -> 512, 512 -> 256
    eval_pct = 0.1
    batch_size = hparams.batch_size
    train_window = hparams.train_window
    tf.reset_default_graph()
    if seed:
        tf.set_random_seed(seed)

    with tf.device("/cpu:0"):
        inp = VarFeeder.read_vars("data/vars")
        if side_split:
            splitter = Splitter(page_features(inp), inp.page_map, 3, train_sampling=train_sampling,
                                test_sampling=eval_sampling, seed=seed)
        else:
            splitter = FakeSplitter(page_features(inp), 3, seed=seed, test_sampling=eval_sampling)

    real_train_pages = splitter.splits[0].train_size
    real_eval_pages = splitter.splits[0].test_size

    items_per_eval = real_eval_pages * eval_pct
    eval_batches = int(np.ceil(items_per_eval / eval_batch_size))
    steps_per_epoch = real_train_pages // batch_size
    eval_every_step = int(round(steps_per_epoch * eval_pct))
    # eval_every_step = int(round(items_per_eval * train_sampling / batch_size))

    global_step = tf.train.get_or_create_global_step()
    inc_step = tf.assign_add(global_step, 1)

    all_models: List[ModelTrainerV2] = []

    def create_model(scope, index, prefix, seed):
        with tf.variable_scope('input') as inp_scope:
            with tf.device("/cpu:0"):
                split = splitter.splits[index]
                pipe = InputPipe(inp, features=split.train_set, n_pages=split.train_size,
                                 mode=ModelMode.TRAIN, batch_size=batch_size, n_epoch=None, verbose=verbose,
                                 train_completeness_threshold=train_completeness_threshold,
                                 predict_completeness_threshold=train_completeness_threshold,
                                 train_window=train_window, predict_window=predict_window,
                                 rand_seed=seed, train_skip_first=hparams.train_skip_first,
                                 back_offset=predict_window if forward_split else 0)
                inp_scope.reuse_variables()
                if side_split:
                    side_eval_pipe = InputPipe(inp, features=split.test_set, n_pages=split.test_size,
                                               mode=ModelMode.EVAL, batch_size=eval_batch_size, n_epoch=None,
                                               verbose=verbose, predict_window=predict_window,
                                               train_completeness_threshold=0.01, predict_completeness_threshold=0,
                                               train_window=train_window, rand_seed=seed, runs_in_burst=eval_batches,
                                               back_offset=predict_window * (2 if forward_split else 1))
                else:
                    side_eval_pipe = None
                if forward_split:
                    forward_eval_pipe = InputPipe(inp, features=split.test_set, n_pages=split.test_size,
                                                  mode=ModelMode.EVAL, batch_size=eval_batch_size, n_epoch=None,
                                                  verbose=verbose, predict_window=predict_window,
                                                  train_completeness_threshold=0.01, predict_completeness_threshold=0,
                                                  train_window=train_window, rand_seed=seed, runs_in_burst=eval_batches,
                                                  back_offset=predict_window)
                else:
                    forward_eval_pipe = None
        avg_sgd = asgd_decay is not None
        #asgd_decay = 0.99 if avg_sgd else None
        train_model = Model(pipe, hparams, is_train=True, graph_prefix=prefix, asgd_decay=asgd_decay, seed=seed)
        scope.reuse_variables()

        eval_stages = []
        if side_split:
            side_eval_model = Model(side_eval_pipe, hparams, is_train=False,
                                    #loss_mask=np.concatenate([np.zeros(50, dtype=np.float32), np.ones(10, dtype=np.float32)]),
                                    seed=seed)
            eval_stages.append((Stage.EVAL_SIDE, side_eval_model))
            if avg_sgd:
                eval_stages.append((Stage.EVAL_SIDE_EMA, side_eval_model))
        if forward_split:
            forward_eval_model = Model(forward_eval_pipe, hparams, is_train=False, seed=seed)
            eval_stages.append((Stage.EVAL_FRWD, forward_eval_model))
            if avg_sgd:
                eval_stages.append((Stage.EVAL_FRWD_EMA, forward_eval_model))

        if write_summaries:
            summ_path = f"{logdir}/{name}_{index}"
            if os.path.exists(summ_path):
                shutil.rmtree(summ_path)
            summ_writer = tf.summary.FileWriter(summ_path)  # , graph=tf.get_default_graph()
        else:
            summ_writer = None

        if do_eval and forward_split:
            stop_metric = lambda metrics: metrics[Stage.EVAL_FRWD]['SMAPE'].avg_epoch
        else:
            stop_metric = None

        return ModelTrainerV2(train_model, eval_stages, index, patience=patience,
                              stop_metric=stop_metric, summary_writer=summ_writer)

    if n_models == 1:
        with tf.device(f"/gpu:{gpu}"):
            scope = tf.get_variable_scope()
            all_models = [create_model(scope, 0, None, seed=seed)]
    else:
        for i in range(n_models):
            device = f"/gpu:{i}" if multi_gpu else f"/gpu:{gpu}"
            with tf.device(device):
                prefix = f"m_{i}"
                with tf.variable_scope(prefix) as scope:
                    all_models.append(create_model(scope, i, prefix=prefix, seed=seed + i))
    trainer = MultiModelTrainer(all_models, inc_step)
    if save_best_model or save_from_step:
        saver_path = f'data/cpt/{name}'
        if os.path.exists(saver_path):
            shutil.rmtree(saver_path)
        os.makedirs(saver_path)
        saver = tf.train.Saver(max_to_keep=10, name='train_saver')
    else:
        saver = None

    avg_sgd = asgd_decay is not None
    if avg_sgd:
        from itertools import chain

        def ema_vars(model):
            ema = model.train_model.ema
            return {ema.average_name(v): v for v in model.train_model.ema._averages}

        ema_names = dict(chain(*[ema_vars(model).items() for model in all_models]))
        #ema_names = all_models[0].train_model.ema.variables_to_restore()
        ema_loader = tf.train.Saver(var_list=ema_names, max_to_keep=1, name='ema_loader')
        ema_saver = tf.train.Saver(max_to_keep=1, name='ema_saver')
    else:
        ema_loader = None

    init = tf.global_variables_initializer()

    if forward_split and do_eval:
        eval_smape = trainer.metric(Stage.EVAL_FRWD, 'SMAPE')
        eval_mae = trainer.metric(Stage.EVAL_FRWD, 'MAE')
    else:
        eval_smape = DummyMetric()
        eval_mae = DummyMetric()

    if side_split and do_eval:
        eval_mae_side = trainer.metric(Stage.EVAL_SIDE, 'MAE')
        eval_smape_side = trainer.metric(Stage.EVAL_SIDE, 'SMAPE')
    else:
        eval_mae_side = DummyMetric()
        eval_smape_side = DummyMetric()

    train_smape = trainer.metric(Stage.TRAIN, 'SMAPE')
    train_mae = trainer.metric(Stage.TRAIN, 'MAE')
    grad_norm = trainer.metric(Stage.TRAIN, 'GrNorm')
    eval_stages = []
    ema_eval_stages = []
    if forward_split and do_eval:
        eval_stages.append(Stage.EVAL_FRWD)
        ema_eval_stages.append(Stage.EVAL_FRWD_EMA)
    if side_split and do_eval:
        eval_stages.append(Stage.EVAL_SIDE)
        ema_eval_stages.append(Stage.EVAL_SIDE_EMA)

    # gpu_options=tf.GPUOptions(allow_growth=False),
    with tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
                                          gpu_options=tf.GPUOptions(allow_growth=gpu_allow_growth))) as sess:
        sess.run(init)
        # pipe.load_vars(sess)
        inp.restore(sess)
        for model in all_models:
            model.init(sess)
        # if beholder:
        #    visualizer = Beholder(session=sess, logdir=summ_path)
        step = 0
        prev_top = np.inf
        best_smape = np.inf
        # Contains best value (first item) and subsequent values
        best_epoch_smape = []

        for epoch in range(max_epoch):
            # n_steps = pusher.n_pages // batch_size
            if tqdm:
                tqr = trange(steps_per_epoch, desc="%2d" % (epoch + 1), leave=False)
            else:
                tqr = range(steps_per_epoch)

            for _ in tqr:
                try:
                    step = trainer.train_step(sess, epoch)
                except tf.errors.OutOfRangeError:
                    break
                # if beholder:
                #   if step % 5 == 0:
                #     # noinspection PyUnboundLocalVariable
                #     visualizer.update()
                if step % eval_every_step == 0:
                    if eval_stages:
                        trainer.eval_step(sess, epoch, step, eval_batches, stages=eval_stages)

                    if save_best_model and epoch > 0 and eval_smape.last < best_smape:
                        best_smape = eval_smape.last
                        saver.save(sess, f'data/cpt/{name}/cpt', global_step=step)
                    if save_from_step and step >= save_from_step:
                        saver.save(sess, f'data/cpt/{name}/cpt', global_step=step)

                    if avg_sgd and ema_eval_stages:
                        ema_saver.save(sess, 'data/cpt_tmp/ema', write_meta_graph=False)
                        # restore ema-backed vars
                        ema_loader.restore(sess, 'data/cpt_tmp/ema')
                        trainer.eval_step(sess, epoch, step, eval_batches, stages=ema_eval_stages)
                        # restore normal vars
                        ema_saver.restore(sess, 'data/cpt_tmp/ema')

                MAE = "%.3f/%.3f/%.3f" % (eval_mae.last, eval_mae_side.last, train_mae.last)
                improvement = '↑' if eval_smape.improved else ' '
                SMAPE = "%s%.3f/%.3f/%.3f" % (improvement, eval_smape.last, eval_smape_side.last, train_smape.last)
                if tqdm:
                    tqr.set_postfix(gr=grad_norm.last, MAE=MAE, SMAPE=SMAPE)
                if not trainer.has_active() or (max_steps and step > max_steps):
                    break

            if tqdm:
                tqr.close()
            trainer.end_epoch()
            if not best_epoch_smape or eval_smape.avg_epoch < best_epoch_smape[0]:
                best_epoch_smape = [eval_smape.avg_epoch]
            else:
                best_epoch_smape.append(eval_smape.avg_epoch)

            current_top = eval_smape.top
            if prev_top > current_top:
                prev_top = current_top
                has_best_indicator = '↑'
            else:
                has_best_indicator = ' '
            status = "%2d: Best top SMAPE=%.3f%s (%s)" % (
                epoch + 1, current_top, has_best_indicator,
                ",".join(["%.3f" % m.top for m in eval_smape.metrics]))

            if trainer.has_active():
                status += ", frwd/side best MAE=%.3f/%.3f, SMAPE=%.3f/%.3f; avg MAE=%.3f/%.3f, SMAPE=%.3f/%.3f, %d am" % \
                          (eval_mae.best_epoch, eval_mae_side.best_epoch, eval_smape.best_epoch,
                           eval_smape_side.best_epoch, eval_mae.avg_epoch, eval_mae_side.avg_epoch,
                           eval_smape.avg_epoch, eval_smape_side.avg_epoch, trainer.has_active())
                print(status, file=sys.stderr)
            else:
                print(status, file=sys.stderr)
                print("Early stopping!", file=sys.stderr)
                break
            if max_steps and step > max_steps:
                print("Max steps calculated", file=sys.stderr)
                break
            sys.stderr.flush()

        # noinspection PyUnboundLocalVariable
        return np.mean(best_epoch_smape, dtype=np.float64)
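# A hypothetical usage sketch for train() above, not taken from the repo itself.
# The argument values mirror those noted in the annotated copy further below
# ('s32', n_models=3, seed=5, asgd_decay=0.99, save_from_step=10500); build_hparams()
# stands in for the project's own hyperparameter setup:
#
#   hparams = build_hparams()  # assumption: project-specific helper
#   smape = train('s32', hparams, n_models=3, seed=5, asgd_decay=0.99,
#                 save_from_step=10500, forward_split=False, side_split=False,
#                 do_eval=False)
#   print('mean best-epoch SMAPE:', smape)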
def run(cfg):
    with open(cfg['tf_statistics_path'], 'rb') as f:
        tf_stat = pickle.load(f)
    names = []
    tfrecord_location = cfg['tfrecords_local_path']
    for file in os.listdir(tfrecord_location):
        if file.startswith("part"):
            names.append(file)
    file_paths = [os.path.join(tfrecord_location, name) for name in names]
    # read and make the dataset from tfrecord
    dataset = tf.data.TFRecordDataset(file_paths)
    dataset = dataset.map(__data_parser)
    batch_size = cfg['batch_size']
    duration = cfg['duration']
    # note: calling shuffle() after batch() shuffles whole batches, not individual records
    dataset = dataset.batch(batch_size).shuffle(SHUFFLE_BUFFER)
    iterator = dataset.make_one_shot_iterator()
    next_el = iterator.get_next()
    # lagged_ix = np.ones((duration, 4), dtype=float)
    # lagged_ix = np.where(lagged_ix == 1, -1, lagged_ix)
    lagged_ix = np.stack(lag_indexes(tf_stat), axis=-1)
    # quarter_autocorr = np.ones((batch_size,), dtype=float)
    date_list = tf_stat['days']
    dow = get_dow(date_list)
    holiday_list = cfg['holidays']
    holidays = [1 if day in holiday_list else 0 for day in date_list]
    a_list = []
    b_list = []
    for flag in holidays:
        a, b = holiday_norm(flag)
        a_list.append(a)
        b_list.append(b)
    holiday = (a_list, b_list)

    with tf.Session() as sess:
        x = sess.run(next_el)
        quarter_autocorr = np.ones((x[0].size,), dtype=float)
        page_indx = list(x[0])
        fill_isolated_zeros(x[21])
        tensors = dict(
            hits=pd.DataFrame(x[21], index=page_indx, columns=date_list),
            lagged_ix=lagged_ix,
            page_ix=page_indx,
            pf_age=pd.DataFrame(x[8:15], columns=page_indx, index=(1, 2, 3, 4, 5, 6, 7)).T,
            pf_si=pd.DataFrame(x[20], index=page_indx),
            pf_network=pd.DataFrame(x[15:20], columns=page_indx, index=('2G', '3G', '4G', 'UNKNOWN', 'WIFI')).T,
            pf_price_cat=pd.DataFrame(x[1:4], columns=page_indx, index=('pc1', 'pc2', 'pc3')).T,
            pf_gender=pd.DataFrame(x[4:8], columns=page_indx, index=('none', 'f', 'm', 'x')).T,
            page_popularity=x[22],
            # page_popularity = quarter_autocorr,
            quarter_autocorr=quarter_autocorr,
            dow=pd.DataFrame(dow).T,
            holiday=pd.DataFrame(holiday).T)
        data_len = tensors['hits'].shape[1]
        plain = dict(
            data_days=data_len - cfg['add_days'],
            features_days=data_len,
            data_start=date_list[0],
            data_end=date_list[-1],
            features_end=date_list[-1],
            n_pages=batch_size)
        # Store data to the disk
        VarFeeder(cfg['data_dir'], tensors, plain)
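# A minimal sketch of what get_dow(date_list) could return. Assumption: it uses the
# same cos/sin weekly encoding as the 'dow' feature elsewhere in this codebase; the
# real implementation may differ.
def get_dow_sketch(date_list):
    import numpy as np
    import pandas as pd
    dow = pd.DatetimeIndex(date_list).dayofweek.values
    angle = dow * (2 * np.pi / 7)          # map Mon..Sun onto the unit circle
    return [np.cos(angle), np.sin(angle)]  # pd.DataFrame(...).T yields one (cos, sin) row per day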
def train(name, hparams, multi_gpu=False, n_models=1, train_completeness_threshold=0.01, seed=None,
          logdir='data/logs', max_epoch=100, patience=2, train_sampling=1.0,
          eval_sampling=1.0, eval_memsize=5, gpu=0, gpu_allow_growth=False, save_best_model=False,
          forward_split=False, write_summaries=False, verbose=False, asgd_decay=None, tqdm=True,
          side_split=True, max_steps=None, save_from_step=None, do_eval=True, predict_window=63):
    eval_k = int(round(26214 * eval_memsize / n_models))
    eval_batch_size = int(eval_k / (hparams.rnn_depth * hparams.encoder_rnn_layers))  # 128 -> 1024, 256 -> 512, 512 -> 256
    eval_pct = 0.1
    batch_size = hparams.batch_size
    train_window = hparams.train_window
    # Observed in a debug run: eval_k = 43690, eval_batch_size = 163, batch_size = 128, train_window = 283
    tf.reset_default_graph()
    if seed:
        tf.set_random_seed(seed)

    with tf.device("/cpu:0"):
        inp = VarFeeder.read_vars("data/vars")
        # Observed in a debug run: side_split=False, train_sampling=1, eval_sampling=1, seed=5;
        # inp contents: hits (145036, 805), lagged_ix (867, 4), page_map (52752, 4),
        # page_ix (145036,), pf_agent (145036, 4), pf_country (145036, 7), pf_site (145036, 3),
        # page_popularity/year_autocorr/quarter_autocorr (145036,), dow (867, 2);
        # features_days=867, data_days=805, n_pages=145036, data_start='2015-07-01',
        # data_end='2017-09-11', features_end='2017-11-13'; type(inp) = feeder.FeederVars
        if side_split:
            splitter = Splitter(page_features(inp), inp.page_map, 3, train_sampling=train_sampling,
                                test_sampling=eval_sampling, seed=seed)
        else:
            splitter = FakeSplitter(page_features(inp), 3, seed=seed, test_sampling=eval_sampling)

    real_train_pages = splitter.splits[0].train_size
    real_eval_pages = splitter.splits[0].test_size

    items_per_eval = real_eval_pages * eval_pct
    eval_batches = int(np.ceil(items_per_eval / eval_batch_size))
    steps_per_epoch = real_train_pages // batch_size
    eval_every_step = int(round(steps_per_epoch * eval_pct))
    # Observed: real_train_pages = 145036, real_eval_pages = 145036, items_per_eval = 14503,
    # eval_batches = 89, steps_per_epoch = 1133, eval_every_step = 113 -- i.e. each epoch
    # runs 1133 steps and the model is evaluated every 113 steps.
    # eval_every_step = int(round(items_per_eval * train_sampling / batch_size))

    # get_or_create_global_step returns the global-step tensor, creating it if necessary
    global_step = tf.train.get_or_create_global_step()
    # tf.assign_add(ref, value) updates ref in place: ref = ref + value
    inc_step = tf.assign_add(global_step, 1)

    all_models: List[ModelTrainerV2] = []

    def create_model(scope, index, prefix, seed):
        # Builds the train model plus optional side/forward eval models (which may be
        # None). The input pipes are wired into the graph here, so the data is fed in
        # at model-construction time rather than afterwards.
        with tf.variable_scope('input') as inp_scope:
            with tf.device("/cpu:0"):
                split = splitter.splits[index]
                pipe = InputPipe(inp, features=split.train_set, n_pages=split.train_size,
                                 mode=ModelMode.TRAIN, batch_size=batch_size, n_epoch=None,
                                 verbose=verbose,
                                 train_completeness_threshold=train_completeness_threshold,
                                 predict_completeness_threshold=train_completeness_threshold,
                                 train_window=train_window, predict_window=predict_window,
                                 rand_seed=seed, train_skip_first=hparams.train_skip_first,
                                 back_offset=predict_window if forward_split else 0)
                inp_scope.reuse_variables()
                if side_split:
                    side_eval_pipe = InputPipe(inp, features=split.test_set, n_pages=split.test_size,
                                               mode=ModelMode.EVAL, batch_size=eval_batch_size,
                                               n_epoch=None, verbose=verbose, predict_window=predict_window,
                                               train_completeness_threshold=0.01,
                                               predict_completeness_threshold=0, train_window=train_window,
                                               rand_seed=seed, runs_in_burst=eval_batches,
                                               back_offset=predict_window * (2 if forward_split else 1))
                else:
                    side_eval_pipe = None
                if forward_split:
                    forward_eval_pipe = InputPipe(inp, features=split.test_set, n_pages=split.test_size,
                                                  mode=ModelMode.EVAL, batch_size=eval_batch_size,
                                                  n_epoch=None, verbose=verbose, predict_window=predict_window,
                                                  train_completeness_threshold=0.01,
                                                  predict_completeness_threshold=0, train_window=train_window,
                                                  rand_seed=seed, runs_in_burst=eval_batches,
                                                  back_offset=predict_window)
                else:
                    forward_eval_pipe = None
        avg_sgd = asgd_decay is not None
        #asgd_decay = 0.99 if avg_sgd else None
        train_model = Model(pipe, hparams, is_train=True, graph_prefix=prefix, asgd_decay=asgd_decay, seed=seed)
        scope.reuse_variables()

        eval_stages = []
        if side_split:
            side_eval_model = Model(side_eval_pipe, hparams, is_train=False,
                                    #loss_mask=np.concatenate([np.zeros(50, dtype=np.float32), np.ones(10, dtype=np.float32)]),
                                    seed=seed)
            # Stage values: TRAIN = 0, EVAL_SIDE = 1, EVAL_FRWD = 2, EVAL_SIDE_EMA = 3, EVAL_FRWD_EMA = 4
            eval_stages.append((Stage.EVAL_SIDE, side_eval_model))
            if avg_sgd:
                eval_stages.append((Stage.EVAL_SIDE_EMA, side_eval_model))
        if forward_split:
            forward_eval_model = Model(forward_eval_pipe, hparams, is_train=False, seed=seed)
            eval_stages.append((Stage.EVAL_FRWD, forward_eval_model))
            if avg_sgd:
                eval_stages.append((Stage.EVAL_FRWD_EMA, forward_eval_model))

        if write_summaries:
            summ_path = f"{logdir}/{name}_{index}"
            if os.path.exists(summ_path):
                shutil.rmtree(summ_path)
            summ_writer = tf.summary.FileWriter(summ_path)  # , graph=tf.get_default_graph()
        else:
            summ_writer = None

        if do_eval and forward_split:
            stop_metric = lambda metrics: metrics[Stage.EVAL_FRWD]['SMAPE'].avg_epoch
        else:
            stop_metric = None

        # Observed in a debug run: side_split=False, forward_split=False, eval_stages=[],
        # stop_metric=None, patience=2, index=0, summ_writer=<FileWriter instance>
print(f"side_split: {side_split}; forward_split:{forward_split}; summ_writer={summ_writer};" # f"eval_stages: {eval_stages}; stop_metric={stop_metric}; patience={patience}; index={index}") return ModelTrainerV2(train_model, eval_stages, index, patience=patience, stop_metric=stop_metric, summary_writer=summ_writer) # todo n_models == 3 if n_models == 1: with tf.device(f"/gpu:{gpu}"): scope = tf.get_variable_scope() all_models = [create_model(scope, 0, None, seed=seed)] else: for i in range(n_models): device = f"/gpu:{i}" if multi_gpu else f"/gpu:{gpu}" with tf.device(device): prefix = f"m_{i}" with tf.variable_scope(prefix) as scope: all_models.append( create_model(scope, i, prefix=prefix, seed=seed + i)) # todo inc_step = tf.assign_add(global_step, 1) trainer = MultiModelTrainer(all_models, inc_step) # return # todo save_best_model or save_from_step: False 10500 # print("save_best_model or save_from_step: ", save_best_model, save_from_step) if save_best_model or save_from_step: saver_path = f'data/cpt/{name}' # todo saver_path: data/cpt/s32 # print("saver_path: ",saver_path) if os.path.exists(saver_path): shutil.rmtree(saver_path) os.makedirs(saver_path) # todo max_to_keep 参数,这个是用来设置保存模型的个数,默认为5,即 max_to_keep=5,保存最近的5个模型 saver = tf.train.Saver(max_to_keep=10, name='train_saver') else: saver = None # todo EMA decay for averaged SGD. Not use ASGD if not set avg_sgd = asgd_decay is not None # todo asgd_decay=0.99; avg_sgd=True # print(f"asgd_decay={asgd_decay}; avg_sgd={avg_sgd}") if avg_sgd: from itertools import chain def ema_vars(model): ema = model.train_model.ema # todo: average_name() methods give access to the shadow variables and their names return { ema.average_name(v): v for v in model.train_model.ema._averages } ema_names = dict( chain(*[ema_vars(model).items() for model in all_models])) # todo ema_names= # {'m_0/m_0/cudnn_gru/opaque_kernel/ExponentialMovingAverage': <tf.Variable 'm_0/cudnn_gru/opaque_kernel:0' shape=<unknown> dtype=float32_ref>, # 'm_0/m_0/fingerpint/convnet/conv1d/kernel/ExponentialMovingAverage': <tf.Variable 'm_0/fingerpint/convnet/conv1d/kernel:0' shape=(7, 5, 16) dtype=float32_ref>, # 'm_0/m_0/fingerpint/convnet/conv1d/bias/ExponentialMovingAverage': <tf.Variable 'm_0/fingerpint/convnet/conv1d/bias:0' shape=(16,) dtype=float32_ref>, # 'm_0/m_0/fingerpint/convnet/conv1d_1/kernel/ExponentialMovingAverage': <tf.Variable 'm_0/fingerpint/convnet/conv1d_1/kernel:0' shape=(3, 16, 16) dtype=float32_ref>, # 'm_0/m_0/fingerpint/convnet/conv1d_1/bias/ExponentialMovingAverage': <tf.Variable 'm_0/fingerpint/convnet/conv1d_1/bias:0' shape=(16,) dtype=float32_ref>, # 'm_0/m_0/fingerpint/convnet/conv1d_2/kernel/ExponentialMovingAverage': <tf.Variable 'm_0/fingerpint/convnet/conv1d_2/kernel:0' shape=(3, 16, 32) dtype=float32_ref>, # 'm_0/m_0/fingerpint/convnet/conv1d_2/bias/ExponentialMovingAverage': <tf.Variable 'm_0/fingerpint/convnet/conv1d_2/bias:0' shape=(32,) dtype=float32_ref>, # 'm_0/m_0/fingerpint/convnet/conv1d_3/kernel/ExponentialMovingAverage': <tf.Variable 'm_0/fingerpint/convnet/conv1d_3/kernel:0' shape=(3, 32, 32) dtype=float32_ref>, # 'm_0/m_0/fingerpint/convnet/conv1d_3/bias/ExponentialMovingAverage': <tf.Variable 'm_0/fingerpint/convnet/conv1d_3/bias:0' shape=(32,) dtype=float32_ref>, # 'm_0/m_0/fingerpint/convnet/conv1d_4/kernel/ExponentialMovingAverage': <tf.Variable 'm_0/fingerpint/convnet/conv1d_4/kernel:0' shape=(3, 32, 64) dtype=float32_ref>, # 'm_0/m_0/fingerpint/convnet/conv1d_4/bias/ExponentialMovingAverage': <tf.Variable 
        #ema_names = all_models[0].train_model.ema.variables_to_restore()
        ema_loader = tf.train.Saver(var_list=ema_names, max_to_keep=1, name='ema_loader')
        ema_saver = tf.train.Saver(max_to_keep=1, name='ema_saver')
    else:
        ema_loader = None

    init = tf.global_variables_initializer()

    # Observed: forward_split=False, do_eval=True, side_split=False
    if forward_split and do_eval:
        eval_smape = trainer.metric(Stage.EVAL_FRWD, 'SMAPE')
        eval_mae = trainer.metric(Stage.EVAL_FRWD, 'MAE')
    else:
        eval_smape = DummyMetric()
        eval_mae = DummyMetric()

    if side_split and do_eval:
        eval_mae_side = trainer.metric(Stage.EVAL_SIDE, 'MAE')
        eval_smape_side = trainer.metric(Stage.EVAL_SIDE, 'SMAPE')
    else:
        eval_mae_side = DummyMetric()
        eval_smape_side = DummyMetric()

    train_smape = trainer.metric(Stage.TRAIN, 'SMAPE')
    train_mae = trainer.metric(Stage.TRAIN, 'MAE')
    grad_norm = trainer.metric(Stage.TRAIN, 'GrNorm')
    eval_stages = []
    ema_eval_stages = []
    if forward_split and do_eval:
        eval_stages.append(Stage.EVAL_FRWD)
        ema_eval_stages.append(Stage.EVAL_FRWD_EMA)
    if side_split and do_eval:
        eval_stages.append(Stage.EVAL_SIDE)
        ema_eval_stages.append(Stage.EVAL_SIDE_EMA)
    # Observed: eval_stages=[], ema_eval_stages=[]

    # gpu_options=tf.GPUOptions(allow_growth=False),
    with tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
                                          gpu_options=tf.GPUOptions(allow_growth=gpu_allow_growth))) as sess:
        sess.run(init)
        # pipe.load_vars(sess)
        # restore() loads the stored feature tensors into the session; the input
        # pipes built above read from these variables
        inp.restore(sess)
        for model in all_models:
            # runs each model's initialization (e.g. its input iterator initializer)
            model.init(sess)
        # if beholder:
        #    visualizer = Beholder(session=sess, logdir=summ_path)
        step = 0
        prev_top = np.inf
        best_smape = np.inf
        # Contains best value (first item) and subsequent values
        best_epoch_smape = []

        for epoch in range(max_epoch):
            # n_steps = pusher.n_pages // batch_size
            if tqdm:
                # trange(i) is shorthand for tqdm(range(i)); desc sets the text on the
                # left of the progress bar, leave=False discards the bar once the loop ends
                tqr = trange(steps_per_epoch, desc="%2d" % (epoch + 1), leave=False)
            else:
                tqr = range(steps_per_epoch)

            for _ in tqr:
                try:
                    # the actual training happens in this single call
                    step = trainer.train_step(sess, epoch)
                except tf.errors.OutOfRangeError:
                    break
                # if beholder:
                #   if step % 5 == 0:
                #     visualizer.update()
                # evaluate every eval_every_step steps, i.e. eval_pct of an epoch
                # (113 steps in the debug run)
                if step % eval_every_step == 0:
                    # Observed: eval_stages=[], save_best_model=False, save_from_step=10500,
                    # avg_sgd=True, ema_eval_stages=[]
                    if eval_stages:
                        trainer.eval_step(sess, epoch, step, eval_batches, stages=eval_stages)

                    if save_best_model and epoch > 0 and eval_smape.last < best_smape:
                        best_smape = eval_smape.last
                        saver.save(sess, f'data/cpt/{name}/cpt', global_step=step)
                    if save_from_step and step >= save_from_step:
                        saver.save(sess, f'data/cpt/{name}/cpt', global_step=step)

                    if avg_sgd and ema_eval_stages:
                        ema_saver.save(sess, 'data/cpt_tmp/ema', write_meta_graph=False)
                        # restore ema-backed vars
                        ema_loader.restore(sess, 'data/cpt_tmp/ema')
                        trainer.eval_step(sess, epoch, step, eval_batches, stages=ema_eval_stages)
                        # restore normal vars
                        ema_saver.restore(sess, 'data/cpt_tmp/ema')

                MAE = "%.3f/%.3f/%.3f" % (eval_mae.last, eval_mae_side.last, train_mae.last)
                improvement = '↑' if eval_smape.improved else ' '
                SMAPE = "%s%.3f/%.3f/%.3f" % (improvement, eval_smape.last, eval_smape_side.last, train_smape.last)
                if tqdm:
                    # set_postfix shows these metrics on the right side of the progress bar
                    tqr.set_postfix(gr=grad_norm.last, MAE=MAE, SMAPE=SMAPE)
                if not trainer.has_active() or (max_steps and step > max_steps):
                    break

            if tqdm:
                tqr.close()
            trainer.end_epoch()
            if not best_epoch_smape or eval_smape.avg_epoch < best_epoch_smape[0]:
                best_epoch_smape = [eval_smape.avg_epoch]
            else:
                best_epoch_smape.append(eval_smape.avg_epoch)

            current_top = eval_smape.top
            if prev_top > current_top:
                prev_top = current_top
                has_best_indicator = '↑'
            else:
                has_best_indicator = ' '
            status = "%2d: Best top SMAPE=%.3f%s (%s)" % (
                epoch + 1, current_top, has_best_indicator,
                ",".join(["%.3f" % m.top for m in eval_smape.metrics]))

            if trainer.has_active():
                status += ", frwd/side best MAE=%.3f/%.3f, SMAPE=%.3f/%.3f; avg MAE=%.3f/%.3f, SMAPE=%.3f/%.3f, %d am" % \
                          (eval_mae.best_epoch, eval_mae_side.best_epoch, eval_smape.best_epoch,
                           eval_smape_side.best_epoch, eval_mae.avg_epoch, eval_mae_side.avg_epoch,
                           eval_smape.avg_epoch, eval_smape_side.avg_epoch, trainer.has_active())
                print(status, file=sys.stderr)
            else:
                print(status, file=sys.stderr)
                print("Early stopping!", file=sys.stderr)
                break
            if max_steps and step > max_steps:
                print("Max steps calculated", file=sys.stderr)
                break
            sys.stderr.flush()
            # Observed with empty eval stages: best_epoch_smape=[nan], eval_smape.avg_epoch=nan,
            # trainer.has_active()=3, prev_top=inf, current_top=nan

        # noinspection PyUnboundLocalVariable
        return np.mean(best_epoch_smape, dtype=np.float64)