def train(config, trainfile, testfile):
    """Entry point for training the ESMM estimator on locally sharded part files.

    Args:
      config: (configparser) All the hyperparameters for training.
      trainfile: comma-separated list of directories holding training part files.
      testfile: comma-separated list of directories holding evaluation part files.
    """
    # Result is currently unused; the call is kept in case get_norm_keys
    # validates/loads the conf file as a side effect — TODO confirm and drop.
    keys = get_norm_keys(config['input'].get('conf'))

    # Bucket training part files into 5 shards by part number
    # (assumes part numbers stay below 200 so part_no // 40 is in [0, 5)).
    train_dirs = trainfile.split(',')
    train_files = [[] for _ in range(5)]
    for train_dir in train_dirs:
        for f in os.listdir(train_dir):
            if f != "_SUCCESS":
                ind = int(f.split('-')[-1]) // 40
                train_files[ind].append(os.path.join(train_dir, f))

    # Evaluation files are used as one flat list (no sharding).
    dev_dirs = testfile.split(',')
    dev_files = [
        os.path.join(dev_dir, f) for dev_dir in dev_dirs
        for f in os.listdir(dev_dir) if f != "_SUCCESS"
    ]

    # Feature spec file path comes from the [input] section, key 'spec';
    # it carries per-column info such as bucket boundaries.
    feature_config = configparser.ConfigParser()
    feature_config.read(config['input']['spec'])
    # Build per-column feature handling; numeric, embedding, etc. columns
    # are each treated differently.
    columns, spec = FCGen.GetFeatureSpec(feature_config)

    batch_size = int(config['train']['batch_size'])
    conf = tf.ConfigProto()
    conf.gpu_options.allow_growth = True
    os.environ["CUDA_VISIBLE_DEVICES"] = "3"  # pin this job to GPU 3
    run_config = tf.estimator.RunConfig().replace(
        model_dir=config['train'].get('model_dir', 'model_dir'),
        save_checkpoints_secs=3600,
        session_config=conf)
    dynamic = config['train']['dynamic'] == 'true'
    print("dynamic:", dynamic)

    logging.info("Creating model...")
    # Define the model.
    hidden_units = [int(n) for n in config['model']['hidden_units'].split(',')]
    learning_rate = float(config['model']['learning_rate'])
    ctr_reg = float(config['model'].get('ctr_reg', '1e-4'))
    cvr_reg = float(config['model'].get('cvr_reg', '1e-4'))
    ctcvr_loss_weight = float(config['model'].get('ctcvr_loss_weight', '1.0'))
    model = tf.estimator.Estimator(
        model_fn=esmm_model_fn,
        params={
            'cat_columns': columns['cat'],
            'val_columns': columns['val'],
            'dnn_columns': list(columns['dnn'].values()),
            'weight_columns': list(columns['weight'].values()),
            'column_to_field': {},
            'hidden_units': hidden_units,
            'learning_rate': learning_rate,
            'ctr_reg': ctr_reg,
            'cvr_reg': cvr_reg,
            'reg': 1e-4,
            'ctcvr_loss_weight': ctcvr_loss_weight,
            'model': config['model']['model'],
            'embed_dim': int(config['model']['embedding_dim']),
            'dynamic': dynamic
        },
        config=run_config)

    # Train and evaluate. Empty max_step means "run until input is exhausted".
    max_steps = config['train'].get('max_step', '')
    max_steps = int(max_steps) if max_steps != '' else None

    logging.info("training...")
    epochs = int(config['train'].get('epochs', '1'))
    eval_input_fn = lambda: input_fn(dev_files, spec, False, batch_size, mt=True)
    for i in range(epochs):
        logging.info("{}th training...".format(i + 1))
        for j in range(len(train_files)):
            # Bind j as a default argument so the lambda captures the
            # current shard rather than the late-bound loop variable.
            model.train(input_fn=lambda j=j: input_fn(
                train_files[j], spec, True, batch_size, mt=True),
                        steps=max_steps)
            # Evaluate after every shard, not only per epoch.
            results = model.evaluate(input_fn=eval_input_fn)
            logging.info("{}th test results...".format(i + j + 1))
            for key in sorted(results):
                print('%s: %s' % (key, results[key]))

    model.export_savedmodel(
        export_dir_base=config['train'].get('export_dir', 'export_dir'),
        serving_input_receiver_fn=lambda: input_receiver(spec),
        strip_default_attrs=True)
def train(config, trainfile, testfile):
    """Entry point for training the ESMM/MMoE estimator from HDFS or local TFRecords.

    Args:
      config: (configparser) All the hyperparameters for training.
      trainfile: comma-separated list of date/partition directory names.
      testfile: comma-separated list of evaluation directory names.
    """
    prefix = "/data/home/graywang/esmm/tfrecords/rt_mt"
    train_dirs = trainfile.split(',')
    cluster = "hdfs://ss-sng-dc-v2/stage/outface/SNG/g_sng_qqmusic_develop/g_sng_qqmusic_develop/timmili/gray_temp/"
    if config['train']['source'] == 'hdfs':
        # HDFS mode: build 5 glob patterns per directory (part-r-000*..part-r-004*)
        # instead of listing files.
        train_files = [[] for _ in range(5)]
        for train_dir in train_dirs:
            for i in range(5):
                train_files[i].append(cluster + train_dir + "/part-r-00" + str(i) + "*")
    else:
        # Local mode: bucket part files into 4 shards by part number.
        train_files = [[] for _ in range(4)]
        for train_dir in train_dirs:
            # Hoisted out of the per-file loop: the original listed the
            # directory again for every file, an O(n^2) number of listdir calls.
            files = os.listdir(prefix + "/" + train_dir)
            # Shard divisor depends on directory size (>250 parts -> 125, else 50).
            div = 125 if len(files) > 250 else 50
            for f in files:
                if f != "_SUCCESS":
                    ind = int(f.split('-')[-1]) // div
                    train_files[ind].append(os.path.join(prefix, train_dir, f))
    logging.info('train directory: {}'.format(train_dirs))
    logging.info('train files: {}'.format(reprlib.repr(train_files)))

    dev_dirs = testfile.split(',')
    dev_files = [
        os.path.join(prefix, dev_dir, f) for dev_dir in dev_dirs
        for f in os.listdir(prefix + "/" + dev_dir) if f != "_SUCCESS"
    ]
    logging.info('dev directory: {}'.format(dev_dirs))
    logging.info('dev files: {}'.format(reprlib.repr(dev_files)))

    # Feature spec file path comes from the [input] section, key 'spec';
    # it carries per-column info such as bucket boundaries.
    feature_config = configparser.ConfigParser()
    feature_config.read(config['input']['spec'])
    # Build per-column feature handling; numeric, embedding, etc. columns
    # are each treated differently.
    columns, spec, dimension_config = FCGen.GetFeatureSpec(feature_config)

    batch_size = int(config['train']['batch_size'])
    conf = tf.ConfigProto()
    conf.gpu_options.allow_growth = True
    # NOTE(review): this discards the dimension_config returned by FCGen and
    # passes an empty dict to the model — confirm this is intentional.
    dimension_config = {}
    os.environ["CUDA_VISIBLE_DEVICES"] = "3"  # pin this job to GPU 3
    run_config = tf.estimator.RunConfig().replace(
        model_dir=config['train'].get('model_dir', 'model_dir'),
        save_checkpoints_secs=3600,
        session_config=conf)
    dynamic = config['train']['dynamic'] == 'true'

    # Optional warm start from a previous checkpoint directory.
    warm_dir = config['train'].get('warm_dir', '')
    if len(warm_dir) > 1:
        ws = tf.estimator.WarmStartSettings(ckpt_to_initialize_from=warm_dir,
                                            vars_to_warm_start=".*")
    else:
        ws = None
    print("dynamic:", dynamic)

    logging.info("Creating model...")
    # Define the model.
    hidden_units = [int(n) for n in config['model']['hidden_units'].split(',')]
    learning_rate = float(config['model']['learning_rate'])
    ctr_reg = float(config['model'].get('ctr_reg', '1e-4'))
    cvr_reg = float(config['model'].get('cvr_reg', '1e-4'))
    ctcvr_loss_weight = float(config['model'].get('ctcvr_loss_weight', '1.0'))
    model = tf.estimator.Estimator(
        model_fn=esmm_model_fn,
        params={
            'cat_columns': columns['cat'],
            'val_columns': columns['val'],
            'dnn_columns': list(columns['dnn'].values()),
            # Single weight column (first entry), unlike the sibling variants
            # that pass the full list.
            'weight_columns': list(columns['weight'].values())[0],
            'column_to_field': {},
            'hidden_units': hidden_units,
            'learning_rate': learning_rate,
            'ctr_reg': ctr_reg,
            'cvr_reg': cvr_reg,
            'reg': 1e-4,
            'dimension_config': dimension_config,
            'ctcvr_loss_weight': ctcvr_loss_weight,
            'model': config['model']['model'],
            'embed_dim': int(config['model']['embedding_dim']),
            'expert_num': int(config['model']['expert_num']),
            'expert_unit': int(config['model']['expert_unit']),
            'dynamic': dynamic
        },
        config=run_config,
        warm_start_from=ws)

    # NOTE(review): max_steps is parsed but never passed to model.train below
    # — confirm whether the step cap was dropped intentionally.
    max_steps = config['train'].get('max_step', '')
    max_steps = int(max_steps) if max_steps != '' else None

    logging.info("training...")
    epochs = int(config['train'].get('epochs', '1'))
    # HDFS patterns need the glob-aware reader; local files use plain input_fn.
    if config['train']['source'] == 'hdfs':
        input_func = input_fn_pattern
    else:
        input_func = input_fn
    # Evaluation always reads concrete local files, hence input_fn here.
    eval_input_fn = lambda: input_fn(dev_files, spec, False, batch_size, mt=True)
    for i in range(epochs):
        logging.info("{}th training...".format(i + 1))
        for j in range(len(train_files)):
            # Bind j as a default argument so the lambda captures the
            # current shard rather than the late-bound loop variable.
            model.train(input_fn=lambda j=j: input_func(
                train_files[j], spec, True, batch_size, mt=True))
            results = model.evaluate(input_fn=eval_input_fn)
            logging.info("{}th test results...".format(i + j + 1))
            for key in sorted(results):
                print('%s: %s' % (key, results[key]))

    model.export_savedmodel(
        export_dir_base=config['train'].get('export_dir', 'export_dir'),
        serving_input_receiver_fn=lambda: input_receiver(spec),
        strip_default_attrs=True)
def train(config, trainfile, testfile):
    """Evaluation-only entry: restores the ESMM estimator, evaluates on one
    dev file, and exports a SavedModel.

    Args:
      config: (configparser) All the hyperparameters for training.
      trainfile: comma-separated list of training directories (listed but not used).
      testfile: comma-separated list of evaluation directories.
    """
    # Result is currently unused; the call is kept in case get_norm_keys
    # validates/loads the conf file as a side effect — TODO confirm and drop.
    keys = get_norm_keys(config['input'].get('conf'))

    # NOTE(review): train_files is built (and the directories must exist) but
    # this entry point never trains — confirm the listing is still wanted.
    train_dirs = trainfile.split(',')
    train_files = [[] for _ in range(5)]
    for train_dir in train_dirs:
        for f in os.listdir(train_dir):
            if f != "_SUCCESS":
                ind = int(f.split('-')[-1]) // 40
                train_files[ind].append(os.path.join(train_dir, f))
    logging.info('train directory: {}'.format(train_dirs))
    logging.info('train files: {}'.format(reprlib.repr(train_files)))

    dev_dirs = testfile.split(',')
    dev_files = [
        os.path.join(dev_dir, f) for dev_dir in dev_dirs
        for f in os.listdir(dev_dir) if f != "_SUCCESS"
    ]
    logging.info('dev directory: {}'.format(dev_dirs))
    logging.info('dev files: {}'.format(reprlib.repr(dev_files)))

    # Feature spec file path comes from the [input] section, key 'spec';
    # it carries per-column info such as bucket boundaries.
    feature_config = configparser.ConfigParser()
    feature_config.read(config['input']['spec'])
    # Build per-column feature handling; numeric, embedding, etc. columns
    # are each treated differently.
    columns, spec = FCGen.GetFeatureSpec(feature_config)

    batch_size = int(config['train']['batch_size'])
    conf = tf.ConfigProto()
    conf.gpu_options.allow_growth = True
    os.environ["CUDA_VISIBLE_DEVICES"] = "5"  # pin this job to GPU 5
    run_config = tf.estimator.RunConfig().replace(
        model_dir=config['train'].get('model_dir', 'model_dir'),
        session_config=conf)

    logging.info("Creating model...")
    # Define the model; the checkpoint in model_dir is restored automatically.
    hidden_units = [int(n) for n in config['model']['hidden_units'].split(',')]
    learning_rate = float(config['model']['learning_rate'])
    ctr_reg = float(config['model'].get('ctr_reg', '1e-6'))
    cvr_reg = float(config['model'].get('cvr_reg', '1e-4'))
    ctcvr_loss_weight = float(config['model'].get('ctcvr_loss_weight', '1.0'))
    model = tf.estimator.Estimator(
        model_fn=esmm_model_fn,
        params={
            'dnn_columns': list(columns['dnn'].values()),
            'linear_columns': list(columns['linear'].values()),
            'weight_columns': list(columns['weight'].values()),
            'hidden_units': hidden_units,
            'learning_rate': learning_rate,
            'ctr_reg': ctr_reg,
            'cvr_reg': cvr_reg,
            'ctcvr_loss_weight': ctcvr_loss_weight,
            'model': config['model']['model']
        },
        config=run_config)

    # Evaluate on just the first dev file as a smoke check.
    print(model.evaluate(input_fn=lambda: input_fn(
        dev_files[0:1], spec, False, batch_size, mt=True)))

    model.export_savedmodel(
        export_dir_base=config['train'].get('export_dir', 'export_dir'),
        serving_input_receiver_fn=lambda: input_receiver(spec),
        strip_default_attrs=True)
def train(config, trainfile, testfile):
    """Entry for trainig
    Args:
      config: (configparser) All the hyperparameters for training
    """
    # Collect all training part files from every directory, skipping the
    # Hadoop _SUCCESS marker.
    train_dirs = trainfile.split(',')
    train_files = [
        os.path.join(train_dir, f) for train_dir in train_dirs
        for f in os.listdir(train_dir) if f != "_SUCCESS"
    ]
    #train_files = tf.random_shuffle(tf.train.match_filenames_once([os.path.join(train_dir, f) for f in os.listdir(train_dir) if f != "_SUCCESS"]))
    #train_files = tf.random_shuffle(tf.train.match_filenames_once(['%s/%s/part-r-*' % (data_path, dt) for dt in date_list]))
    logging.info('train directory: {}'.format(train_dirs))
    logging.info('train files: {}'.format(reprlib.repr(train_files)))
    dev_dirs = testfile.split(',')
    dev_files = [
        os.path.join(dev_dir, f) for dev_dir in dev_dirs
        for f in os.listdir(dev_dir) if f != "_SUCCESS"
    ]
    logging.info('dev directory: {}'.format(dev_dirs))
    logging.info('dev files: {}'.format(reprlib.repr(dev_files)))
    # Feature spec file path comes from the [input] section, key 'spec';
    # it carries per-column info such as bucket boundaries.
    feature_config = configparser.ConfigParser()
    feature_config.read(config['input']['spec'])
    # Build per-column feature handling; numeric, embedding, etc. columns
    # are each treated differently.
    columns, spec = FCGen.GetFeatureSpec(feature_config)
    batch_size = int(config['train']['batch_size'])
    conf = tf.ConfigProto()
    conf.gpu_options.allow_growth = True
    # Pin this job to GPU 5.
    os.environ["CUDA_VISIBLE_DEVICES"] = "5"
    run_config = tf.estimator.RunConfig().replace(session_config=conf)
    logging.info("Creating model...")
    # Define the model
    hidden_units = [
        int(n) for n in config['dnn_model']['hidden_units'].split(',')
    ]
    # NOTE(review): dropout is parsed here but dnn_dropout is commented out
    # below, so this value is currently unused.
    dropout = config['dnn_model'].get('dropout', '')
    if dropout == '':
        dropout = None
    else:
        dropout = float(dropout)
    #print(columns['weight'][0])  # must be enabled if a weight column is configured
    # Wide & Deep classifier. NOTE(review): the config-driven FTRL
    # regularization strengths are commented out in favor of hard-coded 0.01
    # — confirm this experiment toggle is still intended.
    model = tf.estimator.DNNLinearCombinedClassifier(
        config=run_config,
        model_dir=config['train'].get('model_dir', 'model_dir'),
        linear_feature_columns=columns['linear'],
        linear_optimizer=tf.train.FtrlOptimizer(
            learning_rate=float(config['linear_model']['learning_rate']),
            #l1_regularization_strength=float(config['linear_model']['l1_reg']),
            #l2_regularization_strength=float(config['linear_model']['l2_reg'])),
            l1_regularization_strength=0.01,
            l2_regularization_strength=0.01),
        dnn_feature_columns=columns['dnn'],  # comment this out when there is no dnn part
        dnn_hidden_units=hidden_units,
        weight_column=columns['weight'][0],  # must not be commented out when a weight column exists
        #dnn_optimizer=tf.train.AdamOptimizer(
        #    learning_rate=float(config['dnn_model']['learning_rate'])),
        dnn_optimizer=tf.train.AdagradOptimizer(learning_rate=float(
            config['dnn_model']['learning_rate']),
                                                initial_accumulator_value=0.1,
                                                use_locking=False),
        batch_norm=True,
        #dnn_dropout=dropout,
        #dnn_dropout=None,
        loss_reduction=tf.losses.Reduction.SUM_OVER_BATCH_SIZE)
    # Train and evaluate. Empty max_step means "run until input is exhausted".
    max_steps = config['train'].get('max_step', '')
    if max_steps == '':
        max_steps = None
    else:
        max_steps = int(max_steps)
    epochs = int(config['train']['epochs'])
    for i in range(epochs):
        logging.info("training...")
        model.train(input_fn=lambda: input_fn(
            train_files, spec, shuffle=True, batch_size=batch_size),
                    steps=max_steps)
        # Evaluate on the dev set after each epoch.
        results = model.evaluate(input_fn=lambda: input_fn(
            dev_files, spec, shuffle=False, batch_size=batch_size))
        logging.info("results...")
        for key in sorted(results):
            print('{}th {}: {}'.format(i + 1, key, results[key]))
    model.export_savedmodel(
        export_dir_base=config['train'].get('export_dir', 'export_dir'),
        serving_input_receiver_fn=lambda: input_receiver(spec),
        strip_default_attrs=True)
def train(config, hdfs_prefix, ftime, gap, ckpt_dir, export_dir, metric_dir):
    """Entry for trainig
    Args:
      config: (configparser) All the hyperparameters for training
      hdfs_prefix: base HDFS path containing per-date train/test subdirs
      ftime: reference date string in %Y%m%d format
      gap: number of preceding days to include
      ckpt_dir: checkpoint directory for the estimator
      export_dir: base directory for the exported SavedModel
      metric_dir: directory where metrics_info.json is written
    """
    # Build glob patterns for the `gap` days preceding ftime:
    # <prefix>/<date>/train/part-r-* and <prefix>/<date>/test/part-r-*.
    train_files = []
    dev_files = []
    cur_date = datetime.datetime.strptime(ftime, "%Y%m%d")
    for i in range(1, gap + 1):
        dest_date = (cur_date + datetime.timedelta(days=-i)).strftime("%Y%m%d")
        train_files.append(hdfs_prefix + "/" + dest_date + "/train/part-r-*")
        dev_files.append(hdfs_prefix + "/" + dest_date + "/test/part-r-*")
    logging.info('train files: {}'.format(reprlib.repr(train_files)))
    logging.info('dev files: {}'.format(reprlib.repr(dev_files)))
    # Feature spec file path comes from the [input] section, key 'spec';
    # it carries per-column info such as bucket boundaries.
    feature_config = configparser.ConfigParser()
    feature_config.read(config['input']['spec'])
    # Build per-column feature handling; numeric, embedding, etc. columns
    # are each treated differently.
    columns, spec = FCGen.GetFeatureSpec(feature_config)
    batch_size = int(config['train']['batch_size'])
    conf = tf.ConfigProto()
    conf.gpu_options.allow_growth = True
    # CPU-only: boosted trees here run without a GPU.
    os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
    run_config = tf.estimator.RunConfig(save_checkpoints_secs=1800).replace(
        session_config=conf)
    logging.info("Creating model...")
    # Define the model (gradient boosted trees; hyperparameters hard-coded).
    model = tf.estimator.BoostedTreesClassifier(
        config=run_config,
        n_batches_per_layer=1000,
        n_trees=100,
        learning_rate=0.2,
        l1_regularization=0.01,
        l2_regularization=0.01,
        max_depth=10,
        model_dir=ckpt_dir,
        feature_columns=list(columns['linear'].values()),
        weight_column=list(columns['weight'].values())[0]  # must not be commented out when a weight column exists
    )
    #model = tf.estimator.add_metrics(model, metric_auc)
    # Train and evaluate
    epochs = int(config['train']['epochs'])
    for i in range(epochs):
        logging.info("training...")
        model.train(input_fn=lambda: input_fn(
            train_files, spec, shuffle=True, batch_size=batch_size))
        results = model.evaluate(input_fn=lambda: input_fn(
            dev_files, spec, shuffle=False, batch_size=batch_size))
        auc = float(results["auc"])
        logloss = float(results["loss"])
        index = [{
            "name": "auc",
            "type": "float",
            "value": str(auc)
        }, {
            "name": "logloss",
            "type": "float",
            "value": str(logloss)
        }]
        # Dump the latest epoch's metrics for the scheduler; the file is
        # overwritten each epoch, so only the final values survive.
        # NOTE(review): reconstructed from collapsed source — confirm the dump
        # belongs inside the epoch loop rather than after it.
        file_name = metric_dir + "/metrics_info.json"
        with open(file_name, 'w') as file_obj:
            json.dump(index, file_obj)
    model.export_savedmodel(
        export_dir_base=export_dir,
        serving_input_receiver_fn=lambda: input_receiver(spec),
        strip_default_attrs=True)
def train(config, trainfile, testfile):
    """Train and export a Wide & Deep (DNNLinearCombined) classifier.

    Args:
      config: (configparser) All the hyperparameters for training.
      trainfile: directory holding the training part files.
      testfile: directory holding the evaluation part files.
    """

    def _part_files(directory):
        # Every file in the directory except the Hadoop _SUCCESS marker.
        return [
            os.path.join(directory, name) for name in os.listdir(directory)
            if name != "_SUCCESS"
        ]

    train_files = _part_files(trainfile)
    logging.info('train directory: {}'.format(trainfile))
    logging.info('train files: {}'.format(reprlib.repr(train_files)))

    dev_files = _part_files(testfile)
    logging.info('dev directory: {}'.format(testfile))
    logging.info('dev files: {}'.format(reprlib.repr(dev_files)))

    # Parse the feature spec and derive per-column handling.
    feature_config = configparser.ConfigParser()
    feature_config.read(config['input']['spec'])
    columns, spec = FCGen.GetFeatureSpec(feature_config)

    batch_size = int(config['train']['batch_size'])
    session_config = tf.ConfigProto()
    session_config.gpu_options.allow_growth = True
    os.environ["CUDA_VISIBLE_DEVICES"] = "3"
    run_config = tf.estimator.RunConfig().replace(session_config=session_config)

    logging.info("Creating model...")
    # Model hyperparameters from the [dnn_model] / [linear_model] sections.
    hidden_units = [
        int(width) for width in config['dnn_model']['hidden_units'].split(',')
    ]
    raw_dropout = config['dnn_model'].get('dropout', '')
    dropout = float(raw_dropout) if raw_dropout != '' else None

    print(columns['weight'][0])
    model = tf.estimator.DNNLinearCombinedClassifier(
        config=run_config,
        model_dir=config['train'].get('model_dir', 'model_dir'),
        linear_feature_columns=columns['linear'],
        linear_optimizer=tf.train.FtrlOptimizer(
            learning_rate=float(config['linear_model']['learning_rate']),
            l1_regularization_strength=float(config['linear_model']['l1_reg']),
            l2_regularization_strength=float(
                config['linear_model']['l2_reg'])),
        dnn_feature_columns=columns['dnn'],
        dnn_hidden_units=hidden_units,
        weight_column=columns['weight'][0],
        dnn_optimizer=tf.train.AdamOptimizer(
            learning_rate=float(config['dnn_model']['learning_rate'])),
        dnn_dropout=dropout,
        loss_reduction=tf.losses.Reduction.SUM_OVER_BATCH_SIZE)

    # Empty max_step means "run until the training input is exhausted".
    raw_max_steps = config['train'].get('max_step', '')
    max_steps = int(raw_max_steps) if raw_max_steps != '' else None

    logging.info("training...")
    model.train(input_fn=lambda: input_fn(
        train_files, spec, shuffle=True, batch_size=batch_size),
                steps=max_steps)

    results = model.evaluate(input_fn=lambda: input_fn(
        dev_files, spec, shuffle=False, batch_size=batch_size))
    logging.info("results...")
    for key in sorted(results):
        print('%s: %s' % (key, results[key]))

    model.export_savedmodel(
        export_dir_base=config['train'].get('export_dir', 'export_dir'),
        serving_input_receiver_fn=lambda: input_receiver(spec),
        strip_default_attrs=True)