def build_graph(self):
    tf.reset_default_graph()
    self.dataset = as_dataset(FLAGS.dataset)
    with tf.device(self.device_op(0)):
        with tf.variable_scope(tf.get_variable_scope()):
            self.global_step = tf.get_variable(
                name='global_step', dtype=tf.int32, shape=[],
                initializer=tf.constant_initializer(0), trainable=False)
            self.learning_rate = tf.get_variable(
                name='learning_rate', dtype=tf.float32, shape=[],
                initializer=tf.constant_initializer(FLAGS.learning_rate),
                trainable=False)
            # self.lr_decay_op = tf.assign(self.learning_rate, self.learning_rate * FLAGS.decay)
            self.opt = get_optimizer(FLAGS.optimizer, self.learning_rate)
            self.model = as_model(FLAGS.model,
                                  input_dim=self.dataset.num_features,
                                  num_fields=self.dataset.num_fields,
                                  **self.model_param)
            tf.get_variable_scope().reuse_variables()
            self.grads = self.opt.compute_gradients(self.model.loss)
    with tf.device(self.device_op(0, local=True)):
        if self.lazy_update > 1:
            # One non-trainable local accumulator per variable, so gradients can be
            # summed over several batches before being applied.
            local_grads = []
            accumulate_op = []
            reset_op = []
            self.local_grads = []
            for grad, v in self.grads:
                zero_grad = tf.zeros_like(v)
                local_grad = tf.Variable(
                    zero_grad, dtype=tf.float32, trainable=False,
                    name=v.name.split(':')[0] + '_local_grad',
                    collections=[tf.GraphKeys.LOCAL_VARIABLES])
                self.local_grads.append(local_grad)
                reset_grad = local_grad.assign(zero_grad)
                if FLAGS.sparse_grad and isinstance(grad, tf.IndexedSlices):
                    # scatter_sub(-grad) adds the sparse gradient rows in place.
                    accumulate_grad = local_grad.scatter_sub(-grad)
                else:
                    accumulate_grad = local_grad.assign_add(grad)
                local_grads.append((local_grad, v))
                accumulate_op.append(accumulate_grad)
                reset_op.append(reset_grad)
        if self.lazy_update > 1:
            self.update_op = self.opt.apply_gradients(
                local_grads, global_step=self.global_step)
            self.accumulate_op = tf.group(*accumulate_op)
            self.reset_op = tf.group(*reset_op)
        else:
            self.train_op = self.opt.minimize(self.model.loss,
                                              global_step=self.global_step)
    self.saver = tf.train.Saver()
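
# Hedged usage sketch (not part of the original trainer): with lazy_update > 1, the
# training loop is assumed to run `accumulate_op` on every batch and only apply the
# accumulated gradients every `lazy_update` batches via `update_op`, clearing the
# accumulators with `reset_op`. The `trainer`, `sess`, and feed names below are
# illustrative assumptions; only the three ops come from build_graph().
def run_lazy_update_round(trainer, sess, batches):
    """Accumulate gradients each batch; apply and reset them every `lazy_update` batches."""
    for i, (batch_xs, batch_ys) in enumerate(batches, start=1):
        feed = {trainer.model.inputs: batch_xs, trainer.model.labels: batch_ys}
        sess.run(trainer.accumulate_op, feed_dict=feed)  # sum this batch's gradients
        if i % trainer.lazy_update == 0:
            sess.run(trainer.update_op)                  # apply accumulated gradients
            sess.run(trainer.reset_op)                   # zero the local accumulators
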
def build_graph_multi_gpu(self):
    tf.reset_default_graph()
    self.dataset = as_dataset(FLAGS.dataset)
    self.tower_grads = []
    self.models = []
    with tf.device(self.device_op(0)):
        with tf.variable_scope(tf.get_variable_scope()):
            self.global_step = tf.get_variable(
                name='global_step', dtype=tf.int32, shape=[],
                initializer=tf.constant_initializer(0), trainable=False)
            self.learning_rate = tf.get_variable(
                name='learning_rate', dtype=tf.float32, shape=[],
                initializer=tf.constant_initializer(FLAGS.learning_rate),
                trainable=False)
            self.opt = get_optimizer(FLAGS.optimizer, self.learning_rate)
            # Build one model replica (tower) per GPU; all towers share variables.
            for i in range(self.num_gpus):
                with tf.device(self.device_op(i)):
                    print('Deploying gpu:%d ...' % i)
                    with tf.name_scope('tower_%d' % i):
                        model = as_model(FLAGS.model,
                                         input_dim=self.dataset.num_features,
                                         num_fields=self.dataset.num_fields,
                                         **self.model_param)
                        self.models.append(model)
                        tf.get_variable_scope().reuse_variables()
                        grads = self.opt.compute_gradients(model.loss)
                        self.tower_grads.append(grads)
    with tf.device(self.device_op(0, local=True)):
        average_grads = []
        if self.lazy_update > 1:
            local_grads = []
            accumulate_op = []
            reset_op = []
            self.local_grads = []
        for grad_and_vars in zip(*self.tower_grads):
            grads = []
            if FLAGS.sparse_grad and isinstance(grad_and_vars[0][0], tf.IndexedSlices):
                # Average sparse gradients without densifying them.
                grad = sparse_grads_mean(grad_and_vars)
                grad_shape = grad.dense_shape
            else:
                for g, _ in grad_and_vars:
                    expanded_g = tf.expand_dims(g, 0)
                    grads.append(expanded_g)
                grad = tf.concat(axis=0, values=grads)
                grad = tf.reduce_mean(grad, 0)
                grad_shape = grad.shape
            # Variables are shared across towers, so the first tower's variable suffices.
            v = grad_and_vars[0][1]
            grad_and_var = (grad, v)
            print(type(grad), grad_shape, type(v), v.shape)
            average_grads.append(grad_and_var)
            if self.lazy_update > 1:
                zero_grad = tf.zeros_like(v)
                local_grad = tf.Variable(
                    zero_grad, dtype=tf.float32, trainable=False,
                    name=v.name.split(':')[0] + '_local_grad',
                    collections=[tf.GraphKeys.LOCAL_VARIABLES])
                self.local_grads.append(local_grad)
                reset_grad = local_grad.assign(zero_grad)
                if FLAGS.sparse_grad and isinstance(grad, tf.IndexedSlices):
                    accumulate_grad = local_grad.scatter_sub(-grad)
                else:
                    accumulate_grad = local_grad.assign_add(grad)
                local_grads.append((local_grad, v))
                accumulate_op.append(accumulate_grad)
                reset_op.append(reset_grad)
        # TODO: test grouping the raw gradient ops directly
        # self.grad_op = tf.group([(x[0].op, x[1].op) for x in average_grads])
        if self.lazy_update > 1:
            self.update_op = self.opt.apply_gradients(local_grads,
                                                      global_step=self.global_step)
            # self.grad_op = tf.group(average_grads)  # tf < 1.5 needs *inputs
            self.accumulate_op = tf.group(*accumulate_op)
            self.reset_op = tf.group(*reset_op)
        else:
            self.train_op = self.opt.apply_gradients(average_grads,
                                                     global_step=self.global_step)
    self.saver = tf.train.Saver()
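
# Hedged sketch (an assumption, not the repo's actual helper): `sparse_grads_mean`
# above is assumed to average per-tower tf.IndexedSlices gradients for one shared
# variable without densifying them, e.g. by concatenating indices/values across
# towers and scaling the values by 1/num_towers.
def sparse_grads_mean_sketch(grad_and_vars):
    """Average IndexedSlices gradients from several towers for one shared variable."""
    indices = tf.concat([g.indices for g, _ in grad_and_vars], axis=0)
    values = tf.concat([g.values for g, _ in grad_and_vars], axis=0)
    values = values / float(len(grad_and_vars))  # duplicate rows accumulate when applied
    dense_shape = grad_and_vars[0][0].dense_shape
    return tf.IndexedSlices(values=values, indices=indices, dense_shape=dense_shape)
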
import os
import sys

import __init__

sys.path.append(__init__.config['data_path'])  # add your data path here

from datasets import as_dataset
from tf_trainer import Trainer
from tf_models import AutoDeepFM
import tensorflow as tf
import traceback

seeds = [
    0x0123, 0x4567, 0x3210, 0x7654, 0x89AB, 0xCDEF, 0xBA98, 0xFEDC,
    0x0123, 0x4567, 0x3210, 0x7654, 0x89AB, 0xCDEF, 0xBA98, 0xFEDC
]

data_name = 'avazu'
dataset = as_dataset(data_name)
backend = 'tf'
batch_size = 2000

train_data_param = {
    'gen_type': 'train',
    'random_sample': True,
    'batch_size': batch_size,
    'split_fields': False,
    'on_disk': True,
    'squeeze_output': True,
}
test_data_param = {
    'gen_type': 'test',
    'random_sample': False,
    'batch_size': batch_size,
class Config:
    #
    # general config
    #
    epoch_display_periods = 10     # epoch display periods
    summaries_dir = "./summaries"  # tensorboard writer target directory
    model_dir = "checkpoints"      # save model in this directory
    save_periods = 100             # save periods

    sess_config = tf.ConfigProto(allow_soft_placement=True,
                                 log_device_placement=False)
    sess_config.gpu_options.allow_growth = True
    keras_sess = tf.Session(config=sess_config)
    K.set_session(keras_sess)

    #
    # environment config
    #
    environment_combination_len = 3
    environment_combinations_num = 10

    #
    # actor config
    #
    lr = 0.001                  # learning rate
    gamma = 0.5                 # the discount factor in G
    value_scale = 0.5           # the weight of value function approximation in the total loss
    reinforce_batch_size = 100  # batch size used in the REINFORCE algorithm
    gradient_clip = 40          # gradient clip, avoids too large gradients

    #
    # encoder config
    #
    encoder_dim = 64

    #
    # reinforce config
    #
    reinforce_logdir = "./summaries/reinforce_logdir"
    reinforce_learning_rate = 0.001

    #
    # evaluator config
    #
    evaluator_model_name = "lr"  # 'pin', 'lr'
    evaluator_optimizer_name = 'adam'
    evaluator_learning_rate = 0.03
    evaluator_epsilon = 1e-4
    evaluator_max_rounds = 2000
    evaluator_early_stop = 8
    evaluator_embedding_size = 20
    evaluator_log_step_frequency = 0
    evaluator_eval_round_frequency = 1
    evaluator_train_logdir = "./summaries/evaluator_train"
    evaluator_valid_logdir = "./summaries/evaluator_valid"
    evaluator_graph_logdir = "./summaries/evaluator_graph"

    #
    # dataset config
    #
    data_name = "Couple"
    dataset = as_dataset(data_name, True)
    dataset.load_data(gen_type='train')
    dataset.load_data(gen_type='test')
    dataset.summary()
    num_fields = dataset.num_fields
    feat_sizes = dataset.feat_sizes
    feat_min = dataset.feat_min
    target_combination_num = 30
    target_combination_len = 4
def __init__(self):
    # parse params
    self.config = {}
    self.logdir, self.logfile = get_logdir(FLAGS=FLAGS)
    self.ckpt_dir = os.path.join(self.logdir, 'checkpoints')
    self.ckpt_name = 'model.ckpt'
    self.worker_dir = ''
    self.sub_file = os.path.join(self.logdir, 'submission.%d.csv')
    redirect_stdout(self.logfile)
    self.train_data_param = {
        'gen_type': 'train',
        'random_sample': True,
        'batch_size': FLAGS.batch_size,
        'squeeze_output': False,
        'val_ratio': FLAGS.val_ratio,
    }
    self.valid_data_param = {
        'gen_type': 'valid' if FLAGS.val else 'test',
        'random_sample': False,
        'batch_size': FLAGS.test_batch_size,
        'squeeze_output': False,
        'val_ratio': FLAGS.val_ratio,
    }
    self.test_data_param = {
        'gen_type': 'test',
        'random_sample': False,
        'batch_size': FLAGS.test_batch_size,
        'squeeze_output': False,
    }
    self.train_logdir = os.path.join(self.logdir, 'train', self.worker_dir)
    self.valid_logdir = os.path.join(self.logdir, 'valid', self.worker_dir)
    self.test_logdir = os.path.join(self.logdir, 'test', self.worker_dir)
    gpu_config = tf.ConfigProto(allow_soft_placement=True,
                                log_device_placement=False,
                                gpu_options={'allow_growth': True})
    self.model_param = {
        'l2_embed': FLAGS.l2_embed,
        'input_norm': FLAGS.input_norm,
        'init_sparse': FLAGS.init_sparse,
        'init_fused': FLAGS.init_fused,
        'loss_mode': FLAGS.loss_mode,
    }
    if FLAGS.model != 'lr':
        self.model_param['embed_size'] = FLAGS.embed_size
    if FLAGS.model == 'kfm':
        self.model_param['unit_kernel'] = FLAGS.unit_kernel
        self.model_param['fix_kernel'] = FLAGS.fix_kernel
        self.model_param['l2_kernel'] = FLAGS.l2_kernel
        self.model_param['kernel_type'] = FLAGS.kernel_type
    self.dump_config()

    # create graph
    tf.reset_default_graph()
    # load dataset
    self.dataset = as_dataset(FLAGS.dataset)
    # build model
    with tf.device('/gpu:0'):
        with tf.variable_scope(tf.get_variable_scope()):
            self.global_step = tf.get_variable(
                name='global_step', dtype=tf.int32, shape=[],
                initializer=tf.constant_initializer(1), trainable=False)
            self.learning_rate = tf.get_variable(
                name='learning_rate', dtype=tf.float32, shape=[],
                initializer=tf.constant_initializer(FLAGS.learning_rate),
                trainable=False)
            self.opt = get_optimizer(FLAGS.optimizer, self.learning_rate)
            self.model = as_model(FLAGS.model,
                                  input_dim=self.dataset.num_features,
                                  num_fields=self.dataset.num_fields,
                                  **self.model_param)
            tf.get_variable_scope().reuse_variables()
            self.train_op = self.opt.minimize(self.model.loss,
                                              global_step=self.global_step)
            self.saver = tf.train.Saver()

    def sess_op():
        return tf.Session(config=gpu_config)

    train_size = int(self.dataset.train_size * (1 - FLAGS.val_ratio))
    self.num_steps = int(np.ceil(train_size / FLAGS.batch_size))
    self.eval_steps = self.num_steps

    # start training
    with sess_op() as self.sess:
        print('Train size = %d, Batch size = %d' %
              (self.dataset.train_size, FLAGS.batch_size))
        print('%d rounds in total, One round = %d steps, One evaluation = %d steps' %
              (FLAGS.num_rounds, self.num_steps, self.eval_steps))
        # data generators
        self.train_gen = self.dataset.batch_generator(self.train_data_param)
        self.valid_gen = self.dataset.batch_generator(self.valid_data_param)
        self.test_gen = self.dataset.batch_generator(self.test_data_param)
        # summary writers
        self.train_writer = tf.summary.FileWriter(logdir=self.train_logdir,
                                                  graph=self.sess.graph,
                                                  flush_secs=30)
        self.test_writer = tf.summary.FileWriter(logdir=self.test_logdir,
                                                 graph=self.sess.graph,
                                                 flush_secs=30)
        self.valid_writer = tf.summary.FileWriter(logdir=self.valid_logdir,
                                                  graph=self.sess.graph,
                                                  flush_secs=30)
        # init model
        if not FLAGS.restore:
            self.sess.run(tf.global_variables_initializer())
        else:
            checkpoint_state = tf.train.get_checkpoint_state(self.ckpt_dir)
            if checkpoint_state and checkpoint_state.model_checkpoint_path:
                self.saver.restore(self.sess, checkpoint_state.model_checkpoint_path)
                print('Restore model from:', checkpoint_state.model_checkpoint_path)
                print('Run initial evaluation...')
                self.evaluate(self.test_gen, self.test_writer)
            else:
                print('Restore failed')

        # init check: run inference on a few test batches
        print('Initial evaluation')
        cnt = 0
        for xs, ys in self.test_gen:
            feed_dict = {self.model.inputs: xs, self.model.labels: ys}
            if self.model.training is not None:
                feed_dict[self.model.training] = False
            self.sess.run(fetches=self.model.preds, feed_dict=feed_dict)
            cnt += 1
            if cnt == 100:
                break

        self.begin_step = self.global_step.eval(self.sess)
        self.step = self.begin_step
        self.start_time = time.time()
        for r in range(1, FLAGS.num_rounds + 1):
            print('Round: %d' % r)
            for batch_xs, batch_ys in self.train_gen:
                fetches = [self.train_op, self.global_step]
                train_feed = {}
                fetches += [self.model.loss, self.model.log_loss, self.model.l2_loss]
                train_feed[self.model.inputs] = batch_xs
                train_feed[self.model.labels] = batch_ys
                if self.model.training is not None:
                    train_feed[self.model.training] = True
                _, self.step, _loss_, _log_loss_, _l2_loss_ = self.sess.run(
                    fetches=fetches, feed_dict=train_feed)
                if self.step % FLAGS.log_frequency == 0:
                    elapsed_time = self.get_elapsed()
                    print('Done step %d, Elapsed: %.2fs, Train-Loss: %.4f, '
                          'Log-Loss: %.4f, L2-Loss: %g' %
                          (self.step, elapsed_time, _loss_, _log_loss_, _l2_loss_))
                    summary = tf.Summary(value=[
                        tf.Summary.Value(tag='loss', simple_value=_loss_),
                        tf.Summary.Value(tag='log_loss', simple_value=_log_loss_),
                        tf.Summary.Value(tag='l2_loss', simple_value=_l2_loss_),
                    ])
                    self.train_writer.add_summary(summary, global_step=self.step)
                    self.saver.save(self.sess,
                                    os.path.join(self.logdir, 'checkpoints', 'model.ckpt'),
                                    self.step)
            print('Round %d finished, Elapsed: %s' % (r, self.get_timedelta()))
            self.evaluate(self.test_gen, submission=r)
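
# Hedged sketch (an assumption, not the original method): `self.evaluate` above is
# assumed to sweep a generator, collect predictions, report log-loss / AUC, and
# optionally write a submission file numbered by the round. The metric choices and
# the use of sklearn.metrics / numpy here are illustrative.
def evaluate_sketch(self, gen, writer=None, submission=0):
    """Run inference over `gen`, print metrics, and optionally dump a submission."""
    from sklearn.metrics import log_loss, roc_auc_score
    labels, preds = [], []
    for xs, ys in gen:
        feed = {self.model.inputs: xs, self.model.labels: ys}
        if self.model.training is not None:
            feed[self.model.training] = False
        preds.append(self.sess.run(self.model.preds, feed_dict=feed))
        labels.append(ys)
    labels = np.concatenate(labels).ravel()
    preds = np.concatenate(preds).ravel()
    print('Eval-Log-Loss: %.4f, Eval-AUC: %.4f' %
          (log_loss(labels, preds), roc_auc_score(labels, preds)))
    if submission:
        np.savetxt(self.sub_file % submission, preds, fmt='%.6f')
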
import time
import os
import sys

import __init__

sys.path.append(__init__.config['data_path'])  # add your data path here

from datasets import as_dataset
from tf_trainer import Trainer
from tf_models import AutoFM
import tensorflow as tf
import traceback
import random

seeds = [
    0x0123, 0x4567, 0x3210, 0x7654, 0x89AB, 0xCDEF, 0xBA98, 0xFEDC,
    0x0123, 0x4567, 0x3210, 0x7654, 0x89AB, 0xCDEF, 0xBA98, 0xFEDC
]

data_name = 'avazu'
# datasets from https://github.com/Atomu2014/Ads-RecSys-Datasets are used here
dataset = as_dataset(data_name)
backend = 'tf'
batch_size = 2000

train_data_param = {
    'gen_type': 'train',
    'random_sample': True,
    'batch_size': batch_size,
    'split_fields': False,
    'on_disk': True,
    'squeeze_output': True,
}
test_data_param = {
    'gen_type': 'test',
    'random_sample': False,
    'batch_size': batch_size,