def construct_graph(self, sess):
    with sess.graph.as_default():
        # Set the random seed for tensorflow
        tf.set_random_seed(cfg.RNG_SEED)
        # Build the main computation graph
        layers = self.net.create_architecture(True)  # is_training flag: True
        # Define the loss
        loss = layers['total_loss']

        path_iter = self.pretrained_model.split('.ckpt')[0]
        iter_num = path_iter.split('_')[-1]
        if cfg.TRAIN_MODULE_CONTINUE == 1:    # continue from iter_ckpt
            global_step = tf.Variable(int(iter_num), trainable=False)
        elif cfg.TRAIN_MODULE_CONTINUE == 2:  # start from iter 0
            global_step = tf.Variable(0, trainable=False)

        first_decay_steps = 2 * len(self.Trainval_GT)  # needs testing; first cycle lasts 2 epochs
        lr = cosine_decay_restarts(cfg.TRAIN.LEARNING_RATE * 10, global_step,
                                   first_decay_steps, t_mul=2.0, m_mul=1.0, alpha=0.0)
        self.optimizer = tf.train.MomentumOptimizer(lr, cfg.TRAIN.MOMENTUM)

        list_var_to_update = tf.trainable_variables()
        for var in list_var_to_update:
            print(var.name)

        grads_and_vars = self.optimizer.compute_gradients(loss, list_var_to_update)
        # Clip each gradient to an L2 norm of at most 1 to guard against exploding gradients
        capped_gvs = [(tf.clip_by_norm(grad, 1.), var) for grad, var in grads_and_vars]
        train_op = self.optimizer.apply_gradients(capped_gvs, global_step=global_step)

        self.saver = tf.train.Saver(max_to_keep=cfg.TRAIN.SNAPSHOT_KEPT)
        # Write the train and validation information to tensorboard
        self.writer = tf.summary.FileWriter(self.tbdir, sess.graph)

    return lr, train_op
def testDecay(self):
    num_training_steps = 1000
    initial_lr = 1.0
    for step in range(0, 1500, 250):
        decayed_lr = learning_rate_decay.cosine_decay_restarts(
            initial_lr, step, num_training_steps)
        expected = self.np_cosine_decay_restarts(step, num_training_steps)
        self.assertAllClose(self.evaluate(decayed_lr), expected, 1e-6)
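# The np_cosine_decay_restarts helper that the test above compares against is not
# shown in this excerpt. Below is a minimal sketch of such a reference, written
# from the documented behaviour of cosine decay with warm restarts; the real
# helper in the test suite may be organised differently.
import math

def np_cosine_decay_restarts_sketch(step, decay_steps, t_mul=2.0, m_mul=1.0,
                                    alpha=0.0, initial_lr=1.0):
    # Walk forward through the restart cycles: each cycle is t_mul times longer
    # than the previous one, and each restart scales the peak lr by m_mul.
    fac = 1.0
    while step >= decay_steps:
        step -= decay_steps
        decay_steps *= t_mul
        fac *= m_mul
    completed_fraction = step / decay_steps
    decayed = fac * 0.5 * (1.0 + math.cos(math.pi * completed_fraction))
    return initial_lr * ((1.0 - alpha) * decayed + alpha)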
def construct_graph(self, sess):
    with sess.graph.as_default():
        # Set the random seed for tensorflow
        tf.set_random_seed(cfg.RNG_SEED)
        # Build the main computation graph
        layers = self.net.create_architecture(True)  # is_training flag: True
        # Define the loss
        loss = layers['total_loss']

        path_iter = self.pretrained_model.split('.ckpt')[0]
        iter_num = path_iter.split('_')[-1]
        if cfg.TRAIN_MODULE_CONTINUE == 1:    # continue from iter_ckpt
            global_step = tf.Variable(int(iter_num), trainable=False)
        elif cfg.TRAIN_MODULE_CONTINUE == 2:  # start from iter 0
            global_step = tf.Variable(0, trainable=False)

        # lr = tf.train.exponential_decay(cfg.TRAIN.LEARNING_RATE * 10, global_step,
        #                                 cfg.TRAIN.STEPSIZE * 5, cfg.TRAIN.GAMMA, staircase=True)
        # Here we use a cosine lr scheme instead:
        first_decay_steps = 2 * len(self.Trainval_GT)  # first cycle lasts 2 epochs
        lr = cosine_decay_restarts(cfg.TRAIN.LEARNING_RATE * 10, global_step,
                                   first_decay_steps, t_mul=2.0, m_mul=1.0, alpha=0.0)
        self.optimizer = tf.train.MomentumOptimizer(lr, cfg.TRAIN.MOMENTUM)

        # 1--update all parameters, 2--only update D,
        # 3--update H+O+SP, 4--update everything except the S (fc) classifiers
        list_var_to_update = []
        if cfg.TRAIN_MODULE_UPDATE == 1:
            list_var_to_update = tf.trainable_variables()
        if cfg.TRAIN_MODULE_UPDATE == 2:
            list_var_to_update = [var for var in tf.trainable_variables()
                                  if 'fc_binary' in var.name or 'binary_classification' in var.name]
        if cfg.TRAIN_MODULE_UPDATE == 3:
            # 'and' is required here; with 'or' the filter would keep nearly every variable
            list_var_to_update = [var for var in tf.trainable_variables()
                                  if 'fc_binary' not in var.name and 'binary_classification' not in var.name]
        if cfg.TRAIN_MODULE_UPDATE == 4:
            list_var_to_update = [var for var in tf.trainable_variables()
                                  if 'classification' not in var.name]

        grads_and_vars = self.optimizer.compute_gradients(loss, list_var_to_update)
        capped_gvs = [(tf.clip_by_norm(grad, 1.), var) for grad, var in grads_and_vars]
        train_op = self.optimizer.apply_gradients(capped_gvs, global_step=global_step)

        self.saver = tf.train.Saver(max_to_keep=cfg.TRAIN.SNAPSHOT_KEPT)
        # Write the train and validation information to tensorboard
        self.writer = tf.summary.FileWriter(self.tbdir, sess.graph)

    return lr, train_op
def testTMul(self):
    num_training_steps = 1000
    initial_lr = 1.0
    t_mul = 1.0
    for step in range(0, 1500, 250):
        with self.test_session():
            decayed_lr = learning_rate_decay.cosine_decay_restarts(
                initial_lr, step, num_training_steps, t_mul=t_mul)
            expected = self.np_cosine_decay_restarts(step, num_training_steps,
                                                     t_mul=t_mul)
            self.assertAllClose(decayed_lr.eval(), expected, 1e-6)
def construct_graph(self, sess):
    with sess.graph.as_default():
        tf.set_random_seed(cfg.RNG_SEED)
        layers = self.net.create_architecture(True)
        loss = layers['total_loss']

        if cfg.TRAIN_MODULE_CONTINUE == 1:
            path_iter = self.pretrained_model.split('.ckpt')[0]
            iter_num = path_iter.split('_')[-1]
            global_step = tf.Variable(int(iter_num), trainable=False)
        if cfg.TRAIN_MODULE_CONTINUE == 2:
            global_step = tf.Variable(0, trainable=False)

        first_decay_steps = 2 * len(self.Trainval_GT)
        lr = cosine_decay_restarts(cfg.TRAIN.LEARNING_RATE * 10, global_step,
                                   first_decay_steps, t_mul=cfg.LR_DECAY.T_MUL,
                                   m_mul=cfg.LR_DECAY.M_MUL, alpha=0.0)
        self.optimizer = tf.train.GradientDescentOptimizer(lr)

        list_var_to_update = []
        if cfg.TRAIN_MODULE_UPDATE == 1:
            list_var_to_update = tf.trainable_variables()
        elif cfg.TRAIN_MODULE_UPDATE == 2:
            update_keys = ['Att_sp', 'bottleneck_sp', 'conv1_sp', 'conv2_sp',
                           'Concat_SHsp', 'fc7_SHsp', 'body_to_head',
                           'attention_3D', 'triplet_align',
                           'space_classification', 'joint_classification']
            list_var_to_update = [var for var in tf.trainable_variables()
                                  if any(key in var.name for key in update_keys)]

        grads_and_vars = self.optimizer.compute_gradients(loss, list_var_to_update)
        capped_gvs = [(tf.clip_by_norm(grad, 1.), var) for grad, var in grads_and_vars]
        train_op = self.optimizer.apply_gradients(capped_gvs, global_step=global_step)

        self.saver = tf.train.Saver(max_to_keep=cfg.TRAIN.SNAPSHOT_KEPT)
        self.writer = tf.summary.FileWriter(self.tbdir, sess.graph)

    return lr, train_op
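# The snippet above reads t_mul / m_mul from cfg.LR_DECAY, a project-specific
# config section that is not shown in this excerpt. A minimal sketch of what such
# a section could look like, assuming the EasyDict-based cfg common to codebases
# of this kind (names and values are illustrative assumptions, not the project's
# real configuration):
from easydict import EasyDict as edict

cfg = edict()
cfg.LR_DECAY = edict()
cfg.LR_DECAY.T_MUL = 2.0  # each cosine cycle lasts twice as long as the previous one
cfg.LR_DECAY.M_MUL = 1.0  # each restart returns to the full initial learning rate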
def get_optimzer_lr(self, global_step, step_factor):
    stepsize = int(cfg.TRAIN.STEPSIZE * step_factor)
    gamma = cfg.TRAIN.GAMMA
    epoch_iters = get_epoch_iters(self.net.model_name)
    stepsize = epoch_iters * 2  # overrides the step_factor-based value above

    # Default: staircase exponential decay with momentum
    lr = tf.train.exponential_decay(cfg.TRAIN.LEARNING_RATE * 10, global_step,
                                    stepsize, gamma, staircase=True)
    optimizer = tf.train.MomentumOptimizer(lr, cfg.TRAIN.MOMENTUM)

    if 'cosine' in self.net.model_name:
        print('cosine =========')
        first_decay_steps = epoch_iters * 10  # first cycle lasts 10 epochs
        from tensorflow.python.training.learning_rate_decay import cosine_decay_restarts
        lr = cosine_decay_restarts(cfg.TRAIN.LEARNING_RATE * 10, global_step,
                                   first_decay_steps, t_mul=2.0, m_mul=0.9,
                                   alpha=cfg.TRAIN.LEARNING_RATE * 0.1)
        optimizer = tf.train.MomentumOptimizer(lr, cfg.TRAIN.MOMENTUM)
    elif 'zsrare' in self.net.model_name:  # rare first
        lr = tf.train.exponential_decay(cfg.TRAIN.LEARNING_RATE * 10, global_step,
                                        int(cfg.TRAIN.STEPSIZE * 2), gamma,
                                        staircase=True)
        optimizer = tf.train.MomentumOptimizer(lr, cfg.TRAIN.MOMENTUM)
    elif 'zsnrare' in self.net.model_name:  # non-rare first
        lr = tf.train.exponential_decay(cfg.TRAIN.LEARNING_RATE * 10, global_step,
                                        int(cfg.TRAIN.STEPSIZE * step_factor), gamma,
                                        staircase=True)
        optimizer = tf.train.MomentumOptimizer(lr, cfg.TRAIN.MOMENTUM)

    return lr, optimizer
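# A quick standalone look (TF 1.x) at the restart behaviour of the cosine schedule
# selected above: with t_mul=2.0 the cycles span first_decay_steps, then 2x, 4x, ...
# steps, and with m_mul=0.9 each restart begins at 0.9 of the previous peak.
# The numbers below are illustrative only, not the project's real configuration.
import tensorflow as tf
from tensorflow.python.training.learning_rate_decay import cosine_decay_restarts

with tf.Session() as sess:
    # Cycle boundaries fall at steps 1000, 3000, 7000, ...
    for step in [0, 500, 999, 1000, 2999, 3000, 6999]:
        lr_t = cosine_decay_restarts(0.01, step, first_decay_steps=1000,
                                     t_mul=2.0, m_mul=0.9, alpha=0.0)
        print(step, sess.run(lr_t))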
def construct_graph(self, sess):
    with sess.graph.as_default():
        # Set the random seed for tensorflow
        tf.compat.v1.set_random_seed(cfg.RNG_SEED)
        # Build the main computation graph
        layers = self.net.create_architecture(True)  # is_training flag: True
        # Define the loss
        loss = layers['total_loss']

        # Get the global_step
        if cfg.TRAIN_MODULE_CONTINUE == 1:  # continue from iter_ckpt
            path_iter = self.pretrained_model.split('.ckpt')[0]
            iter_num = path_iter.split('_')[-1]
            global_step = tf.Variable(int(iter_num), trainable=False)
        elif cfg.TRAIN_MODULE_CONTINUE == 2:  # start from iter 0
            global_step = tf.Variable(0, trainable=False)

        # Derive the lr from global_step.
        # Exponentially decaying lr:
        # lr = tf.train.exponential_decay(cfg.TRAIN.LEARNING_RATE * 10, global_step,
        #                                 cfg.TRAIN.STEPSIZE * 5, cfg.TRAIN.GAMMA, staircase=True)
        # Cosine-decaying lr:
        first_decay_steps = 80000  # number of steps in the first full decay cycle
        # t_mul: each cycle lasts t_mul times as many steps as the previous one;
        # m_mul: each cycle restarts at m_mul times the previous cycle's initial lr
        t_mul, m_mul = 2.0, 1.0
        lr = cosine_decay_restarts(cfg.TRAIN.LEARNING_RATE * 10, global_step,
                                   first_decay_steps, t_mul, m_mul, alpha=0.0)

        # Define the optimizer, using the Momentum algorithm
        self.optimizer = tf.compat.v1.train.MomentumOptimizer(lr, cfg.TRAIN.MOMENTUM)

        # 1--update all parameters, 2--only update D,
        # 3--update H+O+SP, 4--update everything except the S (fc) classifiers
        list_var_to_update = []
        if cfg.TRAIN_MODULE_UPDATE == 1:
            list_var_to_update = tf.compat.v1.trainable_variables()
        if cfg.TRAIN_MODULE_UPDATE == 2:
            list_var_to_update = [
                var for var in tf.compat.v1.trainable_variables()
                if 'fc_binary' in var.name or 'binary_classification' in var.name
            ]
        if cfg.TRAIN_MODULE_UPDATE == 3:
            # 'and' is required here; with 'or' the filter would keep nearly every variable
            list_var_to_update = [
                var for var in tf.compat.v1.trainable_variables()
                if 'fc_binary' not in var.name and 'binary_classification' not in var.name
            ]
        if cfg.TRAIN_MODULE_UPDATE == 4:
            list_var_to_update = [
                var for var in tf.compat.v1.trainable_variables()
                if 'classification' not in var.name
            ]

        # Compute the gradients of each trainable variable
        grads_and_vars = self.optimizer.compute_gradients(loss, list_var_to_update)
        # Clip each gradient to an L2 norm of at most 1 to prevent exploding gradients
        capped_gvs = [(tf.clip_by_norm(grad, 1.), var)
                      for grad, var in grads_and_vars]
        # Apply the computed gradients to the variables
        train_op = self.optimizer.apply_gradients(capped_gvs,
                                                  global_step=global_step)

        # Create a Saver for saving the model; it is used in snapshot().
        # max_to_keep caps the number of checkpoints kept: once exceeded, each new
        # checkpoint deletes the oldest one; in this program max_to_keep=None (keep all)
        self.saver = tf.compat.v1.train.Saver(max_to_keep=cfg.TRAIN.SNAPSHOT_KEPT)
        # Create a Writer that writes training summaries to tensorboard
        self.writer = tf.compat.v1.summary.FileWriter(self.tbdir, sess.graph)

    return lr, train_op
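# The clipping above uses tf.clip_by_norm, which rescales a tensor only when its
# L2 norm exceeds the threshold; it caps the norm rather than normalizing to 1.
# A tiny self-contained demonstration (TF 1.x):
import tensorflow as tf

with tf.Session() as sess:
    small = tf.constant([0.3, 0.4])  # norm 0.5 <= 1, passes through unchanged
    large = tf.constant([3.0, 4.0])  # norm 5.0 > 1, rescaled down to norm 1
    print(sess.run(tf.clip_by_norm(small, 1.0)))  # [0.3 0.4]
    print(sess.run(tf.clip_by_norm(large, 1.0)))  # [0.6 0.8]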
def train(args):
    # Start by getting the data_loader object
    data_loader = TextLoader(args.reverse, args.data_dir, args.test_split,
                             args.batch_size, args.seq_length, args.input_encoding)

    # Some informative prints
    args.vocab_size = data_loader.vocab_size
    print("Train size: ", data_loader.num_batches * args.batch_size)
    if args.test_split > 0:
        print("Test size: ", data_loader.test_num_batches * args.batch_size)
    print("Vocab size: ", args.vocab_size)

    # Check compatibility if training is continued from a previously saved model
    if args.init_from is not None:
        # Check that all necessary files exist
        assert os.path.isdir(args.init_from), " %s must be a path" % args.init_from
        assert os.path.isfile(os.path.join(args.init_from, "config.pkl")), \
            "config.pkl file does not exist in path %s" % args.init_from
        assert os.path.isfile(os.path.join(args.init_from, "words_vocab.pkl")), \
            "words_vocab.pkl file does not exist in path %s" % args.init_from
        ckpt = tf.train.get_checkpoint_state(args.init_from)
        assert ckpt, "No checkpoint found"
        assert ckpt.model_checkpoint_path, "No model path found in checkpoint"

        # Open the old config and check that the models are compatible
        with open(os.path.join(args.init_from, 'config.pkl'), 'rb') as f:
            saved_model_args = cPickle.load(f)
        need_be_same = ["model", "rnn_size", "num_layers", "seq_length"]
        for checkme in need_be_same:
            assert vars(saved_model_args)[checkme] == vars(args)[checkme], \
                "Command line argument and saved model disagree on '%s'" % checkme

        # Open the saved vocab/dict and check that the vocabs/dicts are compatible
        with open(os.path.join(args.init_from, 'words_vocab.pkl'), 'rb') as f:
            saved_words, saved_vocab = cPickle.load(f)
        assert saved_words == data_loader.words, \
            "Data and loaded model disagree on word set!"
        assert saved_vocab == data_loader.vocab, \
            "Data and loaded model disagree on dictionary mappings!"
    # Persist the args and vocab so a saved model can be reloaded later
    with open(os.path.join(args.save_dir, 'config.pkl'), 'wb') as f:
        cPickle.dump(args, f)
    with open(os.path.join(args.save_dir, 'words_vocab.pkl'), 'wb') as f:
        cPickle.dump((data_loader.words, data_loader.vocab), f)

    # Start up the model
    model = Model(args)

    # If a test split is requested, get it
    if args.test_split > 0:
        test_x = data_loader.test_x
        test_y = data_loader.test_y

    # Set up summaries, the tensorboard writer, and GPU memory options
    merged = tf.summary.merge_all()
    train_writer = tf.summary.FileWriter(args.log_dir)
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=args.gpu_mem)

    # Begin the session for training
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        # Take a look at the learning rate schedule, if so desired
        # (note: the parameters here should match the ones used below)
        plot = False
        if plot:
            n = args.num_epochs * data_loader.num_batches
            n = 150000  # overrides the value above for a fixed-length preview
            x = np.arange(n)
            y = cosine_decay_restarts(
                args.learning_rate,
                x,      # shift down every epoch
                50000,  # check out this sweet graph https://github.com/tensorflow/tensorflow/pull/11749
                .9,     # doesn't hurt to look at the tf docs too
                .1,
                1e-12).eval()
            plt.figure()
            plt.plot(x, y)
            plt.title("Learning rate schedule")
            plt.show()

        # Log the graph, initialize all variables, and create the saver
        train_writer.add_graph(sess.graph)
        tf.global_variables_initializer().run()
        saver = tf.train.Saver(tf.global_variables())

        # Initialize from a previous model OR start from scratch,
        # which means grabbing GloVe embeddings
        if args.init_from is not None:
            saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            print("Loading my knowledge of the English language...")
            embeddings = data_loader.get_embeddings()
            sess.run([model.embedding_init],
                     {model.embedding_placeholder: embeddings})

        # Iterate over the range of epochs specified
        for e in range(model.epoch_pointer.eval(), args.num_epochs):
            # sess.run(tf.assign(model.lr, args.learning_rate * (args.decay_rate ** e)))  # vanilla exponential decay
            # Learning rate decay is cosine annealing with warm restarts
            sess.run(tf.assign(
                model.lr,
                cosine_decay_restarts(
                    args.learning_rate,
                    e * data_loader.num_batches,  # shift down every epoch
                    20000,  # check out this sweet graph https://github.com/tensorflow/tensorflow/pull/11749
                    1,      # doesn't hurt to look at the tf docs too
                    .1,
                    1e-12)))

            # Reset the pointer to start from the beginning
            data_loader.reset_batch_pointer()
            state = sess.run(model.initial_state)
            speed = 0
            if args.init_from is None:
                assign_op = model.epoch_pointer.assign(e)
                sess.run(assign_op)
            if args.init_from is not None:
                data_loader.pointer = model.batch_pointer.eval()
                args.init_from = None

            # Iterate over the batches in the dataset
            for b in range(data_loader.pointer, data_loader.num_batches):
                start = time.time()
                x, y = data_loader.next_batch()
                # The feed dictionary gets passed to the model
                # when tensorflow variables are computed
                feed = {
                    model.input_data: x,
                    model.targets: y,
                    model.initial_state: state,
                    model.batch_time: speed,
                    model.dropout: args.dropout
                }
                # Ops to be run, either with or without training the word embeddings
                run_list_full = [merged, model.cost, model.final_state,
                                 model.train_op, model.inc_batch_pointer_op]
                run_list_no_W = [merged, model.cost, model.final_state,
                                 model.train_op_no_W, model.inc_batch_pointer_op]
                if args.trainable_embeddings == 1:
                    # YES, train the embeddings
                    summary, train_loss, state, _, _ = sess.run(run_list_full, feed)
                elif args.trainable_embeddings == 0:
                    # NO, do not train the embeddings (train_op_no_W)
                    summary, train_loss, state, _, _ = sess.run(run_list_no_W, feed)
                elif e > args.trainable_embeddings:
                    # It's been e epochs, so start training the embeddings
                    summary, train_loss, state, _, _ = sess.run(run_list_full, feed)
                else:
                    # It hasn't been e epochs yet, don't train the embeddings
                    summary, train_loss, state, _, _ = sess.run(run_list_no_W, feed)

                # Diagnostics get printed and the model gets saved here
                train_writer.add_summary(summary, e * data_loader.num_batches + b)
                speed = time.time() - start
                if (e * data_loader.num_batches + b) % args.batch_size == 0:
                    print("{}/{} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}"
                          .format(e * data_loader.num_batches + b,
                                  args.num_epochs * data_loader.num_batches,
                                  e, train_loss, speed))
                if (e * data_loader.num_batches + b) % args.save_every == 0 \
                        or (e == args.num_epochs - 1 and b == data_loader.num_batches - 1):
                    # Save for the last result
                    checkpoint_path = os.path.join(args.save_dir, 'model.ckpt')
                    saver.save(sess, checkpoint_path,
                               global_step=e * data_loader.num_batches + b)
                    print("model saved to {}".format(checkpoint_path))

            print("learning rate: ", model.lr.eval())

            # Test loss eval: evaluates batch by batch
            # with the same batch size as used for training
            if args.test_split > 0:
                test_loss = 0
                batches_in_test = len(test_x)
                save_state = state
                state = sess.run(model.initial_state)
                for i in range(batches_in_test):
                    feed = {
                        model.test_x: test_x[i],
                        model.test_y: test_y[i],
                        model.initial_state: state
                    }
                    loss, state, _ = sess.run([model.test_cost,
                                               model.test_final_state,
                                               model.inc_batch_pointer_op], feed)
                    test_loss += loss
                test_loss = test_loss / batches_in_test
                state = save_state
                print("test_loss = {:.3f}".format(test_loss))

        # One final evaluation over the entire dataset to check the loss
        data_loader.reset_batch_pointer()
        state = sess.run(model.initial_state)
        ovr_loss = 0
        start = time.time()
        for b in range(data_loader.pointer, data_loader.num_batches):
            x, y = data_loader.next_batch()
            feed = {
                model.input_data: x,
                model.targets: y,
                model.initial_state: state
            }
            train_loss, state, _ = sess.run([model.cost, model.final_state,
                                             model.inc_batch_pointer_op], feed)
            ovr_loss += train_loss
        speed = time.time() - start
        print("ovr_train_loss = {:.3f}, time_to_eval = {:.3f}".format(
            ovr_loss / data_loader.num_batches, speed))

        # Lets you initialize (save) a model without training it; the epoch loop
        # never runs in this case, so build the checkpoint path here
        if args.num_epochs == 0:
            checkpoint_path = os.path.join(args.save_dir, 'model.ckpt')
            saver.save(sess, checkpoint_path, global_step=0)
            print("model saved to {}".format(checkpoint_path))

        train_writer.close()