def train(self):
    """Run the main SR training loop: build graph, optimize, periodically eval/save."""
    # Queue-based producer yielding (low-res, high-res) training batches.
    LR, HR = self.double_input_producer()
    global_step = tf.Variable(initial_value=0, trainable=False)
    self.global_step = global_step
    self.build()
    # Linear decay (power=1.) from self.learning_rate down to self.end_lr.
    lr = tf.train.polynomial_decay(self.learning_rate,
                                   global_step,
                                   self.decay_step,
                                   end_learning_rate=self.end_lr,
                                   power=1.)
    vars_all = tf.trainable_variables()
    print('Params num of all:', get_num_params(vars_all))
    training_op = tf.train.AdamOptimizer(lr).minimize(
        self.loss, var_list=vars_all, global_step=global_step)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)  #sess=tf.Session()
    self.sess = sess
    sess.run(tf.global_variables_initializer())
    self.saver = tf.train.Saver(max_to_keep=100, keep_checkpoint_every_n_hours=1)
    if self.reload:
        self.load(sess, self.save_dir)
    # Start the input-queue threads that feed LR/HR.
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)

    cost_time = 0
    start_time = time.time()
    gs = sess.run(global_step)  # step we resume from (0 on a fresh run)
    for step in range(sess.run(global_step), self.max_step):
        # loss_v is assigned by the sess.run at the bottom of the loop, so the
        # `step > gs` guard keeps this print from running before the first step.
        if step > gs and step % 20 == 0:
            print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
                  'Step:{}, loss:{}'.format(step, loss_v))
        # Every 500 steps: checkpoint (except at the resume step) and evaluate.
        if step % 500 == 0:
            if step > gs:
                self.save(sess, self.save_dir, step)
            cost_time = time.time() - start_time
            print('cost {}s.'.format(cost_time))
            self.eval()
            cost_time = time.time() - start_time
            start_time = time.time()
            print('cost {}s.'.format(cost_time))
        # One optimization step on a freshly dequeued batch.
        lr1, hr = sess.run([LR, HR])
        _, loss_v = sess.run([training_op, self.loss],
                             feed_dict={
                                 self.L: lr1,
                                 self.H: hr,
                                 self.is_train: True
                             })
        # Bail out if training has clearly diverged.
        if step > 500 and loss_v > 10:
            print('Model collapsed with loss={}'.format(loss_v))
            break
def evaluate():
    """Restore the trained model and run it over the standard benchmark sets."""
    # Benchmark datasets, loaded at the requested scale.
    datasets = ['Set5', 'Set14', 'B100', 'Urban100', 'Manga109']
    loader.load_test_datasets(datasets, [scale])
    with tf.Graph().as_default():
        # Build the inference graph and report its parameter count.
        model.model_compile(np.array(loader.data_mean), scale)
        num_params = get_num_params()
        print("======== %s [X%d, param = %d] ========" % (model.name, scale, num_params))
        # Session config: log placements, allow soft placement.
        config = tf.ConfigProto()
        config.log_device_placement = True
        config.allow_soft_placement = True
        with tf.Session(config=config) as sess:
            # Restore the checkpoint for this model/scale from disk.
            ckpt_dir = os.path.join(record_dir, model.name, "X%d" % scale, 'train_logs')
            ckpt_path = os.path.join(ckpt_dir, 'model.ckpt-X%d' % scale)
            model.saver.restore(sess, ckpt_path)
            # Evaluate every scale variant of every benchmark dataset.
            for dataset_name, dataset in list(loader.test_datasets.items()):
                print("\n%s" % dataset_name)
                for scale_key in list(dataset.keys()):
                    test_one_dataset(model, dataset[scale_key], scale_key, dataset_name)
            print("Done!")
def evaluate(self,
             load_path=general_config.load_path_test,
             validFile=None,
             vocab2intPath=None):
    """Evaluate the model on one dataset pass, restoring weights from load_path.

    Returns (avg_loss, avg_accuracy), each averaged over evaluation batches.
    """
    # Fall back to the training file/vocab when no validation set is given.
    if validFile is None or vocab2intPath is None:
        validFile = general_config.training_file
        vocab2intPath = general_config.global_nonstatic_v2i_path
    train_generator = PaddedDataIterator(loadPath=validFile,
                                         vocab2intPath=vocab2intPath,
                                         sent_len_cut=self.min_len)
    # If load_path points at a file, log next to its parent checkpoint dir.
    load_dir = load_path if os.path.isdir(load_path) else os.path.dirname(load_path)
    log_dir = load_dir.replace("checkpoints", "logs")
    logger = my_logger(log_dir + "/log_evaluate.txt")
    os.environ['CUDA_VISIBLE_DEVICES'] = str(0)
    config = tf.ConfigProto()
    config.gpu_options.per_process_gpu_memory_fraction = 0.8
    with tf.Session(config=config, graph=self.graph) as sess:
        logger.info("Loading model...")
        saver = tf.train.Saver()
        if os.path.isdir(load_path):
            # Directory: restore the latest checkpoint recorded in it.
            ckpt = tf.train.get_checkpoint_state(load_path)
            saver.restore(sess, ckpt.model_checkpoint_path)
            # Epoch number is the suffix after the last '-' in the ckpt path.
            global_step = ckpt.model_checkpoint_path.split("-")[-1]
        else:
            saver.restore(sess, load_path)
            global_step = load_path.split("-")[-1]
        logger.info("Loading successfully, loading epoch is %s" % global_step)
        logger.info("The total number of trainable variables: %s" % get_num_params())
        # One full pass: generator.loop increments when the data wraps around.
        cur_loop = train_generator.loop
        cur_count = 0
        avg_loss_t, avg_accuracy_t = 0., 0.
        _, batch_seqs, batch_labels, batch_lens = train_generator.next(
            1024, need_all=True)
        while (train_generator.loop == cur_loop):
            cur_count += 1
            loss_t, acc_t = sess.run([self.loss_op, self.acc_op],
                                     feed_dict=self._feed_dict_valid(
                                         batch_seqs, batch_labels, batch_lens))
            avg_loss_t += loss_t
            avg_accuracy_t += acc_t
            _, batch_seqs, batch_labels, batch_lens = train_generator.next(
                1024, need_all=True)
        # NOTE(review): per-batch average — a final smaller batch is weighted
        # the same as full batches.
        avg_loss_t /= cur_count
        avg_accuracy_t /= cur_count
        logger.info("Loss: %.4f, Accuracy: %.4f " % (avg_loss_t, avg_accuracy_t))
        return avg_loss_t, avg_accuracy_t
def fit(self, trainFile=None, with_validation=general_config.with_validation,
        log_dir=general_config.log_dir + "/TextCNN",
        save_dir=general_config.save_dir + "/TextCNN",
        load_path=general_config.load_path_train,
        num_epochs=general_config.num_epochs,
        steps_every_epoch=general_config.steps_every_epoch,
        batch_size=general_config.batch_size,
        learning_rate=general_config.learning_rate,
        lr_changing=general_config.lr_changing,
        min_learning_rate=general_config.min_learning_rate,
        learning_rate_decay=general_config.learning_rate_decay,
        save_epochs=general_config.save_epochs,
        early_stopping=general_config.early_stopping,
        num_visual=general_config.num_visualize):
    """Train the TextCNN model, optionally with validation and early stopping.

    Returns (train_loss, train_accuracy, valid_loss, valid_accuracy) — one
    entry per completed epoch (the valid_* lists stay empty when
    with_validation is False).

    Fix: the epoch parsed from a restored checkpoint path is now converted to
    int — it previously stayed a str, so `range(start, num_epochs)` and the
    '%d' log format raised TypeError whenever training resumed from a
    checkpoint.
    """
    self.learning_rate_value = learning_rate
    self.trainFile = trainFile
    self.validFile = None
    self.with_validation = with_validation
    # Resolve default data files depending on whether validation is used.
    if self.trainFile is None:
        if self.with_validation:
            self.trainFile = general_config.train_file
        else:
            self.trainFile = general_config.training_file
    if self.with_validation:
        self.validFile = self.trainFile.replace("train", "valid")
    # Base path used to derive the vocab json files for non-static models.
    tmp = os.path.join(os.path.dirname(self.trainFile),
                       os.path.basename(self.trainFile).replace(".txt", "").split("_")[0])
    if self.model_type in ["static", "multichannel"]:
        self.int2vocabPath = general_config.global_static_i2v_path
        self.vocab2intPath = general_config.global_static_v2i_path
    else:
        self.int2vocabPath = tmp + "_i2v.json"
        self.vocab2intPath = tmp + "_v2i.json"
    # Metadata files for the TensorBoard embedding projector.
    metadataPath = {
        "static": "/home/leechen/code/python/TextSentimentClassification/data_helpers/dataset/training_testing_metadata.tsv"}
    metadataPath["nonstatic"] = "/home/leechen/code/python/TextSentimentClassification/" \
                                + self.vocab2intPath.replace("v2i.json", "metadata.tsv")
    train_loss = []
    train_accuracy = []
    valid_loss = []
    valid_accuracy = []
    # Log files produced during training, and the model checkpoint directory.
    if self.with_validation:
        log_dir = ensure_dir_exist(log_dir + "/" + self.model_type + "/train_valid")
        train_dir = os.path.join(log_dir, "train")
        val_dir = os.path.join(log_dir, "valid")
        save_dir = ensure_dir_exist(save_dir + "/" + self.model_type + "/train_valid")
    else:
        log_dir = ensure_dir_exist(log_dir + "/" + self.model_type + "/train")
        train_dir = os.path.join(log_dir, "train")
        val_dir = None
        save_dir = ensure_dir_exist(save_dir + "/" + self.model_type + "/train")
    # Create the logger and record every hyper-parameter of this run.
    logger = my_logger(log_dir + "/log_fit.txt")
    msg = "\n--filter_size_list: %s\n" % self.filter_size_list \
          + "--filter_num: %s\n" % self.filter_num \
          + "--fc_layer_size_list: %s\n" % self.fc_layer_size_list \
          + "--embedding_size: %s\n" % self.embedding_size \
          + "--dropout: %s\n" % self.dropout_value \
          + "--max_l2_norm: %s\n" % self.max_l2_norm \
          + "--learning_rate: %s\n" % self.learning_rate_value \
          + "--lr_changing: %s\n" % lr_changing \
          + "--min_learning_rate: %s\n" % min_learning_rate \
          + "--learning_rate_decay: %s\n" % learning_rate_decay \
          + "--load_path: %s\n" % load_path \
          + "--num_epochs: %s\n" % num_epochs \
          + "--steps_every_epoch: %s\n" % steps_every_epoch \
          + "--batch_size: %s\n" % batch_size \
          + "--save_epochs: %s\n" % save_epochs \
          + "--early_stopping: %s\n" % early_stopping \
          + "--num_visual: %s\n" % num_visual
    logger.info(msg)
    # Data generators.
    train_generator = PaddedDataIterator(loadPath=self.trainFile,
                                         vocab2intPath=self.vocab2intPath)
    val_generator = None if self.validFile is None else PaddedDataIterator(
        loadPath=self.validFile, vocab2intPath=self.vocab2intPath)
    os.environ["CUDA_VISIBLE_DEVICES"] = str(0)
    config = tf.ConfigProto()
    config.gpu_options.per_process_gpu_memory_fraction = 0.8
    with tf.Session(config=config, graph=self.graph) as sess:
        train_writer = tf.summary.FileWriter(train_dir, sess.graph)
        val_writer = None if val_dir is None else tf.summary.FileWriter(val_dir)
        saver = tf.train.Saver(max_to_keep=5)
        sess.run(tf.global_variables_initializer())
        start = 0
        if isinstance(load_path, str):
            if os.path.isdir(load_path):
                ckpt = tf.train.get_checkpoint_state(load_path)
                saver.restore(sess, ckpt.model_checkpoint_path)
                # FIX: parse the resume epoch as int (was str before).
                start = int(ckpt.model_checkpoint_path.split("-")[-1])
            else:
                saver.restore(sess, load_path)
                start = int(load_path.split("-")[-1])
            logger.info("Loading successfully, loading epoch is %s" % start)
        logger.info("The total number of trainable variables: %s" % get_num_params())
        cur_early_stopping = 0
        cur_max_acc = 0.
        logger.info('******* start training with %d *******' % start)
        epoch = 0
        for epoch in range(start, num_epochs):
            # Learning-rate schedule: decay when the loss went up, mild boost
            # when it clearly went down. The bare except swallows the
            # IndexError raised while train_loss has fewer than two entries.
            if lr_changing:
                try:
                    if (train_loss[-1] > train_loss[-2]):
                        tmp = self.learning_rate_value * learning_rate_decay
                        if (tmp >= min_learning_rate):
                            self.learning_rate_value = tmp
                            logger.info("Learning rate multiplied by %s at epoch %s."
                                        % (learning_rate_decay, epoch + 1))
                    else:
                        if (train_loss[-1] < train_loss[-2] - 0.015):
                            self.learning_rate_value *= 1.05
                            logger.info("Learning rate multiplied by 1.05 at epoch %s."
                                        % (epoch + 1))
                except:
                    pass
            avg_loss_t, avg_accuracy_t = 0, 0
            avg_loss_v, avg_accuracy_v = 0, 0
            # One training epoch = steps_every_epoch mini-batches.
            for step in range(steps_every_epoch):
                _, batch_seqs, batch_labels, _ = train_generator.next(batch_size)
                batch_seqs_ns = None
                if self.model_type == "multichannel":
                    batch_seqs_ns = self._X2X_ns(batch_seqs)
                sess.run(self.train_op,
                         self._feed_dict_train(batch_x=batch_seqs,
                                               batch_y=batch_labels,
                                               batch_x_ns=batch_seqs_ns))
                loss_t, acc_t = sess.run([self.loss_op, self.acc_op],
                                         self._feed_dict_valid(batch_x=batch_seqs,
                                                               batch_y=batch_labels,
                                                               batch_x_ns=batch_seqs_ns))
                avg_loss_t += loss_t
                avg_accuracy_t += acc_t
            avg_loss_t /= steps_every_epoch
            avg_accuracy_t /= steps_every_epoch
            train_loss.append(avg_loss_t)
            train_accuracy.append(avg_accuracy_t)
            self.loss_accuracy_summary.value[0].simple_value = avg_loss_t
            self.loss_accuracy_summary.value[1].simple_value = avg_accuracy_t
            train_writer.add_summary(summary=self.loss_accuracy_summary,
                                     global_step=epoch + 1)
            if self.with_validation:
                # Validation performance: one full pass of the validation set.
                cur_loop = val_generator.loop
                _, batch_seqs, batch_labels, _ = val_generator.next(1024, need_all=True)
                batch_seqs_ns = None
                if self.model_type == "multichannel":
                    batch_seqs_ns = self._X2X_ns(batch_seqs)
                cur_count = 0
                while (val_generator.loop == cur_loop):
                    loss_v, acc_v = sess.run([self.loss_op, self.acc_op],
                                             feed_dict=self._feed_dict_valid(
                                                 batch_x=batch_seqs,
                                                 batch_y=batch_labels,
                                                 batch_x_ns=batch_seqs_ns))
                    avg_loss_v += loss_v
                    avg_accuracy_v += acc_v
                    cur_count += 1
                    _, batch_seqs, batch_labels, _ = val_generator.next(1024, need_all=True)
                    batch_seqs_ns = None
                    if self.model_type == "multichannel":
                        batch_seqs_ns = self._X2X_ns(batch_seqs)
                avg_loss_v /= cur_count
                avg_accuracy_v /= cur_count
                valid_loss.append(avg_loss_v)
                valid_accuracy.append(avg_accuracy_v)
                self.loss_accuracy_summary.value[0].simple_value = avg_loss_v
                self.loss_accuracy_summary.value[1].simple_value = avg_accuracy_v
                val_writer.add_summary(summary=self.loss_accuracy_summary,
                                       global_step=epoch + 1)
                logger.info("Epoch: [%04d/%04d], "
                            "Training Loss: %.4f, Training Accuracy: %.4f, "
                            "Validation Loss: %.4f, Validation Accuracy: %.4f" \
                            % (epoch + 1, num_epochs, avg_loss_t, avg_accuracy_t,
                               avg_loss_v, avg_accuracy_v))
                # Early stopping: stop when validation accuracy has failed to
                # beat the best value for more than `early_stopping` epochs;
                # checkpoint whenever a new best is reached.
                if (avg_accuracy_v > cur_max_acc):
                    cur_max_acc = avg_accuracy_v
                    cur_early_stopping = 0
                    logger.info("Saving model-%s" % (epoch + 1))
                    saver.save(sess, os.path.join(save_dir, 'model.ckpt'),
                               global_step=epoch + 1)
                else:
                    cur_early_stopping += 1
                if cur_early_stopping > early_stopping:
                    logger.info("Early stopping after epoch %s !" % (epoch + 1))
                    break
            else:
                logger.info("Epoch: [%04d/%04d], "
                            "Training Loss: %.4f, Training Accuracy: %.4f " \
                            % (epoch + 1, num_epochs, avg_loss_t, avg_accuracy_t))
                # Periodic checkpoint when there is no validation-driven saving.
                if (epoch - start + 1) % save_epochs == 0:
                    logger.info("Saving model-%s" % (epoch + 1))
                    saver.save(sess, os.path.join(save_dir, 'model.ckpt'),
                               global_step=epoch + 1)
        if num_visual > 0:
            # Visualize the final word embeddings with the TensorBoard projector.
            config = projector.ProjectorConfig()
            final_embeddings = {}
            try:
                final_embeddings["static"] = self.embedding_matrix_s.eval()[:num_visual]
            except:
                pass
            try:
                final_embeddings["nonstatic"] = self.embedding_matrix_ns.eval()[:num_visual]
            except:
                pass
            for (name, final_embedding) in final_embeddings.items():
                embedding_var = tf.Variable(final_embedding,
                                            name="word_embeddings_" + name)
                sess.run(embedding_var.initializer)
                saver = tf.train.Saver([embedding_var])
                saver.save(sess, log_dir + "/embeddings_" + name + ".ckpt-" + str(epoch + 1))
                embedding = config.embeddings.add()
                embedding.tensor_name = embedding_var.name
                embedding.metadata_path = metadataPath[name]
            projector.visualize_embeddings(train_writer, config)
        return train_loss, train_accuracy, valid_loss, valid_accuracy
def train(scale):
    """Train the SR model: compile, consume batches from the loader's work
    queue, and periodically validate / write summaries / checkpoint."""
    model.model_compile(np.array(loader.data_mean), scale)
    model_params = get_num_params()
    # prepare data
    loader.load_train_dataset()
    loader.load_valid_dataset()
    loader.load_batch()
    # Separate saver tracking only the best-PSNR checkpoints.
    max_saver = tf.train.Saver(max_to_keep=2, allow_empty=True)
    config = tf.ConfigProto()
    config.log_device_placement = True
    config.allow_soft_placement = True
    with tf.Session(config=config) as sess:
        # defining summary writer
        summary_writer = tf.summary.FileWriter(train_log_dir, sess.graph)
        # retrain the existed models
        init_step = 0
        if train_from_exist:
            fmtstr = "restoring model from %s..." % exist_model_dir
            print(colored(fmtstr, "green", attrs=["bold"]))
            init_step = model.restore_model(exist_model_dir, model.global_steps)
        else:
            fmtstr = "initializing variables..."
            print(colored(fmtstr, "green", attrs=["bold"]))
            sess.run(tf.global_variables_initializer())
        max_psnr = 0
        cur_psnr = 0
        print(colored("starting to train...", 'green', attrs=['bold']))
        for step in range(init_step, max_steps):
            # To check the time of data preprocessing, extracting batches is also
            # included here.
            start_time = time.time()
            # NOTE(review): this rebinds the `scale` parameter to the batch's
            # own scale for the rest of the loop — confirm this is intended.
            lr_batch, hr_batch, scale = loader.work_queue.get()
            model.train_batch(lr_batch, scale, hr_batch)
            duration = time.time() - start_time
            if step == 0 or ((step + 1) % 1000 == 0):
                # valid model using Set5
                formatstr = "%s: [%s (%d)]" % (datetime.now(), model.name, model_params)
                print(colored(formatstr, 'green', attrs=['bold']))
                # 1e-10 guards against division by a zero duration.
                examples_per_sec = loader.batch_size / (duration + 1e-10)
                formatstr = 'step %d: %.4f images/sec' % (step + 1, examples_per_sec)
                print(colored(formatstr, 'blue', attrs=['bold']))
                cur_psnr = valid_one_scale(model, loader.valid_dataset, scale, step + 1)
            # Write training summaries every 200 steps.
            if (step + 1) % 200 == 0:
                model.feed_dict[model.inputs] = lr_batch
                model.feed_dict[model.scale] = scale
                model.feed_dict[model.labels] = hr_batch
                summary_str = sess.run(model.summary_ops, feed_dict=model.feed_dict)
                summary_writer.add_summary(summary_str, step + 1)
            # Regular checkpoint every 500 steps.
            if (step + 1) % 500 == 0:
                checkpoint_path = os.path.join(train_log_dir, 'model.ckpt')
                print("saving checkpoint into: %s-%d" % (checkpoint_path, step + 1))
                model.saver.save(sess, checkpoint_path, global_step=step + 1)
            # Additionally keep a copy whenever validation PSNR improves.
            if ((step + 1) % 500 == 0) and (cur_psnr > max_psnr):
                max_psnr = cur_psnr
                checkpoint_path = os.path.join(max_log_dir, 'model.ckpt')
                print("saving checkpoint into: %s-%d" % (checkpoint_path, step + 1))
                max_saver.save(sess, checkpoint_path, global_step=step + 1)
        summary_writer.close()
def train(self):
    """Train video sr network"""
    global_step = tf.Variable(initial_value=0, trainable=False)
    self.global_step = global_step
    # Create folder for logs
    if not tf.gfile.Exists(self.save_dir):
        tf.gfile.MakeDirs(self.save_dir)
    self.build_model()
    # Linear decay (power=1.) from learning_rate down to end_lr.
    lr = tf.train.polynomial_decay(self.learning_rate,
                                   global_step,
                                   self.decay_step,
                                   end_learning_rate=self.end_lr,
                                   power=1.)
    tf.summary.scalar('learning_rate', lr)
    # Partition trainables into the SR branch ('srmodel') and flow branch ('flow').
    vars_all = tf.trainable_variables()
    vars_sr = [v for v in vars_all if 'srmodel' in v.name]
    vars_flow = [v for v in vars_all if 'flow' in v.name]
    # Three optimizers sharing one global_step: everything, flow only, SR only.
    # (train_flow is built but never selected in the schedule below.)
    train_all = tf.train.AdamOptimizer(lr).minimize(
        self.loss, var_list=vars_all, global_step=global_step)
    train_flow = tf.train.AdamOptimizer(lr).minimize(
        self.loss_flow, var_list=vars_flow, global_step=global_step)
    train_sr = tf.train.AdamOptimizer(lr).minimize(self.loss_mse,
                                                   var_list=vars_sr,
                                                   global_step=global_step)
    print('params num of flow:', get_num_params(vars_flow))
    print('params num of sr:', get_num_params(vars_sr))
    print('params num of all:', get_num_params(vars_all))
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)  #sess=tf.Session()
    self.sess = sess
    sess.run(tf.global_variables_initializer())
    self.saver = tf.train.Saver(max_to_keep=100, keep_checkpoint_every_n_hours=1)
    if self.reload:
        self.load(sess, self.save_dir)
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    cost_time = 0
    start_time = time.time()
    gs = sess.run(global_step)  # step we resume from (0 on a fresh run)
    for step in range(sess.run(global_step), self.max_step):
        # Warm-up: train only the SR branch for the first 10000 steps,
        # then optimize everything jointly.
        if step < 10000:
            train_op = train_sr
        else:
            train_op = train_all
        # The loss_* values come from the sess.run at the bottom of the loop,
        # hence the `step > gs` guard before their first use.
        if step > gs and step % 20 == 0:
            print(
                time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
                'Step:{}, loss:({:.3f},{:.3f},{:.3f}), mse:{}'.format(
                    step, loss_value, loss_mse_value, loss_flow_value * 100,
                    str(mse_value)))
        # Every 500 steps: checkpoint (except at resume step) and evaluate.
        if step % 500 == 0:
            if step > gs:
                self.save(sess, self.save_dir, step)
            cost_time = time.time() - start_time
            print('cost {}s.'.format(cost_time))
            self.evaluation()
            cost_time = time.time() - start_time
            start_time = time.time()
            print('cost {}s.'.format(cost_time))
        _, loss_value, mse_value, loss_mse_value, loss_flow_value = sess.run(
            [train_op, self.loss, self.mse, self.loss_mse, self.loss_flow])
        # print (loss_value)
        assert not np.isnan(loss_value), 'Model diverged with loss = NaN'
def __init__(self, parameters):
    """Build the full training setup from a parameters object: hyper-params,
    SR graph + loss, optimizer, summaries, session/saver, and eval file lists."""
    # --- hyper-parameters / paths copied from `parameters` ---
    self.num_frames = parameters.num_frames
    self.scale = parameters.scale
    self.in_size = parameters.in_size
    self.gt_size = self.in_size * self.scale  # HR patch size = LR size * scale
    self.batch_size = parameters.batch_size
    self.learning_rate = parameters.learning_rate
    self.end_lr = parameters.end_lr
    self.reload = parameters.reload
    self.max_step = parameters.max_step
    self.decay_step = parameters.decay_step
    self.train_dir = parameters.train_dir
    self.eval_dir = parameters.eval_dir
    self.save_dir = parameters.save_dir
    self.log_dir = parameters.log_dir
    self.tensorboard_dir = parameters.tensorboard_dir
    self.main_channel_nums = parameters.main_channel_nums
    self.save_iter_gap = parameters.save_iter_gap
    self.start_epoch = parameters.start_epoch
    # build the main network computational graph
    #
    # the main SR network: EDVR or PFNL
    self.model = EDVR_Core(nf=self.main_channel_nums, nframes=self.num_frames)
    self.GT = tf.placeholder(tf.float32,
                             shape=[None, 1, None, None, 3],
                             name='H_truth')
    self.L_train = tf.placeholder(tf.float32,
                                  shape=[
                                      self.batch_size, self.num_frames,
                                      self.in_size, self.in_size, 3
                                  ],
                                  name='L_train')
    self.SR = self.forward(self.L_train)
    # Separate test-time input: batch of 1, arbitrary spatial size.
    self.L_test = tf.placeholder(tf.float32,
                                 shape=[1, self.num_frames, None, None, 3],
                                 name='L_test')
    self.SR_test = self.forward(self.L_test)
    # Charbonnier-style loss: mean of sqrt(diff^2 + eps) with eps = 1e-6.
    self.loss = tf.reduce_mean(tf.sqrt((self.SR - self.GT)**2 + 1e-6))
    # data loader and training supports
    self.LR_one_batch, self.HR_one_batch = self.double_input_producer()
    global_step = tf.Variable(initial_value=0, trainable=False)
    self.global_step = global_step
    # Linear decay (power=1.) from learning_rate down to end_lr.
    lr = tf.train.polynomial_decay(self.learning_rate,
                                   global_step,
                                   self.decay_step,
                                   end_learning_rate=self.end_lr,
                                   power=1.)
    vars_all = tf.trainable_variables()
    print('Params num of all:', get_num_params(vars_all))
    self.training_op = tf.train.AdamOptimizer(lr).minimize(
        self.loss, var_list=vars_all, global_step=global_step)
    # For tensorboard visualization
    # used in eval func
    self.loss_epoch = tf.placeholder(tf.float32,
                                     shape=[],
                                     name='epoch_loss_placeholder')
    self.epoch_loss_summary_op = tf.summary.scalar('loss/epoch loss',
                                                   self.loss_epoch)
    self.psnr_eval = tf.placeholder(tf.float32,
                                    shape=[],
                                    name='eval_psnr_placeholder')
    self.eval_psnr_summary_op = tf.summary.scalar('metrics/eval psnr',
                                                  self.psnr_eval)
    self.ssim_eval = tf.placeholder(tf.float32,
                                    shape=[],
                                    name='eval_ssim_placeholder')
    self.eval_ssim_summary_op = tf.summary.scalar('metrics/eval ssim',
                                                  self.ssim_eval)
    self.merge_op_eval = tf.summary.merge([
        self.epoch_loss_summary_op, self.eval_psnr_summary_op,
        self.eval_ssim_summary_op
    ])
    # used in iter training func
    iter_loss_summary_op = tf.summary.scalar("loss/iter loss", self.loss)
    lr_summary_op = tf.summary.scalar("lr", lr)
    self.merge_op_training = tf.summary.merge(
        [iter_loss_summary_op, lr_summary_op])
    # writer, get session and hold it and some configs
    self.writer = tf.summary.FileWriter(self.tensorboard_dir,
                                        tf.get_default_graph())
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    self.sess = tf.Session(config=config)
    print('[**] Initialzing global varibles ...')
    self.sess.run(tf.global_variables_initializer())
    self.saver = tf.train.Saver(max_to_keep=50, keep_checkpoint_every_n_hours=1)
    if self.reload:
        print('[**] loading checkpoint in dir:' + self.save_dir)
        self.load(self.sess, self.save_dir)
    # eval file prepare
    # NOTE(review): validation LR/HR roots are hard-coded absolute Windows
    # paths — consider moving them into `parameters`.
    self.eval_frame_data_HR = []
    self.eval_frame_data_LR = []
    pathlists = open(self.eval_dir, 'rt').read().splitlines()
    for dataPath in pathlists:
        inList = sorted(
            glob.glob(
                os.path.join('H:/AI4K/data/frame_data/validation/LR', dataPath,
                             '*.png')))
        gtList = sorted(
            glob.glob(
                os.path.join('H:/AI4K/data/frame_data/validation/HR', dataPath,
                             '*.png')))
        # Each clip must have matching LR/HR frame counts.
        assert (len(inList) == len(gtList))
        self.eval_frame_data_HR.append(gtList)
        self.eval_frame_data_LR.append(inList)
def train_model(model_class, run_func, args, quiet=False, splits=None,
                abs_output_dir=False):
    """Train, validate and test `model_class` over one or more data splits.

    For each split: trains with early stopping on `args.val_stat` (treated as
    a loss-like quantity — lower is better), reloads the best checkpoint,
    evaluates on the test set, and aggregates statistics across splits.

    Args:
        model_class: class used to instantiate the model (called with `args`).
        run_func: callable that runs one epoch and returns a stats object.
        args: hyper-parameter namespace; mutated in place (output_dir,
            model_dir, epoch, log_tb, latest_train_stat, latest_val_stat).
        quiet: suppress per-epoch printing when True.
        splits: explicit iterable of split indices; defaults to
            range(args.n_splits).
        abs_output_dir: when True, use args.output_dir as-is instead of a
            per-split run_<i> subdirectory.

    Returns:
        ((val_mean, val_std), (test_mean, test_std),
         (train_last, train_best), nce_loss)

    Fix: test_std was previously computed from the validation stats
    (np.std(all_val_stats)); it now uses the test stats.
    """
    output_dir = args.output_dir
    val_stat = args.val_stat
    # Keeps track of certain stats for all the data splits
    all_stats = {
        'val_%s' % val_stat: [],
        'test_%s' % val_stat: [],
        'best_epoch': [],
        'train_last': [],
        'train_best': [],
        'nce': [],
    }
    # Iterate over splits
    splits_iter = splits if splits is not None else range(args.n_splits)
    for split_idx in splits_iter:
        # Creates the output directory for the run of the current split
        if not abs_output_dir:
            args.output_dir = output_dir + '/run_%d' % split_idx
        args.model_dir = args.output_dir + '/models'
        if not os.path.exists(args.output_dir):
            os.makedirs(args.output_dir)
        if not os.path.exists(args.model_dir):
            os.makedirs(args.model_dir)
        write_args(args)
        # Create model and optimizer
        model = model_class(args)
        model.to(args.device)
        if args.separate_lr:
            optim = model.get_model_optim()
        else:
            optim = torch.optim.Adam(model.parameters(), lr=args.lr)
        if split_idx == 0:
            # Print the number of parameters
            num_params = get_num_params(model)
            if not quiet:
                print('Initialized model with %d params' % num_params)
        # Load the train, val, test data
        dataset_loaders = {}
        for data_type in ['train', 'val', 'test']:
            dataset_loaders[data_type] = get_loader(
                args.data_dir,
                data_type=data_type,
                batch_size=args.batch_size,
                shuffle=data_type == 'train',
                split=split_idx,
                n_labels=args.n_labels)
        # Keeps track of stats across all the epochs
        train_m, val_m = StatsManager(), StatsManager()
        # Tensorboard logging, only for the first run split
        if args.log_tb and split_idx == 0:
            log_dir = output_dir + '/logs'
            tb_writer = SummaryWriter(log_dir, max_queue=1, flush_secs=60)
            log_tensorboard(tb_writer, {'params': num_params}, '', 0)
        else:
            # Disable TB for every later split (tb_writer only exists for split 0).
            args.log_tb = False
        # Training loop
        args.latest_train_stat = 0
        args.latest_val_stat = 0  # Keeps track of the latest relevant stat
        patience_idx = 0
        for epoch_idx in range(args.n_epochs):
            args.epoch = epoch_idx
            train_stats = run_func(model=model,
                                   optim=optim,
                                   data_loader=dataset_loaders['train'],
                                   data_type='train',
                                   args=args,
                                   write_path=None,
                                   quiet=quiet)
            # Dump validation outputs only every `write_every` epochs.
            should_write = epoch_idx % args.write_every == 0
            val_stats = run_func(
                model=model,
                optim=None,
                data_loader=dataset_loaders['val'],
                data_type='val',
                args=args,
                write_path='%s/val_output_%d.jsonl' %
                (args.output_dir, epoch_idx) if should_write else None,
                quiet=quiet)
            if not quiet:
                train_stats.print_stats('Train %d: ' % epoch_idx)
                val_stats.print_stats('Val %d: ' % epoch_idx)
            if args.log_tb:
                log_tensorboard(tb_writer, train_stats.get_stats(), 'train',
                                epoch_idx)
                log_tensorboard(tb_writer, val_stats.get_stats(), 'val',
                                epoch_idx)
            train_stats.add_stat('epoch', epoch_idx)
            val_stats.add_stat('epoch', epoch_idx)
            train_m.add_stats(train_stats.get_stats())
            val_m.add_stats(val_stats.get_stats())
            # Checkpoint whenever the validation stat hits a new minimum;
            # otherwise count toward early-stopping patience.
            if val_stats.get_stats()[val_stat] == min(val_m.stats[val_stat]):
                save_model(model, args, args.model_dir, epoch_idx,
                           should_print=not quiet)
                patience_idx = 0
            else:
                patience_idx += 1
                if args.patience != -1 and patience_idx >= args.patience:
                    print(
                        'Validation error has not improved in %d, stopping at epoch: %d'
                        % (args.patience, args.epoch))
                    break
            # Keep track of the latest epoch stats
            args.latest_train_stat = train_stats.get_stats()[val_stat]
            args.latest_val_stat = val_stats.get_stats()[val_stat]
        # Load and save the best model
        best_epoch = val_m.get_best_epoch_for_stat(args.val_stat)
        best_model_path = '%s/model_%d' % (args.model_dir, best_epoch)
        model, _ = load_model(best_model_path,
                              model_class=model_class,
                              device=args.device)
        if not quiet:
            print('Loading model from %s' % best_model_path)
        save_model(model, args, args.model_dir, 'best', should_print=not quiet)
        # Test model
        test_stats = run_func(model=model,
                              optim=None,
                              data_loader=dataset_loaders['test'],
                              data_type='test',
                              args=args,
                              write_path='%s/test_output.jsonl' % args.output_dir,
                              quiet=quiet)
        if not quiet:
            test_stats.print_stats('Test: ')
        if args.log_tb:
            log_tensorboard(tb_writer, test_stats.get_stats(), 'test', 0)
            tb_writer.close()
        # Write test output to a summary file
        with open('%s/summary.txt' % args.output_dir, 'w+') as summary_file:
            for k, v in test_stats.get_stats().items():
                summary_file.write('%s: %.3f\n' % (k, v))
        # Aggregate relevant stats
        all_stats['val_%s' % val_stat].append(min(val_m.stats[val_stat]))
        all_stats['test_%s' % val_stat].append(
            test_stats.get_stats()[val_stat])
        all_stats['best_epoch'].append(best_epoch)
        all_stats['train_last'].append(train_m.stats[val_stat][-1])
        all_stats['train_best'].append(train_m.stats[val_stat][best_epoch])
        if args.nce_coef > 0:
            all_stats['nce'].append(train_m.stats['nce_reg'][best_epoch])
    # Write the stats aggregated across all splits
    with open('%s/summary.txt' % (output_dir), 'w+') as summary_file:
        summary_file.write('Num epochs trained: %d\n' % args.epoch)
        for name, stats_arr in all_stats.items():
            if stats_arr == []:
                continue
            stats_arr = np.array(stats_arr)
            stats_mean = np.mean(stats_arr)
            stats_std = np.std(stats_arr)
            summary_file.write('%s: %s, mean: %.3f, std: %.3f\n' %
                               (name, str(stats_arr), stats_mean, stats_std))
    all_val_stats = np.array(all_stats['val_%s' % val_stat])
    all_test_stats = np.array(all_stats['test_%s' % val_stat])
    val_mean, val_std = np.mean(all_val_stats), np.std(all_val_stats)
    # FIX: test_std now derives from the test stats (was all_val_stats).
    test_mean, test_std = np.mean(all_test_stats), np.std(all_test_stats)
    train_last = np.mean(np.array(all_stats['train_last']))
    train_best = np.mean(np.array(all_stats['train_best']))
    if args.nce_coef > 0:
        nce_loss = np.mean(np.array(all_stats['nce']))
    else:
        nce_loss = 0
    # Return stats
    return (val_mean, val_std), (test_mean, test_std), (train_last,
                                                        train_best), nce_loss
def train(self):
    """Train the video SR network (flow + SR branches) with gradient clipping
    on LSTM variables."""

    def train_op_func(loss, var_list, is_gradient_clip=False):
        # Build an Adam train op; optionally clip the gradients of LSTM
        # variables by global norm (clip_norm=3) while leaving the rest as-is.
        if is_gradient_clip:
            train_op = tf.train.AdamOptimizer(lr, self.beta1)
            grads_and_vars = train_op.compute_gradients(loss, var_list=var_list)
            unchanged_gvs = [(grad, var) for grad, var in grads_and_vars
                             if not 'LSTM' in var.name]
            rnn_grad = [
                grad for grad, var in grads_and_vars if 'LSTM' in var.name
            ]
            rnn_var = [
                var for grad, var in grads_and_vars if 'LSTM' in var.name
            ]
            capped_grad, _ = tf.clip_by_global_norm(rnn_grad, clip_norm=3)
            capped_gvs = list(zip(capped_grad, rnn_var))
            train_op = train_op.apply_gradients(grads_and_vars=capped_gvs +
                                                unchanged_gvs,
                                                global_step=global_step)
        else:
            # train_op = tf.train.GradientDescentOptimizer(lr).minimize(loss, var_list=var_list, global_step=global_step)
            train_op = tf.train.AdamOptimizer(lr).minimize(
                loss, var_list=var_list, global_step=global_step)
        return train_op

    global_step = tf.Variable(initial_value=0, trainable=False)
    self.global_step = global_step
    # Create folder for logs
    if not tf.gfile.Exists(self.save_dir):
        tf.gfile.MakeDirs(self.save_dir)
    self.build_model()
    # Polynomial decay (power=0.9) from learning_rate down to end_lr.
    lr = tf.train.polynomial_decay(self.learning_rate,
                                   global_step,
                                   self.decay_step,
                                   end_learning_rate=self.end_lr,
                                   power=0.9)
    # tf.summary.scalar('learning_rate', lr)
    # Partition trainables into the SR branch ('srmodel') and flow branch ('flow').
    vars_all = tf.trainable_variables()
    vars_sr = [v for v in vars_all if 'srmodel' in v.name]
    vars_flow = [v for v in vars_all if 'flow' in v.name]
    # Three train ops sharing one global_step (train_flow is built but never
    # selected in the schedule below).
    train_all = train_op_func(self.loss, vars_all, is_gradient_clip=True)
    train_flow = train_op_func(self.loss_flow, vars_flow, is_gradient_clip=True)
    train_sr = train_op_func(self.loss_mse, vars_sr, is_gradient_clip=True)
    print('params num of flow:', get_num_params(vars_flow))
    print('params num of sr:', get_num_params(vars_sr))
    print('params num of all:', get_num_params(vars_all))
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)  #sess=tf.Session()
    self.sess = sess
    sess.run(tf.global_variables_initializer())
    self.saver = tf.train.Saver(max_to_keep=100, keep_checkpoint_every_n_hours=1)
    #self.flownets.load_easyflow(sess, os.path.join('./easyflow_log/model1', 'checkpoints'))
    if self.reload:
        self.load(sess, self.save_dir)
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    # summary_op = tf.summary.merge_all()
    # summary_writer = tf.summary.FileWriter(self.train_dir, sess.graph, flush_secs=30)
    cost_time = 0
    start_time = time.time()
    gs = sess.run(global_step)  # step we resume from (0 on a fresh run)
    for step in range(sess.run(global_step), self.max_steps):
        # Warm-up: SR branch only for the first 10000 steps, then everything.
        if step < 10000:
            train_op = train_sr  #train_flow
        else:
            train_op = train_all
        # The loss_* values come from the sess.run at the bottom of the loop,
        # hence the `step > gs` guard before their first use.
        if step > gs and step % 20 == 0:
            print(
                time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
                'Step:{}, loss:({:.3f},{:.3f},{:.3f}), mse:{}'.format(
                    step, loss_value, loss_mse_value, loss_flow_value * 100,
                    str(mse_value)))
        # if step % 50 == 0:
        #     # summary_str = sess.run(summary_op, feed_dict={inputs:batch_input, gt:batch_gt})
        #     summary_str = sess.run(summary_op)
        #     summary_writer.add_summary(summary_str, global_step=step)
        # Every 500 steps: checkpoint (except at resume step) and evaluate.
        if step % 500 == 0:
            if step > gs:
                self.save(sess, self.save_dir, step)
            cost_time = time.time() - start_time
            print('cost {}s.'.format(cost_time))
            self.evaluation()
            cost_time = time.time() - start_time
            start_time = time.time()
            print('cost {}s.'.format(cost_time))
        _, loss_value, mse_value, loss_mse_value, loss_flow_value = sess.run(
            [train_op, self.loss, self.mse, self.loss_mse, self.loss_flow])
        # print (loss_value)
        assert not np.isnan(loss_value), 'Model diverged with loss = NaN'
def train(self):
    """Training loop variant that writes a loss summary to TensorBoard at
    every step and checkpoints every save_iter_gap steps."""
    # LR, HR= self.single_input_producer()
    LR, HR = self.double_input_producer()
    global_step = tf.Variable(initial_value=0, trainable=False)
    self.global_step = global_step
    self.build()
    # Linear decay (power=1.) from learning_rate down to end_lr.
    lr = tf.train.polynomial_decay(self.learning_rate,
                                   global_step,
                                   self.decay_step,
                                   end_learning_rate=self.end_lr,
                                   power=1.)
    vars_all = tf.trainable_variables()
    print('Params num of all:', get_num_params(vars_all))
    training_op = tf.train.AdamOptimizer(lr).minimize(self.loss,
                                                      var_list=vars_all,
                                                      global_step=global_step)
    # For tensorboard visualization
    writer = tf.summary.FileWriter(self.tensorboard_dir, tf.get_default_graph())
    tf.summary.scalar("loss", self.loss)
    merge_op = tf.summary.merge_all()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    self.sess = sess
    sess.run(tf.global_variables_initializer())
    self.saver = tf.train.Saver(max_to_keep=50, keep_checkpoint_every_n_hours=1)
    if self.reload:
        self.load(sess, self.save_dir)
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    cost_time = 0
    start_time = time.time()
    gs = sess.run(global_step)  # step we resume from (0 on a fresh run)
    for step in range(sess.run(global_step), self.max_step):
        # loss_v comes from the sess.run at the bottom of the loop,
        # hence the `step > gs` guard before its first use.
        if step > gs:  #and step%20==0:
            print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
                  'Step:{}, loss:{}'.format(step, loss_v))
        # eval and save model
        if step % self.save_iter_gap == 0:
            if step > gs:
                print('saving model at global step: ' + str(step))
                self.save(sess, self.save_dir, step)
            cost_time = time.time() - start_time
            print('cost {}s.'.format(cost_time))
            # self.eval()
            cost_time = time.time() - start_time
            start_time = time.time()
            print('cost {}s.'.format(cost_time))
        # One optimization step + loss summary on a freshly dequeued batch.
        lr1, hr = sess.run([LR, HR])
        _, loss_v, ss = sess.run([training_op, self.loss, merge_op],
                                 feed_dict={self.L: lr1, self.H: hr})
        writer.add_summary(ss, step)
        # Bail out if training has clearly diverged.
        if step > 500 and loss_v > 10:
            print('Model collapsed with loss={}'.format(loss_v))
            break
    writer.close()
# Get best val score index of each model best_val_cnn_1_index = val_cnn_1.index(max(val_cnn_1)) best_val_lstm_1_index = val_lstm_1.index(max(val_lstm_1)) # Get test score w.r.t best validation score of each model test_cnn_1 = all_F_1_CNN[best_val_cnn_1_index][2] test_lstm_1 = all_F_1_LSTM[best_val_lstm_1_index][2] # Get best val score of each mode val_cnn_1 = max(val_cnn_1) val_lstm_1 = max(val_lstm_1) # Construct results table df = pd.DataFrame() df['model'] = ['cnn_cnn', 'lstm_cnn'] df['parameters'] = [get_num_params(cnn_cnn), get_num_params(lstm_cnn)] df['val'] = [val_cnn_1, val_lstm_1] df['test'] = [test_cnn_1, test_lstm_1] print( 'CNN char-level encoder vs LSTM char-level encoder (both using Single CNN word-level encoder)\n' ) print(df) # #### 2. `Single-layer CNN word-level encoder` vs `Multi-layer CNN word-level encoder` (all using `CNN char-level encoder`) # Get all val scores of each model val_cnn_1 = [f1[1] for f1 in all_F_1_CNN] val_cnn_2 = [f1[1] for f1 in all_F_2_CNN] val_cnn_3 = [f1[1] for f1 in all_F_3_CNN]