def train(args):
    graph = tf.Graph()
    with graph.as_default():
        global_step = tf.train.create_global_step()

        # Placeholders for training data.
        imgs = tf.placeholder(tf.float32,
                              [None, args.crop_height, args.crop_width, 3])
        scores = tf.placeholder(tf.float32, [None])
        dropout_keep_prob = tf.placeholder(tf.float32, [])
        lr = tf.placeholder(tf.float32, [])

        with tf.name_scope("create_models"):
            model = VggNetModel(num_classes=1,
                                dropout_keep_prob=dropout_keep_prob)
            y_hat = model.inference(imgs, True)
            y_hat = tf.reshape(y_hat, [-1])

        with tf.name_scope("create_loss"):
            # Regression (MSE) loss between predicted and ground-truth scores.
            reg_loss = mes(y_hat, scores)

        with tf.name_scope("create_optimize"):
            # optimizer = tf.train.GradientDescentOptimizer(learning_rate=lr).minimize(reg_loss)  # does not converge
            var_list = [v for v in tf.trainable_variables()]
            optimizer = tf.train.AdamOptimizer(learning_rate=lr).minimize(
                reg_loss, var_list=var_list)

        saver = tf.train.Saver(var_list=tf.global_variables(), max_to_keep=10)

        tf.summary.scalar('learning_rate', lr)
        tf.summary.scalar('reg_loss', reg_loss)
        # Build the summary tensor based on the TF collection of summaries.
        summary_op = tf.summary.merge_all()

        # Instantiate SummaryWriters to output summaries and the graph.
        timestamp = datetime.fromtimestamp(time.time()).strftime('%Y%m%d-%H:%M')
        summary_writer = tf.summary.FileWriter(
            os.path.join(args.logs_dir,
                         'train/{}-{}'.format(args.exp_name, timestamp)),
            filename_suffix=args.exp_name)
        summary_test = tf.summary.FileWriter(
            os.path.join(args.logs_dir,
                         'test/{}-{}'.format(args.exp_name, timestamp)),
            filename_suffix=args.exp_name)

    train_image_paths, train_scores, test_image_paths, test_scores = get_image_list(args)
    train_loader = train_generator(train_image_paths, train_scores)
    train_num_batchs = len(train_image_paths) // args.batch_size + 1
    test_loader = val_generator(test_image_paths, test_scores, args.batch_size)
    test_num_batchs = len(test_image_paths) // args.batch_size + 1

    with tf.Session(graph=graph) as sess:
        sess.run(tf.global_variables_initializer())

        # Resume from an existing checkpoint if possible, otherwise start from
        # the pretrained weights.
        ckpt = tf.train.get_checkpoint_state(args.ckpt_dir)
        counter = 0
        if ckpt and ckpt.model_checkpoint_path:
            counter = __load__(saver, sess, args.ckpt_dir)
        else:
            load(saver, sess, args.pretrain_models_path)

        start_time = time.time()
        start_step = counter
        base_lr = args.learning_rate

        for step, (images, targets) in enumerate(train_loader, start_step):
            # Linear warm-up over the first 500 steps, then divide the learning
            # rate by 5 when half and 80% of the total iterations are reached.
            if step <= 500:
                base_lr = args.start_lr + (args.learning_rate - args.start_lr) * step / float(500)
            else:
                if (step + 1) % (0.5 * args.iter_max) == 0:
                    base_lr = base_lr / 5
                if (step + 1) % (0.8 * args.iter_max) == 0:
                    base_lr = base_lr / 5

            loss_, y_hat_, _ = sess.run(
                [reg_loss, y_hat, optimizer],
                feed_dict={
                    imgs: images,
                    scores: targets,
                    lr: base_lr,
                    dropout_keep_prob: args.dropout_keep_prob
                })

            if (step + 1) % args.summary_step == 0:
                logger.info(
                    "step %d/%d, reg loss is %f, time %f, learning rate: %.8f" %
                    (step, args.iter_max, loss_, (time.time() - start_time), base_lr))
                summary_str = sess.run(
                    summary_op,
                    feed_dict={
                        imgs: images,
                        scores: targets,
                        lr: base_lr,
                        dropout_keep_prob: args.dropout_keep_prob
                    })
                summary_writer.add_summary(summary_str, step)

            if (step + 1) % args.test_step == 0:
                if args.save_ckpt_file:
                    save(saver, sess, args.ckpt_dir, step)
                test_loss = 0
                scores_set = np.array([])
                labels_set = np.array([])
                for i in range(test_num_batchs):
                    images, targets = next(test_loader)
                    loss_, y_hat_ = sess.run(
                        [reg_loss, y_hat],
                        feed_dict={
                            imgs: images,
                            scores: targets,
                            lr: base_lr,
                            dropout_keep_prob: args.dropout_keep_prob
                        })
                    test_loss += loss_
                    scores_set = np.append(scores_set, y_hat_)
                    labels_set = np.append(labels_set, targets)
                    logger.info('test_loader step/len(test_loader): {}/{}'.format(i, test_num_batchs))

                # Correlation and error metrics between ground-truth and predicted scores.
                srocc, krocc, plcc, rmse, mse = evaluate_metric(labels_set, scores_set)
                test_loss /= test_num_batchs
                logger.info(
                    "SROCC_v: %.3f\t KROCC: %.3f\t PLCC_v: %.3f\t RMSE_v: %.3f\t mse: %.3f\t test loss: %.3f\n" %
                    (srocc, krocc, plcc, rmse, mse, test_loss))

                s1 = tf.Summary(value=[
                    tf.Summary.Value(tag='test_loss', simple_value=test_loss)
                ])
                s2 = tf.Summary(value=[
                    tf.Summary.Value(tag='test_srocc', simple_value=srocc)
                ])
                summary_test.add_summary(s1, step)
                summary_test.add_summary(s2, step)

            if step == args.iter_max:
                saver.save(sess,
                           args.ckpt_dir + '/final_model_' + timestamp + '.ckpt',
                           write_meta_graph=False)
                logger.info('save train_iqa final models max_iter: {}...'.format(args.iter_max))
                break

        logger.info("Optimization finished!")
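
# `evaluate_metric` is imported from elsewhere in the repo. As an illustrative
# sketch only (not the project's actual implementation), a helper with the same
# call signature could compute the standard IQA metrics like this, assuming
# SciPy is available:
from scipy import stats


def _evaluate_metric_sketch(labels, predictions):
    """Hypothetical stand-in: SROCC, KROCC, PLCC, RMSE and MSE between two arrays."""
    labels = np.asarray(labels, dtype=np.float64)
    predictions = np.asarray(predictions, dtype=np.float64)
    srocc = stats.spearmanr(labels, predictions)[0]    # Spearman rank correlation
    krocc = stats.kendalltau(labels, predictions)[0]   # Kendall rank correlation
    plcc = stats.pearsonr(labels, predictions)[0]      # Pearson linear correlation
    mse = np.mean((labels - predictions) ** 2)
    rmse = np.sqrt(mse)
    return srocc, krocc, plcc, rmse, mse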
def train(args):
    graph = tf.Graph()
    with graph.as_default():
        global_step = tf.train.create_global_step()

        # Placeholders for training data.
        imgs = tf.placeholder(tf.float32,
                              [None, args.crop_height, args.crop_width, 3])
        dropout_keep_prob = tf.placeholder(tf.float32, [])
        lr = tf.placeholder(tf.float32, [])

        with tf.name_scope("create_models"):
            model = VggNetModel(num_classes=1,
                                dropout_keep_prob=dropout_keep_prob)
            y_hat = model.inference(imgs, True)

        with tf.name_scope("create_loss"):
            # Pairwise ranking loss over the predictions of one batch.
            rank_loss = Rank_loss()
            loss = rank_loss.get_rankloss(y_hat, args.batch_size)

        with tf.name_scope("create_optimize"):
            # optimizer = tf.train.GradientDescentOptimizer(learning_rate=lr).minimize(loss)
            var_list = [v for v in tf.trainable_variables()]
            optimizer = tf.train.AdamOptimizer(learning_rate=lr).minimize(
                loss, var_list=var_list)

        saver = tf.train.Saver(var_list=tf.global_variables(), max_to_keep=10)

        tf.summary.scalar('learning_rate', lr)
        tf.summary.scalar('rank_loss', loss)
        # Build the summary tensor based on the TF collection of summaries.
        summary_op = tf.summary.merge_all()

        # Instantiate SummaryWriters to output summaries and the graph.
        timestamp = datetime.fromtimestamp(time.time()).strftime('%Y%m%d-%H:%M')
        summary_writer = tf.summary.FileWriter(
            os.path.join(args.logs_dir,
                         'train/{}-{}'.format(args.exp_name, timestamp)),
            filename_suffix=args.exp_name)
        summary_test = tf.summary.FileWriter(
            os.path.join(args.logs_dir,
                         'test/{}-{}'.format(args.exp_name, timestamp)),
            filename_suffix=args.exp_name)

    train_data = Dataset({
        'root_dir': os.path.abspath('..'),
        'data_root': 'data',
        'split': 'live_train',
        'im_shape': [224, 224],
        'batch_size': args.batch_size
    })
    test_data = Dataset({
        'root_dir': os.path.abspath('..'),
        'data_root': 'data',
        'split': 'live_test',
        'im_shape': [224, 224],
        'batch_size': args.batch_size
    })

    with tf.Session(graph=graph) as sess:
        sess.run(tf.global_variables_initializer())
        model.load_original_weights(sess, args.vgg_models_path)

        start_time = time.time()
        base_lr = args.learning_rate

        for step in range(args.iter_max):
            # Divide the learning rate by 5 when half and 80% of the total
            # iterations are reached.
            if (step + 1) % (0.5 * args.iter_max) == 0:
                base_lr = base_lr / 5
            if (step + 1) % (0.8 * args.iter_max) == 0:
                base_lr = base_lr / 5

            image_batch, label_batch = train_data.next_batch()
            loss_, _ = sess.run(
                [loss, optimizer],
                feed_dict={
                    imgs: image_batch,
                    lr: base_lr,
                    dropout_keep_prob: args.dropout_keep_prob
                })

            if (step + 1) % args.summary_step == 0:
                logger.info(
                    "step %d/%d, rank loss is %f, time %f, learning rate: %.8f" %
                    (step, args.iter_max, loss_, (time.time() - start_time), base_lr))
                summary_str = sess.run(
                    summary_op,
                    feed_dict={
                        imgs: image_batch,
                        lr: base_lr,
                        dropout_keep_prob: args.dropout_keep_prob
                    })
                summary_writer.add_summary(summary_str, step)

            if (step + 1) % args.test_step == 0:
                if args.save_ckpt_file:
                    save(saver, sess, args.ckpt_dir, step)

                test_epoch_step = len(test_data.scores) // test_data.batch_size + 1
                test_loss = 0
                for _ in range(test_epoch_step):
                    image_batch, label_batch = test_data.next_batch()
                    # Evaluate only: do not run the optimizer on test batches.
                    loss_ = sess.run(
                        loss,
                        feed_dict={
                            imgs: image_batch,
                            dropout_keep_prob: args.dropout_keep_prob
                        })
                    test_loss += loss_
                test_loss /= test_epoch_step

                s = tf.Summary(value=[
                    tf.Summary.Value(tag='test_loss', simple_value=test_loss)
                ])
                summary_test.add_summary(s, step)

            # range(args.iter_max) stops at iter_max - 1, so save the final
            # model on the last iteration.
            if (step + 1) == args.iter_max:
                saver.save(sess,
                           args.ckpt_dir + '/rank_model_final.ckpt',
                           write_meta_graph=False)

        logger.info("Optimization finished!")
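
# `Rank_loss` above is defined elsewhere in the repo. As a point of reference
# only, a minimal pairwise hinge ranking loss could look like the sketch below.
# The assumption that the first half of the batch holds the higher-quality image
# of each pair is hypothetical and may not match the actual data layout.
def _pairwise_hinge_rank_loss_sketch(y_hat, batch_size, margin=1.0):
    """Hypothetical stand-in for Rank_loss.get_rankloss (illustration only)."""
    y_hat = tf.reshape(y_hat, [-1])
    half = batch_size // 2
    better = y_hat[:half]   # assumed higher-quality image of each pair
    worse = y_hat[half:]    # assumed lower-quality counterpart
    # Penalise pairs where the "better" image is not scored at least `margin`
    # higher than the "worse" one.
    return tf.reduce_mean(tf.maximum(0.0, margin - (better - worse)))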
def train(self, net, num_epochs, optimizer, train_loader, test_loader):
    print("Starting training...")
    # Run training for some number of epochs.
    for epoch in range(num_epochs):
        print(f"\nEpoch {epoch + 1} / {num_epochs}\n")
        torch.cuda.empty_cache()

        # Save a checkpoint periodically.
        is_checkpoint_epoch = (
            self.checkpoint_epochs and epoch % self.checkpoint_epochs == 0
        )
        if self.checkpoint_name and is_checkpoint_epoch:
            checkpoint.save(net, self.checkpoint_name, name=self.wandb_name)

        # Run the training loop.
        net.train()
        for inputs, targets in tqdm(train_loader):
            batch_size = inputs.shape[0]
            inputs = inputs.cuda() if self.use_cuda else inputs.cpu()
            targets = targets.cuda() if self.use_cuda else targets.cpu()

            # Sanity-check training data shapes.
            if self.input_shape:
                expected_shape = tuple([batch_size] + self.input_shape)
                assert (
                    inputs.shape == expected_shape
                ), f"Bad shape: expected {expected_shape} got {inputs.shape}"
            if self.target_shape:
                expected_shape = tuple([batch_size] + self.target_shape)
                assert (
                    targets.shape == expected_shape
                ), f"Bad shape: expected {expected_shape} got {targets.shape}"

            # Get a prediction from the model.
            optimizer.zero_grad()
            outputs = net(inputs)
            if self.output_shape:
                expected_shape = tuple([batch_size] + self.output_shape)
                assert (
                    outputs.shape == expected_shape
                ), f"Bad shape: expected {expected_shape} got {outputs.shape}"

            # Run the weighted loss functions over the model's prediction.
            loss = torch.tensor([0.0], requires_grad=True)
            loss = loss.cuda() if self.use_cuda else loss
            for loss_fn, weight in self.loss_fns:
                loss = loss + weight * loss_fn(inputs, outputs, targets)

            # Calculate weight gradients from the loss and update the model.
            loss.backward()
            optimizer.step()
            if self.scheduler:
                # Update the learning rate according to the scheduler.
                try:
                    self.scheduler.step()
                except ValueError:
                    pass

            # Track metric information.
            with torch.no_grad():
                for metric_fn, _, train_tracker, _ in self.metric_fns:
                    metric_val = metric_fn(inputs, outputs, targets)
                    train_tracker.update(metric_val)

        # Check performance (loss) on the validation set.
        net.eval()
        with torch.no_grad():
            for inputs, targets in tqdm(test_loader):
                inputs = inputs.cuda() if self.use_cuda else inputs.cpu()
                targets = targets.cuda() if self.use_cuda else targets.cpu()
                outputs = net(inputs)
                # Track metric information.
                for metric_fn, _, _, test_tracker in self.metric_fns:
                    metric_val = metric_fn(inputs, outputs, targets)
                    test_tracker.update(metric_val)

        # Log epoch metrics.
        training_info = {}
        for _, name, train_tracker, test_tracker in self.metric_fns:
            training_info[f"Training {name}"] = train_tracker.value
            training_info[f"Validation {name}"] = test_tracker.value
        if self.scheduler:
            try:
                training_info["Learning rate"] = self.scheduler.get_lr()[0]
            except ValueError:
                pass
        log_training_info(training_info, use_wandb=self.use_wandb)

    # Save the final model checkpoint.
    if self.checkpoint_name:
        checkpoint.save(
            net, self.checkpoint_name, name=self.wandb_name, use_wandb=self.use_wandb
        )
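
# The metric trackers iterated in the loops above (`train_tracker`, `test_tracker`)
# are created elsewhere; the training loop only relies on them exposing an
# `update(value)` method and a `value` attribute. A minimal running-average tracker
# with that interface (hypothetical class, not the project's actual implementation)
# could look like this:
class RunningAverageTracker:
    """Keeps a running average of every value passed to update()."""

    def __init__(self):
        self.total = 0.0
        self.count = 0

    def update(self, value):
        # Accept Python floats or 0-d torch tensors.
        self.total += float(value)
        self.count += 1

    @property
    def value(self):
        return self.total / self.count if self.count else 0.0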