class ParseLogCallback(object):
    """Parse a monitor log file and write the results to TensorBoard.

    1. Log layer distributions to TensorBoard (as histograms).
       This makes use of MXNet's "monitor" module, whose output goes to a log
       file. While training, it is possible to specify layers to be monitored;
       these layers are printed to the given log file, and their values are
       computed **asynchronously**.
    2. Log the training loss to TensorBoard (as a scalar).

    Currently does not support resuming training.
    """
    def __init__(self, dist_logging_dir=None, scalar_logging_dir=None,
                 logfile_path=None, batch_size=None, iter_monitor=0,
                 frequent=None, prefix='ssd'):
        self.scalar_logging_dir = scalar_logging_dir
        self.dist_logging_dir = dist_logging_dir
        self.logfile_path = logfile_path
        self.batch_size = batch_size
        self.iter_monitor = iter_monitor
        self.frequent = frequent
        self.prefix = prefix
        self.batch = 0
        self.line_idx = 0
        try:
            from tensorboard import SummaryWriter
            self.dist_summary_writer = SummaryWriter(dist_logging_dir)
            self.scalar_summary_writer = SummaryWriter(scalar_logging_dir)
        except ImportError:
            logging.error('You can install tensorboard via `pip install tensorboard`.')

    def __call__(self, param):
        """Callback to parse a log file and add params to TensorBoard."""
        # Save distributions from the monitor output log.
        if self.iter_monitor != 0 and self.batch % self.iter_monitor == 0:
            with open(self.logfile_path) as fp:
                # Skip lines that were already parsed on a previous call.
                for _ in range(self.line_idx):
                    next(fp)
                for line in fp:
                    if line.startswith('Batch'):
                        line = line.split(' ')
                        line = [x for x in line if x]
                        layer_name = line[2]
                        layer_value = np.array(float(line[3].split('\t')[0])).flatten()
                        if np.isfinite(layer_value):
                            self.dist_summary_writer.add_histogram(layer_name, layer_value)
                    self.line_idx += 1
        # Save the training loss.
        if self.batch % self.frequent == 0:
            if param.eval_metric is None:
                return
            name_value = param.eval_metric.get_name_value()
            for name, value in name_value:
                if self.prefix is not None:
                    name = '%s-%s' % (self.prefix, name)
                self.scalar_summary_writer.add_scalar(name, value, global_step=self.batch)
        self.batch += 1
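# A minimal sketch of how a callback like ParseLogCallback could be wired into
# an MXNet fit() call. The symbol `net`, the iterators, and the monitor
# `pattern` are assumptions for illustration, not from the original snippet;
# the monitor's printed output is assumed to be redirected to logs/monitor.log.
import mxnet as mx

monitor = mx.mon.Monitor(interval=50, pattern='.*weight')  # layers to watch (assumed pattern)
parse_cb = ParseLogCallback(dist_logging_dir='logs/dist',
                            scalar_logging_dir='logs/scalar',
                            logfile_path='logs/monitor.log',
                            batch_size=32, iter_monitor=50, frequent=20)
mod = mx.mod.Module(symbol=net)  # `net` is a hypothetical symbol
mod.fit(train_iter, eval_data=val_iter, num_epoch=10,
        batch_end_callback=parse_cb, monitor=monitor)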
class LogMetricsCallback(object):
    """Log metrics periodically in TensorBoard.

    This callback works almost the same as `callback.Speedometer`, but writes
    a TensorBoard event file for visualization. For more usage, please refer to
    https://github.com/dmlc/tensorboard

    Parameters
    ----------
    logging_dir : str
        TensorBoard event file directory. After that, use
        `tensorboard --logdir=path/to/logs` to launch TensorBoard visualization.
    prefix : str
        Prefix for a metric name of `scalar` value. You might want to use this
        param to leverage the TensorBoard plot feature, where TensorBoard plots
        different curves in one graph when they have the same `name`. The
        following example shows the usage (how to compare a train and eval
        metric in the same graph).

    Examples
    --------
    >>> # log train and eval metrics under different directories.
    >>> training_log = 'logs/train'
    >>> evaluation_log = 'logs/eval'
    >>> # in this case, each train and eval metric pair has the same name;
    >>> # you can add a prefix to make them separate.
    >>> batch_end_callbacks = [mx.contrib.tensorboard.LogMetricsCallback(training_log)]
    >>> eval_end_callbacks = [mx.contrib.tensorboard.LogMetricsCallback(evaluation_log)]
    >>> # run
    >>> model.fit(train,
    >>>     ...
    >>>     batch_end_callback=batch_end_callbacks,
    >>>     eval_end_callback=eval_end_callbacks)
    >>> # Then use `tensorboard --logdir=logs/` to launch TensorBoard visualization.
    """
    def __init__(self, logging_dir, prefix=None):
        self.prefix = prefix
        try:
            from tensorboard import SummaryWriter
            self.summary_writer = SummaryWriter(logging_dir)
        except ImportError:
            logging.error('You can install tensorboard via `pip install tensorboard`.')

    def __call__(self, param):
        """Callback to log training speed and metrics in TensorBoard."""
        if param.eval_metric is None:
            return
        name_value = param.eval_metric.get_name_value()
        for name, value in name_value:
            if self.prefix is not None:
                name = '%s-%s' % (self.prefix, name)
            self.summary_writer.add_scalar(name, value, global_step=param.epoch)
class LogMetricsCallback(object):
    """Log metrics periodically in TensorBoard.

    This callback works almost the same as `callback.Speedometer`, but writes
    a TensorBoard event file for visualization. For more usage, please refer to
    https://github.com/dmlc/tensorboard

    Parameters
    ----------
    logging_dir : str
        TensorBoard event file directory. After that, use
        `tensorboard --logdir=path/to/logs` to launch TensorBoard visualization.
    prefix : str
        Prefix for a metric name of `scalar` value. You might want to use this
        param to leverage the TensorBoard plot feature, where TensorBoard plots
        different curves in one graph when they have the same `name`. The
        following example shows the usage (how to compare a train and eval
        metric in the same graph).

    Examples
    --------
    >>> # log train and eval metrics under different directories.
    >>> training_log = 'logs/train'
    >>> evaluation_log = 'logs/eval'
    >>> # in this case, each train and eval metric pair has the same name;
    >>> # you can add a prefix to make them separate.
    >>> batch_end_callbacks = [mx.contrib.tensorboard.LogMetricsCallback(training_log)]
    >>> eval_end_callbacks = [mx.contrib.tensorboard.LogMetricsCallback(evaluation_log)]
    >>> # run
    >>> model.fit(train,
    >>>     ...
    >>>     batch_end_callback=batch_end_callbacks,
    >>>     eval_end_callback=eval_end_callbacks)
    >>> # Then use `tensorboard --logdir=logs/` to launch TensorBoard visualization.
    """
    def __init__(self, logging_dir, prefix=None):
        self.prefix = prefix
        try:
            from tensorboard import SummaryWriter
            self.summary_writer = SummaryWriter(logging_dir)
        except ImportError:
            logging.error('You can install tensorboard via `pip install tensorboard`.')

    def __call__(self, param):
        """Callback to log training speed and metrics in TensorBoard."""
        if param.eval_metric is None:
            return
        name_value = param.eval_metric.get_name_value()
        for name, value in name_value:
            if self.prefix is not None:
                name = '%s-%s' % (self.prefix, name)
            self.summary_writer.add_scalar(name, value)
from collections import defaultdict

from tensorboard import SummaryWriter


class Logger:
    def __init__(self, root):
        self.writer = SummaryWriter(root)
        self.last_indexes = defaultdict(int)

    def scalar(self, key, value, index=None):
        # Fall back to (and advance) a per-key counter when no index is given.
        index = index if index is not None else self.last_indexes[key]
        self.last_indexes[key] += 1
        value = to_numeric(value)
        self.writer.add_scalar(key, value, index)

    def from_stats(self, key_value_dictionary, index=None):
        for key in key_value_dictionary:
            self.scalar(key, key_value_dictionary[key], index)
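# A minimal usage sketch for the Logger above; assumes `to_numeric` (not shown
# in the original) simply coerces tensors/arrays to Python floats.
logger = Logger('runs/experiment1')
logger.scalar('train/loss', 0.73, index=0)   # explicit step
for loss in (0.70, 0.65, 0.61):
    logger.scalar('val/loss', loss)          # implicit step: 0, 1, 2, ...
logger.from_stats({'val/acc': 0.88, 'val/f1': 0.84}, index=3)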
class LogMetricsCallback(object):
    def __init__(self, logging_dir, prefix=None):
        self.prefix = prefix
        self.itr = 0
        try:
            from tensorboard import SummaryWriter
            self.summary_writer = SummaryWriter(logging_dir)
        except ImportError:
            logging.error('You can install tensorboard via `pip install tensorboard`.')

    def __call__(self, name_value):
        """Callback to log training speed and metrics in TensorBoard."""
        if name_value is None:
            return
        for name, value in name_value:
            if self.prefix is not None:
                name = '%s-%s' % (self.prefix, name)
            self.summary_writer.add_scalar(name, value, self.itr)
        self.itr += 1
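# Unlike the `param`-based variants, this callback takes the (name, value)
# pairs directly, so it can be driven by hand; a small sketch:
cb = LogMetricsCallback('logs/train', prefix='train')
for epoch_metrics in ([('acc', 0.62), ('loss', 1.31)],
                      [('acc', 0.71), ('loss', 0.98)]):
    cb(epoch_metrics)  # logged as 'train-acc' / 'train-loss' at steps 0, 1, ...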
class LogMetricsCallback(object):
    """Log metrics periodically in TensorBoard.

    This callback works almost the same as `callback.Speedometer`, but writes
    a TensorBoard event file for visualization. For more usage, please refer to
    https://github.com/dmlc/tensorboard

    Parameters
    ----------
    logging_dir : str
        TensorBoard event file directory. After that, use
        `tensorboard --logdir=path/to/logs` to launch TensorBoard visualization.
    prefix : str
        Prefix for a metric name of `scalar` value. You might want to use this
        param to leverage the TensorBoard plot feature, where TensorBoard plots
        different curves in one graph when they have the same `name`.
    """
    def __init__(self, logging_dir, prefix=None, global_step=100):
        self.prefix = prefix
        self.global_step = global_step
        try:
            from tensorboard import SummaryWriter
            self.summary_writer = SummaryWriter(logging_dir)
        except ImportError:
            logging.error('You can install tensorboard via `pip install tensorboard`.')

    def __call__(self, param):
        """Callback to log training speed and metrics in TensorBoard."""
        if param.eval_metric is None:
            return
        name_value = param.eval_metric.get_name_value()
        for name, value in name_value:
            if self.prefix is not None:
                name = '%s-%s' % (self.prefix, name)
            # Note: `global_step` is fixed at construction time, so every value
            # lands on the same step unless the attribute is updated externally.
            self.summary_writer.add_scalar(name, value, global_step=self.global_step)
def train(self) -> None:
    epoch_counter = 0
    # Resume from serialization path if it contains a saved model.
    if self._serialization_dir is not None:
        # Set up tensorboard logging.
        train_log = SummaryWriter(os.path.join(self._serialization_dir, "log", "train"))
        validation_log = SummaryWriter(os.path.join(self._serialization_dir, "log", "validation"))
        if any("model_state_epoch_" in x for x in os.listdir(self._serialization_dir)):
            logger.info("Loading model from checkpoint.")
            epoch_counter = self._restore_checkpoint()

    if self._grad_clipping is not None:
        # Pylint is unable to tell that we're in the case that _grad_clipping is not None...
        # pylint: disable=invalid-unary-operand-type
        clip_function = lambda grad: grad.clamp(-self._grad_clipping, self._grad_clipping)
        for parameter in self._model.parameters():
            if parameter.requires_grad:
                parameter.register_hook(clip_function)

    logger.info("Beginning training.")
    num_training_batches = self._iterator.get_num_batches(self._train_dataset)
    if self._validation_dataset is not None:
        num_validation_batches = self._iterator.get_num_batches(self._validation_dataset)
    validation_metric_per_epoch: List[float] = []
    for epoch in range(epoch_counter, self._num_epochs):
        logger.info("Epoch %d/%d", epoch + 1, self._num_epochs)
        train_loss = 0.0
        val_loss = 0.0
        # Set the model to "train" mode.
        self._model.train()
        train_generator = self._iterator(self._train_dataset, num_epochs=1)
        train_generator_tqdm = tqdm.tqdm(train_generator,
                                         disable=self._no_tqdm,
                                         total=num_training_batches)
        last_log = time.time()
        batch_num = 0
        logger.info("Training")
        for batch in train_generator_tqdm:
            batch_num += 1
            self._optimizer.zero_grad()
            output_dict = self._forward(batch, for_training=True)
            try:
                loss = output_dict["loss"]
                loss.backward()
                # Make sure Variable is on the cpu before converting to numpy.
                # .cpu() is a no-op if you aren't using GPUs.
                train_loss += loss.data.cpu().numpy()
            except KeyError:
                raise ConfigurationError("The model you are trying to optimize does not contain a"
                                         " 'loss' key in the output of model.forward(inputs).")
            if self._grad_norm:
                clip_grad_norm(self._model.parameters(), self._grad_norm)
            self._optimizer.step()
            metrics = self._model.get_metrics()
            metrics["loss"] = float(train_loss / batch_num)
            description = self._description_from_metrics(metrics)
            train_generator_tqdm.set_description(description)
            batch_num_total = num_training_batches * epoch + batch_num
            if self._serialization_dir and batch_num_total % self._summary_interval == 0:
                for name, param in self._model.named_parameters():
                    train_log.add_scalar("PARAMETER_MEAN/" + name, param.data.mean(), batch_num_total)
                    train_log.add_scalar("PARAMETER_STD/" + name, param.data.std(), batch_num_total)
                    if param.grad is not None:
                        train_log.add_scalar("GRAD_MEAN/" + name, param.grad.data.mean(), batch_num_total)
                        train_log.add_scalar("GRAD_STD/" + name, param.grad.data.std(), batch_num_total)
                train_log.add_scalar("LOSS/loss_train", metrics["loss"], batch_num_total)
            if self._no_tqdm and time.time() - last_log > self._log_interval:
                logger.info("Batch %d/%d: %s", batch_num, num_training_batches, description)
                last_log = time.time()
        metrics = self._model.get_metrics(reset=True)
        metrics["loss"] = float(train_loss / batch_num)

        if self._validation_dataset is not None:
            logger.info("Validating")
            # Switch to evaluation mode.
            self._model.eval()
            val_generator = self._iterator(self._validation_dataset, num_epochs=1)
            val_generator_tqdm = tqdm.tqdm(val_generator,
                                           disable=self._no_tqdm,
                                           total=num_validation_batches)
            batch_num = 0
            for batch in val_generator_tqdm:
                batch_num += 1
                val_output_dict = self._forward(batch, for_training=False)
                loss = val_output_dict["loss"]
                val_loss += loss.data.cpu().numpy()
                val_metrics = self._model.get_metrics()
                val_metrics["loss"] = float(val_loss / batch_num)
                description = self._description_from_metrics(val_metrics)
                val_generator_tqdm.set_description(description)
                if self._no_tqdm and time.time() - last_log > self._log_interval:
                    logger.info("Batch %d/%d: %s", batch_num, num_validation_batches, description)
                    last_log = time.time()
            val_metrics = self._model.get_metrics(reset=True)
            val_metrics["loss"] = float(val_loss / batch_num)
            message_template = "Training %s : %3f    Validation %s : %3f "
            for name, value in metrics.items():
                logger.info(message_template, name, value, name, val_metrics[name])
                if self._serialization_dir:
                    train_log.add_scalar(name, value, epoch)
                    validation_log.add_scalar(name, val_metrics[name], epoch)
            this_epoch_val_metric = val_metrics[self._validation_metric]
            if len(validation_metric_per_epoch) > self._patience:
                # Is the worst validation performance in the past self._patience
                # epochs better than the current value?
                if self._validation_metric_decreases:
                    should_stop = max(validation_metric_per_epoch[-self._patience:]) < this_epoch_val_metric
                else:
                    should_stop = min(validation_metric_per_epoch[-self._patience:]) > this_epoch_val_metric
                if should_stop:
                    logger.info("Ran out of patience. Stopping training.")
                    break
            validation_metric_per_epoch.append(this_epoch_val_metric)
            if self._validation_metric_decreases:
                is_best_so_far = this_epoch_val_metric == min(validation_metric_per_epoch)
            else:
                is_best_so_far = this_epoch_val_metric == max(validation_metric_per_epoch)
            if self._serialization_dir:
                self._save_checkpoint(epoch, is_best=is_best_so_far)
            if self._learning_rate_scheduler:
                # Grim hack to determine whether the validation metric we are recording
                # needs to be passed to the scheduler. This is required because the
                # step() function of the different schedulers are (understandably)
                # different to ReduceLROnPlateau.
                if isinstance(self._learning_rate_scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau):
                    self._learning_rate_scheduler.step(this_epoch_val_metric, epoch)
                else:
                    self._learning_rate_scheduler.step(epoch)
        else:
            message_template = "Training %s : %3f "
            for name, value in metrics.items():
                logger.info(message_template, name, value)
                if self._serialization_dir:
                    train_log.add_scalar(name, value, epoch)
            if self._serialization_dir:
                self._save_checkpoint(epoch)
            if self._learning_rate_scheduler:
                if isinstance(self._learning_rate_scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau):
                    raise ConfigurationError("The reduce_on_plateau learning rate scheduler requires "
                                             "a validation metric to compute the schedule and therefore "
                                             "must be used with a validation dataset.")
                self._learning_rate_scheduler.step(epoch)
optimizer.zero_grad()
# Compute the output.
out = net(data_batch)  # only the final output is needed here
# out = out[0]
# Loss.
loss_value = loss(out, labels_batch)
# Accuracy.
accuracy_value = classification_accuracy(out, labels_batch)
# Backpropagate.
loss_value.backward()
# The gradients are now available, so the weights can be updated as well.
optimizer.step()

# LOGGING
progress.update(progress.value + 1,
                loss=loss_value.data.cpu().numpy()[0],
                accuracy=accuracy_value,
                epoch=i + 1)
if j % logging_step == 0:
    # LOSS / ACCURACY
    writer.add_scalar('loss', loss_value.data[0], step)
    writer.add_scalar('accuracy', accuracy_value, step)
    step += 1
    # PARAMS
    # for name, param in net.named_parameters():
    #     writer.add_histogram(name, param.clone().cpu().data.numpy(), i * batch_number + j)
progress.finish()
optimizer_feat = torch.optim.Adam(res101.parameters(), lr=1e-4)
for t in range(10):
    for i, (img, label) in enumerate(loader):
        img = img.cuda()
        label = label[0].cuda()
        label = Variable(label)
        input = Variable(img)
        feats = res101(input)
        output = seg(feats)

        seg.zero_grad()
        res101.zero_grad()
        loss = criterion(output, label)
        loss.backward()
        optimizer_feat.step()
        optimizer_seg.step()

        # Visualize inputs, labels and predictions.
        input = make_image_grid(img, mean, std)
        label = make_label_grid(label.data)
        label = Colorize()(label).type(torch.FloatTensor)
        output = make_label_grid(torch.max(output, dim=1)[1].data)
        output = Colorize()(output).type(torch.FloatTensor)
        writer.add_image('image', input, i)
        writer.add_image('label', label, i)
        writer.add_image('pred', output, i)
        writer.add_scalar('loss', loss.data[0], i)
        print("epoch %d step %d, loss=%.4f" % (t, i, loss.data.cpu()[0]))
def train_loop(thread_id, env_name, shared_model, opt, phi, board_path):
    logger = logging.getLogger(__name__)
    agent = Agent_a3c(shared_model=shared_model, optimizer=opt, phi=phi)
    agent.generagte_local_model(thread_id)
    done = False
    episode = 0
    r = 0
    step = 0
    step_last = 0
    global_step = 0
    local_r_sum = 0
    env = gym.make(env_name)
    obs = env.reset()
    # Set up the TensorBoard writer.
    writer = SummaryWriter(board_path)
    while True:
        if done or step == MAX_EPISODE_LEN:
            obs = env.reset()
            global_step += step
            if thread_id == 0:
                logger.info('episode: {}, r_sum: {}, total_step: {}, step len in episode: {}'
                            .format(episode, local_r_sum, step, step - step_last))
                if episode % EVAL_INTERVAL == 0:
                    evaluate(env, agent)
            writer.add_scalar('reward_sum_{}'.format(thread_id), local_r_sum, episode)
            writer.add_scalar('V_{}'.format(thread_id), agent.shared_V_out.data, episode)
            writer.add_scalar('A_{}'.format(thread_id), agent.A.data, episode)
            writer.add_scalar('loss_v_{}'.format(thread_id), agent.V_loss.data, episode)
            writer.add_scalar('loss_pi_{}'.format(thread_id), agent.pi_loss.data, episode)
            writer.add_scalar('loss_entropy_{}'.format(thread_id), agent.entropy_loss.data, episode)
            writer.add_all_parameter_histograms([agent.pi_loss], episode)
            writer.add_all_parameter_histograms([agent.V_loss], episode)
            writer.add_all_parameter_histograms([agent.entropy_loss], episode)
            r = 0
            step_last = step
            local_r_sum = 0
            done = False
            episode += 1
        else:
            a = agent.act_and_train(obs, r, is_state_terminal=done)
            obs, r, done, info = env.step(a)
            r = 0.01 * r  # scale down the reward
            if thread_id == 0:
                logger.debug('step: {}, r: {}, a: {}, s: {}, done: {}, info: {}'
                             .format(step, r, a, obs, done, info))
            local_r_sum += r
            step += 1
def main(args):
    writer = SummaryWriter(args.logs_dir)
    sys.stdout = Logger(osp.join(args.logs_dir, 'train_log.txt'))
    print(args)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    cudnn.benchmark = True

    # Create data loaders
    data_dir = osp.join(args.data_dir, args.dataset)
    dataset, num_classes, dim_featx, dim_featy, train_loader, val_loader, test_loader = \
        get_data(args.dataset, data_dir, args.data_type,
                 args.batch_size, args.workers, args.combine_trainval,
                 head_feat_dir=args.head_feat_dir,
                 face_feat_dir=args.face_feat_dir,
                 body_feat_dir=args.body_feat_dir,
                 upperbody_feat_dir=args.upperbody_feat_dir)

    # Create model
    model = RANet(4, num_features=dim_featx)
    # model = torch.nn.DataParallel(model).cuda()
    model = model.cuda()

    # Load from checkpoint
    if args.resume:
        checkpoint = load_checkpoint(args.resume)
        model.load_state_dict(checkpoint['state_dict'])
        args.start_epoch = checkpoint['epoch']
        best_top1 = checkpoint['best_top1']
        print("=> start epoch {}  best top1 {:.1%}".format(args.start_epoch, best_top1))
    else:
        best_top1 = 0

    # Criterion
    criterion = OIM4bLoss(dim_featy, num_classes,
                          scalar=args.oim_scalar, momentum=args.oim_momentum)
    criterion.init_lut(train_loader)
    criterion.cuda()

    # Optimizer
    if args.optimizer == 'sgd':
        param_groups = model.parameters()
        optimizer = torch.optim.SGD(param_groups, lr=args.lr,
                                    momentum=args.momentum,
                                    weight_decay=args.weight_decay,
                                    nesterov=True)
    elif args.optimizer == 'adam':
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr,
                                     weight_decay=args.weight_decay)
    else:
        raise ValueError("Cannot recognize optimizer type:", args.optimizer)

    # Evaluator and Trainer
    evaluator = RAEvaluator(model)
    trainer = RATrainer(model, criterion)

    # Schedule learning rate
    def adjust_lr(epoch):
        if args.optimizer == 'sgd':
            lr = args.lr * (0.1 ** (epoch // 20))
        elif args.optimizer == 'adam':
            lr = args.lr if epoch <= 50 else \
                args.lr * (0.01 ** (epoch - 50) / 30)
        else:
            raise ValueError("Cannot recognize optimizer type:", args.optimizer)
        for g in optimizer.param_groups:
            g['lr'] = lr * g.get('lr_mult', 1)

    # Start training
    top1 = evaluator.evaluate(val_loader, print_summary=True)
    test_top1 = evaluator.test(test_loader, dataset.gallery, dataset.query, print_summary=True)
    for epoch in range(args.start_epoch, args.epochs):
        adjust_lr(epoch)
        loss, prec = trainer.train(epoch, train_loader, optimizer, print_freq=1)
        writer.add_scalar('Train loss', loss, epoch + 1)
        writer.add_scalar('Train accuracy', prec, epoch + 1)
        top1 = evaluator.evaluate(val_loader, print_summary=False)
        writer.add_scalar('Val accuracy', top1, epoch + 1)
        test_top1 = evaluator.test(test_loader, dataset.gallery, dataset.query, print_summary=True)
        test_top1 = evaluator.test(test_loader, dataset.query, dataset.gallery, print_summary=True)
        writer.add_scalar('Test accuracy', test_top1, epoch + 1)

        is_best = top1 > best_top1
        best_top1 = max(top1, best_top1)
        save_checkpoint({
            'state_dict': model.state_dict(),
            'epoch': epoch + 1,
            'best_top1': best_top1,
        }, is_best, fpath=osp.join(args.logs_dir, 'checkpoint.pth.tar'))

        print('\n * Finished epoch {:3d}  top1: {:5.1%}  best: {:5.1%}{}\n'
              .format(epoch, top1, best_top1, ' *' if is_best else ''))

    # Final test
    print('Test with best model:')
    checkpoint = load_checkpoint(osp.join(args.logs_dir, 'model_best.pth.tar'))
    model.load_state_dict(checkpoint['state_dict'])
    evaluator.test(test_loader, dataset.gallery, dataset.query)
def do_training(num_epoch, optimizer, kvstore, learning_rate, model_prefix, decay):
    summary_writer = SummaryWriter(args.tblog_dir)
    lr_scheduler = SimpleLRScheduler(learning_rate)
    optimizer_params = {'lr_scheduler': lr_scheduler}
    module.init_params()
    module.init_optimizer(kvstore=kvstore,
                          optimizer=optimizer,
                          optimizer_params=optimizer_params)
    n_epoch = 0
    while True:
        if n_epoch >= num_epoch:
            break
        train_iter.reset()
        val_iter.reset()
        loss_metric.reset()
        for n_batch, data_batch in enumerate(train_iter):
            module.forward_backward(data_batch)
            module.update()
            module.update_metric(loss_metric, data_batch.label)
            loss_metric.get_batch_log(n_batch)
        train_acc, train_loss, train_recon_err = loss_metric.get_name_value()
        loss_metric.reset()
        for n_batch, data_batch in enumerate(val_iter):
            module.forward(data_batch)
            module.update_metric(loss_metric, data_batch.label)
            loss_metric.get_batch_log(n_batch)
        val_acc, val_loss, val_recon_err = loss_metric.get_name_value()

        summary_writer.add_scalar('train_acc', train_acc, n_epoch)
        summary_writer.add_scalar('train_loss', train_loss, n_epoch)
        summary_writer.add_scalar('train_recon_err', train_recon_err, n_epoch)
        summary_writer.add_scalar('val_acc', val_acc, n_epoch)
        summary_writer.add_scalar('val_loss', val_loss, n_epoch)
        summary_writer.add_scalar('val_recon_err', val_recon_err, n_epoch)

        print('Epoch[%d] train acc: %.4f loss: %.6f recon_err: %.6f' %
              (n_epoch, train_acc, train_loss, train_recon_err))
        print('Epoch[%d] val acc: %.4f loss: %.6f recon_err: %.6f' %
              (n_epoch, val_acc, val_loss, val_recon_err))
        print('SAVE CHECKPOINT')
        module.save_checkpoint(prefix=model_prefix, epoch=n_epoch)
        n_epoch += 1
        # Exponential learning-rate decay per epoch.
        lr_scheduler.learning_rate = learning_rate * (decay ** n_epoch)
def test_log_scalar_summary():
    logdir = './experiment/scalar'
    writer = SummaryWriter(logdir)
    for i in range(10):
        writer.add_scalar('test_scalar', i + 1)
    writer.close()
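# For comparison, a hypothetical variant of the same test that passes an
# explicit global step, as most of the snippets in this file do:
def test_log_scalar_summary_with_step():
    writer = SummaryWriter('./experiment/scalar_with_step')
    for i in range(10):
        writer.add_scalar('test_scalar_step', i + 1, i)  # pin each point to step i
    writer.close()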
###########################
netG.zero_grad()
labelv = Variable(label.fill_(real_label))  # fake labels are real for generator cost
output = netD(fake)
errG = criterion(output, labelv)
errG.backward()
D_G_z2 = output.data.mean()
optimizerG.step()

print('[%d/%d][%d/%d] Loss_D: %.4f Loss_G: %.4f D(x): %.4f D(G(z)): %.4f / %.4f'
      % (epoch, opt.niter, i, len(dataloader),
         errD.data[0], errG.data[0], D_x, D_G_z1, D_G_z2))
niter = epoch * len(dataloader) + i
writer.add_scalar('Loss/D', errD.data[0], niter)
writer.add_scalar('Loss/G', errG.data[0], niter)
writer.add_scalar('D(x)', D_x, niter)
writer.add_scalar('D(G(z1))', D_G_z1, niter)
writer.add_scalar('D(G(z2))', D_G_z2, niter)
if i % 100 == 0:
    vutils.save_image(real_cpu, '%s/real_samples.png' % opt.outf, normalize=True)
    writer.add_image('real_samples', vutils.make_grid(real_cpu, normalize=True), niter)
    fake = netG(fixed_noise)
    vutils.save_image(fake.data, '%s/fake_samples_epoch_%03d.png' % (opt.outf, epoch),
def train(self) -> None:
    epoch_counter = 0
    # Resume from serialization path if it contains a saved model.
    if self._serialization_prefix is not None:
        # Set up tensorboard logging.
        train_log = SummaryWriter(os.path.join(self._serialization_prefix, "log", "train"))
        validation_log = SummaryWriter(os.path.join(self._serialization_prefix, "log", "validation"))
        if any("model_state_epoch_" in x for x in os.listdir(self._serialization_prefix)):
            logger.info("Loading model from checkpoint.")
            epoch_counter = self._restore_checkpoint()

    if self._grad_clipping is not None:
        # Pylint is unable to tell that we're in the case that _grad_clipping is not None...
        # pylint: disable=invalid-unary-operand-type
        clip_function = lambda grad: grad.clamp(-self._grad_clipping, self._grad_clipping)
        for parameter in self._model.parameters():
            if parameter.requires_grad:
                parameter.register_hook(clip_function)

    logger.info("Beginning training.")
    num_training_batches = self._iterator.get_num_batches(self._train_dataset)
    if self._validation_dataset is not None:
        num_validation_batches = self._iterator.get_num_batches(self._validation_dataset)
    for epoch in range(epoch_counter, self._num_epochs):
        logger.info("Epoch %d/%d", epoch + 1, self._num_epochs)
        train_loss = 0.0
        val_loss = 0.0
        validation_metric_per_epoch = []  # type: List[float]
        # Set the model to "train" mode.
        self._model.train()
        train_generator = self._iterator(self._train_dataset, num_epochs=1)
        train_generator_tqdm = tqdm.tqdm(train_generator, total=num_training_batches)
        batch_num = 0
        for batch in train_generator_tqdm:
            batch_num += 1
            tensor_batch = arrays_to_variables(batch, self._cuda_device)
            self._optimizer.zero_grad()
            output_dict = self._model.forward(**tensor_batch)
            try:
                loss = output_dict["loss"]
                loss.backward()
                # Make sure Variable is on the cpu before converting to numpy.
                # .cpu() is a no-op if you aren't using GPUs.
                train_loss += loss.data.cpu().numpy()
            except KeyError:
                raise ConfigurationError("The model you are trying to optimize does not contain a"
                                         " 'loss' key in the output of model.forward(inputs).")
            if self._grad_norm:
                clip_grad_norm(self._model.parameters(), self._grad_norm)
            self._optimizer.step()
            metrics = self._model.get_metrics()
            metrics["loss"] = float(train_loss / batch_num)
            train_generator_tqdm.set_description(self._description_from_metrics(metrics))
        metrics = self._model.get_metrics(reset=True)
        metrics["loss"] = float(train_loss / batch_num)

        if self._validation_dataset is not None:
            # Switch to evaluation mode.
            self._model.eval()
            val_generator = self._iterator(self._validation_dataset, num_epochs=1)
            val_generator_tqdm = tqdm.tqdm(val_generator, total=num_validation_batches)
            batch_num = 0
            for batch in val_generator_tqdm:
                batch_num += 1
                tensor_batch = arrays_to_variables(batch, self._cuda_device, for_training=False)
                val_output_dict = self._model.forward(**tensor_batch)
                loss = val_output_dict["loss"]
                val_loss += loss.data.cpu().numpy()
                val_metrics = self._model.get_metrics()
                val_metrics["loss"] = float(val_loss / batch_num)
                val_generator_tqdm.set_description(self._description_from_metrics(val_metrics))
            val_metrics = self._model.get_metrics(reset=True)
            val_metrics["loss"] = float(val_loss / batch_num)
            message_template = "Training %s : %3f    Validation %s : %3f "
            for name, value in metrics.items():
                logger.info(message_template, name, value, name, val_metrics[name])
                if self._serialization_prefix:
                    train_log.add_scalar(name, value, epoch)
                    validation_log.add_scalar(name, val_metrics[name], epoch)
            this_epoch = val_metrics[self._validation_metric]
            if len(validation_metric_per_epoch) > self._patience:
                if max(validation_metric_per_epoch[-self._patience:]) > this_epoch:
                    logger.info("Ran out of patience. Stopping training.")
                    break
            validation_metric_per_epoch.append(this_epoch)
            is_best_so_far = this_epoch == max(validation_metric_per_epoch)
            if self._serialization_prefix:
                self._save_checkpoint(epoch, is_best=is_best_so_far)
        else:
            message_template = "Training %s : %3f "
            for name, value in metrics.items():
                logger.info(message_template, name, value)
                if self._serialization_prefix:
                    train_log.add_scalar(name, value, epoch)
            if self._serialization_prefix:
                self._save_checkpoint(epoch)
def main():
    parser = argparse.ArgumentParser(description="Train U-net")
    parser.add_argument('--name', type=str, default='unknown', help='network name')
    parser.add_argument('--model_dir', type=str, required=True,
                        help='Where network will be saved and restored')
    parser.add_argument("--lr", type=float, default=1e-4, help="Adam learning rate")
    parser.add_argument("--batch_size", type=int, default=5, help="Batch size")
    parser.add_argument("--input_size", type=int, default=324,
                        help="Input size of the image that will be fed into the network. "
                             "input_size = 16*n + 4. Default: 324")
    parser.add_argument("--output_size", type=int, default=116,
                        help="Size of the image produced by the network. Default: 116")
    parser.add_argument("--tb_log_dir", type=str, required=True, help="Tensorboard log dir")
    parser.add_argument("--n_steps", type=int, default=0,
                        help="Number of steps. Default: 0 means infinite steps.")
    parser.add_argument("--dataset_dir", type=str, default="../dataset/trainset")
    parser.add_argument("--pretrained_vgg", type=str, choices=['yes', 'no'], default="yes",
                        help="Use pretrained vgg weights")
    parser.add_argument("--fix_vgg", type=str, choices=['yes', 'no'], default="yes",
                        help="Fix vgg weights while learning")
    parser.add_argument("--validation_freq", type=int, default=100,
                        help="Validation freq. Default: 100")
    parser.add_argument("--validation_set_size", type=int, default=20,
                        help="Metrics will be averaged over validation_set_size. Default: 20")
    parser.add_argument("--channel", type=str, choices=['rgb', 'gray'], default="rgb",
                        help="Channel. Default: rgb")
    args = parser.parse_args()

    net_name = args.name
    model_dir = args.model_dir
    learning_rate = args.lr
    batch_size = args.batch_size
    net_input_size = args.input_size
    net_output_size = args.output_size
    tb_log_dir = args.tb_log_dir
    n_steps = args.n_steps
    dataset_dir = args.dataset_dir
    pretrained_vgg = args.pretrained_vgg == 'yes'
    fix_vgg = args.fix_vgg == 'yes'
    validation_freq = args.validation_freq
    validation_set_size = args.validation_set_size
    channel = args.channel

    print("Load dataset")
    dataset = DS.DataSet(dataset_dir)

    print("Initialize network manager")
    network_manager = NManager(model_dir, net_name)
    if network_manager.registered:
        net = network_manager.get_net()
    else:
        print("Use pretrained weights: %s" % str(pretrained_vgg))
        net = U.Unet(vgg_pretrained=pretrained_vgg)
        network_manager.register_net(net)

    print("Move to GPU")
    net.cuda()

    if channel == "rgb":
        def get_features(x):
            return x.get_ndarray([DS.ChannelRGB_PanSharpen])
    else:
        def get_features(x):
            img0 = x.get_ndarray([DS.ChannelPAN])[0]
            img = np.array([img0, img0, img0])
            return img

    def get_target(x):
        return x.get_interior_mask()

    train_sampler = S.Sampler(dataset.train_images(), get_features, get_target,
                              net_input_size, net_output_size,
                              rotate_amplitude=20, random_crop=True, reflect=True)()
    test_sampler = S.Sampler(dataset.test_images(), get_features, get_target,
                             net_input_size, net_output_size,
                             rotate_amplitude=20, random_crop=True, reflect=True)()

    if fix_vgg:
        parameters = list(net.bn.parameters()) + list(net.decoder.parameters()) + list(net.conv1x1.parameters())
    else:
        parameters = net.parameters()

    print("LR: %f" % learning_rate)
    optimizer = torch.optim.Adam(parameters, lr=learning_rate)
    logger = SummaryWriter(tb_log_dir + "/" + net_name)

    print("Start learning")
    with network_manager.session(n_steps) as (iterator, initial_step):
        for step in tqdm.tqdm(iterator, initial=initial_step):
            batch_features, batch_target = batch_generator(train_sampler, batch_size)
            batch_features = Variable(FloatTensor(batch_features)).cuda()
            batch_target = Variable(FloatTensor(batch_target)).cuda()
            predicted = net.forward(batch_features)
            train_metrics = eval_base_metrics(predicted, batch_target)
            train_metrics = eval_precision_recall_f1(**train_metrics)
            loss = train_metrics['loss']

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            log_metrics(logger, '', train_metrics, step)
            logger.add_scalar('lr', np.log(learning_rate) / np.log(10), step)

            if step % 1000 == 0:
                network_manager.save()
            if step % validation_freq == 0:
                test_metrics = average_metrics(net, test_sampler, batch_size, validation_set_size)
                log_metrics(logger, 'val', test_metrics, step)
                avg_train_metrics = average_metrics(net, train_sampler, batch_size, validation_set_size)
                log_metrics(logger, 'avg_train', avg_train_metrics, step)
                generate_image(logger, net, 'val', dataset.test_images(),
                               get_features, get_target, net_input_size, net_output_size, step)
                generate_image(logger, net, 'train', dataset.train_images(),
                               get_features, get_target, net_input_size, net_output_size, step)
def main():
    global args, best_photo_loss, n_iter
    args = parser.parse_args()
    if args.dataset_format == 'stacked':
        from datasets.stacked_sequence_folders import SequenceFolder
    elif args.dataset_format == 'sequential':
        from datasets.sequence_folders import SequenceFolder
    save_path = Path('{}epochs{},b{},lr{}'.format(
        args.epochs,
        ',epochSize' + str(args.epoch_size) if args.epoch_size > 0 else '',
        args.batch_size,
        args.lr))
    timestamp = datetime.datetime.now().strftime("%m-%d-%H:%M")
    args.save_path = 'checkpoints' / save_path / timestamp
    print('=> will save everything to {}'.format(args.save_path))
    args.save_path.makedirs_p()
    torch.manual_seed(args.seed)

    train_writer = SummaryWriter(args.save_path / 'train')
    valid_writer = SummaryWriter(args.save_path / 'valid')
    output_writers = []
    if args.log_output:
        for i in range(3):
            output_writers.append(SummaryWriter(args.save_path / 'valid' / str(i)))

    # Data loading code
    normalize = custom_transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.2, 0.2, 0.2])
    input_transform = custom_transforms.Compose([
        custom_transforms.RandomHorizontalFlip(),
        custom_transforms.RandomScaleCrop(),
        custom_transforms.ArrayToTensor(),
        normalize
    ])
    print("=> fetching scenes in '{}'".format(args.data))
    train_set = SequenceFolder(args.data, transform=input_transform,
                               seed=args.seed, train=True)
    val_set = SequenceFolder(args.data,
                             transform=custom_transforms.Compose([
                                 custom_transforms.ArrayToTensor(), normalize]),
                             seed=args.seed, train=False)
    print('{} samples found in {} train scenes'.format(len(train_set), len(train_set.scenes)))
    print('{} samples found in {} valid scenes'.format(len(val_set), len(val_set.scenes)))
    train_loader = torch.utils.data.DataLoader(train_set, batch_size=args.batch_size,
                                               shuffle=True, num_workers=args.workers,
                                               pin_memory=True)
    val_loader = torch.utils.data.DataLoader(val_set, batch_size=args.batch_size,
                                             shuffle=False, num_workers=args.workers,
                                             pin_memory=True)
    if args.epoch_size == 0:
        args.epoch_size = len(train_loader)

    # create model
    print("=> creating model")
    disp_net = models.DispNetS().cuda()
    pose_exp_net = models.PoseExpNet(nb_ref_imgs=args.sequence_length - 1).cuda()

    if args.pretrained_exp_pose:
        print("=> using pre-trained weights for explainability and pose net")
        a = torch.load(args.pretrained_exp_pose)
        pose_exp_net.load_state_dict(a['state_dict'])
    else:
        pose_exp_net.init_weights()

    if args.pretrained_disp:
        print("=> using pre-trained weights for Dispnet")
        a = torch.load(args.pretrained_disp)
        disp_net.load_state_dict(a['state_dict'])
    else:
        disp_net.init_weights()

    cudnn.benchmark = True
    print('=> setting adam solver')
    parameters = set()
    for net_ in [disp_net, pose_exp_net]:
        parameters |= set(net_.parameters())
    optimizer = torch.optim.Adam(parameters, args.lr,
                                 betas=(args.momentum, args.beta),
                                 weight_decay=args.weight_decay)

    with open(args.save_path / args.log_summary, 'w') as csvfile:
        writer = csv.writer(csvfile, delimiter='\t')
        writer.writerow(['train_loss', 'validation_loss'])
    with open(args.save_path / args.log_full, 'w') as csvfile:
        writer = csv.writer(csvfile, delimiter='\t')
        writer.writerow(['train_loss', 'photo_loss', 'explainability_loss', 'smooth_loss'])

    logger = TermLogger(n_epochs=args.epochs,
                        train_size=min(len(train_loader), args.epoch_size),
                        valid_size=len(val_loader))
    logger.epoch_bar.start()

    for epoch in range(args.epochs):
        logger.epoch_bar.update(epoch)

        # train for one epoch
        logger.reset_train_bar()
        train_loss = train(train_loader, disp_net, pose_exp_net, optimizer,
                           args.epoch_size, logger, train_writer)
        logger.train_writer.write(' * Avg Loss : {:.3f}'.format(train_loss))

        # evaluate on validation set
        logger.reset_valid_bar()
        valid_photo_loss, valid_exp_loss, valid_total_loss = validate(
            val_loader, disp_net, pose_exp_net, epoch, logger, output_writers)
        logger.valid_writer.write(
            ' * Avg Photo Loss : {:.3f}, Valid Loss : {:.3f}, Total Loss : {:.3f}'
            .format(valid_photo_loss, valid_exp_loss, valid_total_loss))
        # Loss is multiplied by 4 because it's only one scale, instead of 4 during training.
        valid_writer.add_scalar('photometric_error', valid_photo_loss * 4, n_iter)
        valid_writer.add_scalar('explanability_loss', valid_exp_loss * 4, n_iter)
        valid_writer.add_scalar('total_loss', valid_total_loss * 4, n_iter)

        if best_photo_loss < 0:
            best_photo_loss = valid_photo_loss

        # remember lowest error and save checkpoint
        is_best = valid_photo_loss < best_photo_loss
        best_photo_loss = min(valid_photo_loss, best_photo_loss)
        save_checkpoint(
            args.save_path,
            {'epoch': epoch + 1, 'state_dict': disp_net.state_dict()},
            {'epoch': epoch + 1, 'state_dict': pose_exp_net.state_dict()},
            is_best)

        with open(args.save_path / args.log_summary, 'a') as csvfile:
            writer = csv.writer(csvfile, delimiter='\t')
            writer.writerow([train_loss, valid_total_loss])
    logger.epoch_bar.finish()
deconv.zero_grad()
feature.zero_grad()
loss.backward()
optimizer_feature.step()
optimizer_deconv.step()

# Visualize.
image = make_image_grid(inputs.data[:, :3], mean, std)
writer.add_image('Image', torchvision.utils.make_grid(image), ib)
msk = functional.sigmoid(msk)
mask1 = msk.data
mask1 = mask1.repeat(1, 3, 1, 1)
writer.add_image('Image2', torchvision.utils.make_grid(mask1), ib)
print('loss: %.4f (epoch: %d, step: %d)' % (loss.data[0], it, ib))
writer.add_scalar('M_global', loss.data[0], istep)
istep += 1
del inputs, msk, lbl, loss, feats, mask1, image
gc.collect()
if ib % 1000 == 0:
    filename = '%s/deconv-epoch-%d-step-%d.pth' % (check_root, it, ib)
    torch.save(deconv.state_dict(), filename)
    filename = '%s/feature-epoch-%d-step-%d.pth' % (check_root, it, ib)
    torch.save(feature.state_dict(), filename)
    print('save: (epoch: %d, step: %d)' % (it, ib))
validation(val_loader, '%s/%d' % (val_output_root, it), feature, deconv)
class LogMetricsCallback(object):
    """Log metrics periodically in TensorBoard.

    This callback works almost the same as `callback.Speedometer`, but writes
    a TensorBoard event file for visualization. For more usage, please refer to
    https://github.com/dmlc/tensorboard

    Parameters
    ----------
    logging_dir : str
        TensorBoard event file directory. After that, use
        `tensorboard --logdir=path/to/logs` to launch TensorBoard visualization.
    prefix : str
        Prefix for a metric name of `scalar` value. You might want to use this
        param to leverage the TensorBoard plot feature, where TensorBoard plots
        different curves in one graph when they have the same `name`. The
        following example shows the usage (how to compare a train and eval
        metric in the same graph).

    Examples
    --------
    >>> # log train and eval metrics under different directories.
    >>> training_log = 'logs/train'
    >>> evaluation_log = 'logs/eval'
    >>> # in this case, each train and eval metric pair has the same name;
    >>> # you can add a prefix to make them separate.
    >>> batch_end_callbacks = [mx.tensorboard.LogMetricsCallback(training_log)]
    >>> eval_end_callbacks = [mx.tensorboard.LogMetricsCallback(evaluation_log)]
    >>> # run
    >>> model.fit(train,
    >>>     ...
    >>>     batch_end_callback=batch_end_callbacks,
    >>>     eval_end_callback=eval_end_callbacks)
    >>> # Then use `tensorboard --logdir=logs/` to launch TensorBoard visualization.
    """
    def __init__(self, logging_dir, score_store=False, prefix=None):
        self.prefix = prefix
        self.step = 0
        self.score_store = score_store
        try:
            from tensorboard import SummaryWriter
            self.summary_writer = SummaryWriter(logging_dir)
        except ImportError:
            logging.error('You can install tensorboard via `pip install tensorboard`.')

    def __call__(self, param):
        """Callback to log training speed and metrics in TensorBoard."""
        self.step += 1
        if param.eval_metric is None:
            return
        name_value = param.eval_metric.get_name_value()
        if self.step % 20 == 0:
            for name, value in name_value:
                if self.prefix is not None:
                    name = '%s-%s' % (self.prefix, name)
                self.summary_writer.add_scalar(name, value, self.step)
        if self.step % 1000 == 0:
            im_ori = param.locals['data_batch'].label[0].asnumpy()
            im_rec = param.locals['rec_img'][0].asnumpy()
            im_ori = imageFromTensor(im_ori)
            im_rec = imageFromTensor(im_rec)
            self.summary_writer.add_image('im_ori', im_ori, self.step)
            self.summary_writer.add_image('im_rec', im_rec, self.step)
            if self.score_store:
                facenet_scores = param.locals['facenet_scores']
                self.summary_writer.add_scalar('scores_mean', facenet_scores.mean(), self.step)
                self.summary_writer.add_histogram('facenet_scores', facenet_scores, self.step)
def main():
    global args
    args = parser.parse_args()

    # Data preprocessing.
    print('==> Preparing data......')
    assert (args.dataset == 'cifar10' or args.dataset == 'cifar100'), \
        "Only support cifar10 or cifar100 dataset"

    if args.dataset == 'cifar10':
        print('To train and eval on cifar10 dataset......')
        num_classes = 10
        transform_train = transforms.Compose([
            transforms.RandomCrop(32, padding=4),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize(mean_cifar10, std_cifar10),
        ])
        transform_test = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize(mean_cifar10, std_cifar10),
        ])
        train_set = torchvision.datasets.CIFAR10(root='./data', train=True,
                                                 download=True, transform=transform_train)
        train_loader = torch.utils.data.DataLoader(train_set, batch_size=args.batch_size,
                                                   shuffle=True, num_workers=4)
        test_set = torchvision.datasets.CIFAR10(root='./data', train=False,
                                                download=True, transform=transform_test)
        test_loader = torch.utils.data.DataLoader(test_set, batch_size=100,
                                                  shuffle=False, num_workers=4)
    else:
        print('To train and eval on cifar100 dataset......')
        num_classes = 100
        transform_train = transforms.Compose([
            transforms.RandomCrop(32, padding=4),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize(mean_cifar100, std_cifar100),
        ])
        transform_test = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize(mean_cifar100, std_cifar100),
        ])
        train_set = torchvision.datasets.CIFAR100(root='./data', train=True,
                                                  download=True, transform=transform_train)
        train_loader = torch.utils.data.DataLoader(train_set, batch_size=args.batch_size,
                                                   shuffle=True, num_workers=4)
        test_set = torchvision.datasets.CIFAR100(root='./data', train=False,
                                                 download=True, transform=transform_test)
        test_loader = torch.utils.data.DataLoader(test_set, batch_size=100,
                                                  shuffle=False, num_workers=4)

    # Model
    best_acc = 0  # best test accuracy
    if args.resume:
        # Load checkpoint.
        print('==> Resuming from checkpoint..')
        assert os.path.isdir(args.ckpt_path), 'Error: checkpoint directory does not exist!'
        checkpoint = torch.load(os.path.join(args.ckpt_path, 'ckpt.t7'))
        model = checkpoint['model']
        best_acc = checkpoint['best_acc']
        start_epoch = checkpoint['epoch']
    else:
        print('==> Building model..')
        model = models.__dict__[args.arch](num_classes)
        start_epoch = args.start_epoch
    print('Number of model parameters: {}'.format(
        sum([p.data.nelement() for p in model.parameters()])))

    # Use GPUs if available.
    if torch.cuda.is_available():
        model.cuda()
        model = torch.nn.DataParallel(model, device_ids=range(torch.cuda.device_count()))
        cudnn.benchmark = True

    # Define loss function and optimizer.
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum,
                          nesterov=args.nesterov, weight_decay=args.weight_decay)

    log_dir = 'logs/' + datetime.now().strftime('%B%d %H:%M:%S')
    train_writer = SummaryWriter(os.path.join(log_dir, 'train'))
    test_writer = SummaryWriter(os.path.join(log_dir, 'test'))

    # Save argparse commandline to a file.
    with open(os.path.join(log_dir, 'commandline_args.txt'), 'w') as f:
        f.write('\n'.join(sys.argv[1:]))

    for epoch in range(start_epoch, args.epochs):
        # Learning rate schedule.
        lr = adjust_learning_rate(optimizer, epoch + 1)
        train_writer.add_scalar('lr', lr, epoch)

        # Train for one epoch.
        train(train_loader, model, criterion, optimizer, train_writer, epoch)

        # Eval on test set.
        num_iter = (epoch + 1) * len(train_loader)
        acc = eval(test_loader, model, criterion, test_writer, epoch, num_iter)

        # Save checkpoint.
        print('Saving Checkpoint......')
        state = {
            'model': model.module if torch.cuda.is_available() else model,
            'best_acc': best_acc,
            'epoch': epoch,
        }
        if not os.path.isdir(os.path.join(log_dir, 'last_ckpt')):
            os.mkdir(os.path.join(log_dir, 'last_ckpt'))
        torch.save(state, os.path.join(log_dir, 'last_ckpt', 'ckpt.t7'))
        if acc > best_acc:
            best_acc = acc
            if not os.path.isdir(os.path.join(log_dir, 'best_ckpt')):
                os.mkdir(os.path.join(log_dir, 'best_ckpt'))
            torch.save(state, os.path.join(log_dir, 'best_ckpt', 'ckpt.t7'))
        train_writer.add_scalar('best_acc', best_acc, epoch)

    train_writer.close()
    test_writer.close()
d_real = L_Df(xR)
d_fake = L_G  # L_Df(xG)
L_D = d_real - kt * d_fake
L_D.backward()
optimD.step()

L_D_val = L_D.data[0]
L_G_val = L_G.data[0]
kt = kt + lamk * (opt.gamma * L_D_val - L_G_val)
if kt < 0:
    kt = 0
M_global = L_D_val + math.fabs(opt.gamma * L_D_val - L_G_val)

writer.add_scalar('misc/M_global', M_global, n_iter)
writer.add_scalar('misc/kt', kt, n_iter)
writer.add_scalar('loss/L_D', L_D_val, n_iter)
writer.add_scalar('loss/L_G', L_G_val, n_iter)
writer.add_scalar('loss/d_real', d_real.data[0], n_iter)
writer.add_scalar('loss/d_fake', d_fake.data[0], n_iter)
LD_LG = L_D_val - L_G_val
log_variable(M_global, L_D_val, L_G_val, kt, LD_LG)

if n_iter % 10000 == 0:
    opt.lr = opt.lr / 2
    for param_group in optimD.param_groups:
        param_group['lr'] = opt.lr  # param_group['lr'] / 2
    for param_group in optimG.param_groups:
        param_group['lr'] = opt.lr  # param_group['lr'] / 2
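# The update above is the BEGAN balancing step; restated standalone as a
# sketch, using the names from the snippet (`lamk` playing the role of
# lambda_k and `gamma` the diversity ratio from the BEGAN paper):
def began_update(kt, L_D_val, L_G_val, gamma, lamk):
    kt = max(0.0, kt + lamk * (gamma * L_D_val - L_G_val))   # k_{t+1}, clipped at 0
    M_global = L_D_val + abs(gamma * L_D_val - L_G_val)      # convergence measure
    return kt, M_global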
err_d_fake.backward()
optimizerD.step()
err_d = err_d_fake + err_d_real

############################
# (2) Update G network: maximize log(D(G(z)))
###########################
output = net_d(input_fake)
output = F.sigmoid(output)
label.fill_(1)
err_g = criterion(output, Variable(label))
net_g.zero_grad()
err_g.backward()
optimizerG.step()

if i % 100 == 0:
    ##########################
    # Visualization
    ##########################
    images = make_grid((input_fake.data[:8] + 1) / 2)
    writer.add_image('images', images, i)
    writer.add_scalar('error D', err_d.data[0], i)
    writer.add_scalar('error G', err_g.data[0], i)
    print('epoch %d step %d, err_d=%.4f, err_g=%.4f' %
          (epoch, i, err_d.data[0], err_g.data[0]))
    torch.save(net_g.state_dict(), '%s/NetG-epoch-%d-step-%d.pth' % (check_root, epoch, i))
    torch.save(net_d.state_dict(), '%s/NetD-epoch-%d-step-%d.pth' % (check_root, epoch, i))
import torch
import torchvision.utils as vutils
import numpy as np
import torchvision.models as models
from datetime import datetime
from tensorboard import SummaryWriter

resnet18 = models.resnet18(True)
writer = SummaryWriter('runs/' + datetime.now().strftime('%B%d %H:%M:%S'))
sample_rate = 44100
freqs = [262, 294, 330, 349, 392, 440, 440, 440, 440, 440, 440]

for n_iter in range(100):
    M_global = torch.rand(1)  # value to keep
    writer.add_scalar('M_global', M_global[0], n_iter)
    x = torch.rand(32, 3, 64, 64)  # output from network
    if n_iter % 10 == 0:
        x = vutils.make_grid(x, normalize=True, scale_each=True)
        writer.add_image('Image', x, n_iter)
        x = torch.zeros(sample_rate * 2)
        for i in range(x.size(0)):
            # Sound amplitude should be in [-1, 1].
            x[i] = np.cos(freqs[n_iter // 10] * np.pi * float(i) / float(sample_rate))
        writer.add_audio('Audio', x, n_iter)
        for name, param in resnet18.named_parameters():
            writer.add_histogram(name, param.clone().cpu().data.numpy(), n_iter)
        writer.add_text('Text', 'text logged at step:' + str(n_iter), n_iter)
        writer.add_text('another Text', 'another text logged at step:' + str(n_iter), n_iter)
writer.close()
def do_training(args, module, data_train, data_val, begin_epoch=0):
    from distutils.dir_util import mkpath
    from log_util import LogUtil

    log = LogUtil().getlogger()
    mkpath(os.path.dirname(get_checkpoint_path(args)))

    # seq_len = args.config.get('arch', 'max_t_count')
    batch_size = args.config.getint('common', 'batch_size')
    save_checkpoint_every_n_epoch = args.config.getint('common', 'save_checkpoint_every_n_epoch')
    save_checkpoint_every_n_batch = args.config.getint('common', 'save_checkpoint_every_n_batch')
    enable_logging_train_metric = args.config.getboolean('train', 'enable_logging_train_metric')
    enable_logging_validation_metric = args.config.getboolean('train', 'enable_logging_validation_metric')

    contexts = parse_contexts(args)
    num_gpu = len(contexts)
    eval_metric = STTMetric(batch_size=batch_size, num_gpu=num_gpu,
                            is_logging=enable_logging_validation_metric,
                            is_epoch_end=True)
    # tensorboard setting
    loss_metric = STTMetric(batch_size=batch_size, num_gpu=num_gpu,
                            is_logging=enable_logging_train_metric,
                            is_epoch_end=False)

    optimizer = args.config.get('optimizer', 'optimizer')
    learning_rate = args.config.getfloat('train', 'learning_rate')
    learning_rate_annealing = args.config.getfloat('train', 'learning_rate_annealing')

    mode = args.config.get('common', 'mode')
    num_epoch = args.config.getint('train', 'num_epoch')
    clip_gradient = args.config.getfloat('optimizer', 'clip_gradient')
    weight_decay = args.config.getfloat('optimizer', 'weight_decay')
    save_optimizer_states = args.config.getboolean('train', 'save_optimizer_states')
    show_every = args.config.getint('train', 'show_every')
    optimizer_params_dictionary = json.loads(args.config.get('optimizer', 'optimizer_params_dictionary'))
    kvstore_option = args.config.get('common', 'kvstore_option')
    n_epoch = begin_epoch
    is_bucketing = args.config.getboolean('arch', 'is_bucketing')

    if clip_gradient == 0:
        clip_gradient = None

    if is_bucketing and mode == 'load':
        model_file = args.config.get('common', 'model_file')
        model_name = os.path.splitext(model_file)[0]
        model_num_epoch = int(model_name[-4:])
        model_path = 'checkpoints/' + str(model_name[:-5])
        symbol, data_names, label_names = module(1600)
        model = STTBucketingModule(sym_gen=module,
                                   default_bucket_key=data_train.default_bucket_key,
                                   context=contexts)
        data_train.reset()
        model.bind(data_shapes=data_train.provide_data,
                   label_shapes=data_train.provide_label,
                   for_training=True)
        _, arg_params, aux_params = mx.model.load_checkpoint(model_path, model_num_epoch)
        model.set_params(arg_params, aux_params)
        module = model
    else:
        module.bind(data_shapes=data_train.provide_data,
                    label_shapes=data_train.provide_label,
                    for_training=True)

    if begin_epoch == 0 and mode == 'train':
        module.init_params(initializer=get_initializer(args))

    lr_scheduler = SimpleLRScheduler(learning_rate=learning_rate)

    def reset_optimizer(force_init=False):
        optimizer_params = {'lr_scheduler': lr_scheduler,
                            'clip_gradient': clip_gradient,
                            'wd': weight_decay}
        optimizer_params.update(optimizer_params_dictionary)
        module.init_optimizer(kvstore=kvstore_option,
                              optimizer=optimizer,
                              optimizer_params=optimizer_params,
                              force_init=force_init)

    if mode == "train":
        reset_optimizer(force_init=True)
    else:
        reset_optimizer(force_init=False)

    data_train.reset()
    data_train.is_first_epoch = True

    # tensorboard setting
    tblog_dir = args.config.get('common', 'tensorboard_log_dir')
    summary_writer = SummaryWriter(tblog_dir)

    while True:
        if n_epoch >= num_epoch:
            break
        loss_metric.reset()
        log.info('---------train---------')
        for nbatch, data_batch in enumerate(data_train):
            module.forward_backward(data_batch)
            module.update()
            # tensorboard setting
            if (nbatch + 1) % show_every == 0:
                module.update_metric(loss_metric, data_batch.label)
            # summary_writer.add_scalar('loss batch', loss_metric.get_batch_loss(), nbatch)
            if (nbatch + 1) % save_checkpoint_every_n_batch == 0:
                log.info('Epoch[%d] Batch[%d] SAVE CHECKPOINT', n_epoch, nbatch)
                module.save_checkpoint(
                    prefix=get_checkpoint_path(args) + "n_epoch" + str(n_epoch) + "n_batch",
                    epoch=(int((nbatch + 1) / save_checkpoint_every_n_batch) - 1),
                    save_optimizer_states=save_optimizer_states)
        # commented for Libri_sample data set to see only train cer
        log.info('---------validation---------')
        data_val.reset()
        eval_metric.reset()
        for nbatch, data_batch in enumerate(data_val):
            # is_train=False leads to a high CER here because of batch norm.
            module.forward(data_batch, is_train=True)
            module.update_metric(eval_metric, data_batch.label)
        # tensorboard setting
        val_cer, val_n_label, val_l_dist, _ = eval_metric.get_name_value()
        log.info("Epoch[%d] val cer=%f (%d / %d)", n_epoch, val_cer,
                 int(val_n_label - val_l_dist), val_n_label)
        curr_acc = val_cer
        summary_writer.add_scalar('CER validation', val_cer, n_epoch)
        assert curr_acc is not None, 'cannot find Acc_exclude_padding in eval metric'

        data_train.reset()
        data_train.is_first_epoch = False

        # tensorboard setting
        train_cer, train_n_label, train_l_dist, train_ctc_loss = loss_metric.get_name_value()
        summary_writer.add_scalar('loss epoch', train_ctc_loss, n_epoch)
        summary_writer.add_scalar('CER train', train_cer, n_epoch)

        # save checkpoints
        if n_epoch % save_checkpoint_every_n_epoch == 0:
            log.info('Epoch[%d] SAVE CHECKPOINT', n_epoch)
            module.save_checkpoint(prefix=get_checkpoint_path(args),
                                   epoch=n_epoch,
                                   save_optimizer_states=save_optimizer_states)

        n_epoch += 1
        lr_scheduler.learning_rate = learning_rate / learning_rate_annealing
    log.info('FINISH')
if args.train:
    # At any point you can hit Ctrl + C to break out of training early.
    try:
        # Loop over epochs.
        for epoch in range(1, args.epochs + 1):
            epoch_start_time = time.time()
            train(model, corpus.train, lr=lr, weight_decay=args.weight_decay)
            if args.prof:
                break
            val_ppl = evaluate(model, corpus.valid)
            if args.tb_name:
                writer.add_scalar('valid_PPL', val_ppl, epoch)
            print('-' * 89)
            print('| end of epoch {:3d} | time: {:5.2f}s | '
                  'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time), val_ppl))
            print('-' * 89)
            with open(args.save + '.epoch_{}'.format(epoch), 'wb') as f:
                torch.save(model, f)
            # Save the model if the validation loss is the best we've seen so far.
            if not best_val_ppl or val_ppl < best_val_ppl:
                with open(args.save, 'wb') as f:
                    torch.save(model, f)
                best_val_ppl = val_ppl
            else:
                # Anneal the learning rate if no improvement has been seen in the
                # validation dataset.
# accuracy
accuracy_discriminator = classification_accuracy(out_cat, labels_cat)

# BACKPROP
optimizer_discriminator.zero_grad()
net.zero_grad()
loss_discriminator.backward(retain_graph=True)
optimizer_discriminator.step()

# LOGGING
progress.update(progress.value + 1,
                loss_discriminator=loss_discriminator.data.cpu().numpy()[0],
                accuracy_discriminator=accuracy_discriminator.data.cpu().numpy()[0])
# LOSS / ACCURACY
writer.add_scalar('pretrain_loss_discriminator', loss_discriminator.data[0], i)
writer.add_scalar('pretrain_accuracy_discriminator', accuracy_discriminator.data[0], i)
progress.finish()

# print("JOINT TRAIN")
for i in range(num_epochs):
    progress = progressbar.ProgressBar(min_value=0, max_value=batch_number,
                                       initial_value=0, widgets=widgets).start()
    for j, (data_batch, labels_batch) in enumerate(loader):
        net.train(True)
        # REAL
        net.batch_real = True
        # Convert to Variables.
        data_batch = Variable(data_batch, requires_grad=True).cuda()
        # Compute the output.
        out_real = net(data_batch)
tfboard_writer=tfboard_writer) else : pass niter += 1 # global iteration counter # pass pass # info_header = ['set', 'loss', 'loss core', 'loss bern end', 'acc bern end'] # info_table = [] # logger.info("Epoch %d -- lrate %f -- time %.2fs"%(ee+1, opts['lrate'], time.time() - start_time)) # for set_name in mloss.keys() : # mloss[set_name] /= mcount[set_name] # mloss_core[set_name] /= mcount[set_name] # mloss_bernend[set_name] /= mcount[set_name] # macc_bernend[set_name] /= mcount[set_name] # info_table.append([set_name, mloss[set_name], mloss_core[set_name], mloss_bernend[set_name], macc_bernend[set_name]]) # logger.info('\n'+tab.tabulate(info_table, headers=info_header, floatfmt='.3f', tablefmt='rst')) # serialized best dev model # save_model(model_gen_a2b, 'gen_a2b', ee) save_model(model_gen_b2a, 'gen_b2a', ee) # increase step scheduler # scheduler_coeff_gan.step() if tfboard_writer is not None : tfboard_writer.add_scalar('coeff/coeff_gan', scheduler_coeff_gan.value, ee) pass pass
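# scheduler_coeff_gan is only used above through a .value attribute (logged to
# TensorBoard each epoch) and a commented-out .step(); its class is not shown
# in this excerpt. A minimal linear-ramp sketch matching that interface — the
# class name, constructor arguments, and ramp shape are all assumptions.
class LinearCoeffScheduler(object):
    """Hypothetical coefficient scheduler exposing the .value / .step() interface used above."""

    def __init__(self, start=0.0, stop=1.0, num_steps=10):
        self.start = start
        self.stop = stop
        self.num_steps = num_steps
        self._steps = 0

    @property
    def value(self):
        # linearly interpolate from start to stop, then hold at stop
        frac = min(float(self._steps) / self.num_steps, 1.0)
        return self.start + frac * (self.stop - self.start)

    def step(self):
        self._steps += 1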
def train(): loader_train = CityscapesLoader('/home/cattaneod/CITYSCAPES_crop/', split='train', is_transform=True, img_size=None, transforms=data_augmentation) trainloader = data.DataLoader(loader_train, batch_size=batch_size, num_workers=num_workers, shuffle=True, pin_memory=True) loader_test = CityscapesLoader(base_data_folder, split='test', is_transform=True, img_size=None, transforms=data_augmentation) test_loader = data.DataLoader(loader_test, batch_size=batch_size, num_workers=num_workers, shuffle=False, pin_memory=True) loader_val = CityscapesLoader(base_data_folder, split='val', is_transform=True, img_size=image_shape, return_original=True) valloader = data.DataLoader(loader_val, batch_size=batch_size, num_workers=num_workers, shuffle=False, pin_memory=True) model = deeplab_resnet_DUC.Res_Deeplab_DUC(num_classes) if TBWriter: writer = SummaryWriter() ''' if resume: print("Loading from: ", resume_filename) saved_state_dict = torch.load(resume_filename) if num_classes != 21: for i in saved_state_dict: # Scale.layer5.conv2d_list.3.weight i_parts = i.split('.') if i_parts[1] == 'layer5': saved_state_dict[i] = model.state_dict()[i] model.load_state_dict(saved_state_dict) ''' if torch.cuda.is_available(): print("Using GPU") model.cuda(0) else: print("Using CPU") model.train() if opt == "SGD": optimizer = torch.optim.SGD([{ 'params': get_1x_lr_params_NOscale(model), 'lr': l_rate }, { 'params': get_10x_lr_params(model), 'lr': 10 * l_rate }], lr=l_rate, momentum=0.9, weight_decay=5e-4) elif opt == "Adam": optimizer = torch.optim.Adam([{ 'params': get_1x_lr_params_NOscale(model), 'lr': 0 * l_rate }, { 'params': get_10x_lr_params(model), 'lr': 10 * l_rate }], lr=l_rate, weight_decay=5e-4) if resume: print("Resuming From ", resume_filename) checkpoint = torch.load(resume_filename) saved_state_dict = checkpoint['state_dict'] if reset_layer5: for i in model.state_dict(): # Scale.layer5.conv2d_list.3.weight i_parts = i.split('.') if i not in saved_state_dict or i_parts[1] == 'layer5': saved_state_dict[i] = model.state_dict()[i] model.load_state_dict(saved_state_dict) starting_epoch = checkpoint['epoch'] + 1 if poly_lr: lr_ = poly_lr2(l_rate, len(trainloader) * starting_epoch, lr_decay_iter=1, max_iter=len(trainloader) * epochs) if lr_: if opt == "SGD": optimizer = torch.optim.SGD( [{ 'params': get_1x_lr_params_NOscale(model), 'lr': lr_ }, { 'params': get_10x_lr_params(model), 'lr': 10 * lr_ }], lr=lr_, momentum=0.9, weight_decay=5e-4) elif opt == "Adam": optimizer = torch.optim.Adam( [{ 'params': get_1x_lr_params_NOscale(model), 'lr': 0 * lr_ }, { 'params': get_10x_lr_params(model), 'lr': 10 * lr_ }], lr=lr_, weight_decay=5e-4) best_metric = 0 old_file = "" train_acc = AverageMeter() train_IoU = AverageMeter() train_loss = AverageMeter() for epoch in range(starting_epoch, epochs): train_acc.reset() train_IoU.reset() train_loss.reset() train_cfmatrix = np.zeros((num_classes, num_classes)) print("\nEpoch: ", epoch) if overlay_during_training and epoch % 1 == 0: for i in range(15): print("Overlaying image ", i) names, original_img, test_img, _ = loader_val[i] test_img = test_img.unsqueeze(0) original_img = original_img.unsqueeze(0) original_img = Variable(original_img.cuda()) model.eval() test_pred = model( Variable(test_img.cuda(0), requires_grad=True)) test_img = Variable(test_img.cuda(0), requires_grad=True) #if TBWriter and i==0: # writer.add_graph(model, test_pred) test_pred = F.upsample_bilinear(test_pred, (1024, 2048)) overlay_images(names, original_img, test_pred, epoch, str(i) + '_', 
convert_id=False) del test_pred del test_img model.train() optimizer.zero_grad() with tqdm.tqdm(trainloader, ncols=150) as t: lr_ = l_rate for i, (images, labels) in enumerate(t): if torch.cuda.is_available(): images = Variable(images.cuda(0)) labels = Variable(labels.cuda(0)) else: images = Variable(images) labels = Variable(labels) iter = len(trainloader) * epoch + i outputs = model(images) #g = make_dot(outputs) #g.save('./t.dot') loss = misc.cross_entropy2d(outputs, labels, ignore_index=255) loss = loss / update_batches loss.backward() t.set_description('Loss: %8.4f - LR = %f' % (update_batches * loss.data[0], lr_)) train_loss.update(update_batches * loss.data[0]) acc, IoU, cf_matrix = accuracy_IoU( outputs, labels, np.array(range(num_classes))) if acc is not None: train_acc.update(acc) train_IoU.update(np.nanmean(IoU)) train_cfmatrix = train_cfmatrix + cf_matrix if i % update_batches == 0: optimizer.step() if poly_lr: lr_ = poly_lr2(l_rate, iter, lr_decay_iter=1, max_iter=len(trainloader) * epochs) if lr_: t.set_description( 'Step: %8.4f - LR = %f' % (update_batches * loss.data[0], lr_)) if opt == "SGD": optimizer = torch.optim.SGD( [{ 'params': get_1x_lr_params_NOscale(model), 'lr': lr_ }, { 'params': get_10x_lr_params(model), 'lr': 10 * lr_ }], lr=lr_, momentum=0.9, weight_decay=5e-4) elif opt == "Adam": optimizer = torch.optim.Adam( [{ 'params': get_1x_lr_params_NOscale(model), 'lr': 0 * lr_ }, { 'params': get_10x_lr_params(model), 'lr': 10 * lr_ }], lr=lr_, weight_decay=5e-4) #print("%8.2f %% -> Loss: %8.6f " % (i / len(trainloader) * 100, loss.data[0]), end='\r') optimizer.zero_grad() if i > 0 and i % TBUpdate == 0 and TBWriter: writer.add_scalar('Train Accuracy', train_acc.avg, iter) writer.add_scalar('Train IoU', train_IoU.avg, iter) writer.add_scalar('Train Loss', train_loss.avg, iter) del outputs del loss del images del labels t.update(1) rows = train_cfmatrix.sum(axis=1) cols = train_cfmatrix.sum(axis=0) IoU = np.ndarray(train_cfmatrix.shape[0]) for i in range(train_cfmatrix.shape[0]): if rows[i] + cols[i] > 0.: IoU[i] = train_cfmatrix[i][i] / (rows[i] + cols[i] - train_cfmatrix[i][i]) else: IoU[i] = np.nan print("\nTrain Accuracy: ", train_acc.avg) print("Train Loss: ", train_loss.avg) print("Micro IoU: ", train_IoU.avg, "\n") print("Macro IoU: ", np.nanmean(IoU), "\n") if check_validation: #VALIDATION!!! 
            val_acc = AverageMeter()
            val_IoU = AverageMeter()
            val_loss = AverageMeter()
            val_cfmatrix = np.zeros((num_classes, num_classes))
            model.eval()
            for i, (images, labels) in enumerate(valloader):
                if torch.cuda.is_available():
                    images = Variable(images.cuda(0))
                    labels = Variable(labels.cuda(0))
                else:
                    images = Variable(images)
                    labels = Variable(labels)
                iter = len(trainloader) * epoch + i
                # poly_lr_scheduler(optimizer, l_rate, iter)
                outputs = model(images)
                loss = cross_entropy2d(outputs, labels, ignore_index=255)
                val_loss.update(loss.data[0])
                acc, IoU, cf_matrix = accuracy_IoU(outputs, labels,
                                                   np.array(range(num_classes)))
                if acc is not None:
                    val_acc.update(acc)
                    val_IoU.update(np.nanmean(IoU))
                    val_cfmatrix = val_cfmatrix + cf_matrix
                del outputs
                del loss
                del images
                del labels

            print("\nVal Accuracy: ", val_acc.avg)
            print("Val Loss: ", val_loss.avg)
            print("Val IoU: ", val_IoU.avg, "\n")
            if TBWriter:
                writer.add_scalar('Val Accuracy', val_acc.avg, epoch)
                writer.add_scalar('Val IoU', val_IoU.avg, epoch)
                writer.add_scalar('Val Loss', val_loss.avg, epoch)

        save_metric = train_IoU.avg
        if check_validation:
            save_metric = val_IoU.avg

        if best_metric < save_metric:
            best_metric = save_metric
            print("New Best IoU!")
            if save:
                torch.save({
                    'epoch': epoch,
                    'state_dict': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                }, base_save_folder + "/checkpoint_" + str(epoch) + "_" + str(save_metric) + ".pth.tar")
                print("Model Saved As " + base_save_folder + "/checkpoint_" +
                      str(epoch) + "_" + str(save_metric) + ".pth.tar")
                if os.path.isfile(old_file):
                    os.remove(old_file)
                old_file = base_save_folder + "/checkpoint_" + str(epoch) + "_" + str(save_metric) + ".pth.tar"
        print("Best IoU So Far: ", best_metric)

    if TBWriter:
        writer.close()
    print("End Of Training")
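# train_acc, train_IoU, train_loss, val_acc, etc. above are AverageMeter
# instances, but the helper's definition isn't included in this excerpt.
# A minimal sketch of the reset()/update()/.avg interface the code relies on
# (a reconstruction, not necessarily the author's exact class):
class AverageMeter(object):
    """Track a running average; matches the reset()/update()/.avg usage above."""

    def __init__(self):
        self.reset()

    def reset(self):
        self.sum = 0.0
        self.count = 0
        self.avg = 0.0

    def update(self, val, n=1):
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count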
def train(): data_augmentation = DataAugmentationTransform_old(translation_range=(0.0, 0.15), rotation_range=10, zoom_range = (0.8, 1.0), flip_p = 0.5, brightness_range = (-0.2, 0.2), gamma_range = (0.5, 1.5), saturation_range=(-0.3, 0.3)) loader_train = CityscapesLoader(base_data_folder, split='train', is_transform=True, img_size=image_shape, transforms=None) trainloader = data.DataLoader(loader_train, batch_size=batch_size, num_workers=4, shuffle=True, pin_memory=True) if overlay_during_training: loader_test = CityscapesLoader(base_data_folder, split='test', is_transform=True, img_size=image_shape) test_loader = data.DataLoader(loader_test, batch_size=batch_size, num_workers=4, shuffle=False, pin_memory=True) if check_validation: loader_val = CityscapesLoader(base_data_folder, split='val', is_transform=True, img_size=image_shape) valloader = data.DataLoader(loader_val, batch_size=batch_size, num_workers=4, shuffle=False, pin_memory=True) model = get_model('fcn1s',num_classes) writer = SummaryWriter() if resume: print("Resuming From ",resume_filename) checkpoint = torch.load(resume_filename) model.load_state_dict(checkpoint['state_dict']) #starting_epoch = checkpoint['epoch'] #optimizer.load_state_dict(checkpoint['optimizer']) for param in model.parameters(): param.requires_grad = True if freeze_layers: print("Freezing VGG layers") for param in model.conv_block1.parameters(): param.requires_grad = False for param in model.conv_block2.parameters(): param.requires_grad = False for param in model.conv_block3.parameters(): param.requires_grad = False for param in model.conv_block4.parameters(): param.requires_grad = False for param in model.conv_block5.parameters(): param.requires_grad = False if torch.cuda.is_available(): print("Using GPU") model.cuda(0) else: print("Using CPU") model.train() parameters = filter(lambda p: p.requires_grad, model.parameters()) if opt == "SGD": optimizer = torch.optim.SGD(parameters, lr=l_rate, momentum=0.9, weight_decay=5e-4) elif opt =="Adam": optimizer = torch.optim.Adam(parameters, lr=l_rate, weight_decay=5e-4) best_metric = 0 old_file = "" for epoch in range(starting_epoch, epochs): train_acc = 0 train_IoU = 0 train_loss = 0 train_count = 0 print("\nEpoch: ",epoch) if overlay_during_training and epoch % 5 == 0: test_img = loader_test[67] test_img = test_img.unsqueeze(0) model.eval() test_pred = model(Variable(test_img.cuda(0), requires_grad=True)) test_img = Variable(test_img.cuda(0), requires_grad=True) overlay_images(test_img, test_pred, epoch, '67_') writer.add_graph(model, test_pred) del test_pred del test_img test_img = loader_test[88] test_img = test_img.unsqueeze(0) test_pred = model(Variable(test_img.cuda(0), requires_grad=True)) test_img = Variable(test_img.cuda(0), requires_grad=True) overlay_images(test_img, test_pred, epoch, '88_') del test_pred del test_img test_img = loader_test[175] test_img = test_img.unsqueeze(0) test_pred = model(Variable(test_img.cuda(0), requires_grad=True)) test_img = Variable(test_img.cuda(0), requires_grad=True) overlay_images(test_img, test_pred, epoch, '175_') del test_pred del test_img model.train() with tqdm.tqdm(trainloader, ncols=100) as t: for i, (images, labels) in enumerate(t): if torch.cuda.is_available(): images = Variable(images.cuda(0)) labels = Variable(labels.cuda(0)) else: images = Variable(images) labels = Variable(labels) iter = len(trainloader) * epoch + i if poly_lr: poly_lr_scheduler(optimizer, l_rate, iter, lr_decay_iter=10) optimizer.zero_grad() outputs = model(images) loss = 
cross_entropy2d(outputs, labels, ignore_index=255)
                loss.backward()
                optimizer.step()
                # print("%8.2f %% -> Loss: %8.6f " % (i / len(trainloader) * 100, loss.data[0]), end='\r')
                t.set_description('Loss: %8.6f' % loss.data[0])
                t.update(1)
                train_loss = train_loss + loss.data[0]
                acc, IoU = accuracy_IoU(outputs, labels, np.array(range(num_classes)))
                train_acc = train_acc + acc
                train_IoU = train_IoU + IoU.mean()
                train_count = train_count + 1
                del outputs
                del loss
                del images
                del labels

        train_acc = train_acc / train_count
        train_IoU = train_IoU / train_count
        train_loss = train_loss / train_count
        print("\nTrain Accuracy: ", train_acc)
        print("Train Loss: ", train_loss)
        print("Train IoU: ", train_IoU, "\n")
        writer.add_scalar('Train Accuracy', train_acc, epoch)
        writer.add_scalar('Train IoU', train_IoU, epoch)
        writer.add_scalar('Train Loss', train_loss, epoch)

        if check_validation:
            # VALIDATION!!!
            val_acc = 0
            val_IoU = 0
            val_loss = 0
            val_count = 0
            model.eval()
            for i, (images, labels) in enumerate(valloader):
                if torch.cuda.is_available():
                    images = Variable(images.cuda(0))
                    labels = Variable(labels.cuda(0))
                else:
                    images = Variable(images)
                    labels = Variable(labels)
                iter = len(trainloader) * epoch + i
                # poly_lr_scheduler(optimizer, l_rate, iter)
                outputs = model(images)
                loss = cross_entropy2d(outputs, labels, ignore_index=255)
                val_loss = val_loss + loss.data[0]
                acc, IoU = accuracy_IoU(outputs, labels, np.array(range(num_classes)))
                val_acc = val_acc + acc
                val_IoU = val_IoU + IoU.mean()
                val_count = val_count + 1
                del outputs
                del loss
                del images
                del labels

            val_acc = val_acc / val_count
            val_IoU = val_IoU / val_count
            val_loss = val_loss / val_count
            print("\nVal Accuracy: ", val_acc)
            print("Val Loss: ", val_loss)
            print("Val IoU: ", val_IoU, "\n")
            writer.add_scalar('Val Accuracy', val_acc, epoch)
            writer.add_scalar('Val IoU', val_IoU, epoch)
            writer.add_scalar('Val Loss', val_loss, epoch)

        save_metric = train_IoU
        if check_validation:
            save_metric = val_IoU

        if best_metric < save_metric:
            best_metric = save_metric
            print("New Best IoU!")
            if save:
                torch.save({
                    'epoch': epoch,
                    'state_dict': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                }, base_save_folder + "/checkpoint_" + str(epoch) + "_" + str(save_metric) + ".pth.tar")
                print("Model Saved As " + base_save_folder + "/checkpoint_" +
                      str(epoch) + "_" + str(save_metric) + ".pth.tar")
                if os.path.isfile(old_file):
                    os.remove(old_file)
                old_file = base_save_folder + "/checkpoint_" + str(epoch) + "_" + str(save_metric) + ".pth.tar"
        print("Best IoU So Far: ", best_metric)

    writer.close()
    print("End Of Training")
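# Both train() variants delegate to poly_lr2 / poly_lr_scheduler helpers that
# are not shown here. The usual "poly" policy decays the base rate as
# (1 - iter / max_iter) ** power; a sketch of poly_lr2 under that assumption.
# power=0.9 is the common DeepLab default, not confirmed by this code, and
# returning None on skipped iterations matches the callers' `if lr_:` guard.
def poly_lr2(base_lr, itr, lr_decay_iter=1, max_iter=100000, power=0.9):
    """Hypothetical 'poly' decay: lr = base_lr * (1 - itr / max_iter) ** power."""
    if itr % lr_decay_iter != 0 or itr > max_iter:
        return None  # callers above only rebuild the optimizer when a rate is returned
    return base_lr * (1.0 - float(itr) / max_iter) ** power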
# re-parameterize std = logvar.mul(0.5).exp_() noise.resize_(bsize_now, nz).normal_(0, 1) output = decoder( (Variable(noise).mul(std).add_(mu)).view(bsize_now, nz, 1, 1)) loss = loss_function(output, Variable(input), mu, logvar, bsize, img_size) encoder.zero_grad() decoder.zero_grad() loss.backward() optimizer_de.step() optimizer_en.step() print 'epoch %d step %d, err_d=%.4f' % (epoch, i, loss.data[0]) if i % 100 == 0: # ########################## # # Visualization # ########################## images = make_grid(output.data[:8]) writer.add_image('output', images, i) images = make_grid(input[:8]) writer.add_image('images', images, i) writer.add_scalar('error', loss.data[0], i) del mu, logvar, std, output, loss gc.collect() torch.save(decoder.state_dict(), '%s/decoder-epoch-%d-step-%d.pth' % (check_root, epoch, i)) torch.save(encoder.state_dict(), '%s/encoder-epoch-%d-step-%d.pth' % (check_root, epoch, i))
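# The re-parameterize block above samples z = mu + sigma * eps with in-place
# ops on a pre-allocated noise buffer. A behaviourally equivalent helper in
# the same Variable-era PyTorch style, shown only to make the trick explicit;
# this is a sketch, not the author's code.
from torch.autograd import Variable

def reparameterize(mu, logvar):
    """Reparameterization trick: z = mu + sigma * eps, with eps ~ N(0, 1)."""
    std = logvar.mul(0.5).exp()                          # sigma = exp(0.5 * log sigma^2)
    eps = Variable(std.data.new(std.size()).normal_())   # noise with std's dtype/device
    return eps.mul(std).add_(mu)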
accuracy_value = classification_accuracy(out, labels_batch) # BACKPROP #optimizer.zero_grad() #net.zero_grad() loss_value.backward() optimizer.step() # LOGGING progress.update(progress.value + 1, loss=loss_value.data.cpu().numpy()[0], accuracy=accuracy_value.data.cpu().numpy()[0], epoch=i + 1) if j % logging_step == 0: # LOSS ACCURACY writer.add_scalar('loss', loss_value.data[0], i * batch_number + j) writer.add_scalar('accuracy', accuracy_value.data[0], i * batch_number + j) # PARAMS for name, param in net.named_parameters(): writer.add_histogram(name, param.clone().cpu().data.numpy(), i * batch_number + j) if j % logging_text_step == 0: net.train(False) # STEP s = "non sopporto i giocatori di biliardo, i soprannomi, gli indecisi, i no"[ 0:75] s_final = s s = numpy.asarray([
def do_training(args, module, data_train, data_val, begin_epoch=0):
    from distutils.dir_util import mkpath
    from log_util import LogUtil

    log = LogUtil().getlogger()
    mkpath(os.path.dirname(get_checkpoint_path(args)))

    seq_len = args.config.get('arch', 'max_t_count')
    batch_size = args.config.getint('common', 'batch_size')
    save_checkpoint_every_n_epoch = args.config.getint('common', 'save_checkpoint_every_n_epoch')
    save_checkpoint_every_n_batch = args.config.getint('common', 'save_checkpoint_every_n_batch')
    enable_logging_train_metric = args.config.getboolean('train', 'enable_logging_train_metric')
    enable_logging_validation_metric = args.config.getboolean('train', 'enable_logging_validation_metric')

    contexts = parse_contexts(args)
    num_gpu = len(contexts)
    eval_metric = STTMetric(batch_size=batch_size, num_gpu=num_gpu, seq_length=seq_len,
                            is_logging=enable_logging_validation_metric, is_epoch_end=True)
    # tensorboard setting
    loss_metric = STTMetric(batch_size=batch_size, num_gpu=num_gpu, seq_length=seq_len,
                            is_logging=enable_logging_train_metric, is_epoch_end=False)

    optimizer = args.config.get('train', 'optimizer')
    momentum = args.config.getfloat('train', 'momentum')
    learning_rate = args.config.getfloat('train', 'learning_rate')
    learning_rate_annealing = args.config.getfloat('train', 'learning_rate_annealing')

    mode = args.config.get('common', 'mode')
    num_epoch = args.config.getint('train', 'num_epoch')
    clip_gradient = args.config.getfloat('train', 'clip_gradient')
    weight_decay = args.config.getfloat('train', 'weight_decay')
    save_optimizer_states = args.config.getboolean('train', 'save_optimizer_states')
    show_every = args.config.getint('train', 'show_every')
    n_epoch = begin_epoch

    if clip_gradient == 0:
        clip_gradient = None

    module.bind(data_shapes=data_train.provide_data,
                label_shapes=data_train.provide_label,
                for_training=True)

    if begin_epoch == 0 and mode == 'train':
        module.init_params(initializer=get_initializer(args))

    lr_scheduler = SimpleLRScheduler(learning_rate=learning_rate)

    def reset_optimizer(force_init=False):
        if optimizer == "sgd":
            module.init_optimizer(kvstore='device',
                                  optimizer=optimizer,
                                  optimizer_params={'lr_scheduler': lr_scheduler,
                                                    'momentum': momentum,
                                                    'clip_gradient': clip_gradient,
                                                    'wd': weight_decay},
                                  force_init=force_init)
        elif optimizer == "adam":
            module.init_optimizer(kvstore='device',
                                  optimizer=optimizer,
                                  optimizer_params={'lr_scheduler': lr_scheduler,
                                                    # 'momentum': momentum,
                                                    'clip_gradient': clip_gradient,
                                                    'wd': weight_decay},
                                  force_init=force_init)
        else:
            raise Exception('Supported optimizers are sgd and adam. '
                            'If you want to implement others, define them in train.py')

    if mode == "train":
        reset_optimizer(force_init=True)
    else:
        reset_optimizer(force_init=False)

    # tensorboard setting
    tblog_dir = args.config.get('common', 'tensorboard_log_dir')
    summary_writer = SummaryWriter(tblog_dir)

    while True:
        if n_epoch >= num_epoch:
            break

        loss_metric.reset()
        log.info('---------train---------')
        for nbatch, data_batch in enumerate(data_train):
            module.forward_backward(data_batch)
            module.update()
            # tensorboard setting
            if (nbatch + 1) % show_every == 0:
                module.update_metric(loss_metric, data_batch.label)
            # summary_writer.add_scalar('loss batch', loss_metric.get_batch_loss(), nbatch)
            if (nbatch + 1) % save_checkpoint_every_n_batch == 0:
                log.info('Epoch[%d] Batch[%d] SAVE CHECKPOINT', n_epoch, nbatch)
                module.save_checkpoint(prefix=get_checkpoint_path(args) + "n_epoch" + str(n_epoch) + "n_batch",
                                       epoch=(int((nbatch + 1) / save_checkpoint_every_n_batch) - 1),
                                       save_optimizer_states=save_optimizer_states)

        # commented for Libri_sample data set to see only train cer
        log.info('---------validation---------')
        data_val.reset()
        eval_metric.reset()
        for nbatch, data_batch in enumerate(data_val):
            # when is_train = False it leads to high cer when batch_norm
            module.forward(data_batch, is_train=True)
            module.update_metric(eval_metric, data_batch.label)

        # tensorboard setting
        val_cer, val_n_label, val_l_dist, _ = eval_metric.get_name_value()
        log.info("Epoch[%d] val cer=%f (%d / %d)", n_epoch, val_cer,
                 int(val_n_label - val_l_dist), val_n_label)
        curr_acc = val_cer
        summary_writer.add_scalar('CER validation', val_cer, n_epoch)
        assert curr_acc is not None, 'cannot find Acc_exclude_padding in eval metric'

        data_train.reset()

        # tensorboard setting
        train_cer, train_n_label, train_l_dist, train_ctc_loss = loss_metric.get_name_value()
        summary_writer.add_scalar('loss epoch', train_ctc_loss, n_epoch)
        summary_writer.add_scalar('CER train', train_cer, n_epoch)

        # save checkpoints
        if n_epoch % save_checkpoint_every_n_epoch == 0:
            log.info('Epoch[%d] SAVE CHECKPOINT', n_epoch)
            module.save_checkpoint(prefix=get_checkpoint_path(args), epoch=n_epoch,
                                   save_optimizer_states=save_optimizer_states)

        n_epoch += 1
        # note: recomputed from the constant base rate each epoch, so the
        # schedule drops once after the first epoch and then stays flat
        lr_scheduler.learning_rate = learning_rate / learning_rate_annealing

    log.info('FINISH')
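# do_training assigns lr_scheduler.learning_rate directly at the end of every
# epoch, so SimpleLRScheduler (defined elsewhere in this project) presumably
# just hands that attribute back on each optimizer update. A minimal
# MXNet-style sketch under that assumption — a reconstruction, not the
# project's actual class.
import mxnet as mx

class SimpleLRScheduler(mx.lr_scheduler.LRScheduler):
    """Hypothetical scheduler: return whatever rate callers last assigned."""

    def __init__(self, learning_rate=0.001):
        super(SimpleLRScheduler, self).__init__()
        self.learning_rate = learning_rate

    def __call__(self, num_update):
        return self.learning_rate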