def test_pbar_with_metric(): n_iters = 20 batch_size = 10 n_classes = 2 data = list(range(n_iters)) y_true_batch_values = iter( np.random.randint(0, n_classes, size=(n_iters, batch_size))) y_pred_batch_values = iter(np.random.rand(n_iters, batch_size, n_classes)) loss_values = iter(range(n_iters)) def step(engine, batch): loss_value = next(loss_values) y_true_batch = next(y_true_batch_values) y_pred_batch = next(y_pred_batch_values) return loss_value, torch.from_numpy(y_pred_batch), torch.from_numpy( y_true_batch) trainer = Engine(step) accuracy = CategoricalAccuracy(output_transform=lambda x: (x[1], x[2])) accuracy.attach(trainer, "avg_accuracy") pbar = ProgressBar() pbar.attach(trainer, ['avg_accuracy']) with pytest.raises(KeyError): trainer.run(data=data, max_epochs=1)
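# Why the KeyError above: CategoricalAccuracy is an epoch-level metric, so its value only
# lands in engine.state.metrics once the epoch completes, while ProgressBar reads the
# requested keys at every iteration. A minimal working sketch (reusing the trainer from the
# test and assuming the same ignite imports): wrap the metric in RunningAverage so a
# per-iteration value exists for the progress bar to display.
from ignite.metrics import RunningAverage

acc = CategoricalAccuracy(output_transform=lambda x: (x[1], x[2]))
RunningAverage(acc).attach(trainer, 'running_avg_accuracy')
pbar = ProgressBar()
pbar.attach(trainer, ['running_avg_accuracy'])  # updated each iteration, so no KeyError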
def test_compute(): acc = CategoricalAccuracy() y_pred = torch.eye(4) y = torch.ones(4).type(torch.LongTensor) acc.update((y_pred, y)) assert acc.compute() == 0.25 acc.reset() y_pred = torch.eye(2) y = torch.ones(2).type(torch.LongTensor) acc.update((y_pred, y)) assert acc.compute() == 0.5
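# Worked check of the expected values above: the rows of torch.eye(4) argmax to [0, 1, 2, 3]
# and y is all ones, so exactly one of the four predictions matches (0.25); the 2x2 case
# matches one of two (0.5). The metric reduces to a plain argmax comparison:
import torch

y_pred = torch.eye(4)
y = torch.ones(4).long()
indices = torch.max(y_pred, dim=1)[1]  # predicted class per sample: [0, 1, 2, 3]
assert (indices == y).float().mean().item() == 0.25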
def create_supervised_classification_trainer(model, loss_fn, optimizer, val_loader, learning_rate_scheduler, callback=None, use_cuda=None): """ Create a trainer and a matching evaluator for supervised classification. :param model: the model to train :param loss_fn: the loss function to optimize :param optimizer: the optimizer stepped by the trainer :param val_loader: data loader for the validation set :param learning_rate_scheduler: optional scheduler stepped at each epoch start :param callback: optional handler called with the model after each iteration :param use_cuda: whether to run on GPU(s) when available :return: a (trainer, evaluator) pair of engines """ if use_cuda and not torch.cuda.is_available(): raise RuntimeError( 'Trying to run using CUDA, but CUDA is not available') if use_cuda and torch.cuda.is_available(): device = torch.device('cuda:0') torch.backends.cudnn.benchmark = True if torch.cuda.device_count() > 1 and not isinstance( model, nn.DataParallel): model = nn.DataParallel(model) print("Using {} gpus for training".format( torch.cuda.device_count())) else: device = torch.device('cpu') trainer = create_trainer(model=model, optimizer=optimizer, loss_fn=loss_fn, metrics={ 'top_1_accuracy': CategoricalAccuracy(), 'top_5_accuracy': TopKCategoricalAccuracy(), 'loss': Loss(loss_fn), }, device=device) evaluator = create_supervised_classification_evaluator( model, loss_fn, use_cuda) if learning_rate_scheduler: trainer.add_event_handler(Events.EPOCH_STARTED, lambda _: learning_rate_scheduler.step()) if callback is not None: trainer.add_event_handler(Events.ITERATION_COMPLETED, callback, model) trainer.add_event_handler(Events.EPOCH_COMPLETED, log_training_results, optimizer) trainer.add_event_handler(Events.EPOCH_COMPLETED, run_evaluation, evaluator, val_loader) return trainer, evaluator
def create_supervised_classification_evaluator(model, loss_fn, use_cuda): """ Create an evaluator :param model: :param loss_fn: :param use_cuda: :return: """ if use_cuda and torch.cuda.is_available(): device = torch.device('cuda:0') # multiple GPUs, we can remove this as well torch.backends.cudnn.benchmark = True if torch.cuda.device_count() > 1 and not isinstance( model, nn.DataParallel): model = nn.DataParallel(model) logger.info("Using %d gpus for training", torch.cuda.device_count()) else: device = torch.device('cpu') evaluator = create_supervised_evaluator(model, metrics={ 'top_1_accuracy': CategoricalAccuracy(), 'top_5_accuracy': TopKCategoricalAccuracy(), 'loss': Loss(loss_fn) }, device=device) return evaluator
def run(train_batch_size, val_batch_size, epochs, lr, momentum, log_interval): cuda = torch.cuda.is_available() train_loader, val_loader = get_data_loaders(train_batch_size, val_batch_size) model = Net() if cuda: model = model.cuda() optimizer = SGD(model.parameters(), lr=lr, momentum=momentum) trainer = create_supervised_trainer(model, optimizer, F.nll_loss, cuda=cuda) evaluator = create_supervised_evaluator(model, metrics={'accuracy': CategoricalAccuracy(), 'nll': Loss(F.nll_loss)}, cuda=cuda) @trainer.on(Events.ITERATION_COMPLETED) def log_training_loss(engine): iter = (engine.state.iteration - 1) % len(train_loader) + 1 if iter % log_interval == 0: print("Epoch[{}] Iteration[{}/{}] Loss: {:.2f}" "".format(engine.state.epoch, iter, len(train_loader), engine.state.output)) @trainer.on(Events.EPOCH_COMPLETED) def log_validation_results(engine): evaluator.run(val_loader) metrics = evaluator.state.metrics avg_accuracy = metrics['accuracy'] avg_nll = metrics['nll'] print("Validation Results - Epoch: {} Avg accuracy: {:.2f} Avg loss: {:.2f}" .format(engine.state.epoch, avg_accuracy, avg_nll)) trainer.run(train_loader, max_epochs=epochs)
def folds(self, kf): model = BGRU(self.input_size, self.hidden_size, self.num_layers, self.num_classes, self.batch_size, self.dropout) loss = nn.CrossEntropyLoss() optimizer = torch.optim.Adam(model.parameters(), lr=self.learning_rate) train_loader, valid_loader = _get_data_loader(kf, self.batch_size) trainer = create_supervised_trainer(model, optimizer, loss, device=DEVICE) evaluator = create_supervised_evaluator(model, metrics={ 'acc': CategoricalAccuracy(), 'loss': Loss(loss), 'prec': Precision(average=True), 'recall': Recall(average=True) }, device=DEVICE) @trainer.on(Events.ITERATION_COMPLETED) def log_training_loss(trainer): iter_num = trainer.state.iteration if iter_num % 10 == 0: logger.info("Epoch[{}] Iter: {} Loss: {:.2f}".format( trainer.state.epoch, iter_num, trainer.state.output)) @trainer.on(Events.EPOCH_COMPLETED) def log_training_results(trainer): evaluator.run(train_loader) metrics = evaluator.state.metrics f1 = (2 * metrics['prec'] * metrics['recall']) / (metrics['prec'] + metrics['recall']) logger.info( "Train Results - Epoch: {} Avg accuracy: {:.2f} Avg loss: {:.2f} Avg Precision: {:.2f} Avg Recall: {:.2f} Avg F1 Score: {:.2f}" .format(trainer.state.epoch, metrics['acc'], metrics['loss'], metrics['prec'], metrics['recall'], f1)) @trainer.on(Events.EPOCH_COMPLETED) def log_validation_results(trainer): evaluator.run(valid_loader) metrics = evaluator.state.metrics f1 = (2 * metrics['prec'] * metrics['recall']) / (metrics['prec'] + metrics['recall']) for k in self.res.keys(): if k != 'f1': self.res[k].append(metrics[k]) else: self.res[k].append(f1) logger.info( "Valid Results - Epoch: {} Avg accuracy: {:.2f} Avg loss: {:.2f} Avg Precision: {:.2f} Avg Recall: {:.2f} Avg F1 Score: {:.2f}" .format(trainer.state.epoch, metrics['acc'], metrics['loss'], metrics['prec'], metrics['recall'], f1)) trainer.run(train_loader, max_epochs=self.num_epochs) return model
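# Note on the handlers above: f1 = 2 * p * r / (p + r) divides by zero when precision and
# recall are both zero, which can happen in early epochs. A small guarded helper (a sketch,
# not part of the original code) keeps the logging handlers from crashing:
def f1_score(precision, recall, eps=1e-12):
    """Harmonic mean of precision and recall with a zero-division guard."""
    return (2 * precision * recall) / (precision + recall + eps)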
def run(mode, noise_fraction, train_batch_size, val_batch_size, epochs, lr, momentum, log_interval, log_dir): seed = 12345 random.seed(seed) torch.manual_seed(seed) now = datetime.now() log_dir = os.path.join(log_dir, "train_{}_{}__{}".format(mode, noise_fraction, now.strftime("%Y%m%d_%H%M"))) os.makedirs(log_dir) cuda = torch.cuda.is_available() train_loader, val_loader = get_data_loaders(noise_fraction, train_batch_size, val_batch_size) model = Net() writer = create_summary_writer(log_dir) if cuda: model = model.cuda() optimizer = SGD(model.parameters(), lr=lr, momentum=momentum) if mode == 'xentropy': criterion = nn.CrossEntropyLoss() elif mode == 'soft_bootstrap': criterion = SoftBootstrappingLoss(beta=0.95) elif mode == 'hard_bootstrap': criterion = HardBootstrappingLoss(beta=0.8) else: raise TypeError("Wrong mode {}, expected: xentropy, soft_bootstrap or hard_bootstrap".format(mode)) trainer = create_supervised_trainer(model, optimizer, criterion, cuda=cuda) evaluator = create_supervised_evaluator(model, metrics={'accuracy': CategoricalAccuracy(), 'nll': Loss(nn.CrossEntropyLoss())}, cuda=cuda) @trainer.on(Events.ITERATION_COMPLETED) def log_training_loss(engine): iter = (engine.state.iteration - 1) % len(train_loader) + 1 if iter % log_interval == 0: print("Epoch[{}] Iteration[{}/{}] Loss: {:.2f}" "".format(engine.state.epoch, iter, len(train_loader), engine.state.output)) writer.add_scalar("training/loss", engine.state.output, engine.state.iteration) @trainer.on(Events.EPOCH_COMPLETED) def log_validation_results(engine): evaluator.run(val_loader) metrics = evaluator.state.metrics avg_accuracy = metrics['accuracy'] avg_nll = metrics['nll'] print("Validation Results - Epoch: {} Avg accuracy: {:.2f} Avg loss: {:.2f}" .format(engine.state.epoch, avg_accuracy, avg_nll)) writer.add_scalar("validation/loss", avg_nll, engine.state.epoch) writer.add_scalar("validation/accuracy", avg_accuracy, engine.state.epoch) # kick everything off trainer.run(train_loader, max_epochs=epochs) writer.close()
def test_wrong_input_args(): with pytest.raises(TypeError): _ = RunningAverage(src=[12, 34]) with pytest.raises(ValueError): _ = RunningAverage(alpha=-1.0) with pytest.raises(ValueError): _ = RunningAverage(CategoricalAccuracy(), output_transform=lambda x: x[0]) with pytest.raises(ValueError): _ = RunningAverage()
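# The contract pinned down by the test above, sketched as the two valid constructions:
# either wrap an existing Metric, or average a value pulled from the engine output via
# output_transform -- exactly one of the two must be given, and alpha must lie in (0, 1]:
from ignite.metrics import RunningAverage

running_acc = RunningAverage(CategoricalAccuracy(), alpha=0.98)  # wrap a metric
running_loss = RunningAverage(output_transform=lambda x: x[0], alpha=0.98)  # wrap the output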
def run(train_batch_size, val_batch_size, epochs, lr, momentum, log_interval, log_dir): train_loader, val_loader = get_data_loaders(train_batch_size, val_batch_size) model = Net() writer = create_summary_writer(model, train_loader, log_dir) device = 'cpu' if torch.cuda.is_available(): device = 'cuda' model = model.to(device) optimizer = SGD(model.parameters(), lr=lr, momentum=momentum) trainer = create_supervised_trainer(model, optimizer, F.nll_loss, device=device) evaluator = create_supervised_evaluator(model, metrics={'accuracy': CategoricalAccuracy(), 'nll': Loss(F.nll_loss)}, device=device) @trainer.on(Events.ITERATION_COMPLETED) def log_training_loss(engine): iter = (engine.state.iteration - 1) % len(train_loader) + 1 if iter % log_interval == 0: print("Epoch[{}] Iteration[{}/{}] Loss: {:.2f}" "".format(engine.state.epoch, iter, len(train_loader), engine.state.output)) writer.add_scalar("training/loss", engine.state.output, engine.state.iteration) @trainer.on(Events.EPOCH_COMPLETED) def log_training_results(engine): evaluator.run(train_loader) metrics = evaluator.state.metrics avg_accuracy = metrics['accuracy'] avg_nll = metrics['nll'] print("Training Results - Epoch: {} Avg accuracy: {:.2f} Avg loss: {:.2f}" .format(engine.state.epoch, avg_accuracy, avg_nll)) writer.add_scalar("training/avg_loss", avg_nll, engine.state.epoch) writer.add_scalar("training/avg_accuracy", avg_accuracy, engine.state.epoch) @trainer.on(Events.EPOCH_COMPLETED) def log_validation_results(engine): evaluator.run(val_loader) metrics = evaluator.state.metrics avg_accuracy = metrics['accuracy'] avg_nll = metrics['nll'] print("Validation Results - Epoch: {} Avg accuracy: {:.2f} Avg loss: {:.2f}" .format(engine.state.epoch, avg_accuracy, avg_nll)) writer.add_scalar("validation/avg_loss", avg_nll, engine.state.epoch) writer.add_scalar("validation/avg_accuracy", avg_accuracy, engine.state.epoch) # kick everything off trainer.run(train_loader, max_epochs=epochs) writer.close()
def test_compute_batch_images(): acc = CategoricalAccuracy() y_pred = torch.softmax(torch.rand(2, 3, 2, 2), dim=1) y = torch.LongTensor([[[0, 1], [0, 1]], [[0, 2], [0, 2]]]) indices = torch.max(y_pred, dim=1)[1] acc.update((y_pred, y)) assert isinstance(acc.compute(), float) assert accuracy_score( y.view(-1).data.numpy(), indices.view(-1).data.numpy()) == pytest.approx(acc.compute())
])), batch_size=batch_size, shuffle=True, **kwargs) model = Net() device = 'cuda' if use_cuda else 'cpu' optimizer = optim.SGD(model.parameters(), lr=lr, momentum=momentum) trainer = create_supervised_trainer(model, optimizer, F.nll_loss, device=device) evaluator = create_supervised_evaluator(model, metrics={ 'accuracy': CategoricalAccuracy(), 'nll': Loss(F.nll_loss) }, device=device) @trainer.on(Events.STARTED) def load_checkpoint(engine): # you can load the best checkpoint to continue training filename = checkpoint_best # or load the last checkpoint filename = checkpoint_last try: print("Loading checkpoint '{}'".format(filename)) model.load_state_dict(torch.load(filename)) evaluator.run(val_loader) metrics = evaluator.state.metrics
def test_integration(): n_iters = 100 batch_size = 10 n_classes = 10 y_true_batch_values = iter( np.random.randint(0, n_classes, size=(n_iters, batch_size))) y_pred_batch_values = iter(np.random.rand(n_iters, batch_size, n_classes)) loss_values = iter(range(n_iters)) def update_fn(engine, batch): loss_value = next(loss_values) y_true_batch = next(y_true_batch_values) y_pred_batch = next(y_pred_batch_values) return loss_value, torch.from_numpy(y_pred_batch), torch.from_numpy( y_true_batch) trainer = Engine(update_fn) alpha = 0.98 acc_metric = RunningAverage( CategoricalAccuracy(output_transform=lambda x: [x[1], x[2]]), alpha=alpha) acc_metric.attach(trainer, 'running_avg_accuracy') avg_output = RunningAverage(output_transform=lambda x: x[0], alpha=alpha) avg_output.attach(trainer, 'running_avg_output') running_avg_acc = [None] @trainer.on(Events.ITERATION_COMPLETED, running_avg_acc) def manual_running_avg_acc(engine, running_avg_acc): _, y_pred, y = engine.state.output indices = torch.max(y_pred, 1)[1] correct = torch.eq(indices, y).view(-1) num_correct = torch.sum(correct).item() num_examples = correct.shape[0] batch_acc = num_correct * 1.0 / num_examples if running_avg_acc[0] is None: running_avg_acc[0] = batch_acc else: running_avg_acc[0] = running_avg_acc[0] * alpha + ( 1.0 - alpha) * batch_acc engine.state.running_avg_acc = running_avg_acc[0] @trainer.on(Events.EPOCH_STARTED) def running_avg_output_init(engine): engine.state.running_avg_output = None @trainer.on(Events.ITERATION_COMPLETED) def running_avg_output_update(engine): if engine.state.running_avg_output is None: engine.state.running_avg_output = engine.state.output[0] else: engine.state.running_avg_output = engine.state.running_avg_output * alpha + \ (1.0 - alpha) * engine.state.output[0] @trainer.on(Events.ITERATION_COMPLETED) def assert_equal_running_avg_acc_values(engine): assert engine.state.running_avg_acc == engine.state.metrics['running_avg_accuracy'], \ "{} vs {}".format(engine.state.running_avg_acc, engine.state.metrics['running_avg_accuracy']) @trainer.on(Events.ITERATION_COMPLETED) def assert_equal_running_avg_output_values(engine): assert engine.state.running_avg_output == engine.state.metrics['running_avg_output'], \ "{} vs {}".format(engine.state.running_avg_output, engine.state.metrics['running_avg_output']) np.random.seed(10) running_avg_acc[0] = None n_iters = 10 batch_size = 10 n_classes = 10 data = list(range(n_iters)) loss_values = iter(range(n_iters)) y_true_batch_values = iter( np.random.randint(0, n_classes, size=(n_iters, batch_size))) y_pred_batch_values = iter(np.random.rand(n_iters, batch_size, n_classes)) trainer.run(data, max_epochs=1) running_avg_acc[0] = None n_iters = 10 batch_size = 10 n_classes = 10 data = list(range(n_iters)) loss_values = iter(range(n_iters)) y_true_batch_values = iter( np.random.randint(0, n_classes, size=(n_iters, batch_size))) y_pred_batch_values = iter(np.random.rand(n_iters, batch_size, n_classes)) trainer.run(data, max_epochs=1)
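# The recurrence the test above re-implements by hand, in isolation (a sketch): the running
# value is seeded with the first observation, then updated as v <- alpha * v + (1 - alpha) * x:
def running_average(values, alpha=0.98):
    avg = None
    for x in values:
        avg = x if avg is None else alpha * avg + (1.0 - alpha) * x
    return avg

assert running_average([1.0, 0.0]) == 0.98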
import torch.nn.functional as F from ignite.metrics import CategoricalAccuracy, Loss, MeanAbsoluteError from attributer.attributes import FaceAttributes from training.metric_utils import ScaledError _metrics = { FaceAttributes.AGE: ScaledError(MeanAbsoluteError(), 50), FaceAttributes.GENDER: CategoricalAccuracy(), FaceAttributes.EYEGLASSES: CategoricalAccuracy(), FaceAttributes.RECEDING_HAIRLINES: CategoricalAccuracy(), FaceAttributes.SMILING: CategoricalAccuracy(), FaceAttributes.HEAD_YAW_BIN: CategoricalAccuracy(), FaceAttributes.HEAD_PITCH_BIN: CategoricalAccuracy(), FaceAttributes.HEAD_ROLL_BIN: CategoricalAccuracy(), FaceAttributes.HEAD_YAW: MeanAbsoluteError(), FaceAttributes.HEAD_PITCH: MeanAbsoluteError(), FaceAttributes.HEAD_ROLL: MeanAbsoluteError(), } _losses = { FaceAttributes.AGE: F.l1_loss, FaceAttributes.GENDER: F.cross_entropy, FaceAttributes.EYEGLASSES: F.cross_entropy, FaceAttributes.RECEDING_HAIRLINES: F.cross_entropy, FaceAttributes.SMILING: F.cross_entropy, FaceAttributes.HEAD_YAW_BIN: F.cross_entropy, FaceAttributes.HEAD_PITCH_BIN: F.cross_entropy, FaceAttributes.HEAD_ROLL_BIN: F.cross_entropy, FaceAttributes.HEAD_YAW: F.l1_loss, FaceAttributes.HEAD_PITCH: F.l1_loss,
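# ScaledError is project-specific (training.metric_utils); a plausible sketch -- an
# assumption for illustration, not the project's actual implementation -- is a thin Metric
# wrapper that multiplies the wrapped result by a constant, e.g. to undo [0, 1] scaling of
# the age targets:
from ignite.metrics.metric import Metric

class ScaledErrorSketch(Metric):
    def __init__(self, metric, scale):
        self._metric = metric  # set before super().__init__(), which calls reset()
        self._scale = scale
        super(ScaledErrorSketch, self).__init__()

    def reset(self):
        self._metric.reset()

    def update(self, output):
        self._metric.update(output)

    def compute(self):
        return self._scale * self._metric.compute()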
def test_zero_div(): acc = CategoricalAccuracy() with pytest.raises(NotComputableError): acc.compute()
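# Typical defensive use of the behaviour tested above (a sketch): compute() raises until at
# least one update() has been seen, so guard reads of a possibly-empty metric:
from ignite.exceptions import NotComputableError

try:
    value = acc.compute()
except NotComputableError:
    value = float('nan')  # no batches processed yet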
def main(): parser = argparse.ArgumentParser(description='Training') parser.add_argument('--learning_rate', type=float, default=0.001, help='Learning Rate') parser.add_argument('--reg', type=float, default=0.01, help='Regularizer') parser.add_argument('--batch_size', type=int, default=8, help='batch size') parser.add_argument('--max_epochs', type=int, default=500, help='Max Epochs') parser.add_argument('--log_every_batch', type=int, default=10, help='Log every batch') parser.add_argument('--save_ckpt_every', type=int, default=20, help='Save Checkpoint Every') parser.add_argument('--dataset', type=str, default="Names", help='dataset') parser.add_argument('--base_dataset', type=str, default="Names", help='base_dataset') parser.add_argument('--checkpoints_directory', type=str, default="CKPTS", help='Check Points Directory') parser.add_argument('--continue_training', type=str, default="False", help='Continue Training') parser.add_argument('--filter_width', type=int, default=5, help='Filter Width') parser.add_argument('--hidden_units', type=int, default=256, help='hidden_units') parser.add_argument('--embedding_size', type=int, default=256, help='embedding_size') parser.add_argument('--resume_run', type=int, default=-1, help='Which run to resume') parser.add_argument('--random_network', type=str, default="False", help='Random Network') parser.add_argument('--classifier_type', type=str, default="charRNN", help='rnn type') parser.add_argument('--progressive', type=str, default="True", help='Progressively increase length for back prop') parser.add_argument('--progress_up_to', type=float, default=30.0, help='Epoch Number up to which length will be progressed to full length') args = parser.parse_args() device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") base_train_dataset = datasets.get_dataset(args.base_dataset, dataset_type = 'train') train_dataset = datasets.get_dataset(args.dataset, dataset_type = 'train') val_dataset = datasets.get_dataset(args.dataset, dataset_type = 'val') if args.classifier_type == "charRNN": lstm_model = model_classifier.uniRNN({ 'vocab_size' : len(base_train_dataset.idx_to_char), 'hidden_size' : args.hidden_units, 'target_size' : len(base_train_dataset.classes), 'embedding_size' : args.embedding_size }) print("char RNN") if args.classifier_type == "biRNN": lstm_model = model_classifier.biRNN({ 'vocab_size' : len(base_train_dataset.idx_to_char), 'hidden_size' : args.hidden_units, 'target_size' : len(base_train_dataset.classes), 'embedding_size' : args.embedding_size }) print("BI RNN") if args.classifier_type == "CNN": lstm_model = model_classifier.CnnTextClassifier({ 'vocab_size' : len(base_train_dataset.idx_to_char), 'hidden_size' : args.hidden_units, 'target_size' : len(base_train_dataset.classes), 'embedding_size' : args.embedding_size }) print("CnnTextClassifier") lstm_ckpt_dir = "{}/{}_classifer_{}".format(args.checkpoints_directory, args.base_dataset, args.classifier_type) lstm_ckpt_name = "{}/best_model.pth".format(lstm_ckpt_dir) if args.random_network != "True": lstm_model.load_state_dict(torch.load(lstm_ckpt_name)) else: print("Random LSTM network..")
lstm_model.eval() lstm_loss_criterion = nn.CrossEntropyLoss() seq_model = seq_rewriter.seq_rewriter({ 'vocab_size' : len(train_dataset.idx_to_char), 'target_size' : len(base_train_dataset.idx_to_char), 'filter_width' : args.filter_width, 'target_sequence_length' : base_train_dataset.seq_length }) new_classifier = nn.Sequential(seq_model, lstm_model) lstm_model.to(device) seq_model.to(device) new_classifier.to(device) parameters = filter(lambda p: p.requires_grad, seq_model.parameters()) optimizer = optim.Adam(parameters, lr=args.learning_rate) train_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=0) val_loader = DataLoader(val_dataset, batch_size=args.batch_size, shuffle=True, num_workers=0) evaluator = create_supervised_evaluator(new_classifier, metrics={ 'accuracy': CategoricalAccuracy(), }) # CHECKPOINT DIRECTORY STUFF....... checkpoints_dir = "{}/ADVERSARIAL".format(args.checkpoints_directory) if not os.path.exists(checkpoints_dir): os.makedirs(checkpoints_dir) checkpoint_suffix = "lr_{}_rg_{}_fw_{}_bs_{}_rd_{}_classifer_{}".format(args.learning_rate, args.reg, args.filter_width, args.batch_size, args.random_network, args.classifier_type) checkpoints_dir = "{}/{}_adversarial_base_{}_{}".format(checkpoints_dir, args.dataset, args.base_dataset, checkpoint_suffix) if not os.path.exists(checkpoints_dir): os.makedirs(checkpoints_dir) start_epoch = 0 training_log = { 'log' : [], 'best_epoch' : 0, 'best_accuracy' : 0.0, 'running_reward' : [] } running_reward = -args.batch_size if args.continue_training == "True": if args.resume_run == -1: run_index = len(os.listdir(checkpoints_dir)) - 1 else: run_index = args.resume_run checkpoints_dir = "{}/{}".format(checkpoints_dir, run_index) if not os.path.exists(checkpoints_dir): raise Exception("Could not find checkpoints_dir") with open("{}/training_log.json".format(checkpoints_dir)) as tlog_f: training_log = json.load(tlog_f) seq_model.load_state_dict(torch.load("{}/best_model.pth".format(checkpoints_dir))) start_epoch = training_log['best_epoch'] running_reward = training_log['running_reward'][-1] else: run_index = len(os.listdir(checkpoints_dir)) checkpoints_dir = "{}/{}".format(checkpoints_dir, run_index) if not os.path.exists(checkpoints_dir): os.makedirs(checkpoints_dir) for epoch in range(start_epoch, args.max_epochs): new_classifier.train() for batch_idx, batch in enumerate(train_loader): rewritten_x = seq_model(batch[0]) pred_logits = lstm_model(rewritten_x) _, predictions = torch.max(pred_logits, 1) pred_correctness = (predictions == batch[1]).float() pred_correctness[pred_correctness == 0.0] = -1.0 rewards = pred_correctness # lstm_loss = lstm_loss_criterion(pred_logits, batch[1]) seq_rewriter_loss = 0 max_length_to_update = train_dataset.seq_length + args.filter_width + 1 if args.progressive == "True": max_length_to_update = min( int( (epoch/args.progress_up_to) * max_length_to_update ) + 1, max_length_to_update ) for idx, log_prob in enumerate(seq_model.saved_log_probs): if (idx % (batch[0].size()[1])) < max_length_to_update: seq_rewriter_loss += (-log_prob * rewards[idx // rewritten_x.size()[1]]) # seq_rewriter_loss /= (args.batch_size * max_length_to_update) # seq_rewriter_loss += (- args.reg * seq_model.entropy) l2_reg = None for W in seq_model.parameters(): if l2_reg is None: l2_reg = W.norm(2) else: l2_reg = l2_reg + W.norm(2) # reg_loss = args.reg * l2_reg reg_loss = 0 seq_rewriter_loss_combined = seq_rewriter_loss + reg_loss optimizer.zero_grad()
seq_rewriter_loss_combined.backward() optimizer.step() seq_model.saved_log_probs = None batch_reward = torch.sum(rewards) running_reward -= running_reward/(args.log_every_batch * 1.0) running_reward += batch_reward/(args.log_every_batch * 1.0) if batch_idx % args.log_every_batch == 0: print("Epoch[{}] Iteration[{}] Running Reward[{}] LossBasic[{}] RegLoss[{}] max_length_to_update[{}]".format( epoch, batch_idx, running_reward, seq_rewriter_loss, reg_loss, max_length_to_update)) training_log['running_reward'].append(float(running_reward.cpu().numpy())) evaluator.run(train_loader) training_metrics = evaluator.state.metrics print("Training Results - Epoch: {} Avg accuracy: {:.2f}" .format(epoch, training_metrics['accuracy'])) evaluator.run(val_loader) evaluation_metrics = evaluator.state.metrics print("Validation Results - Epoch: {} Avg accuracy: {:.2f}" .format(epoch, evaluation_metrics['accuracy'])) training_log['log'].append({ 'training_metrics' : training_metrics, 'evaluation_metrics' : evaluation_metrics, }) if evaluation_metrics['accuracy'] > training_log['best_accuracy']: torch.save(seq_model.state_dict(), "{}/best_model.pth".format(checkpoints_dir)) training_log['best_accuracy'] = evaluation_metrics['accuracy'] training_log['best_epoch'] = epoch if epoch % args.save_ckpt_every == 0: torch.save(seq_model.state_dict(), "{}/model_{}.pth".format(checkpoints_dir, epoch)) print("BEST", training_log['best_epoch'], training_log['best_accuracy']) with open("{}/training_log.json".format(checkpoints_dir), 'w') as f: f.write(json.dumps(training_log)) if not os.path.exists(checkpoints_dir): os.makedirs(checkpoints_dir)
def test_warning(): with pytest.warns(DeprecationWarning): CategoricalAccuracy()
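# Context for the warning tested above: newer ignite releases fold CategoricalAccuracy (and
# BinaryAccuracy) into a single unified Accuracy metric, hence the DeprecationWarning on
# construction. A minimal migration sketch:
from ignite.metrics import Accuracy

metrics = {'accuracy': Accuracy()}  # drop-in replacement for CategoricalAccuracy()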
def main(cfg): model_name = get_model_name(cfg) model_name = randomize_name(model_name) print(f'Model name: {model_name}') dataset_train, dataset_dev = get_dataset(cfg) W_emb = create_word_embeddings(cfg, dataset_train.vocab) model_params = get_model_params(cfg, W_emb) model = create_model(cfg, model_params, W_emb=W_emb) data_loader_train = create_data_loader(dataset_train, cfg.batch_size, shuffle=True) data_loader_dev = create_data_loader(dataset_dev, cfg.batch_size, shuffle=False) model_parameters = get_trainable_parameters(model.parameters()) optimizer = torch.optim.Adam(model_parameters, cfg.learning_rate, weight_decay=cfg.weight_decay, amsgrad=True) criterion = torch.nn.CrossEntropyLoss() def update_function(engine, batch): model.train() optimizer.zero_grad() (premise, hypothesis), label = to_device(batch) logits = model(premise, hypothesis) loss = criterion(logits, label) loss.backward() torch.nn.utils.clip_grad_norm_(model_parameters, cfg.max_grad_norm) optimizer.step() return loss.item() def inference_function(engine, batch): model.eval() with torch.no_grad(): (premise, hypothesis), label = to_device(batch) logits = model(premise, hypothesis) return logits, label trainer = Engine(update_function) evaluator = Engine(inference_function) metrics = [ ('loss', Loss(criterion)), ('accuracy', CategoricalAccuracy()) ] for name, metric in metrics: metric.attach(evaluator, name) best_dev_acc = -np.inf @trainer.on(Events.EPOCH_COMPLETED) def eval_model(engine): nonlocal best_dev_acc def format_metric_str(metrics_values): metrics_str = ', '.join([ f'{metric_name} {metrics_values[metric_name]:.3f}' for metric_name, _ in metrics ]) return metrics_str evaluator.run(data_loader_train) metrics_train = evaluator.state.metrics.copy() evaluator.run(data_loader_dev) metrics_dev = evaluator.state.metrics.copy() print(f'Epoch {engine.state.epoch}', end=' | ') print('Train:', format_metric_str(metrics_train), end=' | ') print('Dev:', format_metric_str(metrics_dev), end=' ') print() if metrics_dev['accuracy'] > best_dev_acc: best_dev_acc = metrics_dev['accuracy'] save_weights(model, cfg.models_dir.joinpath(f'{model_name}.pt')) # save models specifications create_dirs(cfg) model_spec = dict(model_name=model_name, model_params=model_params, vocab=dataset_train.vocab, cfg=cfg) save_pickle(model_spec, cfg.models_dir.joinpath(f'{model_name}.pkl')) trainer.run(data_loader_train, max_epochs=cfg.nb_epochs) print(f'Best dev accuracy: {best_dev_acc:.3f}')
def run(train_batch_size, val_batch_size, epochs, lr, momentum, log_interval, restore_from, crash_iteration=1000): train_loader, val_loader = get_data_loaders(train_batch_size, val_batch_size) model = Net() device = 'cpu' optimizer = SGD(model.parameters(), lr=lr, momentum=momentum) trainer = create_supervised_trainer(model, optimizer, F.nll_loss, device=device) evaluator = create_supervised_evaluator(model, metrics={ 'accuracy': CategoricalAccuracy(), 'nll': Loss(F.nll_loss) }, device=device) # Setup debug level of engine logger: trainer._logger.setLevel(logging.INFO) ch = logging.StreamHandler() ch.setLevel(logging.DEBUG) formatter = logging.Formatter( "%(asctime)s|%(name)s|%(levelname)s| %(message)s") ch.setFormatter(formatter) trainer._logger.addHandler(ch) @trainer.on(Events.ITERATION_COMPLETED) def log_training_loss(engine): iter = (engine.state.iteration - 1) % len(train_loader) + 1 if iter % log_interval == 0: print("Epoch[{}] Iteration[{}/{}] Loss: {:.2f}" "".format(engine.state.epoch, iter, len(train_loader), engine.state.output)) if engine.state.iteration == crash_iteration: raise Exception("STOP at {}".format(engine.state.iteration)) @trainer.on(Events.EPOCH_COMPLETED) def log_training_results(engine): evaluator.run(train_loader) metrics = evaluator.state.metrics avg_accuracy = metrics['accuracy'] avg_nll = metrics['nll'] print( "Training Results - Epoch: {} Avg accuracy: {:.2f} Avg loss: {:.2f}" .format(engine.state.epoch, avg_accuracy, avg_nll)) @trainer.on(Events.EPOCH_COMPLETED) def log_validation_results(engine): evaluator.run(val_loader) metrics = evaluator.state.metrics avg_accuracy = metrics['accuracy'] avg_nll = metrics['nll'] print( "Validation Results - Epoch: {} Avg accuracy: {:.2f} Avg loss: {:.2f}" .format(engine.state.epoch, avg_accuracy, avg_nll)) objects_to_checkpoint = {"model": model, "optimizer": optimizer} engine_checkpoint = EngineCheckpoint(dirname="engine_checkpoint", to_save=objects_to_checkpoint, save_interval=100) trainer.add_event_handler(Events.ITERATION_COMPLETED, engine_checkpoint) if restore_from == "": trainer.run(train_loader, max_epochs=epochs) else: trainer.resume(train_loader, restore_from, to_load=objects_to_checkpoint)
def main(): parser = argparse.ArgumentParser(description='Training') parser.add_argument('--learning_rate', type=float, default=0.0001, help='learning rate') parser.add_argument('--batch_size', type=int, default=32, help='batch size') parser.add_argument('--epochs', type=int, default=200, help='Epochs') parser.add_argument('--dataset', type=str, default="Names", help='dataset') parser.add_argument('--checkpoints_directory', type=str, default="CKPTS", help='Check Points Directory') parser.add_argument('--hidden_units', type=int, default=256, help='hidden_units') parser.add_argument('--embedding_size', type=int, default=256, help='embedding_size') parser.add_argument('--patience', type=int, default=10, help='patience') parser.add_argument('--classifier_type', type=str, default="charRNN", help='rnn type') args = parser.parse_args() device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") train_dataset = datasets.get_dataset(args.dataset, dataset_type='train') val_dataset = datasets.get_dataset(args.dataset, dataset_type='train_val') if args.classifier_type == "charRNN": model_options = { 'vocab_size': len(train_dataset.idx_to_char), 'hidden_size': args.hidden_units, 'target_size': len(train_dataset.classes), 'embedding_size': args.embedding_size } model = model_classifier.uniRNN(model_options) print("char RNN") if args.classifier_type == "biRNN": model_options = { 'vocab_size': len(train_dataset.idx_to_char), 'hidden_size': args.hidden_units, 'target_size': len(train_dataset.classes), 'embedding_size': args.embedding_size } model = model_classifier.biRNN(model_options) print("BI RNN") if args.classifier_type == "CNN": model_options = { 'vocab_size': len(train_dataset.idx_to_char), 'hidden_size': args.hidden_units, 'target_size': len(train_dataset.classes), 'embedding_size': args.embedding_size } model = model_classifier.CnnTextClassifier(model_options) print("CnnTextClassifier") print(device) model.to(device) parameters = filter(lambda p: p.requires_grad, model.parameters()) optimizer = optim.Adam(parameters, lr=args.learning_rate) loss_criterion = nn.CrossEntropyLoss() print("check", torch.cuda.is_available()) train_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=0) val_loader = DataLoader(val_dataset, batch_size=args.batch_size, shuffle=True, num_workers=0) trainer = create_supervised_trainer(model, optimizer, loss_criterion) evaluator = create_supervised_evaluator(model, metrics={ 'accuracy': CategoricalAccuracy(), 'nll': Loss(loss_criterion) }) checkpoints_dir = "{}/{}_classifer_{}".format(args.checkpoints_directory, args.dataset, args.classifier_type) if not os.path.exists(checkpoints_dir): os.makedirs(checkpoints_dir) training_log = { 'model_options': model_options, 'log': [], 'best_epoch': 0, 'best_accuracy': 0.0 } @trainer.on(Events.ITERATION_COMPLETED) def log_training_loss(trainer): total_batches = int(len(train_dataset) / args.batch_size) if trainer.state.iteration % 100 == 0: print("Epoch[{}] Iteration[{}] Total Iterations[{}] Loss: {:.2f}".
format(trainer.state.epoch, trainer.state.iteration, total_batches, trainer.state.output)) @trainer.on(Events.EPOCH_COMPLETED) def log_validation_results(trainer): evaluator.run(train_loader) training_metrics = evaluator.state.metrics print( "Training Results - Epoch: {} Avg accuracy: {:.2f} Avg loss: {:.2f}" .format(trainer.state.epoch, training_metrics['accuracy'], training_metrics['nll'])) evaluator.run(val_loader) evaluation_metrics = evaluator.state.metrics print( "Validation Results - Epoch: {} Avg accuracy: {:.2f} Avg loss: {:.2f}" .format(trainer.state.epoch, evaluation_metrics['accuracy'], evaluation_metrics['nll'])) out_path = "{}/model_epoch_{}.pth".format(checkpoints_dir, trainer.state.epoch) torch.save(model.state_dict(), out_path) training_log['log'].append({ 'training_metrics': training_metrics, 'evaluation_metrics': evaluation_metrics, }) # if (trainer.state.epoch - training_log['best_epoch']) > args.patience and (evaluation_metrics['accuracy'] < training_log['best_accuracy']): # trainer.terminate() if evaluation_metrics['accuracy'] > training_log['best_accuracy']: torch.save(model.state_dict(), "{}/best_model.pth".format(checkpoints_dir)) training_log['best_accuracy'] = evaluation_metrics['accuracy'] training_log['best_epoch'] = trainer.state.epoch print("BEST", training_log['best_epoch'], training_log['best_accuracy']) with open("{}/training_log.json".format(checkpoints_dir), 'w') as f: f.write(json.dumps(training_log)) trainer.run(train_loader, max_epochs=args.epochs)
def train(base_path: str, epochs: int, n_folds: int, lr: t.Optional[float] = 1e-2, momentum: t.Optional[float] = 0.5, log_interval: t.Optional[int] = 50, random_seed: t.Optional[int] = 42, handlers: t.Optional[t.Tuple] = () ) -> t.Tuple[nn.Module, t.Any]: """ Instantiates and trains a ResNet-based classifier on one fold of the workflow dataset, returning the model and the final trainer state. """ torch.manual_seed(random_seed) np.random.seed(random_seed) model = ResFeatureExtractor(pretrained_model=models.resnet50) image_transform = Compose([Resize((320, 180)), ToTensor()]) kfoldWorkflowSet = kFoldWorkflowSplit(base_path, image_transform=image_transform, video_extn='.avi', shuffle=True, n_folds=n_folds, num_phases=14, batch_size=32, num_workers=16) train_loader, val_loader = next(kfoldWorkflowSet) device = 'cpu' if torch.cuda.is_available(): device = 'cuda:0' model = model.to(device=device) optimizer = SGD(model.parameters(), lr=lr, momentum=momentum) criterion_CE = nn.CrossEntropyLoss() trainer = create_supervised_trainer( model, optimizer, criterion_CE, device=device) evaluator = create_supervised_evaluator( model, metrics={'accuracy': CategoricalAccuracy(), 'CE': Loss(criterion_CE)}, device=device) @trainer.on(Events.ITERATION_COMPLETED) def log_training_loss(engine): i = (engine.state.iteration - 1) % len(train_loader) + 1 if i % log_interval == 0: print(f"[{engine.state.epoch}] {i}/{len(train_loader)} loss: {'%.2f' % engine.state.output}") # Attach scheduler(s) for handler_args in handlers: (scheduler_cls, param_name, start_value, end_value, cycle_mult) = handler_args handler = scheduler_cls( optimizer, param_name, start_value, end_value, len(train_loader), cycle_mult=cycle_mult, save_history=True) trainer.add_event_handler(Events.ITERATION_COMPLETED, handler) @trainer.on(Events.EPOCH_COMPLETED) def log_validation_results(engine): evaluator.run(val_loader) metrics = evaluator.state.metrics avg_accuracy = metrics['accuracy'] avg_CE = metrics['CE'] print("Validation Accuracy: {:.2f} Loss: {:.2f}\n".format(avg_accuracy, avg_CE)) trainer.run(train_loader, max_epochs=epochs) return (model, trainer.state)
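# Example of the handlers tuples consumed by the loop above (an assumption for
# illustration): each tuple unpacks to (scheduler_cls, param_name, start_value, end_value,
# cycle_mult), with the cycle length supplied inside train() as len(train_loader).
# LinearCyclicalScheduler from ignite.contrib.handlers matches that call signature:
from ignite.contrib.handlers import LinearCyclicalScheduler

model, state = train(base_path='data/workflows',  # hypothetical dataset root
                     epochs=10, n_folds=5,
                     handlers=((LinearCyclicalScheduler, 'lr', 1e-2, 1e-4, 2),))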
def main(): parser = argparse.ArgumentParser(description='Training') parser.add_argument('--learning_rate', type=float, default=0.0005, help='learning_rate') parser.add_argument('--temp_min', type=float, default=0.01, help='Temp Min') parser.add_argument( '--epochs_to_anneal', type=float, default=15.0, help='Epoch Number up to which length will be progressed to full length' ) parser.add_argument('--temp_max', type=float, default=2.0, help='Temp Max') parser.add_argument('--reg', type=float, default=0.01, help='regularizer') parser.add_argument('--batch_size', type=int, default=8, help='batch_size') parser.add_argument('--max_epochs', type=int, default=500, help='Max Epochs') parser.add_argument('--log_every_batch', type=int, default=50, help='Log every batch') parser.add_argument('--save_ckpt_every', type=int, default=20, help='Save Checkpoint Every') parser.add_argument('--dataset', type=str, default="QuestionLabels", help='dataset') parser.add_argument('--base_dataset', type=str, default="Names", help='base_dataset') parser.add_argument('--checkpoints_directory', type=str, default="CKPTS", help='Check Points Directory') parser.add_argument('--adv_directory', type=str, default="ADVERSARIAL_GUMBEL", help='Check Points Directory') parser.add_argument('--continue_training', type=str, default="False", help='Continue Training') parser.add_argument('--filter_width', type=int, default=5, help='Filter Width') parser.add_argument('--hidden_units', type=int, default=256, help='hidden_units') parser.add_argument('--embedding_size', type=int, default=256, help='embedding_size') parser.add_argument('--resume_run', type=int, default=-1, help='Which run to resume') parser.add_argument('--random_network', type=str, default="False", help='Random Network') parser.add_argument('--classifier_type', type=str, default="charRNN", help='rnn type') parser.add_argument('--print_prob', type=str, default="False", help='Probs') parser.add_argument('--progressive', type=str, default="True", help='Progressively increase length for back prop') args = parser.parse_args() device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") base_train_dataset = datasets.get_dataset(args.base_dataset, dataset_type='train') train_dataset = datasets.get_dataset(args.dataset, dataset_type='train') val_dataset = datasets.get_dataset(args.dataset, dataset_type='val') if args.classifier_type == "charRNN": lstm_model = model_classifier.uniRNN({ 'vocab_size': len(base_train_dataset.idx_to_char), 'hidden_size': args.hidden_units, 'target_size': len(base_train_dataset.classes), 'embedding_size': args.embedding_size }) print("char RNN") if args.classifier_type == "biRNN": lstm_model = model_classifier.biRNN({ 'vocab_size': len(base_train_dataset.idx_to_char), 'hidden_size': args.hidden_units, 'target_size': len(base_train_dataset.classes), 'embedding_size': args.embedding_size }) print("BI RNN") if args.classifier_type == "CNN": lstm_model = model_classifier.CnnTextClassifier({ 'vocab_size': len(base_train_dataset.idx_to_char), 'hidden_size': args.hidden_units, 'target_size': len(base_train_dataset.classes), 'embedding_size': args.embedding_size }) print("CnnTextClassifier") lstm_ckpt_dir = "{}/{}_classifer_{}".format(args.checkpoints_directory, args.base_dataset, args.classifier_type) lstm_ckpt_name = "{}/best_model.pth".format(lstm_ckpt_dir) if args.random_network != "True": lstm_model.load_state_dict(torch.load(lstm_ckpt_name)) else: print("Random LSTM network..")
lstm_model.eval() lstm_loss_criterion = nn.CrossEntropyLoss() seq_model = seq_rewriter_gumbel.seq_rewriter({ 'vocab_size': len(train_dataset.idx_to_char), 'target_size': len(base_train_dataset.idx_to_char), 'filter_width': args.filter_width, 'target_sequence_length': base_train_dataset.seq_length }) new_classifier = nn.Sequential(seq_model, lstm_model) lstm_model.to(device) seq_model.to(device) new_classifier.to(device) parameters = list(filter(lambda p: p.requires_grad, seq_model.parameters())) for parameter in parameters: print("PARAMETERS", parameter.size()) optimizer = optim.Adam(parameters, lr=args.learning_rate) train_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=0) val_loader = DataLoader(val_dataset, batch_size=args.batch_size, shuffle=True, num_workers=0) evaluator = create_supervised_evaluator(new_classifier, metrics={ 'accuracy': CategoricalAccuracy(), }) # CHECKPOINT DIRECTORY STUFF....... checkpoints_dir = "{}/{}".format(args.checkpoints_directory, args.adv_directory) if not os.path.exists(checkpoints_dir): os.makedirs(checkpoints_dir) checkpoint_suffix = "lr_{}_tmin_{}_fw_{}_bs_{}_rand_{}_classifer_{}".format( args.learning_rate, args.temp_min, args.filter_width, args.batch_size, args.random_network, args.classifier_type) checkpoints_dir = "{}/{}_adversarial_base_{}_{}".format( checkpoints_dir, args.dataset, args.base_dataset, checkpoint_suffix) if args.resume_run == -1: run_index = len(os.listdir(checkpoints_dir)) - 1 print("Check", run_index) else: run_index = args.resume_run checkpoints_dir = "{}/{}".format(checkpoints_dir, run_index) if not os.path.exists(checkpoints_dir): print(checkpoints_dir) raise Exception("Could not find checkpoints_dir") with open("{}/training_log.json".format(checkpoints_dir)) as tlog_f: training_log = json.load(tlog_f) seq_model.load_state_dict( torch.load("{}/best_model.pth".format(checkpoints_dir))) # running_reward = training_log['running_reward'][-1] seq_model.eval() lstm_model.eval() new_classifier.eval() for batch_idx, batch in enumerate(val_loader): original_sentences = batch_to_sentenes(batch[0], val_dataset.idx_to_char) rewritten_x = seq_model(batch[0], temp=1.0) new_sentences = batch_to_sentenes(rewritten_x, base_train_dataset.idx_to_char, spaces=True) pred_logits = lstm_model(seq_model.probs) _, predictions = torch.max(pred_logits, 1) results = [] for i in range(batch[0].size()[0]): print("ORIG", original_sentences[i]) print("REWR", new_sentences[i]) print("CLAS", base_train_dataset.classes[int(predictions[i])]) print("MAPP", val_dataset.classes[int(predictions[i])]) print("TARG", val_dataset.classes[int(batch[1][i])]) print("***************")
def run(train_batch_size, val_batch_size, epochs, lr, momentum, log_interval): train_loader, val_loader = get_data_loaders(train_batch_size, val_batch_size) model = Net() device = 'cpu' if torch.cuda.is_available(): device = 'cuda' optimizer = SGD(model.parameters(), lr=lr, momentum=momentum) trainer = create_supervised_trainer(model, optimizer, F.nll_loss, device=device) evaluator = create_supervised_evaluator(model, metrics={ 'accuracy': CategoricalAccuracy(), 'nll': Loss(F.nll_loss) }, device=device) desc = "ITERATION - loss: {:.2f}" pbar = tqdm(initial=0, leave=False, total=len(train_loader), desc=desc.format(0)) @trainer.on(Events.ITERATION_COMPLETED) def log_training_loss(engine): iter = (engine.state.iteration - 1) % len(train_loader) + 1 if iter % log_interval == 0: pbar.desc = desc.format(engine.state.output) pbar.update(log_interval) @trainer.on(Events.EPOCH_COMPLETED) def log_training_results(engine): pbar.refresh() evaluator.run(train_loader) metrics = evaluator.state.metrics avg_accuracy = metrics['accuracy'] avg_nll = metrics['nll'] tqdm.write( "Training Results - Epoch: {} Avg accuracy: {:.2f} Avg loss: {:.2f}" .format(engine.state.epoch, avg_accuracy, avg_nll)) @trainer.on(Events.EPOCH_COMPLETED) def log_validation_results(engine): evaluator.run(val_loader) metrics = evaluator.state.metrics avg_accuracy = metrics['accuracy'] avg_nll = metrics['nll'] tqdm.write( "Validation Results - Epoch: {} Avg accuracy: {:.2f} Avg loss: {:.2f}" .format(engine.state.epoch, avg_accuracy, avg_nll)) pbar.n = pbar.last_print_n = 0 trainer.run(train_loader, max_epochs=epochs) pbar.close()
def test_compute(): acc = CategoricalAccuracy() y_pred = torch.softmax(torch.rand(4, 4), dim=1) y = torch.ones(4).type(torch.LongTensor) indices = torch.max(y_pred, dim=1)[1] acc.update((y_pred, y)) assert isinstance(acc.compute(), float) assert accuracy_score( y.view(-1).data.numpy(), indices.view(-1).data.numpy()) == pytest.approx(acc.compute()) acc.reset() y_pred = torch.softmax(torch.rand(2, 2), dim=1) y = torch.ones(2).type(torch.LongTensor) indices = torch.max(y_pred, dim=1)[1] acc.update((y_pred, y)) assert isinstance(acc.compute(), float) assert accuracy_score( y.view(-1).data.numpy(), indices.view(-1).data.numpy()) == pytest.approx(acc.compute())
def main(): parser = argparse.ArgumentParser(description='Training') parser.add_argument('--learning_rate', type=float, default=0.0005, help='learning_rate') parser.add_argument('--temp_min', type=float, default=0.01, help='Temp Min') parser.add_argument('--epochs_to_anneal', type=float, default=15.0, help='epochs_to_anneal') parser.add_argument('--temp_max', type=float, default=2.0, help='Temp Max') parser.add_argument('--reg', type=float, default=0.01, help='regularizer') parser.add_argument('--batch_size', type=int, default=8, help='batch_size') parser.add_argument('--max_epochs', type=int, default=500, help='Max Epochs') parser.add_argument('--log_every_batch', type=int, default=50, help='Log every batch') parser.add_argument('--save_ckpt_every', type=int, default=20, help='Save Checkpoint Every') parser.add_argument('--dataset', type=str, default="QuestionLabels", help='dataset') parser.add_argument('--base_dataset', type=str, default="Names", help='base_dataset') parser.add_argument('--checkpoints_directory', type=str, default="CKPTS", help='Check Points Directory') parser.add_argument('--continue_training', type=str, default="False", help='Continue Training') parser.add_argument('--filter_width', type=int, default=5, help='Filter Width') parser.add_argument('--hidden_units', type=int, default=256, help='hidden_units') parser.add_argument('--embedding_size', type=int, default=256, help='embedding_size') parser.add_argument('--resume_run', type=int, default=-1, help='Which run to resume') parser.add_argument('--random_network', type=str, default="False", help='Random Network') parser.add_argument('--classifier_type', type=str, default="charRNN", help='rnn type') parser.add_argument('--print_prob', type=str, default="False", help='Probs') parser.add_argument('--progressive', type=str, default="True", help='Progressively increase length for back prop') args = parser.parse_args() device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") base_train_dataset = datasets.get_dataset(args.base_dataset, dataset_type='train') train_dataset = datasets.get_dataset(args.dataset, dataset_type='train') val_dataset = datasets.get_dataset(args.dataset, dataset_type='val') if args.classifier_type == "charRNN": lstm_model = model_classifier.uniRNN({ 'vocab_size': len(base_train_dataset.idx_to_char), 'hidden_size': args.hidden_units, 'target_size': len(base_train_dataset.classes), 'embedding_size': args.embedding_size }) print("char RNN") if args.classifier_type == "biRNN": lstm_model = model_classifier.biRNN({ 'vocab_size': len(base_train_dataset.idx_to_char), 'hidden_size': args.hidden_units, 'target_size': len(base_train_dataset.classes), 'embedding_size': args.embedding_size }) print("BI RNN") if args.classifier_type == "CNN": lstm_model = model_classifier.CnnTextClassifier({ 'vocab_size': len(base_train_dataset.idx_to_char), 'hidden_size': args.hidden_units, 'target_size': len(base_train_dataset.classes), 'embedding_size': args.embedding_size }) print("CnnTextClassifier") lstm_ckpt_dir = "{}/{}_classifer_{}".format(args.checkpoints_directory, args.base_dataset, args.classifier_type) lstm_ckpt_name = "{}/best_model.pth".format(lstm_ckpt_dir) if args.random_network != "True": lstm_model.load_state_dict(torch.load(lstm_ckpt_name)) else: print("Random LSTM network..")
lstm_model.eval() lstm_loss_criterion = nn.CrossEntropyLoss() seq_model = seq_rewriter_gumbel.seq_rewriter({ 'vocab_size': len(train_dataset.idx_to_char), 'target_size': len(base_train_dataset.idx_to_char), 'filter_width': args.filter_width, 'target_sequence_length': base_train_dataset.seq_length }) new_classifier = nn.Sequential(seq_model, lstm_model) lstm_model.to(device) seq_model.to(device) new_classifier.to(device) parameters = list(filter(lambda p: p.requires_grad, seq_model.parameters())) for parameter in parameters: print("PARAMETERS", parameter.size()) optimizer = optim.Adam(parameters, lr=args.learning_rate) train_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=0) val_loader = DataLoader(val_dataset, batch_size=args.batch_size, shuffle=True, num_workers=0) evaluator = create_supervised_evaluator(new_classifier, metrics={ 'accuracy': CategoricalAccuracy(), }) # CHECKPOINT DIRECTORY STUFF....... checkpoints_dir = "{}/ADVERSARIAL_GUMBEL".format( args.checkpoints_directory) if not os.path.exists(checkpoints_dir): os.makedirs(checkpoints_dir) checkpoint_suffix = "lr_{}_tmin_{}_fw_{}_bs_{}_rand_{}_classifer_{}".format( args.learning_rate, args.temp_min, args.filter_width, args.batch_size, args.random_network, args.classifier_type) checkpoints_dir = "{}/{}_adversarial_base_{}_{}".format( checkpoints_dir, args.dataset, args.base_dataset, checkpoint_suffix) if not os.path.exists(checkpoints_dir): os.makedirs(checkpoints_dir) start_epoch = 0 training_log = { 'log': [], 'best_epoch': 0, 'best_accuracy': 0.0, 'running_reward': [] } running_reward = -args.batch_size if args.continue_training == "True": if args.resume_run == -1: run_index = len(os.listdir(checkpoints_dir)) - 1 else: run_index = args.resume_run checkpoints_dir = "{}/{}".format(checkpoints_dir, run_index) if not os.path.exists(checkpoints_dir): raise Exception("Could not find checkpoints_dir") with open("{}/training_log.json".format(checkpoints_dir)) as tlog_f: training_log = json.load(tlog_f) seq_model.load_state_dict( torch.load("{}/best_model.pth".format(checkpoints_dir))) start_epoch = training_log['best_epoch'] # running_reward = training_log['running_reward'][-1] else: run_index = len(os.listdir(checkpoints_dir)) checkpoints_dir = "{}/{}".format(checkpoints_dir, run_index) if not os.path.exists(checkpoints_dir): os.makedirs(checkpoints_dir) temp_min = args.temp_min temp_max = args.temp_max for epoch in range(start_epoch, args.max_epochs): new_classifier.train() epoch_loss = 0 for batch_idx, batch in enumerate(train_loader): slope = (temp_max - temp_min) / args.epochs_to_anneal temp = max(temp_max - (slope * epoch), temp_min) rewritten_x = seq_model(batch[0], temp=temp) pred_logits = lstm_model(seq_model.probs) _, predictions = torch.max(pred_logits, 1) pred_correctness = (predictions == batch[1]).float() pred_correctness[pred_correctness == 0.0] = -1.0 rewards = pred_correctness batch_reward = torch.sum(rewards) loss = lstm_loss_criterion(pred_logits, batch[1]) optimizer.zero_grad() loss.backward() optimizer.step() running_reward -= running_reward / (args.log_every_batch * 1.0) running_reward += batch_reward / (args.log_every_batch * 1.0) if batch_idx % args.log_every_batch == 0: if args.print_prob == "True": print("Temp", temp, seq_model.probs) print("Epoch[{}] Iteration[{}] RunningLoss[{}] Reward[{}] Temp[{}]".format(epoch, batch_idx, loss, running_reward, temp)) evaluator.run(train_loader) training_metrics = evaluator.state.metrics print("Training Results - Epoch: {} Avg accuracy: {:.2f}".format( epoch, training_metrics['accuracy'])) evaluator.run(val_loader) evaluation_metrics = evaluator.state.metrics print("Validation Results - Epoch: {} Avg accuracy: {:.2f}".format( epoch, evaluation_metrics['accuracy'])) training_log['log'].append({ 'training_metrics': training_metrics, 'evaluation_metrics': evaluation_metrics, 'temp': temp }) if evaluation_metrics['accuracy'] > training_log['best_accuracy']: torch.save(seq_model.state_dict(), "{}/best_model.pth".format(checkpoints_dir)) training_log['best_accuracy'] = evaluation_metrics['accuracy'] training_log['best_epoch'] = epoch if epoch % args.save_ckpt_every == 0: torch.save(seq_model.state_dict(), "{}/model_{}.pth".format(checkpoints_dir, epoch)) print("BEST", training_log['best_epoch'], training_log['best_accuracy']) with open("{}/training_log.json".format(checkpoints_dir), 'w') as f: f.write(json.dumps(training_log)) if not os.path.exists(checkpoints_dir): os.makedirs(checkpoints_dir)
def test_compute_batch_images(): acc = CategoricalAccuracy() y_pred = torch.zeros(2, 3, 2, 2) y_pred[0, 1, :] = 1 y_pred[0, 2, :] = 1 y = torch.LongTensor([[[0, 1], [0, 1]], [[0, 2], [0, 2]]]) acc.update((y_pred, y)) assert isinstance(acc.compute(), float) assert acc.compute() == 0.5 acc.reset() y_pred = torch.zeros(2, 3, 2, 2) y_pred[0, 1, :] = 1 y_pred[1, 2, :] = 1 y = torch.LongTensor([[[2, 1], [1, 1]], [[2, 2], [0, 2]]]) acc.update((y_pred, y)) assert isinstance(acc.compute(), float) assert acc.compute() == 0.75
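# Worked count for the second assertion above (the case with no argmax ties): image 0
# predicts class 1 at every pixel and matches 3 of its 4 targets; image 1 predicts class 2
# everywhere and also matches 3 of 4, giving 6/8 = 0.75. The metric flattens the spatial
# dims and compares per pixel:
import torch

y_pred = torch.zeros(2, 3, 2, 2)
y_pred[0, 1, :] = 1
y_pred[1, 2, :] = 1
y = torch.LongTensor([[[2, 1], [1, 1]], [[2, 2], [0, 2]]])
indices = torch.max(y_pred, dim=1)[1]  # per-pixel class predictions, shape (2, 2, 2)
assert torch.eq(indices, y).view(-1).float().mean().item() == 0.75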
def run(path, model_name, imgaugs,
        train_batch_size, val_batch_size, num_workers,
        epochs, optim, lr, lr_update_every, gamma,
        restart_every, restart_factor, init_lr_factor,
        lr_reduce_patience, early_stop_patience,
        log_interval, output, debug):

    # Polyaxon
    exp = Experiment()
    exp.log_params(seed=SEED)

    print("--- Cifar10 Playground : Training --- ")

    from datetime import datetime
    now = datetime.now()
    log_dir = os.path.join(output, "training_{}_{}".format(model_name, now.strftime("%Y%m%d_%H%M")))
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)

    log_level = logging.INFO
    if debug:
        log_level = logging.DEBUG
        print("Activated debug mode")

    logger = logging.getLogger("Cifar10 Playground: Train")
    setup_logger(logger, log_dir, log_level)

    logger.debug("Setup tensorboard writer")
    writer = SummaryWriter(log_dir=os.path.join(log_dir, "tensorboard"))

    save_conf(logger, writer, model_name, imgaugs,
              train_batch_size, val_batch_size, num_workers,
              epochs, optim, lr, lr_update_every, gamma,
              restart_every, restart_factor, init_lr_factor,
              lr_reduce_patience, early_stop_patience, log_dir)

    # Polyaxon: log config
    exp.log_params(
        model_name=model_name,
        imgaugs=imgaugs,
        train_batch_size=train_batch_size,
        val_batch_size=val_batch_size,
        num_workers=num_workers,
        num_epochs=epochs,
        optimizer=optim,
        lr=lr,
        lr_update_every=lr_update_every,
        gamma=gamma,
        restart_every=restart_every,
        restart_factor=restart_factor,
        init_lr_factor=init_lr_factor,
        lr_reduce_patience=lr_reduce_patience,
        early_stop_patience=early_stop_patience
    )

    device = 'cpu'
    if torch.cuda.is_available():
        logger.debug("CUDA is enabled")
        from torch.backends import cudnn
        cudnn.benchmark = True
        device = 'cuda'

    # Polyaxon
    exp.log_params(device=device)

    logger.debug("Setup model: {}".format(model_name))
    if not os.path.isfile(model_name):
        assert model_name in MODEL_MAP, "Model name not in {}".format(MODEL_MAP.keys())
        model = MODEL_MAP[model_name](num_classes=10)
    else:
        # `model_name` is a file path: load a serialized model instead
        model = torch.load(model_name)
        model_name = model.__class__.__name__

    if 'cuda' in device:
        model = model.to(device)

    logger.debug("Setup train/val dataloaders")
    train_loader, val_loader = get_data_loaders(path, imgaugs,
                                                train_batch_size, val_batch_size,
                                                num_workers, device=device)

    write_model_graph(writer, model, train_loader, device=device)

    logger.debug("Setup optimizer")
    assert optim in OPTIMIZER_MAP, "Optimizer name not in {}".format(OPTIMIZER_MAP.keys())
    optimizer = OPTIMIZER_MAP[optim](model.parameters(), lr=lr)

    logger.debug("Setup criterion")
    criterion = nn.CrossEntropyLoss()
    if 'cuda' in device:
        criterion = criterion.cuda()

    # Exponential decay wrapped with warm restarts
    lr_scheduler = ExponentialLR(optimizer, gamma=gamma)
    lr_scheduler_restarts = LRSchedulerWithRestart(lr_scheduler,
                                                   restart_every=restart_every,
                                                   restart_factor=restart_factor,
                                                   init_lr_factor=init_lr_factor)

    reduce_on_plateau = ReduceLROnPlateau(optimizer, mode='min', factor=0.1,
                                          patience=lr_reduce_patience,
                                          threshold=0.01, verbose=True)

    logger.debug("Setup ignite trainer and evaluator")
    trainer = create_supervised_trainer(model, optimizer, criterion, device=device)

    metrics = {
        'accuracy': CategoricalAccuracy(),
        'precision': Precision(),
        'recall': Recall(),
        'nll': Loss(criterion)
    }
    train_evaluator = create_supervised_evaluator(model, metrics=metrics, device=device)
    val_evaluator = create_supervised_evaluator(model, metrics=metrics, device=device)

    logger.debug("Setup handlers")
    # Setup timer to measure training time
    timer = Timer(average=True)
    timer.attach(trainer,
                 start=Events.EPOCH_STARTED,
                 resume=Events.ITERATION_STARTED,
                 pause=Events.ITERATION_COMPLETED)

    @trainer.on(Events.ITERATION_COMPLETED)
    def log_training_loss(engine):
        iter = (engine.state.iteration - 1) % len(train_loader) + 1
        if iter % log_interval == 0:
            logger.info("Epoch[{}] Iteration[{}/{}] Loss: {:.4f}"
                        .format(engine.state.epoch, iter, len(train_loader), engine.state.output))
            writer.add_scalar("training/loss_vs_iterations", engine.state.output, engine.state.iteration)

    @trainer.on(Events.EPOCH_STARTED)
    def update_lr_schedulers(engine):
        if (engine.state.epoch - 1) % lr_update_every == 0:
            lr_scheduler_restarts.step()

    @trainer.on(Events.EPOCH_STARTED)
    def log_lrs(engine):
        if len(optimizer.param_groups) == 1:
            lr = float(optimizer.param_groups[0]['lr'])
            writer.add_scalar("learning_rate", lr, engine.state.epoch)
            logger.debug("Learning rate: {}".format(lr))
        else:
            for i, param_group in enumerate(optimizer.param_groups):
                lr = float(param_group['lr'])
                logger.debug("Learning rate (group {}): {}".format(i, lr))
                writer.add_scalar("learning_rate_group_{}".format(i), lr, engine.state.epoch)

    log_images_dir = os.path.join(log_dir, "figures")
    os.makedirs(log_images_dir)

    def log_precision_recall_results(metrics, epoch, mode):
        for metric_name in ['precision', 'recall']:
            value = metrics[metric_name]
            avg_value = torch.mean(value).item()
            writer.add_scalar("{}/avg_{}".format(mode, metric_name), avg_value, epoch)
            kwargs = {"{}_avg_{}".format(mode, metric_name): avg_value}
            # Polyaxon
            exp.log_metrics(step=epoch, **kwargs)

            # Save metric-per-class figure, sorted by value
            sorted_values = value.to('cpu').numpy()
            indices = np.argsort(sorted_values)
            sorted_values = sorted_values[indices]
            n_classes = len(sorted_values)
            classes = np.array(["class_{}".format(i) for i in range(n_classes)])
            sorted_classes = classes[indices]
            fig = create_fig_param_per_class(sorted_values, metric_name,
                                             classes=sorted_classes, n_classes_per_fig=20)
            fname = os.path.join(log_images_dir, "{}_{}_{}_per_class.png".format(mode, epoch, metric_name))
            fig.savefig(fname)
            tag = "{}_{}".format(mode, metric_name)
            writer.add_figure(tag, fig, epoch)

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_metrics(engine):
        epoch = engine.state.epoch
        logger.info("One epoch training time (seconds): {}".format(timer.value()))
        metrics = train_evaluator.run(train_loader).metrics
        logger.info("Training Results - Epoch: {} Avg accuracy: {:.4f} Avg loss: {:.4f}"
                    .format(engine.state.epoch, metrics['accuracy'], metrics['nll']))
        writer.add_scalar("training/avg_accuracy", metrics['accuracy'], epoch)
        writer.add_scalar("training/avg_error", 1.0 - metrics['accuracy'], epoch)
        writer.add_scalar("training/avg_loss", metrics['nll'], epoch)
        kwargs = {
            "training_avg_accuracy": metrics['accuracy'],
            "training_avg_error": 1.0 - metrics['accuracy'],
            "training_avg_loss": metrics['nll'],
        }
        # Polyaxon
        exp.log_metrics(step=epoch, **kwargs)
        log_precision_recall_results(metrics, epoch, "training")

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(engine):
        epoch = engine.state.epoch
        metrics = val_evaluator.run(val_loader).metrics
        writer.add_scalar("validation/avg_loss", metrics['nll'], epoch)
        writer.add_scalar("validation/avg_accuracy", metrics['accuracy'], epoch)
        writer.add_scalar("validation/avg_error", 1.0 - metrics['accuracy'], epoch)
        kwargs = {
            "validation_avg_accuracy": metrics['accuracy'],
            "validation_avg_error": 1.0 - metrics['accuracy'],
            "validation_avg_loss": metrics['nll'],
        }
        # Polyaxon
        exp.log_metrics(step=epoch, **kwargs)
        logger.info("Validation Results - Epoch: {} Avg accuracy: {:.4f} Avg loss: {:.4f}"
                    .format(engine.state.epoch, metrics['accuracy'], metrics['nll']))
        log_precision_recall_results(metrics, epoch, "validation")

    @val_evaluator.on(Events.COMPLETED)
    def update_reduce_on_plateau(engine):
        val_loss = engine.state.metrics['nll']
        reduce_on_plateau.step(val_loss)

    def score_function(engine):
        val_loss = engine.state.metrics['nll']
        # Objects with highest scores will be retained.
        return -val_loss

    # Setup early stopping:
    handler = EarlyStopping(patience=early_stop_patience, score_function=score_function, trainer=trainer)
    setup_logger(handler._logger, log_dir, log_level)
    val_evaluator.add_event_handler(Events.COMPLETED, handler)

    # Setup model checkpoint:
    best_model_saver = ModelCheckpoint(log_dir,
                                       filename_prefix="model",
                                       score_name="val_loss",
                                       score_function=score_function,
                                       n_saved=5,
                                       atomic=True,
                                       create_dir=True)
    val_evaluator.add_event_handler(Events.COMPLETED, best_model_saver, {model_name: model})

    last_model_saver = ModelCheckpoint(log_dir,
                                       filename_prefix="checkpoint",
                                       save_interval=1,
                                       n_saved=1,
                                       atomic=True,
                                       create_dir=True)
    trainer.add_event_handler(Events.COMPLETED, last_model_saver, {model_name: model})

    logger.info("Start training: {} epochs".format(epochs))
    try:
        trainer.run(train_loader, max_epochs=epochs)
    except KeyboardInterrupt:
        logger.info("Caught KeyboardInterrupt -> exit")
    except Exception:  # noqa
        logger.exception("")
        if debug:
            try:
                # open an ipython shell if possible
                import IPython
                IPython.embed()  # noqa
            except ImportError:
                print("Failed to start IPython console")

    logger.debug("Training ended")
    writer.close()
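
# `LRSchedulerWithRestart` is used above but defined elsewhere in the project.
# Below is a minimal sketch of such a warm-restart wrapper, assuming the wrapped
# scheduler follows the torch.optim.lr_scheduler protocol (`base_lrs`,
# `last_epoch`, `step()`). The body is an illustration, not the original code.
class LRSchedulerWithRestart(object):

    def __init__(self, scheduler, restart_every, restart_factor=1.0, init_lr_factor=1.0):
        self.scheduler = scheduler
        self.restart_every = restart_every    # period, in scheduler steps, between restarts
        self.restart_factor = restart_factor  # multiply the period by this after each restart
        self.init_lr_factor = init_lr_factor  # rescale base lrs by this after each restart
        self._step_count = 0

    def step(self):
        if self.restart_every > 0 and self._step_count > 0 \
                and self._step_count % self.restart_every == 0:
            # Restart: rewind the wrapped scheduler and rescale its base lrs
            self.scheduler.last_epoch = -1
            self.scheduler.base_lrs = [lr * self.init_lr_factor for lr in self.scheduler.base_lrs]
            self.restart_every = int(self.restart_every * self.restart_factor)
        self.scheduler.step()
        self._step_count += 1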
# Optional config param: if set, evaluation on val_dataloader is run
val_dataloader = get_basic_dataloader("test", batch_size, num_workers,
                                      device=device, data_augs=val_data_augs)

# Required config param
model = resnet50(pretrained=False, num_classes=10)
model.avgpool = nn.AdaptiveAvgPool2d(1)

# Required config param
optimizer = optim.SGD(model.parameters(), lr=0.01)

# Required config param
criterion = nn.CrossEntropyLoss()

# Required config param
num_epochs = 50

# Optional config param
metrics = {
    "precision": Precision(average=True),
    "recall": Recall(average=True),
    "accuracy": CategoricalAccuracy()
}

# Optional config param
lr_scheduler = CosineAnnealingLR(optimizer, T_max=1200, eta_min=1e-5)
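
# How such a Python config file can be consumed is not shown here. A minimal
# sketch of a loader that imports the file as a module and validates the
# required params (`REQUIRED` and the helper itself are assumptions for
# illustration, not the project's actual loader):
import importlib.util

REQUIRED = ["model", "optimizer", "criterion", "num_epochs"]

def load_config(config_filepath):
    spec = importlib.util.spec_from_file_location("config", config_filepath)
    config = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(config)
    for name in REQUIRED:
        assert hasattr(config, name), "Required config param '{}' is missing".format(name)
    return config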
def train(params, log, time_keeper):
    # specify dataset
    dataset = DatasetFactory.create(params)

    # specify model
    model = ModelFactory.create(params)
    model = model.to(params['device'])

    # optimizer
    optimizer = SGD(model.parameters(),
                    lr=params['TRAIN']['lr'],
                    momentum=params['TRAIN']['momentum'])

    # scheduler
    scheduler = None

    # best accuracy (precision)
    best_prec = 0

    # optionally resume from a checkpoint
    checkpoint_file = params['TRAIN']['resume']
    start_epoch, best_prec = load_checkpoint(log, model, checkpoint_file, optimizer, scheduler)

    trainer = create_supervised_trainer(model, optimizer, F.cross_entropy, device=params['device'])

    # evaluator
    evaluator = create_supervised_evaluator(model,
                                            metrics={
                                                'accuracy': CategoricalAccuracy(),
                                                'cross_entropy': Loss(F.cross_entropy)
                                            },
                                            device=params['device'])

    # log details
    log_string = "\n" + "==== NET MODEL:\n" + str(model)
    log_string += "\n" + "==== OPTIMIZER:\n" + str(optimizer) + "\n"
    log.log_global(log_string)

    # end-of-iteration events
    @trainer.on(Events.ITERATION_COMPLETED)
    def on_iter(engine):
        iter_current = engine.state.iteration % len(dataset.loader['train'])
        epoch_current = engine.state.epoch
        num_iter = len(dataset.loader['train'])
        loss = engine.state.output
        # logging
        time_string = time_keeper.get_current_str()  # get current time
        log.log_iter(iter_current, epoch_current - 1, num_iter, loss, time_string)

    # end-of-epoch events
    @trainer.on(Events.EPOCH_COMPLETED)
    def on_epoch(engine):
        nonlocal best_prec
        # current epoch
        epoch_current = engine.state.epoch

        # evaluation on train set
        evaluator.run(dataset.loader['train'])
        acc_train = evaluator.state.metrics['accuracy'] * 100
        loss_train = evaluator.state.metrics['cross_entropy']

        # evaluation on val set
        evaluator.run(dataset.loader['val'])
        acc_val = evaluator.state.metrics['accuracy'] * 100
        loss_val = evaluator.state.metrics['cross_entropy']

        is_best = acc_val > best_prec
        best_prec = max(acc_val, best_prec)
        save_checkpoint(
            {
                'epoch': epoch_current + 1,
                'state_dict': model.state_dict(),
                'best_prec': best_prec,
                'optimizer': optimizer.state_dict(),
                'scheduler': scheduler
            }, model, params, is_best)

        # logging results
        time_string = time_keeper.get_current_str()  # get current time
        log.log_epoch(epoch_current, acc_train, loss_train, acc_val, loss_val, is_best, time_string)

    time_keeper.start()
    trainer.run(dataset.loader['train'], max_epochs=params['TRAIN']['epochs'])
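
# `load_checkpoint` / `save_checkpoint` are project helpers not shown in this
# snippet. A minimal sketch following the common torch.save/torch.load pattern;
# the file names and exact signatures are assumptions (torch/os are expected to
# be imported at module level, as in the scripts above):
import shutil

def save_checkpoint(state, model, params, is_best, filename='checkpoint.pth.tar'):
    torch.save(state, filename)
    if is_best:
        # keep a separate copy of the best-performing weights
        shutil.copyfile(filename, 'model_best.pth.tar')

def load_checkpoint(log, model, checkpoint_file, optimizer, scheduler):
    start_epoch, best_prec = 0, 0
    if checkpoint_file and os.path.isfile(checkpoint_file):
        checkpoint = torch.load(checkpoint_file)
        start_epoch = checkpoint['epoch']
        best_prec = checkpoint['best_prec']
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
    return start_epoch, best_prec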
def run(train_batch_size, val_batch_size, epochs, lr, momentum, log_interval):
    vis = visdom.Visdom()

    # if not vis.check_connection():
    #     raise RuntimeError("Visdom server not running. Please run python -m visdom.server")

    train_loader, val_loader = get_data_loaders(train_batch_size, val_batch_size)
    model = Net()
    device = 'cpu'
    if torch.cuda.is_available():
        device = 'cuda'

    optimizer = SGD(model.parameters(), lr=lr, momentum=momentum)
    trainer = create_supervised_trainer(model, optimizer, F.nll_loss, device=device)
    evaluator = create_supervised_evaluator(model,
                                            metrics={'accuracy': CategoricalAccuracy(),
                                                     'nll': Loss(F.nll_loss)},
                                            device=device)

    train_loss_window = create_plot_window(vis, '#Iterations', 'Loss', 'Training Loss')
    train_avg_loss_window = create_plot_window(vis, '#Iterations', 'Loss', 'Training Average Loss')
    train_avg_accuracy_window = create_plot_window(vis, '#Iterations', 'Accuracy', 'Training Average Accuracy')
    val_avg_loss_window = create_plot_window(vis, '#Epochs', 'Loss', 'Validation Average Loss')
    val_avg_accuracy_window = create_plot_window(vis, '#Epochs', 'Accuracy', 'Validation Average Accuracy')

    @trainer.on(Events.ITERATION_COMPLETED)
    def log_training_loss(engine):
        iter = (engine.state.iteration - 1) % len(train_loader) + 1
        if iter % log_interval == 0:
            print("Epoch[{}] Iteration[{}/{}] Loss: {:.2f}"
                  "".format(engine.state.epoch, iter, len(train_loader), engine.state.output))
            vis.line(X=np.array([engine.state.iteration]),
                     Y=np.array([engine.state.output]),
                     update='append', win=train_loss_window)

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(engine):
        evaluator.run(train_loader)
        metrics = evaluator.state.metrics
        avg_accuracy = metrics['accuracy']
        avg_nll = metrics['nll']
        print("Training Results - Epoch: {} Avg accuracy: {:.2f} Avg loss: {:.2f}"
              .format(engine.state.epoch, avg_accuracy, avg_nll))
        vis.line(X=np.array([engine.state.epoch]), Y=np.array([avg_accuracy]),
                 win=train_avg_accuracy_window, update='append')
        vis.line(X=np.array([engine.state.epoch]), Y=np.array([avg_nll]),
                 win=train_avg_loss_window, update='append')

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(engine):
        evaluator.run(val_loader)
        metrics = evaluator.state.metrics
        avg_accuracy = metrics['accuracy']
        avg_nll = metrics['nll']
        print("Validation Results - Epoch: {} Avg accuracy: {:.2f} Avg loss: {:.2f}"
              .format(engine.state.epoch, avg_accuracy, avg_nll))
        vis.line(X=np.array([engine.state.epoch]), Y=np.array([avg_accuracy]),
                 win=val_avg_accuracy_window, update='append')
        vis.line(X=np.array([engine.state.epoch]), Y=np.array([avg_nll]),
                 win=val_avg_loss_window, update='append')

    # kick everything off
    trainer.run(train_loader, max_epochs=epochs)
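
# `create_plot_window` is referenced above but defined elsewhere; a helper along
# these lines would do (a sketch relying on the numpy/visdom imports already
# used by this script):
def create_plot_window(vis, xlabel, ylabel, title):
    # an initial NaN point creates an empty named window that update='append' can extend
    return vis.line(X=np.array([1]), Y=np.array([np.nan]),
                    opts=dict(xlabel=xlabel, ylabel=ylabel, title=title))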
def run(config_file):
    print("--- iMaterialist 2018 : Training --- ")
    print("Load config file ... ")
    config = load_config(config_file)

    seed = config.get("SEED", 2018)
    random.seed(seed)
    torch.manual_seed(seed)

    output = Path(config["OUTPUT_PATH"])
    debug = config.get("DEBUG", False)

    from datetime import datetime
    now = datetime.now()
    log_dir = output / Path(config_file).stem / now.strftime("%Y%m%d_%H%M")
    assert not log_dir.exists(), \
        "Output logging directory '{}' already exists".format(log_dir)
    log_dir.mkdir(parents=True)

    shutil.copyfile(config_file, (log_dir / Path(config_file).name).as_posix())

    log_level = logging.INFO
    if debug:
        log_level = logging.DEBUG
        print("Activated debug mode")

    logger = logging.getLogger("iMaterialist 2018: Train")
    setup_logger(logger, (log_dir / "train.log").as_posix(), log_level)

    logger.debug("Setup tensorboard writer")
    writer = SummaryWriter(log_dir=(log_dir / "tensorboard").as_posix())

    save_conf(config_file, log_dir.as_posix(), logger, writer)

    model = config["MODEL"]
    model_name = model.__class__.__name__

    device = config.get("DEVICE", 'cuda')
    if 'cuda' in device:
        assert torch.cuda.is_available(), \
            "Device {} is not compatible with torch.cuda.is_available()".format(device)
        from torch.backends import cudnn
        cudnn.benchmark = True
        logger.debug("CUDA is enabled")
        model = model.to(device)

    logger.debug("Setup train/val dataloaders")
    train_loader, val_loader = config["TRAIN_LOADER"], config["VAL_LOADER"]

    # Setup a training subset, no larger than the validation set, to run evaluation on:
    indices = np.arange(len(train_loader.sampler))
    np.random.shuffle(indices)
    indices = indices[:len(val_loader.sampler)] \
        if len(val_loader.sampler) < len(train_loader.sampler) else indices
    train_eval_loader = get_train_eval_data_loader(train_loader, indices)

    logger.debug("- train data loader: {} batches | {} samples"
                 .format(len(train_loader), len(train_loader.sampler)))
    logger.debug("- train eval data loader: {} batches | {} samples"
                 .format(len(train_eval_loader), len(train_eval_loader.sampler)))
    logger.debug("- validation data loader: {} batches | {} samples"
                 .format(len(val_loader), len(val_loader.sampler)))

    # write_model_graph(writer, model=model, data_loader=train_loader, device=device)

    optimizer = config["OPTIM"]

    logger.debug("Setup criterion")
    criterion = config["CRITERION"]
    if "cuda" in device and isinstance(criterion, nn.Module):
        criterion = criterion.to(device)

    lr_schedulers = config.get("LR_SCHEDULERS")

    logger.debug("Setup ignite trainer and evaluator")
    trainer = create_supervised_trainer(model, optimizer, criterion, device=device)

    metrics = {
        'accuracy': CategoricalAccuracy(),
        'precision': Precision(),
        'recall': Recall(),
        'nll': Loss(criterion)
    }
    train_evaluator = create_supervised_evaluator(model, metrics=metrics, device=device)

    val_metrics = {
        'accuracy': CategoricalAccuracy(),
        'precision': Precision(),
        'recall': Recall(),
        'nll': Loss(nn.CrossEntropyLoss())
    }
    val_evaluator = create_supervised_evaluator(model, metrics=val_metrics, device=device)

    logger.debug("Setup handlers")
    log_interval = config.get("LOG_INTERVAL", 100)
    reduce_on_plateau = config.get("REDUCE_LR_ON_PLATEAU")

    # Setup timer to measure training time
    timer = Timer(average=True)
    timer.attach(trainer,
                 start=Events.EPOCH_STARTED,
                 resume=Events.ITERATION_STARTED,
                 pause=Events.ITERATION_COMPLETED)

    @trainer.on(Events.ITERATION_COMPLETED)
    def log_training_loss(engine):
        iter = (engine.state.iteration - 1) % len(train_loader) + 1
        if iter % log_interval == 0:
            logger.info("Epoch[{}] Iteration[{}/{}] Loss: {:.4f}"
                        .format(engine.state.epoch, iter, len(train_loader), engine.state.output))
            writer.add_scalar("training/loss_vs_iterations", engine.state.output, engine.state.iteration)

    @trainer.on(Events.EPOCH_STARTED)
    def update_lr_schedulers(engine):
        if lr_schedulers is not None:
            for lr_scheduler in lr_schedulers:
                lr_scheduler.step()

    @trainer.on(Events.EPOCH_STARTED)
    def log_lrs(engine):
        if len(optimizer.param_groups) == 1:
            lr = float(optimizer.param_groups[0]['lr'])
            writer.add_scalar("learning_rate", lr, engine.state.epoch)
            logger.debug("Learning rate: {}".format(lr))
        else:
            for i, param_group in enumerate(optimizer.param_groups):
                lr = float(param_group['lr'])
                logger.debug("Learning rate (group {}): {}".format(i, lr))
                writer.add_scalar("learning_rate_group_{}".format(i), lr, engine.state.epoch)

    log_images_dir = log_dir / "figures"
    log_images_dir.mkdir(parents=True)

    def log_precision_recall_results(metrics, epoch, mode):
        for metric_name in ['precision', 'recall']:
            value = metrics[metric_name]
            avg_value = torch.mean(value).item()
            writer.add_scalar("{}/avg_{}".format(mode, metric_name), avg_value, epoch)

            # Save metric-per-class figure, sorted by value
            sorted_values = value.to('cpu').numpy()
            indices = np.argsort(sorted_values)
            sorted_values = sorted_values[indices]
            n_classes = len(sorted_values)
            classes = np.array(["class_{}".format(i) for i in range(n_classes)])
            sorted_classes = classes[indices]
            fig = create_fig_param_per_class(sorted_values, metric_name,
                                             classes=sorted_classes, n_classes_per_fig=20)
            fname = log_images_dir / ("{}_{}_{}_per_class.png".format(mode, epoch, metric_name))
            fig.savefig(fname.as_posix())
            # Add figure in TB
            img = Image.open(fname.as_posix())
            tag = "{}_{}".format(mode, metric_name)
            writer.add_image(tag, np.asarray(img), epoch)

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_metrics(engine):
        epoch = engine.state.epoch
        logger.info("One epoch training time (seconds): {}".format(timer.value()))
        metrics = train_evaluator.run(train_eval_loader).metrics
        logger.info("Training Results - Epoch: {} Avg accuracy: {:.4f} Avg loss: {:.4f}"
                    .format(engine.state.epoch, metrics['accuracy'], metrics['nll']))
        writer.add_scalar("training/avg_accuracy", metrics['accuracy'], epoch)
        writer.add_scalar("training/avg_error", 1.0 - metrics['accuracy'], epoch)
        writer.add_scalar("training/avg_loss", metrics['nll'], epoch)
        log_precision_recall_results(metrics, epoch, "training")

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(engine):
        epoch = engine.state.epoch
        metrics = val_evaluator.run(val_loader).metrics
        writer.add_scalar("validation/avg_loss", metrics['nll'], epoch)
        writer.add_scalar("validation/avg_accuracy", metrics['accuracy'], epoch)
        writer.add_scalar("validation/avg_error", 1.0 - metrics['accuracy'], epoch)
        logger.info("Validation Results - Epoch: {} Avg accuracy: {:.4f} Avg loss: {:.4f}"
                    .format(engine.state.epoch, metrics['accuracy'], metrics['nll']))
        log_precision_recall_results(metrics, epoch, "validation")

    if reduce_on_plateau is not None:
        @val_evaluator.on(Events.COMPLETED)
        def update_reduce_on_plateau(engine):
            val_loss = engine.state.metrics['nll']
            reduce_on_plateau.step(val_loss)

    def score_function(engine):
        val_loss = engine.state.metrics['nll']
        # Objects with highest scores will be retained.
        return -val_loss

    # Setup early stopping:
    if "EARLY_STOPPING_KWARGS" in config:
        kwargs = config["EARLY_STOPPING_KWARGS"]
        if 'score_function' not in kwargs:
            kwargs['score_function'] = score_function
        handler = EarlyStopping(trainer=trainer, **kwargs)
        setup_logger(handler._logger, (log_dir / "train.log").as_posix(), log_level)
        val_evaluator.add_event_handler(Events.COMPLETED, handler)

    # Setup model checkpoint:
    best_model_saver = ModelCheckpoint(log_dir.as_posix(),
                                       filename_prefix="model",
                                       score_name="val_loss",
                                       score_function=score_function,
                                       n_saved=5,
                                       atomic=True,
                                       create_dir=True)
    val_evaluator.add_event_handler(Events.COMPLETED, best_model_saver, {model_name: model})

    last_model_saver = ModelCheckpoint(log_dir.as_posix(),
                                       filename_prefix="checkpoint",
                                       save_interval=1,
                                       n_saved=1,
                                       atomic=True,
                                       create_dir=True)
    trainer.add_event_handler(Events.EPOCH_COMPLETED, last_model_saver, {model_name: model})

    # Setup custom event handlers:
    for (event, handler) in config["TRAINER_CUSTOM_EVENT_HANDLERS"]:
        trainer.add_event_handler(event, handler, val_evaluator, logger)

    for (event, handler) in config["EVALUATOR_CUSTOM_EVENT_HANDLERS"]:
        val_evaluator.add_event_handler(event, handler, trainer, logger)

    n_epochs = config["N_EPOCHS"]
    logger.info("Start training: {} epochs".format(n_epochs))
    try:
        trainer.run(train_loader, max_epochs=n_epochs)
    except KeyboardInterrupt:
        logger.info("Caught KeyboardInterrupt -> exit")
    except Exception:  # noqa
        logger.exception("")
        if debug:
            try:
                # open an ipython shell if possible
                import IPython
                IPython.embed()  # noqa
            except ImportError:
                print("Failed to start IPython console")

    logger.debug("Training ended")
    writer.close()
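
# A hypothetical config file for run() above. The keys are exactly the ones the
# script reads; the values are placeholders for illustration, assuming
# load_config() exposes the file's globals as a dict-like object:
from torch import nn
from torch.optim import SGD
from torch.optim.lr_scheduler import ExponentialLR, ReduceLROnPlateau
from torchvision.models import resnet50

OUTPUT_PATH = "output"
SEED = 2018
DEBUG = False
DEVICE = 'cuda'

MODEL = resnet50(num_classes=128)  # any nn.Module
TRAIN_LOADER = None                # replace with a DataLoader over the training set
VAL_LOADER = None                  # replace with a DataLoader over the validation set
OPTIM = SGD(MODEL.parameters(), lr=0.01, momentum=0.9)
CRITERION = nn.CrossEntropyLoss()
N_EPOCHS = 30

LOG_INTERVAL = 100
LR_SCHEDULERS = [ExponentialLR(OPTIM, gamma=0.95)]
REDUCE_LR_ON_PLATEAU = ReduceLROnPlateau(OPTIM, mode='min', factor=0.1, patience=5)
EARLY_STOPPING_KWARGS = {'patience': 15}
TRAINER_CUSTOM_EVENT_HANDLERS = []
EVALUATOR_CUSTOM_EVENT_HANDLERS = []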