def train(obj, optimizer, dataset, xp, args, epoch):
    """Run a single training epoch and log aggregated metrics.

    Iterates over the optimizer-provided sampler, performs one oracle call and
    one optimizer step per batch, and records objective/error statistics on
    the experiment object `xp`.
    """
    xp.Timer_Train.reset()
    batch_stats = {}
    sampler = optimizer.get_sampler(dataset)
    n_batches = optimizer.get_sampler_len(dataset)
    for i, x, y in tqdm(sampler, desc='Train Epoch', leave=False, total=n_batches):
        info = obj.oracle(optimizer.variables.w, x, y)
        info['i'] = i
        optimizer.step(info)

        # track statistics for monitoring
        batch_stats['obj'] = float(info['obj'])
        batch_stats['error'] = float(obj.task_error(optimizer.variables.w, x, y))
        batch_stats['size'] = float(x.size(0))
        update_metrics(xp, batch_stats)

    xp.Timer_Train.update()
    summary = ('\nEpoch: [{0}] (Train) \t'
               '({timer:.2f}s) \t'
               'Obj {obj:.3f}\t'
               'Error {error:.2f}\t')
    print(summary.format(int(xp.Epoch.value),
                         timer=xp.Timer_Train.value,
                         error=xp.Error_Train.value,
                         obj=xp.Obj_Train.value))
    log_metrics(xp, epoch)
def test_log_metrics(capsys):
    """log_metrics should emit '<tag> [epoch/iteration]: {metrics}' on stderr."""
    trainer = Engine(lambda e, b: None)
    trainer.logger = setup_logger(format="%(message)s")
    trainer.run(list(range(100)), max_epochs=2)
    log_metrics(trainer, "train")
    out = capsys.readouterr()
    # the captured stream ends with a newline, so the message is at index -2
    assert out.err.split("\n")[-2] == "train [2/200]: {}"
def run_validation():
    """Evaluate on the train and validation loaders, logging timing and metrics."""
    epoch = trainer.state.epoch
    train_state = train_evaluator.run(train_eval_loader)
    utils.log_metrics(logger, epoch, train_state.times["COMPLETED"],
                      "Train", train_state.metrics)
    val_state = evaluator.run(val_loader)
    utils.log_metrics(logger, epoch, val_state.times["COMPLETED"],
                      "Test", val_state.metrics)
def evaluation(local_rank, config, logger, with_clearml):
    """Distributed evaluation entry point: load weights, run the evaluator, log metrics."""
    rank = idist.get_rank()
    device = idist.device()
    manual_seed(config.seed + local_rank)

    data_loader = config.data_loader
    model = config.model.to(device)

    # Load weights:
    model.load_state_dict(get_model_weights(config, logger, with_clearml))

    # Adapt model to dist config
    model = idist.auto_model(model)

    # Setup evaluators
    cm_metric = ConfusionMatrix(num_classes=config.num_classes)
    val_metrics = {
        "IoU": IoU(cm_metric),
        "mIoU_bg": mIoU(cm_metric),
    }
    if ("val_metrics" in config) and isinstance(config.val_metrics, dict):
        val_metrics.update(config.val_metrics)
    evaluator = create_evaluator(model, val_metrics, config, with_clearml, tag="val")

    # Setup Tensorboard logger on the main process only
    if rank == 0:
        tb_logger = common.TensorboardLogger(log_dir=config.output_path.as_posix())
        tb_logger.attach_output_handler(
            evaluator,
            event_name=Events.COMPLETED,
            tag="validation",
            metric_names="all",
        )

    # Log confusion matrix to ClearML:
    if with_clearml:
        # NOTE(review): evaluator.state.iteration is evaluated here, at
        # registration time (before run()) — confirm that is intended.
        evaluator.add_event_handler(Events.COMPLETED, compute_and_log_cm,
                                    cm_metric, evaluator.state.iteration)

    state = evaluator.run(data_loader)
    utils.log_metrics(logger, 0, state.times["COMPLETED"], "Validation", state.metrics)

    if idist.get_rank() == 0:
        tb_logger.close()
def train(model, loss, optimizer, loader, xp, args):
    """One training epoch: forward, (optionally smoothed) loss, backward, step, monitor."""
    model.train()
    xp.Timer_Train.reset()
    monitor = {}
    for data, target in tqdm(loader, disable=not args.tqdm,
                             desc='Train Epoch', leave=False, total=len(loader)):
        if args.cuda:
            data, target = data.cuda(), target.cuda()

        # forward pass
        scores = model(data)

        # compute the loss function, possibly using smoothing
        with set_smoothing_enabled(args.smooth_svm):
            loss_value = loss(scores, target)

        # backward pass
        optimizer.zero_grad()
        loss_value.backward()

        # optimization step: the closure supplies the loss to step-size rules
        optimizer.step(lambda: float(loss_value))

        # monitoring
        monitor['loss'] = float(loss(scores, target))
        monitor['acc'] = float(accuracy(scores, target))
        monitor['gamma'] = float(optimizer.gamma)
        monitor['size'] = float(scores.size(0))
        update_metrics(xp, monitor)

    xp.Eta.update(optimizer.eta)
    xp.Reg.update(regularization(model, args.l2))
    xp.Obj_Train.update(xp.Reg.value + xp.Loss_Train.value)
    xp.Timer_Train.update()

    print('\nEpoch: [{0}] (Train) \t'
          '({timer:.2f}s) \t'
          'Obj {obj:.3f}\t'
          'Loss {loss:.3f}\t'
          'Acc {acc:.2f}%\t'
          .format(int(xp.Epoch.value),
                  timer=xp.Timer_Train.value,
                  acc=xp.Acc_Train.value,
                  obj=xp.Obj_Train.value,
                  loss=xp.Loss_Train.value))
    log_metrics(xp)
def run_val(epoch):
    """Run the validation loop and log metrics for this epoch.

    NOTE(review): accumulation of labels/predictions was commented out in the
    original, so both lists are empty when passed to log_metrics — confirm
    this is intended before relying on the logged 'val' metrics.
    """
    running_labels = []
    running_predicted = []
    for timestep, example in tqdm(enumerate(val_dataloader)):
        pred, labels = forward_pass(example)
    log_metrics(writer, epoch, running_predicted, running_labels, 'val', config)
def gan_forward(inputs, labels, model, loss_fn, writer, device, batch_num, N):
    """Run one GAN forward pass and return discriminator/generator losses.

    `inputs` stacks sketches then photos; the first N rows are sketches.
    The combined 'loss' entry is the sum of every other '*_loss' entry.
    """
    sketches, photos = torch.split(inputs, N)
    logits_real, logits_fake = model(sketches, photos)
    d_loss, g_loss = loss_fn(logits_real, logits_fake, device)

    metrics = {'d_loss': d_loss, 'g_loss': g_loss, 'loss': 0}
    metrics['loss'] = sum(value for key, value in metrics.items()
                          if key.endswith('loss') and key != 'loss')
    log_metrics(metrics, writer, "batch", batch_num)
    return metrics
def classify_contrast_forward(inputs, labels, model, loss_fn, writer, device,
                              batch_num, alpha, loss_type, N):
    """Forward pass for classification / contrastive training on sketch+photo pairs."""
    metrics = {}
    features = model.extract_features(inputs)
    # keep only the first 2N feature rows (sketches then photos)
    keep = torch.tensor(range(0, 2 * N)).to(device)
    selected_features = torch.index_select(features, 0, keep)
    logits = model.make_predictions(selected_features)
    sketch_logits, photo_logits = torch.split(logits, N)

    if loss_type == "classify":
        metrics['loss'] = loss_fn(sketch_logits, photo_logits, labels)
    else:
        # contrastive path: embedding loss on raw (photo, sketch) features plus
        # a classification loss, blended by alpha
        metrics['embedding_loss'], metrics['classification_loss'] = loss_fn(
            *torch.split(features, N), sketch_logits, photo_logits, labels)
        metrics['loss'] = (alpha * metrics['embedding_loss']
                           + (1 - alpha) * metrics['classification_loss'])

    _, sketch_preds = torch.max(sketch_logits, 1)
    _, photo_preds = torch.max(photo_logits, 1)
    truth = labels.cpu().numpy()
    metrics['sketch_cor'] = sum(sketch_preds.cpu().numpy() == truth)
    metrics['photo_cor'] = sum(photo_preds.cpu().numpy() == truth)

    log_metrics(metrics, writer, "batch", batch_num)

    # TODO Change this to be args.verbose
    if True:
        print("=" * 100)
        print("Predicted classes for sketches: {}".format(sketch_preds.cpu().tolist()))
        print("Predicted classes for photos: {}".format(photo_preds.cpu().tolist()))
        print("Ground truth: {}".format(labels.cpu().tolist()))
        print("=" * 100)
    return metrics
def _log_epoch_summary(self, epoch_summary: EpochSummaryType, epoch_number: int) -> None:
    """Write per-epoch F1, accuracy and loss summaries to the experiment logger.

    `epoch_number` is zero-based; the logger step is the one-based epoch.
    """
    step = epoch_number + 1
    utils.log_metrics(self.logger, epoch_summary['valid_f1'], "valid", step)
    utils.log_metrics(self.logger, epoch_summary['train_f1'], "train", step)
    self.logger.scalar_summary("train loss_avg", epoch_summary['train_loss_avg'], step)
    self.logger.scalar_summary("valid_accuracy", epoch_summary['valid_accuracy'], step)
    self.logger.scalar_summary("train_accuracy", epoch_summary['train_accuracy'], step)
    self.logger.scalar_summary("valid_f1_avg", epoch_summary['valid_f1_avg'], step)
    self.logger.scalar_summary("train_f1_avg", epoch_summary['train_f1_avg'], step)
def fit(model, optimizer, scheduler, criterion, train_loader, val_loader,
        start_epoch=0, end_epoch=24):
    """Train for epochs [start_epoch, end_epoch), evaluating after each epoch.

    Returns the list of per-epoch metric dicts; each entry also records the
    train loss, epoch index, wall-clock time and current learning rate.
    If `scheduler` is given, it is stepped on the epoch's train loss.
    """
    metrics = []
    for epoch in range(start_epoch, end_epoch):
        start_time = time()
        train_loss = train(model, optimizer, criterion, train_loader)
        end_time = time()

        epoch_metrics, _ = evaluate(model, criterion, val_loader)
        epoch_metrics['train_loss'] = train_loss
        epoch_metrics['epoch'] = epoch
        epoch_metrics['time'] = end_time - start_time
        epoch_metrics['lr'] = optimizer.param_groups[0]["lr"]
        metrics.append(epoch_metrics)
        log_metrics(epoch_metrics, TRAIN_LOG)

        # fixed: compare against None with `is not None` (PEP 8), not `!=`;
        # also dropped the dead `epoch_metrics = {}` store that evaluate()
        # immediately overwrote
        if scheduler is not None:
            scheduler.step(epoch_metrics['train_loss'])
    return metrics
def train_one_epoch(epoch, experiment_id):
    """Train for one epoch over `train_dataloader`, then validate and maybe checkpoint.

    Relies on module-level `train_dataloader`, `scheduler`, `trainer`, `writer`
    and `config` — presumably configured by the surrounding script (confirm).
    """
    epoch_start_time = time.time()
    running_loss = 0
    running_labels = []
    running_predicted = []
    running_samples = 0

    for timestep, example in tqdm(enumerate(train_dataloader)):
        loss, pred, labels = forwardbackwardpass(example)
        # step the LR scheduler every 5 batches
        if timestep % 5 == 0:
            scheduler.step()
        running_loss += loss.item()
        running_samples += example[0].shape[0]

    # NOTE(review): running_predicted/running_labels are never appended to, so
    # the metric computation sees empty lists — confirm this is intended.
    blue, loss = log_metrics(writer, epoch, running_predicted, running_labels,
                             'train', config, running_loss, running_samples)

    trainer.eval()
    with torch.no_grad():
        run_val(epoch)

    if epoch % config['model_save_frequency_in_epochs'] == 0:
        save_model(epoch, loss, experiment_id)
    print("time taken for the epoch is ", time.time() - epoch_start_time)
def train(model, triples, ent_num):
    """KGE training loop with periodic validation, LR decay and best-checkpoint tracking.

    Validation/test results come back from test_step as a 9-tuple: overall
    metrics plus one dict per (head/tail) x (1-1/1-M/M-1/M-M) relation type.
    """
    logging.info("Start Training...")
    logging.info("batch_size = %d" % config.batch_size)
    logging.info("dim = %d" % config.ent_dim)
    logging.info("gamma = %f" % config.gamma)

    current_lr = config.learning_rate
    train_triples, valid_triples, test_triples = triples
    all_true_triples = train_triples + valid_triples + test_triples
    rtp = rel_type(train_triples)
    optimizer = get_optim("Adam", model, current_lr)
    train_iterator = train_data_iterator(train_triples, ent_num)

    if config.init_checkpoint:
        logging.info("Loading checkpoint...")
        checkpoint = torch.load(os.path.join(config.save_path, "checkpoint"))
        init_step = checkpoint["step"] + 1
        model.load_state_dict(checkpoint["model_state_dict"])
        if config.use_old_optimizer:
            current_lr = checkpoint["current_lr"]
            optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
    else:
        init_step = 1

    max_hit1 = 0.0
    max_mrr = 0.0
    training_logs = []

    # section banners shared by the valid/test reports, in the order of the
    # per-type dicts returned by test_step
    banners = [
        "-----------Prediction Head... 1-1 -------------",
        "-----------Prediction Head... 1-M -------------",
        "-----------Prediction Head... M-1 -------------",
        "-----------Prediction Head... M-M -------------",
        "-----------Prediction Tail... 1-1 -------------",
        "-----------Prediction Tail... 1-M -------------",
        "-----------Prediction Tail... M-1 -------------",
        "-----------Prediction Tail... M-M -------------",
    ]

    # Training Loop
    for step in range(init_step, config.max_step):
        training_logs.append(train_step(model, optimizer, next(train_iterator)))

        # log running averages
        if step % config.log_step == 0:
            averaged = {
                key: sum(entry[key] for entry in training_logs) / len(training_logs)
                for key in training_logs[0]
            }
            log_metrics("Training average", step, averaged)
            training_logs = []

        # valid
        if step % config.valid_step == 0:
            logging.info("---------------Evaluating on Valid Dataset---------------")
            overall, *by_type = test_step(model, valid_triples, all_true_triples,
                                          ent_num, rtp)
            logging.info("----------------Overall Results----------------")
            log_metrics("Valid", step, overall)
            for banner, section in zip(banners, by_type):
                logging.info(banner)
                log_metrics("Valid", step, section)

            if overall["HITS@1"] >= max_hit1 or overall["MRR"] >= max_mrr:
                # new best on at least one criterion: checkpoint
                max_hit1 = max(max_hit1, overall["HITS@1"])
                max_mrr = max(max_mrr, overall["MRR"])
                save_model(model, optimizer,
                           {"step": step, "current_lr": current_lr})
            elif current_lr > 0.0000011:
                # no improvement: decay the learning rate until the floor
                current_lr *= 0.1
                logging.info("Change learning_rate to %f at step %d" % (current_lr, step))
                optimizer = get_optim("Adam", model, current_lr)
            else:
                logging.info("-------------------Training End-------------------")
                break

    # best state
    checkpoint = torch.load(os.path.join(config.save_path, "checkpoint"))
    model.load_state_dict(checkpoint["model_state_dict"])
    step = checkpoint["step"]
    logging.info("-----------------Evaluating on Test Dataset-------------------")
    overall, *by_type = test_step(model, test_triples, all_true_triples, ent_num, rtp)
    logging.info("----------------Overall Results----------------")
    log_metrics("Test", step, overall)
    for banner, section in zip(banners, by_type):
        logging.info(banner)
        log_metrics("Test", step, section)
def train(model, triples, ent_num):
    """Training loop with scheduled LR decay, relation-pattern evaluation and a final test.

    test_step(..., True) returns a list: overall metrics first, then one dict
    per (head/tail) x (1-1/1-M/M-1/M-M) relation type.
    """
    logging.info("Start Training...")
    logging.info("batch_size = %d" % config.batch_size)
    logging.info("dim = %d" % config.ent_dim)
    logging.info("gamma = %f" % config.gamma)

    current_lr = config.learning_rate
    (train_triples, valid_triples, test_triples,
     symmetry_test, inversion_test, composition_test, others_test) = triples
    all_true_triples = train_triples + valid_triples + test_triples
    r_tp = rel_type(train_triples)
    optimizer = get_optim("Adam", model, current_lr)

    if config.init_checkpoint:
        logging.info("Loading checkpoint...")
        checkpoint = torch.load(os.path.join(config.save_path, "checkpoint"),
                                map_location=torch.device("cuda:0"))
        init_step = checkpoint["step"] + 1
        model.load_state_dict(checkpoint["model_state_dict"])
        if config.use_old_optimizer:
            current_lr = checkpoint["current_lr"]
            optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
    else:
        init_step = 1

    true_all_heads, true_all_tails = get_true_ents(all_true_triples)
    train_iterator = train_data_iterator(train_triples, ent_num)
    test_data_list = test_data_sets(valid_triples, true_all_heads, true_all_tails,
                                    ent_num, r_tp)

    max_mrr = 0.0
    training_logs = []
    modes = ["Prediction Head", "Prediction Tail"]
    rtps = ["1-1", "1-M", "M-1", "M-M"]

    # Training Loop
    for step in range(init_step, config.max_step + 1):
        training_logs.append(train_step(model, optimizer, next(train_iterator)))

        # log running averages
        if step % config.log_step == 0:
            averaged = {
                key: sum(entry[key] for entry in training_logs) / len(training_logs)
                for key in training_logs[0]
            }
            log_metrics("Training", step, averaged)
            training_logs.clear()

        # valid
        if step % config.valid_step == 0:
            logging.info("-" * 10 + "Evaluating on Valid Dataset" + "-" * 10)
            metrics = test_step(model, test_data_list, True)
            log_metrics("Valid", step, metrics[0])
            cnt_mode_rtp = 1
            for mode in modes:
                for rtp in rtps:
                    logging.info("-" * 10 + mode + "..." + rtp + "-" * 10)
                    log_metrics("Valid", step, metrics[cnt_mode_rtp])
                    cnt_mode_rtp += 1

            if metrics[0]["MRR"] >= max_mrr:
                max_mrr = metrics[0]["MRR"]
                save_model(model, optimizer,
                           {"step": step, "current_lr": current_lr})

        # NOTE(review): float-division equality — this only fires when
        # step/max_step is exactly 0.2, 0.5 or 0.8; confirm intended.
        if step / config.max_step in [0.2, 0.5, 0.8]:
            current_lr *= 0.1
            logging.info("Change learning_rate to %f at step %d" % (current_lr, step))
            optimizer = get_optim("Adam", model, current_lr)

    # load best state
    checkpoint = torch.load(os.path.join(config.save_path, "checkpoint"))
    model.load_state_dict(checkpoint["model_state_dict"])
    step = checkpoint["step"]

    # relation patterns
    pattern_sets = [symmetry_test, inversion_test, composition_test, others_test]
    pattern_names = ["Symmetry", "Inversion", "Composition", "Other"]
    for dataset, dataset_str in zip(pattern_sets, pattern_names):
        if len(dataset) == 0:
            continue
        test_data_list = test_data_sets(dataset, true_all_heads, true_all_tails,
                                        ent_num, r_tp)
        logging.info("-" * 10 + "Evaluating on " + dataset_str + " Dataset" + "-" * 10)
        log_metrics("Valid", step, test_step(model, test_data_list))

    # finally test
    test_data_list = test_data_sets(test_triples, true_all_heads, true_all_tails,
                                    ent_num, r_tp)
    logging.info("----------Evaluating on Test Dataset----------")
    metrics = test_step(model, test_data_list, True)
    log_metrics("Test", step, metrics[0])
    cnt_mode_rtp = 1
    for mode in modes:
        for rtp in rtps:
            logging.info("-" * 10 + mode + "..." + rtp + "-" * 10)
            log_metrics("Test", step, metrics[cnt_mode_rtp])
            cnt_mode_rtp += 1
def trainepoch(epoch):
    """Train the NLI model for one epoch on a fresh shuffle of the training set."""
    xp.Epoch.update(1).log()
    print('\nTRAINING : Epoch ' + str(epoch))
    nli_net.train()

    # shuffle the data
    permutation = np.random.permutation(len(train['s1']))
    s1 = train['s1'][permutation]
    s2 = train['s2'][permutation]
    target = train['label'][permutation]

    # SGD learning-rate decay after the first epoch
    if epoch > 1 and params.opt == 'sgd':
        optimizer.param_groups[0]['lr'] *= params.decay
        optimizer.eta = optimizer.param_groups[0]['lr']

    xp.Timer_Train.reset()
    stats = {}
    for stidx in tqdm(range(0, len(s1), params.batch_size),
                      disable=not params.tqdm, desc='Train Epoch', leave=False):
        # prepare batch
        s1_batch, s1_len = get_batch(s1[stidx:stidx + params.batch_size], word_vec)
        s2_batch, s2_len = get_batch(s2[stidx:stidx + params.batch_size], word_vec)
        s1_batch, s2_batch = s1_batch.cuda(), s2_batch.cuda()
        tgt_batch = torch.LongTensor(target[stidx:stidx + params.batch_size]).cuda()

        # model forward
        scores = nli_net((s1_batch, s1_len), (s2_batch, s2_len))
        with set_smoothing_enabled(params.smooth_svm):
            loss = loss_fn(scores, tgt_batch)

        # backward
        optimizer.zero_grad()
        loss.backward()
        if params.opt != 'dfw':
            adapt_grad_norm(nli_net, params.max_norm)

        # necessary information for the step-size of some optimizers -> provide closure
        optimizer.step(lambda: float(loss))

        # track statistics for monitoring
        stats['loss'] = float(loss_fn(scores, tgt_batch))
        stats['acc'] = float(accuracy(scores, tgt_batch))
        stats['gamma'] = float(optimizer.gamma)
        stats['size'] = float(tgt_batch.size(0))
        update_metrics(xp, stats)

    xp.Eta.update(optimizer.eta)
    xp.Reg.update(regularization(nli_net, params.l2))
    xp.Obj_Train.update(xp.Reg.value + xp.Loss_Train.value)
    xp.Timer_Train.update()

    print('results : epoch {0} ; mean accuracy train : {1}'.format(epoch, xp.acc_train))
    print('\nEpoch: [{0}] (Train) \t'
          '({timer:.2f}s) \t'
          'Obj {obj:.3f}\t'
          'Loss {loss:.3f}\t'
          'Acc {acc:.2f}%\t'.format(int(xp.Epoch.value),
                                    timer=xp.Timer_Train.value,
                                    acc=xp.Acc_Train.value,
                                    obj=xp.Obj_Train.value,
                                    loss=xp.Loss_Train.value))
    log_metrics(xp)
def train(model, triples, entities, un_ents, un_rels, test_pairs):
    """Alternating alignment/training loop for cross-KG entity alignment.

    Periodically re-aligns entities/relations, decays the learning rate on a
    warm-up schedule, checkpoints after each alignment update, and finally
    evaluates on `test_pairs`.
    """
    logging.info("---------------Start Training---------------")
    ht_1, ht_2 = get_r_hts(triples, un_rels)
    rel_seeds = relation_seeds({}, ht_1, ht_2, un_rels)
    current_lr = config.learning_rate
    optimizer = get_optim(model, current_lr)

    if config.init_checkpoint:
        logging.info("Loading checkpoint...")
        checkpoint = torch.load(os.path.join(config.save_path, "checkpoint"))
        init_step = checkpoint["step"] + 1
        model.load_state_dict(checkpoint["model_state_dict"])
        if config.use_old_optimizer:
            current_lr = checkpoint["current_lr"]
            optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
    else:
        init_step = 1

    training_logs = []
    train_iterator = train_data_iterator(entities, new_triples(triples, rel_seeds, {}))

    # Training Loop
    for step in range(init_step, config.max_step):
        log = train_step(model, optimizer, next(train_iterator))
        training_logs.append(log)

        # log running averages
        if step % config.log_step == 0:
            metrics = {}
            for metric in training_logs[0].keys():
                metrics[metric] = sum([log[metric] for log in training_logs]) / len(training_logs)
            log_metrics("Training average", step, metrics)
            training_logs.clear()

        # warm up: staged LR decay
        if step % config.warm_up_step == 0:
            current_lr *= 0.1
            logging.info("Change learning_rate to %f at step %d" % (current_lr, step))
            optimizer = get_optim(model, current_lr)

        # periodic re-alignment of entities/relations, then checkpoint
        if step % config.update_step == 0:
            logging.info("Align entities and relations, swap parameters")
            seeds, align_e_1, align_e_2 = entity_seeds(model, un_ents)
            rel_seeds = relation_seeds(seeds, ht_1, ht_2, un_rels)
            new_entities = (entities[0] + align_e_2, entities[1] + align_e_1)
            train_iterator = train_data_iterator(
                new_entities, new_triples(triples, rel_seeds, seeds))
            save_variable_list = {
                "step": step,
                "current_lr": current_lr,
            }
            save_model(model, optimizer, save_variable_list)

    logging.info("---------------Test on test dataset---------------")
    metrics = test_step(model, test_pairs, un_ents)
    log_metrics("Test", config.max_step, metrics)
    # fixed typo in the final log message: "Taining" -> "Training"
    logging.info("---------------Training End---------------")
def _():
    """Event handler: run the evaluator over the eval loader, then emit its metrics."""
    evaluator.run(eval_dataloader, epoch_length=config.eval_epoch_length)
    log_metrics(evaluator, "eval")
optimizer = torch.optim.SGD(model.parameters(), lr=args.learningRate,
                            weight_decay=args.weightDecay)

# Learning: train + validate each epoch, checkpointing on best val AUROC.
# NOTE(review): loop structure reconstructed from a collapsed source —
# confirm the best-checkpoint block belongs inside the epoch loop.
for epoch_num in range(args.initEpochNum, args.initEpochNum + args.nEpochs):
    trn_metrics = runModel(trn_data_gen, model, optimizer, class_wts, 'trn',
                           args.batchSize, trn_num_batches, loss_wts=loss_wts)
    utils.log_metrics(epoch_num, trn_metrics, 'trn', log_file, args.savename)
    torch.save(model.state_dict(), args.savename + '.pt')

    val_metrics = runModel(val_data_gen, model, optimizer, class_wts, 'val',
                           args.batchSize, val_num_batches, None)
    utils.log_metrics(epoch_num, val_metrics, 'val', log_file, args.savename)
    if best_val_record and val_metrics.AUROC > best_val:
        best_val = utils.save_chkpt(best_val_record, best_val, val_metrics,
                                    model, args.savename)

# final test pass; epoch index 0 is used as a placeholder for the 'tst' log
tst_metrics = runModel(tst_data_gen, model, optimizer, class_wts, 'tst',
                       args.batchSize, tst_num_batches, None)
utils.log_metrics(0, tst_metrics, 'tst', log_file, args.savename)

# val_aggregator = Aggregator('val', task, val_data_loader)
# val_aggregator.aggregate()
# tst_aggregator = Aggregator('tst', task, tst_data_loader)
# tst_aggregator.aggregate()
def train_model(args):
    """Full training driver.

    Builds loaders/model/optimizer, runs train/val phases each epoch through
    the loss-type-specific forward helpers, logs epoch metrics, and
    checkpoints whenever the validation loss improves.
    """
    dataloaders = get_dataloaders(args)
    dataset_sizes = {split: len(dataloaders[split].dataset)
                     for split in ('train', 'val', 'test')}
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # set up
    model = load_model(args, device)
    loss_fn = get_loss_fn(args.dataset, args.loss_type)

    if args.train_decoders:
        parameters = (list(model.photo_decoder.parameters())
                      + list(model.sketch_decoder.parameters()))
    elif args.model in ['EmbedGAN']:
        parameters = list(model.G.parameters()) + list(model.D.parameters())
    else:
        parameters = model.parameters()

    if args.optim == 'sgd':
        optimizer = optim.SGD(parameters, lr=args.lr, weight_decay=args.wd,
                              momentum=.9, nesterov=True)
    elif args.optim == 'adam':
        optimizer = optim.Adam(parameters, lr=args.lr, weight_decay=args.wd)

    scheduler = optim.lr_scheduler.StepLR(
        optimizer, step_size=len(dataloaders['train']) // 10, gamma=.9)

    writer = SummaryWriter(args.log_dir + "/{}".format(args.name))
    save_dir = Path(args.save_dir) / ('{}'.format(args.name))
    if not save_dir.exists():
        os.mkdir(save_dir)

    best_model = None
    best_loss = float('inf')
    batch_num = 0

    for epoch in range(args.num_epochs):
        print('Epoch {}/{}'.format(epoch, args.num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                scheduler.step()
                model.train()  # Set model to training mode
            else:
                model.eval()  # Set model to evaluate mode

            epoch_metrics = defaultdict(float)

            # Iterate over data.
            for inputs, labels in dataloaders[phase]:
                # zero the parameter gradients
                optimizer.zero_grad()
                N = len(inputs)
                # converts list of tuples of images paths of length N into
                # flattened tensor of size N * args.loss_type
                inputs = load_sketchy_images(inputs, args.loss_type, device,
                                             args.img_size)
                labels = labels.to(device)

                with torch.set_grad_enabled(phase == 'train'):
                    if args.loss_type in ["vae", "vae+embed", "vae+embed+classify"]:
                        batch_metrics = vae_forward(
                            inputs, labels, model, loss_fn, writer, device,
                            batch_num, args.alpha, N, args.name,
                            modality=args.modality,
                            compare_embed=args.loss_type in ["vae+embed", "vae+embed+classify"],
                            classify=args.loss_type in ['vae+embed+classify', 'single_vae'])
                    elif args.loss_type in ["ae", "ae+embed", "ae+embed+classify"]:
                        batch_metrics = ae_forward(
                            inputs, labels, model, loss_fn, writer, device,
                            batch_num, args.alpha, N, args.name,
                            modality=args.modality,
                            compare_embed=args.loss_type in ["ae+embed", "ae+embed+classify"],
                            classify=args.loss_type in ['ae+embed+classify', 'single_ae'])
                    elif args.loss_type in ['gan']:
                        batch_metrics = gan_forward(inputs, labels, model, loss_fn,
                                                    writer, device, batch_num, N)
                    else:
                        batch_metrics = classify_contrast_forward(
                            inputs, labels, model, loss_fn, writer, device,
                            batch_num, args.alpha, args.loss_type, N)

                    # accumulate per-sample-normalized metrics over the epoch
                    for criteria_name in batch_metrics:
                        epoch_metrics[criteria_name] += (
                            batch_metrics[criteria_name] / dataset_sizes[phase])
                    loss = batch_metrics['loss']
                    del batch_metrics

                    if phase == "train":
                        batch_num += 1
                        loss.backward()
                        optimizer.step()

            epoch_loss = epoch_metrics['loss'].item()
            log_metrics(epoch_metrics, writer, phase, epoch)

            # deep copy the model on validation improvement
            if phase == 'val' and epoch_loss < best_loss:
                best_loss = epoch_loss
                now = datetime.datetime.now()
                torch.save(
                    model.state_dict(),
                    save_dir /
                    f"{now.month}{now.day}{now.hour}{now.minute}_{best_loss}")
                best_model = copy.deepcopy(model.state_dict())

    writer.close()
    now = datetime.datetime.now()
    torch.save(
        model.state_dict(),
        save_dir /
        f"end_{now.month}{now.day}{now.hour}{now.minute}_{best_loss}")

    # load best model weights
    model.load_state_dict(best_model)
    now = datetime.datetime.now()
    torch.save(model.state_dict(), save_dir / "best")
def ae_forward(inputs, labels, model, loss_fn, writer, device, batch_num,
               alpha, N, name, modality, compare_embed=False, classify=False):
    """Autoencoder forward pass for sketches and/or photos.

    Computes reconstruction loss (scaled by alpha), an optional embedding
    distance loss, an optional classification loss, and logs everything per
    batch. The first N rows of `inputs` are sketches, the next N are photos.
    """
    metrics = {}
    if modality in ['both', 'sketch']:
        sketch_rows = torch.tensor(range(0, N)).to(device)
        sketches = torch.index_select(inputs, 0, sketch_rows)
        sketch_embed, recon_sketch = model.forward(sketches, is_sketch=True)
    if modality in ['both', 'photo']:
        photo_rows = torch.tensor(range(N, 2 * N)).to(device)
        photos = torch.index_select(inputs, 0, photo_rows)
        photo_embed, recon_photo = model.forward(photos, is_sketch=False)

    # periodically dump reconstructions for visual inspection
    if batch_num >= 500 and batch_num % 500 == 0:
        if modality in ['both', 'sketch']:
            tvutils.save_image(
                recon_sketch,
                '/home/robincheong/sketch2img/generated/{}_recon_sketch_{}.png'
                .format(name, batch_num))
            tvutils.save_image(
                sketches,
                '/home/robincheong/sketch2img/generated/{}_sketches_{}.png'.
                format(name, batch_num))
        if modality in ['both', 'photo']:
            tvutils.save_image(
                recon_photo,
                '/home/robincheong/sketch2img/generated/{}_recon_photo_{}.png'.
                format(name, batch_num))
            tvutils.save_image(
                photos,
                '/home/robincheong/sketch2img/generated/{}_photos_{}.png'.
                format(name, batch_num))

    if modality in ['both', 'sketch']:
        metrics['sketch_recon_loss'] = loss_fn(recon_sketch, sketches) * alpha
    if modality in ['both', 'photo']:
        metrics['photo_recon_loss'] = loss_fn(recon_photo, photos) * alpha

    if compare_embed and modality in ['both']:
        metrics['embed_loss'] = torch.sum(
            (sketch_embed - photo_embed) ** 2 / len(sketch_embed))
    else:
        metrics['embed_loss'] = 0

    if classify:
        ce_loss = nn.CrossEntropyLoss()
        if modality == 'photo':
            metrics['classify_loss'] = ce_loss(
                model.make_predictions(photo_embed), labels) * 10
        elif modality == 'sketch':
            metrics['classify_loss'] = ce_loss(
                model.make_predictions(sketch_embed), labels) * 10
        else:
            metrics['classify_loss'] = (
                ce_loss(model.make_predictions(photo_embed), labels)
                + ce_loss(model.make_predictions(sketch_embed), labels)) * 10
    else:
        metrics['classify_loss'] = 0

    # total loss = sum of every individual '*_loss' entry
    metrics['loss'] = 0
    for key in list(metrics):
        if key.endswith('loss') and key != 'loss':
            metrics['loss'] += metrics[key]

    log_metrics(metrics, writer, "batch", batch_num)
    return metrics
def active_train(
    log_dir: str,
    model: torch.nn.Module,
    model_path: str,
    unlabeled_dataset: conlldataloader.ConllDataSetUnlabeled,
    test_dataset: conlldataloader.ConllDataSet,
    # active learning parameters
    iterations: int,
    heuritic: active_heuristic.ActiveHeuristic,
    oracle: oracle.Oracle,
    sample_size: int,
    sampling_strategy: str,  # sampling, top_k
    # train parameters
    vocab: Vocab,
    tag_vocab: Vocab,
    batch_size: int,
    shuffle: bool,
    num_workers: int,
    num_epochs: float,
    learning_rate: float,
    weight_decay: float,
    momentum: float,
    optimizer_type: str,
    device: str,
    summary_file: str,
) -> None:
    """Active-learning loop: alternately train on labeled data and query new points.

    Fixed defects: the source text was corrupted with stray
    "LOG_ANALYSIS_FILES" fragments — restored `Trainer(...)`,
    `copy.deepcopy(model)` and the `learning_rate=` keyword.
    """
    logger = Logger(
        os.path.join(log_dir, "{}/".format(model_path)),
        summary_file=summary_file,
    )

    # random sample dataset into
    train_data = []
    test_data_loader = conlldataloader.get_data_loader(
        vocab,
        tag_vocab,
        test_dataset,
        1,      # batch_size
        False,  # no shuffle
        1,      # 1 worker
    )

    start_model = copy.deepcopy(model)
    iteration_samples = [1, 5, 10, 25, 50, 100]
    labeled_indexes = []
    for i, sample_size in enumerate(iteration_samples):
        if len(train_data) > 0:
            trainer = Trainer(
                model=copy.deepcopy(model),
                learning_rate=learning_rate,
                weight_decay=weight_decay,
                momentum=momentum,
                optimizer_type=optimizer_type,
                vocab=vocab,
                tags=tag_vocab,
                batch_size=batch_size,
                shuffle=shuffle,
                num_workers=num_workers,
                train_dataset=train_data,
                test_dataset=test_dataset,
                logger=logger,
                device=device,
                verbose_print=True,
                verbose_log=True,
                train_label_fn=lambda data, index: (data[index]))
            train_data_loader = conlldataloader.get_data_loader(
                vocab,
                tag_vocab,
                train_data,
                batch_size,
                shuffle,
                num_workers,
                label_fn=lambda data, index: (data[index]),
            )
            trainer.train(num_epochs)
            model = trainer.get_best_model()
            ts = len(train_data)
            if constants.LOG_ANALYSIS_FILES:
                # dump per-example prediction analyses for offline inspection
                with open(f'help/{model_path}_analyze_test_{ts}.txt', 'w') as log_file:
                    with open(f'help/{model_path}_analyze_test_{ts}.csv', 'w') as csv_file:
                        utils.analyze_predictions(
                            trainer.get_best_model(),
                            test_data_loader,
                            vocab,
                            tag_vocab,
                            log_file,
                            csv_file,
                            device,
                        )
                with open(f'help/{model_path}_analyze_train_{ts}.txt', 'w') as log_file:
                    with open(f'help/{model_path}_analyze_train_{ts}.csv', 'w') as csv_file:
                        utils.analyze_predictions(
                            trainer.get_best_model(),
                            train_data_loader,
                            vocab,
                            tag_vocab,
                            log_file,
                            csv_file,
                            device,
                        )

        if (i == len(iteration_samples) - 1):
            break

        # compute valid metrics
        f1_data, acc = utils.compute_f1_dataloader(model, test_data_loader,
                                                   tag_vocab, device=device)
        f1_avg_valid = utils.compute_avg_f1(f1_data)

        # log valid metics
        logger.scalar_summary("active valid f1", f1_avg_valid, len(train_data))
        logger.scalar_summary("active valid accuracy", acc, len(train_data))
        utils.log_metrics(logger, f1_data, "active valid", len(train_data))

        # select new points from distribution
        if isinstance(heuritic, active_heuristic.KNNEmbeddings):
            distribution = heuritic.evaluate_with_labeled(
                model=model,
                dataset=unlabeled_dataset,
                labeled_indexes=labeled_indexes,
                labeled_points=train_data,
                device=device)
        else:
            distribution = heuritic.evaluate(model, unlabeled_dataset, device)

        new_points = []
        sample_size = min(sample_size, len(distribution) - 1)
        if sampling_strategy == constants.ACTIVE_LEARNING_SAMPLE:
            new_points = torch.multinomial(distribution, sample_size)
        elif sampling_strategy == constants.ACTIVE_LEARNING_TOP_K:
            new_points = sorted(range(len(distribution)),
                                reverse=True,
                                key=lambda ind: distribution[ind])
            new_points = new_points[:sample_size]

        # use new points to augment train_dataset
        # remove points from unlabaled corpus
        query = [unlabeled_dataset.data[ind] for ind in new_points]
        labeled_indexes.extend(ind for (ind, _) in query)
        outputs = [oracle.get_label(q) for q in query]

        # move unlabeled points to labeled points
        [unlabeled_dataset.remove(q) for q in query]
        train_data.extend(outputs)
    logger.flush()
def _():
    """Event handler: run the evaluator over the test loader, then emit its metrics."""
    evaluator.run(test_loader, epoch_length=config.eval_epoch_length)
    log_metrics(evaluator, tag="eval")