def _attach_losses(engine: Engine, prefix: str = "", running_average: bool = False):
    """Attach 'loss-peaks' and 'loss-enrichment' metrics to `engine`.

    The engine's per-iteration output dict is expected to hold a loss tensor
    under each of those keys; each metric simply averages that tensor.  When
    `running_average` is True, a RunningAverage companion is attached under
    the same name with an extra 'ra-' infix.  All names are prefixed with
    `prefix`.
    """
    def _make_loss_metric(key):
        # Feed the stored loss tensor as both "prediction" and "target";
        # the loss_fn ignores the second slot and reduces with .mean().
        return Loss(output_transform=lambda out, k=key: (out[k], out[k]),
                    loss_fn=lambda *args: args[0].mean())

    metrics = {}
    for key in ('loss-peaks', 'loss-enrichment'):
        metric = _make_loss_metric(key)
        metric.attach(engine, prefix + key)
        metrics[key] = metric

    if running_average:
        for key, metric in metrics.items():
            RunningAverage(metric).attach(engine, prefix + 'ra-' + key)
def create_basic_evaluator(model, device, beta=1, kl_loss=kl_loss, recon_loss=recon_loss, **kwargs):
    """Build an ignite evaluator for a VAE-style model.

    Each batch is moved to `device` and passed through `model`, which must
    return `(x_recon, logstd_noise, mu_z, logstd_z)`.  Three metrics are
    registered: 'kl_loss' (on mu_z/logstd_z), 'recon_loss' (on input vs.
    reconstruction plus logstd_noise), and 'elbo_loss' = recon + beta * kl.
    Extra keyword arguments are accepted and ignored.
    """
    def _eval_step(engine, batch):
        model.eval()
        with torch.no_grad():
            inputs = batch.to(device)
            recon, logstd_noise, mu_z, logstd_z = model(inputs)
        extras = {
            'logstd_noise': logstd_noise,
            'mu_z': mu_z,
            'logstd_z': logstd_z
        }
        return inputs, recon, extras

    evaluator = Engine(_eval_step)

    # Registering metrics: each pulls what it needs out of the step output.
    kl_metric = Loss(
        kl_loss,
        output_transform=lambda out: (out[2]['mu_z'], out[2]['logstd_z']))
    recon_metric = Loss(
        recon_loss,
        output_transform=lambda out: (out[0], out[1],
                                      {'logstd_noise': out[2]['logstd_noise']}))
    kl_metric.attach(evaluator, 'kl_loss')
    recon_metric.attach(evaluator, 'recon_loss')
    # Metric arithmetic yields a MetricsLambda: ELBO = recon + beta * KL.
    elbo_metric = recon_metric + beta * kl_metric
    elbo_metric.attach(evaluator, 'elbo_loss')
    return evaluator
def train_loop(model, params, ds, min_y, base_data, model_id, device, batch_size, max_epochs=2):
    """Train `model` with SGD, validating and checkpointing as it goes.

    Args:
        model: torch module, optimized in place.
        params: dict with 'lr', 'momentum' and 'l2_wd' hyper-parameters.
        ds: (train_loader, valid_loader) pair of DataLoaders.
        min_y: (train_offset, valid_offset) subtracted from raw targets so
            labels start at 0.
        base_data: root directory for summaries and checkpoints.
        model_id: run name; output goes to base_data/model_id.
        batch_size: used when re-evaluating the current batch for logging.
        max_epochs: number of passes over the training data.
    """
    ds_train, ds_valid = ds
    min_y_train, min_y_val = min_y
    with create_summary_writer(model, ds_train, base_data, model_id,
                               device=device) as writer:
        lr = params['lr']
        mom = params['momentum']
        wd = params['l2_wd']
        optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=mom,
                                    weight_decay=wd)
        # Default mode is 'min': the scheduler must be stepped with a value
        # that should DECREASE (the validation loss; see lr_scheduler below).
        sched = ReduceLROnPlateau(optimizer, factor=0.5, patience=5)
        # Use the criterion directly instead of reaching into the private
        # Loss()._loss_fn attribute.
        loss = F.cross_entropy
        acc_metric = Accuracy(device=device)
        loss_metric = Loss(F.cross_entropy, device=device)
        acc_val_metric = Accuracy(device=device)
        loss_val_metric = Loss(F.cross_entropy, device=device)

        def train_step(engine, batch):
            # One SGD step on one batch; returns the scalar batch loss.
            model.train()
            x, y = batch
            x = x.to(device)
            y = y.to(device) - min_y_train
            ans = model.forward(x)
            l = loss(ans, y)
            optimizer.zero_grad()
            l.backward()
            optimizer.step()
            return l.item()

        trainer = Engine(train_step)

        def train_eval_step(engine, batch):
            # Deterministic re-evaluation of a batch (training statistics).
            model.eval()
            with torch.no_grad():
                x, y = batch
                x = x.to(device)
                y = y.to(device) - min_y_train
                ans = model.forward(x)
                return ans, y

        train_evaluator = Engine(train_eval_step)
        acc_metric.attach(train_evaluator, "accuracy")
        loss_metric.attach(train_evaluator, 'loss')

        def validation_step(engine, batch):
            model.eval()
            with torch.no_grad():
                x, y = batch
                x = x.to(device)
                y = y.to(device) - min_y_val
                ans = model.forward(x)
                return ans, y

        valid_evaluator = Engine(validation_step)
        acc_val_metric.attach(valid_evaluator, "accuracy")
        loss_val_metric.attach(valid_evaluator, 'loss')

        @trainer.on(Events.EPOCH_COMPLETED)
        def log_validation_results(engine):
            valid_evaluator.run(ds_valid)
            metrics = valid_evaluator.state.metrics
            valid_avg_accuracy = metrics['accuracy']
            avg_nll = metrics['loss']
            print(
                "Validation Results - Epoch: {} Avg accuracy: {:.2f} Avg loss: {:.2f}"
                .format(engine.state.epoch, valid_avg_accuracy, avg_nll))
            writer.add_scalar("validation/avg_loss", avg_nll,
                              engine.state.epoch)
            writer.add_scalar("validation/avg_accuracy", valid_avg_accuracy,
                              engine.state.epoch)
            writer.add_scalar("validation/avg_error", 1. - valid_avg_accuracy,
                              engine.state.epoch)

        @trainer.on(Events.EPOCH_COMPLETED)
        def lr_scheduler(engine):
            # BUGFIX: this used to step the (mode='min') scheduler with the
            # validation *accuracy*, cutting the LR whenever accuracy failed
            # to go DOWN.  Step with the validation loss instead.
            metrics = valid_evaluator.state.metrics
            sched.step(metrics['loss'])

        @trainer.on(Events.ITERATION_COMPLETED(every=100))
        def log_training_loss(engine):
            # Re-run the metrics on the current batch for a cheap signal.
            batch = engine.state.batch
            batch_loader = DataLoader(TensorDataset(*batch),
                                      batch_size=batch_size)
            train_evaluator.run(batch_loader)
            metrics = train_evaluator.state.metrics
            accuracy = metrics['accuracy']
            nll = metrics['loss']
            it = (engine.state.iteration - 1) % len(ds_train) + 1
            if (it % 100) == 0:
                print("Epoch[{}] Iter[{}/{}] Accuracy: {:.2f} Loss: {:.2f}".
                      format(engine.state.epoch, it, len(ds_train), accuracy,
                             nll))
            # BUGFIX: detloss was logged against the epoch while its sibling
            # scalars use the iteration; use a consistent x-axis.
            writer.add_scalar("batchtraining/detloss", nll,
                              engine.state.iteration)
            writer.add_scalar("batchtraining/accuracy", accuracy,
                              engine.state.iteration)
            writer.add_scalar("batchtraining/error", 1. - accuracy,
                              engine.state.iteration)
            writer.add_scalar("batchtraining/loss", engine.state.output,
                              engine.state.iteration)

        @trainer.on(Events.EPOCH_COMPLETED)
        def log_lr(engine):
            writer.add_scalar("lr", optimizer.param_groups[0]['lr'],
                              engine.state.epoch)

        def validation_value(engine):
            # Checkpoint score: last recorded validation accuracy.  Guard
            # against the iteration-based checkpoint firing before the first
            # validation run has populated the evaluator state.
            state = valid_evaluator.state
            if state is None or 'accuracy' not in getattr(state, 'metrics', {}):
                return 0.0
            return state.metrics['accuracy']

        to_save = {'model': model}
        handler = Checkpoint(
            to_save,
            DiskSaver(os.path.join(base_data, model_id), create_dir=True),
            score_function=validation_value,
            score_name="val_acc",
            global_step_transform=global_step_from_engine(trainer),
            n_saved=None)

        # kick everything off
        trainer.add_event_handler(
            Events.ITERATION_COMPLETED(every=200 * 5000 // batch_size // 5),
            handler)
        trainer.run(ds_train, max_epochs=max_epochs)
def adv_prune_train_loop(model, params, ds, dset, min_y, base_data, model_id,
                         prune_type, device, batch_size, tpa, max_epochs=5):
    """Prune `model`'s first 22 conv layers by fraction `tpa`, then fine-tune.

    Args:
        model: ResNet-style module exposing conv1 and layer1..layer4 of
            bottleneck blocks (each with conv1/conv2/conv3).
        params: dict with 'lr', 'momentum' and 'l2_wd'.
        ds: (train_loader, valid_loader) pair.
        dset: (train_set, valid_set) underlying datasets.
        min_y: (train_offset, valid_offset) label offsets.
        prune_type: 'global_unstructured' for global L1 magnitude pruning;
            anything else falls through to per-layer structured pruning.
        tpa: total prune amount (fraction of weights to remove).
    """
    total_prune_amount = tpa
    ds_train, ds_valid = ds
    train_set, valid_set = dset
    min_y_train, min_y_val = min_y
    # Frozen copy kept around (originally used to generate adversarial
    # examples; the attack code is currently disabled).
    original_model = copy.deepcopy(model)
    original_model.eval()
    model_id = f'{model_id}_{prune_type}_pruning_{tpa}_l1'
    valid_freq = 200 * 500 // batch_size // 3

    # Stem conv plus the bottleneck convs, truncated to the first 22 layers.
    conv_layers = [model.conv1]
    for sequential in [model.layer1, model.layer2, model.layer3, model.layer4]:
        for bottleneck in sequential:
            conv_layers.extend(
                [bottleneck.conv1, bottleneck.conv2, bottleneck.conv3])
    conv_layers = conv_layers[:22]

    def prune_model(model):
        print(f'pruned model by {total_prune_amount}')
        if prune_type == 'global_unstructured':
            parameters_to_prune = [(layer, 'weight') for layer in conv_layers]
            prune.global_unstructured(
                parameters_to_prune,
                pruning_method=prune.L1Unstructured,
                amount=total_prune_amount,
            )
        else:
            # BUGFIX: this branch used to call prune_model() for every layer,
            # recursing forever.  Reconstructed as per-layer L1 structured
            # pruning over output channels (matches the '_l1' run suffix) --
            # TODO(review): confirm the intended structured parameters.
            for layer in conv_layers:
                prune.ln_structured(layer, name='weight',
                                    amount=total_prune_amount, n=1, dim=0)

    # Prune once up front so the `weight_mask` buffers used in train_step exist.
    prune_model(model)

    def valid_eval(model, dataset, dataloader, device, label):
        # Plain top-1 accuracy over `dataloader`; `label` is the offset
        # subtracted from raw targets.
        right = 0
        total = 0
        model.eval()
        with torch.no_grad():
            for i, data in tqdm(enumerate(dataloader),
                                total=len(dataset) / dataloader.batch_size):
                data, y = data
                data = data.to(device)
                y = y.to(device) - label
                ans = model.forward(data)
                right += torch.sum(torch.eq(torch.argmax(ans, dim=1), y))
                total += y.shape[0]
        return right / total

    valid_acc = valid_eval(model, valid_set, ds_valid, device, min_y_val)
    print('initial accuracy:', valid_acc.item())

    with create_summary_writer(model, ds_train, base_data, model_id,
                               device=device) as writer:
        lr = params['lr']
        mom = params['momentum']
        wd = params['l2_wd']
        optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=mom,
                                    weight_decay=wd)
        # mode='min' (default): stepped with the validation loss below.
        sched = ReduceLROnPlateau(optimizer, factor=0.5, patience=5)
        loss = F.cross_entropy
        acc_metric = Accuracy(device=device)
        loss_metric = Loss(F.cross_entropy, device=device)
        acc_val_metric = Accuracy(device=device)
        loss_val_metric = Loss(F.cross_entropy, device=device)

        def train_step(engine, batch):
            model.train()
            x, y = batch
            x = x.to(device)
            y = y.to(device) - min_y_train
            ans = model.forward(x)
            l = loss(ans, y)
            optimizer.zero_grad()
            l.backward()
            optimizer.step()
            # Keep pruned weights at zero after the gradient update.
            # NOTE(review): with torch.nn.utils.prune the trained parameter
            # is `weight_orig`; multiplying the materialized `weight` keeps
            # the visible tensor masked -- confirm this is intended.
            with torch.no_grad():
                for layer in conv_layers:
                    layer.weight *= layer.weight_mask
            return l.item()

        trainer = Engine(train_step)

        def train_eval_step(engine, batch):
            model.eval()
            x, y = batch
            x = x.to(device)
            y = y.to(device) - min_y_train
            with torch.no_grad():
                ans = model.forward(x)
            return ans, y

        train_evaluator = Engine(train_eval_step)
        acc_metric.attach(train_evaluator, "accuracy")
        loss_metric.attach(train_evaluator, 'loss')

        def validation_step(engine, batch):
            model.eval()
            x, y = batch
            x = x.to(device)
            y = y.to(device) - min_y_val
            with torch.no_grad():
                ans = model.forward(x)
            return ans, y

        valid_evaluator = Engine(validation_step)
        acc_val_metric.attach(valid_evaluator, "accuracy")
        loss_val_metric.attach(valid_evaluator, 'loss')

        @trainer.on(Events.ITERATION_COMPLETED(every=valid_freq))
        def log_validation_results(engine):
            valid_evaluator.run(ds_valid)
            metrics = valid_evaluator.state.metrics
            valid_avg_accuracy = metrics['accuracy']
            avg_nll = metrics['loss']
            print(
                "Validation Results - Epoch: {} Avg accuracy: {:.2f} Avg loss: {:.2f}"
                .format(engine.state.epoch, valid_avg_accuracy, avg_nll))
            writer.add_scalar("validation/avg_loss", avg_nll,
                              engine.state.epoch)
            writer.add_scalar("validation/avg_accuracy", valid_avg_accuracy,
                              engine.state.epoch)
            writer.add_scalar("validation/avg_error", 1. - valid_avg_accuracy,
                              engine.state.epoch)

        @trainer.on(Events.EPOCH_COMPLETED)
        def lr_scheduler(engine):
            # BUGFIX: previously stepped with accuracy although the scheduler
            # is in 'min' mode; use the validation loss.
            metrics = valid_evaluator.state.metrics
            sched.step(metrics['loss'])

        @trainer.on(Events.ITERATION_COMPLETED(every=100))
        def log_training_loss(engine):
            batch = engine.state.batch
            batch_loader = DataLoader(TensorDataset(*batch),
                                      batch_size=batch_size)
            train_evaluator.run(batch_loader)
            metrics = train_evaluator.state.metrics
            accuracy = metrics['accuracy']
            nll = metrics['loss']
            it = (engine.state.iteration - 1) % len(ds_train) + 1
            if (it % 50) == 0:
                print("Epoch[{}] Iter[{}/{}] Accuracy: {:.2f} Loss: {:.2f}"
                      .format(engine.state.epoch, it, len(ds_train), accuracy,
                              nll))
            writer.add_scalar("batchtraining/detloss", nll,
                              engine.state.iteration)
            writer.add_scalar("batchtraining/accuracy", accuracy,
                              engine.state.iteration)
            writer.add_scalar("batchtraining/error", 1. - accuracy,
                              engine.state.iteration)
            writer.add_scalar("batchtraining/loss", engine.state.output,
                              engine.state.iteration)

        @trainer.on(Events.EPOCH_COMPLETED)
        def log_lr(engine):
            writer.add_scalar("lr", optimizer.param_groups[0]['lr'],
                              engine.state.epoch)

        def validation_value(engine):
            # Checkpoint score: latest validation accuracy (0 before the
            # first validation pass has run).
            state = valid_evaluator.state
            if state is None or 'accuracy' not in getattr(state, 'metrics', {}):
                return 0.0
            return state.metrics['accuracy']

        to_save = {'model': model}
        handler = Checkpoint(
            to_save,
            DiskSaver(os.path.join(base_data, model_id), create_dir=True),
            score_function=validation_value,
            score_name="val_acc",
            global_step_transform=global_step_from_engine(trainer),
            n_saved=None)

        # kick everything off
        trainer.add_event_handler(Events.ITERATION_COMPLETED(every=valid_freq),
                                  handler)
        trainer.run(ds_train, max_epochs=max_epochs)
def train(
    trn_path: Path,
    save_dir: Path,
    dev_path: Optional[Path] = None,
    vocab_path: Optional[Path] = None,
    encoding: str = 'utf8',
    lr: float = 1e-3,
    max_epochs: int = 50,
    batch_size: int = 16,
    patience: int = 5,
    numeric: bool = False,
    device: Optional[str] = None,
) -> None:
    """Train a character-aware language model and save all artifacts.

    Reads (or restores) train/dev samples and a vocabulary, numericalizes
    them unless `numeric` is set, pickles vocab + samples into `save_dir`,
    builds the LM, and runs an ignite training loop with per-epoch
    evaluation, checkpointing and (when a dev set exists) early stopping.
    """
    logging.info('Creating save directory if not exist in %s', save_dir)
    # BUGFIX: a bare mkdir() raised FileExistsError on re-runs, contradicting
    # the log message above.
    save_dir.mkdir(exist_ok=True)

    ### Read/create/load samples and vocab
    trn_samples = read_or_load_samples(trn_path, encoding=encoding)
    vocab = create_or_load_vocab(trn_samples, path=vocab_path)
    dev_samples = None
    if dev_path is not None:
        dev_samples = read_or_load_samples(dev_path, encoding=encoding,
                                           name='dev')

    ### Numericalize samples
    if not numeric:
        logging.info('Numericalizing train samples')
        trn_samples = list(vocab.apply_to(trn_samples))
        if dev_samples is not None:
            logging.info('Numericalizing dev samples')
            dev_samples = list(vocab.apply_to(dev_samples))

    ### Save vocab and samples
    # Explicit (name, object) pairs: the dev pickle is only written when a
    # dev set exists (the old zip() relied on silent truncation).
    to_pickle = [('vocab.pkl', vocab), ('train-samples.pkl', trn_samples)]
    if dev_samples is not None:
        to_pickle.append(('dev-samples.pkl', dev_samples))
    for fname, obj in to_pickle:
        save_path = save_dir / fname
        logging.info('Saving to %s', save_path)
        with open(save_path, 'wb') as f:
            pickle.dump(obj, f)

    ### Create model, optimizer, and loss fn
    logging.info('Creating language model')
    padding_idx = vocab['words']['<pad>']
    # NOTE(review): assumes get_max_filter_width tolerates dev_samples=None.
    max_width = get_max_filter_width([trn_samples, dev_samples])
    model = create_lm(
        len(vocab['words']),
        len(vocab['chars']),
        padding_idx=padding_idx,
        filter_widths=list(range(1, max_width)),
    )
    logging.info('Model created with %d parameters',
                 sum(p.numel() for p in model.parameters()))
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    loss_fn = LMLoss(padding_idx=padding_idx)

    ### Save model metadata
    metadata_path = save_dir / 'metadata.yml'
    logging.info('Saving model metadata to %s', metadata_path)
    metadata_path.write_text(dump(model), encoding='utf8')

    ### Prepare engines
    def batch2tensors(
        batch: Batch,
        device: Optional[str] = None,
        non_blocking: Optional[bool] = None,
    ) -> Tuple[dict, torch.LongTensor]:
        # Next-word LM framing: inputs are every token but the last; targets
        # are the word sequence shifted one position left.
        arr = batch.to_array(pad_with=padding_idx)
        tsr = {
            k: torch.from_numpy(v).to(device=device)
            for k, v in arr.items()
        }
        words = tsr['words'][:, :-1].contiguous()
        chars = tsr['chars'][:, :-1, :].contiguous()
        targets = tsr['words'][:, 1:].contiguous()
        return {'words': words, 'chars': chars}, targets

    trainer = create_supervised_trainer(model, optimizer, loss_fn,
                                        device=device,
                                        prepare_batch=batch2tensors)
    trn_evaluator = create_supervised_evaluator(model, device=device,
                                                prepare_batch=batch2tensors)
    dev_evaluator = create_supervised_evaluator(model, device=device,
                                                prepare_batch=batch2tensors)

    ### Attach metrics
    # Average loss per non-padding token; perplexity is its exponential.
    loss = Loss(loss_fn,
                batch_size=lambda tgt: (tgt != padding_idx).long().sum().item())
    ppl = MetricsLambda(math.exp, loss)
    loss.attach(trn_evaluator, 'loss')
    loss.attach(dev_evaluator, 'loss')
    ppl.attach(trn_evaluator, 'ppl')
    ppl.attach(dev_evaluator, 'ppl')

    ### Attach timers
    epoch_timer = Timer()
    epoch_timer.attach(trainer, start=Events.EPOCH_STARTED,
                       pause=Events.EPOCH_COMPLETED)

    ### Attach progress bars
    trn_pbar = ProgressBar(bar_format=None, unit='batch', desc='Training')
    trn_pbar.attach(trainer, output_transform=lambda loss: {
        'loss': loss,
        'ppl': math.exp(loss)
    })
    eval_pbar = ProgressBar(bar_format=None, unit='sent', desc='Evaluating')
    eval_pbar.attach(trn_evaluator)
    eval_pbar.attach(dev_evaluator)

    ### Attach checkpointers
    if dev_samples is None:
        # No dev set: checkpoint unconditionally every epoch.
        ckptr_kwargs: dict = {'save_interval': 1, 'n_saved': 5}
        ckptr_engine = trainer
    else:
        # Keep the checkpoints with the best (lowest) dev perplexity.
        ckptr_kwargs = {
            'score_function': lambda eng: -eng.state.metrics['ppl'],
            'score_name': 'dev_ppl'
        }
        ckptr_engine = dev_evaluator
    ckptr = ModelCheckpoint(str(save_dir / 'checkpoints'), 'ckpt',
                            save_as_state_dict=True, **ckptr_kwargs)
    ckptr_engine.add_event_handler(Events.EPOCH_COMPLETED, ckptr, {
        'model': model,
        'optimizer': optimizer
    })

    ### Attach early stopper
    if dev_samples is not None:
        early_stopper = EarlyStopping(patience,
                                      lambda eng: -eng.state.metrics['ppl'],
                                      trainer)
        dev_evaluator.add_event_handler(Events.EPOCH_COMPLETED, early_stopper)

    ### Attach custom handlers
    @trainer.on(Events.EPOCH_STARTED)
    def start_epoch(engine: Engine) -> None:
        logging.info('[Epoch %d/%d] Starting', engine.state.epoch,
                     engine.state.max_epochs)

    @trainer.on(Events.EPOCH_COMPLETED)
    def complete_epoch(engine: Engine) -> None:
        epoch = engine.state.epoch
        max_epochs = engine.state.max_epochs
        logging.info('[Epoch %d/%d] Done in %s', epoch, max_epochs,
                     timedelta(seconds=epoch_timer.value()))
        logging.info('[Epoch %d/%d] Evaluating on train corpus', epoch,
                     max_epochs)
        trn_evaluator.run(BatchIterator(trn_samples))
        if dev_samples is not None:
            logging.info('[Epoch %d/%d] Evaluating on dev corpus', epoch,
                         max_epochs)
            dev_evaluator.run(BatchIterator(dev_samples))

    @trn_evaluator.on(Events.COMPLETED)
    @dev_evaluator.on(Events.COMPLETED)
    def print_metrics(engine: Engine) -> None:
        loss = engine.state.metrics['loss']
        ppl = engine.state.metrics['ppl']
        logging.info('||| loss %.4f | ppl %.4f', loss, ppl)

    ### Start training
    # Shuffle within length-sorted buckets, then batch.
    iterator = ShuffleIterator(trn_samples, key=lambda s: len(s['words']))
    iterator = BatchIterator(iterator, batch_size=batch_size)
    try:
        trainer.run(iterator, max_epochs=max_epochs)
    except KeyboardInterrupt:
        logging.info('Interrupt detected, aborting training')
        trainer.terminate()
def run():
    """End-to-end training entry point for the Retriever model.

    Loads the data loaders (restoring from the `_full_dump` pickle cache when
    present), trains with an L1 objective that also pushes away the first
    negative, evaluates L1/recall metrics, saves a checkpoint and drops into
    `interact()`.
    """
    writer = SummaryWriter()
    CUDA = Config.device
    model = Retriever()
    print(f'Initializing model on {CUDA}')
    model.to(CUDA)
    optimizer = torch.optim.Adam(model.parameters(), lr=Config.LR)
    loss_fn = torch.nn.L1Loss().to(CUDA)
    print(f'Creating sentence transformer')
    # The sentence encoder is frozen -- it only supplies fixed embeddings.
    encoder = SentenceTransformer(Config.sentence_transformer).to(CUDA)
    for parameter in encoder.parameters():
        parameter.requires_grad = False
    print(f'Loading data')
    # NOTE(review): pickling DataLoaders is fragile across library versions;
    # delete `_full_dump` after changing the data pipeline.
    if os.path.exists('_full_dump'):
        with open('_full_dump', 'rb') as pin:
            train_loader, train_utts, val_loader, val_utts = pickle.load(pin)
    else:
        data = load_data(Config.data_source)
        train_loader, train_utts, val_loader, val_utts = get_loaders(
            data, encoder, Config.batch_size)
        with open('_full_dump', 'wb') as pout:
            pickle.dump((train_loader, train_utts, val_loader, val_utts),
                        pout, protocol=-1)

    def train_step(engine, batch):
        # Pull the prediction towards the positive target `y` and away from
        # the first negative (`not_ys[0]`), weighted by negative_weight.
        model.train()
        optimizer.zero_grad()
        x, not_ys, y = batch
        yhat = model(x[0])
        loss = loss_fn(yhat, y)
        gains = loss_fn(not_ys[0], yhat) * Config.negative_weight
        loss -= gains
        loss.backward()
        optimizer.step()
        return loss.item()

    def eval_step(engine, batch):
        # Inference only: (prediction, target) pairs for the metrics.
        model.eval()
        with torch.no_grad():
            x, _, y = batch
            yhat = model(x[0])
            return yhat, y

    trainer = Engine(train_step)
    trainer.logger = setup_logger('trainer')
    evaluator = Engine(eval_step)
    evaluator.logger = setup_logger('evaluator')
    # Nearest-neighbour index over the target embeddings, used for recall@k
    # and by interact() below.
    latent_space = BallTree(numpy.array(list(train_utts.keys())))
    l1 = Loss(loss_fn)
    recall = RecallAt(latent_space)
    # NOTE(review): RecallAt is attached as 'recall' but the handlers below
    # read 'r1'/'r3'/'r10' -- presumably it registers those keys; verify.
    recall.attach(evaluator, 'recall')
    l1.attach(evaluator, 'l1')

    @trainer.on(Events.ITERATION_COMPLETED(every=1000))
    def log_training(engine):
        # Periodic console + TensorBoard logging of the raw batch loss.
        batch_loss = engine.state.output
        lr = optimizer.param_groups[0]['lr']
        e = engine.state.epoch
        n = engine.state.max_epochs
        i = engine.state.iteration
        print("Epoch {}/{} : {} - batch loss: {}, lr: {}".format(
            e, n, i, batch_loss, lr))
        writer.add_scalar('Training/loss', batch_loss, i)

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(engine):
        # NOTE(review): despite the "Training Results" label this evaluates
        # on val_loader -- confirm whether train_loader was intended.
        evaluator.run(val_loader)
        metrics = evaluator.state.metrics
        print(f"Training Results - Epoch: {engine.state.epoch} "
              f" L1: {metrics['l1']:.2f} "
              f" R@1: {metrics['r1']:.2f} "
              f" R@3: {metrics['r3']:.2f} "
              f" R@10: {metrics['r10']:.2f} ")
        for metric, value in metrics.items():
            writer.add_scalar(f'Training/{metric}', value, engine.state.epoch)

    # Dead code: the decorator is commented out, so this handler never runs.
    #@trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(engine):
        evaluator.run(val_loader)
        metrics = evaluator.state.metrics
        print(f"Validation Results - Epoch: {engine.state.epoch} "
              f"L1: {metrics['l1']:.2f} "
              f" R@10: {metrics['r10']:.2f} ")
        for metric, value in metrics.items():
            writer.add_scalar(f'Validation/{metric}', value,
                              engine.state.epoch)

    trainer.run(train_loader, max_epochs=Config.max_epochs)
    torch.save(model.state_dict(), Config.checkpoint)
    print(f'Saved checkpoint at {Config.checkpoint}')
    interact(model, encoder, latent_space, train_utts)
def training(encoder, decoder, batch_size):
    """Train an encoder/decoder autoencoder pair with Adam + plateau LR decay.

    Runs reconstruction training for up to 500 epochs with per-epoch train
    and validation evaluation, early stopping on validation loss, and
    periodic checkpointing.

    Relies on module-level globals: `lr`, `patience`, `min_lr`, `criterion`,
    `train_data`, `val_data`, `DRIVE_PATH` and `Data` (DataLoader module).
    """
    optimizer_en = optim.Adam(encoder.parameters(), lr=lr)
    scheduler_en = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer_en, 'min', patience=patience, min_lr=min_lr, factor=0.1)
    optimizer_de = optim.Adam(decoder.parameters(), lr=lr)
    scheduler_de = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer_de, 'min', patience=patience, min_lr=min_lr, factor=0.1)

    def process_function(engine, batch):
        # One reconstruction step: batch -> encoder -> decoder -> loss(batch).
        encoder.train()
        decoder.train()
        optimizer_en.zero_grad()
        optimizer_de.zero_grad()
        encoded = encoder(batch)
        decoded = decoder(encoded)
        loss = criterion(decoded, batch)
        loss.backward()
        optimizer_en.step()
        optimizer_de.step()
        return loss.item()

    def eval_function(engine, batch):
        encoder.eval()
        decoder.eval()
        with torch.no_grad():
            encoded = encoder(batch)
            decoded = decoder(encoded)
            return decoded, batch

    trainer = Engine(process_function)
    train_evaluator = Engine(eval_function)
    validation_evaluator = Engine(eval_function)

    # NOTE(review): one Loss instance is shared by both evaluators; this is
    # safe because their runs never overlap, but separate instances would be
    # clearer.
    metric = Loss(criterion)
    metric.attach(train_evaluator, 'loss')
    metric.attach(validation_evaluator, 'loss')

    pbar = ProgressBar(persist=True, bar_format="")
    pbar.attach(trainer, ['loss'])

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(engine):
        train_evaluator.run(train_iterator)
        metrics = train_evaluator.state.metrics
        avg_loss = metrics['loss']
        pbar.log_message(
            "Training Results - Epoch: {} Avg loss: {:.4f}".format(
                engine.state.epoch, avg_loss))

    def log_validation_results(engine):
        validation_evaluator.run(valid_iterator)
        metrics = validation_evaluator.state.metrics
        avg_loss = metrics['loss']
        print(avg_loss)
        print("Current lr: {}".format(optimizer_de.param_groups[0]['lr']))
        # BUGFIX: the plateau schedulers used to be stepped twice per epoch
        # (here AND in a second COMPLETED handler on validation_evaluator),
        # which halved the effective patience.  Step each exactly once.
        scheduler_en.step(avg_loss)
        scheduler_de.step(avg_loss)
        pbar.log_message(
            "Validation Results - Epoch: {} Avg loss: {:.4f}".format(
                engine.state.epoch, avg_loss))
        pbar.n = pbar.last_print_n = 0

    trainer.add_event_handler(Events.EPOCH_COMPLETED, log_validation_results)

    # Early Stopping: higher score is better, hence the negated loss.
    def score_function(engine):
        val_loss = engine.state.metrics['loss']
        return -val_loss

    handler = EarlyStopping(patience=100, score_function=score_function,
                            trainer=trainer)
    validation_evaluator.add_event_handler(Events.COMPLETED, handler)

    # Model Checkpoint
    checkpointer = ModelCheckpoint(str(DRIVE_PATH.joinpath('models')),
                                   'review', save_interval=10, n_saved=1,
                                   create_dir=False, save_as_state_dict=True,
                                   require_empty=False)
    trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpointer, {
        'encoder': encoder,
        'decoder': decoder
    })

    train_iterator = Data.DataLoader(train_data, batch_size=batch_size,
                                     shuffle=True, drop_last=False)
    valid_iterator = Data.DataLoader(val_data, batch_size=batch_size,
                                     shuffle=True, drop_last=False)
    trainer.run(train_iterator, max_epochs=500)
def run(model, criterion, optimizer, epochs=100, log_interval=10):
    """Train `model` with a combined label + structured loss, logging to visdom.

    `criterion` is a dict with 'label' and 'structured' loss callables.
    Uses module-level `dataloaders` and `device`; checkpoints every epoch
    into /1116/tmp/lift_models.

    NOTE(review): another `run()` is defined earlier in this module; if both
    are at top level the later definition shadows the earlier one -- confirm
    intended.
    """
    vis = visdom.Visdom(env='ft_lift_ignite')
    train_loader = dataloaders['train']
    val_loader = dataloaders['test']
    # if not vis.check_connection():
    #     raise RuntimeError("Visdom server not running. Please run python -m visdom.server")
    # trainer = create_supervised_trainer(model, optimizer, criterion, device=device)
    # evaluator = create_supervised_evaluator(model,
    #                                         metrics={'accuracy': Accuracy(criterion['label']),
    #                                                  'nll': Loss(criterion['label']),
    #                                                  'precision': Precision(average=True )},
    #                                         device=device)

    def update_model(trainer, batch):
        # One optimization step; the loss sums both heads' criteria.
        # NOTE(review): model.train() is never called here -- after the first
        # evaluation pass the model stays in eval mode (dropout/batch-norm
        # behave accordingly); confirm intended.
        inputs, labels = batch
        inputs = inputs.to(device)
        labels = labels.to(device)
        #inputs, labels = _prepare_batch(batch, device=device)
        optimizer.zero_grad()
        class_output, structured_output = model(inputs)
        loss = criterion['label'](class_output, labels)+criterion['structured'](structured_output, labels)
        loss.backward()
        optimizer.step()
        return {'loss': loss.item(),
                'class_output': class_output,
                'structured_output': structured_output,
                #'inputs': inputs,
                'labels': labels}

    trainer = Engine(update_model)

    # def _prepare_batch(batch, device=None, non_blocking=False):
    #     """Prepare batch for training: pass to a device with options
    #     """
    #     x, y = batch
    #     return (convert_tensor(x, device=device, non_blocking=non_blocking),
    #             convert_tensor(y, device=device, non_blocking=non_blocking))

    def _inference(evaluator, batch):
        # Gradient-free forward pass; returns the same dict as update_model.
        model.eval()
        with torch.no_grad():
            inputs, labels = batch
            inputs = inputs.to(device)
            labels = labels.to(device)
            class_output, structured_output = model(inputs)
            loss = criterion['label'](class_output, labels)+criterion['structured'](structured_output, labels)
            return {'loss': loss.item(),
                    'class_output': class_output,
                    'structured_output': structured_output,
                    #'inputs': inputs,
                    'labels': labels}

    evaluator = Engine(_inference)

    # Metrics are computed on the classification head only;
    # output_transform2 (structured head) is currently unused.
    output_transform1 = lambda data: (data['class_output'], data['labels'])
    output_transform2 = lambda data: (data['structured_output'], data['labels'])
    metric_accuracy = Accuracy(output_transform=output_transform1)
    metric_accuracy.attach(evaluator, 'accuracy')
    metric_nll = Loss(criterion['label'], output_transform=output_transform1)
    metric_nll.attach(evaluator, 'nll')
    metric_precision = Precision(average=True, output_transform=output_transform1)
    metric_precision.attach(evaluator, 'precision')
    # evaluator = create_supervised_evaluator(model,
    #                                         metrics={'accuracy': Accuracy(output_transform=output_transform1),
    #                                                  'nll': Loss(criterion['label'], output_transform=output_transform1),
    #                                                  'precision': Precision(average=True, output_transform=output_transform1)},
    #                                         device=device)

    # Checkpoint every epoch, keeping up to 150 snapshots.
    handler = ModelCheckpoint('/1116/tmp/lift_models', 'myprefix',
                              save_interval=1, n_saved=150,
                              require_empty=False, create_dir=True)

    # Visdom plot windows (training curves by iteration, validation by epoch).
    train_loss_window = create_plot_window(vis, '#Iterations', 'Loss', 'Training Loss')
    train_avg_loss_window = create_plot_window(vis, '#Iterations', 'Loss', 'Training Average Loss')
    train_avg_accuracy_window = create_plot_window(vis, '#Iterations', 'Accuracy', 'Training Average Accuracy')
    train_avg_precision_window = create_plot_window(vis, '#Iterations', 'Precision', 'Training Average Precision')
    val_avg_loss_window = create_plot_window(vis, '#Epochs', 'Loss', 'Validation Average Loss')
    val_avg_accuracy_window = create_plot_window(vis, '#Epochs', 'Accuracy', 'Validation Average Accuracy')
    val_avg_precision_window = create_plot_window(vis, '#Epochs', 'Precision', 'Validation Average Precison')

    @trainer.on(Events.ITERATION_COMPLETED)
    def log_training_loss(engine):
        # Print/plot the raw batch loss every `log_interval` iterations.
        iter = (engine.state.iteration - 1) % len(train_loader) + 1
        if iter % log_interval == 0:
            print("Epoch[{}] Iteration[{}/{}] Loss: {:.2f}"
                  "".format(engine.state.epoch, iter, len(train_loader),
                            engine.state.output['loss']))
            vis.line(X=np.array([engine.state.iteration]),
                     Y=np.array([engine.state.output['loss']]),
                     update='append', win=train_loss_window)

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(engine):
        # Full metric pass over the training set.
        evaluator.run(train_loader)
        metrics = evaluator.state.metrics
        avg_accuracy = metrics['accuracy']
        avg_nll = metrics['nll']
        avg_precision = metrics['precision']
        print("Training Results - Epoch: {} Avg accuracy: {:.2f} Avg loss: {:.2f} Avg Precision: {:.2f}"
              .format(engine.state.epoch, avg_accuracy, avg_nll, avg_precision))
        vis.line(X=np.array([engine.state.epoch]), Y=np.array([avg_accuracy]),
                 win=train_avg_accuracy_window, update='append')
        vis.line(X=np.array([engine.state.epoch]), Y=np.array([avg_nll]),
                 win=train_avg_loss_window, update='append')
        vis.line(X=np.array([engine.state.epoch]), Y=np.array([avg_precision]),
                 win=train_avg_precision_window, update='append')

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(engine):
        # Full metric pass over the held-out set.
        evaluator.run(val_loader)
        metrics = evaluator.state.metrics
        avg_accuracy = metrics['accuracy']
        avg_nll = metrics['nll']
        avg_precision = metrics['precision']
        print("Validation Results - Epoch: {} Avg accuracy: {:.2f} Avg loss: {:.2f} Avg Precision: {:.2f}"
              .format(engine.state.epoch, avg_accuracy, avg_nll, avg_precision))
        vis.line(X=np.array([engine.state.epoch]), Y=np.array([avg_accuracy]),
                 win=val_avg_accuracy_window, update='append')
        vis.line(X=np.array([engine.state.epoch]), Y=np.array([avg_nll]),
                 win=val_avg_loss_window, update='append')
        vis.line(X=np.array([engine.state.epoch]), Y=np.array([avg_precision]),
                 win=val_avg_precision_window, update='append')

    # kick everything off
    trainer.add_event_handler(Events.EPOCH_COMPLETED, handler, {'mymodel': model})
    trainer.run(train_loader, max_epochs=epochs)
def adv_train_loop(model, params, ds, min_y, base_data, model_id, attack_type,
                   device, batch_size, max_epochs=5):
    """Adversarially train a copy of `model` using the attack named by `attack_type`.

    Each batch is augmented with adversarial examples generated on the fly,
    and the model is trained on the concatenation of clean + adversarial data.
    Checkpoints scored by validation accuracy are written under
    `base_data/<model_id>_<attack_type>`.

    Args:
        model: trained classifier to harden (deep-copied; the original is untouched).
        params: dict with keys 'lr', 'momentum', 'l2_wd' for the SGD optimizer.
        ds: (train_loader, valid_loader) pair.
        min_y: (min_train_label, min_val_label) offsets subtracted from targets
               to make labels 0-based.
        base_data: root directory for checkpoints.
        model_id: identifier used for logging/checkpoint paths.
        attack_type: one of 'fgsm', 'ffa', 'carlini', 'lbfgs'.
        device: torch device for model and data.
        batch_size: used to derive the validation/checkpoint cadence.
        max_epochs: number of training epochs.
    """
    print('training adversarial:', attack_type)
    ds_train, ds_valid = ds
    min_y_train, min_y_val = min_y
    original_model = copy.deepcopy(
        model)  # used to generate adv images for the trained model
    original_model.eval()
    model = copy.deepcopy(
        model)  # making a copy so that original model is not changed
    model = model.to(device)
    model_id = f'{model_id}_{attack_type}'
    with create_summary_writer(model, ds_train, base_data, model_id,
                               device=device) as writer:
        lr = params['lr']
        mom = params['momentum']
        wd = params['l2_wd']
        optimizer = torch.optim.SGD(model.parameters(),
                                    lr=lr,
                                    momentum=mom,
                                    weight_decay=wd)
        sched = ReduceLROnPlateau(optimizer, factor=0.5, patience=5)
        funcs = {'accuracy': Accuracy(), 'loss': Loss(F.cross_entropy)}
        # NOTE(review): reaches into the private `_loss_fn` of ignite's Loss
        # just to recover F.cross_entropy — could use F.cross_entropy directly.
        loss = funcs['loss']._loss_fn
        acc_metric = Accuracy(device=device)
        loss_metric = Loss(F.cross_entropy, device=device)
        acc_val_metric = Accuracy(device=device)
        loss_val_metric = Loss(F.cross_entropy, device=device)
        # ART wrapper around the frozen original model; only actually used by
        # the 'lbfgs' branch below (DeepFool).
        classifier = PyTorchClassifier(
            model=original_model,
            clip_values=(0, 1),
            loss=nn.CrossEntropyLoss(),
            optimizer=optimizer,
            input_shape=(3, 64, 64),
            nb_classes=200,
        )
        attack = None
        # if attack_type == "fgsm":
        #     attack = FastGradientMethod(estimator=classifier, eps=0.2)
        # elif attack_type == "bim":
        #     attack = BasicIterativeMethod(estimator=classifier, eps=0.2)
        # elif attack_type == "carlini":
        #     attack = CarliniLInfMethod(classifier=classifier)
        # elif attack_type == "deepfool":
        #     attack = DeepFool(classifier=classifier)
        if attack_type == "fgsm":
            attack = GradientSignAttack(model, loss_fn=loss, eps=0.2)
        elif attack_type == "ffa":
            attack = FastFeatureAttack(model, loss_fn=loss, eps=0.3)
        elif attack_type == "carlini":
            attack = CarliniWagnerL2Attack(model, 200, max_iterations=1000)
        elif attack_type == "lbfgs":
            # NOTE(review): the 'lbfgs' label constructs DeepFool, not an
            # L-BFGS attack — looks like a leftover; confirm intent.
            attack = DeepFool(classifier=classifier)

        def train_step(engine, batch):
            # One optimization step on clean + adversarial examples.
            model.train()
            x, y = batch
            x = x.to(device)
            y = y.to(device) - min_y_train
            # Generate adversarial inputs without accumulating parameter grads.
            with ctx_noparamgrad_and_eval(model):
                x_adv = attack.perturb(x, y)
            optimizer.zero_grad()
            x = torch.cat((x, x_adv))
            y = torch.cat((y, y))  # same labels for clean and adversarial halves
            ans = model.forward(x)
            l = loss(ans, y)
            # NOTE(review): zero_grad is called a second time here; redundant
            # but harmless given the backward() that follows.
            optimizer.zero_grad()
            l.backward()
            optimizer.step()
            # return ans, y
            return l.item()

        trainer = Engine(train_step)
        # acc_metric.attach(trainer, "accuracy")
        # loss_metric.attach(trainer, 'loss')

        def train_eval_step(engine, batch):
            # Evaluation on clean + adversarial training data (no grad for the
            # forward pass; the attack itself still needs gradients).
            model.eval()
            x, y = batch
            x = x.to(device)
            y = y.to(device) - min_y_train
            x_adv = attack.perturb(x, y)
            x = torch.cat((x, x_adv))
            y = torch.cat((y, y))
            with torch.no_grad():
                ans = model.forward(x)
            return ans, y

        train_evaluator = Engine(train_eval_step)
        acc_metric.attach(train_evaluator, "accuracy")
        loss_metric.attach(train_evaluator, 'loss')

        def validation_step(engine, batch):
            # Same as train_eval_step but with the validation label offset.
            model.eval()
            x, y = batch
            x = x.to(device)
            y = y.to(device) - min_y_val
            x_adv = attack.perturb(x, y)
            x = torch.cat((x, x_adv))
            y = torch.cat((y, y))
            with torch.no_grad():
                ans = model.forward(x)
            return ans, y

        valid_evaluator = Engine(validation_step)
        acc_val_metric.attach(valid_evaluator, "accuracy")
        loss_val_metric.attach(valid_evaluator, 'loss')

        # Run validation a fixed number of times per epoch; the cadence is
        # presumably derived from the dataset size (200 * 5000 samples) — TODO confirm.
        @trainer.on(
            Events.ITERATION_COMPLETED(every=200 * 5000 // batch_size // 10))
        def log_validation_results(engine):
            valid_evaluator.run(ds_valid)
            metrics = valid_evaluator.state.metrics
            valid_avg_accuracy = metrics['accuracy']
            avg_nll = metrics['loss']
            print(
                "Validation Results - Epoch: {} Avg accuracy: {:.2f} Avg loss: {:.2f}"
                .format(engine.state.epoch, valid_avg_accuracy, avg_nll))
            writer.add_scalar("validation/avg_loss", avg_nll,
                              engine.state.epoch)
            writer.add_scalar("validation/avg_accuracy", valid_avg_accuracy,
                              engine.state.epoch)
            writer.add_scalar("validation/avg_error", 1. - valid_avg_accuracy,
                              engine.state.epoch)

        @trainer.on(Events.EPOCH_COMPLETED)
        def lr_scheduler(engine):
            metrics = valid_evaluator.state.metrics
            # NOTE(review): despite the name, this is the validation *accuracy*,
            # and ReduceLROnPlateau was built with the default mode='min' —
            # so the LR drops when accuracy stops *decreasing*. Confirm this
            # is intended (mode='max' would be the usual choice for accuracy).
            avg_nll = metrics['accuracy']
            sched.step(avg_nll)

        @trainer.on(Events.ITERATION_COMPLETED(every=50))
        def log_training_loss(engine):
            # Re-evaluate on the current batch (clean + adversarial) to get
            # accuracy/loss figures for tensorboard.
            batch = engine.state.batch
            ds = DataLoader(TensorDataset(*batch), batch_size=batch_size)
            train_evaluator.run(ds)
            metrics = train_evaluator.state.metrics
            # metrics = engine.state.metrics
            accuracy = metrics['accuracy']
            nll = metrics['loss']
            iter = (engine.state.iteration - 1) % len(ds_train) + 1
            if (iter % 50) == 0:
                print("Epoch[{}] Iter[{}/{}] Accuracy: {:.2f} Loss: {:.2f}".
                      format(engine.state.epoch, iter, len(ds_train), accuracy,
                             nll))
            writer.add_scalar("batchtraining/detloss", nll, engine.state.epoch)
            writer.add_scalar("batchtraining/accuracy", accuracy,
                              engine.state.iteration)
            writer.add_scalar("batchtraining/error", 1. - accuracy,
                              engine.state.iteration)
            writer.add_scalar("batchtraining/loss", engine.state.output,
                              engine.state.iteration)

        @trainer.on(Events.EPOCH_COMPLETED)
        def log_lr(engine):
            writer.add_scalar("lr", optimizer.param_groups[0]['lr'],
                              engine.state.epoch)

        # @trainer.on(Events.EPOCH_COMPLETED)
        # def log_training_results(engine):
        #     train_evaluator.run(ds_train)
        #     metrics = train_evaluator.state.metrics
        #     # metrics = engine.state.metrics
        #     avg_accuracy = metrics['accuracy']
        #     avg_nll = metrics['loss']
        #     print("Training Results - Epoch: {} Avg accuracy: {:.2f} Avg loss: {:.2f}"
        #           .format(engine.state.epoch, avg_accuracy, avg_nll))
        #     writer.add_scalar("training/avg_loss", avg_nll, engine.state.epoch)
        #     writer.add_scalar("training/avg_accuracy",
        #                       avg_accuracy, engine.state.epoch)
        #     writer.add_scalar("training/avg_error", 1. -
        #                       avg_accuracy, engine.state.epoch)

        # Score function for checkpointing: reads the accuracy produced by the
        # most recent validation run (registered on the same cadence).
        @trainer.on(
            Events.ITERATION_COMPLETED(every=200 * 5000 // batch_size // 10))
        def validation_value(engine):
            metrics = valid_evaluator.state.metrics
            valid_avg_accuracy = metrics['accuracy']
            return valid_avg_accuracy

        to_save = {'model': model}
        handler = Checkpoint(
            to_save,
            DiskSaver(os.path.join(base_data, model_id), create_dir=True),
            score_function=validation_value,
            score_name="val_acc",
            global_step_transform=global_step_from_engine(trainer),
            n_saved=None)  # n_saved=None keeps every checkpoint

        # kick everything off
        trainer.add_event_handler(
            Events.ITERATION_COMPLETED(every=200 * 5000 // batch_size // 10),
            handler)
        trainer.run(ds_train, max_epochs=max_epochs)
model.eval() with torch.no_grad(): video, class_num = batch["video"].cuda(), batch["class"].cuda() pred = model(video) pred = F.softmax(pred, dim=1) # torch.cuda.empty_cache() return pred, class_num evaluator = Engine(validation_step) accuracy_metric = Accuracy() accuracy_metric.attach(evaluator, "accuracy") ce_loss_metric = Loss(ce_loss_fn) ce_loss_metric.attach(evaluator, "loss") @trainer.on(Events.ITERATION_COMPLETED) def log_training_loss(engine): e = engine.state.epoch i = engine.state.iteration loss = engine.state.output print(f"Epoch: {e} / {cfg.epochs} : {i} - Loss: {loss:.5f}") # if wandb_online: # wandb.log({"loss": loss}) @trainer.on(Events.EPOCH_COMPLETED) def log_training_results(engine): state = evaluator.run(train_loader)
def prune_train_loop(model, params, ds, min_y, base_data, model_id, prune_type,
                     device, batch_size, max_epochs=5):
    """Iteratively prune and fine-tune `model`'s convolution layers.

    Prunes once up front and then again after every validation run, using
    either global unstructured L1 pruning (target 30% total) or per-layer
    L1-structured pruning over filters (target 10% total). Between prunes the
    model is fine-tuned with SGD; pruned weights are re-zeroed after every
    optimizer step via the pruning masks.

    Args:
        model: ResNet-style network (assumes model.conv1 and model.layer1..4
               containing bottlenecks with conv1..conv3 — TODO confirm arch).
        params: dict with 'lr', 'momentum', 'l2_wd'.
        ds: (train_loader, valid_loader) pair.
        min_y: (min_train_label, min_val_label) label offsets.
        base_data: checkpoint root directory.
        model_id: identifier for logging/checkpoints.
        prune_type: 'global_unstructured' or 'structured'.
        device: torch device.
        batch_size: used to derive validation cadence.
        max_epochs: number of epochs; also scales the per-prune amount.
    """
    assert prune_type in ['global_unstructured', 'structured']
    total_prune_amount = 0.3 if prune_type == 'global_unstructured' else 0.1
    ds_train, ds_valid = ds
    min_y_train, min_y_val = min_y
    model_id = f'{model_id}_{prune_type}_pruning'
    # Collect every conv layer that will be pruned.
    conv_layers = [model.conv1]
    for sequential in [model.layer1, model.layer2, model.layer3, model.layer4]:
        for bottleneck in sequential:
            conv_layers.extend(
                [bottleneck.conv1, bottleneck.conv2, bottleneck.conv3])

    def prune_model(model):
        # Spread the total pruning budget over the expected number of prune
        # calls (roughly 10 per epoch, matching the validation cadence).
        remove_amount = total_prune_amount / (max_epochs * 10)
        print(f'pruned model by {remove_amount}')
        if prune_type == 'global_unstructured':
            parameters_to_prune = [(layer, 'weight') for layer in conv_layers]
            prune.global_unstructured(
                parameters_to_prune,
                pruning_method=prune.L1Unstructured,
                amount=remove_amount,
            )
        else:
            # Structured: remove whole filters (dim=0) by L1 norm, per layer.
            for layer in conv_layers:
                prune.ln_structured(layer,
                                    name='weight',
                                    amount=remove_amount,
                                    n=1,
                                    dim=0)

    prune_model(model)
    with create_summary_writer(model, ds_train, base_data, model_id,
                               device=device) as writer:
        lr = params['lr']
        mom = params['momentum']
        wd = params['l2_wd']
        optimizer = torch.optim.SGD(model.parameters(),
                                    lr=lr,
                                    momentum=mom,
                                    weight_decay=wd)
        sched = ReduceLROnPlateau(optimizer, factor=0.5, patience=5)
        funcs = {'accuracy': Accuracy(), 'loss': Loss(F.cross_entropy)}
        # NOTE(review): recovers F.cross_entropy via ignite's private _loss_fn.
        loss = funcs['loss']._loss_fn
        acc_metric = Accuracy(device=device)
        loss_metric = Loss(F.cross_entropy, device=device)
        acc_val_metric = Accuracy(device=device)
        loss_val_metric = Loss(F.cross_entropy, device=device)

        def train_step(engine, batch):
            model.train()
            x, y = batch
            x = x.to(device)
            y = y.to(device) - min_y_train
            optimizer.zero_grad()
            ans = model.forward(x)
            l = loss(ans, y)
            # NOTE(review): second zero_grad is redundant but harmless.
            optimizer.zero_grad()
            l.backward()
            optimizer.step()
            with torch.no_grad():
                for layer in conv_layers:
                    layer.weight *= layer.weight_mask  # make sure pruned weights stay 0
            return l.item()

        trainer = Engine(train_step)

        def train_eval_step(engine, batch):
            model.eval()
            x, y = batch
            x = x.to(device)
            y = y.to(device) - min_y_train
            with torch.no_grad():
                ans = model.forward(x)
            return ans, y

        train_evaluator = Engine(train_eval_step)
        acc_metric.attach(train_evaluator, "accuracy")
        loss_metric.attach(train_evaluator, 'loss')

        def validation_step(engine, batch):
            model.eval()
            x, y = batch
            x = x.to(device)
            y = y.to(device) - min_y_val
            with torch.no_grad():
                ans = model.forward(x)
            return ans, y

        valid_evaluator = Engine(validation_step)
        acc_val_metric.attach(valid_evaluator, "accuracy")
        loss_val_metric.attach(valid_evaluator, 'loss')

        # Validate (and prune again) ~10 times per epoch; cadence presumably
        # derived from the dataset size — TODO confirm.
        @trainer.on(
            Events.ITERATION_COMPLETED(every=200 * 5000 // batch_size // 10))
        def log_validation_results(engine):
            valid_evaluator.run(ds_valid)
            metrics = valid_evaluator.state.metrics
            valid_avg_accuracy = metrics['accuracy']
            avg_nll = metrics['loss']
            print(
                "Validation Results - Epoch: {} Avg accuracy: {:.2f} Avg loss: {:.2f}"
                .format(engine.state.epoch, valid_avg_accuracy, avg_nll))
            writer.add_scalar("validation/avg_loss", avg_nll,
                              engine.state.epoch)
            writer.add_scalar("validation/avg_accuracy", valid_avg_accuracy,
                              engine.state.epoch)
            writer.add_scalar("validation/avg_error", 1. - valid_avg_accuracy,
                              engine.state.epoch)
            # Remove the next slice of weights after each validation pass.
            prune_model(model)

        @trainer.on(Events.EPOCH_COMPLETED)
        def lr_scheduler(engine):
            metrics = valid_evaluator.state.metrics
            # NOTE(review): feeds validation *accuracy* into a default
            # (mode='min') ReduceLROnPlateau — confirm mode='max' wasn't meant.
            avg_nll = metrics['accuracy']
            sched.step(avg_nll)

        @trainer.on(Events.ITERATION_COMPLETED(every=50))
        def log_training_loss(engine):
            batch = engine.state.batch
            ds = DataLoader(TensorDataset(*batch), batch_size=batch_size)
            train_evaluator.run(ds)
            metrics = train_evaluator.state.metrics
            accuracy = metrics['accuracy']
            nll = metrics['loss']
            iter = (engine.state.iteration - 1) % len(ds_train) + 1
            if (iter % 50) == 0:
                print("Epoch[{}] Iter[{}/{}] Accuracy: {:.2f} Loss: {:.2f}".
                      format(engine.state.epoch, iter, len(ds_train), accuracy,
                             nll))
            writer.add_scalar("batchtraining/detloss", nll, engine.state.epoch)
            writer.add_scalar("batchtraining/accuracy", accuracy,
                              engine.state.iteration)
            writer.add_scalar("batchtraining/error", 1. - accuracy,
                              engine.state.iteration)
            writer.add_scalar("batchtraining/loss", engine.state.output,
                              engine.state.iteration)

        @trainer.on(Events.EPOCH_COMPLETED)
        def log_lr(engine):
            writer.add_scalar("lr", optimizer.param_groups[0]['lr'],
                              engine.state.epoch)

        # Checkpoint score: accuracy from the most recent validation run.
        @trainer.on(
            Events.ITERATION_COMPLETED(every=200 * 5000 // batch_size // 10))
        def validation_value(engine):
            metrics = valid_evaluator.state.metrics
            valid_avg_accuracy = metrics['accuracy']
            return valid_avg_accuracy

        to_save = {'model': model}
        handler = Checkpoint(
            to_save,
            DiskSaver(os.path.join(base_data, model_id), create_dir=True),
            score_function=validation_value,
            score_name="val_acc",
            global_step_transform=global_step_from_engine(trainer),
            n_saved=None)  # keep every checkpoint

        # kick everything off
        trainer.add_event_handler(
            Events.ITERATION_COMPLETED(every=200 * 5000 // batch_size // 10),
            handler)
        trainer.run(ds_train, max_epochs=max_epochs)
def train(name, load, lrate, weight_decay, workers, smooth, device, validation,
          ground_truth):
    """Train a ResUNet baseline detector on seed images.

    Loads `*.seeds.png` ground-truth/validation sets, trains with
    BCEWithLogitsLoss + Adam, and after every epoch evaluates
    loss/accuracy/precision/recall/F1 on the validation set and checkpoints
    the network.

    Args:
        name: output model name; defaults to '<lrate>_<weight_decay>'.
        load: optional path of a serialized model to resume from.
        lrate: Adam learning rate.
        weight_decay: Adam weight decay.
        workers: DataLoader worker count.
        smooth: smoothing flag forwarded to BaselineSet.
        device: torch device string.
        validation: root directory of validation seed images.
        ground_truth: root directory of training seed images.
    """
    if not name:
        name = '{}_{}'.format(lrate, weight_decay)
    click.echo('model output name: {}'.format(name))
    torch.set_num_threads(1)
    train_set = BaselineSet(glob.glob('{}/**/*.seeds.png'.format(ground_truth),
                                      recursive=True),
                            smooth=smooth)
    train_data_loader = DataLoader(dataset=train_set,
                                   num_workers=workers,
                                   batch_size=1,
                                   shuffle=True,
                                   pin_memory=True)
    val_set = BaselineSet(glob.glob('{}/**/*.seeds.png'.format(validation),
                                    recursive=True),
                          smooth=smooth)
    val_data_loader = DataLoader(dataset=val_set,
                                 num_workers=workers,
                                 batch_size=1,
                                 pin_memory=True)
    click.echo('loading network')
    model = ResUNet(refine_encoder=False).to(device)
    if load:
        click.echo('loading weights')
        # Replaces the freshly built network with the serialized one.
        model = torch.load(load, map_location=device)
    criterion = nn.BCEWithLogitsLoss()
    opti = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()),
                      lr=lrate,
                      weight_decay=weight_decay)

    def score_function(engine):
        # Higher is better for ignite score functions, so negate the loss.
        val_loss = engine.state.metrics['loss']
        return -val_loss

    def output_preprocess(output):
        # Binarize the raw logits (sigmoid + hysteresis threshold + denoising)
        # so the classification metrics see hard predictions.
        o, target = output
        o = torch.sigmoid(o)
        o = denoising_hysteresis_thresh(o.detach().squeeze().cpu().numpy(),
                                        0.8, 0.9, 2.5)
        return torch.from_numpy(o.astype('f')).unsqueeze(0).unsqueeze(0).to(
            device), target.double().to(device)

    trainer = create_supervised_trainer(model,
                                        opti,
                                        criterion,
                                        device=device,
                                        non_blocking=True)
    accuracy = Accuracy(output_transform=output_preprocess)
    # BUG FIX: precision/recall were previously constructed twice — once with
    # output_transform=output_preprocess and then immediately rebound to
    # Precision(average=False)/Recall(average=False) WITHOUT the transform, so
    # the metrics actually attached ran on raw logits while accuracy ran on
    # thresholded output. Construct each exactly once with both settings.
    precision = Precision(average=False, output_transform=output_preprocess)
    recall = Recall(average=False, output_transform=output_preprocess)
    loss = Loss(criterion)
    # F1 derived from the unaveraged precision/recall, then averaged.
    f1 = (precision * recall * 2 / (precision + recall)).mean()
    evaluator = create_supervised_evaluator(model,
                                            device=device,
                                            non_blocking=True)
    accuracy.attach(evaluator, 'accuracy')
    precision.attach(evaluator, 'precision')
    recall.attach(evaluator, 'recall')
    loss.attach(evaluator, 'loss')
    f1.attach(evaluator, 'f1')
    ckpt_handler = ModelCheckpoint('.',
                                   name,
                                   save_interval=1,
                                   n_saved=10,
                                   require_empty=False)
    # Running average of the per-iteration trainer output, shown as 'loss'.
    RunningAverage(output_transform=lambda x: x).attach(trainer, 'loss')
    progress_bar = ProgressBar(persist=True)
    progress_bar.attach(trainer, ['loss'])
    trainer.add_event_handler(event_name=Events.EPOCH_COMPLETED,
                              handler=ckpt_handler,
                              to_save={'net': model})
    trainer.add_event_handler(event_name=Events.ITERATION_COMPLETED,
                              handler=TerminateOnNan())

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(engine):
        evaluator.run(val_data_loader)
        metrics = evaluator.state.metrics
        progress_bar.log_message(
            'eval results - epoch {} loss: {:.4f} f1: {:.4f}, accuracy: {:.4f} recall: {:.4f} precision {:.4f}'
            .format(engine.state.epoch, metrics['loss'], metrics['f1'],
                    metrics['accuracy'], metrics['recall'],
                    metrics['precision']))

    trainer.run(train_data_loader, max_epochs=1000)
def _setup(self, config):
    """Initialize model, data loaders, optimizer, and ignite engines from `config`.

    Expects `config` to contain a 'tp' dict (training params: name, loss_fn,
    device, data/model paths, batch sizes, logging and early-stopping knobs,
    and a partial `optim_func`) and a 'hyper-params' dict (lr, wd, optional
    dropout).
    """
    training_params = config['tp']
    hyper_params = config['hyper-params']
    self.name = training_params['name']
    self.loss_fn = training_params['loss_fn']
    self.device = training_params['device']
    self.n_it_max = training_params['n_it_max']
    self.split_names = training_params['split_names']
    self.model = torch.load(training_params['model_path'])
    self.datasets = self._load_datasets(training_params['data_path'],
                                        training_params['loss_fn'],
                                        training_params['past_tasks'])
    self.batch_sizes = training_params['batch_sizes']
    data_loaders = self._get_dataloaders(self.datasets, self.batch_sizes)
    self.train_loader, self.eval_loaders = data_loaders
    self.named_eval_loaders = OrderedDict(
        zip(self.split_names, self.eval_loaders))
    # Inject the tuned hyper-parameters into the partial optimizer factory.
    training_params['optim_func'].func.keywords['lr'] = hyper_params['lr']
    training_params['optim_func'].func.keywords['weight_decay'] = \
        hyper_params['wd']
    if 'dropout' in hyper_params:
        set_dropout(self.model, hyper_params['dropout'])
    # optim_func.keywords['momentum'] = config['momentum']
    self.optim = training_params['optim_func'](self.model.parameters())

    self.log_interval = training_params.get('log_interval', 30)
    # BUG FIX: the original called `training_params['log_steps'].copy()` first
    # and only then checked `is None` — a None value would raise
    # AttributeError before the check could ever run. Check first, then copy.
    log_steps = training_params['log_steps']
    self.log_steps = [] if log_steps is None else log_steps.copy()
    if training_params['log_epoch']:
        self.log_steps.append(len(self.train_loader))

    self.n_iterations = 0
    self.n_epochs = 0
    self.n_steps = 0

    # For early stopping
    self.patience = training_params['patience']
    self.counter = 0
    self.best_score = None
    self.best_loss = float('inf')

    # Trainer emits (y_pred, y) so metrics can be attached to it directly.
    self.trainer = \
        create_supervised_trainer(self.model, self.optim,
                                  training_params['loss_fn'],
                                  device=self.device,
                                  output_transform=
                                  lambda x, y, y_pred, loss: (y_pred, y))
    self.trainer._logger.setLevel(logging.WARNING)
    l = Loss(lambda y_pred, y: self.loss_fn(y_pred, y).mean())
    l.attach(self.trainer, 'train_loss')
    # Also refresh 'train_loss' every iteration (attach alone only updates it
    # at epoch completion).
    self.trainer.add_event_handler(Events.ITERATION_COMPLETED, l.completed,
                                   'train_loss')
    StopAfterIterations(self.log_steps).attach(self.trainer)

    # One NLL metric plus one accuracy metric per model head.
    self.eval_metrics = {
        'nll': Loss(lambda y_pred, y: self.loss_fn(y_pred, y).mean())
    }
    for i in range(self.model.n_out):
        self.eval_metrics['accuracy_{}'.format(i)] = \
            Accuracy(output_transform=get_attr_transform(i))
    self.evaluator = \
        create_supervised_evaluator(self.model, metrics=self.eval_metrics,
                                    device=self.device)
    self.all_accuracies = defaultdict(dict)
def _add_metrics(self): train_loss = RunningAverage(Loss(self.get_loss)) train_loss.attach(self.trainer, 'avg_train_loss') val_loss = Loss(self.get_loss) val_loss.attach(self.evaluator, 'val_loss')
def __init__(self,
             optimizer: OptimizerType,
             train_loader: DataLoaderType,
             model: torch.nn.Module,
             train_engine: Optional[ignite.engine.Engine] = None,
             test_engine: Optional[ignite.engine.Engine] = None,
             test_loader: Optional[DataLoaderType] = None,
             loss_fn: Optional[LossFnType] = None,
             eval_metric: Optional[ignite.metrics.Metric] = None,
             descending: bool = True,
             device: str = 'cuda') -> None:
    """Wire up training and (optionally) evaluation ignite engines.

    Either a ready-made `train_engine` or a plain `model` + `loss_fn` pair
    must be provided; in the latter case a default supervised trainer is
    built. If `test_loader` is given, a test engine is resolved the same
    way: use `test_engine` if supplied (attaching the evaluation metric to
    it under the name 'loss'), otherwise build a default supervised
    evaluator from `model`. The metric used for evaluation is
    `eval_metric` when given, else `Loss(loss_fn)`.

    Raises:
        TypeError: if neither `train_engine` nor `model` is given; if a
            plain model is passed without `loss_fn`; or if a default test
            engine is needed but the metric/loss_fn/model required to build
            it is missing.
    """
    super().__init__()
    self.descending = descending
    self.optimizer: OptimizerType = optimizer
    self.model: Optional[torch.nn.Module] = model
    self.train_engine: ignite.engine.Engine
    self.train_loader: DataLoaderType = train_loader
    self.test_loader: Optional[DataLoaderType] = test_loader
    self.test_engine: Optional[ignite.engine.Engine]
    # create the train engine if necessary
    # if so, build it from the model and loss_fn
    if train_engine is None and model is None:
        raise TypeError('either train_engine or model have to be provided')
    if train_engine is not None:
        self.train_engine = train_engine  # directly use it
    elif model is not None:
        if loss_fn is None:
            raise TypeError(
                'loss_fn has to be provided if passing a plain pytorch model'
            )
        self.train_engine = ignite.engine.create_supervised_trainer(
            model,
            optimizer,
            loss_fn=loss_fn,
            device=device,
            non_blocking=True)
    # get the metric to use
    new_metric = None
    if eval_metric is not None:
        new_metric = eval_metric
    elif loss_fn is not None:
        # use the given eval_metric if provided, but fallback
        # to using the loss averaged over the entire epoch
        new_metric = Loss(loss_fn)
    # if the test loader is present, then we need an engine for training
    if test_loader is not None:
        # test engine is needed only if we have a test loader
        if test_engine is None:
            if eval_metric is None:
                if loss_fn is None:
                    # error if no metric or loss_fn
                    raise TypeError(
                        'loss_fn has to be provided if using the default evaluator and not '
                        'providing a metric')
            if model is None:
                raise TypeError(
                    'model must be provided if using the default evaluator'
                )
            # create a default test engine
            self.test_engine = ignite.engine.create_supervised_evaluator(
                model,
                metrics={'loss': new_metric},
                device=device,
                non_blocking=True)
        else:
            self.test_engine = test_engine  # use the specified engine
            # attach a new metric if present
            if new_metric is not None:
                new_metric.attach(self.test_engine, 'loss')
    else:
        self.test_engine = None  # no need for a test engine if no test loader specified
def prune_train_loop(model, params, ds, dset, min_y, base_data, model_id,
                     prune_type, device, batch_size, tpa, max_epochs=2):
    """Prune specific filters of `model.conv1` (selected by `select_filters`)
    and fine-tune the pruned network.

    Unlike the other prune loop, pruning happens once, up front: the worst
    filters are identified on the validation set, zeroed via TuckerStructured
    masks, and the zeroed masks are sanity-checked against the selection.
    Training then proceeds with the masks enforced after every step.

    Args:
        model: network whose `conv1` filters are pruned.
        params: dict with 'lr', 'momentum', 'l2_wd'.
        ds: (train_loader, valid_loader) pair.
        dset: (train_set, valid_set) underlying datasets.
        min_y: (min_train_label, min_val_label) label offsets.
        base_data: checkpoint root directory.
        model_id: identifier for logging/checkpoints.
        prune_type: 'global_unstructured' or 'structured' (only used in the
            model_id / assertion here; the actual pruning is TuckerStructured).
        device: torch device.
        batch_size: used to derive the validation cadence.
        tpa: total prune amount passed to the filter selector.
        max_epochs: number of fine-tuning epochs.
    """
    assert prune_type in ['global_unstructured', 'structured']
    total_prune_amount = tpa
    ds_train, ds_valid = ds
    train_set, valid_set = dset
    min_y_train, min_y_val = min_y
    model_id = f'{model_id}_{prune_type}_pruning_{tpa}'
    # Validation cadence; presumably derived from dataset size — TODO confirm.
    valid_freq = 200 * 500 // batch_size // 3
    conv_layers = [model.conv1]

    def prune_model(model):
        # remove_amount = total_prune_amount // (max_epochs)
        remove_amount = total_prune_amount
        print(f'pruned model by {remove_amount}')
        # Rank filters by importance on the validation data.
        worst = select_filters(model, ds_valid, valid_set, remove_amount,
                               device)
        # Deduplicate filter indices while keeping them as plain ints.
        worst = [
            k for k in Counter(torch.stack(worst).view(-1).cpu().numpy()).keys()
        ]
        worst.sort(reverse=True)
        print(worst)
        # Zero each selected filter (amount=0: mask only the given filter).
        for layer in conv_layers:
            for d in worst:
                TuckerStructured(layer, name='weight', amount=0, dim=0, filt=d)
        return worst

    bad = prune_model(model)
    # Sanity check: the filters whose masks are fully zero should be exactly
    # the ones selected for pruning.
    zeros = []
    wrong = []
    for i in range(len(model.conv1.weight_mask)):
        if torch.sum(model.conv1.weight_mask[i]) == 0.0:
            zeros.append(i)
    zeros.sort(reverse=True)
    if zeros == bad:
        print("correctly zero'd filters")
    else:
        if len(zeros) == len(bad):
            for i in range(len(zeros)):
                if zeros[i] != bad[i]:
                    wrong.append((bad[i], zeros[i]))
            print(wrong)
        else:
            print("diff number filters zero'd", zeros)
    with create_summary_writer(model, ds_train, base_data, model_id,
                               device=device) as writer:
        lr = params['lr']
        mom = params['momentum']
        wd = params['l2_wd']
        optimizer = torch.optim.SGD(model.parameters(),
                                    lr=lr,
                                    momentum=mom,
                                    weight_decay=wd)
        sched = ReduceLROnPlateau(optimizer, factor=0.5, patience=5)
        funcs = {'accuracy': Accuracy(), 'loss': Loss(F.cross_entropy)}
        # NOTE(review): recovers F.cross_entropy via ignite's private _loss_fn.
        loss = funcs['loss']._loss_fn
        acc_metric = Accuracy(device=device)
        loss_metric = Loss(F.cross_entropy, device=device)
        acc_val_metric = Accuracy(device=device)
        loss_val_metric = Loss(F.cross_entropy, device=device)

        def train_step(engine, batch):
            model.train()
            x, y = batch
            x = x.to(device)
            y = y.to(device) - min_y_train
            optimizer.zero_grad()
            ans = model.forward(x)
            l = loss(ans, y)
            # NOTE(review): second zero_grad is redundant but harmless.
            optimizer.zero_grad()
            l.backward()
            optimizer.step()
            with torch.no_grad():
                for layer in conv_layers:
                    layer.weight *= layer.weight_mask  # make sure pruned weights stay 0
            return l.item()

        trainer = Engine(train_step)

        def train_eval_step(engine, batch):
            model.eval()
            x, y = batch
            x = x.to(device)
            y = y.to(device) - min_y_train
            with torch.no_grad():
                ans = model.forward(x)
            return ans, y

        train_evaluator = Engine(train_eval_step)
        acc_metric.attach(train_evaluator, "accuracy")
        loss_metric.attach(train_evaluator, 'loss')

        def validation_step(engine, batch):
            model.eval()
            x, y = batch
            x = x.to(device)
            y = y.to(device) - min_y_val
            with torch.no_grad():
                ans = model.forward(x)
            return ans, y

        valid_evaluator = Engine(validation_step)
        acc_val_metric.attach(valid_evaluator, "accuracy")
        loss_val_metric.attach(valid_evaluator, 'loss')

        @trainer.on(Events.ITERATION_COMPLETED(every=valid_freq))
        # @trainer.on(Events.ITERATION_COMPLETED)
        def log_validation_results(engine):
            valid_evaluator.run(ds_valid)
            metrics = valid_evaluator.state.metrics
            valid_avg_accuracy = metrics['accuracy']
            avg_nll = metrics['loss']
            print(
                "Validation Results - Epoch: {} Avg accuracy: {:.2f} Avg loss: {:.2f}"
                .format(engine.state.epoch, valid_avg_accuracy, avg_nll))
            writer.add_scalar("validation/avg_loss", avg_nll,
                              engine.state.epoch)
            writer.add_scalar("validation/avg_accuracy", valid_avg_accuracy,
                              engine.state.epoch)
            writer.add_scalar("validation/avg_error", 1. - valid_avg_accuracy,
                              engine.state.epoch)
            # prune_model(model)

        @trainer.on(Events.EPOCH_COMPLETED)
        def lr_scheduler(engine):
            metrics = valid_evaluator.state.metrics
            # NOTE(review): feeds validation *accuracy* into a default
            # (mode='min') ReduceLROnPlateau — confirm mode='max' wasn't meant.
            avg_nll = metrics['accuracy']
            sched.step(avg_nll)

        @trainer.on(Events.ITERATION_COMPLETED(every=100))
        def log_training_loss(engine):
            batch = engine.state.batch
            ds = DataLoader(TensorDataset(*batch), batch_size=batch_size)
            train_evaluator.run(ds)
            metrics = train_evaluator.state.metrics
            accuracy = metrics['accuracy']
            nll = metrics['loss']
            iter = (engine.state.iteration - 1) % len(ds_train) + 1
            if (iter % 100) == 0:
                print("Epoch[{}] Iter[{}/{}] Accuracy: {:.2f} Loss: {:.2f}".
                      format(engine.state.epoch, iter, len(ds_train), accuracy,
                             nll))
            writer.add_scalar("batchtraining/detloss", nll, engine.state.epoch)
            writer.add_scalar("batchtraining/accuracy", accuracy,
                              engine.state.iteration)
            writer.add_scalar("batchtraining/error", 1. - accuracy,
                              engine.state.iteration)
            writer.add_scalar("batchtraining/loss", engine.state.output,
                              engine.state.iteration)

        @trainer.on(Events.EPOCH_COMPLETED)
        def log_lr(engine):
            writer.add_scalar("lr", optimizer.param_groups[0]['lr'],
                              engine.state.epoch)

        # Checkpoint score: accuracy from the most recent validation run.
        @trainer.on(Events.ITERATION_COMPLETED(every=valid_freq))
        def validation_value(engine):
            metrics = valid_evaluator.state.metrics
            valid_avg_accuracy = metrics['accuracy']
            return valid_avg_accuracy

        to_save = {'model': model}
        handler = Checkpoint(
            to_save,
            DiskSaver(os.path.join(base_data, model_id), create_dir=True),
            score_function=validation_value,
            score_name="val_acc",
            global_step_transform=global_step_from_engine(trainer),
            n_saved=None)  # keep every checkpoint

        # kick everything off
        trainer.add_event_handler(Events.ITERATION_COMPLETED(every=valid_freq),
                                  handler)
        trainer.run(ds_train, max_epochs=max_epochs)
def __call__(self) -> float:
    """Build the reconstructor (and optional adversarial evaluator), run the
    full training loop with validation metrics and tensorboard logging, and
    return the best validation score achieved.
    """
    # File + console logging under the checkpoints directory.
    self.logger = logging.getLogger()
    if self.options.debug:
        self.logger.setLevel(logging.DEBUG)
    else:
        self.logger.setLevel(logging.INFO)
    fh = logging.FileHandler(
        os.path.join(self.options.checkpoints_dir, "trainer.log"))
    formatter = logging.Formatter(
        "%(asctime)s - %(threadName)s - %(levelname)s: %(message)s")
    fh.setFormatter(formatter)
    self.logger.addHandler(fh)
    self.logger.info("Creating trainer with the following options:")
    for key, value in vars(self.options).items():
        if key == "device":
            value = value.type
        elif key == "gpu_ids":
            value = "cuda : " + str(value) if torch.cuda.is_available(
            ) else "cpu"
        self.logger.info(
            f"    {key:>25}: {'None' if value is None else value:<30}")

    # Create Reconstructor Model
    self.reconstructor = models.reconstruction.ReconstructorNetwork(
        number_of_cascade_blocks=self.options.number_of_cascade_blocks,
        n_downsampling=self.options.n_downsampling,
        number_of_filters=self.options.number_of_reconstructor_filters,
        number_of_layers_residual_bottleneck=self.options.
        number_of_layers_residual_bottleneck,
        mask_embed_dim=self.options.mask_embed_dim,
        dropout_probability=self.options.dropout_probability,
        img_width=self.options.image_width,
        use_deconv=self.options.use_deconv,
    )
    if self.options.device.type == "cuda":
        self.reconstructor = torch.nn.DataParallel(self.reconstructor).to(
            self.options.device)
    self.optimizers = {
        "G":
        optim.Adam(
            self.reconstructor.parameters(),
            lr=self.options.lr,
            betas=(self.options.beta1, 0.999),
        )
    }

    # Create Evaluator Model (adversarial discriminator), if enabled.
    if self.options.use_evaluator:
        self.evaluator = models.evaluator.EvaluatorNetwork(
            number_of_filters=self.options.number_of_evaluator_filters,
            number_of_conv_layers=self.options.
            number_of_evaluator_convolution_layers,
            use_sigmoid=False,
            # KNEE_RAW images are 640 pixels tall; otherwise the network's default.
            height=640 if self.options.dataroot == "KNEE_RAW" else None,
            width=self.options.image_width,
            mask_embed_dim=self.options.mask_embed_dim,
        )
        self.evaluator = torch.nn.DataParallel(self.evaluator).to(
            self.options.device)
        self.optimizers["D"] = optim.Adam(
            self.evaluator.parameters(),
            lr=self.options.lr,
            betas=(self.options.beta1, 0.999),
        )

    train_loader, val_loader = self.get_loaders()
    self.load_from_checkpoint_if_present()
    self.load_weights_from_given_checkpoint()
    writer = SummaryWriter(self.options.checkpoints_dir)

    # Training engine and handlers
    train_engine = Engine(lambda engine, batch: self.update(batch))
    val_engine = Engine(lambda engine, batch: self.inference(batch))
    # Validation metrics computed on reconstructed vs. ground-truth magnitudes.
    validation_mse = Loss(
        loss_fn=F.mse_loss,
        output_transform=lambda x: (
            x["reconstructed_image_magnitude"],
            x["ground_truth_magnitude"],
        ),
    )
    validation_mse.attach(val_engine, name="mse")
    validation_ssim = Loss(
        loss_fn=util.common.compute_ssims,
        output_transform=lambda x: (
            x["reconstructed_image_magnitude"],
            x["ground_truth_magnitude"],
        ),
    )
    validation_ssim.attach(val_engine, name="ssim")
    if self.options.use_evaluator:
        # Discriminator loss; extra inputs are passed via the kwargs dict.
        validation_loss_d = Loss(
            loss_fn=self.discriminator_loss,
            output_transform=lambda x: (
                x["reconstructor_eval"],
                x["ground_truth_eval"],
                {
                    "reconstructed_image": x["reconstructed_image"],
                    "target": x["ground_truth"],
                    "mask": x["mask"],
                },
            ),
        )
        validation_loss_d.attach(val_engine, name="loss_D")
    progress_bar = ProgressBar()
    progress_bar.attach(train_engine)
    # Run validation and possibly save a "best" checkpoint after every epoch.
    train_engine.add_event_handler(
        Events.EPOCH_COMPLETED,
        run_validation_and_update_best_checkpoint,
        val_engine=val_engine,
        progress_bar=progress_bar,
        val_loader=val_loader,
        trainer=self,
    )

    # Tensorboard Plots
    @train_engine.on(Events.ITERATION_COMPLETED)
    def plot_training_loss(engine):
        writer.add_scalar(
            "training/generator_loss",
            engine.state.output["loss_G"],
            self.updates_performed,
        )
        if "loss_D" in engine.state.output:
            writer.add_scalar(
                "training/discriminator_loss",
                engine.state.output["loss_D"],
                self.updates_performed,
            )

    @train_engine.on(Events.EPOCH_COMPLETED)
    def plot_validation_loss(_):
        writer.add_scalar("validation/MSE", val_engine.state.metrics["mse"],
                          self.completed_epochs)
        writer.add_scalar(
            "validation/SSIM",
            val_engine.state.metrics["ssim"],
            self.completed_epochs,
        )
        if "loss_D" in val_engine.state.metrics:
            writer.add_scalar(
                "validation/loss_D",
                val_engine.state.metrics["loss_D"],
                self.completed_epochs,
            )

    @train_engine.on(Events.EPOCH_COMPLETED)
    def plot_validation_images(_):
        # Pull images from the last validation batch output.
        ground_truth = val_engine.state.output["ground_truth_magnitude"]
        zero_filled_image = val_engine.state.output[
            "zero_filled_image_magnitude"]
        reconstructed_image = val_engine.state.output[
            "reconstructed_image_magnitude"]
        uncertainty_map = val_engine.state.output["uncertainty_map"]
        difference = torch.abs(ground_truth - reconstructed_image)

        # Create plots
        ground_truth = util.common.create_grid_from_tensor(ground_truth)
        writer.add_image("validation_images/ground_truth", ground_truth,
                         self.completed_epochs)

        zero_filled_image = util.common.create_grid_from_tensor(
            zero_filled_image)
        writer.add_image(
            "validation_images/zero_filled_image",
            zero_filled_image,
            self.completed_epochs,
        )

        reconstructed_image = util.common.create_grid_from_tensor(
            reconstructed_image)
        writer.add_image(
            "validation_images/reconstructed_image",
            reconstructed_image,
            self.completed_epochs,
        )

        # Uncertainty is stored in log-space; exp() before rendering.
        uncertainty_map = util.common.gray2heatmap(
            util.common.create_grid_from_tensor(uncertainty_map.exp()),
            cmap="jet",
        )
        writer.add_image(
            "validation_images/uncertainty_map",
            uncertainty_map,
            self.completed_epochs,
        )

        difference = util.common.create_grid_from_tensor(difference)
        difference = util.common.gray2heatmap(difference, cmap="gray")
        writer.add_image("validation_images/difference", difference,
                         self.completed_epochs)

        # Tile the 1-pixel-high mask so it is visible as an image.
        mask = util.common.create_grid_from_tensor(
            val_engine.state.output["mask"].repeat(
                1, 1, val_engine.state.output["mask"].shape[3], 1))
        writer.add_image("validation_images/mask_image", mask,
                         self.completed_epochs)

    train_engine.add_event_handler(
        Events.EPOCH_COMPLETED,
        save_regular_checkpoint,
        trainer=self,
        progress_bar=progress_bar,
    )

    # Resume-aware run: only train the remaining epochs.
    train_engine.run(train_loader,
                     self.options.max_epochs - self.completed_epochs)
    writer.close()
    return self.best_validation_score