def train_loop(configs: dict,
               model: GTransformer,
               opt: torch.optim.Adam,
               train: Dataset,
               test: Dataset,
               text_encoder: WhitespaceEncoder) -> GTransformer:
    """
    Main training loop.

    :param configs: Configs defined on the default.yaml file.
    :param model: Sequence-to-sequence transformer.
    :param opt: Adam optimizer.
    :param train: The dataset used for training.
    :param test: The dataset used for validation.
    :param text_encoder: Torch NLP text encoder for tokenization and vectorization.
    """
    for e in range(configs.get('num_epochs', 8)):
        print(f'\n Epoch {e}')
        model.train()
        nr_batches = math.ceil(len(train) / configs.get('batch_size', 8))
        train_iter, test_iter = get_iterators(configs, train, test)
        total_loss, steps = 0, 0
        for sample in tqdm.tqdm(train_iter, total=nr_batches):
            # 0) Zero out previous grads
            opt.zero_grad()

            # 1) Prepare sample
            src, src_lengths, trg, shifted_trg, trg_lengths = prepare_sample(
                sample, text_encoder)

            # 2) Run model
            lprobs = model(
                src=src.cuda(),
                trg=shifted_trg.cuda(),
                src_mask=lengths_to_mask(src_lengths).unsqueeze(1).cuda(),
                trg_mask=lengths_to_mask(trg_lengths).unsqueeze(1).cuda())

            # 3) Compute loss
            loss = F.nll_loss(lprobs.transpose(2, 1), trg.cuda(), reduction='mean')
            loss.backward()

            # 4) Update training metrics
            total_loss += float(loss.item())
            steps += int(trg.ne(0).sum())

            # 5) Clip gradients
            # - If the total gradient vector has a length > 1, we clip it back down to 1.
            if configs.get('gradient_clipping', -1) > 0.0:
                nn.utils.clip_grad_norm_(model.parameters(),
                                         configs.get('gradient_clipping'))

            # 6) Optim step
            opt.step()

        print(f'-- total train loss {total_loss:.4}')
        total_steps = steps * (e + 1)
        print(f'-- train steps {total_steps}')
        validate(model, test_iter, text_encoder)
    return model

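# The loop above depends on a lengths_to_mask() helper to build padding masks.
# A minimal sketch of such a helper is shown here, assuming it should return a
# (batch, max_len) boolean mask; the project may instead use
# torchnlp.utils.lengths_to_mask directly.
import torch

def lengths_to_mask(lengths: torch.Tensor) -> torch.Tensor:
    # True for positions that hold real tokens, False for padding.
    max_len = int(lengths.max())
    positions = torch.arange(max_len, device=lengths.device)
    return positions.unsqueeze(0) < lengths.unsqueeze(1)
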
def train(model: DepressionDetector,
          optimizer: torch.optim.Adam,
          train_dataset: Dataset,
          batch_size,
          num_epochs,
          writer,
          reduction_loss,
          tensorboard_batch=100,
          _shuffle=True,
          last_epoch_count=0):
    """
    Train a DepressionDetector (LSTM network) object.

    :param model: Model to train.
    :param optimizer: Adam optimizer.
    :param train_dataset: Dataset used for training.
    :param batch_size: Mini-batch size.
    :param num_epochs: Number of training epochs.
    :param writer: TensorBoard SummaryWriter.
    :param reduction_loss: Reduction mode passed to MSELoss.
    :param tensorboard_batch: Log the running batch loss every N batches.
    :param _shuffle: Whether to shuffle the training data.
    :param last_epoch_count: Epoch counter to resume logging from.
    :return: The trained model, the optimizer and the last epoch number.
    """
    loss_fn = torch.nn.MSELoss(reduction=reduction_loss).cuda()
    train_data_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=_shuffle)
    model.train()
    running_loss_epoch = 0.0
    running_loss_batches = 0.0
    last_epoch = last_epoch_count + 1 if last_epoch_count != 0 else last_epoch_count
    for epoch in range(0, num_epochs):
        try:
            for batch, data in enumerate(train_data_loader, 0):
                inputs, labels = data
                if inputs.shape[0] == batch_size:
                    inputs, labels = inputs.cuda(), labels.cuda()
                    out = model(inputs)
                    loss = loss_fn(out.float(), labels.float())
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()
                    running_loss_batches += loss.item()
                    running_loss_epoch += loss.item()
                    print(f'Epoch {epoch + 1} batch {batch + 1} train loss: {loss.item()}')
                    if batch % tensorboard_batch == tensorboard_batch - 1:
                        writer.add_scalar(f'training loss per {tensorboard_batch} batches',
                                          running_loss_batches / tensorboard_batch,
                                          last_epoch * len(train_data_loader) + batch)
                        running_loss_batches = 0.0
            writer.add_scalar('training loss per epoch', running_loss_epoch, last_epoch)
            running_loss_epoch = 0.0
            last_epoch += 1
        except KeyboardInterrupt:
            return model, optimizer, last_epoch
    return model, optimizer, last_epoch

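# A hedged usage sketch for train() above. The DepressionDetector constructor
# and the load_depression_dataset() factory are hypothetical placeholders,
# since only the training loop itself is shown.
import torch
from torch.utils.tensorboard import SummaryWriter

model = DepressionDetector().cuda()                    # hypothetical constructor
train_dataset = load_depression_dataset()              # hypothetical dataset factory
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
writer = SummaryWriter("runs/depression_detector")

model, optimizer, last_epoch = train(
    model, optimizer, train_dataset, batch_size=16, num_epochs=5,
    writer=writer, reduction_loss="mean")

# Because the loop returns the epoch counter (and catches KeyboardInterrupt),
# training can be resumed later with consistent TensorBoard step numbering:
model, optimizer, last_epoch = train(
    model, optimizer, train_dataset, batch_size=16, num_epochs=5,
    writer=writer, reduction_loss="mean", last_epoch_count=last_epoch)
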
def train(model: Model, optimizer: torch.optim.Adam, epoch_num, train_loader,
          test_loader, save_dir_best, save_dir_final, device: torch.device):
    train_losses = []
    best_test_auc = 0.0
    for epoch in tqdm(range(epoch_num)):
        model.train()
        for _, (hist_seq, hist_answers, new_seq, target_answers, _) in tqdm(enumerate(train_loader)):
            hist_seq, hist_answers, new_seq, target_answers = \
                hist_seq.to(device), hist_answers.to(device), new_seq.to(device), target_answers.to(device)

            # * forward pass
            # (batch_size, seq_len - 1, 1)
            pred = model(hist_seq, hist_answers, new_seq)

            # * compute loss
            loss = model.loss(pred, target_answers.float())
            train_losses.append(loss.item())

            # * backward pass & update
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        epoch_loss = np.sum(train_losses) / len(train_losses)

        model.eval()
        test_auc = evaluate(model, test_loader, device)
        print("epoch {}: train_loss: {}, test_auc: {}".format(
            epoch + 1, epoch_loss, test_auc))
        wandb.log({"train_loss": epoch_loss, "test_auc": test_auc})

        if test_auc > best_test_auc:
            best_test_auc = test_auc
            torch.save(model.state_dict(), save_dir_best)
            print("best_auc: {} at epoch {}".format(best_test_auc, epoch + 1))
            wandb.log({"best_auc": best_test_auc})

    print("best_auc: {}".format(best_test_auc))
    torch.save(model.state_dict(), save_dir_final)
    print("done.")

def train_psnr(dataloader: torch.utils.data.DataLoader,
               model: nn.Module,
               criterion: nn.MSELoss,
               optimizer: torch.optim.Adam,
               epoch: int,
               scaler: amp.GradScaler,
               writer: SummaryWriter,
               args: argparse.Namespace):
    batch_time = AverageMeter("Time", ":6.4f")
    losses = AverageMeter("Loss", ":.6f")
    progress = ProgressMeter(num_batches=len(dataloader),
                             meters=[batch_time, losses],
                             prefix=f"Epoch: [{epoch}]")

    # Switch to train mode.
    model.train()

    end = time.time()
    for i, (lr, hr) in enumerate(dataloader):
        # Move data to the configured device.
        if args.gpu is not None:
            lr = lr.cuda(args.gpu, non_blocking=True)
            hr = hr.cuda(args.gpu, non_blocking=True)

        optimizer.zero_grad()

        with amp.autocast():
            sr = model(lr)
            loss = criterion(sr, hr)

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        # Measure elapsed time.
        batch_time.update(time.time() - end)
        end = time.time()

        # Record the loss.
        losses.update(loss.item(), lr.size(0))

        iters = i + epoch * len(dataloader) + 1
        writer.add_scalar("Train/Loss", loss.item(), iters)

        # Output results every 100 batches.
        if i % 100 == 0:
            progress.display(i)

        # Save images every 300 batches.
        if iters % 300 == 0:
            vutils.save_image(hr.detach(), os.path.join("runs", "hr", f"PSNR_{iters}.bmp"))
            vutils.save_image(sr.detach(), os.path.join("runs", "sr", f"PSNR_{iters}.bmp"))

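# A hedged sketch of the driver code train_psnr() expects. Everything here is a
# placeholder: the srresnet() model, the make_train_dataloader() pipeline, and
# the argparse field layout are assumptions, since only the per-epoch training
# function appears above.
import argparse
import torch
from torch.cuda import amp
from torch.utils.tensorboard import SummaryWriter

args = argparse.Namespace(gpu=0)                       # assumed flag layout
model = srresnet().cuda(args.gpu)                      # hypothetical generator
dataloader = make_train_dataloader()                   # hypothetical data pipeline
criterion = torch.nn.MSELoss().cuda(args.gpu)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
scaler = amp.GradScaler()
writer = SummaryWriter("runs/logs")

for epoch in range(100):
    train_psnr(dataloader, model, criterion, optimizer, epoch, scaler, writer, args)
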
def train_loop(configs: dict,
               model: CTransformer,
               opt: torch.optim.Adam,
               train: Dataset,
               test: Dataset,
               text_encoder: WhitespaceEncoder,
               label_encoder: LabelEncoder) -> CTransformer:
    """
    Main training loop.

    :param configs: Configs defined on the default.yaml file.
    :param model: Transformer Classifier.
    :param opt: Adam optimizer.
    :param train: The dataset used for training.
    :param test: The dataset used for validation.
    :param text_encoder: Torch NLP text encoder for tokenization and vectorization.
    :param label_encoder: Torch NLP label encoder for vectorization of the labels.
    """
    seen = 0
    for e in range(configs.get('num_epochs', 8)):
        print(f'\n Epoch {e}')
        model.train()
        nr_batches = math.ceil(len(train) / configs.get('batch_size', 8))
        train_iter, test_iter = get_iterators(configs, train, test)
        for sample in tqdm.tqdm(train_iter, total=nr_batches):
            # 0) Zero out previous grads
            opt.zero_grad()

            # 1) Prepare sample
            input_seqs, input_mask, targets = prepare_sample(
                sample, text_encoder, label_encoder, configs.get('max_length', 256))

            # 2) Run model
            out = model(input_seqs.cuda(), input_mask.cuda())

            # 3) Compute loss
            loss = F.nll_loss(out, targets.cuda())
            loss.backward()

            # 4) Clip gradients
            # - If the total gradient vector has a length > 1, we clip it back down to 1.
            if configs.get('gradient_clipping', -1) > 0.0:
                nn.utils.clip_grad_norm_(model.parameters(),
                                         configs.get('gradient_clipping'))

            # 5) Optim step
            opt.step()

            # 6) Update the number of seen examples.
            seen += input_seqs.size(0)

        validate(model, text_encoder, label_encoder,
                 configs.get('max_length', 256), test_iter)
    return model

def train(net: Network, optimizer: torch.optim.Adam,
          train_loader: torch.utils.data.DataLoader, epoch: int):
    net.train()
    for batch_idx, (x, y) in enumerate(train_loader):
        optimizer.zero_grad()
        output = net(x.view(-1, 28 * 28).to(net.device))
        loss = F.nll_loss(output, y.to(net.device))
        loss.backward()
        optimizer.step()
        if batch_idx % 100 == 0:
            print(
                f"Train Epoch: {epoch}, Step: {batch_idx * len(x)}/{len(train_loader.dataset)}, Loss: {loss.item()}"
            )

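# A minimal usage sketch for the train() function above. The Network class, its
# .device attribute, and the 28x28 MNIST-style inputs are assumptions taken
# from the function body; the random tensors below are placeholders only.
import torch
from torch.utils.data import DataLoader, TensorDataset

xs = torch.randn(256, 1, 28, 28)                 # matches the view(-1, 28 * 28) call
ys = torch.randint(0, 10, (256,))
train_loader = DataLoader(TensorDataset(xs, ys), batch_size=32, shuffle=True)

net = Network()                                  # assumed to expose a .device attribute
optimizer = torch.optim.Adam(net.parameters(), lr=1e-3)

for epoch in range(1, 4):
    train(net, optimizer, train_loader, epoch)
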
def train_segments(self, model, l_loss, m_loss, optimizer: torch.optim.Adam, train_set):
    model.train(mode=True)
    accuracy_classification_sum = 0
    loss_m_sum = 0
    loss_l1_sum = 0
    loss_classification_sum = 0
    batch_count = 0
    for images, segments, labels in train_set:
        labels, segments = model_utils.reduce_to_class_number(
            self.left_class_number, self.right_class_number, labels, segments)
        images, labels, segments = self.convert_data_and_label(images, labels, segments)
        segments = self.puller(segments)

        optimizer.zero_grad()
        model_classification, model_segmentation = model_utils.wait_while_can_execute(
            model, images)

        classification_loss = l_loss(model_classification, labels)
        segmentation_loss = m_loss(model_segmentation, segments)

        # torch.cuda.empty_cache()
        # Only the segmentation loss is backpropagated here; the classification
        # loss is accumulated for reporting only.
        segmentation_loss.backward()
        optimizer.step()

        output_probability, output_cl, cl_acc = self.calculate_accuracy(
            labels, model_classification, labels.size(0))
        self.save_train_data(labels, output_cl, output_probability)

        # Accumulate metrics.
        accuracy_classification_sum += model_utils.scalar(cl_acc.sum())
        loss_m_sum += model_utils.scalar(segmentation_loss.sum())
        loss_l1_sum += 0
        loss_classification_sum += model_utils.scalar(classification_loss.sum())
        batch_count += 1
        # self.de_convert_data_and_label(images, labels, segments)
        # torch.cuda.empty_cache()

    model.train(mode=False)
    return (accuracy_classification_sum / (batch_count + p.EPS),
            loss_m_sum / (batch_count + p.EPS),
            loss_l1_sum / (batch_count + p.EPS),
            loss_classification_sum / (batch_count + p.EPS))

def _train_step(
    batch_x: torch.Tensor,
    batch_y: torch.Tensor,
    cavity_model_net: CavityModel,
    optimizer: torch.optim.Adam,
    loss_function: torch.nn.CrossEntropyLoss,
) -> (torch.Tensor, float):
    """
    Helper function to take a training step
    """
    cavity_model_net.train()
    optimizer.zero_grad()
    batch_y_pred = cavity_model_net(batch_x)
    loss_batch = loss_function(batch_y_pred, torch.argmax(batch_y, dim=-1))
    loss_batch.backward()
    optimizer.step()
    return (batch_y_pred, loss_batch.detach().cpu().item())

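# Hedged sketch of how _train_step() might be driven over a DataLoader. The
# run_training_epoch() wrapper and the one-hot layout of batch_y are
# assumptions, not taken from the snippet above.
def run_training_epoch(loader, cavity_model_net, optimizer, loss_function):
    epoch_loss = 0.0
    for batch_x, batch_y in loader:
        # _train_step returns (predictions, scalar loss); only the loss is kept here.
        _, loss_value = _train_step(batch_x, batch_y, cavity_model_net,
                                    optimizer, loss_function)
        epoch_loss += loss_value
    return epoch_loss / max(len(loader), 1)
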
def train_epoch(model: nn.Module, train_loader: DataLoader,
                criterion: nn.CrossEntropyLoss, optimizer: torch.optim.Adam,
                device: torch.device, ration):
    epoch_loss = 0.0
    model.train()
    for data in train_loader:
        optimizer.zero_grad()
        prediction, target = model(data, device=device, ration=ration)
        loss = criterion(prediction, target)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / len(train_loader.dataset)

def train_loop(num_of_epoch: int, input_data: torch.autograd.Variable,
               ground_truth: torch.autograd.Variable,
               optimizer: torch.optim.Adam, model: torch.nn.Sequential):
    """A simple train loop.

    Args:
        num_of_epoch (int): Number of epochs.
        input_data (torch.autograd.Variable): Input data.
        ground_truth (torch.autograd.Variable): Ground truth.
        optimizer (torch.optim.Adam): Adam optimizer.
        model (torch.nn.Sequential): Neural network model.
    """
    loss_fn = torch.nn.MSELoss(reduction='sum')
    for t in range(num_of_epoch):
        output_pred = model(input_data)
        loss = loss_fn(output_pred, ground_truth)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

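# Example call for the train_loop() above. The model architecture and tensor
# shapes are placeholders chosen for illustration only.
import torch

model = torch.nn.Sequential(
    torch.nn.Linear(10, 32),
    torch.nn.ReLU(),
    torch.nn.Linear(32, 1),
)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

input_data = torch.randn(64, 10)     # torch.autograd.Variable is a legacy alias;
ground_truth = torch.randn(64, 1)    # plain tensors work the same way here.

train_loop(num_of_epoch=100, input_data=input_data, ground_truth=ground_truth,
           optimizer=optimizer, model=model)
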
def model_train(model: nn.Module,
                train_loader: DataLoader,
                optimizer: torch.optim.Adam,
                num_epochs: int,
                loss_function: Callable[
                    [torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor],
                    float],
                device: str) -> List[float]:
    '''
    Function for training a given input model.

    Parameters
    ----------
    model : nn.Module
        Model, i.e. a variational autoencoder, which needs to be trained.
    train_loader : DataLoader
        DataLoader of the custom training set, providing training utilities
        such as mini-batches and shuffling.
    optimizer : torch.optim.Adam
        Adam optimizer for the recalculation of the neural network weights by
        minimizing the calculated loss.
    num_epochs : int
        Number of training epochs of the model.
    loss_function : Callable[[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor], float]
        Custom loss function for error calculation.
    device : str
        Device on which the computation is performed; typically either cpu or
        cuda (gpu).

    Returns
    -------
    running_rec_loss : list
        List of the average loss values at the end of each epoch.
    '''
    running_rec_loss = []
    loss = 0
    model.train()
    tqdm_bar = tqdm(range(1, num_epochs + 1), desc="epoch [loss: ...]")
    for epoch in tqdm_bar:
        train_loss_averager = make_averager()
        batch_bar = tqdm(train_loader, leave=False, desc='batch',
                         total=len(train_loader))
        for batch in batch_bar:
            batch = batch.float()
            batch = batch.to(device)
            batch_reconstructed, latent_mu, latent_logvar = model(batch)
            loss = loss_function(batch_reconstructed, batch, latent_mu, latent_logvar)

            # Backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            refresh_bar(
                batch_bar,
                f"train batch [loss: {train_loss_averager(loss.item()):.3f}]")

        refresh_bar(tqdm_bar, f"epoch [loss: {train_loss_averager(None):.3f}]")
        running_rec_loss.append(train_loss_averager(None))
    return running_rec_loss

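# A plausible loss_function to pass to model_train() above: reconstruction
# error plus the KL term of a variational autoencoder. This is a common
# formulation assumed here, not taken from the snippet; the actual project may
# weight or define the terms differently.
import torch
import torch.nn.functional as F

def vae_loss(batch_reconstructed: torch.Tensor,
             batch: torch.Tensor,
             latent_mu: torch.Tensor,
             latent_logvar: torch.Tensor) -> torch.Tensor:
    # Reconstruction term (summed MSE over the batch).
    rec_loss = F.mse_loss(batch_reconstructed, batch, reduction="sum")
    # KL divergence between q(z|x) = N(mu, sigma^2) and the standard normal prior.
    kl = -0.5 * torch.sum(1 + latent_logvar - latent_mu.pow(2) - latent_logvar.exp())
    return rec_loss + kl
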
def train_gan(dataloader: torch.utils.data.DataLoader,
              discriminator: nn.Module,
              discriminator_optimizer: torch.optim.Adam,
              generator: nn.Module,
              generator_optimizer: torch.optim.Adam,
              pixel_criterion: nn.L1Loss,
              content_criterion: VGGLoss,
              adversarial_criterion: nn.BCEWithLogitsLoss,
              epoch: int,
              scaler: amp.GradScaler,
              writer: SummaryWriter,
              args: argparse.Namespace):
    batch_time = AverageMeter("Time", ":.4f")
    d_losses = AverageMeter("D Loss", ":.6f")
    g_losses = AverageMeter("G Loss", ":.6f")
    pixel_losses = AverageMeter("Pixel Loss", ":6.4f")
    content_losses = AverageMeter("Content Loss", ":6.4f")
    adversarial_losses = AverageMeter("Adversarial Loss", ":6.4f")
    progress = ProgressMeter(num_batches=len(dataloader),
                             meters=[
                                 batch_time, d_losses, g_losses, pixel_losses,
                                 content_losses, adversarial_losses
                             ],
                             prefix=f"Epoch: [{epoch}]")

    # Switch to train mode.
    discriminator.train()
    generator.train()

    end = time.time()
    for i, (lr, hr) in enumerate(dataloader):
        # Move data to the configured device.
        if args.gpu is not None:
            lr = lr.cuda(args.gpu, non_blocking=True)
            hr = hr.cuda(args.gpu, non_blocking=True)
        batch_size = lr.size(0)

        # The real sample label is 1, and the generated sample label is 0.
        real_label = torch.full((batch_size, 1), 1, dtype=lr.dtype).cuda(args.gpu, non_blocking=True)
        fake_label = torch.full((batch_size, 1), 0, dtype=lr.dtype).cuda(args.gpu, non_blocking=True)

        ##############################################
        # (1) Update the D network: real samples should score above the mean
        #     fake score, and fake samples below the mean real score
        #     (relativistic average GAN).
        ##############################################
        discriminator_optimizer.zero_grad()

        with amp.autocast():
            sr = generator(lr)

            # It makes the discriminator distinguish between real sample and fake sample.
            real_output = discriminator(hr)
            fake_output = discriminator(sr.detach())

            # Adversarial loss for real and fake images (relativistic average GAN).
            d_loss_real = adversarial_criterion(
                real_output - torch.mean(fake_output), real_label)
            d_loss_fake = adversarial_criterion(
                fake_output - torch.mean(real_output), fake_label)

            # Count all discriminator losses.
            d_loss = (d_loss_real + d_loss_fake) / 2

        scaler.scale(d_loss).backward()
        scaler.step(discriminator_optimizer)
        scaler.update()

        ##############################################
        # (2) Update the G network: pixel loss + VGG content loss +
        #     relativistic adversarial loss against the real samples.
        ##############################################
        generator_optimizer.zero_grad()

        with amp.autocast():
            sr = generator(lr)

            # It makes the discriminator unable to distinguish the real samples and fake samples.
            real_output = discriminator(hr.detach())
            fake_output = discriminator(sr)

            # Calculate the absolute value of pixels with L1 loss.
            pixel_loss = pixel_criterion(sr, hr.detach())
            # The 35th layer in VGG19 is used as the feature extractor by default.
            content_loss = content_criterion(sr, hr.detach())
            # Adversarial loss for real and fake images (relativistic average GAN).
            adversarial_loss = adversarial_criterion(
                fake_output - torch.mean(real_output), real_label)

            # Count all generator losses.
            g_loss = 0.01 * pixel_loss + 1 * content_loss + 0.005 * adversarial_loss

        scaler.scale(g_loss).backward()
        scaler.step(generator_optimizer)
        scaler.update()

        # Set generator gradients to zero.
        generator.zero_grad()

        # Measure elapsed time.
        batch_time.update(time.time() - end)
        end = time.time()

        # Record the losses.
        d_losses.update(d_loss.item(), lr.size(0))
        g_losses.update(g_loss.item(), lr.size(0))
        pixel_losses.update(pixel_loss.item(), lr.size(0))
        content_losses.update(content_loss.item(), lr.size(0))
        adversarial_losses.update(adversarial_loss.item(), lr.size(0))

        iters = i + epoch * len(dataloader) + 1
        writer.add_scalar("Train/D Loss", d_loss.item(), iters)
        writer.add_scalar("Train/G Loss", g_loss.item(), iters)
        writer.add_scalar("Train/Pixel Loss", pixel_loss.item(), iters)
        writer.add_scalar("Train/Content Loss", content_loss.item(), iters)
        writer.add_scalar("Train/Adversarial Loss", adversarial_loss.item(), iters)

        # Output results every 100 batches.
        if i % 100 == 0:
            progress.display(i)

        # Save images every 300 batches.
        if iters % 300 == 0:
            vutils.save_image(hr.detach(), os.path.join("runs", "hr", f"GAN_{iters}.bmp"))
            vutils.save_image(sr.detach(), os.path.join("runs", "sr", f"GAN_{iters}.bmp"))

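# Isolated sketch of the relativistic average GAN discriminator loss used in
# train_gan() above, shown on random placeholder logits for clarity.
import torch
import torch.nn as nn

adversarial_criterion = nn.BCEWithLogitsLoss()
real_output = torch.randn(8, 1)   # D(hr) logits
fake_output = torch.randn(8, 1)   # D(sr) logits
real_label = torch.ones(8, 1)
fake_label = torch.zeros(8, 1)

# Real images should score above the average fake score, and vice versa.
d_loss_real = adversarial_criterion(real_output - fake_output.mean(), real_label)
d_loss_fake = adversarial_criterion(fake_output - real_output.mean(), fake_label)
d_loss = (d_loss_real + d_loss_fake) / 2
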
def train(
    # train_config: TrainingConfiguration, model: nn.Module, optimizer: torch.optim.Optimizer,
    train_config: TrainingConfiguration,
    model: nn.Module,
    optimizer: torch.optim.Adam,
    train_loader: torch.utils.data.DataLoader,
    epoch_idx: int
) -> Tuple[float, float]:
    # Put the model in training mode.
    model.train()

    # Per-batch loss and accuracy.
    batch_loss = np.array([])
    batch_acc = np.array([])

    for batch_idx, (data, target) in enumerate(train_loader):
        # Clone the target indices.
        indx_target = target.clone()
        # Send data to the device (mandatory if a GPU is to be used).
        data = data.to(train_config.device)
        # Send the target to the device.
        target = target.to(train_config.device)

        # Reset parameter gradients to zero.
        optimizer.zero_grad()
        # Forward pass through the model.
        output = model(data)
        # Cross-entropy loss.
        loss = F.cross_entropy(output, target)
        # Compute gradients w.r.t. the training parameters.
        loss.backward()
        # Update parameters using the gradients.
        optimizer.step()

        batch_loss = np.append(batch_loss, [loss.item()])

        # Scores to probabilities using softmax.
        prob = F.softmax(output, dim=1)
        # Get the index of the max probability.
        pred = prob.data.max(dim=1)[1]
        # Count correct predictions.
        correct = pred.cpu().eq(indx_target).sum()
        # Accuracy.
        acc = float(correct) / float(len(data))
        batch_acc = np.append(batch_acc, [acc])

        if batch_idx % train_config.log_interval == 0 and batch_idx > 0:
            print(
                'Train Epoch: {} [{}/{}] Loss: {:.6f} Acc: {:.4f}'.format(
                    epoch_idx, batch_idx * len(data), len(train_loader.dataset),
                    loss.item(), acc
                )
            )

    epoch_loss = batch_loss.mean()
    epoch_acc = batch_acc.mean()
    return epoch_loss, epoch_acc

def fit(self, train_dataloader: DataLoader, train_len: int, epochs: int,
        criterion: nn.CrossEntropyLoss, optimizer: torch.optim.Adam,
        verbose=True, device="cuda", test_dataloader=None, test_len=None,
        use_nni=False, save_checkpoints=False,
        model_save_threshold=0.85) -> dict:
    self.train()
    results = dict()
    results["train_acc"] = list()
    results["train_loss"] = list()
    results["train_precision"] = list()
    results["train_recall"] = list()
    results["train_f1"] = list()
    if test_dataloader is not None:
        results["test_acc"] = list()
        results["test_loss"] = list()
        results["test_precision"] = list()
        results["test_recall"] = list()
        results["test_f1"] = list()
    self.to(device)

    if verbose:
        print("starting training...")

    for epoch in tqdm.tqdm(range(epochs)):  # loop over the dataset multiple times
        running_loss = 0.0
        for i, data in enumerate(tqdm.tqdm(train_dataloader)):  # mini-batch
            self.train()
            inputs, mask, target_mask, labels = data
            outputs = self(inputs, mask, target_mask)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            # print statistics
            # running_loss += loss.item()

        y_real, y_pred = calc_performance(self, train_dataloader)
        acc, precision, recall, f1 = calc_classification_metrics(
            y_true=y_real, y_pred=y_pred)
        if verbose:
            print(
                f'\tEp #{epoch} | Train. Loss: {loss:.3f} | Acc: {acc * 100:.2f}% | '
                f'Precision: {precision * 100:.2f}% | Recall: {recall * 100:.2f}% | '
                f'F1: {f1 * 100:.2f}%'
            )
        results["train_acc"].append(acc)
        results["train_loss"].append(loss)
        results["train_precision"].append(precision)
        results["train_recall"].append(recall)
        results["train_f1"].append(f1)

        if test_dataloader is not None:
            y_real, y_pred = calc_performance(self, test_dataloader)
            test_acc, test_precision, test_recall, test_f1 = calc_classification_metrics(
                y_true=y_real, y_pred=y_pred)
            if verbose:
                print(
                    f'\tEp #{epoch} | Dev. Acc: {test_acc * 100:.2f}% | '
                    f'Precision: {test_precision * 100:.2f}% | Recall: {test_recall * 100:.2f}% | '
                    f'F1: {test_f1 * 100:.2f}%'
                )
            results["test_acc"].append(test_acc)
            results["test_loss"].append(loss)
            results["test_precision"].append(test_precision)
            results["test_recall"].append(test_recall)
            results["test_f1"].append(test_f1)

            if save_checkpoints and model_save_threshold <= acc:
                model_name = self.generate_model_save_name(acc)
                model_path = os.path.join("models", model_name)
                torch.save(self, model_path)

            if use_nni:
                nni.report_intermediate_result({
                    "train_acc": acc,
                    "train_f1": f1,
                    "default": test_acc,
                    "test_f1": test_f1
                })

    if verbose:
        print('Finished Training')
    return results