from typing import List

import numpy as np
import torch
from sklearn.utils import shuffle  # assumed source of the paired shuffle below
from tqdm import tqdm


def train_epoch(
    clf: torch.nn.Module,
    optimizer: torch.optim.Optimizer,
    loss_function: torch.nn.Module,
    words_train: List[List[str]],
    y_train: List[int],
    sequence_limit: int = 32,
    batch_size: int = 32,
    device: str = "cpu",
) -> List[float]:
    clf.train()
    N = len(words_train)
    X, y = shuffle(words_train, y_train)
    epoch_pred = []
    losses = []
    with tqdm(range(0, N, batch_size)) as progress:
        for start in progress:
            end = min(start + batch_size, N)
            # Truncate each token sequence and build the label tensor.
            X_batch = [x[:sequence_limit] for x in X[start:end]]
            y_batch = torch.tensor(y[start:end], dtype=torch.long).to(device)
            clf.zero_grad()
            y_scores = clf(X_batch)
            loss = loss_function(y_scores, y_batch)
            loss.backward()
            optimizer.step()
            # Record binary predictions (class 1 iff its score exceeds class 0's)
            # and show a loss smoothed over the last 10 batches.
            epoch_pred.extend(((y_scores[:, 1] - y_scores[:, 0]) > 0).tolist())
            losses.append(loss.item())
            progress.set_description(
                "Train Loss: {:.03}".format(np.mean(losses[-10:])))
    return losses
def train():
    for epoch in range(epochs):
        ts = time.time()
        print(epoch)
        for it, (X, tar, Y) in enumerate(train_loader):
            optimizer.zero_grad()
            inputs = X.cuda()   # or X.to(computing_device)
            labels = Y.cuda()   # or Y.to(computing_device)
            outputs = resnet_model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            # Early-stop condition kept for quick smoke tests.
            if it > 5:
                break
            if it % 10 == 0:
                print("epoch {}, iter {}, loss: {}".format(epoch, it, loss.item()))
        print("Finish epoch {}, time elapsed {}".format(epoch, time.time() - ts))
        # torch.save(resnet_model, 'best_model')
        # val(epoch)
        resnet_model.train()
def test(net: nn.Module, loss_fn: nn.Module, x_test: torch.Tensor,
         y_test: torch.Tensor) -> Tuple[float, torch.Tensor]:
    """
    Run the model on x_test and compute the loss of the predictions.

    The model runs in evaluation mode and without building the
    computational graph (torch.no_grad).
    """
    net.eval()
    with torch.no_grad():
        y_test_pred = net(x_test.float())
        loss = loss_fn(input=y_test_pred.reshape(-1), target=y_test.float())
        test_loss = loss.item()
    return test_loss, y_test_pred
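# Example usage: a minimal sketch with a toy regression net and random
# tensors (all names here are hypothetical, not from the original code).
net = nn.Sequential(nn.Linear(10, 1))
x_test = torch.randn(128, 10)
y_test = torch.randn(128)
test_loss, y_test_pred = test(net, nn.MSELoss(), x_test, y_test)
print(f"test loss: {test_loss:.4f}")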
def train_val(loader=None, model=None, loss_function=None, optimizer=None,
              train_enable=False, device=None, model_classifier=None,
              model_id=None):
    sum_loss = 0.0
    sum_mse = 0.0
    sum_mae = 0.0
    sum_psnr = 0.0
    sum_ssim = 0.0
    if train_enable:
        model.train()
    else:
        model.eval()  # default: closes Dropout
    for img_NAC, img_AC, _ in loader:
        img_NAC = img_NAC.float().to(device)
        img_AC = img_AC.float().to(device)
        cam = get_grad_cam(model_classifier, img_NAC)
        pred = process_cam(cam, model_id, model, img_NAC, device)
        loss = loss_function(pred, img_AC)  # loss is just MSE
        mse, mae, psnr, ssim = matrics(img_AC, pred)
        if train_enable:
            optimizer.zero_grad()
            loss.backward()  # back-propagation
            optimizer.step()
        sum_loss += float(loss.item())
        sum_mse += float(mse.item())
        sum_mae += float(mae.item())
        sum_psnr += float(psnr)
        sum_ssim += float(ssim)
    epoch_loss = sum_loss / len(loader)
    epoch_mse = sum_mse / len(loader)
    epoch_mae = sum_mae / len(loader)
    epoch_psnr = sum_psnr / len(loader)
    epoch_ssim = sum_ssim / len(loader)
    return epoch_loss, epoch_mse, epoch_mae, epoch_psnr, epoch_ssim
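# A minimal sketch of the `matrics` helper called above. Its name and the
# (mse, mae, psnr, ssim) return order come from the call site; the
# implementation is an assumption. PSNR is derived from the MSE, and SSIM is
# delegated to scikit-image, assuming single-channel images of shape (N, 1, H, W).
import math

import torch
import torch.nn.functional as F
from skimage.metrics import structural_similarity


def matrics(img_true: torch.Tensor, img_pred: torch.Tensor):
    """Return (mse, mae, psnr, ssim) for a batch of images."""
    mse = F.mse_loss(img_pred, img_true)
    mae = F.l1_loss(img_pred, img_true)
    data_range = float(img_true.max() - img_true.min()) or 1.0
    psnr = 10.0 * math.log10(data_range ** 2 / (mse.item() + 1e-12))
    # scikit-image works on numpy arrays; average SSIM over the batch.
    true_np = img_true.detach().cpu().numpy()
    pred_np = img_pred.detach().cpu().numpy()
    ssim = sum(
        structural_similarity(t.squeeze(), p.squeeze(), data_range=data_range)
        for t, p in zip(true_np, pred_np)
    ) / len(true_np)
    return mse, mae, psnr, ssim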
def train(args):
    transformer = T.Compose([
        T.ToTensor(),
        # MNIST statistics: mean 0.1307, std 0.3081.
        T.Normalize((0.1307,), (0.3081,)),
    ])
    train_data = torchvision.datasets.MNIST(root=args.data_path, transform=transformer,
                                            download=True, train=True)
    train_loader = torch.utils.data.DataLoader(train_data, batch_size=args.batch_size,
                                               shuffle=True, drop_last=True, num_workers=4)
    model_arg = model_dict[args.model][1]
    model_arg["act"] = act_dict[args.act]
    device = torch.device(args.device)
    net = model_dict[args.model][0](**model_arg).to(device)
    if args.optimizer == 'adam':
        optimizer = torch.optim.Adam(net.parameters(), lr=args.lr, betas=(0.9, 0.99))
    elif args.optimizer == 'SGD':
        optimizer = torch.optim.SGD(net.parameters(), lr=args.lr, momentum=0.9)
    else:
        raise ValueError(f"unknown optimizer: {args.optimizer}")
    loss_func = loss_dict[args.loss_func]()
    writer = tensorboardX.SummaryWriter()
    current_acc = 0
    for epoch in range(args.epoch):
        total_loss = 0.0
        total_acc = 0.0
        for i, (images, labels) in enumerate(train_loader):
            images, labels = images.to(device), labels.to(device)
            outputs = net(images)
            loss = loss_func(outputs, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            acc = torch.sum(outputs.argmax(-1) == labels).item()
            total_acc += acc / args.batch_size
            writer.add_scalar('data/loss', loss.item(), i + epoch * len(train_loader))
        print("epoch%3d: loss=%.4f, acc=%.2f%%"
              % (epoch, total_loss / len(train_loader), total_acc * 100 / len(train_loader)))
        # Evaluate after every epoch and keep the best checkpoint.
        eval_acc = eval(args, net)
        writer.add_scalar('data/acc', eval_acc, epoch)
        if eval_acc > current_acc:
            current_acc = eval_acc
            torch.save(net.state_dict(),
                       '%s/best_%s_model.pth' % (args.checkpoints_path, args.model))
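# A minimal sketch of the `eval` helper called above, under the assumption
# that it mirrors the training setup: run the net over the MNIST test split
# and return top-1 accuracy. The names `args.data_path`, `args.device`, and
# `args.batch_size` follow the training function; everything else is assumed.
def eval(args, net):
    transformer = T.Compose([T.ToTensor(), T.Normalize((0.1307,), (0.3081,))])
    test_data = torchvision.datasets.MNIST(root=args.data_path, transform=transformer,
                                           download=True, train=False)
    test_loader = torch.utils.data.DataLoader(test_data, batch_size=args.batch_size)
    device = torch.device(args.device)
    net.eval()
    correct = total = 0
    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = net(images)
            correct += (outputs.argmax(-1) == labels).sum().item()
            total += labels.size(0)
    net.train()
    return correct / total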
def trainloop(self, n_epochs):
    for epoch in range(1, n_epochs + 1):
        self.evaluate(mask_data.data, mask_data.label)
        loss_train = 0.0
        for inputs, targets in self.dataloader:
            predictout = self.network(inputs)
            loss = self.loss_fn(predictout, targets)
            self.optim.zero_grad()
            loss.backward()
            self.optim.step()
            loss_train += loss.item()
        # if epoch == 1 or epoch % 100 == 0:
        print(
            f'{datetime.datetime.now()} epoch {epoch} '
            f'training loss {loss_train / len(self.dataloader)}'
        )
def validate(self):
    """
    Validation cycle. Performed over a custom dataset.
    """
    print("Validation")
    self._net.eval()
    val_gen = self._dispatcher.val_gen()
    relative_error_list = []
    for sample_idx, sample in enumerate(val_gen):
        if self.use_gpu:
            sample.cuda()
        sample.batchify()
        pred = self._net.forward(sample.image_tensor)
        loss, details = self._net.loss(pred, sample.segmentation_tensor)
        pred_area = self._dispatcher.decode_prediction(pred)
        anno_hw = sample.anno_hw
        gt_area = anno_hw[0] * anno_hw[1]
        relative_error = abs(pred_area - gt_area) / gt_area
        relative_error_list.append(relative_error)
        if sample_idx % 20 == 0:
            print("loss={:.4f} gt_area={} pred_area={}".format(
                loss.item(), gt_area, pred_area))
            self._render_prediction(
                pred.detach().cpu().numpy()[0],
                None,
                sample.image_tensor.detach().cpu().numpy()[0].transpose(
                    (1, 2, 0)))
    average_relative_error = np.mean(relative_error_list)
    print("-------- Final metric -----------")
    print("Average relative area error = {:0.6f}".format(
        average_relative_error))
def train(self):
    """
    Perform training of the network.
    """
    num_epochs = 50
    batch_size = 16
    batches_per_epoch = 1024
    learning_rate = 0.02
    optimizer = torch.optim.SGD(self._net.parameters(), lr=learning_rate)
    scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, [40, 45], gamma=0.1, last_epoch=-1)
    training_start_time = time.time()
    self.validate()
    for epoch in range(num_epochs):
        print("Epoch ------ ", epoch)
        train_gen = self._dispatcher.train_gen(batches_per_epoch, batch_size)
        self._net.train()
        for batch_index, batch in enumerate(train_gen):
            if self.use_gpu:
                batch.cuda()
            pred = self._net.forward(batch.image_tensor)
            loss, details = self._net.loss(pred, batch.segmentation_tensor)
            if batch_index % 50 == 0:
                print("epoch={} batch={} loss={:.4f}".format(
                    epoch, batch_index, loss.item()))
                self._render_prediction(
                    pred.detach().cpu().numpy()[0],
                    batch.segmentation_tensor.detach().cpu().numpy()[0],
                    batch.image_tensor.detach().cpu().numpy()[0].transpose(
                        (1, 2, 0)))
                print("-------------------------------")
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        scheduler.step()
        # Save and validate after every epoch.
        torch.save(self._net.state_dict(), self._snapshot_name)
        self.validate()
    training_end_time = time.time()
    print("Training took {} hours".format(
        (training_end_time - training_start_time) / 3600))
    print("Train finished!")
def _train(
    self,
    train_data,
    epoch,
    val_data=None,
    val_step=None,
    ckpt_step=None,
):
    """Helper method, called by the fit method on each epoch.

    Iterates once through train_data, using it to update model parameters.
    Override this method if you need to implement your own training method.

    Parameters
    ----------
    train_data : torch.utils.data.DataLoader
        instance that will be iterated over.
    epoch : int
        current epoch number, used in progress and log messages.
    val_data : torch.utils.data.DataLoader, optional
        validation set; when given, metrics are computed every val_step steps.
    val_step : int, optional
        number of global steps between validation passes.
    ckpt_step : int, optional
        number of global steps between checkpoint saves.
    """
    self.network.train()
    progress_bar = tqdm(train_data)
    for ind, batch in enumerate(progress_bar):
        x, y = batch[0].to(self.device), batch[1].to(self.device)
        y_pred = self.network.forward(x)
        self.optimizer.zero_grad()
        loss = self.loss(y_pred, y)
        loss.backward()
        self.optimizer.step()
        progress_bar.set_description(
            f'Epoch {epoch}, batch {ind}. Loss: {loss.item():.4f}. '
            f'Global step: {self.global_step}'
        )
        if self.summary_writer is not None:
            self.summary_writer.add_scalar('loss/train', loss.item(),
                                           self.global_step)
        self.global_step += 1

        if val_data is not None and val_step and self.global_step % val_step == 0:
            log_or_print(
                f'Step {self.global_step} is a validation step; '
                'computing metrics on validation set',
                logger=self.logger, level='info')
            metric_vals = self._eval(val_data)
            self.network.train()  # because _eval calls network.eval()
            log_or_print(
                msg=', '.join(
                    f'{metric_name}: {metric_value:.4f}'
                    for metric_name, metric_value in metric_vals.items()
                    if metric_name.startswith('avg_')),
                logger=self.logger, level='info')
            if self.summary_writer is not None:
                for metric_name, metric_value in metric_vals.items():
                    if metric_name.startswith('avg_'):
                        self.summary_writer.add_scalar(
                            f'{metric_name}/val', metric_value,
                            self.global_step)
            current_val_acc = metric_vals['avg_acc']
            if current_val_acc > self.max_val_acc:
                self.max_val_acc = current_val_acc
                log_or_print(
                    msg='Accuracy on validation set improved. '
                        'Saving max-val-acc checkpoint.',
                    logger=self.logger, level='info')
                self.save(self.max_val_acc_ckpt_path, epoch=epoch,
                          global_step=self.global_step)
                if self.patience:
                    self.patience_counter = 0
            else:  # accuracy did not improve
                if self.patience:
                    self.patience_counter += 1
                    if self.patience_counter > self.patience:
                        log_or_print(
                            'Stopping training early, accuracy has not '
                            f'improved in {self.patience} validation steps.',
                            logger=self.logger, level='info')
                        # Save a "backup" checkpoint upon stopping;
                        # don't save over the "max-val-acc" checkpoint.
                        self.save(self.ckpt_path, epoch=epoch,
                                  global_step=self.global_step)
                        progress_bar.close()
                        break
                    else:
                        log_or_print(
                            f'Accuracy has not improved in {self.patience_counter} '
                            'validation steps. Not saving max-val-acc checkpoint '
                            'for this validation step.',
                            logger=self.logger, level='info')
                else:
                    # patience is None; still log that we are not saving a checkpoint.
                    log_or_print(
                        'Accuracy is less than maximum validation accuracy '
                        'so far. Not saving max-val-acc checkpoint.',
                        logger=self.logger, level='info')

        # Can be true regardless of whether we have val_data
        # and/or the current step is a validation step.
        if ckpt_step and self.global_step % ckpt_step == 0:
            log_or_print(f'Step {self.global_step} is a checkpoint step.',
                         logger=self.logger, level='info')
            self.save(self.ckpt_path, epoch=epoch,
                      global_step=self.global_step)
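# A minimal sketch of the `log_or_print` helper used throughout `_train`
# (an assumption inferred from the call sites): route a message to a logger
# when one is configured, otherwise fall back to print.
def log_or_print(msg, logger=None, level='info'):
    if logger is not None:
        getattr(logger, level)(msg)
    else:
        print(msg)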
def train(net: nn.Module,
          train_dataloader: DataLoader = None,
          val_dataloader: DataLoader = None,
          test_dataloader: DataLoader = None,
          is_earlystopping: bool = True) -> nn.Module:
    """
    Training loop iterating over the train dataloader and updating the model's weights.
    Runs inference on the validation & test dataloaders, if given, to babysit the learning.
    Activates the cuda device if available.
    :return: Trained model
    """
    train_losses: np.ndarray = np.zeros(NUM_EPOCHS)
    val_losses: np.ndarray = np.zeros(NUM_EPOCHS)
    best_epoch: int = NUM_EPOCHS - 1

    if test_dataloader:
        untrained_test_loss, untrained_y_test_pred = infer(
            net, test_dataloader, loss_fn)
        _, _ = get_num_of_areas_and_targets_from_arary(array=y_test)
        print(f'Test Loss before training: {untrained_test_loss:.3f}')
        _, _, _ = calculate_model_metrics(y_true=y_test,
                                          y_pred=untrained_y_test_pred,
                                          verbose=True)

    for epoch in range(NUM_EPOCHS):
        print(f'*************** Epoch {epoch + 1} ***************')
        net.train()
        h = net.init_hidden(batch_size=BATCH_SIZE)
        running_loss = 0.0
        for batch_idx, (x_train, y_train) in enumerate(tqdm(train_dataloader)):
            if train_on_gpu:
                net.cuda()
                x_train, y_train = x_train.cuda(), y_train.cuda()
            h = h.data  # detach the hidden state from the previous batch's graph
            optimizer.zero_grad()
            y_train_pred, h = net(x_train, h)
            loss = loss_fn(y_train_pred, y_train)
            running_loss += loss.item()
            loss.backward()
            optimizer.step()
        # Average the accumulated loss over all batches of the epoch.
        train_losses[epoch] = running_loss / len(train_dataloader)

        if val_dataloader:
            val_loss, y_val_pred = infer(net, val_dataloader, loss_fn)
            val_losses[epoch] = val_loss
            if is_earlystopping and check_earlystopping(loss=val_losses, epoch=epoch):
                print('EarlyStopping !!!')
                best_epoch = np.argmin(val_losses[:epoch + 1])
                break
            # Lower the lr if needed, based on the validation loss.
            scheduler.step(val_loss)

        if epoch % PRINT_EVERY == 0:
            print(f"Epoch: {epoch + 1}/{NUM_EPOCHS},",
                  f"Train loss: {train_losses[epoch]:.5f},",
                  f"Validation loss: {val_losses[epoch]:.5f}")
            _, _, _ = calculate_model_metrics(y_true=y_train,
                                              y_pred=y_train_pred,
                                              mode='Train-Last Batch')
            if val_dataloader:
                _, _, _ = calculate_model_metrics(y_true=y_val,
                                                  y_pred=y_val_pred,
                                                  mode='Validation')
        if (epoch + 1) % SAVE_EVERY == 0:
            save_pt_model(net=net)

    if best_epoch != NUM_EPOCHS - 1:  # early stopping was triggered
        train_losses = train_losses[:best_epoch + 1]
        val_losses = val_losses[:best_epoch + 1]
    else:
        best_epoch = np.argmin(val_losses)
    print(f'Best Epoch: {best_epoch + 1}; '
          f'Best Validation Loss: {val_losses[best_epoch]:.4f}')
    print(train_losses)
    plot_values_by_epochs(train_values=train_losses,
                          validation_values=val_losses)
    return net
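# A minimal sketch of the `check_earlystopping` helper shared by these
# training loops (the implementation is an assumption inferred from the call
# sites): report True once the validation loss has gone `patience` epochs
# without improving. PATIENCE is a hypothetical constant matching the
# module's UPPER_CASE config style.
import numpy as np

PATIENCE = 5


def check_earlystopping(loss: np.ndarray, epoch: int,
                        patience: int = PATIENCE) -> bool:
    """`loss` holds one validation loss per epoch; entries beyond `epoch` are unused."""
    if epoch + 1 <= patience:
        return False
    recent_best = loss[epoch - patience + 1:epoch + 1].min()
    earlier_best = loss[:epoch - patience + 1].min()
    return recent_best >= earlier_best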
def train(self):
    self.model.train()
    # Reset the logging buffers unless we resumed from a loaded checkpoint.
    if not self.settings.get('loaded', False):
        self.model.training_loss = []
        self.model.training_acc = []
        self.model.validation_acc = []
        self.model.validation_loss = []

    for epoch in range(self.settings['EPOCHS']):
        self.model.train()
        ts = time.time()
        lossSum = 0
        accuracySum = 0
        totalImage = 0
        for it, (X, tar, Y) in enumerate(self.train_loader):
            self.optimizer.zero_grad()
            if 'imagesPerEpoch' in self.settings:
                if it * self.batch_size > self.settings['imagesPerEpoch']:
                    break
            inputs = X.cuda()   # or X.to(computing_device)
            labels = Y.cuda()   # or Y.to(computing_device)
            outputs = self.model(inputs)
            loss = self.criterion(outputs, labels)
            lossSum += loss.item()
            accuracies = pixel_acc(outputs, labels)
            accuracySum += torch.sum(accuracies) / self.batch_size
            torch.cuda.empty_cache()
            loss.backward()
            self.optimizer.step()
            totalImage += 1
            if it % 100 == 0:
                print("Iter", it, "Done")
                # print("epoch{}, iter{}, loss: {}".format(epoch, it, loss.item()))
        lossSum = lossSum / totalImage
        self.model.training_loss.append(lossSum)
        accuracy = accuracySum / totalImage
        self.model.training_acc.append(accuracy.item())
        print(totalImage * self.batch_size)
        print("-------------------------------------")
        print("Train epoch {}, time elapsed {}, loss {}, accuracy: {}".format(
            epoch, time.time() - ts, lossSum, accuracy.item()))
        self.val(epoch)
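# A minimal sketch of the `pixel_acc` helper used above (an assumption from
# the call site): per-image pixel accuracy for a segmentation batch, returned
# as a tensor so that the caller's torch.sum(...) works.
import torch


def pixel_acc(outputs: torch.Tensor, labels: torch.Tensor) -> torch.Tensor:
    preds = outputs.argmax(dim=1)          # (N, H, W) predicted class map
    correct = (preds == labels).float()    # per-pixel hits
    return correct.flatten(1).mean(dim=1)  # per-image accuracy, shape (N,)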
def train(
    self, epoch, max_epoch, writer, print_freq=10, fixbase_epoch=0,
    open_layers=None
):
    losses_t = AverageMeter()
    losses_x = AverageMeter()
    losses_recons = AverageMeter()
    accs = AverageMeter()
    batch_time = AverageMeter()
    data_time = AverageMeter()

    self.model.train()
    open_all_layers(self.model)

    num_batches = len(self.train_loader)
    end = time.time()
    for batch_idx, data in enumerate(self.train_loader):
        data_time.update(time.time() - end)
        imgs, pids = self._parse_data_for_train(data)
        imgs_clean = imgs.clone()
        if self.use_gpu:
            imgs = imgs.cuda()
            imgs_clean = imgs_clean.cuda()
            pids = pids.cuda()

        # Random erasing with a schedule: smaller minimum erased area
        # (sl=0.07) for the first 15 epochs, larger (sl=0.1) afterwards.
        sl = 0.07 if epoch < 15 else 0.1
        randmt = RandomErasing(probability=0.5, sl=sl, sh=0.3)
        labelss = []
        for i, img in enumerate(imgs):
            imgs[i], p = randmt(img)
            labelss.append(p)
        binary_labels = torch.tensor(np.asarray(labelss)).cuda()

        self.optimizer.zero_grad()
        outputs, outputs2, recons, bin_out1, bin_out2, bin_out3 = self.model(imgs)
        loss_mse = self.criterion_mse(recons, imgs_clean)
        loss = self.mgn_loss(outputs, pids)
        occ_loss1 = self.BCE_criterion(bin_out1.squeeze(1), binary_labels.float())
        occ_loss2 = self.BCE_criterion(bin_out2.squeeze(1), binary_labels.float())
        occ_loss3 = self.BCE_criterion(bin_out3.squeeze(1), binary_labels.float())
        loss = loss + .05 * loss_mse + 0.1 * occ_loss1 + 0.1 * occ_loss2 + 0.1 * occ_loss3
        # loss = self.weight_t * loss_t + self.weight_x * loss_x + self.weight_r * loss_mse
        loss.backward()
        self.optimizer.step()

        batch_time.update(time.time() - end)
        # losses_t.update(loss_t.item(), pids.size(0))
        losses_x.update(loss.item(), pids.size(0))
        losses_recons.update(occ_loss1.item(), binary_labels.size(0))
        accs.update(metrics.accuracy(outputs, pids)[0].item())

        if (batch_idx + 1) % print_freq == 0:
            # estimate remaining time
            eta_seconds = batch_time.avg * (
                num_batches - (batch_idx + 1)
                + (max_epoch - (epoch + 1)) * num_batches
            )
            eta_str = str(datetime.timedelta(seconds=int(eta_seconds)))
            print(
                'Epoch: [{0}/{1}][{2}/{3}]\t'
                'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                'Loss_x {loss_x.val:.4f} ({loss_x.avg:.4f})\t'
                'Loss_Occlusion {loss_r.val:.4f} ({loss_r.avg:.4f})\t'
                'Acc {acc.val:.2f} ({acc.avg:.2f})\t'
                'Lr {lr:.6f}\t'
                'eta {eta}'.format(
                    epoch + 1, max_epoch, batch_idx + 1, num_batches,
                    batch_time=batch_time, data_time=data_time,
                    loss_x=losses_x, loss_r=losses_recons, acc=accs,
                    lr=self.optimizer.param_groups[0]['lr'], eta=eta_str
                )
            )

        if writer is not None:
            n_iter = epoch * num_batches + batch_idx
            writer.add_scalar('Train/Time', batch_time.avg, n_iter)
            writer.add_scalar('Train/Data', data_time.avg, n_iter)
            writer.add_scalar('Train/Loss_t', losses_t.avg, n_iter)
            writer.add_scalar('Train/Loss_x', losses_x.avg, n_iter)
            writer.add_scalar('Train/Acc', accs.avg, n_iter)
            writer.add_scalar(
                'Train/Lr', self.optimizer.param_groups[0]['lr'], n_iter
            )
        end = time.time()

    if self.scheduler is not None:
        self.scheduler.step()
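# A minimal sketch of the AverageMeter utility assumed above: the classic
# helper from the PyTorch ImageNet example, tracking the latest value and a
# running (weighted) average.
class AverageMeter:
    def __init__(self):
        self.val = 0.0
        self.sum = 0.0
        self.count = 0
        self.avg = 0.0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count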
def train(net: nn.Module, optimizer: torch.optim.Optimizer,
          train_dataloader: DataLoader = None,
          val_dataloader: DataLoader = None,
          infer_df: np.ndarray = None,
          is_earlystopping: bool = False) -> nn.Module:
    """
    Training loop iterating over the train dataloader and updating the model's weights.
    Runs inference on the validation dataloader, if given, to babysit the learning.
    Activates the cuda device if available.
    :return: Trained model
    """
    NUMBER_OF_PREDS: int = len(train_dataloader.dataset) * NUM_USERS
    train_losses: np.ndarray = np.zeros(NUM_EPOCHS)
    train_accuracy: np.ndarray = np.zeros(NUM_EPOCHS)
    val_losses: np.ndarray = np.zeros(NUM_EPOCHS)
    val_accuracy: np.ndarray = np.zeros(NUM_EPOCHS)
    train_positive_pred: int = 0
    train_positive_number: int = 0
    best_epoch: int = NUM_EPOCHS - 1

    if val_dataloader:
        untrained_val_loss, untrained_val_accuracy = infer(
            net=net, infer_dataloader=val_dataloader, loss_fn=loss_fn,
            infer_df=infer_df)
        print(f'Validation Loss before training: {untrained_val_loss:.5f}')

    for epoch in range(NUM_EPOCHS):
        print(f'*************** Epoch {epoch + 1} ***************')
        train_correct_counter = 0
        loss_running = 0
        net.train()
        for x_train, y_train in tqdm(train_dataloader):
            if train_on_gpu:
                net.cuda()
                x_train, y_train = x_train.cuda(), y_train.cuda()
            optimizer.zero_grad()
            y_train_pred = net(x_train)
            loss = loss_fn(y_train_pred.flatten(), y_train.flatten())
            loss_running += loss.item()
            loss.backward()
            optimizer.step()
            # Threshold at 0.5; staying in torch keeps this working on CUDA tensors.
            train_preds = (y_train_pred > 0.5).long()
            train_correct_counter += (train_preds == y_train).sum()
            train_positive_number += get_number_of_positves(y=y_train)
            train_positive_pred += get_number_of_tp(y_true=y_train,
                                                    y_pred=train_preds)

        train_losses[epoch] = loss_running / len(train_dataloader)
        train_accuracy[epoch] = train_correct_counter.item() / NUMBER_OF_PREDS
        train_recall = train_positive_pred / train_positive_number * 100

        if val_dataloader:
            val_loss, val_acc = infer(net=net, infer_dataloader=val_dataloader,
                                      loss_fn=loss_fn, infer_df=infer_df)
            val_losses[epoch] = val_loss
            val_accuracy[epoch] = val_acc
            if is_earlystopping and check_earlystopping(loss=val_losses, epoch=epoch):
                print('EarlyStopping !!!')
                best_epoch = np.argmin(val_losses[:epoch + 1])
                break

        if epoch % PRINT_EVERY == 0:
            print(f"Epoch: {epoch + 1}/{NUM_EPOCHS},",
                  f"Train loss: {train_losses[epoch]:.5f}, "
                  f"Train Num Correct: {train_correct_counter} / {NUMBER_OF_PREDS}, "
                  f"Train Accuracy: {train_accuracy[epoch]:.3f}, "
                  f"Train Recall: {train_recall:.3f}")
            if val_dataloader:
                print(f"Validation loss: {val_losses[epoch]:.5f}, "
                      f"Validation Accuracy: {val_accuracy[epoch]:.3f}")
        if (epoch + 1) % SAVE_EVERY == 0:
            save_pt_model(net=net)

    if best_epoch != NUM_EPOCHS - 1:  # early stopping was triggered
        train_losses = train_losses[:best_epoch + 1]
        val_losses = val_losses[:best_epoch + 1]
    else:
        best_epoch = np.argmin(val_losses)
    print(f'Best Epoch: {best_epoch + 1}; '
          f'Best Validation Loss: {val_losses[best_epoch]:.4f}')
    if val_dataloader:
        print('val_accuracy', val_accuracy)
        print('val_loss', val_loss)
    print(train_losses)
    plot_values_by_epochs(train_values=train_losses, test_values=val_losses)
    return net
def train(
    model: Model,
    device: Device,
    loader: DataLoader,
    optimizer: Optimizer,
    loss_function: Criterion,
    epoch: int,
    log: Logger,
    writer: Optional[SummaryWriter] = None,
    scheduler: Optional[Scheduler] = None,
) -> Tuple[float, float]:
    """
    Training loop

    :param model: PyTorch model to train
    :param device: torch.device or str, where to perform computations
    :param loader: PyTorch DataLoader over the training dataset
    :param optimizer: PyTorch Optimizer bound to the model
    :param loss_function: criterion
    :param epoch: epoch id
    :param writer: tensorboard SummaryWriter
    :param log: Logger
    :param scheduler: optional PyTorch Scheduler
    :return: tuple(train loss, train accuracy)
    """
    model.train()
    model.to(device)
    meter_loss = Meter("loss")
    meter_corr = Meter("acc")

    batch_size = len(loader.dataset) / len(loader)  # average batch size
    tqdm_loader = tqdm(loader, desc=f"train epoch {epoch:03d}")
    for batch_idx, batch_data in enumerate(tqdm_loader):
        data = batch_data.images.to(device)
        target = batch_data.labels.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = loss_function(output, target)
        loss.backward()
        optimizer.step()
        if scheduler is not None:
            scheduler.step()
        pred = output.argmax(dim=1, keepdim=True)
        # Display training status
        meter_loss.add(loss.item())
        meter_corr.add(pred.eq(target.view_as(pred)).sum().item())
        tqdm_loader.set_postfix({
            "loss": meter_loss.avg,
            "acc": 100 * meter_corr.avg / batch_size,
            # Read the lr from the optimizer so this also works without a scheduler.
            "lr": optimizer.param_groups[0]["lr"],
        })

    # Log to file and tensorboard
    acc = 100.0 * meter_corr.sum / len(loader.dataset)
    log.info("Train Epoch: {}\tAccuracy: {:.0f}%\tLoss: {:.6f}".format(
        epoch, acc, meter_loss.avg))
    if writer is not None:
        writer.add_scalar("train_loss", loss.item(), global_step=epoch)
        writer.add_scalar("train_acc", acc, global_step=epoch)
    return meter_loss.avg, acc
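# A minimal sketch of the Meter used above (an assumption inferred from the
# call sites `add`, `.avg`, and `.sum`): it accumulates values under a name
# and exposes their running sum and mean.
class Meter:
    def __init__(self, name: str):
        self.name = name
        self.sum = 0.0
        self.count = 0

    def add(self, value: float, n: int = 1) -> None:
        self.sum += value * n
        self.count += n

    @property
    def avg(self) -> float:
        return self.sum / max(self.count, 1)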
def forward(self, result, target):
    # size_average/reduce are deprecated; reduction='mean' is the equivalent.
    loss = F.mse_loss(result, target, reduction='mean')
    self.loss = loss.item()  # cache the scalar for logging
    return loss
def fit(model, train_dataset, device, epoch=0, image_index=0, optimizer=None):
    if optimizer is None:
        print('instantiating optimizer')
        # Stochastic gradient descent, initial learning rate 0.001, momentum 0.9
        optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
        print('optimizer instantiated')
    criterion = nn.CrossEntropyLoss()  # error function: cross entropy

    running_loss = 1.0
    images_since_last_save = 0
    # Run training for two epochs (iterating over the 87 thousand images
    # twice), stopping early once the running loss gets small. Note that
    # `epoch` must be incremented at the end of each pass for the loop to end.
    while epoch < 2 and running_loss > 1e-2:
        running_loss = 0.0
        print('epoch', epoch)
        # Set the training flag so that dropout and batch normalization
        # behave as expected during training.
        model.train()
        print('loading new batch')
        batch_start = timer()
        for index, (samples, labels) in enumerate(train_dataset):
            batch_end = timer()
            print('batch loaded. time elapsed: ', batch_end - batch_start)
            if image_index % 1000 == 0:
                print('current image:', image_index)
            # Send the batch of inputs and labels to the GPU.
            samples, labels = samples.to(device), labels.to(device)
            # Zero the gradient; otherwise it accumulates at every iteration and
            # the network would start taking huge parameter jumps as training went on.
            optimizer.zero_grad()
            # Forward pass; crop the output to 800x800 to match the labels.
            output = model(samples)[:, :, :800, :800]
            loss = criterion(output, labels)  # compute the error
            loss.backward()   # compute how much each parameter must be updated
            optimizer.step()  # update each parameter according to the gradient
            running_loss = loss.item()
            print('running loss', running_loss)
            batch_start = timer()
            image_index += samples.size()[0]
            images_since_last_save += samples.size()[0]
            if images_since_last_save > 500:
                print('saving checkpoint at image', image_index)
                save_model(
                    model, epoch, image_index, optimizer,
                    'customfcn_' + str(epoch) + '_' + str(image_index) + '.pickle')
                model = model.to(device)
                images_since_last_save = 0
        epoch += 1
        image_index = 0
    print('finished training')
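# A minimal sketch of the `save_model` helper called above (an assumption
# inferred from its arguments): checkpoint the model and optimizer together
# with the progress counters so training can resume mid-epoch.
import torch


def save_model(model, epoch, image_index, optimizer, path):
    torch.save({
        'epoch': epoch,
        'image_index': image_index,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
    }, path)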
def train(net: nn.Module, optimizer: torch.optim.Optimizer,
          train_dataloader: DataLoader = None,
          val_dataloader: DataLoader = None,
          is_earlystopping: bool = False) -> nn.Module:
    """
    Training loop iterating over the train dataloader and updating the model's weights.
    Runs inference on the validation dataloader, if given, to babysit the learning.
    Activates the cuda device if available.
    :return: Trained model
    """
    train_losses: np.ndarray = np.zeros(NUM_EPOCHS)
    train_accuracy: np.ndarray = np.zeros(NUM_EPOCHS)
    val_losses: np.ndarray = np.zeros(NUM_EPOCHS)
    val_accuracy: np.ndarray = np.zeros(NUM_EPOCHS)
    train_auc: np.ndarray = np.zeros(NUM_EPOCHS)
    val_auc: np.ndarray = np.zeros(NUM_EPOCHS)
    best_epoch: int = NUM_EPOCHS - 1

    if val_dataloader:
        untrained_val_loss, untrained_val_accuracy, untrained_val_auc = infer(
            net, val_dataloader, loss_fn)
        print(f'Validation Loss before training: {untrained_val_loss:.5f}')

    for epoch in range(NUM_EPOCHS):
        print(f'*************** Epoch {epoch + 1} ***************')
        train_correct_counter = 0
        train_auc_accumulated = 0
        loss_running = 0
        net.train()
        for x_train, y_train in tqdm(train_dataloader):
            if x_train.shape[-1] == 224:
                # Binarize the labels: class 3 becomes 0, everything else 1.
                y_train = (y_train != 3).long()
            if train_on_gpu:
                net.cuda()
                x_train, y_train = x_train.cuda(), y_train.cuda()
            optimizer.zero_grad()
            y_train_pred = net(x_train)
            loss = loss_fn(y_train_pred, y_train)
            loss_running += loss.item()
            loss.backward()
            optimizer.step()
            _, train_preds = torch.max(y_train_pred, dim=1)
            train_correct_counter += torch.sum(train_preds == y_train)
            train_auc_accumulated += calculate_auc_score(y_true=y_train,
                                                         y_pred=train_preds)

        train_losses[epoch] = loss_running / len(train_dataloader)
        train_accuracy[epoch] = train_correct_counter.item() / len(
            train_dataloader.dataset)
        train_auc[epoch] = train_auc_accumulated / len(train_dataloader)

        if val_dataloader:
            val_loss, val_acc, val_auc_val = infer(net, val_dataloader, loss_fn)
            val_losses[epoch] = val_loss
            val_accuracy[epoch] = val_acc
            val_auc[epoch] = val_auc_val
            if is_earlystopping and check_earlystopping(loss=val_losses, epoch=epoch):
                print('EarlyStopping !!!')
                best_epoch = np.argmin(val_losses[:epoch + 1])
                break

        if epoch % PRINT_EVERY == 0:
            print(f"Epoch: {epoch + 1}/{NUM_EPOCHS},",
                  f"Train loss: {train_losses[epoch]:.5f}, "
                  f"Train Num Correct: {train_correct_counter} / {len(train_dataloader.dataset)}, "
                  f"Train Accuracy: {train_accuracy[epoch]:.3f}\n",
                  f"Validation loss: {val_losses[epoch]:.5f}, "
                  f"Validation Accuracy: {val_accuracy[epoch]:.3f}",
                  f"Validation AUC: {val_auc[epoch]:.5f}, Train AUC: {train_auc[epoch]:.5f}")
        if (epoch + 1) % SAVE_EVERY == 0:
            save_pt_model(net=net)

    if best_epoch != NUM_EPOCHS - 1:  # early stopping was triggered
        train_losses = train_losses[:best_epoch + 1]
        val_losses = val_losses[:best_epoch + 1]
    else:
        best_epoch = np.argmin(val_losses)
    print(f'Best Epoch: {best_epoch + 1}; '
          f'Best Validation Loss: {val_losses[best_epoch]:.4f}')
    print('val_accuracy', val_accuracy)
    print('val_loss', val_loss)
    print(train_losses)
    plot_values_by_epochs(train_values=train_losses, test_values=val_losses)
    return net
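# A minimal sketch of `calculate_auc_score` (an assumption inferred from the
# call site): wrap sklearn's roc_auc_score and guard against single-class
# batches, for which the AUC is undefined.
import numpy as np
import torch
from sklearn.metrics import roc_auc_score


def calculate_auc_score(y_true: torch.Tensor, y_pred: torch.Tensor) -> float:
    y_true = y_true.detach().cpu().numpy()
    y_pred = y_pred.detach().cpu().numpy()
    if len(np.unique(y_true)) < 2:
        return 0.5  # chance level; AUC is undefined for a single class
    return roc_auc_score(y_true, y_pred)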