Example #1
from typing import List

import numpy as np
import torch
from sklearn.utils import shuffle
from tqdm import tqdm


def train_epoch(
    clf: torch.nn.Module,
    optimizer: torch.optim.Optimizer,
    loss_function: torch.nn.Module,
    words_train: List[List[str]],
    y_train: List[int],
    sequence_limit=32,
    batch_size=32,
    device="cpu",
) -> List[float]:
    clf.train()
    N = len(words_train)
    X, y = shuffle(words_train, y_train)
    epoch_pred = []
    losses = []
    with tqdm(range(0, N, batch_size)) as progress:
        for start in progress:
            clf.train()
            end = min(start + batch_size, N)
            X_batch = [x[:sequence_limit] for x in X[start:end]]
            y_batch = torch.tensor(y[start:end], dtype=torch.long).to(device)
            clf.zero_grad()
            y_scores = clf(X_batch)
            loss = loss_function(y_scores, y_batch)
            loss.backward()
            optimizer.step()

            clf.eval()
            epoch_pred.extend(((y_scores[:, 1] - y_scores[:, 0]) > 0).tolist())
            losses.append(loss.item())
            progress.set_description("Train Loss: {:.03}".format(
                np.mean(losses[-10:])))
    return losses
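A minimal sketch of how train_epoch could be driven end to end. The bag-of-words classifier and the tiny dataset below are placeholders chosen only to make the call signature concrete; they are not part of the original snippet.

# Hypothetical usage of train_epoch above; the classifier and data are stand-ins.
import torch
from typing import List

class BagOfWordsClassifier(torch.nn.Module):
    def __init__(self, vocab: List[str], num_classes: int = 2):
        super().__init__()
        self.index = {word: i for i, word in enumerate(vocab)}
        self.linear = torch.nn.Linear(len(vocab), num_classes)

    def forward(self, batch: List[List[str]]) -> torch.Tensor:
        # Build one bag-of-words count vector per example.
        x = torch.zeros(len(batch), len(self.index))
        for row, words in enumerate(batch):
            for word in words:
                if word in self.index:
                    x[row, self.index[word]] += 1.0
        return self.linear(x)

clf = BagOfWordsClassifier(vocab=["good", "great", "bad", "awful"])
optimizer = torch.optim.Adam(clf.parameters(), lr=1e-3)
loss_function = torch.nn.CrossEntropyLoss()
words_train = [["good", "great"], ["bad", "awful"], ["great"], ["awful", "bad"]]
y_train = [1, 0, 1, 0]
losses = train_epoch(clf, optimizer, loss_function, words_train, y_train, batch_size=2)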
def train():
    for epoch in range(epochs):
        ts = time.time()
        print(epoch)
        for iter, (X, tar, Y) in enumerate(train_loader):
            optimizer.zero_grad()

            # inputs = X.to(computing_device)
            inputs = X.cuda()
            labels = Y.cuda()
            # labels = Y.to(computing_device)

            print("Getting outputs")
            outputs = resnet_model(inputs)
            loss = criterion(outputs, labels)

            loss.backward()
            optimizer.step()

            # Early-exit condition used for quick testing
            if iter > 5:
                break
            if iter % 10 == 0:
                print("epoch{}, iter{}, loss: {}".format(
                    epoch, iter, loss.item()))

        print("Finish epoch {}, time elapsed {}".format(
            epoch,
            time.time() - ts))
        #torch.save(resnet_model, 'best_model')

        #val(epoch)
        resnet_model.train()
Example #3
from typing import Tuple

import torch
import torch.nn as nn


def test(net: nn.Module, loss_fn: nn.Module, x_test: torch.Tensor, y_test: torch.Tensor) -> Tuple[float, torch.Tensor]:
    """
    Run the model on x_test and calculate the loss of the predictions.
    The model runs in evaluation mode and without building a computational graph (no_grad).
    """
    net.eval()
    with torch.no_grad():
        y_test_pred = net(x_test.float())
        loss = loss_fn(input=y_test_pred.reshape(-1), target=y_test.float())
        test_loss = loss.item()
    return test_loss, y_test_pred
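A small usage sketch for the test helper above, assuming a toy regression model and random tensors; these stand-ins are not from the original code.

# Hypothetical usage of test(); the regression model and random data are stand-ins.
import torch
import torch.nn as nn

net = nn.Sequential(nn.Linear(10, 32), nn.ReLU(), nn.Linear(32, 1))
loss_fn = nn.MSELoss()
x_test = torch.randn(64, 10)
y_test = torch.randn(64)

test_loss, y_test_pred = test(net, loss_fn, x_test, y_test)
print(f"test loss: {test_loss:.4f}")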
def train_val(loader=None,
              model=None,
              loss_function=None,
              optimizer=None,
              train_enable=None,
              device=None,
              model_classifier=None,
              model_id=None):
    sum_loss = 0.0
    sum_mse = 0.0
    sum_mae = 0.0
    sum_psnr = 0.0
    sum_ssim = 0.0

    if train_enable == 'True':
        model = model.train()
    else:
        model = model.eval()  # eval() disables Dropout and freezes BatchNorm statistics

    for img_NAC, img_AC, _ in loader:

        img_NAC = img_NAC.float()
        img_NAC = img_NAC.to(device)
        img_AC = img_AC.float()
        img_AC = img_AC.to(device)

        cam = get_grad_cam(model_classifier, img_NAC)
        pred = process_cam(cam, model_id, model, img_NAC, device)

        loss = loss_function(pred, img_AC)  # Loss is just MSE
        mse, mae, psnr, ssim = matrics(img_AC, pred)

        if train_enable == 'True':
            optimizer.zero_grad()
            loss.backward()  # back propagation
            optimizer.step()

        sum_loss += float(loss.item())
        sum_mse += float(mse.item())
        sum_mae += float(mae.item())
        sum_psnr += float(psnr)
        sum_ssim += float(ssim)

    epoch_loss = sum_loss / len(loader)
    epoch_mse = sum_mse / len(loader)
    epoch_mae = sum_mae / len(loader)
    epoch_psnr = sum_psnr / len(loader)
    epoch_ssim = sum_ssim / len(loader)

    return epoch_loss, epoch_mse, epoch_mae, epoch_psnr, epoch_ssim
Example #5
def train(args):
    transformer=T.Compose([
    T.ToTensor(),
    T.Normalize((0.1307,),(0.3081,))  # MNIST mean and std
    ])
    train_data=torchvision.datasets.MNIST(root=args.data_path,transform=transformer,download=True,train=True)
    train_loader=torch.utils.data.DataLoader(train_data,batch_size=args.batch_size,shuffle=True,drop_last=True,num_workers=4)

    model_arg=model_dict[args.model][1]
    model_arg["act"]=(act_dict[args.act])
    device=torch.device(args.device)
    net=model_dict[args.model][0](**model_arg).to(device)

    if args.optimizer=='adam':
        optimizer=torch.optim.Adam(net.parameters(),lr=args.lr,betas=(0.9,0.99))
    elif args.optimizer=='SGD':
        optimizer=torch.optim.SGD(net.parameters(),lr=args.lr,momentum=0.9)
    else:
        raise ValueError('unsupported optimizer: %s' % args.optimizer)
    loss_func=loss_dict[args.loss_func]()
    
    writer=tensorboardX.SummaryWriter()

    current_acc=0
    for epoch in range(args.epoch):
        total_loss=0.
        total_acc=0.
        for i,(images,labels) in enumerate(train_loader):
            images,labels=images.to(device),labels.to(device)
            outputs=net(images)
            loss=loss_func(outputs,labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss+=loss.item()
            acc=torch.sum(outputs.argmax(-1)==labels).item()
            total_acc+=acc/args.batch_size

            writer.add_scalar('data/loss',loss,i+epoch*len(train_loader))
        print("epoch%3d: loss=%.4f ,acc:%.2f%% " %(epoch,total_loss/len(train_loader),total_acc*100/len(train_loader)))
        if(epoch%1==0):
            eval_acc=eval(args,net)
            writer.add_scalar('data/acc',eval_acc,epoch)
            if eval_acc>current_acc:
                torch.save(net.state_dict(),'%s/best_%s_model.pth' %(args.checkpoints_path,args.model))
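train(args) above depends on external registries (model_dict, act_dict, loss_dict), an eval helper, and a populated argparse namespace. The following sketch shows one plausible setup; SimpleMLP, the registry contents, and the argument defaults are assumptions made for illustration, not the original project's definitions.

# Hypothetical setup for train(args); SimpleMLP, the registries and the defaults
# below are illustrative assumptions.
import argparse
import torch
import torch.nn as nn

class SimpleMLP(nn.Module):
    def __init__(self, hidden=128, act=nn.ReLU):
        super().__init__()
        self.net = nn.Sequential(
            nn.Flatten(),
            nn.Linear(28 * 28, hidden), act(),
            nn.Linear(hidden, 10),
        )

    def forward(self, x):
        return self.net(x)

# name -> (model class, constructor kwargs); train() injects the activation as model_arg["act"]
model_dict = {"mlp": (SimpleMLP, {"hidden": 128})}
act_dict = {"relu": nn.ReLU, "tanh": nn.Tanh}
loss_dict = {"ce": nn.CrossEntropyLoss}

parser = argparse.ArgumentParser()
parser.add_argument("--data_path", default="./data")
parser.add_argument("--batch_size", type=int, default=64)
parser.add_argument("--model", default="mlp")
parser.add_argument("--act", default="relu")
parser.add_argument("--device", default="cuda" if torch.cuda.is_available() else "cpu")
parser.add_argument("--optimizer", default="adam")
parser.add_argument("--lr", type=float, default=1e-3)
parser.add_argument("--loss_func", default="ce")
parser.add_argument("--epoch", type=int, default=10)
parser.add_argument("--checkpoints_path", default="./checkpoints")
args = parser.parse_args()
# train(args) also calls an eval(args, net) helper for validation accuracy,
# which is assumed to exist elsewhere in the project.
train(args)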
Example #6
    def trainloop(self, n_epochs):
        for epoch in range(1, n_epochs + 1):
            self.evaluate(mask_data.data, mask_data.label)
            loss_train = 0.0
            for input, realout in self.dataloader:
                predictout = self.network(input)

                loss = self.loss_fn(predictout, realout)

                self.optim.zero_grad()

                loss.backward()
                self.optim.step()
                loss_train += loss.item()
            #if epoch == 1 or epoch % 100 == 0:
            print(
                f'{datetime.datetime.now()} epoch {epoch} training loss {loss_train/len(self.dataloader)}'
            )
Example #7
    def validate(self):
        """ Validation cycle. Performed over a custom dataset. """
        print("Validation")

        self._net.eval()

        val_gen = self._dispatcher.val_gen()
        relative_error_list = []

        for sample_idx, sample in enumerate(val_gen):
            if self.use_gpu:
                sample.cuda()
            sample.batchify()

            pred = self._net.forward(sample.image_tensor)
            loss, details = self._net.loss(pred, sample.segmentation_tensor)

            pred_area = self._dispatcher.decode_prediction(pred)

            anno_hw = sample.anno_hw
            gt_area = anno_hw[0] * anno_hw[1]
            relative_error = abs(pred_area - gt_area) / gt_area
            relative_error_list.append(relative_error)

            if sample_idx % 20 == 0:
                print("loss={:.4f} gt_area={} pred_area={}".format(
                    loss.item(), gt_area, pred_area))
                self._render_prediction(
                    pred.detach().cpu().numpy()[0], None,
                    sample.image_tensor.detach().cpu().numpy()[0].transpose(
                        (1, 2, 0)))

        average_relative_error = \
            np.array(relative_error_list).sum() / len(relative_error_list)
        print("-------- Final metric -----------")
        print("Average relative area error = {:0.6f}".format(
            average_relative_error))

        pass
Example #8
    def train(self):
        """ Perform training of the network. """

        num_epochs = 50
        batch_size = 16
        batches_per_epoch = 1024
        learning_rate = 0.02

        optimizer = torch.optim.SGD(self._net.parameters(), lr=learning_rate)
        scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, [40, 45],
                                                         gamma=0.1,
                                                         last_epoch=-1)

        training_start_time = time.time()

        self.validate()

        for epoch in range(num_epochs):
            print("Epoch ------ ", epoch)

            train_gen = self._dispatcher.train_gen(batches_per_epoch,
                                                   batch_size)

            self._net.train()

            for batch_index, batch in enumerate(train_gen):
                if self.use_gpu:
                    batch.cuda()

                pred = self._net.forward(batch.image_tensor)

                loss, details = self._net.loss(pred, batch.segmentation_tensor)

                if batch_index % 50 == 0:
                    print("epoch={} batch={} loss={:.4f}".format(
                        epoch, batch_index, loss.item()))
                    self._render_prediction(
                        pred.detach().cpu().numpy()[0],
                        batch.segmentation_tensor.detach().cpu().numpy()[0],
                        batch.image_tensor.detach().cpu().numpy()[0].transpose(
                            (1, 2, 0)))
                    print("-------------------------------")

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                pass

            scheduler.step()

            # Save after every epoch
            torch.save(self._net.state_dict(), self._snapshot_name)

            # Validate every epoch
            self.validate()

            pass
            # end of epoch

        training_end_time = time.time()
        print("Training took {} hours".format(
            (training_end_time - training_start_time) / 3600))

        print("Train finished!")
Example #9
    def _train(
        self,
        train_data,
        epoch,
        val_data=None,
        val_step=None,
        ckpt_step=None,
    ):
        """helper method, called by the fit method on each epoch.
        Iterates once through train_data, using it to update model parameters.
        Override this method if you need to implement your own training method.

        Parameters
        ----------
        train_data : torch.util.Dataloader
            instance that will be iterated over.
        """
        self.network.train()

        progress_bar = tqdm(train_data)
        for ind, batch in enumerate(progress_bar):
            x, y = batch[0].to(self.device), batch[1].to(self.device)
            y_pred = self.network.forward(x)
            self.optimizer.zero_grad()
            loss = self.loss(y_pred, y)
            loss.backward()
            self.optimizer.step()
            progress_bar.set_description(
                f'Epoch {epoch}, batch {ind}. Loss: {loss.item():.4f}. Global step: {self.global_step}'
            )

            if self.summary_writer is not None:
                self.summary_writer.add_scalar('loss/train', loss.item(),
                                               self.global_step)
            self.global_step += 1

            if val_data is not None:
                if self.global_step % val_step == 0:
                    log_or_print(
                        f'Step {self.global_step} is a validation step; computing metrics on validation set',
                        logger=self.logger,
                        level='info')
                    metric_vals = self._eval(val_data)
                    self.network.train()  # because _eval calls network.eval()
                    log_or_print(msg=', '.join([
                        f'{metric_name}: {metric_value:.4f}'
                        for metric_name, metric_value in metric_vals.items()
                        if metric_name.startswith('avg_')
                    ]),
                                 logger=self.logger,
                                 level='info')

                    if self.summary_writer is not None:
                        for metric_name, metric_value in metric_vals.items():
                            if metric_name.startswith('avg_'):
                                self.summary_writer.add_scalar(
                                    f'{metric_name}/val', metric_value,
                                    self.global_step)

                    current_val_acc = metric_vals['avg_acc']
                    if current_val_acc > self.max_val_acc:
                        self.max_val_acc = current_val_acc
                        log_or_print(
                            msg=
                            f'Accuracy on validation set improved. Saving max-val-acc checkpoint.',
                            logger=self.logger,
                            level='info')
                        self.save(self.max_val_acc_ckpt_path,
                                  epoch=epoch,
                                  global_step=self.global_step)
                        if self.patience:
                            self.patience_counter = 0
                    else:  # if accuracy did not improve
                        if self.patience:
                            self.patience_counter += 1
                            if self.patience_counter > self.patience:
                                log_or_print(
                                    'Stopping training early, '
                                    f'accuracy has not improved in {self.patience} validation steps.',
                                    logger=self.logger,
                                    level='info')
                                # save "backup" checkpoint upon stopping; don't save over "max-val-acc" checkpoint
                                self.save(self.ckpt_path,
                                          epoch=epoch,
                                          global_step=self.global_step)
                                progress_bar.close()
                                break
                            else:
                                log_or_print(
                                    f'Accuracy has not improved in {self.patience_counter} validation steps. '
                                    f'Not saving max-val-acc checkpoint for this validation step.',
                                    logger=self.logger,
                                    level='info')
                        else:  # patience is None. We still log that we are not saving checkpoint.
                            log_or_print(
                                'Accuracy is less than maximum validation accuracy so far. '
                                'Not saving max-val-acc checkpoint.',
                                logger=self.logger,
                                level='info')

            # a checkpoint step can occur regardless of whether val_data was given
            # or whether the current step was a validation step
            if ckpt_step and self.global_step % ckpt_step == 0:
                log_or_print(f'Step {self.global_step} is a checkpoint step.',
                             logger=self.logger,
                             level='info')
                self.save(self.ckpt_path,
                          epoch=epoch,
                          global_step=self.global_step)
def train(net: nn.Module,
          train_dataloader: DataLoader = None,
          val_dataloader: DataLoader = None,
          test_dataloader: DataLoader = None,
          is_earlystopping: bool = True) -> nn.Module:
    """
    Training loop iterating on the train dataloader and updating the model's weights.
    Inferring the validation dataloader & test dataloader, if given, to babysit the learning
    Activating cuda device if available.
    :return: Trained model
    """
    train_losses: np.array = np.zeros(NUM_EPOCHS)
    val_losses: np.array = np.zeros(NUM_EPOCHS)
    best_epoch: int = NUM_EPOCHS - 1

    if test_dataloader:
        untrained_test_loss, untrained_y_test_pred = infer(
            net, test_dataloader, loss_fn)
        _, _ = get_num_of_areas_and_targets_from_arary(array=y_test)
        print(f'Test Loss before training: {untrained_test_loss:.3f}')
        _, _, _ = calculate_model_metrics(y_true=y_test,
                                          y_pred=untrained_y_test_pred,
                                          verbose=True)

    for epoch in range(NUM_EPOCHS):
        print(f'*************** Epoch {epoch + 1} ***************')
        net.train()
        loss_running = 0.0
        h = net.init_hidden(batch_size=BATCH_SIZE)
        for batch_idx, (x_train, y_train) in enumerate(tqdm(train_dataloader)):
            if train_on_gpu:
                net.cuda()
                x_train, y_train = x_train.cuda(), y_train.cuda()
            h = h.data
            optimizer.zero_grad()
            y_train_pred, h = net(x_train, h)
            loss = loss_fn(y_train_pred, y_train)
            loss.backward()
            optimizer.step()
            loss_running += loss.item()

        if val_dataloader:
            val_loss, y_val_pred = infer(net, val_dataloader, loss_fn)
            val_losses[epoch] = val_loss

        if is_earlystopping and check_earlystopping(loss=val_losses,
                                                    epoch=epoch):
            print('EarlyStopping !!!')
            best_epoch = np.argmin(val_losses[:epoch + 1])
            break
        train_losses[epoch] = loss_running / len(train_dataloader)
        scheduler.step(
            val_loss)  # Change the lr if needed based on the validation loss

        if epoch % PRINT_EVERY == 0:
            print(f"Epoch: {epoch + 1}/{NUM_EPOCHS},",
                  f"Train loss: {train_losses[epoch]:.5f},",
                  f"Validation loss: {val_losses[epoch]:.5f}")

            _, _, _ = calculate_model_metrics(y_true=y_train,
                                              y_pred=y_train_pred,
                                              mode='Train-Last Batch')
            if val_dataloader:
                _, _, _ = calculate_model_metrics(y_true=y_val,
                                                  y_pred=y_val_pred,
                                                  mode='Validation')

        if (epoch + 1) % SAVE_EVERY == 0:
            save_pt_model(net=net)

    if best_epoch != NUM_EPOCHS - 1:  # early stopping was triggered
        train_losses = train_losses[:best_epoch + 1]
        val_losses = val_losses[:best_epoch + 1]
    else:
        best_epoch = np.argmin(val_losses)

    print(
        f'Best Epoch: {best_epoch + 1}; Best Validation Loss: {val_losses[best_epoch]:.4f}'
    )
    print(train_losses)
    plot_values_by_epochs(train_values=train_losses,
                          validation_values=val_losses)
    return net
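Several of these examples call a check_earlystopping helper that is never shown. A minimal sketch of one plausible implementation follows, assuming a fixed patience window over the recorded validation losses; the patience value and the exact criterion are assumptions.

# Minimal sketch of the check_earlystopping helper used above; not the original implementation.
import numpy as np

EARLY_STOPPING_PATIENCE = 5  # assumed number of epochs without improvement

def check_earlystopping(loss: np.ndarray, epoch: int,
                        patience: int = EARLY_STOPPING_PATIENCE) -> bool:
    """Return True when the validation loss has not improved for `patience` epochs."""
    if epoch < patience:
        return False
    best_before = loss[:epoch - patience + 1].min()
    recent_best = loss[epoch - patience + 1:epoch + 1].min()
    return bool(recent_best >= best_before)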
Example #11
    def train(self):
        self.model.train()

        # log data to these variables (unless a previously saved run was loaded)
        if not self.settings.get('loaded', False):
            self.model.training_loss = []
            self.model.training_acc = []
            self.model.validation_acc = []
            self.model.validation_loss = []

        for epoch in range(self.settings['EPOCHS']):
            self.model.train()
            ts = time.time()
            lossSum = 0
            accuracySum = 0
            totalImage = 0
            for iter, (X, tar, Y) in enumerate(self.train_loader):
                self.optimizer.zero_grad()

                if 'imagesPerEpoch' in self.settings:
                    if iter * self.batch_size > self.settings['imagesPerEpoch']:
                        break

                # inputs = X.to(computing_device)
                inputs = X.cuda()
                labels = Y.cuda()
                # labels = Y.to(computing_device)

                outputs = self.model(inputs)

                loss = self.criterion(outputs, labels)

                lossSum += loss.item()

                accuracies = pixel_acc(outputs, labels)

                accuracySum += torch.sum(accuracies) / self.batch_size

                torch.cuda.empty_cache()

                loss.backward()
                self.optimizer.step()

                totalImage += 1

                if iter % 100 == 0:
                    print("Iter", iter, "Done")
                    # print("epoch{}, iter{}, loss: {}".format(epoch, iter, loss.item()))
            lossSum = lossSum / totalImage

            self.model.training_loss.append(lossSum)
            accuracy = accuracySum / totalImage # totalImage?
            if accuracy is None:
                accuracy = torch.tensor([0.0])
            self.model.training_acc.append(accuracy.item())
            print(totalImage*self.batch_size)
            print("-------------------------------------")
            print("Train epoch {}, time elapsed {}, loss {}, accuracy: {}".format(epoch, time.time() - ts, lossSum, accuracy.item()))


            self.val(epoch)
    def train(
            self,
            epoch,
            max_epoch,
            writer,
            print_freq=10,
            fixbase_epoch=0,
            open_layers=None
    ):
        losses_t = AverageMeter()
        losses_x = AverageMeter()
        losses_recons = AverageMeter()
        accs = AverageMeter()
        batch_time = AverageMeter()
        data_time = AverageMeter()

        self.model.train()
       
        open_all_layers(self.model)

        num_batches = len(self.train_loader)
        end = time.time()
        for batch_idx, data in enumerate(self.train_loader):
            data_time.update(time.time() - end)

            imgs, pids = self._parse_data_for_train(data)
            imgs_clean=imgs.clone()
            if self.use_gpu:
                imgs = imgs.cuda()
                imgs_clean = imgs_clean.cuda()
                pids = pids.cuda()
            labelss = []
            # Random erasing; the minimum erased area (sl) grows after epoch 15.
            randmt = RandomErasing(probability=0.5, sl=0.07 if epoch < 15 else 0.1, sh=0.3)
            for i, img in enumerate(imgs):
                imgs[i], p = randmt(img)
                labelss.append(p)

            binary_labels = torch.tensor(np.asarray(labelss)).cuda()
            self.optimizer.zero_grad()
            
            outputs, outputs2, recons,bin_out1,bin_out2, bin_out3 = self.model(imgs )
            loss_mse = self.criterion_mse(recons, imgs_clean)
            loss = self.mgn_loss(outputs, pids)
            
            occ_loss1 = self.BCE_criterion(bin_out1.squeeze(1),binary_labels.float() )
            occ_loss2 = self.BCE_criterion(bin_out2.squeeze(1),binary_labels.float() )
            occ_loss3 = self.BCE_criterion(bin_out3.squeeze(1),binary_labels.float() )


            loss = loss + .05*loss_mse + 0.1*occ_loss1 + 0.1*occ_loss2+0.1*occ_loss3
            #loss = self.weight_t * loss_t + self.weight_x * loss_x #+ #self.weight_r*loss_mse
            loss.backward()
            self.optimizer.step()

            batch_time.update(time.time() - end)

            #losses_t.update(loss_t.item(), pids.size(0))
            losses_x.update(loss.item(), pids.size(0))
            losses_recons.update(occ_loss1.item(), binary_labels.size(0))
            accs.update(metrics.accuracy(outputs, pids)[0].item())

            if (batch_idx + 1) % print_freq == 0:
                # estimate remaining time
                eta_seconds = batch_time.avg * (
                        num_batches - (batch_idx + 1) + (max_epoch -
                                                         (epoch + 1)) * num_batches
                )
                eta_str = str(datetime.timedelta(seconds=int(eta_seconds)))
                print(
                    'Epoch: [{0}/{1}][{2}/{3}]\t'
                    'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                    'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                    #'Loss_t {loss_t.val:.4f} ({loss_t.avg:.4f})\t'
                    'Loss_x {loss_x.val:.4f} ({loss_x.avg:.4f})\t'
                    'Loss_Occlusion {loss_r.val:.4f} ({loss_r.avg:.4f})\t'             
                    'Acc {acc.val:.2f} ({acc.avg:.2f})\t'
                    'Lr {lr:.6f}\t'
                    'eta {eta}'.format(
                        epoch + 1,
                        max_epoch,
                        batch_idx + 1,
                        num_batches,
                        batch_time=batch_time,
                        data_time=data_time,
                        #loss_t=losses_t,
                        loss_x=losses_x,
                        loss_r = losses_recons,
                        acc=accs,
                        lr=self.optimizer.param_groups[0]['lr'],
                        eta=eta_str
                    )
                )
            writer = None  # overrides the argument and disables the TensorBoard block below
            if writer is not None:
                n_iter = epoch * num_batches + batch_idx
                writer.add_scalar('Train/Time', batch_time.avg, n_iter)
                writer.add_scalar('Train/Data', data_time.avg, n_iter)
                writer.add_scalar('Train/Loss_t', losses_t.avg, n_iter)
                writer.add_scalar('Train/Loss_x', losses_x.avg, n_iter)
                writer.add_scalar('Train/Acc', accs.avg, n_iter)
                writer.add_scalar(
                    'Train/Lr', self.optimizer.param_groups[0]['lr'], n_iter
                )

            end = time.time()

        if self.scheduler is not None:
            self.scheduler.step()
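The engine above tracks its statistics with AverageMeter objects that are not defined in the snippet. A minimal sketch matching the calls used here (update(val, n), .val, .avg) follows; the project's own version may carry more state.

# Minimal sketch of the AverageMeter helper assumed above.
class AverageMeter:
    """Tracks the most recent value and a running average."""

    def __init__(self):
        self.val = 0.0
        self.sum = 0.0
        self.count = 0
        self.avg = 0.0

    def update(self, val, n=1):
        # n is the number of samples the value was averaged over (e.g. batch size)
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count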
Example #13
def train(net: nn.Module, optimizer: torch.optim, train_dataloader: DataLoader = None,
          val_dataloader: DataLoader = None, infer_df: np.array = None, is_earlystopping: bool = False) -> nn.Module:
    """
    Training loop iterating on the train dataloader and updating the model's weights.
    Inferring the validation dataloader & test dataloader, if given, to babysit the learning
    Activating cuda device if available.
    :return: Trained model
    """
    NUMBER_OF_PREDS: int = len(train_dataloader.dataset) * NUM_USERS
    train_losses: np.array = np.zeros(NUM_EPOCHS)
    train_accuracy: np.array = np.zeros(NUM_EPOCHS)
    val_losses: np.array = np.zeros(NUM_EPOCHS)
    val_accuracy: np.array = np.zeros(NUM_EPOCHS)
    train_positive_pred: int = 0
    train_positive_number: int = 0
    best_epoch: int = NUM_EPOCHS - 1

    if val_dataloader:
        untrained_val_loss, untrained_val_accuracy = infer(net=net, infer_dataloader=val_dataloader, loss_fn=loss_fn,
                                                           infer_df=infer_df)
        print(f'Validation Loss before training: {untrained_val_loss:.5f}')

    for epoch in range(NUM_EPOCHS):
        print(f'*************** Epoch {epoch + 1} ***************')
        train_correct_counter = 0
        loss_running = 0

        net.train()
        for x_train, y_train in tqdm(train_dataloader):
            if train_on_gpu:
                net.cuda()
                x_train, y_train = x_train.cuda(), y_train.cuda()
            optimizer.zero_grad()
            y_train_pred = net(x_train)

            loss = loss_fn(y_train_pred.flatten(), y_train.flatten())
            loss_running += loss.item()
            loss.backward()
            optimizer.step()
            # detach and move to CPU before the numpy-based bookkeeping below
            train_preds = np.where(y_train_pred.detach().cpu().numpy() > 0.5, 1, 0)
            train_correct_counter += (train_preds == y_train.cpu().numpy()).sum()
            train_positive_number += get_number_of_positves(y=y_train)
            train_positive_pred += get_number_of_tp(y_true=y_train, y_pred=train_preds)

        train_losses[epoch] = loss_running / len(train_dataloader)
        train_accuracy[epoch] = train_correct_counter.item() / NUMBER_OF_PREDS
        train_recall = train_positive_pred / train_positive_number * 100

        if val_dataloader:
            val_loss, val_acc = infer(net=net, infer_dataloader=val_dataloader, loss_fn=loss_fn, infer_df=infer_df)
            val_losses[epoch] = val_loss
            val_accuracy[epoch] = val_acc

        if is_earlystopping and val_dataloader and check_earlystopping(loss=val_losses, epoch=epoch):
            print('EarlyStopping !!!')
            best_epoch = np.argmin(val_losses[:epoch + 1])
            break
        if epoch % PRINT_EVERY == 0:
            print(f"Epoch: {epoch + 1}/{NUM_EPOCHS},",
                  f"Train loss: {train_losses[epoch]:.5f}, Train Num Correct: {train_correct_counter} "
                  f"/ {NUMBER_OF_PREDS}, Train Accuracy: {train_accuracy[epoch]:.3f}, Train Recall: {train_recall:.3f}")

            if val_dataloader:
                print(f"Validation loss: {val_losses[epoch]:.5f}, Validation Accuracy: {val_accuracy[epoch]:.3f}")

        if (epoch + 1) % SAVE_EVERY == 0:
            save_pt_model(net=net)

    if best_epoch != NUM_EPOCHS - 1:  # early stopping was triggered
        train_losses = train_losses[:best_epoch + 1]
        val_losses = val_losses[:best_epoch + 1]
    else:
        best_epoch = np.argmin(val_losses)

    print(
        f'Best Epoch: {best_epoch + 1}; Best Validation Loss: {val_losses[best_epoch]:.4f}')
    if val_dataloader:
        print('val_accuracy', val_accuracy)
        print('val_loss', val_loss)
    print(train_losses)
    plot_values_by_epochs(train_values=train_losses, test_values=val_losses)
    return net
Example #14
def train(
    model: Model,
    device: Device,
    loader: DataLoader,
    optimizer: Optimizer,
    loss_function: Criterion,
    epoch: int,
    log: Logger,
    writer: Optional[SummaryWriter] = None,
    scheduler: Optional[Scheduler] = None,
) -> Tuple[float, float]:
    """
    Training loop
    :param model: PyTorch model to test
    :param device: torch.device or str, where to perform computations
    :param loader: PyTorch DataLoader over test dataset
    :param optimizer: PyTorch Optimizer bounded with model
    :param loss_function: criterion
    :param epoch: epoch id
    :param writer: tensorboard SummaryWriter
    :param log: Logger
    :param scheduler: optional PyTorch Scheduler
    :return: tuple(train loss, train accuracy)
    """
    model.train()
    model.to(device)

    meter_loss = Meter("loss")
    meter_corr = Meter("acc")

    batch_size = len(loader.dataset) / len(loader)
    tqdm_loader = tqdm(loader, desc=f"train epoch {epoch:03d}")
    for batch_idx, batch_data in enumerate(tqdm_loader):
        data, target = batch_data.images.to(device), batch_data.labels.to(
            device)
        optimizer.zero_grad()

        output = model(data)

        loss = loss_function(output, target)
        loss.backward()
        optimizer.step()
        if scheduler is not None:
            scheduler.step()

        pred = output.argmax(dim=1, keepdim=True)
        # Display training status
        meter_loss.add(loss.item())
        meter_corr.add(pred.eq(target.view_as(pred)).sum().item())
        tqdm_loader.set_postfix({
            "loss": meter_loss.avg,
            "acc": 100 * meter_corr.avg / batch_size,
            "lr": scheduler.get_lr(),
        })

    # Log in file and tensorboard
    acc = 100.0 * meter_corr.sum / len(loader.dataset)
    log.info("Train Epoch: {} [ ({:.0f}%)]\tLoss: {:.6f}".format(
        epoch, acc, meter_loss.avg))
    if writer is not None:
        writer.add_scalar("train_loss", loss.item(), global_step=epoch)
        writer.add_scalar("train_acc", acc, global_step=epoch)

    return meter_loss.avg, acc
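The train function above relies on a Meter utility that is not part of the snippet. A minimal sketch consistent with the calls made here (add(value), .avg, .sum) follows; the original project's Meter may differ.

# Minimal sketch of the Meter helper assumed by the example above.
class Meter:
    """Accumulates values and exposes their running sum and average."""

    def __init__(self, name: str):
        self.name = name
        self.sum = 0.0
        self.count = 0

    def add(self, value: float, n: int = 1) -> None:
        self.sum += value * n
        self.count += n

    @property
    def avg(self) -> float:
        return self.sum / self.count if self.count else 0.0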
Example #15
    def forward(self, result, target):
        loss = F.mse_loss(result, target, reduction='mean')
        self.loss = loss.item()
        return loss
def fit(model, train_dataset, device, epoch=0, image_index=0, optimizer=None):
    if (optimizer == None):
        print('instantiating optimizer')
        optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
        print('optimizer instantiated')

    criterion = nn.CrossEntropyLoss()  # Error function: cross entropy
    # Stochastic gradient descent
    # Initial learning rate: 0.001
    # momentum: 0.9

    running_loss = 1.0
    images_since_last_save = 0

    #Runs training for two epochs (iterates over the 87 thousand images twice)
    while epoch < 2 or running_loss < 10e-3:
        running_loss = 0.0
        print('epoch', epoch)
        # Set training mode so that dropout and batch normalization behave as expected
        model.train()

        print('loading new batch')
        batch_start = timer()

        for index, (samples, labels) in enumerate(train_dataset):
            batch_end = timer()
            print('batch loaded. time elapsed: ', batch_end - batch_start)
            if (image_index % 1000 == 0):
                print('current image:', image_index)
            # the variable data contains an entire batch of inputs and their associated labels

            #samples, labels = data
            #print('sending data to device')
            device_start = timer()
            samples, labels = samples.to(device), labels.to(
                device)  # Sends the data to the GPU
            device_end = timer()
            #print('data sent. elapsed time', device_end-device_start)

            #print("zeroing grad")
            optimizer.zero_grad(
            )  # Zeroes the gradient, otherwise it will accumulate at every iteration
            # the result would be that the network would start taking huge parameter jumps as training went on
            #print('grad zeroed')

            #print('inferring...')
            infer_start = timer()
            output = model(samples)[:, :, :800, :
                                    800]  # Forward passes the input data
            infer_end = timer()
            #print('inferred')
            #print('time elapsed during inference:', infer_end - infer_start)

            #print('computing loss')
            loss_start = timer()
            loss = criterion(output, labels)  # Computes the error
            loss.backward(
            )  # Computes the gradient, yielding how much each parameter must be updated
            loss_end = timer()

            #print('updating weights')
            weights_start = timer()
            optimizer.step(
            )  # Updates each parameter according to the gradient
            weights_end = timer()
            #print('weights updated. time elapsed: ', weights_end-weights_start)

            running_loss = loss.item()
            print('running loss', running_loss)
            '''if index % 10 == 9:
                print('[%d %5d] loss %.3f' % (epoch + 1, index + 1, running_loss / 2000))
                running_loss = 0.0'''
            #print('loading new batch')
            batch_start = timer()

            image_index += samples.size()[0]

            images_since_last_save += samples.size()[0]
            if (images_since_last_save > 500):
                print('saving checkpoint at image', image_index)
                save_model(
                    model, epoch, image_index, optimizer, 'customfcn_' +
                    str(epoch) + '_' + str(image_index) + '.pickle')
                model = model.to(device)
                images_since_last_save = 0

        image_index = 0
        epoch += 1  # advance the epoch counter so the two-epoch loop can terminate
    print('finished training')
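fit() above checkpoints through a save_model helper that is not shown. A minimal sketch of what such a helper might look like follows; the checkpoint layout is an assumption, not the original project's format.

# Hypothetical save_model helper matching the call in fit().
import torch

def save_model(model, epoch, image_index, optimizer, path):
    torch.save({
        "epoch": epoch,
        "image_index": image_index,
        "model_state_dict": model.state_dict(),
        "optimizer_state_dict": optimizer.state_dict(),
    }, path)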
def train(net: nn.Module,
          optimizer: torch.optim,
          train_dataloader: DataLoader = None,
          val_dataloader: DataLoader = None,
          is_earlystopping: bool = False) -> nn.Module:
    """
    Training loop iterating on the train dataloader and updating the model's weights.
    Inferring the validation dataloader & test dataloader, if given, to babysit the learning
    Activating cuda device if available.
    :return: Trained model
    """
    train_losses: np.array = np.zeros(NUM_EPOCHS)
    train_accuracy: np.array = np.zeros(NUM_EPOCHS)
    val_losses: np.array = np.zeros(NUM_EPOCHS)
    val_accuracy: np.array = np.zeros(NUM_EPOCHS)
    train_auc: np.array = np.zeros(NUM_EPOCHS)
    val_auc: np.array = np.zeros(NUM_EPOCHS)
    best_epoch: int = NUM_EPOCHS - 1

    if val_dataloader:
        untrained_test_loss, untrained_test_accuracy, untrained_test_auc = infer(
            net, val_dataloader, loss_fn)
        print(f'Test Loss before training: {untrained_test_loss:.5f}')

    for epoch in range(NUM_EPOCHS):
        print(f'*************** Epoch {epoch + 1} ***************')
        train_correct_counter = 0
        train_auc_accumulated = 0
        loss_running = 0
        net.train()
        for x_train, y_train in tqdm(train_dataloader):
            if x_train.shape[-1] == 224:
                y_train = torch.tensor(np.where(y_train == 3, 0, 1)).long()
            if train_on_gpu:
                net.cuda()
                x_train, y_train = x_train.cuda(), y_train.cuda()
            optimizer.zero_grad()
            y_train_pred = net(x_train)

            loss = loss_fn(y_train_pred, y_train)
            loss_running += loss.item()
            loss.backward()
            optimizer.step()
            _, train_preds = torch.max(y_train_pred, dim=1)
            train_correct_counter += torch.sum(train_preds == y_train)
            train_auc_accumulated += calculate_auc_score(y_true=y_train,
                                                         y_pred=train_preds)

        train_losses[epoch] = loss_running / len(train_dataloader)
        train_accuracy[epoch] = train_correct_counter.item() / len(
            train_dataloader.dataset)
        train_auc[epoch] = train_auc_accumulated / len(train_dataloader)

        if val_dataloader:
            val_loss, val_acc, val_auc_val = infer(net, val_dataloader,
                                                   loss_fn)
            val_losses[epoch] = val_loss
            val_accuracy[epoch] = val_acc
            val_auc[epoch] = val_auc_val

        if is_earlystopping and check_earlystopping(loss=val_losses,
                                                    epoch=epoch):
            print('EarlyStopping !!!')
            best_epoch = np.argmin(val_losses[:epoch + 1])
            break

        if epoch % PRINT_EVERY == 0:
            print(
                f"Epoch: {epoch + 1}/{NUM_EPOCHS},",
                f"Train loss: {train_losses[epoch]:.5f}, Train Num Correct: {train_correct_counter} "
                f"/ {len(train_dataloader.dataset)}, Train Accuracy: {train_accuracy[epoch]:.3f}\n",
                f"Validation loss: {val_losses[epoch]:.5f}, Validation Accuracy: {val_accuracy[epoch]:.3f}",
                f"Validation AUC: {val_auc[epoch]:.5f}, Train AUC: {train_auc[epoch]:.5f}"
            )

        if (epoch + 1) % SAVE_EVERY == 0:
            save_pt_model(net=net)

    if best_epoch != NUM_EPOCHS - 1:  # early stopping was triggered
        train_losses = train_losses[:best_epoch + 1]
        val_losses = val_losses[:best_epoch + 1]
    else:
        best_epoch = np.argmin(val_losses)

    print(
        f'Best Epoch: {best_epoch + 1}; Best Validation Loss: {val_losses[best_epoch]:.4f}'
    )
    if val_dataloader:
        print('val_accuracy', val_accuracy)
        print('val_loss', val_loss)
    print(train_losses)
    plot_values_by_epochs(train_values=train_losses, test_values=val_losses)
    return net