Code example #1
class LivelossCallback(AvgStatsCallback):
    def __init__(self, metrics):
        super().__init__(metrics)
        self.liveloss = PlotLosses(skip_first=0)
        self.metricnames = [m.__name__ for m in metrics]
        self.logs = {}

    def begin_epoch(self):
        super().begin_epoch()
        self.logs = {}
        self.iteration = 0

    def after_loss(self):
        super().after_loss()
        if self.in_train:
            self.iteration += 1
            print(
                "\r[%d, %5d] Train_loss: %.3f" %
                (self.epoch + 1, self.iteration, self.loss),
                end="",
            )

    def after_epoch(self):
        super().after_epoch()
        self.logs["loss"] = self.train_stats.avg_stats[0]
        self.logs["val_loss"] = self.valid_stats.avg_stats[0]
        for i, metric in enumerate(self.metricnames):
            self.logs[metric] = self.train_stats.avg_stats[i + 1].item()
            self.logs["val_" + metric] = self.valid_stats.avg_stats[i +
                                                                    1].item()
        self.liveloss.update(self.logs)
        self.liveloss.draw()
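One convention worth spelling out, since every example on this page relies on it: in the classic (pre-0.5) livelossplot API, a metric logged under the key 'val_X' is drawn on the same panel as the metric logged under 'X', which is why the snippets prefix validation entries with 'val_'. A minimal self-contained sketch with synthetic values:

# Minimal sketch of the 'val_' pairing convention (classic livelossplot API);
# the numbers are synthetic, purely for illustration.
from livelossplot import PlotLosses

liveloss = PlotLosses()
for epoch in range(5):
    liveloss.update({
        'loss': 1.0 / (epoch + 1),      # training curve
        'val_loss': 1.2 / (epoch + 1),  # drawn on the same panel as 'loss'
    })
    liveloss.draw()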
Code example #2
    def train_vae(self,
                  epochs=10,
                  hidden_size=2,
                  lr=0.0005,
                  recon_loss_method='mse'):
        """
        Handles the training of the VAE model.

        Parameters
        ----------
        epochs : int
            Number of complete passes over the whole training set.
        hidden_size : int
            Size of the latent space of the VAE.
        lr : float
            Learning rate for the VAE model training.
        recon_loss_method : str
            Method for the reconstruction loss calculation.

        Returns
        -------
        None

        """
        set_seed(42)  # Set the random seed
        self.model = VAE(hidden_size, self.input.shape)  # Initialise model

        # Create optimizer
        optimizer = optim.Adam(self.model.parameters(),
                               lr=lr,
                               betas=(0.9, 0.999))

        if self.plot_loss:
            liveloss = PlotLosses()
            liveloss.skip_first = 0
            liveloss.figsize = (16, 10)

        # Start training loop
        for epoch in range(1, epochs + 1):
            tl = train(epoch,
                       self.model,
                       optimizer,
                       self.train_loader,
                       recon_loss_method=recon_loss_method
                       )  # Train model on train dataset
            testl = test(epoch,
                         self.model,
                         self.test_loader,
                         recon_loss_method=recon_loss_method)

            if self.plot_loss:  # log train and test losses for dynamic plot
                logs = {}
                logs['ELBO'] = tl
                logs['val_ELBO'] = testl
                liveloss.update(logs)
                liveloss.draw()
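Here set_seed, VAE, train and test are project helpers not shown in this snippet. As a rough sketch of what the per-epoch train helper presumably does (the signature matches the call above; the ELBO details are assumptions, not the project's actual code):

# Hypothetical sketch of the train helper called above. The model is assumed
# to return (reconstruction, mu, logvar); the real project will differ.
import torch
import torch.nn.functional as F

def train(epoch, model, optimizer, train_loader, recon_loss_method='mse'):
    model.train()
    total_loss = 0.0
    for batch in train_loader:
        x = batch[0] if isinstance(batch, (tuple, list)) else batch
        optimizer.zero_grad()
        recon, mu, logvar = model(x)
        if recon_loss_method == 'mse':
            recon_loss = F.mse_loss(recon, x, reduction='sum')
        else:  # e.g. a Bernoulli likelihood
            recon_loss = F.binary_cross_entropy(recon, x, reduction='sum')
        # ELBO = reconstruction term + KL divergence term
        kl = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
        loss = recon_loss + kl
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(train_loader.dataset)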
Code example #3
def train(model, patch_train_loader, patch_val_loader, EPOCHS, learning_rate):
  loss_func = nn.MSELoss()
  optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)  # , weight_decay=0.99
  liveloss = PlotLosses()
  lr2_tr_loss = []
  lr2_val_loss = []
  model_losses, valid_losses = [], []
      
  for epoch in range(EPOCHS):
    print("epoch{}".format(epoch))
    model_losses, valid_losses = [], []
    logs = {}
    prefix = ''
      
    # with train data
    model.train()
    for idx, (data, target) in enumerate(patch_train_loader):
        # (torch.autograd.Variable is deprecated; .to() is enough here)
        data = data.to(device=device, dtype=torch.float)
        optimizer.zero_grad()
        pred = model(data)
        loss = loss_func(pred, data)
        # Backpropagation
        loss.backward()
        # update the parameters
        optimizer.step()
        # save the batch loss
        model_losses.append(loss.item())
        logs[prefix + 'MSE loss'] = loss.item()
          
    ## with validation data (defect-free samples only)
    model.eval()
    prefix = 'val_'
    with torch.no_grad():  # no gradients needed for validation
        for idx, (data, target) in enumerate(patch_val_loader):
            data = data.to(device=device, dtype=torch.float)
            pred = model(data)
            loss = loss_func(pred, data)
            valid_losses.append(loss.item())
            logs[prefix + 'MSE loss'] = loss.item()
             
    lr2_tr_loss.append(np.mean(model_losses))
    lr2_val_loss.append(np.mean(valid_losses))
    liveloss.update(logs)
    liveloss.draw()
    print ("Epoch:", epoch+1, " Training Loss: ", np.mean(model_losses), " Valid Loss: ", np.mean(valid_losses))
    ## Save the model at every epoch so that, if overfitting occurs, an earlier
    ## epoch's weights can be restored and used as the AE model.
    path = os.path.join("/content/drive/Shared drives/data/nocrop/model/hs/model{}".format(str(model)[11:12]),str(model)[:12] + '_epoch{}.pth'.format(epoch))
    torch.save(model.state_dict(), path)
    
    ## Save the model from the final epoch (epoch EPOCHS - 1) as the AE model
    if epoch == EPOCHS - 1:
        path = os.path.join("/content/drive/Shared drives/data/nocrop/model/hs",str(model)[:12] + '.pth')
        torch.save(model.state_dict(), path)
        return lr2_tr_loss, lr2_val_loss
Code example #4
def execute(model, n_epochs, trn_ldr, val_ldr, opti, crit, plot):
    '''
    This routine is responsible for the entire training process, and handles in-training plotting

    Arguments:
    model       : the model to be trained                                   // nn.Module
    n_epochs    : the number of epochs the model should be trained for      // integer
    trn_ldr     : the training dataloader                                   // dataloader
    val_ldr     : the validation dataloader                                 // dataloader
    opti        : the optimiser object                                      // optim
    crit        : the criterion (loss) function                             // nn loss function
    plot        : a flag denoting whether in-training plotting should occur // boolean

    Parameters:
    liveloss    : responsible for in-training plotting, activated by plot   // PlotLosses() object
    epoch       : the current epoch number                                  // integer
    logs        : holds the log data for the current epoch                  // dict
    trn_los     : the training loss for the current epoch                   // float
    trn_acc     : the training accuracy for the current epoch               // float
    val_los     : the validation loss for the current epoch                 // float
    val_acc     : the validation accuracy for the current epoch             // float

    Returns:
    model       : the final, trained model                                  // nn.Module
    '''

    if plot:
        liveloss = PlotLosses()  # initialise liveloss if plotting flag true

    for epoch in range(n_epochs):
        logs = {}

        trn_los, trn_acc = trn(model, opti, crit,
                               trn_ldr)  # run the training cycle
        logs['log loss'] = trn_los.item()
        logs['accuracy'] = trn_acc.item()  # update the logs

        val_los, val_acc = val(model, crit,
                               val_ldr)  # run the validation cycle
        logs['val_log loss'] = val_los.item()
        logs['val_accuracy'] = val_acc.item()  # update the logs

        if plot:
            liveloss.update(logs)
            liveloss.draw()  # draw the plots if the flag is true
        else:
            print("Epoch: " +
                  str(epoch))  # if not plotting, print the epoch number for tracking

    return model  # return finished trained model
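The trn and val helpers this routine calls are defined elsewhere; judging by the .item() calls, they return zero-dimensional tensors. A hypothetical sketch of that contract (names and internals are assumptions):

# Hypothetical sketch of the trn helper assumed above; val would mirror it
# without the backward pass. All names here are assumptions.
import torch

def trn(model, opti, crit, trn_ldr):
    model.train()
    total_loss, correct, n = 0.0, 0, 0
    for X, y in trn_ldr:
        opti.zero_grad()
        out = model(X)
        loss = crit(out, y)
        loss.backward()
        opti.step()
        total_loss += loss.item() * X.size(0)
        correct += (out.argmax(1) == y).sum().item()
        n += X.size(0)
    # return zero-dim tensors so the caller's .item() calls work
    return torch.tensor(total_loss / n), torch.tensor(correct / n)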
Code example #5
class LiveLossPlotListener(DojoListener):
    """
    DojoListener implementation which renders a livelossplot after finishing a dan.
    """
    def __init__(self):
        self.liveloss = None

    def training_started(self, aikidoka: Aikidoka, kata: Kata, kun: DojoKun):
        self.liveloss = PlotLosses()

    def dan_finished(self, aikidoka: Aikidoka, run: (int, int),
                     metrics: (float, float)):
        (loss, acc) = metrics

        self.liveloss.update({"loss": loss, "train_acc": acc})
        self.liveloss.draw()
Code example #6
def train_model_gener(model, criterion, optimizer, dataloaders, num_epochs=10):
    liveloss = PlotLosses()
    model = model.to(device)

    for epoch in range(num_epochs):
        logs = {}
        for phase in ['train', 'validation']:
            if phase == 'train':
                model.train()
            else:
                model.eval()

            running_loss = 0.0
            running_corrects = 0

            for inputs_full, labels_class in dataloaders[phase]:

                # the target is the input sequence shifted by one position
                # (next-token prediction); labels_class is unused here
                inputs = inputs_full[:, :-1].to(device)
                labels = inputs_full[:, 1:].to(device)

                outputs = model(inputs)

                loss = criterion(outputs, labels)

                if phase == 'train':
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

                _, preds = torch.max(outputs, 1)
                running_loss += loss.detach() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)

            epoch_loss = running_loss / len(dataloaders[phase].dataset)
            epoch_acc = running_corrects.float() / len(
                dataloaders[phase].dataset)

            prefix = ''
            if phase == 'validation':
                prefix = 'val_'

            logs[prefix + 'log loss'] = epoch_loss.item()
            logs[prefix + 'accuracy'] = epoch_acc.item()

        liveloss.update(logs)
        liveloss.draw()
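The two slices above turn each batch of sequences into a next-token-prediction problem. A tiny runnable illustration of what they produce:

# Toy illustration of the shift-by-one slicing used in train_model_gener.
import torch

seq = torch.tensor([[5, 3, 8, 1]])  # one sequence of token ids
inputs = seq[:, :-1]                # tensor([[5, 3, 8]])
labels = seq[:, 1:]                 # tensor([[3, 8, 1]]): predict the next token
print(inputs, labels)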
Code example #7
    def train(self, train_ds, valid_ds, plot_loss=True):
        # Initialize plotting
        if plot_loss:
            liveloss = PlotLosses()

        # Initialize DataLoaders
        tdl = DataLoader(train_ds, batch_size=self.batch_size, pin_memory=True)
        vdl = DataLoader(valid_ds,
                         batch_size=self.batch_size,
                         shuffle=False,
                         pin_memory=True)

        # Lists for losses
        train_losses, valid_losses = [], []
        # Lists for accuracies
        train_accs, valid_accs = [], []

        # Iterate over epochs
        for epoch in range(self.max_epochs):
            # Logs for livelossplot
            logs = {}

            batch_losses = []
            batch_count_goods = []
            # Iterate over batches
            self.model.train()  # make sure dropout/batch-norm are in training mode
            for idx_batch, batch in enumerate(tdl):
                x = batch[0].to(DEVICE)
                y = batch[1].to(device=DEVICE, dtype=torch.long)
                pred = self.model(x)
                loss = self.loss_fn(pred, y)
                batch_losses.append(loss.item())
                # Accuracy
                with torch.no_grad():
                    batch_count_goods.append(self.count_goods(pred, y))
                loss.backward()
                self.optimizer.step()
                self.optimizer.zero_grad()

            # Save train loss and accuracy for the epoch
            train_losses.append(sum(batch_losses) / len(train_ds))
            train_accs.append(sum(batch_count_goods) / len(train_ds))

            # Compute and save validation loss and accuracy for the epoch
            self.model.eval()  # switch dropout/batch-norm to inference mode
            with torch.no_grad():
                v_batch_losses, v_batch_count_goods = [], []
                for idx_batch, batch in enumerate(vdl):
                    x = batch[0].to(DEVICE)
                    y = batch[1].to(device=DEVICE, dtype=torch.long)
                    pred = self.model(x)
                    loss = self.loss_fn(pred, y)
                    v_batch_losses.append(loss.item())
                    v_batch_count_goods.append(self.count_goods(pred, y))
                valid_losses.append(sum(v_batch_losses) / len(valid_ds))
                valid_accs.append(sum(v_batch_count_goods) / len(valid_ds))

            if plot_loss:
                logs['log loss'] = train_losses[epoch]
                logs['val_log loss'] = valid_losses[epoch]
                logs['accuracy'] = train_accs[epoch]
                logs['val_accuracy'] = valid_accs[epoch]
                liveloss.update(logs)
                liveloss.draw()
Code example #8
# TO START:
# pip install livelossplot
# pip install neptune-cli
# neptune account login
# neptune run minimal-neptune.py
# enjoy results

from time import sleep
import numpy as np

from livelossplot import PlotLosses

liveplot = PlotLosses(target='neptune')
for i in range(20):
    liveplot.update({
        'accuracy': 1 - np.random.rand() / (i + 2.),
        'val_accuracy': 1 - np.random.rand() / (i + 0.5),
        'mse': 1. / (i + 2.),
        'val_mse': 1. / (i + 0.5)
    })
    liveplot.draw()
    sleep(.5)
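Note that this snippet targets the pre-0.5 livelossplot API: target='neptune' and draw() were removed in 0.5, where outputs are configured as objects and updates are flushed with send(). A rough port to the newer interface (assuming livelossplot >= 0.5; the Neptune integration is omitted here):

# Rough port of the loop above to the livelossplot 0.5+ API (an assumption
# about version; draw() was replaced by send()).
from time import sleep
import numpy as np

from livelossplot import PlotLosses

liveplot = PlotLosses()  # defaults to a matplotlib output
for i in range(20):
    liveplot.update({
        'accuracy': 1 - np.random.rand() / (i + 2.),
        'val_accuracy': 1 - np.random.rand() / (i + 0.5),
        'mse': 1. / (i + 2.),
        'val_mse': 1. / (i + 0.5)
    })
    liveplot.send()  # replaces draw()
    sleep(.5)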
Code example #9
def train(model, criterion, optimizer, train_dl, test_dl, num_epochs=40):
    liveloss = PlotLosses()
    for epoch in range(num_epochs):
        train_loss, valid_loss = [], []
        logs = {}
        prefix = ''
  
        # Training Part
        model.train()
        for i, data in enumerate(train_dl, 0):
            # Get the inputs
            inputs = labels = data
            inputs = inputs.cuda()
            labels = labels.cuda()
            
            inputs = inputs.float()
            labels = labels.float()
            
            # zero the parameter gradients
            optimizer.zero_grad()
            
            # forward + backward + optimize
            outputs = model(inputs)
            outputs = outputs.cuda()
            loss = criterion(outputs,labels)
            loss.backward()
            optimizer.step()
            
            ## -> Dense Output Re-feeding <- ##
            
            # Zero the gradients
            optimizer.zero_grad()

            # Important: detach() the output so the second backward pass does
            # not flow through the first pass's computation graph
            outputs = model(outputs.detach())
            outputs = outputs.cuda()
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            
            train_loss.append(loss.item())
            logs[prefix + 'MMSE loss'] = loss.item()
        
        # Validation part
        model.eval()  # switch to inference mode once, outside the batch loop
        with torch.no_grad():  # no gradients needed for validation
            for i, data in enumerate(test_dl, 0):
                inputs = labels = data
                inputs = inputs.cuda()
                labels = labels.cuda()

                inputs = inputs.float()
                labels = labels.float()

                outputs = model(inputs)
                loss = criterion(outputs, labels)

                valid_loss.append(loss.item())
                prefix = 'val_'
                logs[prefix + 'MMSE loss'] = loss.item()
        
        print()
        liveloss.update(logs)
        liveloss.draw()
        print("Epoch:", epoch + 1, " Training Loss: ", np.mean(train_loss), " Valid Loss: ", np.mean(valid_loss))
Code example #10
    def train_classifier(
        self,
        train_loader,
        test_loader,
        params: dict = None,
        livelossplot=False,
        save_checkpoint_each=None,
    ):
        """
        Method to train the model.

        Arguments:
        ----------
            - train_loader : DatasetLoader for the training set
            - test_loader : DatasetLoader for the test set
            - params (dict) : if needed to update some parameters such as epochs without rebuilding
            the entire class put the updated parameters here
            - livelossplot (bool=False): use livelossplot to plot running loss and error_rate
            - save_checkpoint_each (list): list of epoch when we want to save model
        """
        # Update parameters if given

        if save_checkpoint_each is None:
            save_checkpoint_each = [self.params_classifier["epochs"]]
        if params:
            for param, value in params.items():
                self.params_classifier[param] = value

        # Define liveloss and time of training start
        if livelossplot:
            liveloss = PlotLosses()
        since = time.time()

        # Show which device is used
        print("Using device {}".format(self.device))
        self.model.to(self.device)

        loader_dict = {"train": train_loader, "validation": test_loader}
        for e in range(self.params_classifier["epochs"]):
            self.logs = {}
            if not livelossplot:
                print("Epoch {}/{} :".format(e,
                                             self.params_classifier["epochs"]))
                print("--------------")
            # Alternate between train and validation phase
            for phase in ["train", "validation"]:
                if phase == "train":
                    self.model.train()
                else:
                    self.model.eval()

                # Running loss and count of incorrect predictions
                running_loss = 0.0
                running_uncorrects = 0

                # Loop over loader
                for images, labels in iter(loader_dict[phase]):
                    images = images.to(self.device)
                    # .to() avoids the copy warning torch.tensor() raises on tensors
                    labels = labels.to(self.device, dtype=torch.long)

                    # Compute forward
                    output = self.model.forward(images)
                    loss = self.loss(output, labels)

                    # Backpropagate if in the train phase
                    if phase == "train":
                        self.optimizer.zero_grad()
                        loss.backward()
                        self.optimizer.step()

                    # Compute prediction
                    _, predicted = torch.max(output, 1)
                    running_loss += loss.detach() * images.size(0)
                    running_uncorrects += torch.sum(
                        predicted != labels.data.detach())

                # Compute loss and error_rate
                size_loader = len(loader_dict[phase].dataset)
                epoch_loss = running_loss / size_loader
                epoch_error_rate = running_uncorrects.float() / size_loader

                # Set the prefix for logs
                prefix = ""
                if phase == "validation":
                    prefix = "val_"

                # Update logs
                self.logs[prefix + "log loss"] = epoch_loss.item()
                self.logs[prefix + "error_rate"] = epoch_error_rate.item()

            # Use liveloss to plot loss and accuracy
            if livelossplot:
                liveloss.update(self.logs)
                liveloss.draw()
            else:
                string_print = """
                Training:               |   Validation:
                    log loss = {}       |       val_log loss = {}
                    error_rate = {}     |       val_error_rate = {}
                """.format(
                    self.logs["log loss"],
                    self.logs["val_log loss"],
                    self.logs["error_rate"],
                    self.logs["val_error_rate"],
                )
                print(string_print)

            # Save checkpoint
            if (e + 1) in save_checkpoint_each:
                save_checkpoint(
                    self.model,
                    model_name="AlexNet_checkpoint_e{}.pth".format(e))

        # Print training time
        time_elapsed = time.time() - since
        print("Training complete in {:.0f}m {:.0f}s".format(
            time_elapsed // 60, time_elapsed % 60))
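save_checkpoint is an external helper of this project. A hypothetical minimal version consistent with how it is called above (name and behavior are assumptions):

# Hypothetical sketch of the save_checkpoint helper assumed above.
import torch

def save_checkpoint(model, model_name="checkpoint.pth"):
    # persist only the weights; the loading side would call load_state_dict()
    torch.save(model.state_dict(), model_name)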
Code example #11
def train_model_it(model,
                   dataloaders,
                   dataset_sizes,
                   criterion,
                   optimizer,
                   batch_size,
                   num_epochs=10,
                   scheduler=None):
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    since = time.time()
    liveloss = PlotLosses()
    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch + 1, num_epochs))
        print('-' * 10)
        running_loss = 0.0
        running_corrects = 0
        #Iteration
        for i, (inputs, labels) in enumerate(dataloaders['train']):
            if scheduler is not None:
                scheduler.step()
            model.train()
            # note: these are reset on every iteration, so it_loss/it_acc below
            # reflect only the most recent batch rather than a running average
            running_loss = 0.0
            running_corrects = 0
            inputs = inputs.to(device)
            labels = labels.to(device)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward
            # track history if only in train
            outputs = model(inputs)
            _, preds = torch.max(outputs, 1)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item() * inputs.size(0)
            running_corrects += torch.sum(preds == labels.data)
            print("\rTraining Iteration: {}/{}, Loss: {}.".format(
                i + 1, len(dataloaders['train']),
                loss.item() * inputs.size(0) / batch_size),
                  end="")
            sys.stdout.flush()

            if (i + 1) % 100 == 0:
                it_loss = running_loss / batch_size
                it_acc = running_corrects.double() / batch_size
                model.eval()
                val_loss = 0
                val_corr = 0
                for j, (inputs, labels) in enumerate(dataloaders['val']):
                    inputs = inputs.to(device)
                    labels = labels.to(device)
                    optimizer.zero_grad()
                    with torch.set_grad_enabled(False):
                        outputs = model(inputs)
                        _, preds = torch.max(outputs, 1)
                        loss = criterion(outputs, labels)
                    val_loss += loss.item() * inputs.size(0)
                    val_corr += torch.sum(preds == labels.data)
                    print("\rValidation Iteration: {}/{}, Loss: {}.".format(
                        j + 1, len(dataloaders['val']),
                        loss.item() * inputs.size(0) / batch_size),
                          end="")
                    sys.stdout.flush()
                valid_loss = val_loss / dataset_sizes['val']
                valid_acc = val_corr.double() / dataset_sizes['val']

                if valid_acc > best_acc:
                    best_acc = valid_acc
                    best_model_wts = copy.deepcopy(model.state_dict())
                    # statistics

                liveloss.update({
                    'log loss': it_loss,
                    'val_log loss': valid_loss,
                    'accuracy': it_acc,
                    'val_accuracy': valid_acc
                })

                liveloss.draw()
                print('validation loss: {}, validation accuracy: {}'.format(
                    valid_loss, valid_acc))
                print('Best Accuracy: {}'.format(best_acc))

                torch.save(
                    model.state_dict(),
                    "./models/acc_{}_loss_{}.pt".format(best_acc, valid_loss))

#         print('Train Loss: {:.4f} Acc: {:.4f}'.format(avg_loss, t_acc))
#         print(  'Val Loss: {:.4f} Acc: {:.4f}'.format(val_loss, val_acc))
#         print('Best Val Accuracy: {}'.format(best_acc))
        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model
Code example #12
def train_loop(model,
               device,
               optimizer,
               train_loader,
               test_loader,
               lr_scheduler=reduce_lr_scheduler,
               criterion=cross_entropy,
               epoch_value=10,
               plot_loss=False):
    lr_policy = lr_scheduler(optimizer)
    start = time.time()
    liveloss = PlotLosses()

    for epoch_ind in range(epoch_value):
        try:
            logs = {}
            model.train()
            train_loss = 0

            for ind, (input_s, target_s) in enumerate(train_loader):
                input_s = input_s.to(device)
                target_s = target_s.to(device)

                pred = model(input_s)
                loss = criterion(pred, target_s)
                model.zero_grad()
                loss.backward()
                optimizer.step()

                train_loss += loss.item()  # .item() avoids retaining the autograd graph

                del input_s, target_s
                gc.collect()
                torch.cuda.empty_cache()

            train_loss /= (ind + 1)

            test_loss = 0
            model.eval()

            with torch.no_grad():
                for ind, (input_s, target_s) in enumerate(test_loader):
                    input_s = input_s.to(device)
                    target_s = target_s.to(device)

                    pred = model(input_s)
                    loss = criterion(pred, target_s)

                    test_loss += loss.item()

                    del input_s, target_s
                    gc.collect()
                    torch.cuda.empty_cache()

                test_loss /= (ind + 1)
                lr_policy.step(test_loss)

            train_time = time_since(start)

            if plot_loss:
                # the original swapped these labels: train goes under 'loss',
                # test under 'val_loss'
                logs['loss'] = train_loss
                logs['val_loss'] = test_loss
                liveloss.update(logs)
                liveloss.draw()

            else:
                callback(train_loss, test_loss, train_time, epoch_value,
                         epoch_ind + 1)

        except KeyboardInterrupt:
            print(f"Early stopping | Epoch: {epoch_ind + 1}")
            break

    return model
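reduce_lr_scheduler, cross_entropy, callback and time_since are defined elsewhere in this project. The simplest of them, time_since, presumably looks something like this (a hypothetical sketch, not the project's actual code):

# Hypothetical sketch of the time_since helper assumed above.
import time

def time_since(start):
    s = time.time() - start
    return "{:.0f}m {:.0f}s".format(s // 60, s % 60)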
Code example #13
File: trainer.py  Project: Kotorinyanya/IMEGAT
def train_cross_validation(model_cls,
                           dataset,
                           dropout=0.0,
                           lr=1e-3,
                           weight_decay=1e-2,
                           num_epochs=200,
                           n_splits=10,
                           use_gpu=True,
                           dp=False,
                           ddp=False,
                           comment='',
                           tb_service_loc='192.168.192.57:6007',
                           batch_size=1,
                           num_workers=0,
                           pin_memory=False,
                           cuda_device=None,
                           tb_dir='runs',
                           model_save_dir='saved_models',
                           res_save_dir='res',
                           fold_no=None,
                           saved_model_path=None,
                           device_ids=None,
                           patience=20,
                           seed=None,
                           fold_seed=None,
                           save_model=False,
                           is_reg=True,
                           live_loss=True,
                           domain_cls=True,
                           final_cls=True):
    """
    :type fold_seed: int
    :param live_loss: bool
    :param is_reg: bool
    :param save_model: bool
    :param seed:
    :param patience: for early stopping
    :param device_ids: for ddp
    :param saved_model_path:
    :param fold_no: int
    :param ddp: bool, whether to use DistributedDataParallel
    :param cuda_device: list of int
    :param pin_memory: bool, DataLoader args
    :param num_workers: int, DataLoader args
    :param model_cls: pytorch Module cls
    :param dataset: instance
    :param dropout: float
    :param lr: float
    :param weight_decay:
    :param num_epochs:
    :param n_splits: number of kFolds
    :param use_gpu: bool
    :param dp: bool
    :param comment: comment in the logs, to filter runs in tensorboard
    :param tb_service_loc: tensorboard service location
    :param batch_size: Dataset args not DataLoader
    :return:
    """
    saved_args = locals()
    seed = int(time.time() % 1e4 * 1e5) if seed is None else seed
    saved_args['random_seed'] = seed

    torch.manual_seed(seed)
    np.random.seed(seed)
    if use_gpu:
        torch.cuda.manual_seed_all(seed)
        # torch.backends.cudnn.deterministic = True
        # torch.backends.cudnn.benchmark = False

    model_name = model_cls.__name__

    if not cuda_device:
        if device_ids and dp:
            device = device_ids[0]
        else:
            device = torch.device(
                'cuda' if torch.cuda.is_available() and use_gpu else 'cpu')
    else:
        device = cuda_device

    device_count = torch.cuda.device_count() if dp else 1
    device_count = len(device_ids) if (device_ids is not None
                                       and dp) else device_count

    batch_size = batch_size * device_count

    # TensorBoard
    log_dir_base = get_model_log_dir(comment, model_name)
    if tb_service_loc is not None:
        print("TensorBoard available at http://{1}/#scalars&regexInput={0}".
              format(log_dir_base, tb_service_loc))
    else:
        print("Please set up TensorBoard")

    # model
    criterion = nn.NLLLoss()

    print("Training {0} {1} models for cross validation...".format(
        n_splits, model_name))
    # 1
    # folds, fold = KFold(n_splits=n_splits, shuffle=False, random_state=seed), 0
    # 2
    # folds = GroupKFold(n_splits=n_splits)
    # iter = folds.split(np.zeros(len(dataset)), groups=dataset.data.site_id)
    # 4
    # folds = StratifiedKFold(n_splits=n_splits, random_state=fold_seed, shuffle=True if fold_seed else False)
    # iter = folds.split(np.zeros(len(dataset)), dataset.data.y.numpy(), groups=dataset.data.subject_id)
    # 5
    fold = 0
    iter = multi_site_cv_split(dataset.data.y,
                               dataset.data.site_id,
                               dataset.data.subject_id,
                               n_splits,
                               random_state=fold_seed,
                               shuffle=True if fold_seed else False)

    for train_idx, val_idx in tqdm_notebook(iter, desc='CV', leave=False):
        fold += 1
        liveloss = PlotLosses() if live_loss else None

        # for a specific fold
        if fold_no is not None:
            if fold != fold_no:
                continue

        writer = SummaryWriter(log_dir=osp.join('runs', log_dir_base +
                                                str(fold)))
        model_save_dir = osp.join('saved_models', log_dir_base + str(fold))

        print("creating dataloader tor fold {}".format(fold))

        train_dataset, val_dataset = norm_train_val(dataset, train_idx,
                                                    val_idx)

        model = model_cls(writer)

        train_dataloader = DataLoader(train_dataset,
                                      shuffle=True,
                                      batch_size=batch_size,
                                      collate_fn=lambda data_list: data_list,
                                      num_workers=num_workers,
                                      pin_memory=pin_memory)
        val_dataloader = DataLoader(val_dataset,
                                    shuffle=False,
                                    batch_size=batch_size,
                                    collate_fn=lambda data_list: data_list,
                                    num_workers=num_workers,
                                    pin_memory=pin_memory)

        if fold == 1 or fold_no is not None:
            print(model)
            writer.add_text('model_summary', model.__repr__())
            writer.add_text('training_args', str(saved_args))

        optimizer = torch.optim.AdamW(model.parameters(),
                                      lr=lr,
                                      betas=(0.9, 0.999),
                                      eps=1e-08,
                                      weight_decay=weight_decay,
                                      amsgrad=False)
        # scheduler_reduce = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, factor=0.5)
        scheduler = GradualWarmupScheduler(optimizer,
                                           multiplier=10,
                                           total_epoch=5)
        # scheduler = scheduler_reduce
        # optimizer = torch.optim.SGD(model.parameters(), lr=lr, weight_decay=weight_decay)
        if dp and use_gpu:
            model = model.cuda() if device_ids is None else model.to(
                device_ids[0])
            model = DataParallel(model, device_ids=device_ids)
        elif use_gpu:
            model = model.to(device)

        if saved_model_path is not None:
            model.load_state_dict(torch.load(saved_model_path))

        best_map, patience_counter, best_score = 0.0, 0, np.inf
        for epoch in tqdm_notebook(range(1, num_epochs + 1),
                                   desc='Epoch',
                                   leave=False):
            logs = {}

            # scheduler.step(epoch=epoch, metrics=best_score)

            for phase in ['train', 'validation']:

                if phase == 'train':
                    model.train()
                    dataloader = train_dataloader
                else:
                    model.eval()
                    dataloader = val_dataloader

                # Logging
                running_total_loss = 0.0
                running_corrects = 0
                running_reg_loss = 0.0
                running_nll_loss = 0.0
                epoch_yhat_0, epoch_yhat_1 = torch.tensor([]), torch.tensor([])
                epoch_label, epoch_predicted = torch.tensor([]), torch.tensor(
                    [])

                logging_hist = True if phase == 'train' else False  # once per epoch
                for data_list in tqdm_notebook(dataloader,
                                               desc=phase,
                                               leave=False):

                    # TODO: check devices
                    if dp:
                        data_list = to_cuda(data_list,
                                            (device_ids[0] if device_ids
                                             is not None else 'cuda'))

                    y_hat, domain_yhat, reg = model(data_list)

                    y = torch.tensor([],
                                     dtype=dataset.data.y.dtype,
                                     device=device)
                    domain_y = torch.tensor([],
                                            dtype=dataset.data.site_id.dtype,
                                            device=device)
                    for data in data_list:
                        y = torch.cat([y, data.y.view(-1).to(device)])
                        domain_y = torch.cat(
                            [domain_y,
                             data.site_id.view(-1).to(device)])

                    loss = criterion(y_hat, y)
                    domain_loss = criterion(domain_yhat, domain_y)
                    # domain_loss = -1e-7 * domain_loss
                    # print(domain_loss.item())
                    if domain_cls:
                        total_loss = domain_loss
                        _, predicted = torch.max(domain_yhat, 1)
                        label = domain_y
                    if final_cls:
                        total_loss = loss
                        _, predicted = torch.max(y_hat, 1)
                        label = y
                    if domain_cls and final_cls:
                        total_loss = (loss + domain_loss).sum()
                        _, predicted = torch.max(y_hat, 1)
                        label = y

                    if is_reg:
                        total_loss += reg.sum()

                    if phase == 'train':
                        # print(torch.autograd.grad(y_hat.sum(), model.saved_x, retain_graph=True))
                        optimizer.zero_grad()
                        total_loss.backward()
                        nn.utils.clip_grad_norm_(model.parameters(), 2.0)
                        optimizer.step()

                    running_nll_loss += loss.item()
                    running_total_loss += total_loss.item()
                    running_reg_loss += reg.sum().item()
                    running_corrects += (predicted == label).sum().item()

                    epoch_yhat_0 = torch.cat(
                        [epoch_yhat_0, y_hat[:, 0].detach().view(-1).cpu()])
                    epoch_yhat_1 = torch.cat(
                        [epoch_yhat_1, y_hat[:, 1].detach().view(-1).cpu()])
                    epoch_label = torch.cat(
                        [epoch_label,
                         label.detach().float().view(-1).cpu()])
                    epoch_predicted = torch.cat([
                        epoch_predicted,
                        predicted.detach().float().view(-1).cpu()
                    ])

                # precision = sklearn.metrics.precision_score(epoch_label, epoch_predicted, average='micro')
                # recall = sklearn.metrics.recall_score(epoch_label, epoch_predicted, average='micro')
                # f1_score = sklearn.metrics.f1_score(epoch_label, epoch_predicted, average='micro')
                accuracy = sklearn.metrics.accuracy_score(
                    epoch_label, epoch_predicted)
                epoch_total_loss = running_total_loss / dataloader.__len__()
                epoch_nll_loss = running_nll_loss / dataloader.__len__()
                epoch_reg_loss = running_reg_loss / dataloader.__len__()

                # print('epoch {} {}_nll_loss: {}'.format(epoch, phase, epoch_nll_loss))
                writer.add_scalars(
                    'nll_loss', {'{}_nll_loss'.format(phase): epoch_nll_loss},
                    epoch)
                writer.add_scalars('accuracy',
                                   {'{}_accuracy'.format(phase): accuracy},
                                   epoch)
                # writer.add_scalars('{}_APRF'.format(phase),
                #                    {
                #                        'accuracy': accuracy,
                #                        'precision': precision,
                #                        'recall': recall,
                #                        'f1_score': f1_score
                #                    },
                #                    epoch)
                if epoch_reg_loss != 0:
                    writer.add_scalars(
                        'reg_loss',
                        {'{}_reg_loss'.format(phase): epoch_reg_loss}, epoch)
                # print(epoch_reg_loss)
                # writer.add_histogram('hist/{}_yhat_0'.format(phase),
                #                      epoch_yhat_0,
                #                      epoch)
                # writer.add_histogram('hist/{}_yhat_1'.format(phase),
                #                      epoch_yhat_1,
                #                      epoch)

                # Save Model & Early Stopping
                if phase == 'validation':
                    model_save_path = model_save_dir + '-{}-{}-{:.3f}-{:.3f}'.format(
                        model_name, epoch, accuracy, epoch_nll_loss)
                    # best score
                    if accuracy > best_map:
                        best_map = accuracy
                        model_save_path = model_save_path + '-best'

                    score = epoch_nll_loss
                    if score < best_score:
                        patience_counter = 0
                        best_score = score
                    else:
                        patience_counter += 1

                    # skip first 10 epoch
                    # best_score = best_score if epoch > 10 else -np.inf

                    if save_model:
                        for th, pfix in zip(
                            [0.8, 0.75, 0.7, 0.5, 0.0],
                            ['-perfect', '-great', '-good', '-bad', '-miss']):
                            if accuracy >= th:
                                model_save_path += pfix
                                break

                        torch.save(model.state_dict(), model_save_path)

                    writer.add_scalars('best_val_accuracy',
                                       {'{}_accuracy'.format(phase): best_map},
                                       epoch)
                    writer.add_scalars(
                        'best_nll_loss',
                        {'{}_nll_loss'.format(phase): best_score}, epoch)

                    writer.add_scalars('learning_rate', {
                        'learning_rate':
                        scheduler.optimizer.param_groups[0]['lr']
                    }, epoch)

                    if patience_counter >= patience:
                        print("Stopped at epoch {}".format(epoch))
                        return

                if live_loss:
                    prefix = ''
                    if phase == 'validation':
                        prefix = 'val_'

                    logs[prefix + 'log loss'] = epoch_nll_loss
                    logs[prefix + 'accuracy'] = accuracy
            if live_loss:
                liveloss.update(logs)
                liveloss.draw()

    print("Done !")
Code example #14
class LiveLossPlot(Callback):
    """
    Callback to write metrics to `LiveLossPlot <https://github.com/stared/livelossplot>`_, a library for visualisation in notebooks

    Example: ::

        >>> import torch.nn
        >>> from torchbearer import Trial
        >>> from torchbearer.callbacks import LiveLossPlot

        # Example Trial which plots metrics with LiveLossPlot as training runs.
        >>> model = torch.nn.Linear(1,1)
        >>> live_loss_plot = LiveLossPlot()
        >>> trial = Trial(model, callbacks=[live_loss_plot], metrics=['acc'])

    Args:
        on_batch (bool): If True, batch metrics will be logged. Else batch metrics will not be logged
        batch_step_size (int): The number of batches between logging metrics
        on_epoch (bool): If True, epoch metrics will be logged every epoch. Else epoch metrics will not be logged
        draw_once (bool): If True, draw the plot only at the end of training. Else draw every time metrics are logged
        kwargs: Keyword arguments for livelossplot.PlotLosses

    State Requirements:
        - :attr:`torchbearer.state.METRICS`: Metrics should be a dict containing the metrics to be plotted
        - :attr:`torchbearer.state.BATCH`: Batch should be the current batch or iteration number in the epoch
    """
    def __init__(self,
                 on_batch=False,
                 batch_step_size=10,
                 on_epoch=True,
                 draw_once=False,
                 **kwargs):
        super(LiveLossPlot, self).__init__()
        self._kwargs = kwargs

        self.on_batch = on_batch
        self.on_epoch = on_epoch
        self.draw_once = draw_once
        self.batch_step_size = batch_step_size

        if on_batch:
            self.on_step_training = self._on_step_training

        if on_epoch:
            self.on_end_epoch = self._on_end_epoch

    def on_start(self, state):
        from livelossplot import PlotLosses
        self.plt = PlotLosses(**self._kwargs)
        self.batch_plt = PlotLosses(**self._kwargs)

    def _on_step_training(self, state):
        self.batch_plt.update({
            k: get_metric('LiveLossPlot', state, k)
            for k in state[torchbearer.METRICS]
        })
        if state[torchbearer.
                 BATCH] % self.batch_step_size == 0 and not self.draw_once:
            with no_print():
                self.batch_plt.draw()

    def _on_end_epoch(self, state):
        self.plt.update({
            k: get_metric('LiveLossPlot', state, k)
            for k in state[torchbearer.METRICS]
        })
        if not self.draw_once:
            with no_print():
                self.plt.draw()

    def on_end(self, state):
        if self.draw_once:
            with no_print():
                self.batch_plt.draw()
                self.plt.draw()
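For completeness, a runnable sketch of wiring this callback into a torchbearer Trial on toy data (the model and data below are placeholders, not part of the callback's source):

# Hedged sketch of using LiveLossPlot with a torchbearer Trial on toy data.
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from torchbearer import Trial
from torchbearer.callbacks import LiveLossPlot

X, y = torch.randn(64, 1), torch.randn(64, 1)
loader = DataLoader(TensorDataset(X, y), batch_size=8)

model = nn.Linear(1, 1)
optimizer = torch.optim.Adam(model.parameters())
trial = Trial(model, optimizer, nn.MSELoss(), metrics=['loss'],
              callbacks=[LiveLossPlot()])
trial.with_train_generator(loader).run(epochs=5)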
Code example #15
def train_model(model,
                optimizer,
                criterion,
                n_epochs,
                train_loader,
                validation_loader=None,
                device='cpu',
                random_seed=42,
                backup_folder=None):
    """ Train a model for a number of epochs.
    Visualizes average loss, F1 and accuracy score over epochs
    If a folder is given, saves state dict and scores to disk after each epoch

    Parameters
    ----------
    model : torch.nn.Module
        The neural network model to train
    
    optimizer : torch.optim.Optimizer
        Optimizer to use in training
    
    criterion : loss function in torch.nn
    
    n_epochs : int
        number of training iterations over entire train dataset
    
    train_loader : torch.utils.data.DataLoader
        batch data loader of train data
    
    validation_loader : torch.utils.data.DataLoader, optional
        batch data loader of validation data, if available
        
    device : str, optional
        'cpu' or 'cuda'; hardware accelerator to use

    random_seed : int, optional
        seed number for RNGs
    
    backup_folder : path-like, optional
        folder where model parameters are saved after each epoch
        !will delete all contents of folder first!
    
    Returns
    -------
    model : torch.nn.Module
        model with optimized weights after training
    
    validation_loss : float
        average loss of final model on all samples from validation/test set
    
    validation_accuracy : float
        accuracy score of final model on validation/test set
    """

    set_seed(
        random_seed)  # seed all RNGs before start to have reproducible results
    model = model.to(device)  # move the model to the target device

    if backup_folder is not None:
        if os.path.isdir(backup_folder):
            shutil.rmtree(backup_folder)
        elif os.path.exists(backup_folder):
            os.remove(backup_folder)

        os.mkdir(backup_folder)
        with open(os.path.join(backup_folder, "training_report.csv"),
                  'w') as csv_report:
            csv_report.write(
                "epoch,train_loss,train_accuracy,train_f1,validation_loss,validation_accuracy,validation_f1\n"
            )

    # use special (faster) data loaders if running on TPU
    if TPU_AVAILABLE and (device not in ['cpu', 'cuda']):
        train_loader = xla_loader(train_loader,
                                  [device]).per_device_loader(device)
        if validation_loader is not None:
            validation_loader = xla_loader(validation_loader,
                                           [device]).per_device_loader(device)

    live_plot = PlotLosses()
    # training loop
    for epoch in range(n_epochs):
        logs = dict()
        # do mini-batch SGD over all training samples
        train_loss, train_accuracy, train_f1 = train(model, optimizer,
                                                     criterion, train_loader,
                                                     device)

        if backup_folder is not None:
            torch.save(
                model.state_dict(),
                os.path.join(backup_folder,
                             "model_epoch_{:d}.pth".format(epoch)))

        logs['log loss'] = train_loss
        logs['f1 score'] = train_f1
        logs['accuracy'] = train_accuracy

        # evaluate model on validation/test set
        if validation_loader is not None:
            validation_loss, validation_accuracy, validation_f1 = validate(
                model, criterion, validation_loader, device)
            logs['val_log loss'] = validation_loss
            logs['val_f1 score'] = validation_f1
            logs['val_accuracy'] = validation_accuracy
        else:
            validation_loss, validation_accuracy, validation_f1 = 0, 0, 0

        if backup_folder is not None:
            with open(os.path.join(backup_folder, "training_report.csv"),
                      'a') as csv_report:
                csv_report.write(
                    "{:d},{:.4f},{:.4f},{:.4f},{:.4f},{:.4f},{:.4f}\n".format(
                        epoch, train_loss, train_accuracy, train_f1,
                        validation_loss, validation_accuracy, validation_f1))

        # draw the visualization of average loss and accuracy
        live_plot.update(logs)
        live_plot.draw()

    return model, validation_loss, validation_accuracy, validation_f1
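Here set_seed, train, validate and xla_loader are project helpers not shown in this listing. A hypothetical minimal set_seed consistent with the docstring's "seed all RNGs" (the real helper may also handle TPU seeding, given the xla references):

# Hypothetical sketch of the set_seed helper assumed above.
import random

import numpy as np
import torch

def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)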
Code example #16
class train_wrapper():
    """
    Class that keeps a model, its optimiser and dataloaders together.
    Stores the train, validate and evaluate functions for training, as well
    as some other useful methods to carry out the training with a live plot
    and save the model.
    """
    
    def __init__(self, model, optimizer, train_loader, validate_loader,
        criterion=nn.CrossEntropyLoss(), device="cpu", keep_best=0):
        "Stores the parameters on the class instance for later methods"

        # explicit assignment is clearer (and safer) than the exec() trick
        self.model = model
        self.optimizer = optimizer
        self.train_loader = train_loader
        self.validate_loader = validate_loader
        self.criterion = criterion
        self.device = device
        self.keep_best = keep_best

        try:
            self.transform = validate_loader.dataset.transform
        except AttributeError:
            self.transform = None
            print("No transform found, test data must be normalised manually")
        
        # store the liveloss as it holds all our logs, useful for later
        self.liveloss = PlotLosses()
        # store the best model params
        self.best_params_dict = {}
        # store the current epoch between training batches
        self.epoch = 0
        # for keeping the best model params
        self.max_acc=0.
            
        return
    
    
    def train(self):
        "Train a single epoch"
        
        # set the model to expect a backward pass
        self.model.train()
        
        train_loss, train_accuracy = 0, 0
        
        # for every training batch
        for X, y in self.train_loader:
            
            # put the samples on the device
            X, y = X.to(self.device), y.to(self.device)
            
            # zero the gradients
            self.optimizer.zero_grad()
            
            # find the model output with current parameters
            output = self.model(X)
            
            # calculate the loss against the expected output
            loss = self.criterion(output, y)
            
            # propagate the gradients though the network
            loss.backward()
            
            # store the loss (scaled by batch size for averaging)
            train_loss += loss * X.size(0)
            
            # find the predictions from this output
            y_pred = F.log_softmax(output, dim=1).max(1)[1]
            
            # compare to expected output to find the accuracy
            train_accuracy += accuracy_score(y.cpu().numpy(), y_pred.detach().cpu().numpy())*X.size(0)
            
            # improve the parameters
            self.optimizer.step()

        # return the mean loss and accuracy of this epoch
        N_samp = len(self.train_loader.dataset)
        return train_loss/N_samp, train_accuracy/N_samp
    
    
    def validate(self):
        """
        Find the loss and accuracy of the current model parameters to the
        validation data set
        """
        
        # if no validation set present return zeros
        if self.validate_loader is None:
            return torch.tensor(0.), torch.tensor(0.)
        
        # set the model to not expect a backward pass
        self.model.eval()
        
        validation_loss, validation_accuracy = 0., 0.
        
        # for every validate batch
        for X, y in self.validate_loader:
            
            # tell autograd not to track gradients
            with torch.no_grad():
                
                # put the samples on the device
                X, y = X.to(self.device), y.to(self.device)
                
                # find the model output with current parameters
                output = self.model(X)
                
                # calculate the loss against the expected output
                loss = self.criterion(output, y)
                
                # store the loss (scaled by batch size for averaging)
                validation_loss += loss * X.size(0)
                
                # find the predictions from this output
                y_pred = F.log_softmax(output, dim=1).max(1)[1]
                
                # compare to expected output to find the accuracy
                validation_accuracy += accuracy_score(y.cpu().numpy(), y_pred.cpu().numpy())*X.size(0)
        
        # return the mean loss and accuracy of this epoch
        N_samp = len(self.validate_loader.dataset)
        return validation_loss/N_samp, validation_accuracy/N_samp
    
    
    def evaluate(self, test_data, prob_output=True):
        """
        Find the prediction of the current model parameters with the test
        data set and return both the predicted and actual labels
        """
        

        # set the model to not expect a backward pass
        self.model.eval()
        
        y_preds = []
        
        # for every test batch
        for X in test_data:
            
            # normalise the test data with the validation set's transformation
            if self.transform:
                X = self.transform(X)

        
            # tell autograd not to track gradients
            with torch.no_grad():
                
                # put the samples on the device
                X = X.to(self.device)
                
                # find the model output with current parameters
                output = self.model(X.view(-1, 1, 28, 28))
                
                # find the predictions from this output
                y_pred = F.log_softmax(output, dim=1)
                if not prob_output:
                    y_pred = y_pred.max(1)[1]
                
                # store the predicted and actual outcomes
                y_preds.append(y_pred.cpu().numpy())

        # return the array of predictions
        return np.concatenate(y_preds, 0)
    
    
    def train_model(self, epochs):
        """
        Do a live plot of the training accuracy and loss as the model is trained
        """
        
        for _ in range(epochs):
            logs = {}
            train_loss, train_accuracy = self.train()

            logs['log loss'] = train_loss.item()
            logs['accuracy'] = train_accuracy.item()

            validation_loss, validation_accuracy = self.validate()
            logs['val_log loss'] = validation_loss.item()
            logs['val_accuracy'] = validation_accuracy.item()

            # keep the best parameters once past the keep_best epoch threshold
            if self.keep_best:
                if train_accuracy.item() > self.max_acc and self.epoch > self.keep_best:
                    self.max_acc = train_accuracy.item()
                    self.best_params_dict = self.model.state_dict()
            
            self.liveloss.update(logs)
            self.liveloss.draw()
            self.epoch += 1
            
        print("Training Finished")
        return
    
    
    def save_model(self, name, path="/content/gdrive/My Drive/models/"):
        """
        Pickle either the whole model or its parameter dictionary
        via torch's save methods
        """
        
        dict = {"model":self.model, "transform":self.transform,
                "Liveloss":self.liveloss}
        torch.save(dict, path + name)
        print("saved to " + path + name)
    

    def num_model_params(self):
        n_params = sum([t.cpu().detach().numpy().size 
                        for t in self.model.parameters()])
        print("Number of model Parameters: ", n_params)
        return n_params


    def max_acc_epoch(self):
        max_acc = self.liveloss.metrics_extrema['val_accuracy']['max']
        for log in self.liveloss.logs:
            if log["val_accuracy"] == max_acc:
                return log["_i"]#

            
    def confusion_matrix(self):
        
        y_preds, ys = [], []
        
        # same code as validate
        self.model.eval()
        
        for X, y in self.validate_loader:
            with torch.no_grad():
                X, y = X.to(self.device), y.to(self.device)
                output = self.model(X)
                y_pred = F.log_softmax(output, dim=1)
                y_pred = y_pred.max(1)[1]
                
                y_preds.append(y_pred.cpu().numpy())
                ys.append(y.cpu().numpy())
        
        # concatenate handles a final batch of a different size,
        # which np.array(...).flatten() would silently mangle
        y_preds = np.concatenate(y_preds)
        ys = np.concatenate(ys)
        
        return ConfusionMatrix(actual_vector=ys, predict_vector=y_preds)
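
A hedged usage sketch for the confusion_matrix() helper above: the return statement implies pycm's ConfusionMatrix, which can be printed directly and exposes overall statistics. The `trainer` instance name is a placeholder, not from the original code.

# Minimal sketch, assuming `trainer` is an instance of the class above
# and that pycm's ConfusionMatrix API is available.
cm = trainer.confusion_matrix()
cm.print_matrix()       # raw count matrix, one row per actual class
print(cm.Overall_ACC)   # overall accuracy (a pycm overall statistic)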
Code example #17
File: vbcar.py  Project: mindis/VBCAR
    def train(self):
        """Multiple training.

        Returns:
            None.
        """
        max_noprogress = 5  # stop after this many iterations without progress
        _loss_train_min = 1e-5  # smallest loss change that still counts as progress
        n_noprogress = 0

        process_bar = tqdm(range(self.iteration))
        liveloss = PlotLosses(fig_path=self.output_file_name + ".iter.pdf")
        loss_list = []
        _best_ndcg = 0
        for i in process_bar:
            logs = {}
            all_loss = 0
            kl_loss = 0
            batch_num = 0
            for batch_ndx, sample in enumerate(self.data_loader):
                pos_u = torch.tensor(
                    [triple[0] for triple in sample],
                    dtype=torch.int64,
                    device=self.device,
                )
                pos_i_1 = torch.tensor(
                    [triple[1] for triple in sample],
                    dtype=torch.int64,
                    device=self.device,
                )
                pos_i_2 = torch.tensor(
                    [triple[2] for triple in sample],
                    dtype=torch.int64,
                    device=self.device,
                )

                neg_u = torch.tensor(
                    self.data.user_sampler.sample(self.n_neg, len(sample)),
                    dtype=torch.int64,
                    device=self.device,
                )
                neg_i_1 = torch.tensor(
                    self.data.item_sampler.sample(self.n_neg, len(sample)),
                    dtype=torch.int64,
                    device=self.device,
                )
                neg_i_2 = torch.tensor(
                    self.data.item_sampler.sample(self.n_neg, len(sample)),
                    dtype=torch.int64,
                    device=self.device,
                )
                self.optimizer.zero_grad()
                loss = self.model.forward(pos_u, pos_i_1, pos_i_2, neg_u,
                                          neg_i_1, neg_i_2)
                loss.backward()
                self.optimizer.step()
                all_loss = all_loss + loss.detach()  # detach so the graph is freed
                kl_loss = kl_loss + self.model.kl_loss
                batch_num = batch_ndx + 1  # enumerate() is 0-based
            if self.device.type == "cuda":
                all_loss = all_loss.cpu()
                if kl_loss != 0:
                    kl_loss = kl_loss.cpu()

            logs["loss"] = all_loss.item() / batch_num

            if self.show_result:
                data_i = np.random.randint(10)
                result = self.data.evaluate_vali(self.data.test[data_i],
                                                 self.model)
                logs["ndcg@10_test"], logs["recall@10_test"] = (
                    result["ndcg@10"],
                    result["recall@10"],
                )
                result = self.data.evaluate_vali(self.data.validate[data_i],
                                                 self.model)
                logs["ndcg@10_val"], logs["recall@10_val"] = (
                    result["ndcg@10"],
                    result["recall@10"],
                )
                if _best_ndcg < result["ndcg@10"]:
                    _best_ndcg = result["ndcg@10"]
                    self.best_model = copy.deepcopy(self.model.state_dict())
                    torch.save(self.best_model, self.output_file_name)
            if kl_loss != 0:
                logs["kl_loss"] = kl_loss.item() / batch_num
                logs["loss"] = logs["loss"] - logs["kl_loss"]

            loss_list.append(logs["loss"])

            if i > 1:
                if abs(loss_list[i] - loss_list[i - 1]) < _loss_train_min:
                    n_noprogress += 1
                else:
                    n_noprogress = 0

            liveloss.update(logs)
            liveloss.draw()
            process_bar.set_description(
                "Loss: %0.8f, lr: %0.6f" %
                (logs["loss"], self.optimizer.param_groups[0]["lr"]))
            print("=== #no progress: ", n_noprogress)

            if n_noprogress >= max_noprogress:
                liveloss.draw()
                break
            """Sets the learning rate to the initial LR decayed by 10 every 10 epochs"""
            lr = self.initial_lr * (0.5**(i // 10))
            for param_group in self.optimizer.param_groups:
                param_group["lr"] = lr
            if i >= self.iteration - 1:
                liveloss.draw()
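
The manual schedule at the end of this loop (lr = initial_lr * 0.5 ** (i // 10)) halves the learning rate every 10 iterations. A minimal sketch of the equivalent built-in scheduler, with a stand-in model and optimizer:

import torch
from torch.optim.lr_scheduler import StepLR

model = torch.nn.Linear(4, 2)                             # stand-in model
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
scheduler = StepLR(optimizer, step_size=10, gamma=0.5)    # halve LR every 10 steps

for i in range(30):
    optimizer.step()    # placeholder for the real batch loop above
    scheduler.step()    # after this, lr == 1e-3 * 0.5 ** ((i + 1) // 10)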
Code example #18
def main():
    global best_test_bpd

    last_checkpoints = []
    lipschitz_constants = []
    ords = []

    # if args.resume:
    #     validate(args.begin_epoch - 1, model, ema)

    liveloss = PlotLosses()

    for epoch in range(args.begin_epoch, args.nepochs):
        logs = {}

        logger.info('Current LR {}'.format(optimizer.param_groups[0]['lr']))

        running_loss = train(epoch, model)

        lipschitz_constants.append(get_lipschitz_constants(model))

        ords.append(get_ords(model))

        logger.info('Lipsh: {}'.format(pretty_repr(lipschitz_constants[-1])))
        logger.info('Order: {}'.format(pretty_repr(ords[-1])))

        # average the running loss over the CIFAR-10 training-set size
        epoch_loss = running_loss / len(
            datasets.CIFAR10(
                args.dataroot, train=True, transform=transform_train))

        logs['log loss'] = epoch_loss.item()

        liveloss.update(logs)
        liveloss.draw()

        if args.ema_val:
            test_bpd = validate(epoch, model, ema)
        else:
            test_bpd = validate(epoch, model)

        if args.scheduler and scheduler is not None:
            scheduler.step()

        if test_bpd < best_test_bpd:
            best_test_bpd = test_bpd

            utils.save_checkpoint(
                {
                    'state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'args': args,
                    'ema': ema,
                    'test_bpd': test_bpd,
                },
                os.path.join(args.save, 'moMoModels'),
                epoch,
                last_checkpoints,
                num_checkpoints=5)
            """
            utils.save_checkpoint({
                'state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'args': args,
                'ema': ema,
                'test_bpd': test_bpd,
            }, os.path.join(args.save, 'mMoModels'), epoch, last_checkpoints, num_checkpoints=5)
            
            utils.save_checkpoint({
                'state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'args': args,
                'ema': ema,
                'test_bpd': test_bpd,
            }, os.path.join(args.save, 'mModels'), epoch, last_checkpoints, num_checkpoints=5)
            
            utils.save_checkpoint({
                'state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'args': args,
                'ema': ema,
                'test_bpd': test_bpd,
            }, os.path.join(args.save, 'models'), epoch, last_checkpoints, num_checkpoints=5)
            """

        torch.save(
            {
                'state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'args': args,
                'ema': ema,
                'test_bpd': test_bpd,
            }, os.path.join(args.save, 'models',
                            '010mmoosttMoosttRecentt.pth'))
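
For completeness, a hedged sketch of restoring a checkpoint saved in the format above; the `model` and `optimizer` objects are placeholders that must be constructed the same way as in the original script.

import torch

# Keys mirror the dict passed to torch.save(...) above.
checkpoint = torch.load('models/010mmoosttMoosttRecentt.pth', map_location='cpu')
model.load_state_dict(checkpoint['state_dict'])                # placeholder model
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])  # placeholder optimizer
best_test_bpd = checkpoint['test_bpd']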
        """
Code example #19
def train_model(model,
                dataloaders,
                dataset_sizes,
                criterion,
                optimizer,
                scheduler,
                num_epochs=25):
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    since = time.time()
    liveloss = PlotLosses()
    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch + 1, num_epochs))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                model.eval()  # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0

            # Iterate over data.
            for i, (inputs, labels) in enumerate(dataloaders[phase]):
                inputs = inputs.to(device)
                labels = labels.to(device)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)

                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                # statistics
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)
                print("\rIteration: {}/{}, Loss: {}.".format(
                    i + 1, len(dataloaders[phase]),
                    loss.item() * inputs.size(0)),
                      end="")

                #                 print( (i+1)*100. / len(dataloaders[phase]), "% Complete" )
                sys.stdout.flush()

            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects.double() / dataset_sizes[phase]
            if phase == 'train':
                avg_loss = epoch_loss
                t_acc = epoch_acc
            else:
                val_loss = epoch_loss
                val_acc = epoch_acc

            # deep copy the model weights whenever validation accuracy improves
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())

        # step the LR scheduler once per epoch, after the optimizer has stepped
        scheduler.step()

        liveloss.update({
            'log loss': avg_loss,
            'val_log loss': val_loss,
            'accuracy': t_acc.item(),
            'val_accuracy': val_acc.item()
        })

        liveloss.draw()
        print('Train Loss: {:.4f} Acc: {:.4f}'.format(avg_loss, t_acc))
        print('Val Loss: {:.4f} Acc: {:.4f}'.format(val_loss, val_acc))
        print('Best Val Accuracy: {}'.format(best_acc))
        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:.4f}'.format(best_acc))

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model
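
The torch.set_grad_enabled(phase == 'train') context above is what lets one loop serve both phases: autograd tracking is on only during training. A minimal self-contained illustration:

import torch

x = torch.randn(3, requires_grad=True)
for phase in ['train', 'val']:
    with torch.set_grad_enabled(phase == 'train'):
        y = (x * 2).sum()
    print(phase, y.requires_grad)   # True in 'train', False in 'val'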
Code example #20
        best_acc = test_correct / test_total


#        checkpoint = torch.load('./checkpoint/Sqnet_1x_v1.0/Sqnet_1x_v1.0_Cifar10.ckpt')
#        net.load_state_dict(checkpoint['net_state_dict'])
#        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

liveloss = PlotLosses()
best_cost = float('inf')  # fastest epoch time seen so far
for _epoch in range(start_epoch, start_epoch + num_epochs):
    start_time = time.time()
    train(_epoch)
    print()
    test(_epoch)
    print()
    print()
    end_time = time.time()
    print('Epoch #%d Cost %ds' % (_epoch, end_time - start_time))
    epoch_cost = end_time - start_time
    if epoch_cost < best_cost:
        best_cost = epoch_cost

    liveloss.update({
        'log loss': train_loss,
        'val_log loss': test_loss,
        'accuracy': train_correct,
        'val_accuracy': test_correct
    })
    liveloss.draw()
print('Best Cost: %ds' % (best_cost))
print('Best Acc: %.4f percent' % (best_acc * 100))
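
Every example on this page feeds livelossplot the same convention: a metric key plus a 'val_'-prefixed twin, which PlotLosses draws on a shared panel. A minimal self-contained sketch with dummy numbers, no real training:

from livelossplot import PlotLosses

liveloss = PlotLosses()
for epoch in range(3):
    liveloss.update({
        'log loss': 1.0 / (epoch + 1),       # training metric
        'val_log loss': 1.2 / (epoch + 1),   # same metric on the validation split
        'accuracy': 0.5 + 0.1 * epoch,
        'val_accuracy': 0.45 + 0.1 * epoch,
    })
    liveloss.draw()   # 'x' and 'val_x' share one chart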
Code example #21
def fit_model(train_loader,
              val_loader,
              model,
              optimizer,
              scheduler,
              n_epochs,
              log_interval,
              plot=True,
              burnin=-1,
              patience=3,
              early_stop_score='MAP',
              eval_metric='cosine'):
    early_stop = {}
    early_stop['best'] = -float('inf')
    early_stop['best_params'] = to_cpu(model.state_dict())
    early_stop['fails'] = 0
    early_stop['val_scores'] = {}  # avoids a KeyError if patience runs out before any improvement

    if plot:
        liveloss = PlotLosses()
    for epoch in range(n_epochs):
        logs = {}
        start_time = time.time()

        # Training
        train_loss = train_epoch(train_loader, model, optimizer)
        train_scores = {}
        # Turned off to speed up optimisation runs
        # if epoch > 0 and epoch % log_interval == 0:
        # train_scores = evaluate_ranking(model, train_loader, metric=eval_metric)

        elapsed = time.time() - start_time
        message = '\n' + '=' * 80
        message += '\nTrain:     '
        message += f' epoch: {epoch:2d}, time: {int(elapsed):d}s., loss: {train_loss:5.3f}'
        if 'silhouette' in train_scores:
            message += f', silhouette: {train_scores["silhouette"]:.2f}'
        message += '\n'

        # Validation
        start_time = time.time()
        val_loss = test_epoch(val_loader, model)
        val_scores = {}

        if epoch > 0 and epoch % log_interval == 0:
            train_label_set = list(set(train_loader.dataset.labels))
            val_scores = evaluate_ranking(model,
                                          val_loader,
                                          train_label_set,
                                          metric=eval_metric)

            # early stopping
            if val_scores[early_stop_score] > early_stop['best']:
                early_stop['best'] = val_scores[early_stop_score]
                early_stop['best_params'] = to_cpu(model.state_dict())
                early_stop['fails'] = 0
                early_stop['val_scores'] = val_scores
            else:
                early_stop['fails'] += 1
            if early_stop['fails'] >= patience:
                raise EarlyStopException(early_stop['best'],
                                         early_stop['best_params'],
                                         early_stop['fails'],
                                         early_stop['val_scores'])

        elapsed = time.time() - start_time

        message += 'Validation:'
        message += f' epoch: {epoch:2d}, time: {int(elapsed):d}s., loss: {val_loss:5.3f}'
        if 'silhouette' in val_scores:
            message += f', silhouette: {val_scores["silhouette"]:.2f}'
            message += f'\n            MAP: {val_scores["MAP"]:.2f}'
            message += f', MAP (seen): {val_scores["MAP seen labels"]:.2f}'
            message += f', MAP (unseen): {val_scores["MAP unseen labels"]:.2f}'
        message += '\n'
        message += '=' * 80 + '\n'
        print(message)

        logs['loss'] = train_loss
        logs['val_loss'] = val_loss
        for score, value in train_scores.items():
            logs[score] = value
        for score, value in val_scores.items():
            logs[f'val_{score}'] = value

        if epoch > burnin:
            scheduler.step(val_loss)

        if plot:
            liveloss.update(logs)
            liveloss.draw()

    # return data in case it never early stopped
    return early_stop
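
Since early stopping here is signalled by raising EarlyStopException, the caller presumably catches it. A hedged sketch, assuming the exception stores its constructor arguments (best, best_params, fails, val_scores) as attributes of the same names:

# Hypothetical caller; the attribute names on EarlyStopException are assumptions.
try:
    early_stop = fit_model(train_loader, val_loader, model, optimizer,
                           scheduler, n_epochs=50, log_interval=5)
except EarlyStopException as stop:
    model.load_state_dict(stop.best_params)   # restore the best weights seen
    print('stopped early, best score:', stop.best)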
Code example #22
File: train.py  Project: jkoscialkowski/gsn-projekt
    def train(self,
              train_ds,
              valid_ds,
              plot_loss=True,
              verbose=True,
              save_path=None,
              need_y: str = 'no'):
        """Method for training, takes train and validation Datasets, as well
        as parameters specifying training monitoring and trains a network for
        a given set of hyperparameters.

        :param train_ds: training Dataset
        :param valid_ds: validation Dataset
        :param plot_loss: whether to plot loss during training
        :param verbose: whether to print loss after each epoch
        :param save_path: if given, serialises the model and saves there
        :param need_y: 'yes' to also extract targets, which is required to
            train attention-based models with a 'state' or 'switch cells' layer
        """
        # Create DataLoaders
        assert need_y in ['no', 'yes'], "need_y must be 'yes' or 'no'"
        train_dl = DataLoader(train_ds,
                              batch_size=self.batch_size,
                              shuffle=True)
        test_dl = DataLoader(valid_ds, batch_size=self.batch_size)

        # Dictionary for losses
        losses = {'train_loss': [], 'valid_loss': []}

        # Plot losses if the user chooses so
        if plot_loss:
            liveloss = PlotLosses()

        # Iterate over epochs
        for epoch in range(self.max_epochs):

            # Switch to training mode
            self.model.train()

            if verbose:
                print('Starting epoch {}'.format(epoch + 1))

            # A list for batch-wise training losses in a given epoch
            epoch_loss = []

            # Iterate over batches
            for idx_batch, batch in enumerate(train_dl):
                self.optimizer.zero_grad()
                if need_y == 'yes':
                    out = self.model(batch[0]['train_obs'].permute(1, 0, 2),
                                     y=batch[1].permute(1, 0))
                    tr_loss = self.loss(out, batch[0]['train_y'].to(DEVICE))
                elif need_y == 'no':
                    out = self.model(batch['train_obs'].permute(1, 0, 2))
                    tr_loss = self.loss(out, batch['train_y'].to(DEVICE))
                epoch_loss.append(tr_loss.item())
                tr_loss.backward()
                self.optimizer.step()

            # Switch to evaluation mode
            self.model.eval()

            # Compute training loss for the epoch
            losses['train_loss'].append(sum(epoch_loss) / len(train_dl))

            # Compute validation loss by iterating through valid dl batches
            with torch.no_grad():

                # A list for batch-wise validation losses
                val_loss = []

                # Iterate over batches in the validation DataLoader
                for idx_v_batch, v_batch in enumerate(test_dl):
                    # targets are moved to DEVICE to match the model output,
                    # as in the training branch above
                    if need_y == 'yes':
                        val_loss.append(
                            self.loss(
                                self.model(v_batch[0]['test_obs'].permute(
                                    1, 0, 2),
                                           y=v_batch[1].permute(1, 0)),
                                v_batch[0]['test_y'].to(DEVICE)).item())
                    elif need_y == 'no':
                        val_loss.append(
                            self.loss(
                                self.model(v_batch['test_obs'].permute(
                                    1, 0, 2)),
                                v_batch['test_y'].to(DEVICE)).item())
                losses['valid_loss'].append(sum(val_loss) / len(test_dl))

            # Print the validation loss for the epoch
            if verbose:
                print('Validation loss: {}'.format(losses['valid_loss'][epoch]))
            # Plot loss after each epoch if the user chose to
            if plot_loss:
                logs = {
                    'log_loss': losses['train_loss'][epoch],
                    'val_log_loss': losses['valid_loss'][epoch]
                }

                liveloss.update(logs)
                liveloss.draw()

            # Early stopping
            if self.early_stopping_patience:
                lag_1 = losses['valid_loss'][(
                    epoch - self.early_stopping_patience):epoch]
                lag_2 = losses['valid_loss'][(epoch -
                                              self.early_stopping_patience -
                                              1):(epoch - 1)]
                no_drops = sum(l1 < l2 for l1, l2 in zip(lag_1, lag_2))
                if epoch > self.early_stopping_patience and no_drops == 0:
                    break

        # Save last loss
        self.final_loss = losses['valid_loss'][-1]
        self.last_epoch = epoch

        # Save model
        if save_path:
            torch.save(self.model.state_dict(), save_path)
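
The .permute(1, 0, 2) calls above reorder batches from (batch, seq_len, features) to sequence-first, the default layout for PyTorch recurrent layers. A minimal illustration:

import torch

batch = torch.randn(32, 100, 8)       # (batch, seq_len, features) as a DataLoader yields it
seq_first = batch.permute(1, 0, 2)    # (seq_len, batch, features)
rnn = torch.nn.LSTM(input_size=8, hidden_size=16)
out, _ = rnn(seq_first)               # out: (seq_len=100, batch=32, hidden=16)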
Code example #23
def train_model(model,
                train_dataset,
                validate_dataset,
                test_dataset,
                batch_size,
                test_batch_size,
                lr,
                n_epochs,
                optimizer=None,
                epoch_trained=0,
                seed=42):
    """The train function """
    set_seed(seed)
    if optimizer is None:
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=lr,
                                     weight_decay=1e-5,
                                     eps=1e-3,
                                     amsgrad=True)
    criterion_train = multiscaleUnsupervisorError
    criterion_validate = realEPE

    # Prepare data loader
    train_loader = DataLoader(train_dataset,
                              batch_size=batch_size,
                              shuffle=True,
                              num_workers=4,
                              pin_memory=True)
    validation_loader = DataLoader(validate_dataset,
                                   batch_size=test_batch_size,
                                   shuffle=False,
                                   num_workers=4,
                                   pin_memory=True)
    test_loader = DataLoader(test_dataset,
                             batch_size=test_batch_size,
                             shuffle=False,
                             num_workers=4,
                             pin_memory=True)

    liveloss = PlotLosses()
    para_dict = {}
    total_time = 0
    for epoch in range(epoch_trained, n_epochs):
        start_time = time.perf_counter()  # time.clock() was removed in Python 3.8
        print("Total epoch %d" % n_epochs)
        print("Epoch %d starts! " % epoch)
        print("Memory allocated: ",
              torch.cuda.memory_allocated() / 1024 / 1024 / 1024)

        GPUtil.showUtilization()

        logs = {}
        train_loss, train_loss_epe = train(model, optimizer, criterion_train,
                                           train_loader)
        validation_loss_epe = validate(model, criterion_validate,
                                       validation_loader)

        end_time = time.perf_counter()

        logs['multiscale loss'] = train_loss
        logs['EPE loss'] = train_loss_epe
        logs['val_EPE loss'] = validation_loss_epe
        liveloss.update(logs)
        liveloss.draw()

        total_time += end_time - start_time

        print(
            "Epoch: ", epoch, ", Avg. Train EPE Loss: %1.3f" % train_loss_epe,
            " Avg. Validation EPE Loss: %1.3f" % validation_loss_epe,
            "Time used this epoch (seconds): %1.3f" % (end_time - start_time),
            "Time remaining (hrs): %1.3f" % (total_time / (epoch + 1) *
                                             (n_epochs - epoch) / 3600))

        # Checkpoint every 5 epochs
        if (epoch + 1) % 5 == 0:
            test_loss_epe = validate(model, criterion_validate, test_loader)
            # Fill in the parameters into the dict
            para_dict['epoch'] = epoch
            para_dict['dataset size'] = len(train_loader.dataset)
            para_dict['train EPE'] = train_loss_epe
            para_dict['validation EPE'] = validation_loss_epe
            para_dict['learning rate'] = lr
            para_dict['time used(seconds)'] = total_time

            # There is no actual test loss, so use validation loss here
            para_dict['test EPE'] = test_loss_epe

            # Do the save
            save_model(model, optimizer, train_loss, para_dict,
                       "UnLiteFlowNet_checkpoint_%d_" % epoch)

    test_loss_epe = validate(model, criterion_validate, test_loader)
    print(" Avg. Test EPE Loss: %1.3f" % test_loss_epe,
          "Total time used(seconds): %1.3f" % total_time)
    print("")

    # Fill in the parameters into the dict
    para_dict = {}
    para_dict['epoch'] = n_epochs
    para_dict['dataset size'] = len(train_loader.dataset)
    para_dict['batch_size'] = batch_size
    para_dict['train EPE'] = train_loss_epe
    para_dict['validation EPE'] = validation_loss_epe
    para_dict['learning rate'] = lr
    para_dict['time used(seconds)'] = total_time
    para_dict['test EPE'] = test_loss_epe
    save_model(model, optimizer, train_loss, para_dict,
               "UnLiteFlowNet_%d_" % epoch)

    return model
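
A note on the timing above: time.clock() was removed in Python 3.8, which is why the loop uses time.perf_counter(). A minimal sketch of the same per-epoch timing pattern:

import time

start_time = time.perf_counter()
# ... one epoch of training and validation ...
elapsed = time.perf_counter() - start_time
print("Time used this epoch (seconds): %1.3f" % elapsed)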