def train_eval(fold, dataloaders, dataset_sizes, net, criterion, optimizer,
               scheduler, net_name, num_epochs):
    """
    Train and evaluate a net.
    """
    # Initialize logs
    fname = os.path.join(args.model_dir, f'train{fold}.log')
    logging_train = myutils.setup_logger(fname)
    fname = os.path.join(args.model_dir, f'lr{fold}.log')
    logging_lr = myutils.setup_logger(fname)
    # Reproducibility
    myutils.myseed(seed=42)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Load initial weights
    net = net.to(device)
    #best_net_wts = copy.deepcopy(net.state_dict())
    best_acc, epoch = 0.0, 1

    # Initialize .tar files to save settings
    fname = f'last{fold}.tar'
    last_path = os.path.join(args.model_dir, fname)
    fname = f'best{fold}.tar'
    best_path = os.path.join(args.model_dir, fname)

    # To resume training for more epochs
    if args.resume:
        try:
            # Load last settings from .tar file
            last_checkpoint = torch.load(last_path)
            net.load_state_dict(last_checkpoint['net_state_dict'])
            optimizer.load_state_dict(last_checkpoint['optimizer_state_dict'])
            # Since the last epoch was saved, start from the next one
            epoch = last_checkpoint['epoch'] + 1
            logging_process.info(
                f'Model: {args.model_dir}\tLast epoch saved: {epoch-1}, resuming training from epoch: {epoch}'
            )

            # Load best settings from .tar file
            best_checkpoint = torch.load(best_path)
            #best_net_wts = best_checkpoint['net_state_dict']
            best_acc = best_checkpoint['acc']

        except FileNotFoundError as err:
            # This happens with multiple folds: if training was interrupted on
            # fold 1, the best checkpoint for fold 2 does not exist yet.
            logging_process.info(f'Model: {args.model_dir}\tError: {err}')

    # TRAINING LOOP
    for epoch in range(epoch, num_epochs + 1):

        print(f'Epoch {epoch}/{num_epochs}')
        logging_train.info(f'Epoch {epoch}/{num_epochs}')

        # Each epoch has a training phase and a validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                net.train()  # Set net to training mode
                mylr_value = optimizer.param_groups[0]['lr']
                logging_lr.info(f'Epoch {epoch}\tlr: {mylr_value}')
            else:
                net.eval()  # Set net to evaluate mode

            # Track statistics
            running_loss = 0.0
            running_corrects = 0

            # Iterate over data
            for index, inputs, labels in tqdm(dataloaders[phase]):
                inputs = inputs.to(device)
                labels = labels.to(device)

                # Zero the parameter gradients
                optimizer.zero_grad()

                # Forward
                # Track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = net(inputs)
                    _, targets = torch.max(labels, 1)
                    _, preds = torch.max(outputs, 1)

                    #if net_name.startswith('vgg16_ft_no_soft'):
                    #    outputs = torch.reshape(outputs, (-1,)) # reshape added for binary
                    #    loss = criterion(outputs, targets.float()) # float added for binary

                    #else:
                    loss = criterion(outputs, targets)

                    # Backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                # Batch statistics
                running_loss += loss.detach().item() * inputs.size(
                    0)  # This is batch loss
                running_corrects += torch.sum(
                    preds == targets.data)  # This is batch accuracy

            # efficientnetb
            if net_name.startswith('efficientnetb'):
                if phase == 'train':
                    scheduler.step()

            # inceptionv
            if net_name.startswith('inceptionv'):
                if phase == 'train':
                    if (epoch % 2) == 0:
                        scheduler.step()

            # Epoch statistics
            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects.double() / dataset_sizes[phase]
            print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss,
                                                       epoch_acc))
            logging_train.info('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))

            if phase == 'val':

                # Save last settings to .tar file
                torch.save(
                    {
                        'epoch': epoch,
                        'net_state_dict': net.state_dict(),
                        'optimizer_state_dict': optimizer.state_dict(),
                        'loss': epoch_loss
                    }, last_path)

                if epoch_acc > best_acc:
                    best_acc = epoch_acc
                    #best_net_wts = net.state_dict()

                    # Save best settings to .tar file
                    torch.save(
                        {
                            'epoch': epoch,
                            'net_state_dict': net.state_dict(),  #best_net_wts
                            'optimizer_state_dict': optimizer.state_dict(),
                            'loss': epoch_loss,
                            'acc': best_acc
                        },
                        best_path)

                    # Save best settings to .json file
                    best_metrics = {
                        f'loss{fold}': epoch_loss,
                        f'acc{fold}': best_acc.item()
                    }
                    fname = os.path.join(args.model_dir, f'metrics{fold}.json')
                    with open(fname, 'w') as f:
                        f.write(json.dumps(best_metrics))

                # vgg
                if net_name.startswith('vgg'):
                    scheduler.step(epoch_acc)

                # resnet
                if net_name.startswith('resnet'):
                    scheduler.step(epoch_loss)

    print('Best val Acc: {:4f}'.format(best_acc))
    logging_process.info('Model: {}\tFold: {}\tBest val Acc: {:4f}'.format(
        args.model_dir, fold, best_acc))
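A minimal usage sketch for the function above. Every name here that is not defined in the listing (`params.num_classes`, `params.learning_rate`, `params.num_epochs`, the fold's `dataloaders` and `dataset_sizes`) is an assumption about the surrounding script; the dataloaders are assumed to yield `(index, inputs, one_hot_labels)`, as the training loop expects.

import torch.nn as nn
import torch.optim as optim
import torchvision

# Illustrative backbone only; net_name just has to match one of the
# scheduler branches above ('vgg', 'resnet', 'efficientnetb', 'inceptionv').
net = torchvision.models.vgg16(pretrained=True)           # older torchvision API
net.classifier[6] = nn.Linear(4096, params.num_classes)   # assumed hyperparameter

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=params.learning_rate, momentum=0.9)
# For 'vgg*' nets train_eval calls scheduler.step(epoch_acc), so a plateau
# scheduler tracking a maximised metric is a natural fit.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', patience=3)

train_eval(fold=1, dataloaders=dataloaders, dataset_sizes=dataset_sizes,
           net=net, criterion=criterion, optimizer=optimizer,
           scheduler=scheduler, net_name='vgg16', num_epochs=params.num_epochs)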
# Example 2
def train_eval(fold, dataloaders, dataset_sizes, net, criterion, optimizer,
               scheduler, net_name, num_epochs):
    """
    Train and evaluate a net.
    """
    # Initialize logs
    fname = os.path.join(args.model_dir, f'train{fold}.log')
    logging_train = myutils.setup_logger(fname)
    fname = os.path.join(args.model_dir, f'lr{fold}.log')
    logging_lr = myutils.setup_logger(fname)
    fname = os.path.join(args.model_dir, f'bins{fold}.log')
    logging_bins = myutils.setup_logger(fname)
    fname = os.path.join(args.model_dir, f'cats{fold}.log')
    logging_cats = myutils.setup_logger(fname)

    # Reproducibility
    myutils.myseed(seed=42)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Load initial weights
    net = net.to(device)
    #best_net_wts = copy.deepcopy(net.state_dict())
    best_acc, epoch = 0.0, 1

    # Initialize .tar files to save settings
    fname = f'last{fold}.tar'
    last_path = os.path.join(args.model_dir, fname)
    fname = f'best{fold}.tar'
    best_path = os.path.join(args.model_dir, fname)

    # To resume training for more epochs
    if args.resume:
        try:
            # Load last settings from .tar file
            last_checkpoint = torch.load(last_path)
            net.load_state_dict(last_checkpoint['net_state_dict'])
            optimizer.load_state_dict(last_checkpoint['optimizer_state_dict'])
            # Since the last epoch was saved, start from the next one
            epoch = last_checkpoint['epoch'] + 1
            logging_process.info(
                f'Model: {args.model_dir}\tLast epoch saved: {epoch-1}, resuming training from epoch: {epoch}'
            )

            # Load best settings from .tar file
            best_checkpoint = torch.load(best_path)
            #best_net_wts = best_checkpoint['net_state_dict']
            best_acc = best_checkpoint['acc']

        except FileNotFoundError as err:
            # This happens with multiple folds: if training was interrupted on
            # fold 1, the best checkpoint for fold 2 does not exist yet.
            logging_process.info(f'Model: {args.model_dir}\tError: {err}')

    # TRAINING LOOP
    for epoch in range(epoch, num_epochs + 1):

        print(f'Epoch {epoch}/{num_epochs}')

        # To track values in each epoch
        tloss, tacc, vloss, vacc = 0.0, 0.0, 0.0, 0.0
        tloss0, tacc0, vloss0, vacc0 = 0.0, 0.0, 0.0, 0.0
        tloss1, tacc1, vloss1, vacc1 = 0.0, 0.0, 0.0, 0.0

        # Each epoch has a training phase and a validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                net.train()  # Set net to training mode

                # Track learning rate for plot
                mylr_value = optimizer.param_groups[0]['lr']
                logging_lr.info(f'Epoch {epoch}\tlr: {mylr_value}')

            else:
                net.eval()  # Set net to evaluate mode

            # Track statistics
            running_loss0 = 0.0
            running_loss1 = 0.0

            running_corrects0 = 0
            running_corrects1 = 0

            # Iterate over data
            for index, inputs, bins_labels, cats_labels in tqdm(
                    dataloaders[phase]):
                inputs = inputs.to(device)
                bins_labels = bins_labels.to(device)
                cats_labels = cats_labels.to(device)

                # Zero the parameter gradients
                optimizer.zero_grad()

                # Forward
                # Track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    outputs_bins, outputs_cats = net(inputs)
                    #outputs_bins = torch.reshape(outputs_bins, (-1,)) # reshape added for binary
                    outputs_bins = outputs_bins.to(device)
                    outputs_cats = outputs_cats.to(device)

                    #loss0 = criterion[0](outputs_bins, bins_labels.float())# float added for binary
                    loss0 = criterion[0](outputs_bins, bins_labels)
                    loss1 = criterion[1](outputs_cats, cats_labels)
                    # Scale the two losses 2:305 (presumably the bins vs. cats
                    # class counts, 2 + 305 = 307)
                    loss0 = loss0 * (2 / 307)
                    loss1 = loss1 * (305 / 307)

                    #loss0 = loss0 * (2/306)
                    #loss1 = loss1 * (304/306)

                    # Backward + optimize only if in training phase
                    if phase == 'train':
                        loss = (loss0 + loss1) / 2
                        loss.backward()
                        optimizer.step()

                # Batch statistics
                running_loss0 += loss0.detach().item() * inputs.size(0)
                running_loss1 += loss1.detach().item() * inputs.size(0)

                #running_corrects0 += torch.sum(torch.round(outputs_bins) == bins_labels.data)
                running_corrects0 += torch.sum(
                    torch.max(outputs_bins, 1)[1] == bins_labels.data)
                running_corrects1 += torch.sum(
                    torch.max(outputs_cats, 1)[1] == cats_labels.data)

            # efficientnetb
            #if net_name.startswith('efficientnetb'):
            #    if phase == 'train':
            #        scheduler.step()

            # inceptionv
            #if net_name.startswith('inceptionv'):
            #    if phase == 'train':
            #        if (epoch % 2) == 0:
            #            scheduler.step()

            # Epoch statistics
            epoch_loss0 = running_loss0 / dataset_sizes[phase]
            epoch_loss1 = running_loss1 / dataset_sizes[phase]

            epoch_loss = epoch_loss0 + epoch_loss1

            epoch_acc0 = (running_corrects0.double() /
                          dataset_sizes[phase]) * (2 / 307)
            epoch_acc1 = (running_corrects1.double() /
                          dataset_sizes[phase]) * (305 / 307)

            epoch_acc = (epoch_acc0 + epoch_acc1) / 2

            #print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))
            print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss,
                                                       epoch_acc))
            #logging_train.info('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))

            print('{} bin_loss: {:.4f} bin_acc: {:.4f}'.format(
                phase, epoch_loss0, epoch_acc0))
            #logging_bins.info('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss0, epoch_acc0))

            print('{} cat_loss: {:.4f} cat_acc: {:.4f}'.format(
                phase, epoch_loss1, epoch_acc1))
            #logging_cats.info('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss1, epoch_acc1))
            if phase == 'train':
                tloss = epoch_loss
                tloss0 = epoch_loss0
                tloss1 = epoch_loss1

                tacc = epoch_acc
                tacc0 = epoch_acc0
                tacc1 = epoch_acc1

            if phase == 'val':
                vloss = epoch_loss
                vloss0 = epoch_loss0
                vloss1 = epoch_loss1

                vacc = epoch_acc
                vacc0 = epoch_acc0
                vacc1 = epoch_acc1

                logging_train.info(
                    'Epoch: {}\ttloss: {:.4f}\ttacc: {:.4f}\tvloss: {:.4f}\tvacc: {:.4f}'
                    .format(epoch, tloss, tacc, vloss, vacc))
                logging_bins.info(
                    'Epoch: {}\ttloss: {:.4f}\ttacc: {:.4f}\tvloss: {:.4f}\tvacc: {:.4f}'
                    .format(epoch, tloss0, tacc0, vloss0, vacc0))
                logging_cats.info(
                    'Epoch: {}\ttloss: {:.4f}\ttacc: {:.4f}\tvloss: {:.4f}\tvacc: {:.4f}'
                    .format(epoch, tloss1, tacc1, vloss1, vacc1))

                # Save last settings to .tar file
                torch.save(
                    {
                        'epoch': epoch,
                        'net_state_dict': net.state_dict(),
                        'optimizer_state_dict': optimizer.state_dict(),
                        'loss': epoch_loss
                    }, last_path)

                if epoch_acc > best_acc:
                    best_acc = epoch_acc
                    #best_net_wts = net.state_dict()

                    # Save best settings to .tar file
                    torch.save(
                        {
                            'epoch': epoch,
                            'net_state_dict': net.state_dict(),  #best_net_wts
                            'optimizer_state_dict': optimizer.state_dict(),
                            'loss': epoch_loss,
                            'acc': best_acc
                        },
                        best_path)

                    # Save best settings to .json file
                    best_metrics = {
                        f'loss{fold}': epoch_loss,
                        f'acc{fold}': best_acc.item()
                    }
                    fname = os.path.join(args.model_dir, f'metrics{fold}.json')
                    with open(fname, 'w') as f:
                        f.write(json.dumps(best_metrics))

                # vgg
                if net_name.startswith('vgg'):
                    scheduler.step(epoch_acc)

                # resnet
                #if net_name.startswith('resnet'):
                #    scheduler.step(epoch_loss)

    print('Best val Acc: {:4f}'.format(best_acc))
    logging_process.info('Model: {}\tFold: {}\tBest val Acc: {:4f}'.format(
        args.model_dir, fold, best_acc))
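The multi-task variant above expects a network that returns a `(bins, cats)` pair of logits and a pair of criteria indexed as `criterion[0]` / `criterion[1]`. The real architecture is not part of this listing; the sketch below is only one plausible shape for it, assuming 2 binary-group classes and 305 categories (matching the 2/307 and 305/307 weights).

import torch.nn as nn
import torchvision


class TwoHeadVGG(nn.Module):
    """Shared VGG16 features with separate 'bins' and 'cats' heads (illustrative only)."""

    def __init__(self, n_bins=2, n_cats=305):
        super().__init__()
        backbone = torchvision.models.vgg16(pretrained=True)
        self.features = backbone.features
        self.avgpool = backbone.avgpool
        self.flatten = nn.Flatten()
        self.head_bins = nn.Linear(512 * 7 * 7, n_bins)
        self.head_cats = nn.Linear(512 * 7 * 7, n_cats)

    def forward(self, x):
        x = self.flatten(self.avgpool(self.features(x)))
        return self.head_bins(x), self.head_cats(x)


# Both heads are trained against class-index targets in the loop above,
# so a pair of cross-entropy losses fits.
criterion = (nn.CrossEntropyLoss(), nn.CrossEntropyLoss())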
        args.data_dir), "Could not find the dataset at {}".format(
            args.data_dir)
    assert os.path.isdir(
        args.model_dir), "Could not find the model at {}".format(
            args.model_dir)
    assert os.path.isdir(
        args.net_dir), "Could not find the network at {}".format(args.net_dir)

    # Initialize main log folder
    logs_dir_path = os.path.join(os.getcwd(), 'Logs')
    if not os.path.exists(logs_dir_path):
        os.mkdir(logs_dir_path)

    # Initialize main log file
    log_file = os.path.join(logs_dir_path, 'process.log')
    logging_process = myutils.setup_logger(log_file, date=True)

    # Save commandline settings to log
    script_activated = ' '.join(sys.argv)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    logging_process.info(f'Script: {script_activated}, device: {device}')

    # Get the experiment parameters
    params_file = os.path.join(args.model_dir, 'params.json')
    assert os.path.isfile(
        params_file), "No json configuration file found at {}".format(
            params_file)
    params = myutils.Params(params_file)

    # FOLD LOOP
    dfs = {}
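The `myutils` helpers used throughout these listings (`setup_logger`, `myseed`, `Params`) are never shown. The stand-ins below are only a guess at their minimal behaviour, inferred from how they are called above.

import json
import logging
import random

import numpy as np
import torch


class Params:
    """Loads hyperparameters from a JSON file into attributes (assumed behaviour)."""

    def __init__(self, json_path):
        with open(json_path) as f:
            self.__dict__.update(json.load(f))


def setup_logger(log_file, date=False):
    """Returns a logger that writes to log_file, optionally with timestamps."""
    logger = logging.getLogger(log_file)
    logger.setLevel(logging.INFO)
    if not logger.handlers:  # avoid duplicate handlers on repeated calls
        fmt = '%(asctime)s %(message)s' if date else '%(message)s'
        handler = logging.FileHandler(log_file)
        handler.setFormatter(logging.Formatter(fmt))
        logger.addHandler(handler)
    return logger


def myseed(seed=42):
    """Seeds the Python, NumPy and PyTorch RNGs for reproducibility."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)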
# Example 4
def train_eval(fold, dataloaders, dataset_sizes, net, criterion, optimizer,
               scheduler, num_epochs):
    """
    Train and evaluate a net.
    """
    # Initialize logs
    fname = os.path.join(args.model_dir, f'train{fold}.log')
    logging_train = myutils.setup_logger(fname)

    # Reproducibility
    myutils.myseed(seed=42)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # Load initial weights
    net = net.to(device)
    best_net_wts = copy.deepcopy(net.state_dict())
    best_acc, epoch = 0.0, 1

    # Initialize .tar files to save settings
    fname = f'last{fold}.tar'
    last_path = os.path.join(args.model_dir, fname)
    fname = f'best{fold}.tar'
    best_path = os.path.join(args.model_dir, fname)

    # To resume training for more epochs
    if args.resume:
        try:
            # Load last settings from .tar file
            last_checkpoint = torch.load(last_path)
            net.load_state_dict(last_checkpoint['net_state_dict'])
            optimizer.load_state_dict(last_checkpoint['optimizer_state_dict'])
            # Since the last epoch was saved, start from the next one
            epoch = last_checkpoint['epoch'] + 1
            logging_process.info(
                f'Model: {args.model_dir}\tLast epoch saved: {epoch-1}, resuming training from epoch: {epoch}'
            )

            # Load best settings from .tar file
            best_checkpoint = torch.load(best_path)
            best_net_wts = best_checkpoint['net_state_dict']
            best_acc = best_checkpoint['acc']

        except FileNotFoundError as err:
            # This happens with multiple folds: if training was interrupted on
            # fold 1, the best checkpoint for fold 2 does not exist yet.
            logging_process.info(f'Model: {args.model_dir}\tError: {err}')

    # Initialize early stop settings
    best_val_loss, epochs_no_improve, patience = np.inf, 0, 5

    # TRAINING LOOP
    for epoch in range(epoch, num_epochs + 1):

        # Early stop
        if epochs_no_improve == patience:
            print('Early stop')
            logging_process.info(
                f'Model: {args.model_dir}\tFold:{fold}\tEarly stop: {epoch}')
            break

        print(f'Epoch {epoch}/{num_epochs}')
        logging_train.info(f'Epoch {epoch}/{num_epochs}')

        # Each epoch has a training phase and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                net.train()  # Set net to training mode
            else:
                net.eval()  # Set net to evaluate mode

            # Track statistics
            running_loss = 0.0
            running_corrects = 0

            # Iterate over data
            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)

                # Zero the parameter gradients
                optimizer.zero_grad()

                # Forward
                # Track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = net(inputs)
                    probs, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)

                    # Backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                # Batch statistics
                running_loss += loss.item() * inputs.size(
                    0)  # This is batch loss
                running_corrects += torch.sum(
                    preds == labels.data)  # This is batch accuracy

            if phase == 'train':
                scheduler.step()

            # Epoch statistics
            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects.double() / dataset_sizes[phase]
            print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss,
                                                       epoch_acc))
            logging_train.info('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))

            if phase == 'val':
                # Best loss tracking for early stop
                if epoch_loss < best_val_loss:
                    best_val_loss = epoch_loss
                    epochs_no_improve = 0
                else:
                    epochs_no_improve += 1

                # Save last settings to .tar file
                torch.save(
                    {
                        'epoch': epoch,
                        'net_state_dict': net.state_dict(),
                        'optimizer_state_dict': optimizer.state_dict(),
                        'loss': epoch_loss
                    }, last_path)

                if epoch_acc > best_acc:
                    best_acc = epoch_acc
                    best_net_wts = net.state_dict()

                    # Save best settings to .tar file
                    torch.save(
                        {
                            'epoch': epoch,
                            'net_state_dict': best_net_wts,
                            'optimizer_state_dict': optimizer.state_dict(),
                            'loss': epoch_loss,
                            'acc': best_acc
                        }, best_path)

                    # Save best settings to .json file
                    best_metrics = {
                        f'loss{fold}': epoch_loss,
                        f'acc{fold}': best_acc.item()
                    }
                    fname = os.path.join(args.model_dir, f'metrics{fold}.json')
                    with open(fname, 'w') as f:
                        f.write(json.dumps(best_metrics))

    print('Best val Acc: {:4f}'.format(best_acc))
    logging_process.info('Model: {}\tFold: {}\tBest val Acc: {:4f}'.format(
        args.model_dir, fold, best_acc))
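This last variant folds the early-stop bookkeeping (best_val_loss, epochs_no_improve, patience) straight into the training loop. Pulled out into a small helper, the same logic might look like the sketch below; the class and its names are hypothetical, not part of the original code.

class EarlyStopper:
    """Signals a stop after `patience` consecutive epochs without val-loss improvement."""

    def __init__(self, patience=5):
        self.patience = patience
        self.best_val_loss = float('inf')
        self.epochs_no_improve = 0

    def step(self, val_loss):
        """Update with the latest validation loss; returns True when training should stop."""
        if val_loss < self.best_val_loss:
            self.best_val_loss = val_loss
            self.epochs_no_improve = 0
        else:
            self.epochs_no_improve += 1
        return self.epochs_no_improve >= self.patience


# Usage inside the loop above would be roughly:
#   stopper = EarlyStopper(patience=5)
#   ...
#   if phase == 'val' and stopper.step(epoch_loss):
#       break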