# Standard-library and PyTorch imports used by the examples below. The
# project-specific helpers referenced throughout (create_metrics_dict,
# tsv_line, flatten_labels, flatten_outputs, report_classification,
# InformationLogger, save_checkpoint, load_from_checkpoint, CreateDataset,
# aug, net, ...) come from the repository's own modules, which are not shown
# on this page.
import datetime
import os
import time
from pathlib import Path

import boto3
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
from torch.utils.data import DataLoader
from torchvision import transforms


def train(train_loader, model, criterion, optimizer, scheduler, num_classes,
          batch_size, task, ep_idx, progress_log, num_devices):
    """
    Train the model and return the metrics of the training epoch
    :param train_loader: training data loader
    :param model: model to train
    :param criterion: loss criterion
    :param optimizer: optimizer to use
    :param scheduler: learning rate scheduler
    :param num_classes: number of classes
    :param batch_size: number of samples to process simultaneously
    :param task: segmentation or classification
    :param ep_idx: epoch index (for hypertrainer log)
    :param progress_log: progress log file (for hypertrainer log)
    :param num_devices: (int) number of GPU devices to use.
    :return: (dict) updated training metrics for the epoch
    """
    model.train()
    train_metrics = create_metrics_dict(num_classes)

    for index, data in enumerate(train_loader):
        progress_log.open('a', buffering=1).write(
            tsv_line(ep_idx, 'trn', index, len(train_loader), time.time()))

        if task == 'classification':
            inputs, labels = data
            if torch.cuda.is_available():
                inputs = inputs.cuda()
                labels = labels.cuda()
            optimizer.zero_grad()
            outputs = model(inputs)
            outputs_flatten = outputs
        elif task == 'segmentation':
            if num_devices > 0:
                inputs = data['sat_img'].cuda()
                labels = flatten_labels(data['map_img']).cuda()
            else:
                inputs = data['sat_img']
                labels = flatten_labels(data['map_img'])
            # forward
            optimizer.zero_grad()
            outputs = model(inputs)
            outputs_flatten = flatten_outputs(outputs, num_classes)

        del outputs
        del inputs
        loss = criterion(outputs_flatten, labels)
        train_metrics['loss'].update(loss.item(), batch_size)

        loss.backward()
        optimizer.step()

    scheduler.step()
    print('Training Loss: {:.4f}'.format(train_metrics['loss'].avg))
    return train_metrics
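
# NOTE: create_metrics_dict() is a project helper that is not shown on this
# page. The loops above only rely on each entry exposing .update(value, n)
# and .avg, so a minimal stand-in consistent with that usage could look like
# the sketch below (an illustration, not the repository's actual code;
# num_classes is accepted but unused here).
class AverageMeter:
    """Keep a running average, e.g. of the loss over an epoch."""

    def __init__(self):
        self.sum = 0.0
        self.count = 0
        self.avg = 0.0

    def update(self, value, n=1):
        self.sum += value * n
        self.count += n
        self.avg = self.sum / self.count


def create_metrics_dict(num_classes):
    # 'loss' is updated on every batch; the remaining meters are filled in by
    # report_classification() when batch metrics are requested.
    return {name: AverageMeter()
            for name in ('loss', 'precision', 'recall', 'fscore', 'iou')}
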
# Example 2

def train(train_loader, model, criterion, optimizer, scheduler, num_classes,
          batch_size, classifier, ep_idx, progress_log):
    """ Train the model and return the metrics of the training phase.
    Args:
        train_loader: training data loader
        model: model to train
        criterion: loss criterion
        optimizer: optimizer to use
        scheduler: learning rate scheduler
        num_classes: number of classes
        batch_size: number of samples to process simultaneously
        classifier: True if doing a classification task, False if doing semantic segmentation
        ep_idx: epoch idx (for hypertrainer log)
        progress_log: progress log file (for hypertrainer log)
    """
    model.train()
    # note: stepping the scheduler before the optimizer is the pre-1.1 PyTorch
    # convention; newer versions expect scheduler.step() after optimizer.step()
    # (see the first example, which steps it at the end of the epoch).
    scheduler.step()
    train_metrics = create_metrics_dict(num_classes)

    for index, data in enumerate(train_loader):
        progress_log.open('a', buffering=1).write(
            tsv_line(ep_idx, 'trn', index, len(train_loader), time.time()))

        if classifier:
            inputs, labels = data
            if torch.cuda.is_available():
                inputs = inputs.cuda()
                labels = labels.cuda()
            optimizer.zero_grad()
            outputs = model(inputs)
            outputs_flatten = outputs
        else:
            if torch.cuda.is_available():
                inputs = data['sat_img'].cuda()
                labels = flatten_labels(data['map_img']).cuda()
            else:
                inputs = data['sat_img']
                labels = flatten_labels(data['map_img'])
            # forward
            optimizer.zero_grad()
            outputs = model(inputs)
            outputs_flatten = flatten_outputs(outputs, num_classes)

        del outputs
        del inputs
        loss = criterion(outputs_flatten, labels)
        train_metrics['loss'].update(loss.item(), batch_size)

        loss.backward()
        optimizer.step()

    print('Training Loss: {:.4f}'.format(train_metrics['loss'].avg))
    return train_metrics
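
# NOTE: flatten_labels() and flatten_outputs() are project helpers that are
# not shown on this page. For nn.CrossEntropyLoss the segmentation branch
# needs labels of shape [N] and logits of shape [N, num_classes]; a minimal
# sketch consistent with that usage (an assumption, not necessarily the
# repository's actual code):
def flatten_labels(annotations):
    """Collapse a [batch, H, W] label map into a 1-D tensor of class indices."""
    return annotations.view(-1)


def flatten_outputs(predictions, number_of_classes):
    """Reorder [batch, C, H, W] logits into [batch*H*W, C] for CrossEntropyLoss."""
    logits_permuted = predictions.permute(0, 2, 3, 1)
    return logits_permuted.contiguous().view(-1, number_of_classes)
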
# Example 3

def validation(valid_loader,
               model,
               criterion,
               num_classes,
               batch_size,
               classifier,
               ep_idx,
               progress_log,
               batch_metrics=None):
    """Args:
        valid_loader: validation data loader
        model: model to validate
        criterion: loss criterion
        num_classes: number of classes
        batch_size: number of samples to process simultaneously
        classifier: True if doing a classification task, False if doing semantic segmentation
        ep_idx: epoch idx (for hypertrainer log)
        progress_log: progress log file (for hypertrainer log)
        batch_metrics: (int) compute metrics every n batches; if None, metrics are not computed.
    """

    valid_metrics = create_metrics_dict(num_classes)
    model.eval()

    for index, data in enumerate(valid_loader):
        progress_log.open('a', buffering=1).write(
            tsv_line(ep_idx, 'val', index, len(valid_loader), time.time()))

        with torch.no_grad():
            if classifier:
                inputs, labels = data
                if torch.cuda.is_available():
                    inputs = inputs.cuda()
                    labels = labels.cuda()

                outputs = model(inputs)
                outputs_flatten = outputs
            else:
                if torch.cuda.is_available():
                    inputs = data['sat_img'].cuda()
                    labels = flatten_labels(data['map_img']).cuda()
                else:
                    inputs = data['sat_img']
                    labels = flatten_labels(data['map_img'])

                outputs = model(inputs)
                outputs_flatten = flatten_outputs(outputs, num_classes)

            loss = criterion(outputs_flatten, labels)
            valid_metrics['loss'].update(loss.item(), batch_size)

            # Compute metrics every `batch_metrics` batches. Time consuming.
            if batch_metrics is not None:
                if index % batch_metrics == 0:
                    _, segmentation = torch.max(outputs_flatten, dim=1)
                    valid_metrics = report_classification(
                        segmentation, labels, batch_size, valid_metrics)

    print('Validation Loss: {:.4f}'.format(valid_metrics['loss'].avg))
    if batch_metrics is not None:
        print('Validation precision: {:.4f}'.format(
            valid_metrics['precision'].avg))
        print('Validation recall: {:.4f}'.format(valid_metrics['recall'].avg))
        print('Validation f1-score: {:.4f}'.format(
            valid_metrics['fscore'].avg))

    return valid_metrics
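
# NOTE: report_classification() is not shown on this page either. The loops
# above hand it the predicted class per pixel/sample, the labels, the batch
# size and the metrics dict, and expect the 'precision', 'recall' and 'fscore'
# meters to be updated. A sketch built on scikit-learn (illustration only;
# the repository may compute these metrics differently):
from sklearn.metrics import precision_recall_fscore_support


def report_classification(predictions, labels, batch_size, metrics_dict):
    """Update precision/recall/f-score meters from one batch of predictions."""
    preds = predictions.cpu().numpy().ravel()
    targets = labels.cpu().numpy().ravel()
    precision, recall, fscore, _ = precision_recall_fscore_support(
        targets, preds, average='weighted', zero_division=0)
    metrics_dict['precision'].update(precision, batch_size)
    metrics_dict['recall'].update(recall, batch_size)
    metrics_dict['fscore'].update(fscore, batch_size)
    return metrics_dict
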
# Example 4

def main(bucket_name, data_path, output_path, num_trn_samples, num_val_samples,
         pretrained, batch_size, num_epochs, learning_rate, weight_decay,
         step_size, gamma, num_classes, class_weights, batch_metrics, model,
         classifier, model_name):
    """Function to train and validate a models for semantic segmentation.
    Args:
        bucket_name: bucket in which data is stored if using AWS S3
        data_path: full file path of the folder containing h5py files
        output_path: full file path in which the model will be saved
        num_trn_samples: number of training samples
        num_val_samples: number of validation samples
        pretrained: boolean indicating whether the model is pretrained
        batch_size: number of samples to process simultaneously
        num_epochs: number of epochs
        learning_rate: learning rate
        weight_decay: weight decay
        step_size: step size
        gamma: multiplicative factor of learning rate decay
        num_classes: number of classes
        class_weights: weights to apply to each class. A value > 1.0 gives that class more weight during learning
        batch_metrics: (int) compute metrics every n batches; if None, metrics are not computed.
        model: CNN model (nn.Module)
        classifier: True if doing image classification, False if doing semantic segmentation.
        model_name: name of the model used for training.
    Returns:
        Files 'checkpoint.pth.tar' and 'last_epoch.pth.tar' containing the trained weights
    """
    if bucket_name:
        if output_path is None:
            bucket_output_path = None
        else:
            bucket_output_path = output_path
        output_path = 'output_path'
        try:
            os.mkdir(output_path)
        except FileExistsError:
            pass
        s3 = boto3.resource('s3')
        bucket = s3.Bucket(bucket_name)
        if classifier:
            for i in ['trn', 'val']:
                get_s3_classification_images(i, bucket, bucket_name, data_path,
                                             output_path, num_classes)
                class_file = os.path.join(output_path, 'classes.csv')
                if bucket_output_path:
                    bucket.upload_file(
                        class_file,
                        os.path.join(bucket_output_path, 'classes.csv'))
                else:
                    bucket.upload_file(class_file, 'classes.csv')
            data_path = 'Images'
        else:
            if data_path:
                bucket.download_file(
                    os.path.join(data_path, 'samples/trn_samples.hdf5'),
                    'samples/trn_samples.hdf5')
                bucket.download_file(
                    os.path.join(data_path, 'samples/val_samples.hdf5'),
                    'samples/val_samples.hdf5')
            else:
                bucket.download_file('samples/trn_samples.hdf5',
                                     'samples/trn_samples.hdf5')
                bucket.download_file('samples/val_samples.hdf5',
                                     'samples/val_samples.hdf5')
            verify_sample_count(num_trn_samples, num_val_samples, data_path,
                                bucket_name)
    elif classifier:
        get_local_classes(num_classes, data_path, output_path)
    else:
        verify_sample_count(num_trn_samples, num_val_samples, data_path,
                            bucket_name)
    verify_weights(num_classes, class_weights)

    since = time.time()
    best_loss = 999

    progress_log = Path(output_path) / 'progress.log'
    if not progress_log.exists():
        # Add header
        progress_log.open('w', buffering=1).write(
            tsv_line('ep_idx', 'phase', 'iter', 'i_p_ep', 'time'))

    trn_log = InformationLogger(output_path, 'trn')
    val_log = InformationLogger(output_path, 'val')

    if torch.cuda.is_available():
        model = model.cuda()
        if class_weights:
            criterion = nn.CrossEntropyLoss(
                weight=torch.tensor(class_weights)).cuda()
        else:
            criterion = nn.CrossEntropyLoss().cuda()
    else:
        if class_weights:
            criterion = nn.CrossEntropyLoss(weight=torch.tensor(class_weights))
        else:
            criterion = nn.CrossEntropyLoss()

    optimizer = optim.Adam(model.parameters(),
                           lr=learning_rate,
                           weight_decay=weight_decay)
    lr_scheduler = optim.lr_scheduler.StepLR(
        optimizer, step_size=step_size, gamma=gamma)  # learning rate decay

    if pretrained != '':
        model, optimizer = load_from_checkpoint(pretrained, model, optimizer)

    if classifier:
        trn_dataset = torchvision.datasets.ImageFolder(
            os.path.join(data_path, "trn"),
            transform=transforms.Compose([
                transforms.RandomRotation((0, 275)),
                transforms.RandomHorizontalFlip(),
                transforms.Resize(299),
                transforms.ToTensor()
            ]),
            loader=loader)
        val_dataset = torchvision.datasets.ImageFolder(
            os.path.join(data_path, "val"),
            transform=transforms.Compose(
                [transforms.Resize(299),
                 transforms.ToTensor()]),
            loader=loader)
    else:
        if not bucket_name:
            trn_dataset = CreateDataset.SegmentationDataset(
                os.path.join(data_path, "samples"),
                num_trn_samples,
                "trn",
                transform=transforms.Compose([
                    aug.RandomRotationTarget(),
                    aug.HorizontalFlip(),
                    aug.ToTensorTarget()
                ]))
            val_dataset = CreateDataset.SegmentationDataset(
                os.path.join(data_path, "samples"),
                num_val_samples,
                "val",
                transform=transforms.Compose([aug.ToTensorTarget()]))
        else:
            trn_dataset = CreateDataset.SegmentationDataset(
                'samples',
                num_trn_samples,
                "trn",
                transform=transforms.Compose([
                    aug.RandomRotationTarget(),
                    aug.HorizontalFlip(),
                    aug.ToTensorTarget()
                ]))
            val_dataset = CreateDataset.SegmentationDataset(
                "samples",
                num_val_samples,
                "val",
                transform=transforms.Compose([aug.ToTensorTarget()]))

    # Shuffle must be set to True so samples are drawn in a random order each epoch.
    trn_dataloader = DataLoader(trn_dataset,
                                batch_size=batch_size,
                                num_workers=4,
                                shuffle=True)
    val_dataloader = DataLoader(val_dataset,
                                batch_size=batch_size,
                                num_workers=4,
                                shuffle=True)

    now = datetime.datetime.now().strftime("%Y-%m-%d_%I-%M ")
    for epoch in range(0, num_epochs):
        print()
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 20)

        trn_report = train(trn_dataloader, model, criterion, optimizer,
                           lr_scheduler, num_classes, batch_size, classifier,
                           epoch, progress_log)
        trn_log.add_values(trn_report,
                           epoch,
                           ignore=['precision', 'recall', 'fscore', 'iou'])

        val_report = validation(val_dataloader, model, criterion, num_classes,
                                batch_size, classifier, epoch, progress_log,
                                batch_metrics)
        val_loss = val_report['loss'].avg
        if batch_metrics is not None:
            val_log.add_values(val_report, epoch)
        else:
            val_log.add_values(val_report,
                               epoch,
                               ignore=['precision', 'recall', 'fscore', 'iou'])

        if val_loss < best_loss:
            print("save checkpoint")
            filename = os.path.join(output_path, 'checkpoint.pth.tar')
            best_loss = val_loss
            save_checkpoint(
                {
                    'epoch': epoch,
                    'arch': model_name,
                    'model': model.state_dict(),
                    'best_loss': best_loss,
                    'optimizer': optimizer.state_dict()
                }, filename)

            if bucket_name:
                if bucket_output_path:
                    bucket_filename = os.path.join(bucket_output_path,
                                                   'checkpoint.pth.tar')
                else:
                    bucket_filename = 'checkpoint.pth.tar'
                bucket.upload_file(filename, bucket_filename)

        if bucket_name:
            save_logs_to_bucket(bucket, bucket_output_path, output_path, now,
                                batch_metrics)

        cur_elapsed = time.time() - since
        print('Current elapsed time {:.0f}m {:.0f}s'.format(
            cur_elapsed // 60, cur_elapsed % 60))

    filename = os.path.join(output_path, 'last_epoch.pth.tar')
    save_checkpoint(
        {
            'epoch': epoch,
            'arch': model_name,
            'model': model.state_dict(),
            'best_loss': best_loss,
            'optimizer': optimizer.state_dict()
        }, filename)

    if bucket_name:
        if bucket_output_path:
            bucket_filename = os.path.join(bucket_output_path,
                                           'last_epoch.pth.tar')
            bucket.upload_file(
                "output.txt",
                os.path.join(bucket_output_path, f"Logs/{now}_output.txt"))
        else:
            bucket_filename = 'last_epoch.pth.tar'
            bucket.upload_file("output.txt", f"Logs/{now}_output.txt")
        bucket.upload_file(filename, bucket_filename)

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
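
# NOTE: tsv_line() is used throughout to append rows to progress.log. A
# one-line sketch matching that usage (an assumption; the real helper may
# format values differently):
def tsv_line(*args):
    """Join the given values with tabs and terminate the row with a newline."""
    return '\t'.join(map(str, args)) + '\n'
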
def evaluation(eval_loader,
               model,
               criterion,
               num_classes,
               batch_size,
               task,
               ep_idx,
               progress_log,
               batch_metrics=None,
               dataset='val',
               num_devices=0):
    """
    Evaluate the model and return the updated metrics
    :param eval_loader: data loader
    :param model: model to evaluate
    :param criterion: loss criterion
    :param num_classes: number of classes
    :param batch_size: number of samples to process simultaneously
    :param task: segmentation or classification
    :param ep_idx: epoch index (for hypertrainer log)
    :param progress_log: progress log file (for hypertrainer log)
    :param batch_metrics: (int) compute metrics every n batches; if None, metrics are not computed.
    :param dataset: (str) 'val' or 'tst'
    :param num_devices: (int) Number of GPU devices to use.
    :return: (dict) eval_metrics
    """
    eval_metrics = create_metrics_dict(num_classes)
    model.eval()

    for index, data in enumerate(eval_loader):
        progress_log.open('a', buffering=1).write(
            tsv_line(ep_idx, dataset, index, len(eval_loader), time.time()))

        with torch.no_grad():
            if task == 'classification':
                inputs, labels = data
                if torch.cuda.is_available():
                    inputs = inputs.cuda()
                    labels = labels.cuda()

                outputs = model(inputs)
                outputs_flatten = outputs
            elif task == 'segmentation':
                if num_devices > 0:
                    inputs = data['sat_img'].cuda()
                    labels = flatten_labels(data['map_img']).cuda()
                else:
                    inputs = data['sat_img']
                    labels = flatten_labels(data['map_img'])

                outputs = model(inputs)
                outputs_flatten = flatten_outputs(outputs, num_classes)

            loss = criterion(outputs_flatten, labels)
            eval_metrics['loss'].update(loss.item(), batch_size)

            if (dataset == 'val') and (batch_metrics is not None):
                # Compute metrics every n batches. Time consuming.
                if index % batch_metrics == 0:
                    _, segmentation = torch.max(outputs_flatten, dim=1)
                    eval_metrics = report_classification(
                        segmentation, labels, batch_size, eval_metrics)
            elif dataset == 'tst':
                _, segmentation = torch.max(outputs_flatten, dim=1)
                eval_metrics = report_classification(segmentation, labels,
                                                     batch_size, eval_metrics)

    print(f"{dataset} Loss: {eval_metrics['loss'].avg}")
    if batch_metrics is not None:
        print(f"{dataset} precision: {eval_metrics['precision'].avg}")
        print(f"{dataset} recall: {eval_metrics['recall'].avg}")
        print(f"{dataset} fscore: {eval_metrics['fscore'].avg}")

    return eval_metrics
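
# NOTE: save_checkpoint() / load_from_checkpoint() are project helpers. In the
# code above load_from_checkpoint() is called both as (path, model, optimizer)
# -> (model, optimizer) and as (path, model) -> model, and checkpoints are
# written with the keys 'epoch', 'arch', 'model', 'best_loss', 'optimizer'.
# A minimal sketch compatible with both call sites (illustration only):
def load_from_checkpoint(filename, model, optimizer=None):
    """Restore model (and optionally optimizer) state from a saved checkpoint."""
    checkpoint = torch.load(filename, map_location='cpu')
    model.load_state_dict(checkpoint['model'])
    if optimizer is not None:
        optimizer.load_state_dict(checkpoint['optimizer'])
        return model, optimizer
    return model
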
def main(params):
    """
    Function to train and validate a model for semantic segmentation or classification.
    :param params: (dict) Parameters found in the yaml config file.

    """
    model, state_dict_path, model_name = net(params)
    bucket_name = params['global']['bucket_name']
    output_path = params['training']['output_path']
    data_path = params['global']['data_path']
    task = params['global']['task']
    num_classes = params['global']['num_classes']
    batch_size = params['training']['batch_size']

    if bucket_name:
        bucket, bucket_output_path, output_path, data_path = download_s3_files(
            bucket_name=bucket_name,
            data_path=data_path,
            output_path=output_path,
            num_classes=num_classes,
            task=task)

    elif not bucket_name and task == 'classification':
        get_local_classes(num_classes, data_path, output_path)

    since = time.time()
    best_loss = 999

    progress_log = Path(output_path) / 'progress.log'
    if not progress_log.exists():
        # Add header
        progress_log.open('w', buffering=1).write(
            tsv_line('ep_idx', 'phase', 'iter', 'i_p_ep', 'time'))

    trn_log = InformationLogger(output_path, 'trn')
    val_log = InformationLogger(output_path, 'val')
    tst_log = InformationLogger(output_path, 'tst')

    model, criterion, optimizer, lr_scheduler, num_devices = set_hyperparameters(
        params, model, state_dict_path)

    num_samples = get_num_samples(data_path=data_path, params=params)
    print(f"Number of samples : {num_samples}")
    trn_dataloader, val_dataloader, tst_dataloader = create_dataloader(
        data_path=data_path,
        num_samples=num_samples,
        batch_size=batch_size,
        task=task)

    now = datetime.datetime.now().strftime("%Y-%m-%d_%I-%M ")
    filename = os.path.join(output_path, 'checkpoint.pth.tar')

    for epoch in range(0, params['training']['num_epochs']):
        print()
        print('Epoch {}/{}'.format(epoch,
                                   params['training']['num_epochs'] - 1))
        print('-' * 20)

        trn_report = train(train_loader=trn_dataloader,
                           model=model,
                           criterion=criterion,
                           optimizer=optimizer,
                           scheduler=lr_scheduler,
                           num_classes=num_classes,
                           batch_size=batch_size,
                           task=task,
                           ep_idx=epoch,
                           progress_log=progress_log,
                           num_devices=num_devices)
        trn_log.add_values(trn_report,
                           epoch,
                           ignore=['precision', 'recall', 'fscore', 'iou'])

        val_report = evaluation(
            eval_loader=val_dataloader,
            model=model,
            criterion=criterion,
            num_classes=num_classes,
            batch_size=batch_size,
            task=task,
            ep_idx=epoch,
            progress_log=progress_log,
            batch_metrics=params['training']['batch_metrics'],
            dataset='val',
            num_devices=num_devices)
        val_loss = val_report['loss'].avg
        if params['training']['batch_metrics'] is not None:
            val_log.add_values(val_report, epoch)
        else:
            val_log.add_values(val_report,
                               epoch,
                               ignore=['precision', 'recall', 'fscore', 'iou'])

        if val_loss < best_loss:
            print("save checkpoint")
            best_loss = val_loss
            torch.save(
                {
                    'epoch': epoch,
                    'arch': model_name,
                    'model': model.state_dict(),
                    'best_loss': best_loss,
                    'optimizer': optimizer.state_dict()
                }, filename)

            if bucket_name:
                bucket_filename = os.path.join(bucket_output_path,
                                               'checkpoint.pth.tar')
                bucket.upload_file(filename, bucket_filename)

        if bucket_name:
            save_logs_to_bucket(bucket, bucket_output_path, output_path, now,
                                params['training']['batch_metrics'])

        cur_elapsed = time.time() - since
        print('Current elapsed time {:.0f}m {:.0f}s'.format(
            cur_elapsed // 60, cur_elapsed % 60))

    # load checkpoint model and evaluate it on test dataset.
    model = load_from_checkpoint(filename, model)
    tst_report = evaluation(eval_loader=tst_dataloader,
                            model=model,
                            criterion=criterion,
                            num_classes=num_classes,
                            batch_size=batch_size,
                            task=task,
                            ep_idx=params['training']['num_epochs'],
                            progress_log=progress_log,
                            batch_metrics=params['training']['batch_metrics'],
                            dataset='tst',
                            num_devices=num_devices)
    tst_log.add_values(tst_report, params['training']['num_epochs'])

    if bucket_name:
        bucket_filename = os.path.join(bucket_output_path,
                                       'last_epoch.pth.tar')
        bucket.upload_file(
            "output.txt",
            os.path.join(bucket_output_path, f"Logs/{now}_output.txt"))
        bucket.upload_file(filename, bucket_filename)

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
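
# NOTE: a typical entry point for the second main() above would load the YAML
# config mentioned in its docstring and pass the resulting dict straight
# through; the file name below is only an example.
if __name__ == '__main__':
    import yaml  # PyYAML

    with open('config.yaml') as config_file:
        params = yaml.safe_load(config_file)
    main(params)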