    def __init__(self,
                 config: ConfigClass,
                 save_dir: str,
                 name='ActiveTrainer'):
        super(ActiveTrainer, self).__init__(config, save_dir, name)

        self.main_data_dir = os.path.join(self.save_dir, 'Datasets')
        os.makedirs(self.main_data_dir)

        self.al_config = config.active_learn

        self.data_pool = get_pool_class(self.config)
        self.data_loaders = get_dataloaders(
            self.config.data, file_list=self.data_pool.train_pool)
        self.main_logger.info(self.data_loaders.msg)

        self._create_train_loggers(value=0)
        self._save_dataset_info()
        self.data_pool.copy_pool_files_to_dir(self.data_pool.train_pool,
                                              self.save_data_dir)

        if self.use_ensemble:
            self._init_train_components_ensemble()
        else:
            self._init_train_components()
Example No. 2
def discriminator_tst():
    from data import get_dataloaders
    train, test = get_dataloaders(batch_size=16)
    disc = judge()
    for images, labels in train:
        test_out = disc(images)
        print(test_out.shape)
Example No. 3
    def __init__(self, config: ConfigClass, save_dir: str):
        super(BayesianTrainer, self).__init__(config, save_dir, 'BBB_Trainer')

        self.data_loaders = get_dataloaders(config.data)
        self.main_logger.info(self.data_loaders.msg)

        self._init_train_components()
Example No. 4
    def test_dataset_type(self, dataset_name, device):
        """Test that the specified dataset is the one actually being loaded.
        """
        train_loader, test_loader = get_dataloaders(dataset_name,
                                                    device=device,
                                                    train_batch_size=3,
                                                    test_batch_size=4)
        if dataset_name.lower() == 'mnist':
            assert isinstance(train_loader.dataset,
                              torchvision.datasets.mnist.MNIST)
            assert isinstance(test_loader.dataset,
                              torchvision.datasets.mnist.MNIST)
        elif dataset_name.lower() == 'cifar-10':
            assert isinstance(train_loader.dataset,
                              torchvision.datasets.cifar.CIFAR10)
            assert isinstance(test_loader.dataset,
                              torchvision.datasets.cifar.CIFAR10)
        elif dataset_name.lower() == 'cifar-100':
            assert isinstance(train_loader.dataset,
                              torchvision.datasets.cifar.CIFAR100)
            assert isinstance(test_loader.dataset,
                              torchvision.datasets.cifar.CIFAR100)
        else:
            raise ValueError("My test is flaky. Got unsupported dataset "
                             "{} from conftest.".format(dataset_name))
Example No. 5
def evaluate_monte_carlo(config: ConfigClass, model):
    device = torch.device(
        f'cuda:{config.gpu_node}' if torch.cuda.is_available() else 'cpu')
    mc_passes = config.prediction.mc_passes

    # Create dataloader class
    loader_class = get_dataloaders(config.data)
    segment_metrics = SegmentationMetrics(num_classes=loader_class.num_classes,
                                          threshold=config.binarize_threshold)

    # Evaluate model
    model.eval()
    model.apply(apply_dropout)

    with torch.no_grad():
        for batch in loader_class.evaluation_loader:
            x, y = batch
            x = x.to(device=device, non_blocking=True)
            y = y.to(device=device, non_blocking=True)

            summed_y = torch.zeros_like(y)

            for idx in range(mc_passes):
                y_pred = model(x)
                summed_y += torch.sigmoid(y_pred)

            averaged_y = summed_y / mc_passes
            segment_metrics.update((averaged_y, y), process=False)

    metrics = segment_metrics.compute()
    return metrics
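
The `apply_dropout` helper used above is not shown in this snippet. A minimal sketch of what it plausibly does (an assumption, not the repository's actual code): re-enable dropout submodules while the rest of the model stays in eval mode, which is what Monte Carlo dropout requires.

import torch.nn as nn

def apply_dropout(module: nn.Module) -> None:
    # model.apply() calls this on every submodule; only dropout layers are
    # switched back to train mode so they keep sampling random masks.
    if isinstance(module, (nn.Dropout, nn.Dropout2d, nn.Dropout3d)):
        module.train()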
Example No. 6
def evaluate_ensemble(config: ConfigClass, models_list):
    device = torch.device(
        f'cuda:{config.gpu_node}' if torch.cuda.is_available() else 'cpu')

    # Create dataloader class
    loader_class = get_dataloaders(config.data)
    segment_metrics = SegmentationMetrics(num_classes=loader_class.num_classes,
                                          threshold=config.binarize_threshold)

    # Evaluate models
    for model in models_list:
        model.eval()

    with torch.no_grad():
        for batch in loader_class.evaluation_loader:
            x, y = batch
            x = x.to(device=device, non_blocking=True)
            y = y.to(device=device, non_blocking=True)

            averaged_y = torch.zeros_like(y)

            for model in models_list:
                y_pred = model(x)
                averaged_y += torch.sigmoid(y_pred)

            averaged_y = averaged_y / len(models_list)
            segment_metrics.update((averaged_y, y), process=False)

    metrics = segment_metrics.compute()
    return metrics
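Example No. 7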
    def __init__(self, config: ConfigClass, save_dir: str):
        super(PassiveTrainerEnsemble,
              self).__init__(config, save_dir, 'Passive_Trainer_Ensemble')

        if config.data.data_list is None:
            self.data_loaders = get_dataloaders(config.data)
        else:
            with open(join(DATA_DIR, 'MSRA10K_INIT', config.data.data_list),
                      'rb') as f:
                file_list = pickle.load(f)
            self.data_loaders = get_dataloaders(config.data,
                                                file_list=file_list)
            self.main_logger.info(
                f'Using preset file list with length {len(file_list)}')
        self.main_logger.info(self.data_loaders.msg)

        self._init_train_components_ensemble()
Example No. 8
    def __init__(self, args):
        # Set the LR Scheduler and Loss Parameters
        self.args = args
        self.model = BertNLU(args)

        print(self.model)

        # Set the Device and Distributed Settings
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        if args.distributed and torch.cuda.is_available() and torch.cuda.device_count() > 1:
            self.model = torch.nn.DataParallel(self.model)
        self.model.to(self.device)

        # Define the data loaders
        self.train_loader, self.val_loader, self.test_loader = get_dataloaders(
            data_root=args.data_path,
            batch_size=args.batch_size,
            dataset=args.dataset,
            num_workers=args.num_workers)

        # Define the optimizers
        if args.finetune_bert:
            print('Finetuning BERT')
            self.optimizer = torch.optim.Adam([
                {'params': self.model.bert.parameters(), 'lr': args.learning_rate_bert},
                {'params': self.model.classifier.parameters()}
            ], lr=args.learning_rate)
        else:
            print('Freezing BERT')
            self.optimizer = torch.optim.Adam([
                {'params': self.model.classifier.parameters()}
            ], lr=args.learning_rate)

        
        if args.scheduler == 'plateau':
            self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(self.optimizer,
                                                                        factor=0.5,
                                                                        patience=3,
                                                                        mode='max',
                                                                        verbose=True)
        elif args.scheduler == 'cycle':
            self.scheduler = torch.optim.lr_scheduler.OneCycleLR(self.optimizer,
                                                                 max_lr=args.learning_rate,
                                                                 steps_per_epoch=len(self.train_loader),
                                                                 epochs=args.num_epochs)
        self.criterion = torch.nn.CrossEntropyLoss()

        # Training-specific params
        self.num_epochs = args.num_epochs
        self.print_every = args.print_every
        self.val_every = args.val_every
        self.model_dir = args.model_dir
        self.save_every = args.save_every
Example No. 9
    def test_dataloader_gpu(self, dataset_name):
        """Specifying 'cuda' as the device will pin the memory.
        """
        train_loader, test_loader = get_dataloaders(
            dataset_name,
            device=torch.device('cuda'),
            train_batch_size=3,
            test_batch_size=4)
        assert train_loader.pin_memory
        assert test_loader.pin_memory
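
For reference, a sketch of how pinning is typically wired inside a loader factory like this one (assumed, since the body of get_dataloaders is not shown): pin host memory only when the target device is CUDA, so host-to-device copies can run asynchronously.

import torch
from torch.utils.data import DataLoader

def build_loader(dataset, batch_size, device):
    # Pinned (page-locked) memory only pays off for GPU transfers.
    pin = isinstance(device, torch.device) and device.type == 'cuda'
    return DataLoader(dataset, batch_size=batch_size, pin_memory=pin)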
Example No. 10
    def test_dataloader_nonempty(self, dataset_name, device):
        """Make sure that the dataloader contains a dataset of nonzero size
        """
        train_loader, test_loader = get_dataloaders(dataset_name,
                                                    device=device,
                                                    train_batch_size=3,
                                                    test_batch_size=4)

        assert len(train_loader.dataset) > 0
        assert len(test_loader.dataset) > 0
Example No. 11
def init():
    part_of_body = 'WRIST'
    phase_cat = ['train', 'valid']

    case_data = get_study_data('XR_' + part_of_body, 'D:/MURA-v1.1/{0}/{1}/')
    dataloaders = get_dataloaders(case_data, batch_size=1)

    # counts of abnormal (positive) and normal (negative) images per phase
    tai = {x: get_count(case_data[x], 'positive') for x in phase_cat}
    tni = {x: get_count(case_data[x], 'negative') for x in phase_cat}
    Wt1 = {x: np_tensor(tni[x] / (tni[x] + tai[x])) for x in phase_cat}
    Wt0 = {x: np_tensor(tai[x] / (tni[x] + tai[x])) for x in phase_cat}
Example No. 12
    def test_dataloader_determinism(self, dataset_name, device):
        """The randomness with which the training set is shuffled should be
        deterministic. So the order of the training batches is random but 
        constant.
        """
        train_loader1, _ = get_dataloaders(dataset_name,
                                           seed=0,
                                           device=device,
                                           train_batch_size=3,
                                           test_batch_size=4)
        X1, y1 = next(iter(train_loader1))

        train_loader2, _ = get_dataloaders(dataset_name,
                                           seed=0,
                                           device=device,
                                           train_batch_size=3,
                                           test_batch_size=4)
        X2, y2 = next(iter(train_loader2))

        assert torch.equal(X1, X2)
        assert torch.equal(y1, y2)
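
One standard way to get this determinism (a sketch under the assumption that get_dataloaders seeds its shuffling; the snippet does not show the mechanism) is to hand the DataLoader a seeded torch.Generator:

import torch
from torch.utils.data import DataLoader

def make_train_loader(dataset, batch_size, seed=0):
    g = torch.Generator()
    g.manual_seed(seed)  # same seed -> same shuffle order on every run
    return DataLoader(dataset, batch_size=batch_size, shuffle=True,
                      generator=g)

Example No. 13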
    def __init__(self,
                 config: ConfigClass,
                 save_dir: str,
                 log_name='Variational_Trainer'):
        super(VariationalTrainer, self).__init__(config, save_dir, log_name)

        self.data_loaders = get_dataloaders(config.data)
        self.main_logger.info(self.data_loaders.msg)

        self.starting_kld_factor = 0 if self.loss_cfg.kld_warmup else self.loss_cfg.kld_factor
        self.starting_mse_factor = 0 if self.loss_cfg.mse_warmup else self.loss_cfg.mse_factor

        self._init_train_components()
Example No. 14
    def test_cifar_wrong_numchannels(self, device):
        """num_channels=2 is invalid and should be caught.
        """
        with pytest.raises(ValueError):
            train_loader, test_loader = get_dataloaders(
                dataset='cifar-10',
                device=device,
                train_batch_size=3,
                test_batch_size=4,
                augment=True,
                resize_to=(48, 48),
                num_channels=2,
            )
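
A guard like the one this test expects could look as follows (hypothetical; only the test is shown, not the loader's validation code):

def check_num_channels(num_channels: int) -> None:
    # Grayscale (1) and RGB (3) are the only sensible options here.
    if num_channels not in (1, 3):
        raise ValueError(f'num_channels must be 1 or 3, got {num_channels}')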
Example No. 15
    def test_dataloader_samplers(self, dataset_name, device):
        """The training set should be shuffled, while the test set should be 
        yielded in sequential, unshuffled order.
        """
        train_loader, test_loader = get_dataloaders(dataset_name,
                                                    device=device,
                                                    train_batch_size=3,
                                                    test_batch_size=4)

        assert isinstance(train_loader.batch_sampler.sampler,
                          torch.utils.data.sampler.RandomSampler)
        assert isinstance(test_loader.batch_sampler.sampler,
                          torch.utils.data.sampler.SequentialSampler)
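
This is standard PyTorch behavior rather than anything specific to this repository: DataLoader picks a RandomSampler when shuffle=True and a SequentialSampler when shuffle=False. A self-contained check:

import torch
from torch.utils.data import DataLoader, TensorDataset

ds = TensorDataset(torch.arange(10.).unsqueeze(1), torch.arange(10))
train = DataLoader(ds, batch_size=3, shuffle=True)
test = DataLoader(ds, batch_size=4, shuffle=False)
print(type(train.batch_sampler.sampler).__name__)  # RandomSampler
print(type(test.batch_sampler.sampler).__name__)   # SequentialSampler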
Example No. 16
    def test_get_dataloaders(self):
        train, test = get_dataloaders(batch_size=16)
        for k, data in enumerate(train):
            images, labels = data
            print(images.shape, labels.shape)
            images, labels = images.numpy(), labels.numpy()
            # drop the singleton channel dim: (N, 1, H, W) -> (N, H, W)
            images = images.squeeze(1)
            for j in range(16):
                pl.subplot(4, 4, j + 1)
                pl.imshow(images[j, :, :])
                pl.title(labels[j])
                pl.axis('off')
            pl.show()
Example No. 17
    def test_dataset_successful_iter(self, dataset_name, device, num_channels):
        """Any error in the data preparation may cause runtime errors that
        would not be visible otherwise.
        """
        train_loader, test_loader = get_dataloaders(
            dataset_name,
            device=device,
            train_batch_size=3,
            test_batch_size=4,
            augment=True,
            resize_to=(48, 48),
            num_channels=num_channels,
        )
        x, y = next(iter(train_loader))  # Python 3: next(), not .next()
        assert True  # reaching this point without an exception is the test
Example No. 18
    def test_dataset_resize(self, dataset_name, device):
        """Test that passing resize_to actually resizes images to the correct
        dimensions.
        """
        train_loader, test_loader = get_dataloaders(
            dataset_name,
            device=device,
            train_batch_size=3,
            test_batch_size=4,
            augment=True,
            resize_to=(48, 48),
            num_channels=1,
        )
        x, y = next(iter(train_loader))
        assert tuple(x.shape[-2:]) == (48, 48)
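
The resize path inside such a loader is most likely a torchvision transform applied before ToTensor (an assumption; the snippet only exercises the public API):

from torchvision import transforms

resize_pipeline = transforms.Compose([
    transforms.Resize((48, 48)),                  # (height, width)
    transforms.Grayscale(num_output_channels=1),  # matches num_channels=1
    transforms.ToTensor(),                        # -> (C, H, W) in [0, 1]
])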
Example No. 19
def main(args):

    train_dataloader, dev_dataloader = get_dataloaders(args)
    print('Dataloaders obtained')

    features = np.load(args.train_videos_npz_path)
    print('Features obtained')

    device = torch.device(args.device)
    print('Device: ', device)

    model = R2plus1D_18()
    print('MODEL\n', model)
    model.to(device)

    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=args.learning_rate,
        eps=args.epsilon
    )

    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer
    )

    criterion = torch.nn.BCEWithLogitsLoss()

    print('TRAINING...')
    training_stats = []
    steps_stats = []

    for epoch_i in tqdm(range(0, args.epochs)):

        avg_train_loss, train_acc, train_mcc = train(
            model, train_dataloader, features, device, criterion, optimizer)
        print(avg_train_loss, train_acc, train_mcc)
        avg_dev_loss, dev_acc, dev_mcc = valid(
            model, dev_dataloader, features, device, criterion, scheduler)
        print(avg_dev_loss, dev_acc, dev_mcc)
        training_stats.append([avg_train_loss, train_acc, train_mcc,
                               avg_dev_loss, dev_acc, dev_mcc])

        torch.save(model, args.save_model_path + str(epoch_i) + '_model.pth')


    print('TRAINING COMPLETED')

    # Show training results
    col_names = ['train_loss', 'train_acc', 'train_mcc',
                 'dev_loss', 'dev_acc', 'dev_mcc']
    training_stats = pd.DataFrame(training_stats, columns=col_names)
    print(training_stats.head(args.epochs))
Example No. 20
    def test_dataloader_batchsizes(self, dataset_name, device):
        """Test that the dataloaders yield batches of the requested sizes.
        """
        train_loader, test_loader = get_dataloaders(dataset_name,
                                                    device=device,
                                                    train_batch_size=3,
                                                    test_batch_size=4)
        assert train_loader.batch_size == 3
        X_train_batch, y_train_batch = next(iter(train_loader))
        assert X_train_batch.shape[0] == 3
        assert y_train_batch.shape[0] == 3

        assert test_loader.batch_size == 4
        X_test_batch, y_test_batch = next(iter(test_loader))
        assert X_test_batch.shape[0] == 4
        assert y_test_batch.shape[0] == 4
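Example No. 21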
def test_dataloaders():
    head("Testing the dataloaders")

    try:
        datasetroot = data._DEFAULT_COMMONVOICE_ROOT
        datasetversion = data._DEFAULT_COMMONVOICE_VERSION
        use_cuda = False
        B = 10
        nthreads = 2
        train_augment = False
        min_duration = 1  # s.
        max_duration = 5  # s.
        loaders = data.get_dataloaders(datasetroot,
                                       datasetversion,
                                       cuda=use_cuda,
                                       batch_size=B,
                                       n_threads=nthreads,
                                       min_duration=min_duration,
                                       max_duration=max_duration,
                                       small_experiment=False,
                                       train_augment=train_augment,
                                       nmels=data._DEFAULT_NUM_MELS,
                                       logger=None)
        train_loader, valid_loader, test_loader = loaders

        minibatch = next(iter(train_loader))

        info(f"[1/] Got a minibatch of type {type(minibatch)}")
        if not isinstance(minibatch, tuple) or len(minibatch) != 2:
            fail("Expected a minibatch to be a tuple spectrograms, transcripts")
        else:
            succeed()

        packed_batch, packed_transcripts = minibatch

        info(f"[2/] Got two items of type {type(packed_batch), type(packed_transcripts)}")
        if (not isinstance(packed_batch, PackedSequence)
                or not isinstance(packed_transcripts, PackedSequence)):
            fail("Expected two PackedSequence")
        else:
            succeed()

    except Exception:
        fail(f"{sys.exc_info()[0]}")
        if _RERAISE:
            raise
Example No. 22
def test_get_loaders(dataset):
    """
    Test dataloader

    :param dataset: Dataset to use
    :type dataset: str
    """
    logging.info(f"Loading dataloaders for {dataset}")
    loaders = get_dataloaders(
        dataset,
        root=f"datasets/{dataset}",
        batch_size=128,
        test_batch_size=128,
        validation_split=0.1,
        fixed_shuffle=True,
    )
    logging.info(f"Looping through dataset {dataset}")
    for loader in loaders:
        _loader_loop(loader)
Example No. 23
    def __init__(self, hparams: Config):
        super().__init__()
        torch.set_num_threads(15)
        self.hparams = sanitize_dict(copy.copy(
            vars(hparams)))  # TODO: clean up

        self.dataloaders, self.regression, self.output_dim = get_dataloaders(
            hparams.sample_size, hparams.seed, hparams.batch_size,
            hparams.target)

        self.learning_rate = hparams.learning_rate
        self.weight_decay = hparams.weight_decay
        self.channels = hparams.channels
        self.dropout = hparams.dropout
        self.lr_decay = hparams.lr_decay

        # print(self.output_dim)
        # print(self.dropout)

        self.build_model()
Example No. 24
    def __init__(self, hparams: Config):
        super().__init__()
        torch.set_num_threads(8)
        self.hparams = sanitize_dict(copy.copy(vars(hparams)))

        self.dataloaders, self.regression, self.output_dim = get_dataloaders(
            hparams.sample_size, hparams.repetition_num,
            hparams.sample_splits_dir, hparams.num_workers, hparams.batch_size,
            hparams.scorename)

        self.learning_rate = hparams.learning_rate
        self.weight_decay = hparams.weight_decay
        self.channels = hparams.channels
        self.dropout = hparams.dropout
        self.lr_decay = hparams.lr_decay

        # print(self.output_dim)
        # print(self.dropout)

        self.build_model()
Example No. 25
def evaluate_one_pass(config: ConfigClass, model):
    device = torch.device(
        f'cuda:{config.gpu_node}' if torch.cuda.is_available() else 'cpu')

    # Create dataloader class
    loader_class = get_dataloaders(config.data)
    segment_metrics = SegmentationMetrics(num_classes=loader_class.num_classes,
                                          threshold=config.binarize_threshold)

    # Evaluate model
    model.eval()

    with torch.no_grad():
        for batch in loader_class.evaluation_loader:
            x, y = batch
            x = x.to(device=device, non_blocking=True)
            y = y.to(device=device, non_blocking=True)

            y_pred = model(x)

            segment_metrics.update((y_pred, y))

    metrics = segment_metrics.compute()
    return metrics
Example No. 26
def training(local_rank, config):

    config["device"] = "cuda" if config["active_gpu_ids"] else "cpu"

    rank = idist.get_rank()
    manual_seed(config["seed"] + rank)

    device = idist.device()

    logger = setup_logger(name="Carbon Black Semantic Segmentation Training",
                          distributed_rank=local_rank)

    log_basic_info(logger, config)

    output_path = config["output_path"]

    if rank == 0:
        if config["stop_iteration"] is None:
            now = utils.get_time_stamp()
        else:
            now = f"stop-on-{config['stop_iteration']}"

        folder_name = (
            f"{config['architecture']}-{config['encoder']}-{config['encoder_weights']}_"
            f"backend-{idist.backend()}-{idist.get_world_size()}_{now}")

        output_path = Path(output_path) / folder_name
        output_path.mkdir(parents=True, exist_ok=True)
        config["output_path"] = output_path.as_posix()
        config["task_name"] = output_path.stem

        logger.info(f"Output path: {output_path}")

        if "cuda" in idist.device().type:
            config["cuda_device_name"] = torch.cuda.get_device_name(local_rank)

        setup_trains_logging(config)

    dataloader_train, dataloader_val = get_dataloaders(config)

    config["num_iterations_per_epoch"] = len(dataloader_train)
    config["num_epochs"] = round(config["num_iterations"] /
                                 config["num_iterations_per_epoch"])
    model = modeling.get_model(config)

    optimizer = get_optimizer(model, config)
    loss = get_loss()

    lr_scheduler = get_lr_scheduler(optimizer, config)

    trainer = create_trainer(model, optimizer, loss, lr_scheduler,
                             dataloader_train.sampler, config, logger)

    metrics = get_metrics(loss)

    # We define two evaluators, as they won't have exactly the same role:
    # - `evaluator` will save the best model based on validation score
    evaluator = create_supervised_evaluator(model,
                                            metrics=metrics,
                                            device=device,
                                            non_blocking=True)
    evaluator_train = create_supervised_evaluator(model,
                                                  metrics=metrics,
                                                  device=device,
                                                  non_blocking=True)

    if rank == 0:
        # Setup TensorBoard logging on trainer and evaluators. Logged values are:
        #  - Training metrics, e.g. running average loss values
        #  - Learning rate
        #  - Evaluation train/test metrics
        evaluators = {"training": evaluator_train, "validation": evaluator}
        tb_logger = common.setup_tb_logging(output_path,
                                            trainer,
                                            optimizer,
                                            evaluators=evaluators)

        example_prediction_logger = ExamplePredictionLogger(
            tb_logger, model, device)

    def run_validation(engine):
        epoch = trainer.state.epoch
        state = evaluator_train.run(dataloader_train)
        data_subset = "Train"
        log_metrics(logger, epoch, state.times["COMPLETED"], data_subset,
                    state.metrics)
        log_confusion_matrix(tb_logger, epoch, data_subset, state.metrics)

        state = evaluator.run(dataloader_val)
        data_subset = "Val"
        log_metrics(logger, epoch, state.times["COMPLETED"], data_subset,
                    state.metrics)
        log_confusion_matrix(tb_logger, epoch, data_subset, state.metrics)
        example_prediction_logger.log_visualization(dataloader_val.dataset,
                                                    epoch)

    trainer.add_event_handler(
        Events.EPOCH_COMPLETED(every=config["validate_every"])
        | Events.COMPLETED, run_validation)

    # Store 3 best models by validation accuracy:
    common.gen_save_best_models_by_val_score(
        save_handler=get_save_handler(config),
        evaluator=evaluator,
        models={"model": model},
        metric_name="accuracy",
        n_saved=3,
        trainer=trainer,
        tag="validation",
    )

    # TODO: Add early stopping

    # In order to check training resuming we can stop training on a given iteration
    if config["stop_iteration"] is not None:

        @trainer.on(Events.ITERATION_STARTED(once=config["stop_iteration"]))
        def _():
            logger.info(
                f"Stop training on {trainer.state.iteration} iteration")
            trainer.terminate()

    # noinspection PyBroadException
    try:
        trainer.run(dataloader_train, max_epochs=config["num_epochs"])
    except Exception:
        import traceback

        print(traceback.format_exc())

    if rank == 0:
        # noinspection PyUnboundLocalVariable
        tb_logger.close()
Example No. 27
def train_and_eval(tag,
                   dataroot,
                   metric='last',
                   save_path=None,
                   only_eval=False,
                   unsupervised=False,
                   mode=None):
    max_epoch = C.get()['epoch']
    trainloader, unsuploader, testloader = get_dataloaders(
        C.get()['dataset'],
        C.get()['batch'],
        C.get()['batch_unsup'],
        dataroot,
        mode=mode,
        n_labeled=args.n_labeled)

    # create a model & an optimizer
    model = get_model(C.get()['model'],
                      num_class(C.get()['dataset']),
                      data_parallel=True)

    criterion = nn.CrossEntropyLoss()
    if C.get()['optimizer']['type'] == 'sgd':
        optimizer = optim.SGD(model.parameters(),
                              lr=C.get()['lr'],
                              momentum=C.get()['optimizer'].get(
                                  'momentum', 0.9),
                              weight_decay=C.get()['optimizer']['decay'],
                              nesterov=C.get()['optimizer']['nesterov'])
    else:
        raise ValueError('invalid optimizer type=%s' %
                         C.get()['optimizer']['type'])

    lr_scheduler_type = C.get()['lr_schedule'].get('type', 'cosine')
    if lr_scheduler_type == 'cosine':
        t_max = C.get()['epoch']
        if C.get()['lr_schedule'].get('warmup', None):
            t_max -= C.get()['lr_schedule']['warmup']['epoch']
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                               T_max=t_max,
                                                               eta_min=0.)
    else:
        raise ValueError('invalid lr_scheduler=%s' % lr_scheduler_type)

    if C.get()['lr_schedule'].get('warmup', None):
        scheduler = GradualWarmupScheduler(
            optimizer,
            multiplier=C.get()['lr_schedule']['warmup']['multiplier'],
            total_epoch=C.get()['lr_schedule']['warmup']['epoch'],
            after_scheduler=scheduler)

    if not tag.strip():
        from metrics import SummaryWriterDummy as SummaryWriter
        logger.warning('tag not provided, no tensorboard log.')
    else:
        from tensorboardX import SummaryWriter
    writers = [
        SummaryWriter(logdir='./logs/%s/%s' % (tag, x))
        for x in ['train', 'test']
    ]

    result = OrderedDict()
    epoch_start = 1
    if save_path and os.path.exists(save_path):
        data = torch.load(save_path)
        model.load_state_dict(data['model'])
        optimizer.load_state_dict(data['optimizer'])
        epoch_start = data['epoch']

    if only_eval:
        logger.info('evaluation only')
        model.eval()
        rs = dict()
        rs['test'] = run_epoch(model,
                               testloader,
                               unsuploader,
                               criterion,
                               None,
                               desc_default='*test',
                               epoch=epoch_start,
                               writer=writers[1])
        for key, setname in itertools.product(['loss', 'top1', 'top5'],
                                              ['train', 'test']):
            result['%s_%s' % (key, setname)] = rs[setname][key]
        result['epoch'] = 0
        return result

    # train loop
    global best_valid_top1
    best_valid_loss = 10e10
    for epoch in range(epoch_start, max_epoch + 1):
        model.train()
        rs = dict()
        if args.train_mode == 'small':
            print('only small')  # train on the labeled subset only
            rs['train'] = run_epoch(model,
                                    trainloader,
                                    unsuploader,
                                    criterion,
                                    optimizer,
                                    desc_default='train',
                                    epoch=epoch,
                                    writer=writers[0],
                                    verbose=True,
                                    unsupervised=False,
                                    scheduler=scheduler)
        else:
            rs['train'] = run_epoch(model,
                                    trainloader,
                                    unsuploader,
                                    criterion,
                                    optimizer,
                                    desc_default='train',
                                    epoch=epoch,
                                    writer=writers[0],
                                    verbose=True,
                                    unsupervised=unsupervised,
                                    scheduler=scheduler)
        if math.isnan(rs['train']['loss']):
            raise Exception('train loss is NaN.')

        model.eval()
        if epoch % (10 if 'cifar' in C.get()['dataset'] else
                    30) == 0 or epoch == max_epoch:
            rs['test'] = run_epoch(model,
                                   testloader,
                                   unsuploader,
                                   criterion,
                                   None,
                                   desc_default='*test',
                                   epoch=epoch,
                                   writer=writers[1],
                                   verbose=True)

            if best_valid_top1 < rs['test']['top1']:
                best_valid_top1 = rs['test']['top1']

            if metric == 'last' or rs[metric]['loss'] < best_valid_loss:  # TODO
                if metric != 'last':
                    best_valid_loss = rs[metric]['loss']
                for key, setname in itertools.product(['loss', 'top1', 'top5'],
                                                      ['train', 'test']):
                    result['%s_%s' % (key, setname)] = rs[setname][key]
                result['epoch'] = epoch

                writers[1].add_scalar('test_top1/best', rs['test']['top1'],
                                      epoch)

            # save checkpoint
            if save_path:
                logger.info('save model@%d to %s' % (epoch, save_path))
                torch.save(
                    {
                        'epoch': epoch,
                        'log': {
                            'train': rs['train'].get_dict(),
                            'test': rs['test'].get_dict(),
                        },
                        'optimizer': optimizer.state_dict(),
                        'model': model.state_dict()
                    }, save_path)

    del model

    return result
Example No. 28
if __name__ == '__main__':
    project = Project()
    # our hyperparameters
    params = {
        'lr': 0.001,
        'batch_size': 64,
        'epochs': 20,
        'model': 'resnet18-finetune'
    }
    logging.info(f'Using device={device} 🚀')
    # everything starts with the data
    train_dl, val_dl, test_dl = get_dataloaders(
        project.dataset_dir / "train",
        project.dataset_dir / "test",
        train_transform=train_transform,
        test_transform=test_transform,
        batch_size=params['batch_size'],
        pin_memory=True,
        num_workers=4,
    )

    # it is always good practice to visualise some of the train and val images to be sure data-aug
    # is applied properly
    show_dl(train_dl)
    show_dl(test_dl)
    # define our comet experiment
    experiment = Experiment(api_key="api_key",
                            project_name="project_name",
                            workspace="workspace_name")
    experiment.log_parameters(params)
    # create our special resnet18
Example No. 29
def main(pargs):

    #init distributed training
    comm_local_group = comm.init(pargs.wireup_method,
                                 pargs.batchnorm_group_size)
    comm_rank = comm.get_rank()
    comm_local_rank = comm.get_local_rank()
    comm_size = comm.get_size()
    comm_local_size = comm.get_local_size()

    # set up logging
    pargs.logging_frequency = max([pargs.logging_frequency, 1])
    log_file = os.path.normpath(
        os.path.join(pargs.output_dir, "logs", pargs.run_tag + ".log"))
    logger = mll.mlperf_logger(log_file, "deepcam", "Umbrella Corp.")
    logger.log_start(key="init_start", sync=True)
    logger.log_event(key="cache_clear")

    #set seed
    seed = pargs.seed
    logger.log_event(key="seed", value=seed)

    # Some setup
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        device = torch.device("cuda", comm_local_rank)
        torch.cuda.manual_seed(seed)
        torch.cuda.set_device(device)
        torch.backends.cudnn.benchmark = True
    else:
        device = torch.device("cpu")

    #set up directories
    root_dir = os.path.join(pargs.data_dir_prefix)
    output_dir = pargs.output_dir
    plot_dir = os.path.join(output_dir, "plots")
    if comm_rank == 0:
        if not os.path.isdir(output_dir):
            os.makedirs(output_dir)

    # logging of rank information
    logger.log_event(key="number_of_ranks", value=comm_size)
    logger.log_event(key="number_of_nodes",
                     value=(comm_size // comm_local_size))
    logger.log_event(key="accelerators_per_node", value=comm_local_size)

    # Logging hyperparameters
    logger.log_event(key="global_batch_size",
                     value=(pargs.local_batch_size * comm_size))
    logger.log_event(key="batchnorm_group_size",
                     value=pargs.batchnorm_group_size)
    logger.log_event(key="gradient_accumulation_frequency",
                     value=pargs.gradient_accumulation_frequency)
    logger.log_event(key="checkpoint", value=pargs.checkpoint)

    # Define architecture
    n_input_channels = len(pargs.channels)
    n_output_channels = 3
    net = deeplab_xception.DeepLabv3_plus(n_input=n_input_channels,
                                          n_classes=n_output_channels,
                                          os=16,
                                          pretrained=False,
                                          rank=comm_rank,
                                          process_group=comm_local_group)
    net.to(device)

    #select loss
    #some magic numbers
    loss_pow = -0.125
    class_weights = [
        0.986267818390377**loss_pow, 0.0004578708870701058**loss_pow,
        0.01327431072255291**loss_pow
    ]
    # extract loss
    criterion = losses.CELoss(class_weights).to(device)
    criterion = torch.jit.script(criterion)

    #select optimizer
    optimizer = oh.get_optimizer(pargs, net, logger)

    #restart from checkpoint if desired
    if pargs.checkpoint is not None:
        checkpoint = torch.load(pargs.checkpoint, map_location=device)
        start_step = checkpoint['step']
        start_epoch = checkpoint['epoch']
        optimizer.load_state_dict(checkpoint['optimizer'])
        net.load_state_dict(checkpoint['model'])
    else:
        start_step = 0
        start_epoch = 0

    #broadcast model and optimizer state
    steptens = torch.tensor(np.array([start_step, start_epoch]),
                            requires_grad=False).to(device)
    if dist.is_initialized():
        dist.broadcast(steptens, src=0)

    #unpack the bcasted tensor
    start_step = int(steptens.cpu().numpy()[0])
    start_epoch = int(steptens.cpu().numpy()[1])

    #select scheduler
    scheduler = None
    if pargs.lr_schedule:
        pargs.lr_schedule["lr_warmup_steps"] = pargs.lr_warmup_steps
        pargs.lr_schedule["lr_warmup_factor"] = pargs.lr_warmup_factor
        scheduler = oh.get_lr_schedule(pargs.start_lr,
                                       pargs.lr_schedule,
                                       optimizer,
                                       logger,
                                       last_step=start_step)

    # print parameters
    if comm_rank == 0:
        print(net)
        print("Total number of elements:",
              sum(p.numel() for p in net.parameters() if p.requires_grad))

    # get input shapes for the upcoming model preprocessing
    tshape, _ = get_datashapes(pargs, root_dir)
    input_shape = (tshape[2], tshape[0], tshape[1])

    #distributed model parameters
    bucket_cap_mb = 25
    if pargs.batchnorm_group_size > 1:
        bucket_cap_mb = 220

    # get stream, relevant for graph capture
    ddp_net = DDP(net,
                  device_ids=[device.index],
                  output_device=device.index,
                  find_unused_parameters=False,
                  broadcast_buffers=False,
                  bucket_cap_mb=bucket_cap_mb,
                  gradient_as_bucket_view=False)

    # get stats handler here
    bnstats_handler = bns.BatchNormStatsSynchronize(ddp_net,
                                                    reduction="mean",
                                                    inplace=True)

    # create handles
    net_validate = ddp_net
    net_train = ddp_net

    # Set up the data feeder
    train_loader, train_size, validation_loader, validation_size = get_dataloaders(
        pargs, root_dir, device, seed, comm_size, comm_rank)

    # log size of datasets
    logger.log_event(key="train_samples", value=train_size)
    val_size = validation_size
    logger.log_event(key="eval_samples", value=val_size)

    # get start steps
    step = start_step
    epoch = start_epoch
    current_lr = (pargs.start_lr if not pargs.lr_schedule
                  else scheduler.get_last_lr()[0])
    stop_training = False
    net_train.train()

    # start training
    logger.log_end(key="init_stop", sync=True)
    logger.log_start(key="run_start", sync=True)

    # training loop
    while True:

        # start epoch
        logger.log_start(key="epoch_start",
                         metadata={
                             'epoch_num': epoch + 1,
                             'step_num': step
                         },
                         sync=True)

        train_loader.sampler.set_epoch(epoch)

        # training
        step = train_step(pargs, comm_rank, comm_size, device, step, epoch,
                          net_train, criterion, optimizer, scheduler,
                          train_loader, logger)

        # average BN stats
        bnstats_handler.synchronize()

        # validation
        stop_training = validate(pargs, comm_rank, comm_size, device, step,
                                 epoch, net_validate, criterion,
                                 validation_loader, logger)

        # log the epoch
        logger.log_end(key="epoch_stop",
                       metadata={
                           'epoch_num': epoch + 1,
                           'step_num': step
                       },
                       sync=True)
        epoch += 1

        #save model if desired
        if (pargs.save_frequency > 0) and (epoch % pargs.save_frequency == 0):
            logger.log_start(key="save_start",
                             metadata={
                                 'epoch_num': epoch + 1,
                                 'step_num': step
                             },
                             sync=True)
            if comm_rank == 0:
                checkpoint = {
                    'step': step,
                    'epoch': epoch,
                    'model': net_train.state_dict(),
                    'optimizer': optimizer.state_dict()
                }
                torch.save(
                    checkpoint,
                    os.path.join(
                        output_dir,
                        pargs.model_prefix + "_step_" + str(step) + ".cpt"))
                logger.log_end(key="save_stop",
                               metadata={
                                   'epoch_num': epoch + 1,
                                   'step_num': step
                               },
                               sync=True)

        # are we done?
        if (epoch >= pargs.max_epochs) or stop_training:
            break

    # run done
    logger.log_end(key="run_stop", sync=True, metadata={'status': 'success'})
Example No. 30
def eval_tta(config, augment, reporter):
    C.get()
    C.get().conf = config
    cv_ratio_test, cv_fold, save_path = (augment['cv_ratio_test'],
                                         augment['cv_fold'],
                                         augment['save_path'])

    # setup - provided augmentation rules
    C.get()['aug'] = policy_decoder(augment, augment['num_policy'],
                                    augment['num_op'])

    # eval
    model = get_model(C.get()['model'], num_class(C.get()['dataset']))
    ckpt = torch.load(save_path + '.pth')
    if 'model' in ckpt:
        model.load_state_dict(ckpt['model'])
    else:
        model.load_state_dict(ckpt)
    model = nn.DataParallel(model).cuda()
    model.eval()

    src_loaders = []
    # for _ in range(augment['num_policy']):
    _, src_tl, src_validloader, src_ttl = get_dataloaders(
        C.get()['dataset'],
        C.get()['batch'],
        augment['dataroot'],
        cv_ratio_test,
        cv_num,
        split_idx=cv_fold,
        target=False,
        random_range=C.get()['args'].random_range)

    del src_tl, src_ttl

    start_t = time.time()
    metrics = Accumulator()
    loss_fn = torch.nn.CrossEntropyLoss(reduction='none')

    emd_loss = nn.DataParallel(emdModule()).cuda()

    losses = []
    corrects = []
    for data in src_validloader:
        with torch.no_grad():
            point_cloud = data['point_cloud'].cuda()
            label = torch.ones_like(data['label'], dtype=torch.int64).cuda()
            trans_pc = data['transformed']

            pred = model(trans_pc)

            if C.get()['args'].use_emd_false:
                loss_emd = (torch.mean(emd_loss(point_cloud.permute(0, 2, 1),
                                                trans_pc.permute(0, 2, 1), 0.05, 3000)[0])).unsqueeze(0) \
                           * C.get()['args'].emd_coeff
            else:
                loss_emd = torch.tensor([0.0])

            if C.get()['args'].no_dc:
                loss = loss_emd
            else:
                loss = loss_emd + loss_fn(pred, label)
            # print(loss)
            losses.append(loss.detach().cpu().numpy())

            pred = pred.max(dim=1)[1]
            pred = pred.t()
            correct = float(
                torch.sum(pred == label).item()) / pred.size(0) * 100
            corrects.append(correct)
            del loss, correct, pred, data, label, loss_emd

    losses = np.concatenate(losses)
    losses_min = np.min(losses, axis=0).squeeze()
    corrects_max = max(corrects)
    metrics.add_dict({
        'minus_loss': -1 * np.sum(losses_min),
        'correct': np.sum(corrects_max),
        # 'cnt': len(corrects_max)
    })
    del corrects, corrects_max

    del model
    # metrics = metrics / 'cnt'
    gpu_secs = (time.time() - start_t) * torch.cuda.device_count()
    # print(metrics)
    reporter(minus_loss=metrics['minus_loss'],
             top1_valid=metrics['correct'],
             elapsed_time=gpu_secs,
             done=True)
    return metrics['minus_loss']