def __init__(self, config: ConfigClass, save_dir: str, name='ActiveTrainer'):
    super(ActiveTrainer, self).__init__(config, save_dir, name)

    self.main_data_dir = os.path.join(self.save_dir, 'Datasets')
    os.makedirs(self.main_data_dir)

    self.al_config = config.active_learn
    self.data_pool = get_pool_class(self.config)

    self.data_loaders = get_dataloaders(self.config.data,
                                        file_list=self.data_pool.train_pool)
    self.main_logger.info(self.data_loaders.msg)

    self._create_train_loggers(value=0)
    self._save_dataset_info()
    self.data_pool.copy_pool_files_to_dir(self.data_pool.train_pool,
                                          self.save_data_dir)

    if self.use_ensemble:
        self._init_train_components_ensemble()
    else:
        self._init_train_components()
def discriminator_tst():
    from data import get_dataloaders

    train, test = get_dataloaders(batch_size=16)
    disc = judge()

    for images, labels in train:
        test_out = disc(images)
        print(test_out.shape)
def __init__(self, config: ConfigClass, save_dir: str):
    super(BayesianTrainer, self).__init__(config, save_dir, 'BBB_Trainer')

    self.data_loaders = get_dataloaders(config.data)
    self.main_logger.info(self.data_loaders.msg)

    self._init_train_components()
def test_dataset_type(self, dataset_name, device):
    """Test that the specified dataset is the one actually being loaded."""
    train_loader, test_loader = get_dataloaders(dataset_name,
                                                device=device,
                                                train_batch_size=3,
                                                test_batch_size=4)
    if dataset_name.lower() == 'mnist':
        assert isinstance(train_loader.dataset, torchvision.datasets.mnist.MNIST)
        assert isinstance(test_loader.dataset, torchvision.datasets.mnist.MNIST)
    elif dataset_name.lower() == 'cifar-10':
        assert isinstance(train_loader.dataset, torchvision.datasets.cifar.CIFAR10)
        assert isinstance(test_loader.dataset, torchvision.datasets.cifar.CIFAR10)
    elif dataset_name.lower() == 'cifar-100':
        assert isinstance(train_loader.dataset, torchvision.datasets.cifar.CIFAR100)
        assert isinstance(test_loader.dataset, torchvision.datasets.cifar.CIFAR100)
    else:
        raise ValueError("My test is flaky. Got unsupported dataset "
                         "{} from conftest.".format(dataset_name))
def evaluate_monte_carlo(config: ConfigClass, model):
    device = torch.device(
        f'cuda:{config.gpu_node}' if torch.cuda.is_available() else 'cpu')
    mc_passes = config.prediction.mc_passes

    # Create dataloader class
    loader_class = get_dataloaders(config.data)
    segment_metrics = SegmentationMetrics(num_classes=loader_class.num_classes,
                                          threshold=config.binarize_threshold)

    # Evaluate model
    model.eval()
    model.apply(apply_dropout)

    with torch.no_grad():
        for batch in loader_class.evaluation_loader:
            x, y = batch
            x = x.to(device=device, non_blocking=True)
            y = y.to(device=device, non_blocking=True)

            # average the sigmoid outputs over mc_passes stochastic forward passes
            summed_y = torch.zeros_like(y)
            for idx in range(mc_passes):
                y_pred = model(x)
                summed_y += torch.sigmoid(y_pred)

            averaged_y = summed_y / mc_passes
            segment_metrics.update((averaged_y, y), process=False)

    metrics = segment_metrics.compute()
    return metrics
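# `apply_dropout` is referenced above but not defined in this file. A minimal
# sketch, assuming its job is to switch dropout layers back to train mode after
# model.eval() so that the mc_passes forward passes remain stochastic (the
# standard Monte Carlo dropout trick):
def apply_dropout(module):
    # model.apply() visits every submodule; only dropout layers are flipped
    if isinstance(module, torch.nn.modules.dropout._DropoutNd):
        module.train()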
def evaluate_ensemble(config: ConfigClass, models_list):
    device = torch.device(
        f'cuda:{config.gpu_node}' if torch.cuda.is_available() else 'cpu')

    # Create dataloader class
    loader_class = get_dataloaders(config.data)
    segment_metrics = SegmentationMetrics(num_classes=loader_class.num_classes,
                                          threshold=config.binarize_threshold)

    # Evaluate models
    for model in models_list:
        model.eval()

    with torch.no_grad():
        for batch in loader_class.evaluation_loader:
            x, y = batch
            x = x.to(device=device, non_blocking=True)
            y = y.to(device=device, non_blocking=True)

            # average the sigmoid outputs over all ensemble members
            averaged_y = torch.zeros_like(y)
            for model in models_list:
                y_pred = model(x)
                averaged_y += torch.sigmoid(y_pred)
            averaged_y = averaged_y / len(models_list)

            segment_metrics.update((averaged_y, y), process=False)

    metrics = segment_metrics.compute()
    return metrics
def __init__(self, config: ConfigClass, save_dir: str):
    super(PassiveTrainerEnsemble, self).__init__(config, save_dir,
                                                 'Passive_Trainer_Ensemble')

    if config.data.data_list is None:
        self.data_loaders = get_dataloaders(config.data)
    else:
        with open(join(DATA_DIR, 'MSRA10K_INIT', config.data.data_list), 'rb') as f:
            file_list = pickle.load(f)
        self.data_loaders = get_dataloaders(config.data, file_list=file_list)
        self.main_logger.info(
            f'Using preset file list with length {len(file_list)}')

    self.main_logger.info(self.data_loaders.msg)
    self._init_train_components_ensemble()
def __init__(self, args):
    # Set the LR Scheduler and Loss Parameters
    self.args = args
    self.model = BertNLU(args)
    print(self.model)

    # Set the Device and Distributed Settings
    self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    if args.distributed and torch.cuda.is_available() and torch.cuda.device_count() > 1:
        self.model = torch.nn.DataParallel(self.model)
    self.model.to(self.device)

    # Define the data loaders
    self.train_loader, \
        self.val_loader, \
        self.test_loader = get_dataloaders(data_root=args.data_path,
                                           batch_size=args.batch_size,
                                           dataset=args.dataset,
                                           num_workers=args.num_workers)

    # Define the optimizers
    if args.finetune_bert:
        print('Finetuning BERT')
        self.optimizer = torch.optim.Adam([
            {'params': self.model.bert.parameters(), 'lr': args.learning_rate_bert},
            {'params': self.model.classifier.parameters()}
        ], lr=args.learning_rate)
    else:
        print('Freezing BERT')
        self.optimizer = torch.optim.Adam([
            {'params': self.model.classifier.parameters()}
        ], lr=args.learning_rate)

    if args.scheduler == 'plateau':
        self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            self.optimizer, factor=0.5, patience=3, mode='max', verbose=True)
    elif args.scheduler == 'cycle':
        self.scheduler = torch.optim.lr_scheduler.OneCycleLR(
            self.optimizer, max_lr=args.learning_rate,
            steps_per_epoch=len(self.train_loader), epochs=args.num_epochs)

    self.criterion = torch.nn.CrossEntropyLoss()

    # Training specific params
    self.num_epochs = args.num_epochs
    self.print_every = args.print_every
    self.val_every = args.val_every
    self.model_dir = args.model_dir
    self.save_every = args.save_every
def test_dataloader_gpu(self, dataset_name):
    """Specifying 'cuda' as the device will pin the memory."""
    train_loader, test_loader = get_dataloaders(
        dataset_name, device=torch.device('cuda'),
        train_batch_size=3, test_batch_size=4)
    assert train_loader.pin_memory
    assert test_loader.pin_memory
def test_dataloader_nonempty(self, dataset_name, device):
    """Make sure that the dataloader contains a dataset of nonzero size."""
    train_loader, test_loader = get_dataloaders(dataset_name,
                                                device=device,
                                                train_batch_size=3,
                                                test_batch_size=4)
    assert len(train_loader.dataset) > 0
    assert len(test_loader.dataset) > 0
def init():
    part_of_body = 'WRIST'
    phase_cat = ['train', 'valid']
    case_data = get_study_data('XR_' + part_of_body, 'D:/MURA-v1.1/{0}/{1}/')
    dataloaders = get_dataloaders(case_data, batch_size=1)

    # counts of abnormal (tai) and normal (tni) train images per phase
    tai = {x: get_count(case_data[x], 'positive') for x in phase_cat}
    tni = {x: get_count(case_data[x], 'negative') for x in phase_cat}

    # class weights: each class is weighted by the frequency of the other
    Wt1 = {x: np_tensor(tni[x] / (tni[x] + tai[x])) for x in phase_cat}
    Wt0 = {x: np_tensor(tai[x] / (tni[x] + tai[x])) for x in phase_cat}
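# `get_count` and `np_tensor` are not defined in this snippet. Hypothetical
# sketches, assuming case_data[x] is a pandas DataFrame with a 'Path' column
# whose study paths contain the label string ('positive'/'negative'), as in the
# usual MURA layout, and that the weights are consumed as float tensors:
def get_count(df, cat):
    # number of studies whose path contains the given label string
    return df[df['Path'].str.contains(cat)].shape[0]

def np_tensor(x):
    return torch.tensor(x, dtype=torch.float)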
def test_dataloader_determinism(self, dataset_name, device):
    """The randomness with which the training set is shuffled should be
    deterministic. So the order of the training batches is random but
    constant.
    """
    train_loader1, _ = get_dataloaders(dataset_name, seed=0, device=device,
                                       train_batch_size=3, test_batch_size=4)
    X1, y1 = next(iter(train_loader1))

    train_loader2, _ = get_dataloaders(dataset_name, seed=0, device=device,
                                       train_batch_size=3, test_batch_size=4)
    X2, y2 = next(iter(train_loader2))

    assert torch.equal(X1, X2)
    assert torch.equal(y1, y2)
def __init__(self, config: ConfigClass, save_dir: str, log_name='Variational_Trainer'):
    super(VariationalTrainer, self).__init__(config, save_dir, log_name)

    self.data_loaders = get_dataloaders(config.data)
    self.main_logger.info(self.data_loaders.msg)

    # start the loss factors at 0 when warm-up is enabled, otherwise at the
    # configured values
    self.starting_kld_factor = 0 if self.loss_cfg.kld_warmup else self.loss_cfg.kld_factor
    self.starting_mse_factor = 0 if self.loss_cfg.mse_warmup else self.loss_cfg.mse_factor

    self._init_train_components()
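# The warm-up schedule itself lives elsewhere in the trainer; a minimal sketch
# of how such a factor is typically ramped, assuming a linear schedule over a
# fixed number of steps (`warmup_steps` and the linear shape are assumptions,
# not taken from this file):
def warmup_factor(step, warmup_steps, start, target):
    # linearly interpolate from `start` to `target`, then hold at `target`
    if step >= warmup_steps:
        return target
    return start + (target - start) * step / warmup_steps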
def test_cifar_wrong_numchannels(self, device):
    """num_channels=2 is invalid and should be caught."""
    with pytest.raises(ValueError):
        train_loader, test_loader = get_dataloaders(
            dataset='cifar-10',
            device=device,
            train_batch_size=3,
            test_batch_size=4,
            augment=True,
            resize_to=(48, 48),
            num_channels=2,
        )
def test_dataloader_samplers(self, dataset_name, device):
    """The training set should be shuffled, while the test set should be
    yielded in sequential, unshuffled order.
    """
    train_loader, test_loader = get_dataloaders(dataset_name,
                                                device=device,
                                                train_batch_size=3,
                                                test_batch_size=4)
    assert isinstance(train_loader.batch_sampler.sampler,
                      torch.utils.data.sampler.RandomSampler)
    assert isinstance(test_loader.batch_sampler.sampler,
                      torch.utils.data.sampler.SequentialSampler)
def test_get_dataloaders(self):
    train, test = get_dataloaders(batch_size=16)
    for k, data in enumerate(train):
        images, labels = data
        print(images.shape, labels.shape)
        images, labels = images.numpy(), labels.numpy()
        # drop the singleton channel dimension: (B, 1, H, W) -> (B, H, W)
        images = images.squeeze(1)
        for j in range(16):
            pl.subplot(4, 4, j + 1)
            pl.imshow(images[j, :, :])
            pl.title(labels[j])
            pl.axis('off')
        pl.show()
def test_dataset_successful_iter(self, dataset_name, device, num_channels):
    """Any error in the data preparation may cause runtime errors that would
    not be visible otherwise.
    """
    train_loader, test_loader = get_dataloaders(
        dataset_name,
        device=device,
        train_batch_size=3,
        test_batch_size=4,
        augment=True,
        resize_to=(48, 48),
        num_channels=num_channels,
    )
    x, y = next(iter(train_loader))
    assert True  # reaching this point means iteration succeeded
def test_dataset_resize(self, dataset_name, device):
    """Test that passing resize_to actually resizes images to the correct
    dimensions.
    """
    train_loader, test_loader = get_dataloaders(
        dataset_name,
        device=device,
        train_batch_size=3,
        test_batch_size=4,
        augment=True,
        resize_to=(48, 48),
        num_channels=1,
    )
    x, y = next(iter(train_loader))
    assert tuple(x.shape[-2:]) == (48, 48)
def main(args):
    train_dataloader, dev_dataloader = get_dataloaders(args)
    print('Dataloaders obtained')

    features = np.load(args.train_videos_npz_path)
    print('Features obtained')

    device = torch.device(args.device)
    print('Device: ', device)

    model = R2plus1D_18()
    print('MODEL\n', model)
    model.to(device)

    optimizer = torch.optim.AdamW(model.parameters(),
                                  lr=args.learning_rate,
                                  eps=args.epsilon)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer)
    criterion = torch.nn.BCEWithLogitsLoss()

    print('TRAINING...')
    training_stats = []
    for epoch_i in tqdm(range(0, args.epochs)):
        avg_train_loss, train_acc, train_mcc = train(
            model, train_dataloader, features, device, criterion, optimizer)
        print(avg_train_loss, train_acc, train_mcc)

        avg_dev_loss, dev_acc, dev_mcc = valid(
            model, dev_dataloader, features, device, criterion, scheduler)
        print(avg_dev_loss, dev_acc, dev_mcc)

        training_stats.append([avg_train_loss, train_acc, train_mcc,
                               avg_dev_loss, dev_acc, dev_mcc])
        torch.save(model, str(args.save_model_path + str(epoch_i) + '_model.pth'))

    print('TRAINING COMPLETED')

    # Show training results
    col_names = ['train_loss', 'train_acc', 'train_mcc',
                 'dev_loss', 'dev_acc', 'dev_mcc']
    training_stats = pd.DataFrame(training_stats, columns=col_names)
    print(training_stats.head(args.epochs))
def test_dataloader_batchsizes(self, dataset_name, device):
    """Test that the dataloaders yield batches of the requested sizes."""
    train_loader, test_loader = get_dataloaders(dataset_name,
                                                device=device,
                                                train_batch_size=3,
                                                test_batch_size=4)
    assert train_loader.batch_size == 3
    X_train_batch, y_train_batch = next(iter(train_loader))
    assert X_train_batch.shape[0] == 3
    assert y_train_batch.shape[0] == 3

    assert test_loader.batch_size == 4
    X_test_batch, y_test_batch = next(iter(test_loader))
    assert X_test_batch.shape[0] == 4
    assert y_test_batch.shape[0] == 4
def test_dataloaders():
    head("Testing the dataloaders")
    try:
        datasetroot = data._DEFAULT_COMMONVOICE_ROOT
        datasetversion = data._DEFAULT_COMMONVOICE_VERSION
        use_cuda = False
        B = 10
        nthreads = 2
        train_augment = False
        min_duration = 1  # s.
        max_duration = 5  # s.

        loaders = data.get_dataloaders(datasetroot, datasetversion,
                                       cuda=use_cuda,
                                       batch_size=B,
                                       n_threads=nthreads,
                                       min_duration=min_duration,
                                       max_duration=max_duration,
                                       small_experiment=False,
                                       train_augment=train_augment,
                                       nmels=data._DEFAULT_NUM_MELS,
                                       logger=None)
        train_loader, valid_loader, test_loader = loaders

        minibatch = next(iter(train_loader))
        info(f"[1/] Got a minibatch of type {type(minibatch)}")
        if not isinstance(minibatch, tuple) or len(minibatch) != 2:
            fail("Expected a minibatch to be a tuple (spectrograms, transcripts)")
        else:
            succeed()

        packed_batch, packed_transcripts = minibatch
        info(f"[2/] Got two items of type {type(packed_batch), type(packed_transcripts)}")
        if not isinstance(packed_batch, PackedSequence) or \
                not isinstance(packed_transcripts, PackedSequence):
            fail("Expected two PackedSequence")
        else:
            succeed()
    except Exception:
        fail(f"{sys.exc_info()[0]}")
        if _RERAISE:
            raise
def test_get_loaders(dataset):
    """Test dataloader.

    :param dataset: Dataset to use
    :type dataset: str
    """
    logging.info(f"Loading dataloaders for {dataset}")
    loaders = get_dataloaders(
        dataset,
        root=f"datasets/{dataset}",
        batch_size=128,
        test_batch_size=128,
        validation_split=0.1,
        fixed_shuffle=True,
    )

    logging.info(f"Looping through dataset {dataset}")
    for loader in loaders:
        _loader_loop(loader)
def __init__(self, hparams: Config):
    super().__init__()
    torch.set_num_threads(15)
    self.hparams = sanitize_dict(copy.copy(vars(hparams)))  # TODO: clean up

    self.dataloaders, self.regression, self.output_dim = get_dataloaders(
        hparams.sample_size, hparams.seed, hparams.batch_size, hparams.target)

    self.learning_rate = hparams.learning_rate
    self.weight_decay = hparams.weight_decay
    self.channels = hparams.channels
    self.dropout = hparams.dropout
    self.lr_decay = hparams.lr_decay

    self.build_model()
def __init__(self, hparams: Config):
    super().__init__()
    torch.set_num_threads(8)
    self.hparams = sanitize_dict(copy.copy(vars(hparams)))

    self.dataloaders, self.regression, self.output_dim = get_dataloaders(
        hparams.sample_size, hparams.repetition_num, hparams.sample_splits_dir,
        hparams.num_workers, hparams.batch_size, hparams.scorename)

    self.learning_rate = hparams.learning_rate
    self.weight_decay = hparams.weight_decay
    self.channels = hparams.channels
    self.dropout = hparams.dropout
    self.lr_decay = hparams.lr_decay

    self.build_model()
def evaluate_one_pass(config: ConfigClass, model):
    device = torch.device(
        f'cuda:{config.gpu_node}' if torch.cuda.is_available() else 'cpu')

    # Create dataloader class
    loader_class = get_dataloaders(config.data)
    segment_metrics = SegmentationMetrics(num_classes=loader_class.num_classes,
                                          threshold=config.binarize_threshold)

    # Evaluate model
    model.eval()
    with torch.no_grad():
        for batch in loader_class.evaluation_loader:
            x, y = batch
            x = x.to(device=device, non_blocking=True)
            y = y.to(device=device, non_blocking=True)

            y_pred = model(x)
            segment_metrics.update((y_pred, y))

    metrics = segment_metrics.compute()
    return metrics
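# A minimal usage sketch, not part of the original code: the three evaluation
# helpers above share the same config and return the same metrics structure, so
# a caller can compare deterministic, MC-dropout, and ensemble predictions side
# by side. `model` and `ensemble_models` are placeholders.
def compare_evaluations(config: ConfigClass, model, ensemble_models):
    results = {
        'single_pass': evaluate_one_pass(config, model),
        'mc_dropout': evaluate_monte_carlo(config, model),
        'ensemble': evaluate_ensemble(config, ensemble_models),
    }
    for name, metrics in results.items():
        print(name, metrics)
    return results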
def training(local_rank, config):
    config["device"] = "cuda" if config["active_gpu_ids"] else "cpu"

    rank = idist.get_rank()
    manual_seed(config["seed"] + rank)
    device = idist.device()

    logger = setup_logger(name="Carbon Black Semantic Segmentation Training",
                          distributed_rank=local_rank)
    log_basic_info(logger, config)

    output_path = config["output_path"]
    if rank == 0:
        if config["stop_iteration"] is None:
            now = utils.get_time_stamp()
        else:
            now = f"stop-on-{config['stop_iteration']}"

        folder_name = (
            f"{config['architecture']}-{config['encoder']}-{config['encoder_weights']}_"
            f"backend-{idist.backend()}-{idist.get_world_size()}_{now}")
        output_path = Path(output_path) / folder_name
        output_path.mkdir(parents=True, exist_ok=True)
        config["output_path"] = output_path.as_posix()
        config["task_name"] = output_path.stem
        logger.info(f"Output path: {output_path}")

        if "cuda" in idist.device().type:
            config["cuda_device_name"] = torch.cuda.get_device_name(local_rank)

        setup_trains_logging(config)

    dataloader_train, dataloader_val = get_dataloaders(config)
    config["num_iterations_per_epoch"] = len(dataloader_train)
    config["num_epochs"] = round(config["num_iterations"] /
                                 config["num_iterations_per_epoch"])

    model = modeling.get_model(config)
    optimizer = get_optimizer(model, config)
    loss = get_loss()
    lr_scheduler = get_lr_scheduler(optimizer, config)

    trainer = create_trainer(model, optimizer, loss, lr_scheduler,
                             dataloader_train.sampler, config, logger)

    metrics = get_metrics(loss)

    # We define two evaluators, as they won't have exactly the same roles:
    # `evaluator` will save the best model based on the validation score
    evaluator = create_supervised_evaluator(model, metrics=metrics,
                                            device=device, non_blocking=True)
    evaluator_train = create_supervised_evaluator(model, metrics=metrics,
                                                  device=device, non_blocking=True)

    if rank == 0:
        # Set up TensorBoard logging on trainer and evaluators. Logged values are:
        # - Training metrics, e.g. running average loss values
        # - Learning rate
        # - Evaluation train/test metrics
        evaluators = {"training": evaluator_train, "validation": evaluator}
        tb_logger = common.setup_tb_logging(output_path, trainer, optimizer,
                                            evaluators=evaluators)
        example_prediction_logger = ExamplePredictionLogger(tb_logger, model, device)

    def run_validation(engine):
        epoch = trainer.state.epoch

        state = evaluator_train.run(dataloader_train)
        data_subset = "Train"
        log_metrics(logger, epoch, state.times["COMPLETED"], data_subset,
                    state.metrics)
        log_confusion_matrix(tb_logger, epoch, data_subset, state.metrics)

        state = evaluator.run(dataloader_val)
        data_subset = "Val"
        log_metrics(logger, epoch, state.times["COMPLETED"], data_subset,
                    state.metrics)
        log_confusion_matrix(tb_logger, epoch, data_subset, state.metrics)
        example_prediction_logger.log_visualization(dataloader_val.dataset, epoch)

    trainer.add_event_handler(
        Events.EPOCH_COMPLETED(every=config["validate_every"]) | Events.COMPLETED,
        run_validation)

    # Store the 3 best models by validation accuracy:
    common.gen_save_best_models_by_val_score(
        save_handler=get_save_handler(config),
        evaluator=evaluator,
        models={"model": model},
        metric_name="accuracy",
        n_saved=3,
        trainer=trainer,
        tag="validation",
    )

    # TODO: Add early stopping

    # In order to check training resuming, we can stop training on a given iteration
    if config["stop_iteration"] is not None:
        @trainer.on(Events.ITERATION_STARTED(once=config["stop_iteration"]))
        def _():
            logger.info(f"Stop training on {trainer.state.iteration} iteration")
            trainer.terminate()

    # noinspection PyBroadException
    try:
        trainer.run(dataloader_train, max_epochs=config["num_epochs"])
    except Exception:
        import traceback
        print(traceback.format_exc())

    if rank == 0:
        # noinspection PyUnboundLocalVariable
        tb_logger.close()
def train_and_eval(tag, dataroot, metric='last', save_path=None,
                   only_eval=False, unsupervised=False, mode=None):
    max_epoch = C.get()['epoch']
    trainloader, unsuploader, testloader = get_dataloaders(
        C.get()['dataset'], C.get()['batch'], C.get()['batch_unsup'],
        dataroot, mode=mode, n_labeled=args.n_labeled)

    # create a model & an optimizer
    model = get_model(C.get()['model'], num_class(C.get()['dataset']),
                      data_parallel=True)

    criterion = nn.CrossEntropyLoss()
    if C.get()['optimizer']['type'] == 'sgd':
        optimizer = optim.SGD(model.parameters(),
                              lr=C.get()['lr'],
                              momentum=C.get()['optimizer'].get('momentum', 0.9),
                              weight_decay=C.get()['optimizer']['decay'],
                              nesterov=C.get()['optimizer']['nesterov'])
    else:
        raise ValueError('invalid optimizer type=%s' % C.get()['optimizer']['type'])

    lr_scheduler_type = C.get()['lr_schedule'].get('type', 'cosine')
    if lr_scheduler_type == 'cosine':
        t_max = C.get()['epoch']
        if C.get()['lr_schedule'].get('warmup', None):
            t_max -= C.get()['lr_schedule']['warmup']['epoch']
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                               T_max=t_max,
                                                               eta_min=0.)
    else:
        raise ValueError('invalid lr_scheduler=%s' % lr_scheduler_type)

    if C.get()['lr_schedule'].get('warmup', None):
        scheduler = GradualWarmupScheduler(
            optimizer,
            multiplier=C.get()['lr_schedule']['warmup']['multiplier'],
            total_epoch=C.get()['lr_schedule']['warmup']['epoch'],
            after_scheduler=scheduler)

    if not tag.strip():
        from metrics import SummaryWriterDummy as SummaryWriter
        logger.warning('tag not provided, no tensorboard log.')
    else:
        from tensorboardX import SummaryWriter
    writers = [SummaryWriter(logdir='./logs/%s/%s' % (tag, x))
               for x in ['train', 'test']]

    result = OrderedDict()
    epoch_start = 1
    if save_path and os.path.exists(save_path):
        data = torch.load(save_path)
        model.load_state_dict(data['model'])
        optimizer.load_state_dict(data['optimizer'])
        epoch_start = data['epoch']

    if only_eval:
        logger.info('evaluation only')
        model.eval()
        rs = dict()
        rs['test'] = run_epoch(model, testloader, unsuploader, criterion, None,
                               desc_default='*test', epoch=epoch_start,
                               writer=writers[1])
        for key, setname in itertools.product(['loss', 'top1', 'top5'],
                                              ['train', 'test']):
            result['%s_%s' % (key, setname)] = rs[setname][key]
        result['epoch'] = 0
        return result

    # train loop
    global best_valid_top1
    best_valid_loss = 10e10
    for epoch in range(epoch_start, max_epoch + 1):
        model.train()
        rs = dict()
        if args.train_mode == 'small':
            print('only small')
            rs['train'] = run_epoch(model, trainloader, unsuploader, criterion,
                                    optimizer, desc_default='train', epoch=epoch,
                                    writer=writers[0], verbose=True,
                                    unsupervised=False, scheduler=scheduler)
        else:
            rs['train'] = run_epoch(model, trainloader, unsuploader, criterion,
                                    optimizer, desc_default='train', epoch=epoch,
                                    writer=writers[0], verbose=True,
                                    unsupervised=unsupervised, scheduler=scheduler)
        if math.isnan(rs['train']['loss']):
            raise Exception('train loss is NaN.')

        model.eval()
        if epoch % (10 if 'cifar' in C.get()['dataset'] else 30) == 0 \
                or epoch == max_epoch:
            rs['test'] = run_epoch(model, testloader, unsuploader, criterion,
                                   None, desc_default='*test', epoch=epoch,
                                   writer=writers[1], verbose=True)

            if best_valid_top1 < rs['test']['top1']:
                best_valid_top1 = rs['test']['top1']

            if metric == 'last' or rs[metric]['loss'] < best_valid_loss:  # TODO
                if metric != 'last':
                    best_valid_loss = rs[metric]['loss']
                for key, setname in itertools.product(['loss', 'top1', 'top5'],
                                                      ['train', 'test']):
                    result['%s_%s' % (key, setname)] = rs[setname][key]
                result['epoch'] = epoch
            writers[1].add_scalar('test_top1/best', rs['test']['top1'], epoch)

            # save checkpoint
            if save_path:
                logger.info('save model@%d to %s' % (epoch, save_path))
                torch.save(
                    {
                        'epoch': epoch,
                        'log': {
                            'train': rs['train'].get_dict(),
                            'test': rs['test'].get_dict(),
                        },
                        'optimizer': optimizer.state_dict(),
                        'model': model.state_dict()
                    }, save_path)

    del model
    return result
if __name__ == '__main__':
    project = Project()

    # our hyperparameters
    params = {
        'lr': 0.001,
        'batch_size': 64,
        'epochs': 20,
        'model': 'resnet18-finetune'
    }

    logging.info(f'Using device={device} 🚀')

    # everything starts with the data
    train_dl, val_dl, test_dl = get_dataloaders(
        project.dataset_dir / "train",
        project.dataset_dir / "test",
        train_transform=train_transform,
        test_transform=test_transform,
        batch_size=params['batch_size'],
        pin_memory=True,
        num_workers=4,
    )

    # it is always good practice to visualise some of the train and val images
    # to be sure data augmentation is applied properly
    show_dl(train_dl)
    show_dl(test_dl)

    # define our comet experiment
    experiment = Experiment(api_key="api_key",
                            project_name="project_name",
                            workspace="workspace_name")
    experiment.log_parameters(params)

    # create our special resnet18
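# `show_dl` is used above but not defined in this snippet. A plausible sketch,
# assuming it renders the first batch of a dataloader as an image grid so the
# augmentations can be eyeballed (the grid width of 8 is an arbitrary choice):
import matplotlib.pyplot as plt
import torchvision

def show_dl(dl, n=8):
    images, _ = next(iter(dl))
    grid = torchvision.utils.make_grid(images[:n], nrow=n)
    plt.imshow(grid.permute(1, 2, 0).cpu().numpy())
    plt.axis('off')
    plt.show()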
def main(pargs):
    # init distributed training
    comm_local_group = comm.init(pargs.wireup_method, pargs.batchnorm_group_size)
    comm_rank = comm.get_rank()
    comm_local_rank = comm.get_local_rank()
    comm_size = comm.get_size()
    comm_local_size = comm.get_local_size()

    # set up logging
    pargs.logging_frequency = max([pargs.logging_frequency, 1])
    log_file = os.path.normpath(
        os.path.join(pargs.output_dir, "logs", pargs.run_tag + ".log"))
    logger = mll.mlperf_logger(log_file, "deepcam", "Umbrella Corp.")
    logger.log_start(key="init_start", sync=True)
    logger.log_event(key="cache_clear")

    # set seed
    seed = pargs.seed
    logger.log_event(key="seed", value=seed)

    # some setup
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        device = torch.device("cuda", comm_local_rank)
        torch.cuda.manual_seed(seed)
        torch.cuda.set_device(device)
        torch.backends.cudnn.benchmark = True
    else:
        device = torch.device("cpu")

    # set up directories
    root_dir = os.path.join(pargs.data_dir_prefix)
    output_dir = pargs.output_dir
    plot_dir = os.path.join(output_dir, "plots")
    if comm_rank == 0:
        if not os.path.isdir(output_dir):
            os.makedirs(output_dir)

    # logging of rank information
    logger.log_event(key="number_of_ranks", value=comm_size)
    logger.log_event(key="number_of_nodes", value=(comm_size // comm_local_size))
    logger.log_event(key="accelerators_per_node", value=comm_local_size)

    # logging hyperparameters
    logger.log_event(key="global_batch_size", value=(pargs.local_batch_size * comm_size))
    logger.log_event(key="batchnorm_group_size", value=pargs.batchnorm_group_size)
    logger.log_event(key="gradient_accumulation_frequency",
                     value=pargs.gradient_accumulation_frequency)
    logger.log_event(key="checkpoint", value=pargs.checkpoint)

    # define architecture
    n_input_channels = len(pargs.channels)
    n_output_channels = 3
    net = deeplab_xception.DeepLabv3_plus(n_input=n_input_channels,
                                          n_classes=n_output_channels,
                                          os=16,
                                          pretrained=False,
                                          rank=comm_rank,
                                          process_group=comm_local_group)
    net.to(device)

    # select loss; the class weights are magic numbers from the dataset statistics
    loss_pow = -0.125
    class_weights = [
        0.986267818390377**loss_pow, 0.0004578708870701058**loss_pow,
        0.01327431072255291**loss_pow
    ]
    criterion = losses.CELoss(class_weights).to(device)
    criterion = torch.jit.script(criterion)

    # select optimizer
    optimizer = oh.get_optimizer(pargs, net, logger)

    # restart from checkpoint if desired
    if pargs.checkpoint is not None:
        checkpoint = torch.load(pargs.checkpoint, map_location=device)
        start_step = checkpoint['step']
        start_epoch = checkpoint['epoch']
        optimizer.load_state_dict(checkpoint['optimizer'])
        net.load_state_dict(checkpoint['model'])
    else:
        start_step = 0
        start_epoch = 0

    # broadcast model and optimizer state
    steptens = torch.tensor(np.array([start_step, start_epoch]),
                            requires_grad=False).to(device)
    if dist.is_initialized():
        dist.broadcast(steptens, src=0)

    # unpack the bcasted tensor
    start_step = int(steptens.cpu().numpy()[0])
    start_epoch = int(steptens.cpu().numpy()[1])

    # select scheduler
    scheduler = None
    if pargs.lr_schedule:
        pargs.lr_schedule["lr_warmup_steps"] = pargs.lr_warmup_steps
        pargs.lr_schedule["lr_warmup_factor"] = pargs.lr_warmup_factor
        scheduler = oh.get_lr_schedule(pargs.start_lr, pargs.lr_schedule,
                                       optimizer, logger, last_step=start_step)

    # print parameters
    if comm_rank == 0:
        print(net)
        print("Total number of elements:",
              sum(p.numel() for p in net.parameters() if p.requires_grad))

    # get input shapes for the upcoming model preprocessing
    tshape, _ = get_datashapes(pargs, root_dir)
    input_shape = tuple([tshape[2], tshape[0], tshape[1]])

    # distributed model parameters
    bucket_cap_mb = 25
    if pargs.batchnorm_group_size > 1:
        bucket_cap_mb = 220

    # get stream, relevant for graph capture
    ddp_net = DDP(net,
                  device_ids=[device.index],
                  output_device=device.index,
                  find_unused_parameters=False,
                  broadcast_buffers=False,
                  bucket_cap_mb=bucket_cap_mb,
                  gradient_as_bucket_view=False)

    # get stats handler here
    bnstats_handler = bns.BatchNormStatsSynchronize(ddp_net,
                                                    reduction="mean",
                                                    inplace=True)

    # create handles
    net_validate = ddp_net
    net_train = ddp_net

    # set up the data feeder
    train_loader, train_size, validation_loader, validation_size = get_dataloaders(
        pargs, root_dir, device, seed, comm_size, comm_rank)

    # log size of datasets
    logger.log_event(key="train_samples", value=train_size)
    val_size = validation_size
    logger.log_event(key="eval_samples", value=val_size)

    # get start steps
    step = start_step
    epoch = start_epoch
    current_lr = pargs.start_lr if not pargs.lr_schedule else scheduler.get_last_lr()[0]
    stop_training = False
    net_train.train()

    # start training
    logger.log_end(key="init_stop", sync=True)
    logger.log_start(key="run_start", sync=True)

    # training loop
    while True:
        # start epoch
        logger.log_start(key="epoch_start",
                         metadata={'epoch_num': epoch + 1, 'step_num': step},
                         sync=True)
        train_loader.sampler.set_epoch(epoch)

        # training
        step = train_step(pargs, comm_rank, comm_size, device, step, epoch,
                          net_train, criterion, optimizer, scheduler,
                          train_loader, logger)

        # average BN stats
        bnstats_handler.synchronize()

        # validation
        stop_training = validate(pargs, comm_rank, comm_size, device, step,
                                 epoch, net_validate, criterion,
                                 validation_loader, logger)

        # log the epoch
        logger.log_end(key="epoch_stop",
                       metadata={'epoch_num': epoch + 1, 'step_num': step},
                       sync=True)
        epoch += 1

        # save model if desired
        if (pargs.save_frequency > 0) and (epoch % pargs.save_frequency == 0):
            logger.log_start(key="save_start",
                             metadata={'epoch_num': epoch + 1, 'step_num': step},
                             sync=True)
            if comm_rank == 0:
                checkpoint = {
                    'step': step,
                    'epoch': epoch,
                    'model': net_train.state_dict(),
                    'optimizer': optimizer.state_dict()
                }
                torch.save(checkpoint,
                           os.path.join(output_dir,
                                        pargs.model_prefix + "_step_" + str(step) + ".cpt"))
            logger.log_end(key="save_stop",
                           metadata={'epoch_num': epoch + 1, 'step_num': step},
                           sync=True)

        # are we done?
        if (epoch >= pargs.max_epochs) or stop_training:
            break

    # run done
    logger.log_end(key="run_stop", sync=True, metadata={'status': 'success'})
def eval_tta(config, augment, reporter):
    C.get()
    C.get().conf = config
    cv_ratio_test, cv_fold, save_path = augment['cv_ratio_test'], \
        augment['cv_fold'], augment['save_path']

    # setup - provided augmentation rules
    C.get()['aug'] = policy_decoder(augment, augment['num_policy'], augment['num_op'])

    # eval
    model = get_model(C.get()['model'], num_class(C.get()['dataset']))
    ckpt = torch.load(save_path + '.pth')
    if 'model' in ckpt:
        model.load_state_dict(ckpt['model'])
    else:
        model.load_state_dict(ckpt)
    model = nn.DataParallel(model).cuda()
    model.eval()

    src_loaders = []
    # for _ in range(augment['num_policy']):
    _, src_tl, src_validloader, src_ttl = get_dataloaders(
        C.get()['dataset'], C.get()['batch'], augment['dataroot'],
        cv_ratio_test, cv_num, split_idx=cv_fold, target=False,
        random_range=C.get()['args'].random_range)
    del src_tl, src_ttl

    start_t = time.time()
    metrics = Accumulator()
    loss_fn = torch.nn.CrossEntropyLoss(reduction='none')
    emd_loss = nn.DataParallel(emdModule()).cuda()

    losses = []
    corrects = []
    for data in src_validloader:
        with torch.no_grad():
            point_cloud = data['point_cloud'].cuda()
            label = torch.ones_like(data['label'], dtype=torch.int64).cuda()
            trans_pc = data['transformed']

            pred = model(trans_pc)

            if C.get()['args'].use_emd_false:
                loss_emd = (torch.mean(emd_loss(point_cloud.permute(0, 2, 1),
                                                trans_pc.permute(0, 2, 1),
                                                0.05, 3000)[0])).unsqueeze(0) \
                    * C.get()['args'].emd_coeff
            else:
                loss_emd = torch.tensor([0.0])

            if C.get()['args'].no_dc:
                loss = loss_emd
            else:
                loss = loss_emd + loss_fn(pred, label)
            losses.append(loss.detach().cpu().numpy())

            pred = pred.max(dim=1)[1]
            pred = pred.t()
            correct = float(torch.sum(pred == label).item()) / pred.size(0) * 100
            corrects.append(correct)
            del loss, correct, pred, data, label, loss_emd

    losses = np.concatenate(losses)
    losses_min = np.min(losses, axis=0).squeeze()
    corrects_max = max(corrects)
    metrics.add_dict({
        'minus_loss': -1 * np.sum(losses_min),
        'correct': np.sum(corrects_max),
    })
    del corrects, corrects_max
    del model

    gpu_secs = (time.time() - start_t) * torch.cuda.device_count()
    reporter(minus_loss=metrics['minus_loss'],
             top1_valid=metrics['correct'],
             elapsed_time=gpu_secs,
             done=True)
    return metrics['minus_loss']