def test_dataloader_builder():
    # Multiple samplers require the custom collate function, wrapped in
    # a functools.partial by the builder.
    cfg = {"sampler": sampler_cfgs}
    dataloader = dataloader_builder.build(cfg)
    collate_fn1 = dataloader.collate_fn
    assert isinstance(collate_fn1, functools.partial)
    assert collate_fn1.func == _collate_fn

    # A single sampler falls back to the default collate function.
    cfg = {"sampler": sampler_cfg}
    dataloader = dataloader_builder.build(cfg)
    collate_fn2 = dataloader.collate_fn
    # TODO
    assert collate_fn2 == default_collate
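
# ---------------------------------------------------------------------------
# NOTE: The module-level fixtures `sampler_cfg` and `sampler_cfgs` used by
# the test above are not shown. A minimal sketch of what they might look
# like, inferred from the config shapes in the multi-dataset tests further
# down; all ids and values are illustrative assumptions.
# ---------------------------------------------------------------------------
dummy_dataset_cfg = {"name": "dummy", "id": "dummy1", "size": 100,
                     "data_dir": "/"}

# A single sequential sampler: the builder can use the default collate.
sampler_cfg = {
    "type": "sequential",
    "dataset": dummy_dataset_cfg,
    "batch_size": 10,
    "drop_last": True,
}

# Several samplers combined: the builder presumably wraps _collate_fn in a
# functools.partial so batches carry per-dataset 'split_info'.
sampler_cfgs = {
    "type": "concatenated_longest",
    "samplers": {"sampler1": sampler_cfg, "sampler2": sampler_cfg},
}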
def test_dataset_simple():
    dataloader = dataloader_builder.build(dataloader_cfg)
    for idx, data in enumerate(dataloader):
        assert 'img' in data
        assert 'path' in data
        assert isinstance(data['img'], torch.Tensor)
        assert data['path'] != ERROR_STRING
        if idx > 500:
            break
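
# ---------------------------------------------------------------------------
# NOTE: The dummy dataset used throughout these tests is not shown. A
# minimal sketch of a dataset that would satisfy the assertions above,
# assuming ERROR_STRING marks samples that failed to load; the class name
# and fields are illustrative, not the project's actual implementation.
# ---------------------------------------------------------------------------
import torch
from torch.utils.data import Dataset

ERROR_STRING = "ERROR"  # assumed placeholder path for broken samples


class DummyDataset(Dataset):
    """Synthetic dataset yielding the 'img' and 'path' keys the tests expect."""

    def __init__(self, dataset_id, size):
        self.dataset_id = dataset_id
        self.size = size

    def __len__(self):
        return self.size

    def __getitem__(self, idx):
        return {
            "img": torch.zeros(3, 32, 32),
            # Paths are prefixed with the dataset id, as asserted in
            # test_multi_dataset below.
            "path": "{}/{}.jpg".format(self.dataset_id, idx),
        }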
def test_reid_evaluation():
    dataloader = dataloader_builder.build(reid_cfg)
    # Restore the baseline model configuration from its checkpoint.
    model_cfgs = evaluation_model_builder.build(baseline_model_cfg)
    model = model_builder.build(model_cfgs[0])
    model = DataParallel(model)
    _run = {'config': {'device': torch.device('cuda')}}
    score = evaluate([dataloader], model, _run, "test")
    print(score)
def test_multi_dataset():
    size1 = 70
    size2 = 100
    dummy_cfg_small = {
        "name": "dummy",
        "id": "dummy_small",
        "size": size1,
        "data_dir": "/"
    }
    dummy_cfg_large = {
        "name": "dummy",
        "id": "dummy_large",
        "size": size2,
        "data_dir": "/"
    }
    sequential_cfg1 = {
        "type": "sequential",
        "dataset": dummy_cfg_small,
        "batch_size": 1,
        "drop_last": True
    }
    sequential_cfg2 = {
        "type": "sequential",
        "dataset": dummy_cfg_large,
        "batch_size": 1,
        "drop_last": True
    }
    sampler_cfg = {
        "type": "concatenated_longest",
        "samplers": {
            "sampler1": sequential_cfg1,
            "sampler2": sequential_cfg2
        }
    }
    dataloader_cfg = {"sampler": sampler_cfg}
    dataloader = dataloader_builder.build(dataloader_cfg)
    for idx, data in enumerate(dataloader):
        # Each batch concatenates one sample from each dataset, in
        # sampler order.
        assert data['path'][0].startswith("dummy_small")
        assert data['path'][1].startswith("dummy_large")
    # The "longest" strategy iterates until the largest dataset is
    # exhausted.
    expected_batches = max(size1, size2)
    print(expected_batches, idx)
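
# ---------------------------------------------------------------------------
# NOTE: The "concatenated_longest" sampler itself is not shown. One
# plausible implementation, assuming it cycles shorter samplers until the
# longest one is exhausted; a sketch under assumed semantics, not the
# project's actual class.
# ---------------------------------------------------------------------------
import itertools


class ConcatenatedLongestSampler:
    """Draws one batch from each child sampler and concatenates them.

    Shorter samplers are repeated (cycled) until the longest sampler is
    exhausted, so the combined length equals the longest child's length.
    """

    def __init__(self, samplers):
        self.samplers = samplers

    def __len__(self):
        return max(len(s) for s in self.samplers)

    def __iter__(self):
        longest = len(self)
        # itertools.cycle restarts exhausted samplers; islice stops
        # everything after `longest` combined batches.
        iterators = [itertools.islice(itertools.cycle(s), longest)
                     for s in self.samplers]
        for batches in zip(*iterators):
            # Flatten the per-sampler batches into one index list, in
            # sampler order.
            combined = []
            for batch in batches:
                combined.extend(batch)
            yield combined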
def build(cfg):
    # TODO: Make this consistent in terms of return values.
    evaluation_cfgs = dict()
    dataloaders = []
    # Every entry except 'model' is assumed to be a dataloader config;
    # iterating the dict directly would only yield its keys.
    for name, dataloader_cfg in cfg.items():
        if name == 'model':
            continue
        dataloader = dataloader_builder.build(dataloader_cfg)
        dataloaders.append(dataloader)
    model_cfgs = evaluation_model_builder.build(cfg['model'])
    # Overwrite restored values with those given in the config.
    for model_cfg in model_cfgs:
        model_cfg.update(cfg['model'])
    return evaluation_cfgs, model_cfgs
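
# ---------------------------------------------------------------------------
# NOTE: A hypothetical config illustrating the shape this builder expects,
# assuming every key next to 'model' names a dataloader configuration; all
# names and values here are illustrative.
# ---------------------------------------------------------------------------
evaluation_cfg = {
    "query": {"sampler": {"type": "sequential",
                          "dataset": {"name": "dummy", "id": "query",
                                      "size": 100, "data_dir": "/"},
                          "batch_size": 32}},
    "gallery": {"sampler": {"type": "sequential",
                            "dataset": {"name": "dummy", "id": "gallery",
                                        "size": 500, "data_dir": "/"},
                            "batch_size": 32}},
    "model": {"checkpoint": "/path/to/checkpoint.pth"},
}

evaluation_cfgs, model_cfgs = build(evaluation_cfg)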
def evaluate_checkpoint_on(restore_checkpoint, dataset_cfg, _run,
                           model_update_cfg=None):
    # Avoid a mutable default argument.
    if model_update_cfg is None:
        model_update_cfg = {}
    model_cfg, _, epoch = utils.restore_checkpoint(
        restore_checkpoint, model_cfg=model_update_cfg, map_location='cpu')
    # model_cfg['backbone']['output_dim'] = 256
    dataloaders = dataloader_builder.build(dataset_cfg)
    model = model_builder.build(model_cfg)
    # TODO needs to be from dataset
    mapping = model_cfg.get('seg_class_mapping')
    model.seg_mapping = mapping
    model = torch.nn.DataParallel(model, device_ids=_run.config['device_id'])
    model = model.cuda()
    return evaluate(dataloaders, model, epoch, keep=True)
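
# ---------------------------------------------------------------------------
# NOTE: utils.restore_checkpoint is not shown. A minimal sketch of the
# contract the callers above and below rely on, assuming a torch checkpoint
# dict with 'model_cfg', 'optimizer_cfg' and 'epoch' keys; the key names
# are assumptions.
# ---------------------------------------------------------------------------
import torch


def restore_checkpoint(path, model_cfg=None, optimizer_cfg=None,
                       map_location=None):
    """Load a checkpoint and merge the stored configs with any
    caller-supplied overrides (sketch of the assumed contract)."""
    checkpoint = torch.load(path, map_location=map_location)
    restored_model_cfg = checkpoint.get('model_cfg', {})
    restored_optimizer_cfg = checkpoint.get('optimizer_cfg', {})
    # Caller-supplied values take precedence over restored ones.
    if model_cfg:
        restored_model_cfg.update(model_cfg)
    if optimizer_cfg:
        restored_optimizer_cfg.update(optimizer_cfg)
    epoch = checkpoint.get('epoch', 0)
    return restored_model_cfg, restored_optimizer_cfg, epoch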
def run_train(dataloader_cfg, model_cfg, scheduler_cfg, optimizer_cfg,
              loss_cfg, validation_cfg, checkpoint_frequency,
              restore_checkpoint, max_epochs, _run):
    # Lets cuDNN benchmark conv implementations and choose the fastest.
    # Only good if sizes stay the same within the main loop!
    torch.backends.cudnn.benchmark = True
    exit_handler = ExitHandler()
    device = _run.config['device']
    device_id = _run.config['device_id']

    # During training there is just one dataloader.
    dataloader = dataloader_builder.build(dataloader_cfg)[0]

    epoch = 0
    if restore_checkpoint is not None:
        model_cfg, optimizer_cfg, epoch = utils.restore_checkpoint(
            restore_checkpoint, model_cfg, optimizer_cfg)

    def overwrite(to_overwrite, dic):
        to_overwrite.update(dic)
        return to_overwrite

    # Some models depend on the dataset, for example num_joints.
    model_cfg = overwrite(dataloader.dataset.info, model_cfg)
    model = model_builder.build(model_cfg)

    loss_cfg['model'] = model
    loss = loss_builder.build(loss_cfg)
    loss = loss.to(device)

    parameters = list(model.parameters()) + list(loss.parameters())
    optimizer = optimizer_builder.build(optimizer_cfg, parameters)
    lr_scheduler = scheduler_builder.build(scheduler_cfg, optimizer, epoch)

    if validation_cfg is None:
        validation_dataloaders = None
    else:
        validation_dataloaders = dataloader_builder.build(validation_cfg)
    keep = False

    file_logger = log.get_file_logger()
    logger = log.get_logger()

    model = torch.nn.DataParallel(model, device_ids=device_id)
    model.cuda()
    model = model.train()
    trained_models = []

    exit_handler.register(file_logger.save_checkpoint, model, optimizer,
                          "atexit", model_cfg)

    start_training_time = time.time()
    end = time.time()
    while epoch < max_epochs:
        epoch += 1
        lr_scheduler.step()
        logger.info("Starting Epoch %d/%d", epoch, max_epochs)
        len_batch = len(dataloader)
        acc_time = 0
        for batch_id, data in enumerate(dataloader):
            optimizer.zero_grad()
            endpoints = model(data, model.module.endpoints)
            logger.debug("datasets %s", list(data['split_info'].keys()))
            data.update(endpoints)
            # Theoretically, the losses could also be calculated in a
            # distributed fashion.
            losses = loss(endpoints, data)
            loss_mean = torch.mean(losses)
            loss_mean.backward()
            optimizer.step()

            acc_time += time.time() - end
            end = time.time()
            report_after_batch(_run=_run, logger=logger, batch_id=batch_id,
                               batch_len=len_batch, acc_time=acc_time,
                               loss_mean=loss_mean,
                               max_mem=torch.cuda.max_memory_allocated())

        if epoch % checkpoint_frequency == 0:
            path = file_logger.save_checkpoint(model, optimizer, epoch,
                                               model_cfg)
            trained_models.append(path)

        report_after_epoch(_run=_run, epoch=epoch, max_epoch=max_epochs)

        if validation_dataloaders is not None and \
                epoch % checkpoint_frequency == 0:
            model.eval()
            # Benchmark mode is only good if sizes stay the same, which
            # is not the case for segmentation, so it is disabled during
            # validation.
            torch.backends.cudnn.benchmark = False
            score = evaluate(validation_dataloaders, model, epoch, keep=keep)
            logger.info(score)
            log_score(score, _run, prefix="val_", step=epoch)
            torch.backends.cudnn.benchmark = True
            model.train()

    report_after_training(_run=_run, max_epoch=max_epochs,
                          total_time=time.time() - start_training_time)

    path = file_logger.save_checkpoint(model, optimizer, epoch, model_cfg)
    if path:
        trained_models.append(path)
    file_logger.close()

    # TODO: get best performing val model.
    evaluate_last = _run.config['training'].get('evaluate_last', 1)
    if len(trained_models) < evaluate_last:
        logger.info("Only saved %d models (evaluate_last=%d)",
                    len(trained_models), evaluate_last)
    return trained_models[-evaluate_last:]
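
# ---------------------------------------------------------------------------
# NOTE: log_score is not shown. Assuming _run is a Sacred Run object (the
# _run.config accesses above fit that), a minimal sketch that flattens a
# possibly nested score dict into Sacred's log_scalar metric API; the
# flattening scheme is an assumption.
# ---------------------------------------------------------------------------
def log_score(score, _run, prefix="", step=None):
    """Report each value of a (possibly nested) score dict as a Sacred
    scalar metric."""
    for name, value in score.items():
        metric_name = prefix + str(name)
        if isinstance(value, dict):
            # Recurse into per-dataset sub-scores.
            log_score(value, _run, prefix=metric_name + "/", step=step)
        else:
            _run.log_scalar(metric_name, float(value), step)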
def test_multi_dataset_loader(num_workers, sampler):
    P = 6
    K = 10
    dummy_cfg1 = {
        "name": "dummy",
        "num_pids": 100,
        "id": "dummy1",
        "size": 500,
        "data_dir": "/"
    }
    pk_cfg = {
        "type": "pk",
        "dataset": dummy_cfg1,
        "P": P,
        "K": K,
        "drop_last": True
    }
    dummy_cfg2 = {
        "name": "dummy",
        "id": "dummy2",
        "size": 750,
        "data_dir": "/"
    }
    sequential_cfg = {
        "type": "sequential",
        "dataset": dummy_cfg2,
        "batch_size": 70
    }
    sampler_cfg = {
        "type": sampler,
        "samplers": {
            "sampler1": pk_cfg,
            "sampler2": sequential_cfg
        }
    }
    dataloader_cfg = {
        "sampler": sampler_cfg,
        "num_workers": num_workers
    }
    dataloader = dataloader_builder.build(dataloader_cfg)

    start = time.time()
    for batch_data in dataloader:
        # Recover the per-dataset split from the sample paths ...
        actual_split = {"dummy1": [], "dummy2": []}
        for idx, dataset_name in enumerate(batch_data['path']):
            if dataset_name == "dummy1":
                actual_split["dummy1"].append(idx)
            elif dataset_name == "dummy2":
                actual_split["dummy2"].append(idx)
            else:
                raise RuntimeError
        # ... and check that it matches the split_info built by the
        # collate function.
        sampler_info = batch_data['split_info']
        for dataset, idxs in sampler_info.items():
            assert len(idxs) == len(actual_split[dataset])
            for a, b in zip(idxs, actual_split[dataset]):
                assert a == b

        idxs1 = sampler_info.get("dummy1")
        idxs2 = sampler_info.get("dummy2")
        if idxs1:
            print(np.array(batch_data['pid'])[idxs1])
        if idxs2:
            print(np.array(batch_data['pid'])[idxs2])
    print("Took {}".format(time.time() - start))
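
# ---------------------------------------------------------------------------
# NOTE: The custom _collate_fn checked in test_dataloader_builder is not
# shown; the builder apparently binds extra arguments to it via
# functools.partial. A minimal sketch of a collate function that would
# produce the 'split_info' mapping the assertions above rely on, assuming
# the dataset id is the first component of each sample's 'path'; this is
# an illustration, not the project's actual implementation.
# ---------------------------------------------------------------------------
from torch.utils.data.dataloader import default_collate


def _collate_fn(batch):
    """Collate a batch and record, per dataset id, the indices of the
    samples that came from it (the 'split_info' entry)."""
    split_info = {}
    for idx, sample in enumerate(batch):
        # Assumption: the dataset id is the first path component.
        dataset_id = sample['path'].split('/')[0]
        split_info.setdefault(dataset_id, []).append(idx)
    collated = default_collate(batch)
    collated['split_info'] = split_info
    return collated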