def calculate_flops_lut(self, file_name, input_size):
    """Measure the FLOPS of each encoder block and store them as a look-up table."""
    shared_config = torch.zeros(18, len(self.tasks))
    model = BranchMobileNetV2(tasks=['semseg'], branch_config=shared_config)
    in_shape = (1, 3, input_size[0], input_size[1])
    n_blocks = len(model.encoder)
    flops = torch.zeros(n_blocks, device='cpu')
    model.eval()
    with torch.no_grad():
        for idx, m in enumerate(model.encoder):
            # attach FLOPS counters to this block, run one forward pass,
            # then detach the counters again
            m = resources.add_flops_counting_methods(m)
            m.start_flops_count()
            cache_inputs = torch.rand(in_shape)
            _ = model(cache_inputs)
            block_flops = m.compute_average_flops_cost()
            m.stop_flops_count()
            flops[idx] = block_flops
    flops_dict = {'per_block_flops': flops.numpy().tolist()}
    del model
    # save the FLOPS to LUT
    utils.write_json(flops_dict, file_name)
    return flops_dict

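# Minimal usage sketch (hypothetical: `searcher` stands for an instance of the
# class defining calculate_flops_lut; the input size is illustrative). The LUT
# is written once and can be reloaded instead of re-measuring.
flops_dict = searcher.calculate_flops_lut('flops_lut.json', input_size=(512, 512))
per_block_flops = torch.tensor(flops_dict['per_block_flops'])  # one entry per encoder block
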
def handle(self):
    """ Stores job inside data_dir directory. """
    data_dir = os.path.join(
        self.job.work_dir,
        ExaParserConfig.get("disk_data_handler", "data_dir"))
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)
    write_json(os.path.join(data_dir, "job.json"), self.job.to_json())

def main(config):
    """
    Main function for training LSTMs. After training, results on the
    validation & test sets are recorded in the specified log_path.
    """
    dataset, train_loader, subgraph_loader = get_data(config)

    # define logger
    Path(config['log_path']).mkdir(parents=True, exist_ok=True)
    logger = loggers.TensorBoardLogger(config['log_path'],
                                       version=config['version'])
    logger.log_hyperparams(params=config)

    # define model
    model = Model(config, dataset, train_loader, subgraph_loader)
    chkpt = None if config['load'] is None else get_checkpoint_path(config['load'])
    trainer = pl.Trainer(gpus=config['gpus'],
                         logger=logger,
                         max_epochs=config['epochs'],
                         distributed_backend='dp',
                         precision=16 if config['use_amp'] else 32,
                         default_root_dir=config['log_path'],
                         deterministic=True,
                         resume_from_checkpoint=chkpt,
                         auto_lr_find=config['auto_lr'],
                         auto_scale_batch_size=config['auto_bsz'])
    trainer.fit(model)

    for phase in ['test', 'valid']:
        if phase == 'valid':
            trainer.eval_split = 'val'
            trainer.eval_mask = dataset.data.val_mask
            print(phase, trainer.eval_split)
        ret = trainer.test()
        if isinstance(ret, list):
            ret = ret[0]
        per_node = ret.pop('per_node')
        test_results = ret
        res_dir = Path(config['log_path']) / 'default'
        if config['version'] is not None:
            res_dir = res_dir / config['version']
        else:
            res_dir = res_dir / ('results_' + str(config['seed']))
        print(phase, ':', test_results)
        Path(res_dir).mkdir(parents=True, exist_ok=True)
        write_json(test_results, res_dir / f'{phase}_results.json',
                   sort_keys=True, verbose=True)
        write_pkl(per_node, res_dir / f'{phase}_per_node.pkl')
        path_results = Path(config['log_path']) / f'all_{phase}_results.csv'
        record_results(path_results, config, test_results)

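# Illustrative config sketch showing only the keys read directly in main()
# (get_data and Model consume further entries; all values are placeholders):
config = {
    'log_path': 'logs/run1', 'version': None, 'seed': 0,
    'gpus': 1, 'epochs': 50, 'use_amp': False, 'load': None,
    'auto_lr': False, 'auto_bsz': False,
}
main(config)
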
def test_branched(device, tasks, testloader, model, metrics_dict, exp_dir):
    model.eval()

    # get resources
    sample = next(iter(testloader))
    height, width = sample['image'].shape[-2:]
    gflops = resources.compute_gflops(model, device=device,
                                      in_shape=(1, 3, height, width))
    params = resources.count_parameters(model)
    results = {'gmadds': gflops / 2.0, 'mparams': params / 1e6}

    for idx, samples in enumerate(testloader):
        inputs = samples['image'].to(device, non_blocking=True)
        target = {
            task: samples[task].to(device, non_blocking=True)
            for task in tasks
        }
        im_size = tuple(x.item() for x in samples['meta']['im_size'])
        im_name = samples['meta']['image'][0]
        output = model(inputs)
        for task in tasks:
            uniq = torch.unique(target[task])
            if len(uniq) == 1 and uniq[0] == 255:
                continue
            ground_truth = torch.squeeze(target[task], dim=0).cpu().numpy()
            prediction = torch.squeeze(output[task], dim=0).cpu().numpy()
            # metrics want numpy array of format (H x W x C)
            ground_truth = ground_truth.transpose(1, 2, 0)
            prediction = prediction.transpose(1, 2, 0)
            metrics_dict[task].update(prediction, ground_truth, im_size, im_name)
        if (idx + 1) % 100 == 0:
            print('{} / {} images done.'.format(idx + 1, len(testloader)))

    for task in tasks:
        results['_'.join([task, metrics_dict[task].__class__.__name__])] = \
            metrics_dict[task].get_score()
    utils.write_json(results, Path(exp_dir) / 'eval.json')

def convert_timeseries_into_mmap(data_dir, save_dir, n_rows=100000):
    """ Read the timeseries csv files and convert the data into a mmap file. """
    save_path = Path(save_dir) / 'ts.dat'
    shape = (n_rows, 24, 34)
    write_file = np.memmap(save_path, dtype=np.float32, mode='w+', shape=shape)
    ids = []
    n = 0
    info = {'name': 'ts'}
    for split in ['train', 'val', 'test']:
        print('split: ', split)
        csv_path = Path(data_dir) / split / 'timeseries.csv'
        df = pd.read_csv(csv_path)
        arr = df.values
        # 24 consecutive rows per patient; first column is the patient id
        new = np.reshape(arr, (-1, 24, 35))
        pos_to_id = new[:, 0, 0]
        ids.append(pos_to_id)
        new = new[:, :, 1:]  # no patient column
        write_file[n:n + len(new), :, :] = new
        info[split + '_len'] = len(new)
        n += len(new)
        del new, arr
    info['total'] = n
    info['shape'] = shape
    info['columns'] = list(df)[1:]
    del df
    # mappings between patient ids and row positions in the mmap
    ids = np.concatenate(ids)
    id2pos = {pid: pos for pos, pid in enumerate(ids)}
    pos2id = {pos: pid for pos, pid in enumerate(ids)}
    assert len(set(ids)) == len(ids)
    print('saving..')
    write_pkl(id2pos, Path(save_dir) / 'id2pos.pkl')
    write_pkl(pos2id, Path(save_dir) / 'pos2id.pkl')
    write_json(info, Path(save_dir) / 'ts_info.json')
    print(info)

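# Read-back sketch (read_json/read_pkl are hypothetical readers mirroring the
# writers used above; some_patient_id is a placeholder): reopen the mmap with
# the shape stored in ts_info.json and index it via the id mapping.
info = read_json(Path(save_dir) / 'ts_info.json')
ts = np.memmap(Path(save_dir) / 'ts.dat', dtype=np.float32, mode='r',
               shape=tuple(info['shape']))
id2pos = read_pkl(Path(save_dir) / 'id2pos.pkl')
window = ts[id2pos[some_patient_id]]  # -> (24, 34) array for one patient
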
def __init__(self, config, resume=None, modification=None):
    """
    Class to parse the configuration json file. Handles hyper-parameters for
    training, initialization of modules, checkpoint saving and the logging module.

    :param config: Dict containing configurations and hyper-parameters for
        training, e.g. the contents of a `config.json` file.
    :param resume: String, path to the checkpoint being loaded.
    :param modification: Dict keychain:value, specifying position values to be
        replaced from the config dict.
    """
    # load config file and apply modification
    self._config = _update_config(config, modification)
    self.resume = resume

    # str to bool, from modification or from default json file
    # (accept a real bool or the string 'true'; anything else, including
    # the string 'false', becomes False)
    self.update_config('distributed',
                       self.config['distributed'] in (True, 'true'))

    if self.config['local_rank'] == 0:
        # only the local master process creates the output dirs where the
        # trained model and logs will be saved.
        save_dir = Path(self.config['trainer']['save_dir'])
        experiment_name = self.config['name']
        self.save_dir = save_dir / 'models' / experiment_name
        self.log_dir = save_dir / 'log' / experiment_name
        self.tensorboard_dir = save_dir / 'tensorboard' / experiment_name

        # make directories for saving checkpoints and logs
        self.save_dir.mkdir(parents=True, exist_ok=False)
        self.log_dir.mkdir(parents=True, exist_ok=False)
        self.tensorboard_dir.mkdir(parents=True, exist_ok=False)

        # save the updated config file to the checkpoint dir (local master only)
        write_json(self.config, self.save_dir / 'config.json')

        # configure the logging module (local master only)
        setup_logging(self.log_dir)

    self.log_levels = {
        0: logging.WARNING,
        1: logging.INFO,
        2: logging.DEBUG
    }

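# Minimal construction sketch (assuming the enclosing class is named
# ConfigParser; only the keys this __init__ touches are shown, real configs
# carry many more entries):
config = {
    'name': 'experiment1',
    'local_rank': 0,
    'distributed': 'false',              # parsed to a bool above
    'trainer': {'save_dir': 'saved/'},   # models/, log/, tensorboard/ go here
}
parser = ConfigParser(config)
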
def main_test(hparams, path_results=None):
    """ Main function to load and evaluate a trained model. """
    assert (hparams['load'] is not None) and (hparams['phase'] is not None)
    phase = hparams['phase']
    log_dir = hparams['load']

    # Load trained model
    print(f'Loading from {log_dir} to evaluate {phase} data.')
    model, config, dataset, train_loader, subgraph_loader = Model.load_model(
        log_dir,
        multi_gpu=hparams['multi_gpu'],
        num_workers=hparams['num_workers'])
    trainer = pl.Trainer(gpus=hparams['gpus'],
                         logger=None,
                         max_epochs=hparams['epochs'],
                         default_root_dir=hparams['log_path'],
                         deterministic=True)

    # Evaluate the model
    if phase == 'valid':
        trainer.eval_split = 'val'
        trainer.eval_mask = dataset.data.val_mask
        print(phase, trainer.eval_split)
    test_results = trainer.test(model)
    if isinstance(test_results, list):
        test_results = test_results[0]
    per_node = test_results.pop('per_node')
    print(phase, ':', test_results)

    # Save evaluation results
    results_path = Path(log_dir) / f'{phase}_results.json'
    write_json(test_results, results_path, sort_keys=True, verbose=True)
    write_pkl(per_node, Path(log_dir) / f'{phase}_per_node.pkl')
    if path_results is None:
        path_results = Path(log_dir).parent / 'results.csv'
    tmp = {'version': hparams['version']}
    tmp = {**tmp, **config}
    record_results(path_results, tmp, test_results)

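# Illustrative hparams for main_test (placeholder values; model settings are
# restored from the saved config in the load directory):
hparams = {'load': 'logs/run1/version_0', 'phase': 'valid', 'version': 0,
           'multi_gpu': False, 'num_workers': 4, 'gpus': 1,
           'epochs': 1, 'log_path': 'logs/run1'}
main_test(hparams)
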
def convert_into_mmap(data_dir, save_dir, csv_name, n_cols=None, n_rows=100000):
    """ Read a csv file and convert the flat data into a mmap file. """
    csv_to_cols = {
        'diagnoses': 520,
        'diagnoses_1033': 1034,
        'labels': 5,
        'flat': 58
    }  # including patient column
    n_cols = (csv_to_cols[csv_name] - 1) if n_cols is None else n_cols
    shape = (n_rows, n_cols)
    save_path = Path(save_dir) / f'{csv_name}.dat'
    write_file = np.memmap(save_path, dtype=np.float32, mode='w+', shape=shape)
    info = {'name': csv_name, 'shape': shape}
    n = 0
    for split in ['train', 'val', 'test']:
        print('split: ', split)
        csv_path = Path(data_dir) / split / f'{csv_name}.csv'
        df = pd.read_csv(csv_path)
        arr = df.values[:, 1:]  # cut out patient column
        arr_len = len(arr)
        write_file[n:n + arr_len, :] = arr  # write into mmap
        info[split + '_len'] = arr_len
        n += arr_len
        del arr
    info['total'] = n
    info['columns'] = list(df)[1:]
    write_json(info, Path(save_dir) / f'{csv_name}_info.json')
    print(info)

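# Usage sketch (directory names are illustrative): convert each flat table
# once; the per-split row counts end up in the corresponding *_info.json.
for name in ['diagnoses', 'labels', 'flat']:
    convert_into_mmap('data/raw', 'data/mmap', csv_name=name)
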
def main_test(hparams, path_results=None):
    """ Main function to load and evaluate a trained model. """
    assert (hparams['load'] is not None) and (hparams['phase'] is not None)
    phase = hparams['phase']
    log_dir = hparams['load']

    # Load trained model
    print(f'Loading from {log_dir} to evaluate {phase} data.')
    model, config, loaderDict, collate = DynamicGraphModel.load_model(
        log_dir,
        data_dir=hparams['data_dir'],
        multi_gpu=hparams['multi_gpu'],
        num_workers=hparams['num_workers'])
    trainer = pl.Trainer(gpus=hparams['gpus'],
                         logger=None,
                         max_epochs=hparams['epochs'],
                         default_root_dir=hparams['log_path'],
                         deterministic=True)

    # Evaluate the model
    test_dataloader = DataLoader(loaderDict[phase],
                                 collate_fn=collate,
                                 batch_size=config['batch_size'],
                                 num_workers=config['num_workers'],
                                 shuffle=False)
    test_results = trainer.test(model, test_dataloaders=test_dataloader)
    if isinstance(test_results, list):
        test_results = test_results[0]
    per_node = test_results.pop('per_node')
    print(phase, ':', test_results)

    # Save evaluation results
    results_path = Path(log_dir) / f'{phase}_results.json'
    write_json(test_results, results_path, sort_keys=True, verbose=True)
    write_pkl(per_node, Path(log_dir) / f'{phase}_per_node.pkl')
    if path_results is None:
        path_results = Path(log_dir).parent / 'results.csv'
    tmp = {'version': hparams['version']}
    tmp = {**tmp, **config}
    record_results(path_results, tmp, test_results)

def train_search(device, start_epoch, max_epochs, tasks, trainloader_weight,
                 trainloader_arch, model, loss, optimizer_weight,
                 optimizer_arch, exp_dir):
    writer = SummaryWriter(log_dir=exp_dir)
    iter_per_epoch = len(trainloader_weight.dataset) // trainloader_weight.batch_size
    total_iter = iter_per_epoch * max_epochs
    delay_epochs = max_epochs // 20
    model.train()

    for epoch in range(start_epoch, max_epochs + 1):
        model.warmup_flag = (epoch <= delay_epochs)
        # set the gumbel temperature according to a linear schedule
        model.gumbel_temp = min(
            5.0 - (epoch - delay_epochs - 1) /
            (max_epochs - delay_epochs - 1) * (5.0 - 0.1), 5.0)

        arch_loss = 0
        arch_counter = 0
        if epoch > delay_epochs:
            print('modifying architecture...')
            # we reset the arch optimizer state
            optimizer_arch.state = defaultdict(dict)
            # we use current batch statistics in search period
            model.freeze_encoder_bn_running_stats()
            for samples_search in trainloader_arch:
                inputs_search = samples_search['image'].to(device, non_blocking=True)
                target_search = {
                    task: samples_search[task].to(device, non_blocking=True)
                    for task in tasks
                }
                optimizer_arch.zero_grad()
                for task in tasks:
                    # many images don't have human parts annotations, skip those
                    uniq = torch.unique(target_search[task])
                    if len(uniq) == 1 and uniq[0] == 255:
                        continue
                    output = model(inputs_search, task=task)
                    tot_loss = loss(output, target_search, task=task)
                    tot_loss.backward()
                    arch_loss += tot_loss.item()
                    arch_counter += 1
                optimizer_arch.step()
            # we reset the main optimizer state because arch has changed
            optimizer_weight.state = defaultdict(dict)
            # we should reset bn running stats
            model.unfreeze_encoder_bn_running_stats()
            model.reset_encoder_bn_running_stats()

        for batch_idx, samples in enumerate(trainloader_weight):
            inputs = samples['image'].to(device, non_blocking=True)
            target = {
                task: samples[task].to(device, non_blocking=True)
                for task in tasks
            }
            current_loss = 0
            counter = 0
            for task in tasks:
                # many images don't have human parts annotations, skip those
                uniq = torch.unique(target[task])
                if len(uniq) == 1 and uniq[0] == 255:
                    continue
                optimizer_weight.zero_grad()
                output = model(inputs, task=task)
                tot_loss = loss(output, target, task=task, omit_resource=True)
                tot_loss.backward()
                optimizer_weight.step()
                current_loss += tot_loss.item()
                counter += 1
            if (batch_idx + 1) % 100 == 0:
                n_iter = (epoch - 1) * iter_per_epoch + batch_idx + 1
                print('Train Iterations: {}, Loss: {:.4f}'.format(
                    utils.progress(n_iter, total_iter), current_loss / counter))
                writer.add_scalar('loss_current', current_loss / counter, n_iter)
                writer.add_scalar('arch_loss',
                                  arch_loss / max(1, arch_counter), n_iter)
                writer.add_scalar('gumbel_temp', model.gumbel_temp, n_iter)
                for name, param in model.named_arch_parameters():
                    writer.add_image(name,
                                     torch.nn.functional.softmax(param.data, dim=-1),
                                     n_iter, dataformats='HW')

        # save model
        state = {
            'state_dict': model.state_dict(),
            'tasks': tasks,
            'epoch': epoch,
            'optimizer_weight': optimizer_weight.state_dict(),
            'optimizer_arch': optimizer_arch.state_dict(),
        }
        torch.save(state, Path(exp_dir) / 'checkpoint.pth')
        branch_config = model.get_branch_config()
        utils.write_json({'config': branch_config},
                         Path(exp_dir) / 'branch_config.json')

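# Sketch of the gumbel temperature schedule used above (illustrative numbers):
# the min() clamp holds the temperature at 5.0 through the warm-up epochs,
# after which it decays linearly to 0.1 at the final epoch.
def gumbel_temp_at(epoch, max_epochs=100):
    delay_epochs = max_epochs // 20
    return min(5.0 - (epoch - delay_epochs - 1) /
               (max_epochs - delay_epochs - 1) * (5.0 - 0.1), 5.0)

assert abs(gumbel_temp_at(6) - 5.0) < 1e-9    # first search epoch: temp = 5.0
assert abs(gumbel_temp_at(100) - 0.1) < 1e-9  # final epoch: temp = 0.1
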
# log the average stats accumulated over the last interval
stats_log = [
    '%s: %.4f' % (v, np.mean(stats[k]))
    for k, v in stats_str if len(stats[k]) > 0
]
stats_log.append('%i samples/s' % int(n_words_proc / (time.time() - tic)))
logger.info(('%06i - ' % n_iter) + ' - '.join(stats_log))

# reset
tic = time.time()
n_words_proc = 0
for k, _ in stats_str:
    del stats[k][:]
if params.print_grads:
    write_json(os.path.join(params.exp_path,
                            'weights_{}.json'.format(n_epoch)), weights)
    write_json(os.path.join(params.exp_path,
                            'grads_{}.json'.format(n_epoch)), grads)

# embeddings / discriminator evaluation
to_log = OrderedDict({'n_epoch': n_epoch})
evaluator.all_eval(to_log)
evaluator.eval_dis(to_log)

# JSON log / save best model / end of epoch
logger.info("__log__:%s" % json.dumps(to_log))
trainer.save_best(to_log, VALIDATION_METRIC)
logger.info('End of epoch %i.\n\n' % n_epoch)