def test_reporter():
    """Smoke-test MemReporter on a CUDA linear layer after a forward/backward pass."""
    layer = torch.nn.Linear(1024, 1024).cuda()
    batch = torch.Tensor(512, 1024).cuda()
    layer(batch).mean().backward()
    reporter = MemReporter(layer)
    reporter.report()
def test_reporter_LSTM():
    """Smoke-test MemReporter on a CUDA LSTM after a forward/backward pass."""
    rnn = torch.nn.LSTM(256, 256, num_layers=1).cuda()
    # NOTE: flatten_parameters() is deliberately not called here.
    seq = torch.Tensor(256, 256, 256).cuda()
    output, _ = rnn(seq)
    output.mean().backward()
    reporter = MemReporter(rnn)
    reporter.report()
def test_reporter_without_model():
    """Exercise a model-less MemReporter before and after backward on CPU tensors."""
    layer = torch.nn.Linear(1024, 1024)
    batch = torch.Tensor(512, 1024)
    reporter = MemReporter()
    loss = layer(batch * (batch + 3)).mean()
    reporter.report()
    loss.backward()
    reporter.report()
def test_reporter_sparse_tensor():
    """Exercise MemReporter when a sparse embedding produces a sparse gradient."""
    embedding = torch.nn.Embedding(1024, 1024, sparse=True)
    indices = torch.arange(0, 128)
    reporter = MemReporter()
    loss = embedding(indices).mean()
    reporter.report()
    loss.backward()
    # Derive a new tensor from the sparse grad so it appears in the second report.
    derived = embedding.weight.grad * 2
    reporter.report()
def test_courtesy_context():
    """Report memory usage from inside a Courtesy context after backward."""
    layer = torch.nn.Linear(1024, 1024).cuda()
    batch = torch.Tensor(512, 1024).cuda()
    layer(batch).mean().backward()
    reporter = MemReporter(layer)
    with Courtesy():
        print('gpu>>>>>>>>>>>>>>>>>>cpu')
        reporter.report()
def GlobalAttentionTest(args, config, global_config):
    """Compare vanilla GlobalAttention against GlobalAttentionOpt on the same inputs.

    Loads reference features/weights, runs both modules under the torch
    profiler (traces written under Log/), prints a MemReporter summary for the
    optimized run, and checks that both outputs match via check_recursive.
    """
    feat, params, res = load_data(args, 'GlobalAttention')
    conf = config.model.embeddings_and_evoformer.evoformer.msa_row_attention_with_pair_bias
    # Both modules are built from the same config and loaded from the same AF2 weights.
    attn_opt = GlobalAttentionOpt(conf, global_config, output_dim=256,
                                  key_dim=feat['q_data'].shape[-1],
                                  value_dim=feat['m_data'].shape[-1])
    attn_opt.load_weights_from_af2(params['attention'], None)
    attn_vanilla = GlobalAttention(conf, global_config, output_dim=256,
                                   key_dim=feat['q_data'].shape[-1],
                                   value_dim=feat['m_data'].shape[-1])
    attn_vanilla.load_weights_from_af2(params['attention'], None)

    attn_vanilla.cuda()
    # Move inputs to GPU in fp32 (feat dict is mutated in place).
    feat['q_data'] = feat['q_data'].to(device='cuda', dtype=torch.float32)
    feat['m_data'] = feat['m_data'].to(device='cuda', dtype=torch.float32)
    feat['q_mask'] = feat['q_mask'].to(device='cuda', dtype=torch.float32)
    handler_vanilla = torch.profiler.tensorboard_trace_handler(
        Path('Log') / Path('GlobalAttention'))
    with torch.profiler.profile(on_trace_ready=handler_vanilla, with_stack=True,
                                with_modules=True, profile_memory=True,
                                record_shapes=True) as profiler:
        res_vanilla = attn_vanilla(q_data=feat['q_data'], m_data=feat['m_data'],
                                   q_mask=feat['q_mask'].to(dtype=torch.float32),
                                   bias=feat['bias'])
        profiler.step()

    # Only the optimized run gets a MemReporter summary.
    attn_opt.cuda()
    reporter = MemReporter()
    handler_opt = torch.profiler.tensorboard_trace_handler(
        Path('Log') / Path('GlobalAttentionOpt'))
    with torch.profiler.profile(on_trace_ready=handler_opt, with_stack=True,
                                with_modules=True, profile_memory=True,
                                record_shapes=True) as profiler:
        res_opt = attn_opt(q_data=feat['q_data'], m_data=feat['m_data'],
                           q_mask=feat['q_mask'].to(dtype=torch.float32),
                           bias=feat['bias'])
        profiler.step()
    reporter.report()
    check_recursive(res_opt, res_vanilla)
def print_model(model: nn.Module, reporter: pytorch_memlab.MemReporter) -> None:
    """
    print model memory usage by layer

    Parameters
    ----------
    model: nn.Module
        the torch model whose definition is printed
    reporter: pytorch_memlab.MemReporter
        reporter used to print the per-layer memory usage
    """
    for item in ('=== model definition ===', model, '=== model memory usage==='):
        print(item)
    reporter.report()
def test_reporter_tie_weight():
    """Exercise MemReporter on a container whose two layers share (tie) one weight.

    Fix: the original constructed `MemReporter(container)` twice, immediately
    discarding the first instance; a single reporter created after backward
    is sufficient and avoids the redundant object.
    """
    linear = torch.nn.Linear(1024, 1024).cuda()
    linear_2 = torch.nn.Linear(1024, 1024).cuda()
    # Tie the weights: both layers now reference the same Parameter object.
    linear_2.weight = linear.weight
    container = torch.nn.Sequential(linear, linear_2)
    inp = torch.Tensor(512, 1024).cuda()
    out = container(inp).mean()
    out.backward()
    reporter = MemReporter(container)
    reporter.report()
def MSAColumnGlobalAttentionTest(args, config, global_config):
    """Compare vanilla MSAColumnGlobalAttention against its optimized variant.

    Runs both modules on the same GPU inputs under the torch profiler (traces
    written under Log/), prints a MemReporter summary after each run, and
    checks output equality via check_recursive.
    """
    feat, params, res = load_data(args, 'MSAColumnGlobalAttention')
    conf = config.model.embeddings_and_evoformer.evoformer.msa_column_attention
    # Both variants are loaded from the same AF2 checkpoint path.
    attn_opt = MSAColumnGlobalAttentionOpt(conf, global_config,
                                           msa_dim=feat['msa_act'].shape[-1])
    attn_opt.load_weights_from_af2(params, rel_path='msa_column_global_attention')
    attn_vanilla = MSAColumnGlobalAttention(conf, global_config,
                                            msa_dim=feat['msa_act'].shape[-1])
    attn_vanilla.load_weights_from_af2(params, rel_path='msa_column_global_attention')

    attn_vanilla.cuda()
    # Inputs moved to GPU fp32 in place.
    feat['msa_act'] = feat['msa_act'].to(device='cuda', dtype=torch.float32)
    feat['msa_mask'] = feat['msa_mask'].to(device='cuda', dtype=torch.float32)
    reporter = MemReporter()
    handler_vanilla = torch.profiler.tensorboard_trace_handler(
        Path('Log') / Path('MSAColumnGlobalAttention'))
    with torch.profiler.profile(on_trace_ready=handler_vanilla, with_stack=True,
                                with_modules=True, profile_memory=True,
                                record_shapes=True) as profiler:
        res_vanilla = attn_vanilla(feat['msa_act'],
                                   feat['msa_mask'].to(dtype=torch.float32))
        profiler.step()
    reporter.report()

    attn_opt.cuda()
    # Fresh reporter so the second report reflects the optimized run.
    reporter = MemReporter()
    handler_opt = torch.profiler.tensorboard_trace_handler(
        Path('Log') / Path('MSAColumnGlobalAttentionOpt'))
    with torch.profiler.profile(on_trace_ready=handler_opt, with_stack=True,
                                with_modules=True, profile_memory=True,
                                record_shapes=True) as profiler:
        res_opt = attn_opt(feat['msa_act'],
                           feat['msa_mask'].to(dtype=torch.float32))
        profiler.step()
    reporter.report()
    check_recursive(res_opt, res_vanilla)
def main(config_path, in_infix, out_infix, slot, gpuid):
    """Pre-build document representations for *slot* and save them to disk.

    Loads the config/environment, restores a DocRepTestModule checkpoint,
    encodes all documents without gradients, and writes the resulting
    representation tensor to the dataset's data_prefix directory.
    """
    logger.info('-------------Doc-Rep Pre-building for %s---------------' % slot)
    logger.info('initial environment...')
    game_config, enable_cuda, device, writer = init_env(
        config_path, in_infix, out_infix, writer_suffix='pt_log_path', gpuid=gpuid)

    logger.info('reading dataset...')
    dataset = DocRepPTReader(game_config)

    logger.info('constructing model...')
    doc_rep_module = DocRepTestModule(game_config).to(device)
    # force=True: a checkpoint must exist — this script only encodes, never trains.
    doc_rep_module.load_parameters(enable_cuda, force=True, strict=True)

    # debug: show memory in use after model construction
    reporter = MemReporter(doc_rep_module)
    reporter.report()

    # training arguments
    batch_size = game_config['train']['batch_size']
    num_workers = game_config['global']['num_data_workers']
    dataloader = dataset.get_dataloader_docs(batch_size, num_workers)

    with torch.no_grad():
        logger.info('start documents encoding...')
        doc_rep_module.eval()
        all_doc_rep = test_on_model(doc_rep_module, dataloader, device)

        logger.info('saving documents vectors...')
        # suffix = '-' + data_type if data_type is not None else ''
        torch.save(
            all_doc_rep,
            game_config['dataset']['data_prefix'] +
            'doc_rep/pt/dialog_doc_pt_rep.pt-' + slot)
        logger.info('finished.')
def main():
    """Train a small CIFAR-10 network for one epoch, reporting memory before and after."""
    transform = transforms.Compose(
        [transforms.ToTensor(),
         transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
    trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                            download=True, transform=transform)
    trainloader = torch.utils.data.DataLoader(trainset, batch_size=256,
                                              shuffle=True, num_workers=2)
    net = Net().cuda()
    optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
    reporter = MemReporter(net)
    # Reporting before training shows the memory used by the model architecture itself.
    reporter.report()
    print('\nStart Training\n')
    for epoch in range(1):
        for i, data in enumerate(trainloader, 0):
            inputs, labels = data
            inputs, labels = inputs.cuda(), labels.cuda()
            optimizer.zero_grad()
            # NOTE(review): Net's forward apparently takes labels as a second
            # argument and `backward` is a module-level helper — confirm both
            # against their definitions elsewhere in the project.
            outputs = net(inputs, labels)
            backward(outputs)
            optimizer.step()
    print('\nTraining Finished\n')
    # Reporting after training additionally shows memory used by gradients and
    # intermediate data (inputs, activations, etc.).
    reporter.report()
def test_wrapper(self): net = EfficientNet.from_pretrained("efficientnet-b3") # print(net) # params = list(net.parameters()) # count=count_parameters(net) # print ("Params:",count/1000000) ## params and MACs macs, params = get_model_complexity_info(net, (3, 244, 244), as_strings=True, print_per_layer_stat=True, verbose=True) print('{:<30} {:<8}'.format('Computational complexity: ', macs)) print('{:<30} {:<8}'.format('Number of parameters: ', params)) ### Input for check model x = torch.rand(2, 3, 224, 224) x = torch.autograd.Variable(x) ## model size mem_params = sum([ param.nelement() * param.element_size() for param in net.parameters() ]) mem_bufs = sum( [buf.nelement() * buf.element_size() for buf in net.buffers()]) mem = mem_params + mem_bufs # in bytes print('Memory Size:', mem / 1000000) ## Memory reporter = MemReporter(net) reporter.report() ### model implementation x = net(x) self.assertTrue(x is not None)
def main():
    """Run the camera TensorRT YOLO demo: validate args, load the engine,
    detect in a window loop, then report memory and clean up."""
    args = parse_args()

    # Guard clauses: bail out early on any invalid configuration.
    if args.category_num <= 0:
        raise SystemExit('ERROR: bad category_num (%d)!' % args.category_num)
    if not os.path.isfile('yolo/%s.trt' % args.model):
        raise SystemExit('ERROR: file (yolo/%s.trt) not found!' % args.model)

    cam = Camera(args)
    if not cam.isOpened():
        raise SystemExit('ERROR: failed to open camera!')

    cls_dict = get_cls_dict(args.category_num)

    # Input resolution is encoded in the model name suffix: "WxH" or a single size.
    yolo_dim = args.model.split('-')[-1]
    if 'x' in yolo_dim:
        dim_split = yolo_dim.split('x')
        if len(dim_split) != 2:
            raise SystemExit('ERROR: bad yolo_dim (%s)!' % yolo_dim)
        w, h = map(int, dim_split)
    else:
        h = w = int(yolo_dim)
    if h % 32 != 0 or w % 32 != 0:
        raise SystemExit('ERROR: bad yolo_dim (%s)!' % yolo_dim)

    load_weight_start = time.time()
    trt_yolo = TrtYOLO(args.model, (h, w), args.category_num)
    load_weights_time = datetime.timedelta(seconds=time.time() - load_weight_start)
    print('Load weights Time: %s' % load_weights_time)

    open_window(WINDOW_NAME, 'Camera TensorRT YOLO Demo',
                cam.img_width, cam.img_height)
    vis = BBoxVisualization(cls_dict)
    loop_and_detect(cam, trt_yolo, conf_th=0.3, vis=vis)

    MemReporter().report()

    cam.release()
    cv2.destroyAllWindows()
def grab_memory_usage_output(model):
    '''
    Grabs the output of pytorch MemReporter and logs it

    The report is captured in memory via contextlib.redirect_stdout instead of
    the original temp-file/stdout-swap approach, which left sys.stdout
    unrestored and leaked 'temp.txt' if reporter.report() raised.

    Parameters:
    - model -- model, object of a model class

    Returns:
    - None
    '''
    # Local imports keep this fix self-contained within the function.
    import contextlib
    import io

    reporter = MemReporter(model)
    buffer = io.StringIO()
    with contextlib.redirect_stdout(buffer):
        reporter.report(verbose=True)

    # keepends=True mirrors readlines(); strip the trailing newline of the
    # last line as the original did.
    lines = buffer.getvalue().splitlines(True)
    if lines:
        lines[-1] = lines[-1].rstrip('\n')
    new_lines = ['|{}'.format(line) for line in lines]
    logging.info('Memory consuming:\n{}'.format(''.join(
        line for line in new_lines)))
def test_reporter():
    """Report memory before, during, and after a Courtesy yield/restore cycle."""
    layer = torch.nn.Linear(1024, 1024).cuda()
    batch = torch.Tensor(512, 1024).cuda()
    layer(batch).mean().backward()
    reporter = MemReporter(layer)
    reporter.report()
    courtesy = Courtesy()
    courtesy.yield_memory()
    print('gpu>>>>>>>>>>>>>>>>>>cpu')
    reporter.report()
    courtesy.restore()
    print('cpu>>>>>>>>>>>>>>>>>>gpu')
    reporter.report()
def main(config_path, in_infix, out_infix, slot, is_train, is_test, gpuid):
    """Pre-train (and/or evaluate) the document-representation model for *slot*.

    Sets up the environment, restores any existing checkpoint (non-strictly
    forced off via force=False), then optionally trains and/or tests the
    DocRepTrainModule according to the is_train/is_test flags.
    """
    logger.info('-------------Doc-Rep Pre-training for %s---------------' % slot)
    logger.info('initial environment...')
    game_config, enable_cuda, device, writer = init_env(config_path, in_infix,
                                                        out_infix,
                                                        writer_suffix='pt_log_path',
                                                        gpuid=gpuid)

    logger.info('reading dataset...')
    dataset = DocPTReader(game_config)

    logger.info('constructing model...')
    doc_rep_module = DocRepTrainModule(game_config).to(device)
    # force=False: training may start from scratch when no checkpoint exists.
    doc_rep_module.load_parameters(enable_cuda, force=False, strict=True)

    # debug: show memory in use after model construction
    reporter = MemReporter(doc_rep_module)
    reporter.report()

    # loss function and optimizer
    criterion = torch.nn.NLLLoss()
    optimizer = get_optimizer(game_config['train']['optimizer'],
                              game_config['train']['learning_rate'],
                              doc_rep_module.parameters())

    # training arguments
    batch_size = game_config['train']['batch_size']
    num_workers = game_config['global']['num_data_workers']
    save_steps = game_config['train']['save_steps']
    train_iters = game_config['train']['train_iters']
    test_iters = game_config['train']['test_iters']

    # dataset loaders for this slot
    batch_train_data = dataset.get_dataset_train_slot(slot, batch_size,
                                                      num_workers, train_iters)
    batch_test_data = dataset.get_dataset_test_slot(slot, batch_size,
                                                    num_workers, test_iters)

    if is_train:
        logger.info('start training...')
        clip_grad_max = game_config['train']['clip_grad_norm']
        # train
        doc_rep_module.train()  # set training = True, make sure right dropout
        train_on_model(model=doc_rep_module,
                       criterion=criterion,
                       optimizer=optimizer,
                       dataloader=batch_train_data,
                       clip_grad_max=clip_grad_max,
                       device=device,
                       writer=writer,
                       save_steps=save_steps)

    if is_test:
        logger.info('start testing...')
        with torch.no_grad():
            doc_rep_module.eval()
            test_acc = eval_on_model(model=doc_rep_module,
                                     dataloader=batch_test_data,
                                     device=device)
        logger.info("test_all_acc=%.2f%%" % (test_acc * 100))

    writer.close()
    logger.info('finished.')
def warm_up(self):
    """
    Warmup the memory allocator, by attempting to fit the largest batch.

    Runs one full forward/backward on the largest batch so CUDA caches the
    peak allocation. Optionally prints MemReporter summaries before and after
    backward when opt.memory_profiling is set. An out-of-memory RuntimeError
    is caught and reported as a warning; any other RuntimeError is re-raised.
    :return:
    """
    if self.opt.memory_profiling:
        from pytorch_memlab import MemReporter
        reporter = MemReporter()

    batch = self.train_data[0].get_largest_batch() if isinstance(self.train_data, list) \
        else self.train_data.get_largest_batch()
    opt = self.opt
    if self.cuda:
        batch.cuda(fp16=self.opt.fp16 and not self.opt.fp16_mixed)

    self.model.train()
    self.model.zero_grad()
    oom = False

    if self.opt.memory_profiling:
        print("Input size: ")
        print(batch.size, batch.src_size, batch.tgt_size)

    if opt.streaming:
        streaming_state = self.model.init_stream()
    else:
        streaming_state = None

    try:
        targets = batch.get('target_output')
        tgt_mask = None
        outputs = self.model(batch, streaming=opt.streaming, target_mask=tgt_mask,
                             zero_encoder=opt.zero_encoder,
                             mirror=opt.mirror_loss, streaming_state=streaming_state,
                             nce=opt.nce)
        outputs['tgt_mask'] = tgt_mask
        loss_dict = self.loss_function(outputs, targets, model=self.model,
                                       vocab_mask=batch.vocab_mask)
        loss = loss_dict['loss']  # a little trick to avoid gradient overflow with fp16
        full_loss = loss

        # Optional auxiliary losses, each gated by its option flag.
        if opt.ctc_loss > 0.0:
            ctc_loss = self.ctc_loss_function(outputs, targets)
            ctc_loss_data = ctc_loss.item()
            full_loss = full_loss + opt.ctc_loss * ctc_loss

        if opt.mirror_loss:
            rev_loss = loss_dict['rev_loss']
            mirror_loss = loss_dict['mirror_loss']
            full_loss = full_loss + rev_loss + mirror_loss

        # reconstruction loss
        if opt.reconstruct:
            rec_loss = loss_dict['rec_loss']
            rec_loss = rec_loss
            full_loss = full_loss + rec_loss

        if opt.lfv_multilingual:
            lid_logits = outputs['lid_logits']
            lid_labels = batch.get('target_lang')
            lid_loss_function = self.loss_function.get_loss_function('lid_loss')
            lid_loss = lid_loss_function(lid_logits, lid_labels)
            full_loss = full_loss + lid_loss

        optimizer = self.optim.optimizer

        if self.opt.memory_profiling:
            reporter.report(verbose=True)
            # (debugging aid, kept for reference: sweep gc for live tensors)
            # for obj in gc.get_objects():
            #     try:
            #         if torch.is_tensor(obj) or (hasattr(obj, 'data') and torch.is_tensor(obj.data)):
            #             print(type(obj), obj.type(), obj.size())
            #     except:
            #         pass
            # print("Memory profiling complete.")
            # print(torch.cuda.memory_summary())
            # exit()

        if self.cuda:
            # amp handles loss scaling for fp16 training.
            with amp.scale_loss(full_loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.div_(batch.tgt_size).backward()

        if self.opt.memory_profiling:
            print('========= after backward =========')
            reporter.report(verbose=True)

        # Discard the warm-up gradients; this pass exists only to size the allocator.
        self.model.zero_grad()
        self.optim.zero_grad()
        # self.optim.step()
        # self.optim.reset()

    except RuntimeError as e:
        if 'out of memory' in str(e):
            oom = True
        else:
            raise e

    if oom:
        print(
            "* Warning: out-of-memory in warming up. This is due to the largest batch is too big for the GPU."
        )
    else:
        print("* Warming up successuflly.")

    if self.opt.memory_profiling:
        if hasattr(torch.cuda, 'memory_summary'):
            print(torch.cuda.memory_summary())
        exit()
def warm_up(self):
    """
    Warmup the memory allocator, by attempting to fit the largest batch.

    Tacotron autoencoder variant: runs one forward/backward of model_ae on the
    largest batch, with optional MemReporter summaries before and after
    backward when opt.memory_profiling is set. An out-of-memory RuntimeError
    is caught and reported as a warning; any other RuntimeError is re-raised.
    :return:
    """
    print("Tacotron_warmup")
    if self.opt.memory_profiling:
        from pytorch_memlab import MemReporter
        reporter = MemReporter()

    batch = self.train_data[0].get_largest_batch() if isinstance(self.train_data, list) \
        else self.train_data.get_largest_batch()
    opt = self.opt
    if self.cuda:
        batch.cuda(fp16=self.opt.fp16 and not self.opt.fp16_mixed)

    self.model_ae.train()
    self.model_ae.zero_grad()
    oom = False

    if self.opt.memory_profiling:
        print("Input size: ")
        print(batch.size, batch.src_size, batch.tgt_size)

    try:
        encoder_outputs, decoder_outputs = self.model_ae(batch)
        gate_padded = batch.get('gate_padded')

        # Subsample the gate targets to one entry per decoder step when the
        # decoder emits several frames per step.
        if self.opt.n_frames_per_step > 1:
            slice = torch.arange(self.opt.n_frames_per_step - 1,
                                 gate_padded.size(1),
                                 self.opt.n_frames_per_step)  # NOTE: shadows builtin `slice`
            gate_padded = gate_padded[:, slice]

        src_org = batch.get('source_org')
        src_org = src_org.narrow(2, 1, src_org.size(2) - 1)
        target = [src_org.permute(1, 2, 0).contiguous(), gate_padded]

        loss = self.loss_function_ae(decoder_outputs, target)
        full_loss = loss

        optimizer = self.optim_ae.optimizer

        if self.opt.memory_profiling:
            reporter.report(verbose=True)

        if self.cuda:
            # amp handles loss scaling for fp16 training.
            with amp.scale_loss(full_loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.div_(batch.tgt_size).backward()

        if self.opt.memory_profiling:
            print('========= after backward =========')
            reporter.report(verbose=True)

        # Discard the warm-up gradients; this pass exists only to size the allocator.
        self.model_ae.zero_grad()
        self.optim_ae.zero_grad()

    except RuntimeError as e:
        if 'out of memory' in str(e):
            oom = True
        else:
            raise e

    if oom:
        print(
            "* Warning: out-of-memory in warming up. This is due to the largest batch is too big for the GPU."
        )
    else:
        print("* Warming up successuflly.")

    if self.opt.memory_profiling:
        if hasattr(torch.cuda, 'memory_summary'):
            print(torch.cuda.memory_summary())
        exit()
def main(args):
    """Entry point for MyDispNet: debug (memory/profiler), train, or test per args.mode."""
    if not os.path.exists(args.checkpoint_dir):
        os.makedirs(args.checkpoint_dir)

    #----------------------------
    # some initialization things
    #---------------------------
    cuda = args.cuda
    if cuda and not torch.cuda.is_available():
        raise Exception("No GPU found, please run without --cuda")
    torch.manual_seed(args.seed)
    if cuda:
        torch.cuda.manual_seed(args.seed)

    myNet = MyDispNet(args)

    """ for debugging """
    if args.mode == 'debug':
        myNet.model.train()
        import gc
        crop_h = 256
        crop_w = 512
        #x_l = torch.randn((1, 3, crop_h, crop_w), requires_grad=True)
        #x_r = torch.randn((1, 3, crop_h, crop_w), requires_grad=True)
        x_l = torch.randn((1, 3, crop_h, crop_w)).cuda()
        x_r = torch.randn((1, 3, crop_h, crop_w)).cuda()
        y = torch.randn((1, crop_h, crop_w)).cuda()
        z = torch.randn((1, 1, crop_h//3, crop_w//3)).cuda()

        from pytorch_memlab import profile, MemReporter
        # pass in a model to automatically infer the tensor names
        # You can better understand the memory layout for more complicated module
        if 1:
            reporter = MemReporter(myNet.model)
            disp = myNet.model(x_l, x_r)
            loss = F.smooth_l1_loss(disp, y, reduction='mean')
            reporter.report(verbose=True)
            print('========= before backward =========')
            loss.backward()
            reporter.report(verbose=True)

        # generate prof which can be loaded by Google chrome trace at chrome://tracing/
        if 1:
            with torch.autograd.profiler.profile(use_cuda=True) as prof:
                myNet.model(x_l, x_r)
            print(prof)
            prof.export_chrome_trace('./results/tmp/prof.out')

    if args.mode == 'train':
        print('strat training !!!')
        for epoch in range(1 + args.startEpoch, args.startEpoch + args.nEpochs + 1):
            print ("[**] do training at epoch %d/%d" % (epoch, args.startEpoch + args.nEpochs))
            with torch.autograd.set_detect_anomaly(True):
                avg_loss, avg_err, avg_accu = myNet.train(epoch)
        # save the last epoch always!!
        myNet.save_checkpoint(args.nEpochs + args.startEpoch, {
            'epoch': args.nEpochs + args.startEpoch,
            'state_dict': myNet.model.state_dict(),
            'optimizer' : myNet.optimizer.state_dict(),
            'loss': avg_loss,
            'epe_err': avg_err,
            'accu3': avg_accu
        }, is_best = False)
        print('done training !!!')

    if args.mode == 'test':
        print('strat testing !!!')
        myNet.test()
def main(args):
    """Entry point for MyGCNet: print parameter counts, then debug/train/test per args.mode."""
    #----------------------------
    # some initialization things
    #---------------------------
    cuda = args.cuda
    if cuda and not torch.cuda.is_available():
        raise Exception("No GPU found, please run without --cuda")
    torch.manual_seed(args.seed)
    if cuda:
        torch.cuda.manual_seed(args.seed)

    myNet = MyGCNet(args)
    print('Number of GCNet model parameters: {}'.format(
        sum([p.data.nelement() for p in myNet.model.parameters()])))

    # Disabled per-module parameter breakdown, kept for reference.
    if 0:
        print('Including:\n1) number of Feature Extraction module parameters: {}'.format(
            sum(
                [p.data.nelement() for n, p in myNet.model.named_parameters() if any(
                    ['module.convbn0' in n, 'module.res_block' in n, 'module.conv1' in n
                     ])]
            )))
        print('2) number of Other modules parameters: {}'.format(
            sum(
                [p.data.nelement() for n, p in myNet.model.named_parameters() if any(
                    ['module.conv3dbn' in n, 'module.block_3d' in n, 'module.deconv' in n,
                     ])]
            )))
        for i, (n, p) in enumerate(myNet.model.named_parameters()):
            print (i, " layer ", n, "has # param : ", p.data.nelement())
        #sys.exit()

    """ for debugging """
    if args.mode == 'debug':
        myNet.model.train()
        import gc
        crop_h = 256
        crop_w = 512
        #x_l = torch.randn((1, 3, crop_h, crop_w), requires_grad=True)
        #x_r = torch.randn((1, 3, crop_h, crop_w), requires_grad=True)
        x_l = torch.randn((1, 3, crop_h, crop_w)).cuda()
        x_r = torch.randn((1, 3, crop_h, crop_w)).cuda()
        y = torch.randn((1, crop_h, crop_w)).cuda()
        z = torch.randn((1, 1, crop_h//3, crop_w//3)).cuda()

        from pytorch_memlab import profile, MemReporter
        # pass in a model to automatically infer the tensor names
        # You can better understand the memory layout for more complicated module
        if 1:
            reporter = MemReporter(myNet.model)
            disp = myNet.model(x_l, x_r)
            loss = F.smooth_l1_loss(disp, y, reduction='mean')
            reporter.report(verbose=True)
            print('========= before backward =========')
            loss.backward()
            reporter.report(verbose=True)

        # generate prof which can be loaded by Google chrome trace at chrome://tracing/
        if 1:
            with torch.autograd.profiler.profile(use_cuda=True) as prof:
                myNet.model(x_l, x_r)
            print(prof)
            prof.export_chrome_trace('./results/tmp/prof.out')

    if args.mode == 'train':
        print('strat training !!!')
        for epoch in range(1 + args.startEpoch, args.startEpoch + args.nEpochs + 1):
            print ("[**] do training at epoch %d/%d" % (epoch, args.startEpoch + args.nEpochs))
            with torch.autograd.set_detect_anomaly(True):
                avg_loss, avg_err, avg_accu = myNet.train(epoch)
        # save the last epoch always!!
        myNet.save_checkpoint(args.nEpochs + args.startEpoch, {
            'epoch': args.nEpochs + args.startEpoch,
            'state_dict': myNet.model.state_dict(),
            'optimizer' : myNet.optimizer.state_dict(),
            'loss': avg_loss,
            'epe_err': avg_err,
            'accu3': avg_accu
        }, is_best = False)
        print('done training !!!')

    if args.mode == 'test':
        print('strat testing !!!')
        myNet.test()
def train(): if torch.cuda.is_available(): nvmlInit() multiprocessing.set_start_method('fork', force=True) parser = argparse.ArgumentParser( description="Training a multi-task model.") parser.add_argument("--x", help="Descriptor file (matrix market, .npy or .npz)", type=str, default=None) parser.add_argument("--y_class", "--y", "--y_classification", help="Activity file (matrix market, .npy or .npz)", type=str, default=None) parser.add_argument("--y_regr", "--y_regression", help="Activity file (matrix market, .npy or .npz)", type=str, default=None) parser.add_argument( "--y_censor", help="Censor mask for regression (matrix market, .npy or .npz)", type=str, default=None) parser.add_argument( "--weights_class", "--task_weights", "--weights_classification", help= "CSV file with columns task_id, training_weight, aggregation_weight, task_type (for classification tasks)", type=str, default=None) parser.add_argument( "--weights_regr", "--weights_regression", help= "CSV file with columns task_id, training_weight, censored_weight, aggregation_weight, aggregation_weight, task_type (for regression tasks)", type=str, default=None) parser.add_argument( "--censored_loss", help="Whether censored loss is used for training (default 1)", type=int, default=1) parser.add_argument("--folding", help="Folding file (npy)", type=str, required=True) parser.add_argument("--fold_va", help="Validation fold number", type=int, default=0) parser.add_argument("--fold_te", help="Test fold number (removed from dataset)", type=int, default=None) parser.add_argument("--batch_ratio", help="Batch ratio", type=float, default=0.02) parser.add_argument("--internal_batch_max", help="Maximum size of the internal batch", type=int, default=None) parser.add_argument( "--normalize_loss", help= "Normalization constant to divide the loss (default uses batch size)", type=float, default=None) parser.add_argument( "--normalize_regression", help="Set this to 1 if the regression tasks should be normalized", type=int, 
default=0) parser.add_argument( "--normalize_regr_va", help= "Set this to 1 if the regression tasks in validation fold should be normalized together with training folds", type=int, default=0) parser.add_argument( "--inverse_normalization", help= "Set this to 1 if the regression tasks in validation fold should be inverse normalized at validation time", type=int, default=0) parser.add_argument("--hidden_sizes", nargs="+", help="Hidden sizes of trunk", default=[], type=int, required=True) parser.add_argument( "--last_hidden_sizes", nargs="+", help= "Hidden sizes in the head (if specified , class and reg heads have this dimension)", default=None, type=int) #parser.add_argument("--middle_dropout", help="Dropout for layers before the last", type=float, default=0.0) #parser.add_argument("--last_dropout", help="Last dropout", type=float, default=0.2) parser.add_argument("--weight_decay", help="Weight decay", type=float, default=0.0) parser.add_argument("--last_non_linearity", help="Last layer non-linearity (depecrated)", type=str, default="relu", choices=["relu", "tanh"]) parser.add_argument("--middle_non_linearity", "--non_linearity", help="Before last layer non-linearity", type=str, default="relu", choices=["relu", "tanh"]) parser.add_argument("--input_transform", help="Transformation to apply to inputs", type=str, default="none", choices=["binarize", "none", "tanh", "log1p"]) parser.add_argument("--lr", help="Learning rate", type=float, default=1e-3) parser.add_argument("--lr_alpha", help="Learning rate decay multiplier", type=float, default=0.3) parser.add_argument("--lr_steps", nargs="+", help="Learning rate decay steps", type=int, default=[10]) parser.add_argument("--input_size_freq", help="Number of high importance features", type=int, default=None) parser.add_argument("--fold_inputs", help="Fold input to a fixed set (default no folding)", type=int, default=None) parser.add_argument("--epochs", help="Number of epochs", type=int, default=20) parser.add_argument( 
"--pi_zero", help="Reference class ratio to be used for calibrated aucpr", type=float, default=0.1) parser.add_argument( "--min_samples_class", help= "Minimum number samples in each class and in each fold for AUC calculation (only used if aggregation_weight is not provided in --weights_class)", type=int, default=5) parser.add_argument("--min_samples_auc", help="Obsolete: use 'min_samples_class'", type=int, default=None) parser.add_argument( "--min_samples_regr", help= "Minimum number of uncensored samples in each fold for regression metric calculation (only used if aggregation_weight is not provided in --weights_regr)", type=int, default=10) parser.add_argument("--dev", help="Device to use", type=str, default="cuda:0") parser.add_argument("--run_name", help="Run name for results", type=str, default=None) parser.add_argument( "--output_dir", help="Output directory, including boards (default 'models')", type=str, default="models") parser.add_argument("--prefix", help="Prefix for run name (default 'run')", type=str, default='run') parser.add_argument( "--verbose", help="Verbosity level: 2 = full; 1 = no progress; 0 = no output", type=int, default=2, choices=[0, 1, 2]) parser.add_argument("--save_model", help="Set this to 0 if the model should not be saved", type=int, default=1) parser.add_argument( "--save_board", help="Set this to 0 if the TensorBoard should not be saved", type=int, default=1) parser.add_argument( "--profile", help="Set this to 1 to output memory profile information", type=int, default=0) parser.add_argument( "--mixed_precision", help= "Set this to 1 to run in mixed precision mode (vs single precision)", type=int, default=0) parser.add_argument("--eval_train", help="Set this to 1 to calculate AUCs for train data", type=int, default=0) parser.add_argument("--enable_cat_fusion", help="Set this to 1 to enable catalogue fusion", type=int, default=0) parser.add_argument( "--eval_frequency", help= "The gap between AUC eval (in epochs), -1 means to do an 
eval at the end.", type=int, default=1) #hybrid model features parser.add_argument( "--regression_weight", help= "between 0 and 1 relative weight of regression loss vs classification loss", type=float, default=0.5) parser.add_argument( "--scaling_regularizer", help= "L2 regularizer of the scaling layer, if inf scaling layer is switched off", type=float, default=np.inf) parser.add_argument( "--class_feature_size", help= "Number of leftmost features used from the output of the trunk (default: use all)", type=int, default=-1) parser.add_argument( "--regression_feature_size", help= "Number of rightmost features used from the output of the trunk (default: use all)", type=int, default=-1) parser.add_argument( "--last_hidden_sizes_reg", nargs="+", help= "Hidden sizes in the regression head (overwritten by last_hidden_sizes)", default=None, type=int) parser.add_argument( "--last_hidden_sizes_class", nargs="+", help= "Hidden sizes in the classification head (overwritten by last_hidden_sizes)", default=None, type=int) parser.add_argument( "--dropouts_reg", nargs="+", help= "List of dropout values used in the regression head (needs one per last hidden in reg head, ignored if last_hidden_sizes_reg not specified)", default=[], type=float) parser.add_argument( "--dropouts_class", nargs="+", help= "List of dropout values used in the classification head (needs one per last hidden in class head, ignored if no last_hidden_sizes_class not specified)", default=[], type=float) parser.add_argument("--dropouts_trunk", nargs="+", help="List of dropout values used in the trunk", default=[], type=float, required=True) args = parser.parse_args() if (args.last_hidden_sizes is not None) and ((args.last_hidden_sizes_class is not None) or (args.last_hidden_sizes_reg is not None)): raise ValueError( "Head specific and general last_hidden_sizes argument were both specified!" 
) if (args.last_hidden_sizes is not None): args.last_hidden_sizes_class = args.last_hidden_sizes args.last_hidden_sizes_reg = args.last_hidden_sizes if args.last_hidden_sizes_reg is not None: assert len(args.last_hidden_sizes_reg) == len( args.dropouts_reg ), "Number of hiddens and number of dropout values specified must be equal in the regression head!" if args.last_hidden_sizes_class is not None: assert len(args.last_hidden_sizes_class) == len( args.dropouts_class ), "Number of hiddens and number of dropout values specified must be equal in the classification head!" if args.hidden_sizes is not None: assert len(args.hidden_sizes) == len( args.dropouts_trunk ), "Number of hiddens and number of dropout values specified must be equal in the trunk!" def vprint(s=""): if args.verbose: print(s) vprint(args) if args.class_feature_size == -1: args.class_feature_size = args.hidden_sizes[-1] if args.regression_feature_size == -1: args.regression_feature_size = args.hidden_sizes[-1] assert args.regression_feature_size <= args.hidden_sizes[ -1], "Regression feature size cannot be larger than the trunk output" assert args.class_feature_size <= args.hidden_sizes[ -1], "Classification feature size cannot be larger than the trunk output" assert args.regression_feature_size + args.class_feature_size >= args.hidden_sizes[ -1], "Unused features in the trunk! Set regression_feature_size + class_feature_size >= trunk output!" 
#if args.regression_feature_size != args.hidden_sizes[-1] or args.class_feature_size != args.hidden_sizes[-1]: # raise ValueError("Hidden spliting not implemented yet!") if args.run_name is not None: name = args.run_name else: name = f"sc_{args.prefix}_h{'.'.join([str(h) for h in args.hidden_sizes])}_ldo_r{'.'.join([str(d) for d in args.dropouts_reg])}_wd{args.weight_decay}" name += f"_lr{args.lr}_lrsteps{'.'.join([str(s) for s in args.lr_steps])}_ep{args.epochs}" name += f"_fva{args.fold_va}_fte{args.fold_te}" if args.mixed_precision == 1: name += f"_mixed_precision" vprint(f"Run name is '{name}'.") if args.profile == 1: assert ( args.save_board == 1 ), "Tensorboard should be enabled to be able to profile memory usage." if args.save_board: tb_name = os.path.join(args.output_dir, "boards", name) writer = SummaryWriter(tb_name) else: writer = Nothing() assert args.input_size_freq is None, "Using tail compression not yet supported." if (args.y_class is None) and (args.y_regr is None): raise ValueError( "No label data specified, please add --y_class and/or --y_regr.") ecfp = sc.load_sparse(args.x) y_class = sc.load_sparse(args.y_class) y_regr = sc.load_sparse(args.y_regr) y_censor = sc.load_sparse(args.y_censor) if (y_regr is None) and (y_censor is not None): raise ValueError("y_censor provided please also provide --y_regr.") if y_class is None: y_class = scipy.sparse.csr_matrix((ecfp.shape[0], 0)) if y_regr is None: y_regr = scipy.sparse.csr_matrix((ecfp.shape[0], 0)) if y_censor is None: y_censor = scipy.sparse.csr_matrix(y_regr.shape) folding = np.load(args.folding) assert ecfp.shape[0] == folding.shape[ 0], "x and folding must have same number of rows" ## Loading task weights tasks_class = sc.load_task_weights(args.weights_class, y=y_class, label="y_class") tasks_regr = sc.load_task_weights(args.weights_regr, y=y_regr, label="y_regr") ## Input transformation ecfp = sc.fold_transform_inputs(ecfp, folding_size=args.fold_inputs, transform=args.input_transform) 
print(f"count non zero:{ecfp[0].count_nonzero()}") num_pos = np.array((y_class == +1).sum(0)).flatten() num_neg = np.array((y_class == -1).sum(0)).flatten() num_class = np.array((y_class != 0).sum(0)).flatten() if (num_class != num_pos + num_neg).any(): raise ValueError( "For classification all y values (--y_class/--y) must be 1 or -1.") num_regr = np.bincount(y_regr.indices, minlength=y_regr.shape[1]) assert args.min_samples_auc is None, "Parameter 'min_samples_auc' is obsolete. Use '--min_samples_class' that specifies how many samples a task needs per FOLD and per CLASS to be aggregated." if tasks_class.aggregation_weight is None: ## using min_samples rule fold_pos, fold_neg = sc.class_fold_counts(y_class, folding) n = args.min_samples_class tasks_class.aggregation_weight = ((fold_pos >= n).all(0) & (fold_neg >= n)).all(0).astype( np.float64) if tasks_regr.aggregation_weight is None: if y_censor.nnz == 0: y_regr2 = y_regr.copy() y_regr2.data[:] = 1 else: ## only counting uncensored data y_regr2 = y_censor.copy() y_regr2.data = (y_regr2.data == 0).astype(np.int32) fold_regr, _ = sc.class_fold_counts(y_regr2, folding) del y_regr2 tasks_regr.aggregation_weight = ( fold_regr >= args.min_samples_regr).all(0).astype(np.float64) vprint(f"Input dimension: {ecfp.shape[1]}") vprint(f"#samples: {ecfp.shape[0]}") vprint(f"#classification tasks: {y_class.shape[1]}") vprint(f"#regression tasks: {y_regr.shape[1]}") vprint( f"Using {(tasks_class.aggregation_weight > 0).sum()} classification tasks for calculating aggregated metrics (AUCROC, F1_max, etc)." ) vprint( f"Using {(tasks_regr.aggregation_weight > 0).sum()} regression tasks for calculating metrics (RMSE, Rsquared, correlation)." ) if args.fold_te is not None and args.fold_te >= 0: ## removing test data assert args.fold_te != args.fold_va, "fold_va and fold_te must not be equal." 
keep = folding != args.fold_te ecfp = ecfp[keep] y_class = y_class[keep] y_regr = y_regr[keep] y_censor = y_censor[keep] folding = folding[keep] normalize_inv = None if args.normalize_regression == 1 and args.normalize_regr_va == 1: y_regr, mean_save, var_save = sc.normalize_regr(y_regr) fold_va = args.fold_va idx_tr = np.where(folding != fold_va)[0] idx_va = np.where(folding == fold_va)[0] y_class_tr = y_class[idx_tr] y_class_va = y_class[idx_va] y_regr_tr = y_regr[idx_tr] y_regr_va = y_regr[idx_va] y_censor_tr = y_censor[idx_tr] y_censor_va = y_censor[idx_va] if args.normalize_regression == 1 and args.normalize_regr_va == 0: y_regr_tr, mean_save, var_save = sc.normalize_regr(y_regr_tr) if args.inverse_normalization == 1: normalize_inv = {} normalize_inv["mean"] = mean_save normalize_inv["var"] = var_save num_pos_va = np.array((y_class_va == +1).sum(0)).flatten() num_neg_va = np.array((y_class_va == -1).sum(0)).flatten() num_regr_va = np.bincount(y_regr_va.indices, minlength=y_regr.shape[1]) pos_rate = num_pos_va / (num_pos_va + num_neg_va) pos_rate_ref = args.pi_zero pos_rate = np.clip(pos_rate, 0, 0.99) cal_fact_aucpr = pos_rate * (1 - pos_rate_ref) / (pos_rate_ref * (1 - pos_rate)) #import ipdb; ipdb.set_trace() batch_size = int(np.ceil(args.batch_ratio * idx_tr.shape[0])) num_int_batches = 1 if args.internal_batch_max is not None: if args.internal_batch_max < batch_size: num_int_batches = int(np.ceil(batch_size / args.internal_batch_max)) batch_size = int(np.ceil(batch_size / num_int_batches)) vprint(f"#internal batch size: {batch_size}") tasks_cat_id_list = None select_cat_ids = None if tasks_class.cat_id is not None: tasks_cat_id_list = [[x, i] for i, x in enumerate(tasks_class.cat_id) if str(x) != 'nan'] tasks_cat_ids = [ i for i, x in enumerate(tasks_class.cat_id) if str(x) != 'nan' ] select_cat_ids = np.array(tasks_cat_ids) cat_id_size = len(tasks_cat_id_list) else: cat_id_size = 0 dataset_tr = sc.ClassRegrSparseDataset(x=ecfp[idx_tr], y_class=y_class_tr, 
y_regr=y_regr_tr, y_censor=y_censor_tr, y_cat_columns=select_cat_ids) dataset_va = sc.ClassRegrSparseDataset(x=ecfp[idx_va], y_class=y_class_va, y_regr=y_regr_va, y_censor=y_censor_va, y_cat_columns=select_cat_ids) loader_tr = DataLoader(dataset_tr, batch_size=batch_size, num_workers=8, pin_memory=True, collate_fn=dataset_tr.collate, shuffle=True) loader_va = DataLoader(dataset_va, batch_size=batch_size, num_workers=4, pin_memory=True, collate_fn=dataset_va.collate, shuffle=False) args.input_size = dataset_tr.input_size args.output_size = dataset_tr.output_size args.class_output_size = dataset_tr.class_output_size args.regr_output_size = dataset_tr.regr_output_size args.cat_id_size = cat_id_size dev = torch.device(args.dev) net = sc.SparseFFN(args).to(dev) loss_class = torch.nn.BCEWithLogitsLoss(reduction="none") loss_regr = sc.censored_mse_loss if not args.censored_loss: loss_regr = functools.partial(loss_regr, censored_enabled=False) tasks_class.training_weight = tasks_class.training_weight.to(dev) tasks_regr.training_weight = tasks_regr.training_weight.to(dev) tasks_regr.censored_weight = tasks_regr.censored_weight.to(dev) vprint("Network:") vprint(net) reporter = None h = None if args.profile == 1: torch_gpu_id = torch.cuda.current_device() if "CUDA_VISIBLE_DEVICES" in os.environ: ids = list( map(int, os.environ.get("CUDA_VISIBLE_DEVICES", "").split(","))) nvml_gpu_id = ids[torch_gpu_id] # remap else: nvml_gpu_id = torch_gpu_id h = nvmlDeviceGetHandleByIndex(nvml_gpu_id) if args.profile == 1: ##### output saving ##### if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) reporter = MemReporter(net) with open(f"{args.output_dir}/memprofile.txt", "w+") as profile_file: with redirect_stdout(profile_file): profile_file.write(f"\nInitial model detailed report:\n\n") reporter.report() optimizer = torch.optim.Adam(net.parameters(), lr=args.lr, weight_decay=args.weight_decay) scheduler = MultiStepLR(optimizer, milestones=args.lr_steps, 
gamma=args.lr_alpha) num_prints = 0 scaler = torch.cuda.amp.GradScaler() for epoch in range(args.epochs): t0 = time.time() sc.train_class_regr(net, optimizer, loader=loader_tr, loss_class=loss_class, loss_regr=loss_regr, dev=dev, weights_class=tasks_class.training_weight * (1 - args.regression_weight) * 2, weights_regr=tasks_regr.training_weight * args.regression_weight * 2, censored_weight=tasks_regr.censored_weight, normalize_loss=args.normalize_loss, num_int_batches=num_int_batches, progress=args.verbose >= 2, reporter=reporter, writer=writer, epoch=epoch, args=args, scaler=scaler, nvml_handle=h) if args.profile == 1: with open(f"{args.output_dir}/memprofile.txt", "a+") as profile_file: profile_file.write( f"\nAfter epoch {epoch} model detailed report:\n\n") with redirect_stdout(profile_file): reporter.report() t1 = time.time() eval_round = (args.eval_frequency > 0) and ((epoch + 1) % args.eval_frequency == 0) last_round = epoch == args.epochs - 1 if eval_round or last_round: results_va = sc.evaluate_class_regr(net, loader_va, loss_class, loss_regr, tasks_class=tasks_class, tasks_regr=tasks_regr, dev=dev, progress=args.verbose >= 2, normalize_inv=normalize_inv, cal_fact_aucpr=cal_fact_aucpr) # import ipdb; ipdb.set_trace() for key, val in results_va["classification_agg"].items(): writer.add_scalar(key + "/va", val, epoch) for key, val in results_va["regression_agg"].items(): writer.add_scalar(key + "/va", val, epoch) if args.eval_train: results_tr = sc.evaluate_class_regr(net, loader_tr, loss_class, loss_regr, tasks_class=tasks_class, tasks_regr=tasks_regr, dev=dev, progress=args.verbose >= 2) for key, val in results_tr["classification_agg"].items(): writer.add_scalar(key + "/tr", val, epoch) for key, val in results_tr["regression_agg"].items(): writer.add_scalar(key + "/tr", val, epoch) else: results_tr = None if args.verbose: ## printing a new header every 20 lines header = num_prints % 20 == 0 num_prints += 1 sc.print_metrics_cr(epoch, t1 - t0, results_tr, 
results_va, header) scheduler.step() #print("DEBUG data for hidden spliting") #print (f"Classification mask: Sum = {net.classmask.sum()}\t Uniques: {np.unique(net.classmask)}") #print (f"Regression mask: Sum = {net.regmask.sum()}\t Uniques: {np.unique(net.regmask)}") #print (f"overlap: {(net.regmask * net.classmask).sum()}") writer.close() vprint() if args.profile == 1: multiplexer = sc.create_multiplexer(tb_name) # sc.export_scalars(multiplexer, '.', "GPUmem", "testcsv.csv") data = sc.extract_scalars(multiplexer, '.', "GPUmem") vprint(f"Peak GPU memory used: {sc.return_max_val(data)}MB") vprint("Saving performance metrics (AUCs) and model.") ##### model saving ##### if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) model_file = f"{args.output_dir}/{name}.pt" out_file = f"{args.output_dir}/{name}.json" if args.save_model: torch.save(net.state_dict(), model_file) vprint(f"Saved model weights into '{model_file}'.") results_va["classification"]["num_pos"] = num_pos_va results_va["classification"]["num_neg"] = num_neg_va results_va["regression"]["num_samples"] = num_regr_va if results_tr is not None: results_tr["classification"]["num_pos"] = num_pos - num_pos_va results_tr["classification"]["num_neg"] = num_neg - num_neg_va results_tr["regression"]["num_samples"] = num_regr - num_regr_va stats = None if args.normalize_regression == 1: stats = {} stats["mean"] = mean_save stats["var"] = np.array(var_save)[0] sc.save_results(out_file, args, validation=results_va, training=results_tr, stats=stats) vprint( f"Saved config and results into '{out_file}'.\nYou can load the results by:\n import sparsechem as sc\n res = sc.load_results('{out_file}')" )
from pytorch_memlab import MemReporter

# Demo script: run one forward/backward/step on a MixNet classifier and use
# pytorch_memlab's MemReporter to show GPU memory usage before and after the
# backward pass.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Architecture variant and its (input size, crop ratio) preprocessing params.
arch = "l"
img_preparam = {"s": (224, 0.875), "m": (224, 0.875), "l": (224, 0.875)}
net_h = img_preparam[arch][0]

model = MixNet(arch=arch, num_classes=1000)
print(model)

optimizer = torch.optim.SGD(model.parameters(),
                            lr=1e-1,
                            momentum=0.90,
                            weight_decay=1.0e-4,
                            nesterov=True)
# stat(model, (3, net_h, net_h))

model = model.cuda().train()
loss_func = nn.CrossEntropyLoss().cuda()

# Tiny synthetic batch (N=2) so the report reflects model + activations only.
dummy_in = torch.randn(2, 3, net_h, net_h).cuda().requires_grad_()
# Fix: the original chained .cuda() twice (".cuda().long().cuda()"); one
# transfer after the dtype cast is sufficient.
dummy_target = torch.ones(2).long().cuda()

reporter = MemReporter(model)

optimizer.zero_grad()
dummy_out = model(dummy_in)
loss = loss_func(dummy_out, dummy_target)
print('========================================== before backward ===========================================')
reporter.report()

loss.backward()
optimizer.step()
print('========================================== after backward =============================================')
reporter.report()
class TrainAndEvaluate:
    """Training/evaluation harness for the siamese ad-matching models.

    Wires together the dataloaders, model, criterion, optimizer and LR
    schedule described by ``hyperparameters``, logs to Weights & Biases, and
    periodically reports memory via pytorch_memlab's MemReporter.

    Fix in ``evaluate``: the running concatenation of image-B embeddings used
    ``outputs[0]`` (the image-A embeddings) instead of ``outputs[1]``, so every
    batch after the first contributed wrong data to the tracked metrics.
    """

    def __init__(self, hyperparameters, seed=0, eval=False, **kwargs) -> None:
        """Store config and set up fonts, device, memory reporter and RNG.

        Args:
            hyperparameters: dict of run settings (model class, dataloader
                factory, criterion, epochs, etc. — keys consumed throughout).
            seed: seed for the numpy RandomState.
            eval: if True, ``run`` skips training and only evaluates.
        """
        print(hyperparameters)
        # setup matplotlib fonts (emoji font used by downstream plots)
        self.prop = FontProperties(fname="NotoColorEmoji.tff")
        plt.rcParams['font.family'] = self.prop.get_family()
        self.eval = eval
        # setup device
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # self.device= torch.device("cpu")
        # setup memory reporter
        self.reporter = MemReporter()
        # setup random seed:
        self.rng = np.random.RandomState(seed=seed)
        self.hyperparameters = hyperparameters

    def run(self):
        """Execute the full experiment inside a wandb run; return the model."""
        with wandb.init(project="bike-1b", config=self.hyperparameters,
                        name=self.hyperparameters["exp_name"], save_code=True):
            # access all HPs through wandb.config, so logging matches execution!
            self.config = wandb.config
            # make the model, data, and optimization problem
            self.make()
            # and use them to train the model
            torch.cuda.empty_cache()
            self.reporter.report()
            if self.eval == False:
                self.train()
            print("testing:")
            # and test its final performance
            self.evaluate(dataset='test')
        return self.model

    def make(self):
        """Build dataloaders, model, criterion, optimizer and LR schedule."""
        # Make the data
        self.train_loader = self.hyperparameters["dataloader"](data_set_type='train', **self.hyperparameters["dataloader_params"])
        self.test_loader = self.hyperparameters["dataloader"](data_set_type='test', **self.hyperparameters["dataloader_params"])
        self.val_loader = self.hyperparameters["dataloader"](data_set_type="val", **self.hyperparameters["dataloader_params"])
        # Tiny 4-sample val loader used only for backbone/attention visualisation.
        self.tiny_val_loader = self.hyperparameters["dataloader"](root=self.hyperparameters["dataloader_params"]["root"], data_set_type="val", data_set_size=4, normalize=True, balance=0.5, num_workers=20, data_splits={"val": 1.0}, prefetch_factor=1, batch_size=4, transforms=self.hyperparameters["tiny_transforms"], shuffle=False)
        for name, loader in zip(["train", "val", "test"], [self.train_loader, self.val_loader, self.test_loader]):
            print(f"{name} loader stats:\t number of pairs: {len(loader.dataset)}\t")
            print(f"number of positive pairs: \t {loader.dataset.num_same_ad}")
            print(f"number of negative pairs: \t {loader.dataset.num_diff_ad}")
            print(f"number of Ads used: \t {len(loader.dataset.ad_to_img.keys())}")
            print("#" * 5)
        print(f"Training set size: {len(self.train_loader.dataset)}")
        if self.hyperparameters["clear_redis"] == True:
            print("flushing redis. Expect a slower first epoch :(")
            self.train_loader.flush_redis()
        # filepaths to small batch of images to vizualise the backbone layer outputs
        self.tiny_filepaths = self.tiny_val_loader.dataset.same_ad_filenames + self.tiny_val_loader.dataset.diff_ad_filenames
        # self.tiny_filepaths = list(sum(self.tiny_filepaths, ()))  # Flatten list of tuples into list
        # self.tiny_filepaths = [a for b in self.tiny_filepaths for b in a]
        self.tiny_filepaths = list(chain.from_iterable(self.tiny_filepaths))
        tiny_image_as, tiny_image_bs, _ = next(iter(self.tiny_val_loader))
        # Flatten batch of image pairs to batch of single images
        image_list = [torch.unsqueeze(x, 0) for x in chain.from_iterable(zip(tiny_image_as, tiny_image_bs))]
        self.tiny_batch = torch.cat(image_list)
        # Make the model
        self.model = self.hyperparameters["model"](**self.config)
        # Make the loss and optimizer. Some criteria take the hyperparameter
        # dict, others take no arguments; fall back on construction failure.
        # (Narrowed from a bare `except:` so Ctrl-C/SystemExit still propagate.)
        try:
            self.criterion = self.hyperparameters["criterion"](**self.hyperparameters)
        except Exception:
            self.criterion = self.hyperparameters["criterion"]()
        self.base_optimizer = Adam(self.model.parameters(), lr=self.config.lr, weight_decay=self.config.weight_decay)
        # load weights and optimizer state if continuing:
        if self.config.starting_epoch > 0:
            path = self.config.project_path
            checkpoint = torch.load(join(path, "models", f"model_{self.config.starting_epoch}.tar"))
            self.model.load_state_dict(checkpoint["model_state_dict"])
            self.base_optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
            # The restored optimizer state keeps stale lr/decay; overwrite with
            # the current config values.
            for g in self.base_optimizer.param_groups:
                g['lr'] = self.config.lr
                g["weight_decay"] = self.config.weight_decay
        # make weights half precision if told to
        if self.config.half_precision:
            self.model.half()  # convert to half precision
            # make sure bn layers are floats for stability
            for layer in self.model.modules():
                if isinstance(layer, nn.BatchNorm2d):
                    layer.float()
        self.model.to(self.device)
        self.move_base_optimizer_to_device()
        # NOTE: named `optimizer` but is an LR scheduler over base_optimizer.
        self.optimizer = CosineAnnealingLR(self.base_optimizer, last_epoch=-1, T_max=self.hyperparameters["epochs"], eta_min=0.00002)
        # Fast-forward the schedule when resuming from a checkpoint.
        for _ in range(self.hyperparameters["starting_epoch"]):
            self.optimizer.step()

    def train(self):
        """Run the training loop, logging metrics and validating each epoch."""
        wandb.watch(self.model, self.criterion, log="all", log_freq=10)
        # Run training and track with wandb
        example_seen = 0  # number of examples seen
        batch_seen = 0
        for epoch in range(self.config.starting_epoch, self.config.epochs):
            self.model.train()
            self.current_epoch = epoch
            with tqdm(total=len(self.train_loader), ncols=120) as pbar_train:
                for data in self.train_loader:
                    torch.cuda.empty_cache()
                    self.image_as, self.image_bs, labels = data[0].to(self.device), data[1].to(self.device), data[2].to(self.device)
                    loss, outputs = self.train_batch([self.image_as, self.image_bs, labels])
                    example_seen += data[0].shape[0]
                    batch_seen += 1
                    # Report metrics every 10 batches
                    if batch_seen % 10 == 0:
                        self.model.track_metrics(outputs, epoch, step=example_seen, criterion=self.criterion, loss=loss, split="train")
                    pbar_train.update(1)
                    pbar_train.set_description(f" Epoch: {epoch} loss: {loss:.4f}")
            # validate
            torch.cuda.empty_cache()
            # reporter.report()
            self.evaluate(dataset='val', epoch=epoch)

    def train_batch(self, data):
        """One optimization step; return (loss value, detached outputs/labels)."""
        loss, outputs, labels = self.model.train_batch(data, self.criterion, self.device, self.model)
        # backward pass:
        self.base_optimizer.zero_grad()
        loss.backward()
        self.optimizer.step(epoch=self.current_epoch)
        self.base_optimizer.step()
        # Detach and move to CPU so the caller can aggregate without holding
        # onto the graph or GPU memory. Output shape depends on model variant.
        if self.hyperparameters["model"] == BaselineModel_1b:
            return loss.detach().item(), [[outputs[0].detach().cpu(), outputs[1].detach().cpu()], labels.detach().cpu()]
        elif self.hyperparameters["model"] == BaselineModel_1a:
            return loss.detach().item(), [outputs, labels.detach().cpu()]
        else:
            raise Exception("Splat")

    def evaluate(self, dataset="val", epoch=None):
        """Evaluate on the val or test split; on val, also checkpoint the model."""
        path = self.config.project_path
        # put model in evaluation mode:
        accuracies = []
        losses = []
        viz_flag = True  # visualise only the first batch
        list_of_outputs = None
        list_of_image_a_outputs = None
        list_of_image_b_outputs = None
        list_of_labels = None
        # Visualise attention maps of the model
        if self.hyperparameters["viz_attention"]:
            self.model.am_viz(self.tiny_batch, self.tiny_filepaths)
        loader = self.val_loader if dataset == "val" else self.test_loader
        with torch.no_grad():
            for data in loader:
                torch.cuda.empty_cache()
                # reporter.report()
                self.image_as, self.image_bs, labels = data[0].to(self.device), data[1].to(self.device), data[2].to(self.device)
                loss, accuracy, outputs = self.model.evaluate_batch([self.image_as, self.image_bs, labels], self.criterion, self.device, self.model)
                if viz_flag:
                    list_of_image_a_outputs = outputs[0].cpu()
                    list_of_image_b_outputs = outputs[1].cpu()
                    list_of_labels = data[2].cpu()
                    if self.hyperparameters["model"] == BaselineModel_1a:
                        self.model.visualize(data, outputs[0], epoch, number_of_figures=self.hyperparameters["number_of_figures"], unNormalizer=UnNormalize(loader.means, loader.stds))
                    viz_flag = False
                else:
                    list_of_image_a_outputs = torch.cat((list_of_image_a_outputs, outputs[0].cpu()), 0)
                    # Fix: was outputs[0] — concatenated image-A embeddings
                    # into the image-B accumulator for every batch after the
                    # first, corrupting the metrics below.
                    list_of_image_b_outputs = torch.cat((list_of_image_b_outputs, outputs[1].cpu()), 0)
                    list_of_labels = torch.cat((list_of_labels, data[2].cpu()), 0)
                losses.append(loss)
                accuracies.append(accuracy)
        list_of_outputs = [[list_of_image_a_outputs, list_of_image_b_outputs], list_of_labels]
        if dataset == "val":
            self.model.track_metrics(list_of_outputs, epoch, step=epoch, criterion=self.criterion, loss=np.mean(losses), split="val")
            # wandb.log({"{}_accuracy".format(dataset): np.mean(accuracies),"global_step":epoch})
            # wandb.log({"{}_loss".format(dataset): np.mean(losses),"global_step":epoch})
            # Save the model
            actual_path = join(path, "models")
            if not os.path.exists(actual_path):
                os.makedirs(actual_path)
            # save weights and optimizer
            torch.save({
                "epoch": epoch,
                "model_state_dict": self.model.state_dict(),
                "optimizer_state_dict": self.base_optimizer.state_dict()
            }, join(path, "models", f"model_{epoch}.tar"))
        if dataset == "test":
            self.model.track_extra_metrics(list_of_outputs, epoch, split="test")

    def move_base_optimizer_to_device(self):
        """Move every tensor in the optimizer state to ``self.device``."""
        for param in self.base_optimizer.state.values():
            # Not sure there are any global tensors in the state dict
            if isinstance(param, torch.Tensor):
                param.data = param.data.to(self.device)
                if param._grad is not None:
                    param._grad.data = param._grad.data.to(self.device)
            elif isinstance(param, dict):
                for subparam in param.values():
                    if isinstance(subparam, torch.Tensor):
                        subparam.data = subparam.data.to(self.device)
                        if subparam._grad is not None:
                            subparam._grad.data = subparam._grad.data.to(self.device)