def do_validation(net, val_loader, metrics, label_index_in_batch):
    net.eval()
    metrics.reset()
    for nbatch, batch in enumerate(val_loader):
        batch = to_cuda(batch)
        label = batch[label_index_in_batch]
        datas = [batch[i] for i in range(len(batch))
                 if i != label_index_in_batch % len(batch)]
        outputs = net(*datas)
        # handle labels whether phrases are classified or not
        if label.dim() == 2:
            outputs.update({'sentence_label': label.view(-1)})
        elif label.dim() == 3:
            sentence_label = label[:, 0, 0].view(-1)
            phrase_labels = label[:, :, 1].view(-1)
            phrase_logits = outputs['phrase_label_logits']
            # rows where every logit equals the 100000 sentinel are padding and get masked out
            logits_mask = ~((phrase_logits == 100000).all(1))
            outputs.update({
                'sentence_label': sentence_label,
                'phrase_label': phrase_labels[phrase_labels > -1],
                'phrase_label_logits': phrase_logits[logits_mask],
            })
        metrics.update(outputs)
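# The `i != label_index_in_batch % len(batch)` filter above lets the config use
# a negative label index while `i` from range() is always non-negative. For
# example, with len(batch) == 3 and label_index_in_batch == -1, -1 % 3 == 2,
# so the last element (the label) is excluded from the inputs passed to the net.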
def do_validation(net, val_loader, metrics, label_index_in_batch):
    net.eval()
    metrics.reset()
    for nbatch, batch in enumerate(val_loader):
        batch = to_cuda(batch)
        outputs, _ = net(*batch)
        metrics.update(outputs)
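# All of these loops rely on a `to_cuda` helper whose implementation is not
# part of this file. The sketch below shows only the assumed behavior (a
# recursive move of batch tensors onto the current GPU); it is not the repo's
# actual helper, hence the `_sketch` suffix.
import torch

def to_cuda_sketch(batch):
    # recursively move tensors to the current CUDA device; leave other types untouched
    if isinstance(batch, torch.Tensor):
        return batch.cuda(non_blocking=True)
    if isinstance(batch, (list, tuple)):
        return type(batch)(to_cuda_sketch(x) for x in batch)
    if isinstance(batch, dict):
        return {k: to_cuda_sketch(v) for k, v in batch.items()}
    return batch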
def test_submit(model: pl.LightningModule, test_loader, output_path):
    with torch.no_grad():
        model.eval()
        predicts = []
        for nbatch, batch in enumerate(test_loader):
            batch = to_cuda(batch)
            outputs = model(*batch[:-1])
            if outputs['label_logits'].shape[-1] == 1:
                prob = torch.sigmoid(outputs['label_logits'][:, 0]).detach().cpu().tolist()
            else:
                prob = torch.softmax(outputs['label_logits'], dim=-1)[:, 1].detach().cpu().tolist()
            sample_ids = batch[-1].cpu().tolist()
            for pb, id in zip(prob, sample_ids):
                predicts.append({
                    'id': int(id),
                    'proba': float(pb),
                    'label': int(pb > 0.5),
                })
        result_pd = pd.DataFrame.from_dict(predicts)
        result_pd.to_csv(output_path, index=False)
    model.train()
    return result_pd
def vis(model, loader, save_dir, rank=None, world_size=1):
    attention_dir = os.path.join(save_dir, 'attention_probs')
    hidden_dir = os.path.join(save_dir, 'hidden_states')
    cos_dir = os.path.join(save_dir, 'cos_similarity')
    if not os.path.exists(attention_dir):
        makedirsExist(attention_dir)
    model.eval()
    for i, data in zip(trange(len(loader)), loader):
        data = to_cuda(data)
        output = model(*data)
        for _i, (attention_probs, hidden_states) in enumerate(
                zip(output['attention_probs'], output['hidden_states'])):
            index = int(data[2][_i][-1])
            if hasattr(loader.dataset, 'ids'):
                image_id = loader.dataset.ids[index]
            else:
                image_id = loader.dataset.database[index]['image'].split('/')[1].split('.')[0]
            attention_probs_arr = attention_probs.detach().cpu().numpy()
            hidden_states_arr = hidden_states.detach().cpu().numpy()
            cos_similarity_arr = (hidden_states @ hidden_states.transpose(1, 2)).detach().cpu().numpy()
            np.save(os.path.join(attention_dir, '{}.npy'.format(image_id)),
                    attention_probs_arr)
def do_validation(net, val_loader, metrics, label_index_in_batch):
    net.eval()
    metrics.reset()
    for nbatch, batch in enumerate(val_loader):
        batch = to_cuda(batch)
        datas = [batch[i] for i in range(len(batch))]
        outputs = net(*datas)
        metrics.update(outputs)
def do_validation(net, val_loader, metrics, label_index_in_batch):
    net.eval()
    metrics.reset()
    for nbatch, batch in enumerate(val_loader):
        batch = to_cuda(batch)
        label = batch[label_index_in_batch]
        datas = [batch[i] for i in range(len(batch))
                 if i != label_index_in_batch % len(batch)]
        outputs = net(*datas)
        outputs.update({'sentence_label': label.view(-1)})
        metrics.update(outputs)
def step(a_batch, r_batch):
    a_batch = to_cuda(a_batch)
    a_label = a_batch[label_index_in_batch]
    a_datas = [a_batch[i] for i in range(len(a_batch))
               if i != label_index_in_batch % len(a_batch)]
    r_batch = to_cuda(r_batch)
    r_label = r_batch[label_index_in_batch]
    r_datas = [r_batch[i] for i in range(len(r_batch))
               if i != label_index_in_batch % len(r_batch)]
    a_outputs = answer_net(*a_datas)
    r_outputs = rationale_net(*r_datas)
    outputs = {'answer_' + k: v for k, v in a_outputs.items()}
    outputs.update({'rationale_' + k: v for k, v in r_outputs.items()})
    outputs.update({'answer_label': a_label, 'rationale_label': r_label})
    metrics.update(outputs)
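# `step` above is a closure: `answer_net`, `rationale_net`, `metrics`, and
# `label_index_in_batch` are captured from the enclosing validation function
# rather than passed in. A hypothetical driver over paired loaders (loader
# names here are illustrative only, not from the repo) would be:
#   for a_batch, r_batch in zip(answer_val_loader, rationale_val_loader):
#       step(a_batch, r_batch)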
def do_validation(net, val_loader, metrics, label_index_in_batch):
    net.eval()
    metrics.reset()
    for nbatch, batch in tqdm(enumerate(val_loader)):
        batch = to_cuda(batch)
        datas = [batch[i] for i in range(len(batch))]
        outputs = net(*datas)
        metrics.update(outputs)
def do_validation(net, val_loader, metrics, label_index_in_batch, epoch_num=0,
                  finetune_strategy='standard', policy_net=None,
                  policy_optimizer=None, global_decision=False,
                  policy_decisions=None, policy_total=None):
    net.eval()
    if finetune_strategy in PolicyVec:
        policy_net.eval()
        policy_save = torch.zeros(PolicyVec[finetune_strategy]).cpu()
        policy_max = 0
        # check if we have to make a global decision
        if global_decision:
            # calculate the policy
            policy_decisions = policy_decisions / policy_total
            policy_init = (policy_decisions > 0.5).float()
    metrics.reset()
    for nbatch, batch in enumerate(val_loader):
        batch = to_cuda(batch)
        label = batch[label_index_in_batch]
        datas = [batch[i] for i in range(len(batch))
                 if i != label_index_in_batch % len(batch)]
        if finetune_strategy in PolicyVec:
            if not global_decision:
                policy_vector = policy_net(*datas)
                policy_action = gumbel_softmax(
                    policy_vector.view(policy_vector.size(0), -1, 2))
                policy = policy_action[:, :, 1]
            else:
                # repeat to match the batch size
                policy = policy_init.repeat(batch[1].size(0), 1)
            policy_save = policy_save + policy.clone().detach().cpu().sum(0)
            policy_max += policy.size(0)
            outputs = net(*datas, policy)
        else:
            outputs = net(*datas)
        outputs.update({'label': label})
        metrics.update(outputs)
    if finetune_strategy in PolicyVec:
        # plot val visualizations
        print("Plotting val visualizations")
        vis(finetune_strategy, policy_save, policy_max, epoch_num, mode='val')
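# The discrete keep/drop decisions above are sampled with a `gumbel_softmax`
# helper whose implementation is not shown in this file. The following is a
# standard straight-through Gumbel-softmax sketch of the assumed behavior,
# not the repo's actual function.
import torch
import torch.nn.functional as F

def gumbel_softmax_sketch(logits, tau=1.0):
    # logits: (batch, num_decisions, 2); returns one-hot samples whose
    # gradients flow through the soft relaxation (straight-through estimator)
    gumbels = -torch.empty_like(logits).exponential_().log()  # Gumbel(0, 1) noise
    y_soft = F.softmax((logits + gumbels) / tau, dim=-1)
    index = y_soft.argmax(dim=-1, keepdim=True)
    y_hard = torch.zeros_like(logits).scatter_(-1, index, 1.0)
    return y_hard - y_soft.detach() + y_soft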
def do_validation(net, val_loader, metrics, label_index_in_batch,
                  model_dir=None, epoch_num=0):
    net.eval()
    metrics.reset()
    predicts = []
    for nbatch, batch in enumerate(val_loader):
        batch = to_cuda(batch)
        label = batch[label_index_in_batch]
        datas = [batch[i] for i in range(len(batch))
                 if i != label_index_in_batch % len(batch)]
        outputs = net(*datas)
        outputs.update({'label': label})
        metrics.update(outputs)
        idx = batch[-1].cpu().tolist()
        if outputs['label_logits'].shape[-1] == 1:
            prob = torch.sigmoid(outputs['label_logits'][:, 0]).detach().cpu().tolist()
        else:
            prob = torch.softmax(outputs['label_logits'], dim=-1)[:, 1].detach().cpu().tolist()
        if label.ndim == 2 and label.shape[-1] == 1:
            label = label.squeeze(dim=-1)
        label = label.cpu().tolist()
        for pb, id, lb in zip(prob, idx, label):
            predicts.append({
                'id': int(id),
                'proba': float(pb),
                'label': int(pb > 0.5),
                'target': lb,
                'error': abs(float(lb) - float(pb)),
            })
    if model_dir is not None:
        output_path = os.path.join(model_dir, f'val_{epoch_num}.json')
        with open(output_path, 'w') as f:
            json.dump(predicts, f)
        print('>>> do_validation result JSON saved to {}.'.format(output_path))
        output_path = os.path.join(model_dir, f'val_{epoch_num}.csv')
        result_pd = pd.DataFrame.from_dict(predicts)
        result_pd.to_csv(output_path, index=False)
        print('>>> do_validation result CSV saved to {}.'.format(output_path))
def do_validation(net, val_loader, metrics, label_index_in_batch):
    net.eval()
    answer = 0
    for nbatch, batch in enumerate(val_loader):
        batch = to_cuda(batch)
        score, sim = net(*batch)
        answer += score
    # guard against division by zero on an empty loader
    len_b = len(val_loader) if len(val_loader) > 0 else 1
    answer = answer / len_b
    print("batch score: ", answer)
def main():
    args, a_config, r_config = parse_args()
    if args.cudnn_off:
        torch.backends.cudnn.enabled = False
    with jsonlines.open(args.annot) as reader:
        gts = [(obj['answer_label'], obj['rationale_label']) for obj in reader]
    a_gt = np.array([gt[0] for gt in gts], dtype=np.int64)
    r_gt = np.array([gt[1] for gt in gts], dtype=np.int64)

    # cache
    a_cache_fn = os.path.join(args.result_path, '{}_a_pred.npy'.format(args.result_name))
    r_cache_fn = os.path.join(args.result_path, '{}_r_pred.npy'.format(args.result_name))
    a_pred = r_pred = None
    if not os.path.exists(args.result_path):
        os.makedirs(args.result_path)
    if args.use_cache:
        if os.path.exists(a_cache_fn):
            print("Load cached predictions from {}...".format(a_cache_fn))
            a_pred = np.load(a_cache_fn)
        if os.path.exists(r_cache_fn):
            print("Load cached predictions from {}...".format(r_cache_fn))
            r_pred = np.load(r_cache_fn)
    else:
        if a_config is not None and args.a_ckpt is not None:
            print("Build model and dataloader for Q->A...")
            # get model
            device_ids = args.gpus
            a_config.GPUS = ','.join([str(k) for k in args.gpus])
            answer_model = eval(a_config.MODULE)(a_config)
            if len(device_ids) > 1:
                answer_model = torch.nn.DataParallel(answer_model,
                                                     device_ids=device_ids).cuda()
            else:
                torch.cuda.set_device(device_ids[0])
                answer_model = answer_model.cuda()
            if args.fp16:
                [answer_model] = amp.initialize([answer_model], opt_level='O2',
                                                keep_batchnorm_fp32=False)
            a_ckpt = torch.load(args.a_ckpt, map_location=lambda storage, loc: storage)
            smart_load_model_state_dict(answer_model, a_ckpt['state_dict'])
            answer_model.eval()

            # get data loader
            a_config.DATASET.TASK = 'Q2A'
            a_config.VAL.SHUFFLE = False
            answer_loader = make_dataloader(a_config, mode='val', distributed=False)
            label_index_in_batch = a_config.DATASET.LABEL_INDEX_IN_BATCH

            print("Inference Q->A...")
            # inference (builtin `float` keeps the float64 dtype of the removed np.float alias)
            a_pred = np.zeros((len(gts), 4), dtype=float)
            i_sample = 0
            for nbatch, a_batch in zip(trange(len(answer_loader)), answer_loader):
                a_batch = to_cuda(a_batch)
                a_batch = [a_batch[i] for i in range(len(a_batch))
                           if i != label_index_in_batch % len(a_batch)]
                a_out = answer_model(*a_batch)
                a_batch_pred = a_out['label_logits']
                batch_size = a_batch_pred.shape[0]
                if a_batch_pred.dim() == 2:
                    a_pred[i_sample:(i_sample + batch_size)] = \
                        a_batch_pred.detach().cpu().numpy().astype(float, copy=False)
                elif a_batch_pred.dim() == 1:
                    assert a_batch_pred.shape[0] % 4 == 0
                    a_batch_pred = a_batch_pred.view((-1, 4))
                    a_pred[int(i_sample / 4):int((i_sample + batch_size) / 4)] = \
                        a_batch_pred.float().detach().cpu().numpy().astype(float, copy=False)
                else:
                    raise ValueError("Invalid label_logits dim: {}".format(a_batch_pred.dim()))
                i_sample += batch_size
            np.save(a_cache_fn, a_pred)

        if r_config is not None and args.r_ckpt is not None:
            print("Build model and dataloader for QA->R...")
            # get model
            device_ids = args.gpus
            r_config.GPUS = ','.join([str(k) for k in args.gpus])
            rationale_model = eval(r_config.MODULE)(r_config)
            if len(device_ids) > 1:
                rationale_model = torch.nn.DataParallel(rationale_model,
                                                        device_ids=device_ids).cuda()
            else:
                torch.cuda.set_device(device_ids[0])
                rationale_model = rationale_model.cuda()
            if args.fp16:
                [rationale_model] = amp.initialize([rationale_model], opt_level='O2',
                                                   keep_batchnorm_fp32=False)
            r_ckpt = torch.load(args.r_ckpt, map_location=lambda storage, loc: storage)
            smart_load_model_state_dict(rationale_model, r_ckpt['state_dict'])
            rationale_model.eval()

            # get data loader
            r_config.DATASET.TASK = 'QA2R'
            r_config.VAL.SHUFFLE = False
            rationale_loader = make_dataloader(r_config, mode='val', distributed=False)
            label_index_in_batch = r_config.DATASET.LABEL_INDEX_IN_BATCH

            print("Inference QA->R...")
            # inference
            r_pred = np.zeros((len(rationale_loader.dataset), 4), dtype=float)
            i_sample = 0
            for nbatch, r_batch in zip(trange(len(rationale_loader)), rationale_loader):
                r_batch = to_cuda(r_batch)
                r_batch = [r_batch[i] for i in range(len(r_batch))
                           if i != label_index_in_batch % len(r_batch)]
                r_out = rationale_model(*r_batch)
                r_batch_pred = r_out['label_logits']
                batch_size = r_batch_pred.shape[0]
                r_pred[i_sample:(i_sample + batch_size)] = \
                    r_batch_pred.float().detach().cpu().numpy().astype(float, copy=False)
                i_sample += batch_size
            np.save(r_cache_fn, r_pred)

    # evaluate
    print("Evaluate...")
    if a_pred is not None:
        acc_a = (a_pred.argmax(1) == a_gt).sum() * 1.0 / a_gt.size
        print("Q->A\t{:.1f}".format(acc_a * 100.0))
    if r_pred is not None:
        acc_r = (r_pred.argmax(1) == r_gt).sum() * 1.0 / r_gt.size
        print("QA->R\t{:.1f}".format(acc_r * 100.0))
    if a_pred is not None and r_pred is not None:
        acc_joint = ((a_pred.argmax(1) == a_gt) * (r_pred.argmax(1) == r_gt)).sum() * 1.0 / a_gt.size
        print("Q->AR\t{:.1f}".format(acc_joint * 100.0))
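# Note on the joint Q->AR metric in main() above: multiplying the two boolean
# arrays is an elementwise AND, so a sample only counts as correct when both
# the answer and the rationale are predicted correctly for that sample.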
def test_net(args, config, ckpt_path=None, save_path=None, save_name=None):
    if save_path is None:
        logger, test_output_path = create_logger(config.OUTPUT_PATH, args.cfg,
                                                 config.DATASET.IMAGE_SET,
                                                 split='test')
        save_path = test_output_path
    if ckpt_path is None:
        _, train_output_path = create_logger(config.OUTPUT_PATH, args.cfg,
                                             config.DATASET.IMAGE_SET,
                                             split='train')
        model_prefix = os.path.join(train_output_path, config.MODEL_PREFIX)
        ckpt_path = '{}-best.model'.format(model_prefix)
        print('Use best checkpoint {}...'.format(ckpt_path))
    if save_name is None:
        save_name = os.path.split(ckpt_path)[-1]
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    result_csv_path = os.path.join(save_path, '{}_test_result.csv'.format(save_name))
    if args.repredict or not os.path.isfile(result_csv_path):
        print('test net...')
        pprint.pprint(args)
        pprint.pprint(config)
        device_ids = [int(d) for d in config.GPUS.split(',')]
        shutil.copy2(ckpt_path,
                     os.path.join(save_path, '{}_test_ckpt.model'.format(config.MODEL_PREFIX)))

        # get network
        model = eval(config.MODULE)(config)
        if len(device_ids) > 1:
            model = torch.nn.DataParallel(model, device_ids=device_ids).cuda()
        else:
            model = model.cuda()
        if args.fp16:
            [model] = amp.initialize([model], opt_level='O2', keep_batchnorm_fp32=False)
        checkpoint = torch.load(ckpt_path, map_location=lambda storage, loc: storage)
        smart_load_model_state_dict(model, checkpoint['state_dict'])

        # loader
        test_loader = make_dataloader(config, mode='test', distributed=False)
        test_dataset = test_loader.dataset
        test_database = test_dataset.database

        # test
        sentence_logits = []
        test_ids = []
        sentence_labels = []
        cur_id = 0
        model.eval()
        for batch in test_loader:
            batch = to_cuda(batch)
            output = model(*batch)
            sentence_logits.append(output['sentence_label_logits'].float().detach().cpu().numpy())
            batch_size = batch[0].shape[0]
            sentence_labels.append([test_database[cur_id + k]['label'] for k in range(batch_size)])
            test_ids.append([test_database[cur_id + k]['pair_id'] for k in range(batch_size)])
            cur_id += batch_size
        sentence_logits = np.concatenate(sentence_logits, axis=0)
        test_ids = np.concatenate(test_ids, axis=0)
        sentence_labels = np.concatenate(sentence_labels, axis=0)
        if config.DATASET.ALIGN_CAPTION_IMG:
            sentence_prediction = np.argmax(sentence_logits, axis=1).reshape(-1)
        else:
            sentence_prediction = (sentence_logits > 0.).astype(int).reshape(-1)

        # generate final result csv
        dataframe = pd.DataFrame(data=sentence_prediction, columns=["sentence_pred_label"])
        dataframe['pair_id'] = test_ids
        dataframe['sentence_labels'] = sentence_labels
        # save predictions
        dataframe = dataframe.set_index('pair_id', drop=True)
        dataframe.to_csv(result_csv_path)
        print('result csv saved to {}.'.format(result_csv_path))
    else:
        print("Cache found in {}, skip test prediction!".format(result_csv_path))
        dataframe = pd.read_csv(result_csv_path)
        sentence_prediction = np.array(dataframe["sentence_pred_label"].values)
        sentence_labels = np.array(dataframe["sentence_labels"].values)

    # evaluate predictions
    for metric in ["overall_accuracy", "easy_accuracy", "alignment_accuracy"]:
        accuracy = compute_metrics_sentence_level(metric, sentence_prediction, sentence_labels)
        print("{} on test set is: {}".format(metric, str(accuracy)))
def test_net(args, config, ckpt_path=None, save_path=None, save_name=None):
    if save_path is None:
        logger, test_output_path = create_logger(config.OUTPUT_PATH, args.cfg,
                                                 config.DATASET.TEST_IMAGE_SET,
                                                 split='test')
        save_path = test_output_path
    if save_name is None:
        save_name = config.MODEL_PREFIX
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    result_csv_path = os.path.join(save_path,
                                   '{}_test_result_{}.csv'.format(save_name, config.DATASET.TASK))
    if args.use_cache and os.path.isfile(result_csv_path):
        print("Cache found in {}, skip test!".format(result_csv_path))
        return result_csv_path
    print('test net...')
    pprint.pprint(args)
    pprint.pprint(config)
    device_ids = [int(d) for d in config.GPUS.split(',')]
    if ckpt_path is None:
        _, train_output_path = create_logger(config.OUTPUT_PATH, args.cfg,
                                             config.DATASET.TRAIN_IMAGE_SET,
                                             split='train')
        model_prefix = os.path.join(train_output_path, config.MODEL_PREFIX)
        ckpt_path = '{}-best.model'.format(model_prefix)
        print('Use best checkpoint {}...'.format(ckpt_path))
    shutil.copy2(ckpt_path,
                 os.path.join(save_path, '{}_test_ckpt_{}.model'.format(config.MODEL_PREFIX,
                                                                        config.DATASET.TASK)))

    # get network
    model = eval(config.MODULE)(config)
    if len(device_ids) > 1:
        model = torch.nn.DataParallel(model, device_ids=device_ids).cuda()
    else:
        model = model.cuda()
    if args.fp16:
        [model] = amp.initialize([model], opt_level='O2', keep_batchnorm_fp32=False)
    checkpoint = torch.load(ckpt_path, map_location=lambda storage, loc: storage)
    smart_load_model_state_dict(model, checkpoint['state_dict'])

    # loader
    test_loader = make_dataloader(config, mode='test', distributed=False)
    test_dataset = test_loader.dataset
    test_database = test_dataset.database

    # test
    test_probs = []
    test_ids = []
    cur_id = 0
    model.eval()
    for nbatch, batch in zip(trange(len(test_loader)), test_loader):
        batch = to_cuda(batch)
        if config.DATASET.TASK == 'Q2A':
            output = model(*batch)
            probs = F.softmax(output['label_logits'].float(), dim=1)
            batch_size = probs.shape[0]
            test_probs.append(probs.float().detach().cpu().numpy())
            test_ids.append([test_database[cur_id + k]['annot_id'] for k in range(batch_size)])
            cur_id += batch_size
        elif config.DATASET.TASK == 'QA2R':
            # condition the rationale prediction on each of the four candidate answers
            conditioned_probs = []
            for a_id in range(4):
                q_index_in_batch = test_loader.dataset.data_names.index('question')
                q_align_mat_index_in_batch = test_loader.dataset.data_names.index('question_align_matrix')
                batch_ = [*batch]
                batch_[q_index_in_batch] = batch[q_index_in_batch][:, a_id, :, :]
                batch_[q_align_mat_index_in_batch] = batch[q_align_mat_index_in_batch][:, a_id, :, :]
                output = model(*batch_)
                probs = F.softmax(output['label_logits'].float(), dim=1)
                conditioned_probs.append(probs.float().detach().cpu().numpy())
            conditioned_probs = np.concatenate(conditioned_probs, axis=1)
            test_probs.append(conditioned_probs)
            test_ids.append([test_database[cur_id + k]['annot_id']
                             for k in range(conditioned_probs.shape[0])])
            cur_id += conditioned_probs.shape[0]
        else:
            raise ValueError('Not Support Task {}'.format(config.DATASET.TASK))
    test_probs = np.concatenate(test_probs, axis=0)
    test_ids = np.concatenate(test_ids, axis=0)
    result_npy_path = os.path.join(save_path,
                                   '{}_test_result_{}.npy'.format(save_name, config.DATASET.TASK))
    np.save(result_npy_path, test_probs)
    print('result npy saved to {}.'.format(result_npy_path))

    # generate final result csv
    if config.DATASET.TASK == 'Q2A':
        columns = ['answer_{}'.format(i) for i in range(4)]
    else:
        columns = ['rationale_conditioned_on_a{}_{}'.format(i, j)
                   for i in range(4) for j in range(4)]
    dataframe = pd.DataFrame(data=test_probs, columns=columns)
    dataframe['annot_id'] = test_ids
    dataframe = dataframe.set_index('annot_id', drop=True)
    dataframe.to_csv(result_csv_path)
    print('result csv saved to {}.'.format(result_csv_path))
    return result_csv_path
def test_net2018(args, config, ckpt_path=None, save_path=None, save_name=None):
    print('test net...')
    pprint.pprint(args)
    pprint.pprint(config)
    device_ids = [int(d) for d in config.GPUS.split(',')]
    torch.backends.cudnn.enabled = False
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    if ckpt_path is None:
        _, train_output_path = create_logger(config.OUTPUT_PATH, args.cfg,
                                             config.DATASET.TRAIN_IMAGE_SET,
                                             split='train')
        model_prefix = os.path.join(train_output_path, config.MODEL_PREFIX)
        ckpt_path = '{}-best.model'.format(model_prefix)
        print('Use best checkpoint {}...'.format(ckpt_path))
    if save_path is None:
        logger, test_output_path = create_logger(config.OUTPUT_PATH, args.cfg,
                                                 config.DATASET.TEST_IMAGE_SET,
                                                 split='test')
        save_path = test_output_path
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    shutil.copy2(ckpt_path,
                 os.path.join(save_path, '{}_test_ckpt_{}.model'.format(config.MODEL_PREFIX,
                                                                        config.DATASET.TASK)))

    # ************
    # Step 1: Select model architecture and preload trained model
    model = eval(config.MODULE)(config)
    if len(device_ids) > 1:
        model = torch.nn.DataParallel(model, device_ids=device_ids).cuda()
    else:
        torch.cuda.set_device(device_ids[0])
        model = model.cuda()
    checkpoint = torch.load(ckpt_path, map_location=lambda storage, loc: storage)
    smart_load_model_state_dict(model, checkpoint['state_dict'])

    # ************
    # Step 2: Create dataloader to include all caption-image pairs
    test_loader = make_dataloader(config, mode='test', distributed=False)
    test_dataset = test_loader.dataset
    test_database = test_dataset.database
    vocab = test_dataset.MLT_vocab

    # ************
    # Step 3: Run all pairs through model for inference
    words_de = []
    words_en = []
    captions_en = []
    captions_de = []
    logit_words = []
    logits = []
    model.eval()
    cur_id = 0
    for nbatch, batch in zip(trange(len(test_loader)), test_loader):
        bs = test_loader.batch_sampler.batch_size if test_loader.batch_sampler is not None \
            else test_loader.batch_size
        words_de.extend([test_database[id]['word_de']
                         for id in range(cur_id, min(cur_id + bs, len(test_database)))])
        words_en.extend([test_database[id]['word_en']
                         for id in range(cur_id, min(cur_id + bs, len(test_database)))])
        captions_en.extend([test_database[id]['caption_en']
                            for id in range(cur_id, min(cur_id + bs, len(test_database)))])
        captions_de.extend([test_database[id]['caption_de']
                            for id in range(cur_id, min(cur_id + bs, len(test_database)))])
        batch = to_cuda(batch)
        output = model(*batch)  # FM note: output is tuple (outputs, loss)
        probs = F.softmax(output[0]['MLT_logits'].float(), dim=1)
        logits.extend(probs.argmax(dim=1).detach().cpu().tolist())
        logit_words.extend([vocab[id]
                            for id in logits[cur_id:min(cur_id + bs, len(test_database))]])
        cur_id += bs

    # ************
    # Step 4: Store all logit results in file for later evaluation
    result = [{'logit': l_id,
               'word_en': word_en,
               'word_de': word_de,
               'word_pred': logit_word,
               'caption_en': caption_en,
               'caption_de': caption_de}
              for l_id, word_en, word_de, logit_word, caption_en, caption_de
              in zip(logits, words_en, words_de, logit_words, captions_en, captions_de)]
    cfg_name = os.path.splitext(os.path.basename(args.cfg))[0]
    result_json_path = os.path.join(
        save_path,
        '{}_MLT_{}.json'.format(cfg_name if save_name is None else save_name,
                                config.DATASET.TEST_IMAGE_SET))
    with open(result_json_path, 'w') as f:
        json.dump(result, f)
    print('result json saved to {}.'.format(result_json_path))
    return result_json_path
def test_net(args, config, ckpt_path=None, save_path=None, save_name=None):
    print('test net...')
    pprint.pprint(args)
    pprint.pprint(config)
    device_ids = [int(d) for d in config.GPUS.split(',')]
    obj_cats = config.OBJECT_CATEGORIES
    pred_cats = config.PREDICATE_CATEGORIES
    torch.backends.cudnn.enabled = False
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    if ckpt_path is None:
        _, train_output_path = create_logger(config.OUTPUT_PATH, args.cfg,
                                             config.DATASET.TRAIN_IMAGE_SET,
                                             split='train')
        model_prefix = os.path.join(train_output_path, config.MODEL_PREFIX)
        ckpt_path = '{}-best.model'.format(model_prefix)
        print('Use best checkpoint {}...'.format(ckpt_path))
    if save_path is None:
        logger, test_output_path = create_logger(config.OUTPUT_PATH, args.cfg,
                                                 config.DATASET.TEST_IMAGE_SET,
                                                 split='test')
        save_path = test_output_path
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    shutil.copy2(ckpt_path,
                 os.path.join(save_path, '{}_test_ckpt_{}.model'.format(config.MODEL_PREFIX,
                                                                        config.DATASET.TASK)))

    # get network
    model = eval(config.MODULE)(config)
    if len(device_ids) > 1:
        model = torch.nn.DataParallel(model, device_ids=device_ids).cuda()
    else:
        torch.cuda.set_device(device_ids[0])
        model = model.cuda()
    checkpoint = torch.load(ckpt_path, map_location=lambda storage, loc: storage)
    smart_load_model_state_dict(model, checkpoint['state_dict'])

    split = args.split
    loader = make_dataloader(config, mode=split, distributed=False)
    nb_of_correct_50 = nb_of_sample = nb_of_correct_top100 = 0
    model.eval()
    save_dir = ''
    if args.visualize_mask:
        # for mask visualization purposes
        save_dir = 'heatmap/vrd'
        if not os.path.isdir(save_dir):
            os.makedirs(save_dir)
    for nbatch, batch in zip(trange(len(loader)), loader):
        batch = to_cuda(batch)
        output = model(*batch)
        n_correct, n_sample, n_correct_top100 = compute_recall(
            output, obj_cats, pred_cats,
            remove_bg=config.TRAIN.SAMPLE_RELS != -1,
            visualize_mask=args.visualize_mask,
            save_dir=save_dir)
        nb_of_correct_50 += n_correct
        nb_of_correct_top100 += n_correct_top100
        nb_of_sample += n_sample
    recall_50 = nb_of_correct_50 / nb_of_sample
    recall_100 = nb_of_correct_top100 / nb_of_sample
    return recall_50, recall_100
def val_net(args, config, ckpt_path=None, save_path=None, save_name=None):
    print('val net...')
    pprint.pprint(args)
    pprint.pprint(config)
    device_ids = [int(d) for d in config.GPUS.split(',')]
    torch.backends.cudnn.enabled = False
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    if ckpt_path is None:
        _, train_output_path = create_logger(config.OUTPUT_PATH, args.cfg,
                                             config.DATASET.TRAIN_IMAGE_SET,
                                             split='train')
        model_prefix = os.path.join(train_output_path, config.MODEL_PREFIX)
        ckpt_path = '{}-best.model'.format(model_prefix)
        print('Use best checkpoint {}...'.format(ckpt_path))

    # get network
    model = eval(config.MODULE)(config)
    if hasattr(model, 'setup_adapter'):
        model.setup_adapter()
    if len(device_ids) > 1:
        model = torch.nn.DataParallel(model, device_ids=device_ids).cuda()
    else:
        torch.cuda.set_device(device_ids[0])
        model = model.cuda()
    checkpoint = torch.load(ckpt_path, map_location=lambda storage, loc: storage)
    smart_load_model_state_dict(model, checkpoint['state_dict'])

    # loader
    test_loader = make_dataloader(config, mode='val', distributed=False)
    test_dataset = test_loader.dataset
    test_database = test_dataset.database

    # validation
    predicts = []
    model.eval()
    for nbatch, batch in zip(trange(len(test_loader)), test_loader):
        batch = to_cuda(batch)
        outputs = model(*batch[:-1])
        if outputs['label_logits'].shape[-1] == 1:
            prob = torch.sigmoid(outputs['label_logits'][:, 0]).detach().cpu().tolist()
        else:
            prob = torch.softmax(outputs['label_logits'], dim=-1)[:, 1].detach().cpu().tolist()
        sample_ids = batch[-1].cpu().tolist()
        targets = batch[config.DATASET.LABEL_INDEX_IN_BATCH]
        for pb, id, tg in zip(prob, sample_ids, targets):
            predicts.append({
                'id': int(id),
                'proba': float(pb),
                'label': int(pb > 0.5),
                'target': float(tg),
            })
    pred_probs = [p['proba'] for p in predicts]
    pred_labels = [p['label'] for p in predicts]
    targets = [p['target'] for p in predicts]
    roc_auc = roc_auc_score(targets, pred_probs)
    print(f"roc_auc: {roc_auc}")
    # search the target binarization threshold that best agrees with the predicted labels
    max_accuracy = 0.0
    best_threshold = 1e-2
    for th in range(1, 100):
        targets_idx = [int(p['target'] > 1e-2 * th) for p in predicts]
        accuracy = accuracy_score(targets_idx, pred_labels)
        if accuracy > max_accuracy:
            max_accuracy = accuracy
            best_threshold = th * 1e-2
    print(f"max accuracy: {max_accuracy}, best_threshold: {best_threshold}")
def test_net(args, config, ckpt_path=None, save_path=None, save_name=None):
    print('test net...')
    pprint.pprint(args)
    pprint.pprint(config)
    device_ids = [int(d) for d in config.GPUS.split(',')]
    torch.backends.cudnn.enabled = False
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    if ckpt_path is None:
        _, train_output_path = create_logger(config.OUTPUT_PATH, args.cfg,
                                             config.DATASET.TRAIN_IMAGE_SET,
                                             split='train')
        model_prefix = os.path.join(train_output_path, config.MODEL_PREFIX)
        ckpt_path = '{}-best.model'.format(model_prefix)
        print('Use best checkpoint {}...'.format(ckpt_path))
    if save_path is None:
        logger, test_output_path = create_logger(config.OUTPUT_PATH, args.cfg,
                                                 config.DATASET.TEST_IMAGE_SET,
                                                 split='test')
        save_path = test_output_path
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    test_ckpt_path = '{}_test_ckpt_{}.model'.format(config.MODEL_PREFIX, config.DATASET.TASK)
    try:
        shutil.copy2(ckpt_path, os.path.join(save_path, test_ckpt_path))
    except shutil.SameFileError:
        print(f'Test checkpoint already exists: {test_ckpt_path}')

    # get network
    model = eval(config.MODULE)(config)
    if hasattr(model, 'setup_adapter'):
        model.setup_adapter()
    torch.cuda.set_device(min(device_ids))
    model = model.cuda()
    checkpoint = torch.load(ckpt_path, map_location=lambda storage, loc: storage)
    smart_load_model_state_dict(model, checkpoint['state_dict'])

    # loader
    test_loader = make_dataloader(config, mode='test', distributed=False)
    test_dataset = test_loader.dataset
    test_database = test_dataset.database

    # test
    predicts = []
    model.eval()
    for nbatch, batch in zip(trange(len(test_loader)), test_loader):
        batch = to_cuda(batch)
        outputs = model(*batch[:-1])
        if outputs['label_logits'].shape[-1] == 1:
            prob = torch.sigmoid(outputs['label_logits'][:, 0]).detach().cpu().tolist()
        else:
            prob = torch.softmax(outputs['label_logits'], dim=-1)[:, 1].detach().cpu().tolist()
        sample_ids = batch[-1].cpu().tolist()
        for pb, id in zip(prob, sample_ids):
            predicts.append({
                'id': int(id),
                'proba': float(pb),
                'label': int(pb > 0.5),
            })
    cfg_name = os.path.splitext(os.path.basename(args.cfg))[0]
    output_name = cfg_name if save_name is None else save_name
    result_json_path = os.path.join(save_path, f'{output_name}_cls_{config.DATASET.TEST_IMAGE_SET}.json')
    result_csv_path = os.path.join(save_path, f'{output_name}_cls_{config.DATASET.TEST_IMAGE_SET}.csv')
    with open(result_json_path, 'w') as f:
        json.dump(predicts, f)
    print('result json saved to {}.'.format(result_json_path))
    pd.DataFrame.from_dict(predicts).to_csv(result_csv_path, index=False)
    return result_json_path
def test_net(args, config):
    print('test net...')
    pprint.pprint(args)
    pprint.pprint(config)
    task = config.FOIL_TASK
    device_ids = [int(d) for d in config.GPUS.split(',')]
    config.DATASET.TEST_IMAGE_SET = args.split
    ckpt_path = args.ckpt
    save_path = args.result_path
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    shutil.copy2(ckpt_path,
                 os.path.join(save_path, '{}_test_ckpt_{}.model'.format(config.MODEL_PREFIX,
                                                                        config.DATASET.TASK)))
    torch.backends.cudnn.enabled = False
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    # get network
    model = eval(config.MODULE)(config)
    if len(device_ids) > 1:
        model = torch.nn.DataParallel(model, device_ids=device_ids).cuda()
    else:
        torch.cuda.set_device(device_ids[0])
        model = model.cuda()
    checkpoint = torch.load(ckpt_path, map_location=lambda storage, loc: storage)
    smart_load_model_state_dict(model, checkpoint['state_dict'])

    # loader
    test_loader = make_dataloader(config, mode='test', distributed=False)
    test_dataset = test_loader.dataset
    test_database = test_dataset.database

    # test
    ref_ids = []
    pred_boxes = []
    model.eval()
    cur_id = 0
    for nbatch, batch in zip(trange(len(test_loader)), test_loader):
        bs = test_loader.batch_sampler.batch_size if test_loader.batch_sampler is not None \
            else test_loader.batch_size
        ref_ids.extend([test_database[id]['ref_id']
                        for id in range(cur_id, min(cur_id + bs, len(test_database)))])
        batch = to_cuda(batch)
        output = model(*batch)
        pred_boxes.extend(output['pred_boxes'].detach().cpu().tolist())
        cur_id += bs
    result = [{'ref_id': ref_id, 'box': box} for ref_id, box in zip(ref_ids, pred_boxes)]
    result_json_path = os.path.join(
        save_path,
        '{}_refcoco+_{}.json'.format(
            config.MODEL_PREFIX if args.result_name is None else args.result_name,
            config.DATASET.TEST_IMAGE_SET))
    with open(result_json_path, 'w') as f:
        json.dump(result, f)
    print('result json saved to {}.'.format(result_json_path))

    # evaluate (test labels of refcoco+ have been released)
    print("Evaluate on split: {}...".format(config.DATASET.TEST_IMAGE_SET))
    pred_boxes_arr = np.array(pred_boxes)
    gt_boxes_arr = np.array([test_dataset.refer.getRefBox(ref_id=ref_id) for ref_id in ref_ids])
    # convert ground-truth boxes from (x, y, w, h) to (x1, y1, x2, y2)
    gt_boxes_arr[:, [2, 3]] += gt_boxes_arr[:, [0, 1]]
    iou = cacluate_iou(pred_boxes_arr, gt_boxes_arr)
    acc = float((iou >= POSITIVE_THRESHOLD).sum() * 1.0 / iou.shape[0])
    print("Accuracy: {}.".format(acc * 100.0))
    return result_json_path
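# `cacluate_iou` (sic) is an external helper whose implementation is not shown
# here. A minimal vectorized sketch of the assumed row-aligned IoU between two
# (N, 4) arrays of corner-format (x1, y1, x2, y2) boxes:
import numpy as np

def calculate_iou_sketch(boxes_a, boxes_b):
    # row i of boxes_a is compared to row i of boxes_b
    x1 = np.maximum(boxes_a[:, 0], boxes_b[:, 0])
    y1 = np.maximum(boxes_a[:, 1], boxes_b[:, 1])
    x2 = np.minimum(boxes_a[:, 2], boxes_b[:, 2])
    y2 = np.minimum(boxes_a[:, 3], boxes_b[:, 3])
    inter = np.clip(x2 - x1, 0, None) * np.clip(y2 - y1, 0, None)
    area_a = (boxes_a[:, 2] - boxes_a[:, 0]) * (boxes_a[:, 3] - boxes_a[:, 1])
    area_b = (boxes_b[:, 2] - boxes_b[:, 0]) * (boxes_b[:, 3] - boxes_b[:, 1])
    return inter / (area_a + area_b - inter)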
def test_net(args, config, ckpt_path=None, save_path=None, save_name=None):
    print('test net...')
    pprint.pprint(args)
    pprint.pprint(config)
    device_ids = [int(d) for d in config.GPUS.split(',')]
    torch.backends.cudnn.enabled = False
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    if ckpt_path is None:
        _, train_output_path = create_logger(config.OUTPUT_PATH, args.cfg,
                                             config.DATASET.TRAIN_IMAGE_SET,
                                             split='train')
        model_prefix = os.path.join(train_output_path, config.MODEL_PREFIX)
        ckpt_path = '{}-best.model'.format(model_prefix)
        print('Use best checkpoint {}...'.format(ckpt_path))
    if save_path is None:
        logger, test_output_path = create_logger(config.OUTPUT_PATH, args.cfg,
                                                 config.DATASET.TEST_IMAGE_SET,
                                                 split='test')
        save_path = test_output_path
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    shutil.copy2(ckpt_path,
                 os.path.join(save_path, '{}_test_ckpt_{}.model'.format(config.MODEL_PREFIX,
                                                                        config.DATASET.TASK)))

    # get network
    model = eval(config.MODULE)(config)
    if len(device_ids) > 1:
        model = torch.nn.DataParallel(model, device_ids=device_ids).cuda()
    else:
        torch.cuda.set_device(device_ids[0])
        model = model.cuda()
    checkpoint = torch.load(ckpt_path, map_location=lambda storage, loc: storage)
    smart_load_model_state_dict(model, checkpoint['state_dict'])

    # loader
    test_loader = make_dataloader(config, mode='test', distributed=False)
    split = args.split + 'id' if args.split == 'val' else args.split  # 'val' -> 'valid'

    # test
    if config.TEST.EXCL_LEFT_RIGHT:
        precompute_test_cache = f'{args.log_dir}/pred_{split}_{ckpt_path[-10:-6]}_excl-left-right.pickle'
    else:
        precompute_test_cache = f'{args.log_dir}/pred_{split}_{ckpt_path[-10:-6]}.pickle'
    if not os.path.isdir(args.log_dir):
        os.makedirs(args.log_dir)
    pred_file = precompute_test_cache
    if not os.path.exists(precompute_test_cache):
        _ids = []
        losses = []
        predictions = []
        model.eval()
        if args.visualize_mask:
            # for mask visualization purposes
            save_dir = 'heatmap/spasen'
            if not os.path.isdir(save_dir):
                os.makedirs(save_dir)
        for nbatch, batch in zip(trange(len(test_loader)), test_loader):
            _ids.extend(batch[0])  # the first input element is _id
            batch = to_cuda(batch)
            output = model(*batch)
            predictions.append(output['prediction'])
            losses.append(output['ans_loss'].item())
            if args.visualize_mask:
                mask = output['spo_fused_masks'].cpu()  # e.g. torch.Size([8, 3, 14, 14])
                subj_name = output['subj_name']  # list of strs
                obj_name = output['obj_name']  # list of strs
                pred_name = output['pred_name']  # list of strs
                im_path = output['im_path']  # list of img urls
                for i in range(mask.shape[0]):
                    img, dataset = read_img(im_path[i], config.IMAGEPATH)
                    img_name = dataset + '-' + im_path[i].split('/')[-1]
                    show_cam_on_image(img, mask[i], img_name, subj_name[i],
                                      obj_name[i], pred_name[i], save_dir)
        predictions = [v.item() for v in torch.cat(predictions)]
        loss = sum(losses) / len(losses)
        pickle.dump((_ids, predictions, loss), open(pred_file, 'wb'))
    accs, loss = accuracies(pred_file, 'data/spasen/annotations.json', split)
    return accs, loss
def test_translation_net(args, config, ckpt_path=None, save_path=None, save_name=None):
    print('test net...')
    pprint.pprint(args)
    pprint.pprint(config)
    device_ids = [int(d) for d in config.GPUS.split(',')]
    torch.backends.cudnn.enabled = False
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    if ckpt_path is None:
        _, train_output_path = create_logger(config.OUTPUT_PATH, args.cfg,
                                             config.DATASET.TRAIN_IMAGE_SET,
                                             split='train')
        model_prefix = os.path.join(train_output_path, config.MODEL_PREFIX)
        ckpt_path = '{}-best.model'.format(model_prefix)
        print('Use best checkpoint {}...'.format(ckpt_path))
    if save_path is None:
        logger, test_output_path = create_logger(config.OUTPUT_PATH, args.cfg,
                                                 config.DATASET.TEST_IMAGE_SET,
                                                 split='test')
        save_path = test_output_path
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    shutil.copy2(ckpt_path,
                 os.path.join(save_path, '{}_test_ckpt_{}.model'.format(config.MODEL_PREFIX,
                                                                        config.DATASET.TASK)))

    # ************
    # Step 1: Select model architecture and preload trained model
    model = eval(config.MODULE)(config)
    if len(device_ids) > 1:
        model = torch.nn.DataParallel(model, device_ids=device_ids).cuda()
    else:
        torch.cuda.set_device(device_ids[0])
        model = model.cuda()
    checkpoint = torch.load(ckpt_path, map_location=lambda storage, loc: storage)
    smart_load_model_state_dict(model, checkpoint['state_dict'])

    # ************
    # Step 2: Create dataloader to include all caption-image pairs
    test_loader = make_dataloader(config, mode='test', distributed=False)
    test_dataset = test_loader.dataset
    test_database = test_dataset.database

    # ************
    # Step 3: Run all pairs through model for inference
    caption_ids = []
    image_ids = []
    logits = []
    model.eval()
    cur_id = 0
    for nbatch, batch in zip(trange(len(test_loader)), test_loader):
        bs = test_loader.batch_sampler.batch_size if test_loader.batch_sampler is not None \
            else test_loader.batch_size
        caption_ids.extend([test_database[id]['caption_en_index']
                            for id in range(cur_id, min(cur_id + bs, len(test_database)))])
        image_ids.extend([test_database[id]['caption_de_index']
                          for id in range(cur_id, min(cur_id + bs, len(test_database)))])
        batch = to_cuda(batch)
        output = model(*batch)
        logits.extend(torch.sigmoid(output[0]['relationship_logits']).detach().cpu().tolist())
        cur_id += bs

    # ************
    # Step 4: Store all logit results in file for later evaluation
    result = [{'caption_en_index': c_id, 'caption_de_index': i_id, 'logit': l_id}
              for c_id, i_id, l_id in zip(caption_ids, image_ids, logits)]
    cfg_name = os.path.splitext(os.path.basename(args.cfg))[0]
    result_json_path = os.path.join(
        save_path,
        '{}_retrieval_translation_{}.json'.format(cfg_name if save_name is None else save_name,
                                                  config.DATASET.TEST_IMAGE_SET))
    with open(result_json_path, 'w') as f:
        json.dump(result, f)
    print('result json saved to {}.'.format(result_json_path))
    return result_json_path
def test_net(args, config, ckpt_path=None, save_path=None, save_name=None):
    print('test net...')
    pprint.pprint(args)
    pprint.pprint(config)
    device_ids = [int(d) for d in config.GPUS.split(',')]
    torch.backends.cudnn.enabled = False
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    if ckpt_path is None:
        _, train_output_path = create_logger(config.OUTPUT_PATH, args.cfg,
                                             config.DATASET.TRAIN_IMAGE_SET,
                                             split='train')
        model_prefix = os.path.join(train_output_path, config.MODEL_PREFIX)
        ckpt_path = '{}-best.model'.format(model_prefix)
        print('Use best checkpoint {}...'.format(ckpt_path))
    if save_path is None:
        logger, test_output_path = create_logger(config.OUTPUT_PATH, args.cfg,
                                                 config.DATASET.TEST_IMAGE_SET,
                                                 split='test')
        save_path = test_output_path
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    shutil.copy2(ckpt_path,
                 os.path.join(save_path, '{}_test_ckpt_{}.model'.format(config.MODEL_PREFIX,
                                                                        config.DATASET.TASK)))

    # get network
    model = eval(config.MODULE)(config)
    if len(device_ids) > 1:
        model = torch.nn.DataParallel(model, device_ids=device_ids).cuda()
    else:
        torch.cuda.set_device(device_ids[0])
        model = model.cuda()
    checkpoint = torch.load(ckpt_path, map_location=lambda storage, loc: storage)
    smart_load_model_state_dict(model, checkpoint['state_dict'])

    # loader
    test_loader = make_dataloader(config, mode='test', distributed=False)
    test_dataset = test_loader.dataset
    test_database = test_dataset.database

    # test
    q_ids = []
    answer_ids = []
    model.eval()
    cur_id = 0
    for nbatch, batch in zip(trange(len(test_loader)), test_loader):
        bs = test_loader.batch_sampler.batch_size if test_loader.batch_sampler is not None \
            else test_loader.batch_size
        q_ids.extend([str(test_database[id]['annot_id'])
                      for id in range(cur_id, min(cur_id + bs, len(test_database)))])
        batch = to_cuda(batch)
        output = model(*batch)
        answer_ids.extend(output['label_logits'].cpu().numpy().tolist())
        cur_id += bs
    result = [q_ids, answer_ids]
    cfg_name = os.path.splitext(os.path.basename(args.cfg))[0]
    result_json_path = os.path.join(
        save_path,
        '{}_vqa2_{}.json'.format(cfg_name if save_name is None else save_name,
                                 config.DATASET.TEST_IMAGE_SET))
    with open(result_json_path, 'w') as f:
        json.dump(result, f)
    print('result json saved to {}.'.format(result_json_path))
    return result_json_path
def train_net(args, config):
    # setup logger
    logger, final_output_path = create_logger(config.OUTPUT_PATH, args.cfg,
                                              config.DATASET[0].TRAIN_IMAGE_SET
                                              if isinstance(config.DATASET, list)
                                              else config.DATASET.TRAIN_IMAGE_SET,
                                              split='train')
    model_prefix = os.path.join(final_output_path, config.MODEL_PREFIX)
    if args.log_dir is None:
        args.log_dir = os.path.join(final_output_path, 'tensorboard_logs')
    pprint.pprint(args)
    logger.info('training args:{}\n'.format(args))
    pprint.pprint(config)
    logger.info('training config:{}\n'.format(pprint.pformat(config)))

    # manually set random seed
    if config.RNG_SEED > -1:
        random.seed(config.RNG_SEED)
        np.random.seed(config.RNG_SEED)
        torch.random.manual_seed(config.RNG_SEED)
        torch.cuda.manual_seed_all(config.RNG_SEED)

    # cudnn
    torch.backends.cudnn.benchmark = False
    if args.cudnn_off:
        torch.backends.cudnn.enabled = False

    if args.dist:
        model = eval(config.MODULE)(config)
        local_rank = int(os.environ.get('LOCAL_RANK') or 0)
        config.GPUS = str(local_rank)
        torch.cuda.set_device(local_rank)
        master_address = os.environ['MASTER_ADDR']
        master_port = int(os.environ['MASTER_PORT'] or 23456)
        world_size = int(os.environ['WORLD_SIZE'] or 1)
        rank = int(os.environ['RANK'] or 0)
        if args.slurm:
            distributed.init_process_group(backend='nccl')
        else:
            distributed.init_process_group(
                backend='nccl',
                init_method='tcp://{}:{}'.format(master_address, master_port),
                world_size=world_size,
                rank=rank,
                group_name='mtorch')
        print(f'native distributed, size: {world_size}, rank: {rank}, local rank: {local_rank}')
        torch.cuda.set_device(local_rank)
        config.GPUS = str(local_rank)
        model = model.cuda()
        if not config.TRAIN.FP16:
            model = DDP(model, device_ids=[local_rank], output_device=local_rank)
        if rank == 0:
            summary_parameters(model.module
                               if isinstance(model, torch.nn.parallel.DistributedDataParallel)
                               else model, logger)
            shutil.copy(args.cfg, final_output_path)
            shutil.copy(inspect.getfile(eval(config.MODULE)), final_output_path)
        writer = None
        if args.log_dir is not None:
            tb_log_dir = os.path.join(args.log_dir, 'rank{}'.format(rank))
            if not os.path.exists(tb_log_dir):
                os.makedirs(tb_log_dir)
            writer = SummaryWriter(log_dir=tb_log_dir)
        if isinstance(config.DATASET, list):
            train_loaders_and_samplers = make_dataloaders(config, mode='train',
                                                          distributed=True,
                                                          num_replicas=world_size,
                                                          rank=rank,
                                                          expose_sampler=True)
            train_loader = MultiTaskDataLoader([loader for loader, _ in train_loaders_and_samplers])
            train_sampler = train_loaders_and_samplers[0][1]
        else:
            train_loader, train_sampler = make_dataloader(config, mode='train',
                                                          distributed=True,
                                                          num_replicas=world_size,
                                                          rank=rank,
                                                          expose_sampler=True)
        batch_size = world_size * (sum(config.TRAIN.BATCH_IMAGES)
                                   if isinstance(config.TRAIN.BATCH_IMAGES, list)
                                   else config.TRAIN.BATCH_IMAGES)
        if config.TRAIN.GRAD_ACCUMULATE_STEPS > 1:
            batch_size = batch_size * config.TRAIN.GRAD_ACCUMULATE_STEPS
        base_lr = config.TRAIN.LR * batch_size
        optimizer_grouped_parameters = [{'params': [p for n, p in model.named_parameters() if _k in n],
                                         'lr': base_lr * _lr_mult}
                                        for _k, _lr_mult in config.TRAIN.LR_MULT]
        optimizer_grouped_parameters.append(
            {'params': [p for n, p in model.named_parameters()
                        if all([_k not in n for _k, _ in config.TRAIN.LR_MULT])]})
        if config.TRAIN.OPTIMIZER == 'SGD':
            optimizer = optim.SGD(optimizer_grouped_parameters,
                                  lr=config.TRAIN.LR * batch_size,
                                  momentum=config.TRAIN.MOMENTUM,
                                  weight_decay=config.TRAIN.WD)
        elif config.TRAIN.OPTIMIZER == 'Adam':
            optimizer = optim.Adam(optimizer_grouped_parameters,
                                   lr=config.TRAIN.LR * batch_size,
                                   weight_decay=config.TRAIN.WD)
        elif config.TRAIN.OPTIMIZER == 'AdamW':
            optimizer = AdamW(optimizer_grouped_parameters,
                              lr=config.TRAIN.LR * batch_size,
                              betas=(0.9, 0.999),
                              eps=1e-6,
                              weight_decay=config.TRAIN.WD,
                              correct_bias=True)
        else:
            raise ValueError('Not support optimizer {}!'.format(config.TRAIN.OPTIMIZER))
        total_gpus = world_size
    else:
        model = eval(config.MODULE)(config)
        summary_parameters(model, logger)
        shutil.copy(args.cfg, final_output_path)
        shutil.copy(inspect.getfile(eval(config.MODULE)), final_output_path)
        num_gpus = len(config.GPUS.split(','))
        assert num_gpus <= 1 or (not config.TRAIN.FP16), \
            "Not support fp16 with torch.nn.DataParallel. " \
            "Please use amp.parallel.DistributedDataParallel instead."
        total_gpus = num_gpus
        rank = None
        writer = SummaryWriter(log_dir=args.log_dir) if args.log_dir is not None else None

        # model
        if num_gpus > 1:
            model = torch.nn.DataParallel(model,
                                          device_ids=[int(d) for d in config.GPUS.split(',')]).cuda()
        else:
            torch.cuda.set_device(int(config.GPUS))
            model.cuda()

        # loader
        if isinstance(config.DATASET, list):
            train_loaders = make_dataloaders(config, mode='train', distributed=False)
            train_loader = MultiTaskDataLoader(train_loaders)
        else:
            train_loader = make_dataloader(config, mode='train', distributed=False)
        train_sampler = None
        batch_size = num_gpus * (sum(config.TRAIN.BATCH_IMAGES)
                                 if isinstance(config.TRAIN.BATCH_IMAGES, list)
                                 else config.TRAIN.BATCH_IMAGES)

    # partial load pretrain state dict
    if config.NETWORK.PARTIAL_PRETRAIN != "":
        pretrain_state_dict = torch.load(config.NETWORK.PARTIAL_PRETRAIN,
                                         map_location=lambda storage, loc: storage)['state_dict']
        prefix_change = [prefix_change.split('->')
                         for prefix_change in config.NETWORK.PARTIAL_PRETRAIN_PREFIX_CHANGES]
        if len(prefix_change) > 0:
            pretrain_state_dict_parsed = {}
            for k, v in pretrain_state_dict.items():
                no_match = True
                for pretrain_prefix, new_prefix in prefix_change:
                    if k.startswith(pretrain_prefix):
                        k = new_prefix + k[len(pretrain_prefix):]
                        pretrain_state_dict_parsed[k] = v
                        no_match = False
                        break
                if no_match:
                    pretrain_state_dict_parsed[k] = v
            pretrain_state_dict = pretrain_state_dict_parsed
        smart_partial_load_model_state_dict(model, pretrain_state_dict)

    # batch end callbacks
    batch_size = len(config.GPUS.split(',')) * (sum(config.TRAIN.BATCH_IMAGES)
                                                if isinstance(config.TRAIN.BATCH_IMAGES, list)
                                                else config.TRAIN.BATCH_IMAGES)
    batch_end_callbacks = [Speedometer(batch_size, config.LOG_FREQUENT,
                                       batches_per_epoch=len(train_loader), epochs=1)]

    # broadcast parameter from rank 0 before training start
    if args.dist:
        for v in model.state_dict().values():
            distributed.broadcast(v, src=0)

    # set net to eval mode: the Fisher information is estimated with frozen
    # batch-norm/dropout statistics; only gradients are needed
    model.eval()

    # init end time
    end_time = time.time()

    # parameters to pass to batch_end_callback
    BatchEndParam = namedtuple('BatchEndParams',
                               ['epoch', 'nbatch', 'rank', 'add_step',
                                'data_in_time', 'data_transfer_time',
                                'forward_time', 'backward_time',
                                'optimizer_time', 'metric_time',
                                'eval_metric', 'locals'])

    def _multiple_callbacks(callbacks, *args, **kwargs):
        """Sends args and kwargs to any configured callbacks.
        This handles the cases where the 'callbacks' variable
        is ``None``, a single function, or a list.
        """
        if isinstance(callbacks, list):
            for cb in callbacks:
                cb(*args, **kwargs)
            return
        if callbacks:
            callbacks(*args, **kwargs)

    # initialize Fisher
    fisher = {}
    for n, p in model.named_parameters():
        fisher[n] = p.new_zeros(p.size())
        p.requires_grad = True
        p.retain_grad()

    # training
    for nbatch, batch in enumerate(train_loader):
        model.zero_grad()
        global_steps = len(train_loader) + nbatch
        os.environ['global_steps'] = str(global_steps)

        # record time
        data_in_time = time.time() - end_time

        # transfer data to GPU
        data_transfer_time = time.time()
        batch = to_cuda(batch)
        data_transfer_time = time.time() - data_transfer_time

        # forward
        forward_time = time.time()
        outputs, loss = model(*batch)
        loss = loss.mean()
        forward_time = time.time() - forward_time

        # backward
        backward_time = time.time()
        loss.backward()
        backward_time = time.time() - backward_time

        # accumulate the diagonal Fisher information: squared gradients averaged over batches
        for n, p in model.named_parameters():
            assert p.grad is not None, 'missing gradient for parameter {}'.format(n)
            fisher[n] += p.grad ** 2 / len(train_loader)

        batch_end_params = BatchEndParam(epoch=0, nbatch=nbatch, add_step=True, rank=rank,
                                         data_in_time=data_in_time,
                                         data_transfer_time=data_transfer_time,
                                         forward_time=forward_time,
                                         backward_time=backward_time,
                                         optimizer_time=0.,
                                         metric_time=0.,
                                         eval_metric=None,
                                         locals=locals())
        _multiple_callbacks(batch_end_callbacks, batch_end_params)
        # reset the timer so data_in_time measures only the next batch's loading
        end_time = time.time()

    with open(os.path.join(config.EWC_STATS_PATH, "fisher"), "wb") as fisher_file:
        pickle.dump(fisher, fisher_file)
    torch.save(model.state_dict(), os.path.join(config.EWC_STATS_PATH, "params_pretrain"))