def test_abs(args, device_id, pt, step):
    """Run abstractive-summarization decoding on the test split from a saved checkpoint."""
    device = "cuda" if args.visible_gpus != '-1' else "cpu"
    # An explicitly supplied checkpoint path wins over args.test_from.
    ckpt_path = pt if pt != '' else args.test_from
    logger.info('Loading checkpoint from %s' % ckpt_path)
    checkpoint = torch.load(ckpt_path, map_location=lambda storage, loc: storage)
    # Re-apply the model-defining flags recorded at training time.
    saved_opt = vars(checkpoint['opt'])
    for key, value in saved_opt.items():
        if key in model_flags:
            setattr(args, key, value)
    print(args)
    symbols, tokenizer = get_symbol_and_tokenizer(args.encoder, args.temp_dir)
    model = AbsSummarizer(args, device, checkpoint, symbols=symbols)
    model.eval()
    test_iter = data_loader.Dataloader(args,
                                       load_dataset(args, 'test', shuffle=False),
                                       args.test_batch_size, device,
                                       shuffle=False, is_test=True,
                                       tokenizer=tokenizer)
    predictor = build_predictor(args, tokenizer, symbols, model, logger)
    predictor.translate(test_iter, step)
def test_abs(args, device_id, pt, step):
    """Decode the test split with an abstractive summarizer (BART or multilingual BERT).

    Args:
        args: run configuration; entries listed in model_flags are overwritten
            from the checkpoint's saved options.
        device_id: GPU id (unused here; device derives from args.visible_gpus).
        pt: explicit checkpoint path; '' falls back to args.test_from.
        step: checkpoint step, forwarded to the predictor for result naming.
    """
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    # An explicitly passed checkpoint path takes precedence over args.test_from.
    if (pt != ''):
        test_from = pt
    else:
        test_from = args.test_from
    logger.info('Loading checkpoint from %s' % test_from)
    # map_location keeps all tensors on CPU regardless of the saving device.
    checkpoint = torch.load(test_from, map_location=lambda storage, loc: storage)
    # Restore the model-defining flags recorded at training time.
    opt = vars(checkpoint['opt'])
    for k in opt.keys():
        if (k in model_flags):
            setattr(args, k, opt[k])
    print(args)
    model = AbsSummarizer(args, device, checkpoint)
    model.eval()
    test_iter = data_loader.Dataloader(args, load_dataset(args, 'test', shuffle=False),
                                       args.test_batch_size, device,
                                       shuffle=False, is_test=True)
    # Register the [unusedN] markers as additional special tokens so the Chinese
    # tokenizer does not split the word 'unused' into sub-pieces.
    add_token_list = ['[unused1]', '[unused2]', '[unused3]', '[unused4]', '[unused5]']
    if args.bart:
        tokenizer = AutoTokenizer.from_pretrained('bart-base', do_lower_case=True,
                                                  cache_dir=args.temp_dir,
                                                  local_files_only=False)
        # BART branch uses fairseq's 'madeupword' placeholder entries in place
        # of BERT's [unusedN] slots; PAD is hard-coded to id 0 here.
        symbols = {'BOS': tokenizer.encoder['madeupword0000'],
                   'EOS': tokenizer.encoder['madeupword0001'],
                   'PAD': 0,
                   'EOQ': tokenizer.encoder['madeupword0002']}
    else:
        tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased',
                                                  do_lower_case=True,
                                                  cache_dir=args.temp_dir,
                                                  local_files_only=False,
                                                  additional_special_tokens=add_token_list)
        symbols = {'BOS': tokenizer.vocab['[unused1]'],
                   'EOS': tokenizer.vocab['[unused2]'],
                   'PAD': tokenizer.vocab['[PAD]'],
                   'EOQ': tokenizer.vocab['[unused3]']}
    predictor = build_predictor(args, tokenizer, symbols, model, logger)
    predictor.translate(test_iter, step)
def test_text_abs(args):
    """Summarize raw text (args.text_src / args.text_tgt) with a trained abs model."""
    logger.info('Loading checkpoint from %s' % args.test_from)
    device = "cuda" if args.visible_gpus != '-1' else "cpu"
    checkpoint = torch.load(args.test_from, map_location=lambda storage, loc: storage)
    # Re-apply the model-defining flags stored in the checkpoint.
    for key, value in vars(checkpoint['opt']).items():
        if key in model_flags:
            setattr(args, key, value)
    print(args)
    logger.info('Loading args inside test_text_abs %s' % args)
    model = AbsSummarizer(args, device, checkpoint)
    model.eval()
    # Raw-text loader (not the preprocessed binary dataset).
    test_iter = data_loader.load_text(args, args.text_src, args.text_tgt, device)
    logger.info('test_iter is %s' % test_iter)
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True,
                                              cache_dir=args.temp_dir)
    vocab = tokenizer.vocab
    symbols = {
        'BOS': vocab['[unused0]'],
        'EOS': vocab['[unused1]'],
        'PAD': vocab['[PAD]'],
        'EOQ': vocab['[unused2]']
    }
    logger.info('symbols is %s' % symbols)
    predictor = build_predictor(args, tokenizer, symbols, model, logger)
    # step=-1 marks an ad-hoc run rather than a specific checkpoint step.
    predictor.translate(test_iter, -1)
def test_text_abs(args, device_id, pt, step, tokenizer):
    """Decode the test split using a caller-supplied tokenizer."""
    device = "cuda" if args.visible_gpus != '-1' else "cpu"
    checkpoint_path = pt if pt != '' else args.test_from
    logger.info('Loading checkpoint from %s' % checkpoint_path)
    checkpoint = torch.load(checkpoint_path, map_location=lambda storage, loc: storage)
    # Flags saved at training time override the current args.
    for key, value in vars(checkpoint['opt']).items():
        if key in model_flags:
            setattr(args, key, value)
    print(args)
    model = AbsSummarizer(args, device, checkpoint)
    model.eval()
    test_iter = data_loader.Dataloader(args,
                                       load_dataset(args, 'test', shuffle=False),
                                       args.test_batch_size, device,
                                       shuffle=False, is_test=True)
    # Control-symbol ids come from the supplied tokenizer (no EOQ symbol here).
    to_id = tokenizer.convert_tokens_to_ids
    symbols = {'BOS': to_id('<s>'),
               'EOS': to_id('</s>'),
               'PAD': to_id('[PAD]')}
    predictor = build_predictor(args, tokenizer, symbols, model, logger)
    predictor.translate(test_iter, step)
def val_abs(self, args, iter_fct, step):
    """Run validation-time abstractive decoding; returns translate() output with entities."""
    self.model.eval()
    vocab = self.tokenizer.vocab
    # Decoder control-symbol ids mapped onto BERT's unused vocab slots.
    symbols = {
        'BOS': vocab['[unused0]'],
        'EOS': vocab['[unused1]'],
        'PAD': vocab['[PAD]'],
        'EOQ': vocab['[unused2]'],
    }
    predictor = build_predictor(args, self.tokenizer, symbols, self.model, logger)
    return predictor.translate(iter_fct, step, return_entities=True)
def test_text_abs(args, device_id, pt, step):
    """Decode the test split with an abstractive model restored from a checkpoint."""
    device = "cuda" if args.visible_gpus != '-1' else "cpu"
    # An explicit checkpoint path wins over args.test_from.
    checkpoint_path = pt if pt != '' else args.test_from
    logger.info('Loading checkpoint from %s' % checkpoint_path)
    checkpoint = torch.load(checkpoint_path, map_location=lambda storage, loc: storage)
    # Re-apply the model-defining flags stored in the checkpoint.
    for name, value in vars(checkpoint['opt']).items():
        if name in model_flags:
            setattr(args, name, value)
    print(args)
    model = AbsSummarizer(args, device, checkpoint)
    model.eval()
    test_iter = data_loader.Dataloader(args,
                                       load_dataset(args, 'test', shuffle=False),
                                       args.test_batch_size, device,
                                       shuffle=False, is_test=True)
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True,
                                              cache_dir=args.temp_dir)
    vocab = tokenizer.vocab
    symbols = {'BOS': vocab['[unused0]'],
               'EOS': vocab['[unused1]'],
               'PAD': vocab['[PAD]'],
               'EOQ': vocab['[unused2]']}
    predictor = build_predictor(args, tokenizer, symbols, model, logger)
    predictor.translate(test_iter, step)
def test_text(args, device_id, pt, step):
    """Decode the test split; tokenizer vocab is passed into the summarizer itself."""
    device = "cuda" if args.visible_gpus != '-1' else "cpu"
    ckpt_path = pt if pt != '' else args.test_from
    logger.info('Loading checkpoint from %s' % ckpt_path)
    checkpoint = torch.load(ckpt_path, map_location=lambda storage, loc: storage)
    # Flags saved at training time override the current args.
    for key, value in vars(checkpoint['opt']).items():
        if key in model_flags:
            setattr(args, key, value)
    print(args)
    tokenizer = BertTokenizer.from_pretrained(args.bert_dir)
    model = AbsSummarizer(args, device, tokenizer.vocab, checkpoint)
    model.eval()
    # This Dataloader variant also takes a per-example batch size.
    test_iter = data_loader.Dataloader(args,
                                       load_dataset(args, 'test', shuffle=False),
                                       args.test_batch_size,
                                       args.test_batch_ex_size,
                                       device, shuffle=False, is_test=True)
    # This build_predictor signature carries no symbols dict.
    predictor = build_predictor(args, tokenizer, model, logger)
    predictor.translate(test_iter, step)
def test_abs(args, device_id, pt, step):
    """
    Implements testing process (meta / non-meta)

    Arguments:
        device_id (int) : the GPU id to be used
        pt : checkpoint model path ('' falls back to args.test_from)
        step (int) : checkpoint step

    Process:
        - load checkpoint
        - prepare dataloader class
        - prepare model class (MTLAbsSummarizer when args.meta_mode is set)
        - prepare predictor
        - predictor.translate()
    """
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    logger.info('Device ID %d', device_id)
    logger.info('Device %s', device)

    # Load checkpoint: an explicit `pt` takes precedence over args.test_from
    if (pt != ''):
        test_from = pt
    else:
        test_from = args.test_from
    logger.info('Loading checkpoint from %s' % test_from)
    # map_location keeps tensors on CPU regardless of where they were saved
    checkpoint = torch.load(test_from, map_location=lambda storage, loc: storage)
    # Restore the model-defining flags recorded in the checkpoint's opt
    opt = vars(checkpoint['opt'])
    for k in opt.keys():
        if (k in model_flags):
            setattr(args, k, opt[k])

    # Prepare dataloader
    test_iter = data_loader.Dataloader(args, load_dataset(args, 'test', shuffle=False),
                                       args.test_batch_size, device,
                                       shuffle=False, is_test=True)

    # Prepare model: meta-learning variant when meta_mode is enabled
    if (args.meta_mode):
        model = MTLAbsSummarizer(args, device, checkpoint)
    else:
        model = AbsSummarizer(args, device, checkpoint)
    model.eval()

    # Prepare predictor; control symbols map onto BERT's unused vocab slots
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True,
                                              cache_dir=args.temp_dir)
    symbols = {
        'BOS': tokenizer.vocab['[unused0]'],
        'EOS': tokenizer.vocab['[unused1]'],
        'PAD': tokenizer.vocab['[PAD]'],
        'EOQ': tokenizer.vocab['[unused2]']
    }
    predictor = build_predictor(args, tokenizer, symbols, model, logger)
    predictor.translate(test_iter, step)  # long-running decoding step
def test_model(self, corpus_type, topn=0):
    """Evaluate the top-n GuidAbs checkpoint on `corpus_type`; returns average F1.

    Skips the run entirely (returns None) if the finished-marker file already
    exists; writes the marker via `touch` on success.
    """
    model_file = _top_model(self.model_path, n=topn)
    logger.info('Test GuidAbs model %s' % model_file)
    # Marker file guards against re-running a finished evaluation.
    fn_touch = path.join(
        self.model_path,
        'finished_%s.test_guidabs_model%s' % (corpus_type, topn))
    if path.exists(fn_touch):
        return
    args = self._build_abs_args()
    args.mode = 'test'
    args.bert_data_path = path.join(self.data_path, 'cnndm')
    args.model_path = self.model_path
    args.log_file = path.join(
        self.model_path,
        'test_abs_bert_cnndm_%s_top%s.log' % (corpus_type, topn))
    args.result_path = path.join(self.model_path, 'cnndm_%s_top%s' % (corpus_type, topn))
    init_logger(args.log_file)
    # NOTE(review): `step` duplicates `step_abs` below and is never used — dead code?
    step = int(model_file.split('.')[-2].split('_')[-1])
    # load abs model (checkpoint step is encoded in the file name)
    step_abs = int(model_file.split('.')[-2].split('_')[-1])
    checkpoint = torch.load(model_file, map_location=lambda storage, loc: storage)
    model_abs = model_bld.AbsSummarizer(args, args.device, checkpoint)
    model_abs.eval()
    # init model testers
    tokenizer = BertTokenizer.from_pretrained(path.join(
        args.bert_model_path, model_abs.bert.model_name),
        do_lower_case=True, cache_dir=args.temp_dir)
    # Decoder control-symbol ids (BERT's unused vocab slots)
    symbols = {
        'BOS': tokenizer.vocab['[unused0]'],
        'EOS': tokenizer.vocab['[unused1]'],
        'PAD': tokenizer.vocab['[PAD]'],
        'EOQ': tokenizer.vocab['[unused2]']
    }
    predictor = pred_abs.build_predictor(args, tokenizer, symbols, model_abs, logger)
    # keep_order=True so results line up with the input corpus
    test_iter = data_ldr.Dataloader(args,
                                    data_ldr.load_dataset(args, corpus_type, shuffle=False),
                                    args.test_batch_size, args.device,
                                    shuffle=False, is_test=True, keep_order=True)
    avg_f1 = test_abs(logger, args, predictor, step_abs, test_iter)
    # Mark this evaluation as finished.
    os.system('touch %s' % fn_touch)
    return avg_f1
def build_data(self, loader):
    """ Build polluted textgraph by adversarial trained generator"""
    print("Build Pollued data by adv model from {}".format(self.args.savepath))
    dataset = TreeDataset(self.args, loader, self.dataname, self.device, self.tokenizer)
    # Single batch covering the whole dataset, sorted by source length.
    data_iter = Iterator(dataset, train=False, device=self.device,
                         batch_size=len(dataset),
                         sort_key=lambda x: len(x.src),
                         sort_within_batch=False)
    # Restore the best adversarially-trained generator weights.
    best_adv_model = os.path.join(self.args.savepath, 'best_adv_model.pt')
    state = torch.load(best_adv_model, map_location=lambda storage, loc: storage)
    self.model.load_state_dict(state['model'])
    predictor = build_predictor(self.args, self.model, self.tokenizer,
                                self.symbols, logger)
    predictor.build(data_iter)
def test_abs(args, device_id, pt, step):
    """Decode the test split with a PhoBERT (fastBPE) abstractive summarizer.

    Args:
        args: run configuration; model_flags entries are overwritten from the checkpoint.
        device_id: GPU id (unused; device derives from args.visible_gpus).
        pt: explicit checkpoint path; '' falls back to args.test_from.
        step: checkpoint step forwarded to the predictor.
    """
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    # An explicitly passed checkpoint path takes precedence over args.test_from.
    if (pt != ''):
        test_from = pt
    else:
        test_from = args.test_from
    logger.info('Loading checkpoint from %s' % test_from)
    checkpoint = torch.load(test_from, map_location=lambda storage, loc: storage)
    # Restore the model-defining flags recorded at training time.
    opt = vars(checkpoint['opt'])
    for k in opt.keys():
        if (k in model_flags):
            setattr(args, k, opt[k])
    print(args)
    model = AbsSummarizer(args, device, checkpoint)
    model.eval()
    test_iter = data_loader.Dataloader(args, load_dataset(args, 'test', shuffle=False),
                                       args.test_batch_size, device,
                                       shuffle=False, is_test=True)
    # NOTE(review): BPE-codes path is hard-coded to a Colab-style /content
    # location — confirm it exists in the deployment environment.
    parser = argparse.ArgumentParser()
    parser.add_argument('--bpe-codes',
                        default="/content/PhoBERT_base_transformers/bpe.codes",
                        required=False, type=str, help='path to fastBPE BPE')
    args1, unknown = parser.parse_known_args()
    bpe = fastBPE(args1)
    # Load the dictionary (also a hard-coded /content path — verify).
    vocab = Dictionary()
    vocab.add_from_file("/content/PhoBERT_base_transformers/dict.txt")
    # NOTE(review): `tokenizer` is assigned but never used — build_predictor
    # below receives `vocab` instead. Confirm this is intentional.
    tokenizer = bpe
    symbols = {
        'BOS': vocab.indices['[unused0]'],
        'EOS': vocab.indices['[unused1]'],
        'PAD': vocab.indices['[PAD]'],
        'EOQ': vocab.indices['[unused2]']
    }
    predictor = build_predictor(args, vocab, symbols, model, logger)
    predictor.translate(test_iter, step)
def test_detector(self, loader, run_gen=False, portion='all'):
    """Evaluate the rumor detector on `loader`.

    Runs the stage-1 best model without generated responses, then (when
    args.test_adv is set) the stage-2 adversarially-trained model with
    generated responses, appending its rows to the same result file.

    Args:
        loader: raw data used to build the TreeDataset.
        run_gen: unused here; kept for interface compatibility with callers.
        portion: tag recorded in the results row ('test-<portion>').
    """
    dataset = TreeDataset(self.args, loader, self.dataname, self.device, self.tokenizer)
    data_iter = Iterator(dataset, train=False, device=self.device,
                         batch_size=len(dataset) if len(dataset) < self.bs else self.bs,
                         sort_key=lambda x: len(x.src),
                         sort_within_batch=False)
    # Trainer is built in evaluation mode (train=False loss).
    train_loss = abs_loss(self.args.label_num, self.maxepoch, self.device, train=False)
    trainer = build_trainer(self.args, self.model, self.optim, train_loss)

    logger.info('Test on best model (stage-1)')
    best_model = os.path.join(self.args.savepath, 'best_model.pt')
    if os.path.exists(best_model):
        state = torch.load(best_model, map_location=lambda storage, loc: storage)['model']
        try:
            self.model.load_state_dict(state)
        except Exception:
            # Fall back to a non-strict load when checkpoint keys drifted.
            self.model.load_state_dict(state, strict=False)
            logger.info('[Warning] The keys in state dict do not strictly match')
        test_stat = trainer.testing(data_iter, tokenizer=self.tokenizer, gen_flag=False,
                                    info="Without Generated Response >>", write_type="w")
        test_stat.write_results(os.path.join(self.args.savepath, 'result_test.csv'),
                                'test-'+portion, self.args.label_num)

    if self.args.test_adv:
        logger.info('Test on adversarially-trained model (stage-2)')
        best_model = os.path.join(self.args.savepath, 'best_adv_model.pt')
        if os.path.exists(best_model):
            state = torch.load(best_model, map_location=lambda storage, loc: storage)['model']
            try:
                self.model.load_state_dict(state)
            except Exception:
                self.model.load_state_dict(state, strict=False)
                logger.info('[Warning] The keys in state dict do not strictly match')
            # BUG FIX: original passed `write_type=="a"` — an equality comparison
            # positioned after keyword arguments (a SyntaxError); the intended
            # call is the keyword argument write_type="a" (append mode).
            test_stat = trainer.testing(
                data_iter, tokenizer=self.tokenizer, gen_flag=True,
                info="\nWith Generated Response from {} >>".format(best_model.split("/")[-1]),
                write_type="a")
            predictor = build_predictor(self.args, self.model, self.tokenizer,
                                        self.symbols, logger)
            predictor.translate(data_iter, 'best', have_gold=False)
def __init__(self, abs_model_file):
    """Restore a GuidAbs summarizer, its tokenizer and predictor from a checkpoint file."""
    self.args = self._build_abs_args()
    # Restore the abstractive model; step is encoded in the checkpoint file name.
    step_abs = int(abs_model_file.split('.')[-2].split('_')[-1])
    checkpoint = torch.load(abs_model_file, map_location=lambda storage, loc: storage)
    self.model_abs = model_bld.AbsSummarizer(self.args, self.args.device, checkpoint)
    self.model_abs.eval()
    # Tokenizer matching the restored model, plus the decoding predictor.
    self.tokenizer = BertTokenizer.from_pretrained(
        path.join(self.args.bert_model_path, self.model_abs.bert.model_name),
        do_lower_case=True)
    vocab = self.tokenizer.vocab
    self.symbols = {'BOS': vocab['[unused0]'],
                    'EOS': vocab['[unused1]'],
                    'PAD': vocab['[PAD]'],
                    'EOQ': vocab['[unused2]']}
    self.predictor = pred_abs.build_predictor(self.args, self.tokenizer,
                                              self.symbols, self.model_abs, logger)
    # Cache the structural special tokens and their vocabulary ids.
    self.sep_token = '[SEP]'
    self.cls_token = '[CLS]'
    self.pad_token = '[PAD]'
    self.sep_vid = vocab[self.sep_token]
    self.cls_vid = vocab[self.cls_token]
    self.pad_vid = vocab[self.pad_token]
def test_abs(args, device_id, pt, step):
    """Decode the test split with a KoBERT-based abstractive summarizer.

    Args:
        args: run configuration; model_flags entries are overwritten from the checkpoint.
        device_id: GPU id (unused; device derives from args.visible_gpus).
        pt: fallback checkpoint path, used only when args.test_from is empty.
        step: checkpoint step forwarded to the predictor.
    """
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    # NOTE(review): sibling loaders prefer `pt` when it is non-empty; this one
    # prefers args.test_from and only falls back to `pt` — confirm the
    # inverted precedence is intentional.
    if not (args.test_from):
        test_from = pt
    else:
        test_from = args.test_from
    logger.info('Loading checkpoint from %s' % test_from)
    checkpoint = torch.load(test_from, map_location=lambda storage, loc: storage)
    # Restore the model-defining flags recorded at training time.
    opt = vars(checkpoint['opt'])
    for k in opt.keys():
        if (k in model_flags):
            setattr(args, k, opt[k])
    print(args)
    model = AbsSummarizer(args, device, checkpoint)
    model.eval()
    test_iter = data_loader.Dataloader(args, load_dataset(args, 'test', shuffle=False),
                                       args.test_batch_size, device,
                                       shuffle=False, is_test=True)
    # ETRI KoBERT vocabulary loaded from a relative local path —
    # TODO confirm the path is valid in the deployment environment.
    tokenizer = BertTokenizer.from_pretrained(
        '../ETRI_koBERT/003_bert_eojeol_pytorch/vocab.txt',
        do_lower_case=False, cache_dir=args.temp_dir)
    if not args.share_emb:
        tokenizer = add_tokens(tokenizer)
    # <S>/<T> serve as BOS/EOS; this setup defines no EOQ symbol.
    symbols = {
        'BOS': tokenizer.vocab['<S>'],
        'EOS': tokenizer.vocab['<T>'],
        'PAD': tokenizer.vocab['[PAD]']
    }
    predictor = build_predictor(args, tokenizer, symbols, model, logger)
    predictor.translate(test_iter, step)
def test_abs(args, device_id, pt, step):
    """Decode the test split with a Chinese RoBERTa-wwm-ext abstractive summarizer."""
    device = "cuda" if args.visible_gpus != "-1" else "cpu"
    ckpt_file = pt if pt != "" else args.test_from
    logger.info("Loading checkpoint from %s" % ckpt_file)
    checkpoint = torch.load(ckpt_file, map_location=lambda storage, loc: storage)
    # Flags recorded at training time override the current args.
    for flag, value in vars(checkpoint["opt"]).items():
        if flag in model_flags:
            setattr(args, flag, value)
    print(args)
    model = AbsSummarizer(args, device, checkpoint)
    model.eval()
    test_iter = data_loader.Dataloader(args,
                                       load_dataset(args, "test", shuffle=False),
                                       args.test_batch_size, device,
                                       shuffle=False, is_test=True)
    tokenizer = BertTokenizer.from_pretrained("chinese_roberta_wwm_ext_pytorch",
                                              do_lower_case=True,
                                              cache_dir=args.temp_dir)
    vocab = tokenizer.vocab
    # This vocabulary starts its control symbols at [unused1].
    symbols = {"BOS": vocab["[unused1]"],
               "EOS": vocab["[unused2]"],
               "PAD": vocab["[PAD]"],
               "EOQ": vocab["[unused3]"]}
    predictor = build_predictor(args, tokenizer, symbols, model, logger)
    predictor.translate(test_iter, step)
def test_abs(args, device_id, pt, step):
    """Decode the test split, reusing the preprocessing (BertData) tokenizer."""
    device = "cuda" if args.visible_gpus != '-1' else "cpu"
    checkpoint_path = pt if pt != '' else args.test_from
    logger.info('Loading checkpoint from %s' % checkpoint_path)
    checkpoint = torch.load(checkpoint_path, map_location=lambda storage, loc: storage)
    # Flags saved at training time override the current args.
    for key, value in vars(checkpoint['opt']).items():
        if key in model_flags:
            setattr(args, key, value)
    print(args)
    model = AbsSummarizer(args, device, checkpoint)
    model.eval()
    test_iter = data_loader.Dataloader(args,
                                       load_dataset(args, 'test', shuffle=False),
                                       args.test_batch_size, device,
                                       shuffle=False, is_test=True)
    # Use the exact tokenizer the data was preprocessed with.
    tokenizer = BertData(args).tokenizer
    to_id = tokenizer.convert_tokens_to_ids
    symbols = {'BOS': to_id('[unused0]'),
               'EOS': to_id('[unused1]'),
               'PAD': to_id('[PAD]'),
               'EOQ': to_id('[unused2]')}
    predictor = build_predictor(args, tokenizer, symbols, model, logger)
    predictor.translate(test_iter, step)
def __init__(self, model_path='cache/abs_bert_model.pt'):
    """Load the abstractive BERT summarizer, downloading the checkpoint on first use."""
    cache_dir = 'cache'
    if not os.path.exists('cache'):
        os.mkdir('cache')
    # Fetch the checkpoint if it is not cached yet.
    if not os.path.exists(model_path):
        print('Model not found in cache')
        self.download_model(model_path)
    # Cache directory is shared by the BERT weights and the tokenizer.
    if not os.path.exists(cache_dir):
        os.mkdir(cache_dir)
    checkpoint = torch.load(model_path, map_location=lambda storage, loc: storage)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    self.model = BertSummarizer(checkpoint, device, cache_dir)
    self.model.eval()
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                              do_lower_case=True,
                                              cache_dir=cache_dir)
    self.predictor = build_predictor(tokenizer, self.model)
def main(method, labels, unit_num, conv_layers, class_num, n_layers, dropout_ratio, model_path, save_path):
    """Compare VanillaGrad / SmoothGrad / BayesGrad saliency on pyridine molecules.

    Loads a trained graph-conv predictor, selects validation molecules that
    contain a pyridine substructure, computes saliency with three methods,
    and plots precision-recall of recovering the pyridine atoms.

    NOTE(review): `calculator_method = args.calculator` below reads a global
    `args` that is not a parameter of this function — confirm it is defined
    at module level before main() is called.
    """
    # Dataset preparation
    train, val, test, train_smiles, val_smiles, test_smiles = data.load_dataset(method, labels)
    # --- model preparation ---
    model = predictor.build_predictor(
        method, unit_num, conv_layers, class_num, dropout_ratio, n_layers)
    classifier = L.Classifier(model, lossfun=F.sigmoid_cross_entropy,
                              accfun=F.binary_accuracy)
    print('Loading model parameter from ', model_path)
    serializers.load_npz(model_path, model)
    # Evaluation targets are the validation molecules.
    target_dataset = val
    target_smiles = val_smiles
    val_mols = [Chem.MolFromSmiles(smi) for smi in tqdm(val_smiles)]
    pyridine_mol = Chem.MolFromSmarts(PYRIDINE_SMILES)
    # Indices of validation molecules containing a pyridine substructure.
    pyridine_index = np.where(np.array([mol.HasSubstructMatch(pyridine_mol) for mol in val_mols]) == True)
    val_pyridine_mols = np.array(val_mols)[pyridine_index]

    # It only extracts one substructure, not expected behavior
    # val_pyridine_pos = [set(mol.GetSubstructMatch(pi)) for mol in val_pyridine_mols]
    def flatten_tuple(x):
        # Flatten the tuple-of-tuples returned by GetSubstructMatches.
        return [element for tupl in x for element in tupl]

    # Ground-truth atom positions: every atom of every pyridine match.
    val_pyridine_pos = [flatten_tuple(mol.GetSubstructMatches(pyridine_mol)) for mol in val_pyridine_mols]
    # print('pyridine_index', pyridine_index)
    # print('val_pyridine_mols', val_pyridine_mols.shape)
    # print('val_pyridine_pos', val_pyridine_pos)
    # print('val_pyridine_pos length', [len(k) for k in val_pyridine_pos])
    pyrigine_dataset = NumpyTupleDataset(*target_dataset.features[pyridine_index, :])
    pyrigine_smiles = target_smiles[pyridine_index]
    print('pyrigine_dataset', len(pyrigine_dataset), len(pyrigine_smiles))
    atoms = pyrigine_dataset.features[:, 0]
    num_atoms = [len(a) for a in atoms]

    def clip_original_size(saliency, num_atoms):
        """`saliency` array is 0 padded, this method align to have original molecule's length """
        assert len(saliency) == len(num_atoms)
        saliency_list = []
        for i in range(len(saliency)):
            saliency_list.append(saliency[i, :num_atoms[i]])
        return saliency_list

    def preprocess_fun(*inputs):
        # Embed atoms before the saliency calculator sees them, so gradients
        # are taken w.r.t. the embedding.
        atom, adj, t = inputs
        # HACKING for now...
        atom_embed = classifier.predictor.graph_conv.embed(atom)
        return atom_embed, adj, t

    def eval_fun(*inputs):
        # Scalar objective for gradient-based saliency: sum of predictions.
        atom_embed, adj, t = inputs
        prob = classifier.predictor(atom_embed, adj)
        out = F.sum(prob)
        return out

    calculator_method = args.calculator
    print('calculator method', calculator_method)
    if calculator_method == 'gradient':
        # option1: Gradient
        calculator = GradientCalculator(
            classifier, eval_fun=eval_fun,
            # target_key='embed', eval_key='out',
            target_key=0,
            # multiply_target=True  # this will calculate grad * input
        )
    elif calculator_method == 'integrated_gradients':
        # option2: IntegratedGradients
        calculator = IntegratedGradientsCalculator(
            classifier, eval_fun=eval_fun,
            # target_key='embed', eval_key='out',
            target_key=0, steps=10
        )
    elif calculator_method == 'occlusion':
        # option3: Occlusion
        def eval_fun_occlusion(*inputs):
            atom_embed, adj, t = inputs
            prob = classifier.predictor(atom_embed, adj)
            # Do not take sum, instead return batch-wise score
            out = F.sigmoid(prob)
            return out
        calculator = OcclusionCalculator(
            classifier, eval_fun=eval_fun_occlusion,
            # target_key='embed', eval_key='out',
            target_key=0, slide_axis=1
        )
    else:
        raise ValueError("[ERROR] Unexpected value calculator_method={}".format(calculator_method))

    # M: number of sampled forward passes for Smooth/BayesGrad;
    # rates: thresholds used for the precision-recall sweep.
    M = 100
    num = 20
    rates = np.linspace(0.1, 1, num=num)
    print('M', M)

    # --- VanillaGrad ---
    saliency_arrays = calculator.compute_vanilla(
        pyrigine_dataset, converter=concat_mols, preprocess_fn=preprocess_fun)
    saliency = calculator.transform(
        saliency_arrays, ch_axis=3, method='square')
    # saliency_arrays -> M, batch_size, max_atom, ch_dim
    # print('saliency_arrays', saliency_arrays.shape)
    # saliency -> batch_size, max_atom
    # print('saliency', saliency.shape)
    saliency_vanilla = clip_original_size(saliency, num_atoms)
    # recall & precision
    vanilla_recall, vanilla_precision = calc_recall_precision(saliency_vanilla, rates, val_pyridine_pos)
    print('vanilla_recall', vanilla_recall)
    print('vanilla_precision', vanilla_precision)

    # --- SmoothGrad ---
    saliency_arrays = calculator.compute_smooth(
        pyrigine_dataset, converter=concat_mols, preprocess_fn=preprocess_fun,
        M=M, mode='absolute', scale=0.15  # previous implementation
        # mode='relative', scale=0.05
    )
    saliency = calculator.transform(
        saliency_arrays, ch_axis=3, method='square')
    saliency_smooth = clip_original_size(saliency, num_atoms)
    smooth_recall, smooth_precision = calc_recall_precision(saliency_smooth, rates, val_pyridine_pos)
    print('smooth_recall', smooth_recall)
    print('smooth_precision', smooth_precision)

    # --- BayesGrad ---
    # bayes grad is calculated by compute_vanilla with train=True
    saliency_arrays = calculator.compute_vanilla(
        pyrigine_dataset, converter=concat_mols, preprocess_fn=preprocess_fun,
        M=M, train=True)
    saliency = calculator.transform(
        saliency_arrays, ch_axis=3, method='square', lam=0)
    saliency_bayes = clip_original_size(saliency, num_atoms)
    bayes_recall, bayes_precision = calc_recall_precision(saliency_bayes, rates, val_pyridine_pos)
    print('bayes_recall', bayes_recall)
    print('bayes_precision', bayes_precision)

    # Plot the three precision-recall curves on one figure.
    plt.figure(figsize=(7, 5), dpi=200)
    plt.plot(vanilla_recall, vanilla_precision, 'k-', color='blue', label='VanillaGrad')
    plt.plot(smooth_recall, smooth_precision, 'k-', color='green', label='SmoothGrad')
    plt.plot(bayes_recall, bayes_precision, 'k-', color='red', label='BayesGrad(Ours)')
    plt.axhline(y=vanilla_precision[-1], color='gray', linestyle='--')
    plt.legend()
    plt.xlabel("recall")
    plt.ylabel("precision")
    if save_path:
        print('saved to ', save_path)
        plt.savefig(save_path)
        # plt.savefig('artificial_pr.eps')
    else:
        plt.show()
os.environ["CUDA_VISIBLE_DEVICES"] = args.visible_gpus init_logger(args.log_file) device = "cpu" if args.visible_gpus == '-1' else "cuda" device_id = 0 if device == "cuda" else -1 if (args.task == 'abs'): if (args.mode == 'train'): train_abs(args, device_id) elif (args.mode == 'validate'): validate_abs(args, device_id) elif (args.mode == 'score'): tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True, cache_dir=args.temp_dir) symbols = {'BOS': tokenizer.vocab['[unused0]'], 'EOS': tokenizer.vocab['[unused1]'], 'PAD': tokenizer.vocab['[PAD]'], 'EOQ': tokenizer.vocab['[unused2]']} predictor = build_predictor(args, tokenizer, symbols, None, logger) # step = 30000 gold_path = args.result_path + '.gold' can_path = args.result_path + '.candidate' rouges = predictor._report_rouge(gold_path, can_path) logger.info('Rouges at step %d \n%s' % (args.result_path, rouge_results_to_str(rouges))) # if self.tensorboard_writer is not None: # self.tensorboard_writer.add_scalar('test/rouge1-F', rouges['rouge_1_f_score'], step) # self.tensorboard_writer.add_scalar('test/rouge2-F', rouges['rouge_2_f_score'], step) # self.tensorboard_writer.add_scalar('test/rougeL-F', rouges['rouge_l_f_score'], step) elif (args.mode == 'lead'): baseline(args, cal_lead=True) elif (args.mode == 'oracle'): baseline(args, cal_oracle=True) if (args.mode == 'test'):
def test_model(self, extractor, corpus_type='test', block_trigram=True, quick_test=False):
    """Run the SentExt extractor plus GuidAbs decoder over `corpus_type`; returns avg F1.

    Args:
        extractor: sentence-extraction model object (its .name tags the output files).
        corpus_type: dataset split to evaluate.
        block_trigram: forwarded into args for trigram-blocking during decoding.
        quick_test: forwarded to test_ext_abs to shorten the run.
    """
    logger.info('Test SentExt model (%s) and GuidAbs model (%s) ...' % (extractor.name, self.model_file))
    testname = '%s_guidabs_%s' % (extractor.name,
                                  'blocktrigram' if block_trigram else 'noblocktrigram')
    # build args
    args = self._build_abs_args()
    args.mode = 'test'
    args.bert_data_path = path.join(self.data_path, 'cnndm')
    args.model_path = self.result_path
    args.log_file = path.join(self.result_path, 'test_varextabs.%s.log' % testname)
    args.result_path = path.join(self.result_path, 'cnndm_' + testname)
    args.block_trigram = block_trigram
    init_logger(args.log_file)
    # load abs model (checkpoint step is encoded in the file name)
    abs_model_file = self.model_file
    logger.info('Loading abs model %s' % abs_model_file)
    step_abs = int(abs_model_file.split('.')[-2].split('_')[-1])
    checkpoint = torch.load(abs_model_file, map_location=lambda storage, loc: storage)
    model_abs = model_bld.AbsSummarizer(args, args.device, checkpoint)
    model_abs.eval()
    # init model testers
    tokenizer = BertTokenizer.from_pretrained(path.join(
        args.bert_model_path, model_abs.bert.model_name),
        do_lower_case=True, cache_dir=args.temp_dir)
    # Decoder control-symbol ids (BERT's unused vocab slots)
    symbols = {
        'BOS': tokenizer.vocab['[unused0]'],
        'EOS': tokenizer.vocab['[unused1]'],
        'PAD': tokenizer.vocab['[PAD]'],
        'EOQ': tokenizer.vocab['[unused2]']
    }
    predictor = pred_abs.build_predictor(args, tokenizer, symbols, model_abs, logger)
    # keep_order=True so generated summaries line up with the input corpus
    test_iter = data_ldr.Dataloader(args,
                                    data_ldr.load_dataset(args, corpus_type, shuffle=False),
                                    args.test_batch_size, args.device,
                                    shuffle=False, is_test=True, keep_order=True)
    logger.info('Generating Ext/GuidAbs results %s ...' % args.result_path)
    avg_f1 = test_ext_abs(logger, args, extractor, predictor, 0, step_abs, test_iter,
                          quick_test=quick_test)
    return avg_f1
def train(gpu, method, epoch, batchsize, n_unit, conv_layers, dataset, smiles, M, n_split, split_idx, order):
    """Train one cross-validation fold of a graph-conv regressor and save its outputs.

    Splits `dataset` into train/test by `split_idx` of `n_split` contiguous
    chunks of the permutation `order`, standardizes labels on the train part
    only, trains with Chainer, then saves model, predictions, answers, SMILES
    and saliency artifacts under a fold-specific directory.
    """
    n = len(dataset)
    assert len(order) == n
    # Contiguous chunk [left_idx, right_idx) of `order` is the test fold;
    # the last fold absorbs the remainder.
    left_idx = (n // n_split) * split_idx
    is_right_most_split = (n_split == split_idx + 1)
    if is_right_most_split:
        test_order = order[left_idx:]
        train_order = order[:left_idx]
    else:
        right_idx = (n // n_split) * (split_idx + 1)
        test_order = order[left_idx:right_idx]
        train_order = np.concatenate([order[:left_idx], order[right_idx:]])
    # new_order places all train indices first, then test indices.
    new_order = np.concatenate([train_order, test_order])
    n_train = len(train_order)

    # Standard Scaler for labels
    ss = StandardScaler()
    labels = dataset.get_datasets()[-1]
    train_label = labels[new_order[:n_train]]
    ss = ss.fit(train_label)  # fit only by train
    # Transform ALL labels with the train-fitted scaler, then rebuild dataset.
    labels = ss.transform(dataset.get_datasets()[-1])
    dataset = NumpyTupleDataset(*(dataset.get_datasets()[:-1] + (labels, )))
    dataset_train = SubDataset(dataset, 0, n_train, new_order)
    dataset_test = SubDataset(dataset, n_train, n, new_order)

    # Network: single-output regressor.
    model = predictor.build_predictor(
        method, n_unit, conv_layers, 1, dropout_ratio=0.25, n_layers=1)
    train_iter = I.SerialIterator(dataset_train, batchsize)
    val_iter = I.SerialIterator(dataset_test, batchsize, repeat=False, shuffle=False)

    def scaled_abs_error(x0, x1):
        # Mean absolute error reported in the ORIGINAL label scale
        # (inverse of the StandardScaler applied above).
        if isinstance(x0, Variable):
            x0 = cuda.to_cpu(x0.data)
        if isinstance(x1, Variable):
            x1 = cuda.to_cpu(x1.data)
        scaled_x0 = ss.inverse_transform(cuda.to_cpu(x0))
        scaled_x1 = ss.inverse_transform(cuda.to_cpu(x1))
        diff = scaled_x0 - scaled_x1
        return np.mean(np.absolute(diff), axis=0)[0]

    regressor = Regressor(model, lossfun=F.mean_squared_error,
                          metrics_fun={'abs_error': scaled_abs_error}, device=gpu)
    optimizer = O.Adam(alpha=0.0005)
    optimizer.setup(regressor)
    updater = training.StandardUpdater(train_iter, optimizer, device=gpu,
                                       converter=concat_mols)
    # Fold-specific output directory: <base>/<split_idx>-<n_split>
    dir_path = get_dir_path(batchsize, n_unit, conv_layers, M, method)
    dir_path = os.path.join(dir_path, str(split_idx) + "-" + str(n_split))
    os.makedirs(dir_path, exist_ok=True)
    print('creating ', dir_path)
    np.save(os.path.join(dir_path, "test_idx"), np.array(test_order))

    trainer = training.Trainer(updater, (epoch, 'epoch'), out=dir_path)
    trainer.extend(
        E.Evaluator(val_iter, regressor, device=gpu, converter=concat_mols))
    trainer.extend(E.LogReport())
    trainer.extend(
        E.PrintReport([
            'epoch', 'main/loss', 'main/abs_error', 'validation/main/loss',
            'validation/main/abs_error', 'elapsed_time'
        ]))
    trainer.extend(E.ProgressBar())
    trainer.run()

    # --- Plot regression evaluation result ---
    # Evaluate the whole test fold in one batch and plot predicted vs. true.
    dataset_test = SubDataset(dataset, n_train, n, new_order)
    batch_all = concat_mols(dataset_test, device=gpu)
    serializers.save_npz(os.path.join(dir_path, "model.npz"), model)
    result = model(batch_all[0], batch_all[1])
    result = ss.inverse_transform(cuda.to_cpu(result.data))
    answer = ss.inverse_transform(cuda.to_cpu(batch_all[2]))
    plot_result(result, answer, save_filepath=os.path.join(dir_path, "result.png"))
    # --- Plot regression evaluation result end ---
    np.save(os.path.join(dir_path, "output.npy"), result)
    np.save(os.path.join(dir_path, "answer.npy"), answer)
    smiles_part = np.array(smiles)[test_order]
    np.save(os.path.join(dir_path, "smiles.npy"), smiles_part)

    # calculate saliency and save it.
    save_result(dataset, model, dir_path, M)
def main():
    """Train a Tox21 multitask graph-conv classifier from command-line options.

    Parses CLI args, optionally subsamples the training set, builds the chosen
    graph-conv predictor, trains with Chainer (with ROC-AUC evaluators), and
    saves args.json plus the trained predictor weights to the output directory.
    """
    # Supported preprocessing/network list
    method_list = ['nfp', 'ggnn', 'schnet', 'weavenet', 'nfpdrop', 'ggnndrop']
    label_names = D.get_tox21_label_names() + ['pyridine']
    iterator_type = ['serial', 'balanced']
    parser = argparse.ArgumentParser(
        description='Multitask Learning with Tox21.')
    parser.add_argument('--method', '-m', type=str, choices=method_list,
                        default='nfp', help='graph convolution model to use '
                        'as a predictor.')
    parser.add_argument('--label', '-l', type=str, choices=label_names,
                        default='', help='target label for logistic '
                        'regression. Use all labels if this option '
                        'is not specified.')
    parser.add_argument('--iterator-type', type=str, choices=iterator_type,
                        default='serial', help='iterator type. If `balanced` '
                        'is specified, data is sampled to take same number of'
                        'positive/negative labels during training.')
    parser.add_argument('--conv-layers', '-c', type=int, default=4,
                        help='number of convolution layers')
    parser.add_argument('--n-layers', type=int, default=1,
                        help='number of mlp layers')
    parser.add_argument('--batchsize', '-b', type=int, default=32,
                        help='batch size')
    parser.add_argument('--gpu', '-g', type=int, default=-1,
                        help='GPU ID to use. Negative value indicates '
                        'not to use GPU and to run the code in CPU.')
    parser.add_argument('--out', '-o', type=str, default='result',
                        help='path to output directory')
    parser.add_argument('--epoch', '-e', type=int, default=10,
                        help='number of epochs')
    parser.add_argument('--unit-num', '-u', type=int, default=16,
                        help='number of units in one layer of the model')
    parser.add_argument('--resume', '-r', type=str, default='',
                        help='path to a trainer snapshot')
    parser.add_argument('--frequency', '-f', type=int, default=-1,
                        help='Frequency of taking a snapshot')
    parser.add_argument('--dropout-ratio', '-d', type=float, default=0.25,
                        help='dropout_ratio')
    parser.add_argument('--seed', type=int, default=0, help='random seed')
    parser.add_argument('--num-train', type=int, default=-1,
                        help='number of training data to be used, '
                        'negative value indicates use all train data')
    args = parser.parse_args()
    method = args.method
    # Single-label run when --label is given; otherwise all Tox21 labels.
    if args.label:
        labels = args.label
        class_num = len(labels) if isinstance(labels, list) else 1
    else:
        labels = None
        class_num = len(label_names)

    # Dataset preparation
    train, val, test, train_smiles, val_smiles, test_smiles = data.load_dataset(
        method, labels)
    num_train = args.num_train  # 100
    if num_train > 0:
        # reduce size of train data (random subset of num_train examples)
        seed = args.seed  # 0
        np.random.seed(seed)
        train_selected_label = np.random.permutation(np.arange(
            len(train)))[:num_train]
        print('num_train', num_train, len(train_selected_label),
              train_selected_label)
        train = NumpyTupleDataset(*train.features[train_selected_label, :])

    # Network
    predictor_ = predictor.build_predictor(method, args.unit_num,
                                           args.conv_layers, class_num,
                                           args.dropout_ratio, args.n_layers)
    iterator_type = args.iterator_type
    if iterator_type == 'serial':
        train_iter = I.SerialIterator(train, args.batchsize)
    elif iterator_type == 'balanced':
        # Balanced sampling only makes sense for a single binary label.
        if class_num > 1:
            raise ValueError('BalancedSerialIterator can be used with only one'
                             'label classification, please specify label to'
                             'be predicted by --label option.')
        train_iter = BalancedSerialIterator(train, args.batchsize,
                                            train.features[:, -1],
                                            ignore_labels=-1)
        train_iter.show_label_stats()
    else:
        raise ValueError('Invalid iterator type {}'.format(iterator_type))
    val_iter = I.SerialIterator(val, args.batchsize, repeat=False,
                                shuffle=False)
    classifier = L.Classifier(predictor_, lossfun=F.sigmoid_cross_entropy,
                              accfun=F.binary_accuracy)
    if args.gpu >= 0:
        chainer.cuda.get_device_from_id(args.gpu).use()
        classifier.to_gpu()
    optimizer = O.Adam()
    optimizer.setup(classifier)
    updater = training.StandardUpdater(train_iter, optimizer, device=args.gpu,
                                       converter=concat_mols)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)
    trainer.extend(
        E.Evaluator(val_iter, classifier, device=args.gpu,
                    converter=concat_mols))
    trainer.extend(E.LogReport())

    # --- ROCAUC Evaluator ---
    # Separate non-repeating iterator over train data for ROC-AUC reporting.
    train_eval_iter = I.SerialIterator(train, args.batchsize, repeat=False,
                                       shuffle=False)
    trainer.extend(
        ROCAUCEvaluator(train_eval_iter, classifier, eval_func=predictor_,
                        device=args.gpu, converter=concat_mols, name='train'))
    trainer.extend(
        ROCAUCEvaluator(val_iter, classifier, eval_func=predictor_,
                        device=args.gpu, converter=concat_mols, name='val'))
    trainer.extend(
        E.PrintReport([
            'epoch', 'main/loss', 'main/accuracy', 'train/main/roc_auc',
            'validation/main/loss', 'validation/main/accuracy',
            'val/main/roc_auc', 'elapsed_time'
        ]))
    trainer.extend(E.ProgressBar(update_interval=10))
    if args.resume:
        chainer.serializers.load_npz(args.resume, trainer)
    trainer.run()

    # Persist the run configuration and the trained predictor weights.
    with open(os.path.join(args.out, 'args.json'), 'w') as f:
        json.dump(vars(args), f, indent=4)
    chainer.serializers.save_npz(os.path.join(args.out, 'predictor.npz'),
                                 predictor_)
def inference_for_demo(self, loader, run_gen=False, portion='all', level=3):
    """Run the detector on demo data at one or more training stages.

    Builds a test iterator from ``loader``, then — depending on ``level`` —
    restores saved checkpoints into ``self.model`` and evaluates them with
    the shared trainer:

      * ``level == 1``: stage-1 detection model, then the stage-3 final model.
      * ``level >= 2``: stage-2 adversarial model, with and without generated
        responses; the ids it misclassifies are kept as ``wrongs_before``.
      * ``level >= 3``: additionally the stage-3 model; ids wrong at stage 2
        but corrected at stage 3 are written to ``id_fixed.txt``, and all
        stage-2 failures to ``id_attack_success.txt`` under
        ``self.args.savepath``.

    Args:
        loader: raw demo examples consumed by ``TreeDataset``.
        run_gen: unused here; kept for interface compatibility.
        portion: unused here; kept for interface compatibility.
        level: which stages to evaluate (see above).

    NOTE(review): the ``level >= 3`` branch loads ``best_model.pt`` while its
    log message says "final model (stage-3)", whereas the ``level == 1``
    branch loads ``best_final_model.pt`` for stage 3 — confirm which file is
    actually intended for stage 3 here.
    """
    test = TreeDataset(self.args, loader, self.dataname, self.device, self.tokenizer)
    # Use a single batch when the whole set fits, otherwise the configured size.
    data_iter = Iterator(test, train=False, device=self.device,
                         batch_size=len(test) if len(test) < self.bs else self.bs,
                         sort_key=lambda x: len(x.src), sort_within_batch=False)

    # Trainer in evaluation mode (loss built with train=False).
    train_loss = abs_loss(self.args.label_num, self.maxepoch, self.device, train=False)
    trainer = build_trainer(self.args, self.model, self.optim, train_loss)

    if level == 1:
        logger.info('Test on detection model (stage-1)')
        best_model = self._load_checkpoint('best_model.pt')
        trainer.testing(data_iter, tokenizer=self.tokenizer, gen_flag=False,
                        info="{}".format(best_model.split("/")[-1]), write_type="w")

        logger.info('Test on final model (stage-3)')
        best_model = self._load_checkpoint('best_final_model.pt')
        trainer.testing(data_iter, tokenizer=self.tokenizer, gen_flag=False,
                        info="{}".format(best_model.split("/")[-1]), write_type="a")

    if level >= 2:
        logger.info('Test on adv model (stage-2)')
        best_model = self._load_checkpoint('best_adv_model.pt')
        # Test without generated response.
        trainer.testing(data_iter, tokenizer=self.tokenizer, gen_flag=False,
                        info="{}".format(best_model.split("/")[-1]), write_type="w")
        # Test with generated response; keep the ids this model gets wrong.
        _, wrongs_before = trainer.testing(
            data_iter, tokenizer=self.tokenizer, gen_flag=True,
            info="{} With Generated Response".format(best_model.split("/")[-1]),
            write_type="a", output_wrong_pred=True)
        predictor = build_predictor(self.args, self.model, self.tokenizer, self.symbols, logger)
        predictor.translate(data_iter, 'best', have_gold=False,
                            info="{} With Generated Response".format(best_model.split("/")[-1]))

    if level >= 3:
        logger.info('Test on final model (stage-3)')
        best_model = self._load_checkpoint('best_model.pt')
        # Test without generated response.
        trainer.testing(data_iter, tokenizer=self.tokenizer, gen_flag=False,
                        info="{}".format(best_model.split("/")[-1]), write_type="a")
        # Test with generated response; keep the ids this model gets wrong.
        _, wrongs_after = trainer.testing(
            data_iter, tokenizer=self.tokenizer, gen_flag=True,
            info="{} With Generated Response".format(best_model.split("/")[-1]),
            write_type="a", output_wrong_pred=True)
        predictor = build_predictor(self.args, self.model, self.tokenizer, self.symbols, logger)
        predictor.translate(data_iter, 'best', have_gold=False,
                            info="{} With Generated Response".format(best_model.split("/")[-1]))

        # Ids that were attacked successfully at stage 2 but fixed by stage 3.
        fixed = [str(i) for i in wrongs_before if i not in wrongs_after]
        wrongs_before = [str(i) for i in wrongs_before]
        with open(os.path.join(self.args.savepath, "id_fixed.txt"), "w") as f:
            f.write("\n".join(fixed))
        with open(os.path.join(self.args.savepath, "id_attack_success.txt"), "w") as f:
            f.write("\n".join(wrongs_before))

def _load_checkpoint(self, ckpt_name):
    """Load checkpoint ``ckpt_name`` from savepath into ``self.model``.

    Falls back to a non-strict load — logging the key mismatch — when the
    saved state dict does not line up exactly with the current model.
    Returns the checkpoint's full path (used for report labels).
    """
    path = os.path.join(self.args.savepath, ckpt_name)
    # Map storages to CPU so checkpoints load regardless of the saving device.
    ckpt = torch.load(path, map_location=lambda storage, loc: storage)['model']
    try:
        self.model.load_state_dict(ckpt)
    except Exception:  # was a bare `except:`; keep the best-effort fallback but be explicit
        mismatch = self.model.load_state_dict(ckpt, strict=False)
        print(mismatch)
        logger.info('[Warning] The keys in state dict do not strictly match')
    return path