Example #1
0
def test_abs(args, device_id, pt, step):
    """Decode the test split with an abstractive summarizer checkpoint.

    The checkpoint path is `pt` when given, else args.test_from; model
    flags stored in the checkpoint override the corresponding args.
    """
    device = "cuda" if args.visible_gpus != '-1' else "cpu"
    test_from = pt if pt != '' else args.test_from
    logger.info('Loading checkpoint from %s' % test_from)

    checkpoint = torch.load(test_from,
                            map_location=lambda storage, loc: storage)
    saved_opt = vars(checkpoint['opt'])
    for flag, value in saved_opt.items():
        if flag in model_flags:
            setattr(args, flag, value)
    print(args)
    symbols, tokenizer = get_symbol_and_tokenizer(args.encoder, args.temp_dir)
    model = AbsSummarizer(args, device, checkpoint, symbols=symbols)
    model.eval()

    dataset = load_dataset(args, 'test', shuffle=False)
    test_iter = data_loader.Dataloader(args, dataset, args.test_batch_size,
                                       device, shuffle=False, is_test=True,
                                       tokenizer=tokenizer)

    predictor = build_predictor(args, tokenizer, symbols, model, logger)
    predictor.translate(test_iter, step)
Example #2
0
def test_abs(args, device_id, pt, step):
    """Decode the test split with a BART or multilingual-BERT abstractive model.

    Loads the checkpoint named by `pt` (or args.test_from), overrides args
    with the model flags saved in the checkpoint, then runs
    predictor.translate at checkpoint `step`.
    """
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    # Prefer the explicitly supplied checkpoint path over args.test_from.
    if (pt != ''):
        test_from = pt
    else:
        test_from = args.test_from
    logger.info('Loading checkpoint from %s' % test_from)

    checkpoint = torch.load(test_from, map_location=lambda storage, loc: storage)
    # Restore the architecture flags the checkpoint was trained with.
    opt = vars(checkpoint['opt'])
    for k in opt.keys():
        if (k in model_flags):
            setattr(args, k, opt[k])
    print(args)

    model = AbsSummarizer(args, device, checkpoint)
    model.eval()

    test_iter = data_loader.Dataloader(args, load_dataset(args, 'test', shuffle=False),
                                       args.test_batch_size, device,
                                       shuffle=False, is_test=True)
    # For Chinese tokenization: register the [unusedN] markers as additional
    # special tokens so the tokenizer does not split the word 'unused' apart.
    add_token_list = ['[unused1]', '[unused2]', '[unused3]', '[unused4]', '[unused5]']
    if args.bart:
        tokenizer = AutoTokenizer.from_pretrained('bart-base', do_lower_case=True, cache_dir=args.temp_dir, local_files_only=False)
        # BART has no [unusedN] slots; reuse fairseq's madeupword entries.
        symbols = {'BOS': tokenizer.encoder['madeupword0000'], 'EOS': tokenizer.encoder['madeupword0001'],
                   'PAD': 0, 'EOQ': tokenizer.encoder['madeupword0002']}
    else:
        tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased', do_lower_case=True,
                                                  cache_dir=args.temp_dir, local_files_only=False, additional_special_tokens=add_token_list)
        symbols = {'BOS': tokenizer.vocab['[unused1]'], 'EOS': tokenizer.vocab['[unused2]'],
                   'PAD': tokenizer.vocab['[PAD]'], 'EOQ': tokenizer.vocab['[unused3]']}
    predictor = build_predictor(args, tokenizer, symbols, model, logger)
    predictor.translate(test_iter, step)
def test_text_abs(args):
    """Summarize raw text input (args.text_src/args.text_tgt) with a trained
    abstractive model loaded from args.test_from."""
    logger.info('Loading checkpoint from %s' % args.test_from)
    device = "cpu" if args.visible_gpus == '-1' else "cuda"

    checkpoint = torch.load(args.test_from,
                            map_location=lambda storage, loc: storage)
    saved_opt = vars(checkpoint['opt'])
    for flag, value in saved_opt.items():
        if flag in model_flags:
            setattr(args, flag, value)
    print(args)
    logger.info('Loading args inside test_text_abs %s' % args)

    model = AbsSummarizer(args, device, checkpoint)
    model.eval()

    test_iter = data_loader.load_text(args, args.text_src, args.text_tgt, device)

    logger.info('test_iter is %s' % test_iter)

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                              do_lower_case=True,
                                              cache_dir=args.temp_dir)
    vocab = tokenizer.vocab
    # Decoder control tokens mapped onto BERT's unused vocab slots.
    symbols = {
        'BOS': vocab['[unused0]'],
        'EOS': vocab['[unused1]'],
        'PAD': vocab['[PAD]'],
        'EOQ': vocab['[unused2]']
    }

    logger.info('symbols is %s' % symbols)
    predictor = build_predictor(args, tokenizer, symbols, model, logger)
    predictor.translate(test_iter, -1)
Example #4
0
def test_text_abs(args, device_id, pt, step, tokenizer):
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    if (pt != ''):
        test_from = pt
    else:
        test_from = args.test_from
    logger.info('Loading checkpoint from %s' % test_from)

    checkpoint = torch.load(test_from, map_location=lambda storage, loc: storage)
    opt = vars(checkpoint['opt'])
    for k in opt.keys():
        if (k in model_flags):
            setattr(args, k, opt[k])
    print(args)

    model = AbsSummarizer(args, device, checkpoint)
    model.eval()

    test_iter = data_loader.Dataloader(args, load_dataset(args, 'test', shuffle=False),
                                       args.test_batch_size, device,
                                       shuffle=False, is_test=True)
    symbols = {'BOS': tokenizer.convert_tokens_to_ids('<s>'), 'EOS': tokenizer.convert_tokens_to_ids('</s>'),
               'PAD': tokenizer.convert_tokens_to_ids('[PAD]')}
    predictor = build_predictor(args, tokenizer, symbols, model, logger)
    predictor.translate(test_iter, step)
Example #5
0
    def val_abs(self, args, iter_fct, step):
        """Run validation-time abstractive decoding; returns the result of
        predictor.translate with return_entities=True."""
        self.model.eval()

        vocab = self.tokenizer.vocab
        # Decoder control tokens mapped onto BERT's unused vocab slots.
        symbols = {'BOS': vocab['[unused0]'], 'EOS': vocab['[unused1]'],
                   'PAD': vocab['[PAD]'], 'EOQ': vocab['[unused2]']}
        predictor = build_predictor(args, self.tokenizer, symbols, self.model, logger)
        return predictor.translate(iter_fct, step, return_entities=True)
Example #6
0
def test_text_abs(args, device_id, pt, step):
    """Decode the test split with a bert-base-uncased abstractive summarizer.

    `device_id` is accepted for interface parity but not read in this body;
    device selection follows args.visible_gpus.
    """
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    # Prefer the explicitly supplied checkpoint path over args.test_from.
    if (pt != ''):
        test_from = pt
    else:
        test_from = args.test_from
    logger.info('Loading checkpoint from %s' % test_from)

    checkpoint = torch.load(test_from, map_location=lambda storage, loc: storage)
    # Restore the architecture flags saved with the checkpoint.
    opt = vars(checkpoint['opt'])
    for k in opt.keys():
        if (k in model_flags):
            setattr(args, k, opt[k])
    print(args)

    model = AbsSummarizer(args, device, checkpoint)
    model.eval()

    test_iter = data_loader.Dataloader(args, load_dataset(args, 'test', shuffle=False),
                                       args.test_batch_size, device,
                                       shuffle=False, is_test=True)
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True, cache_dir=args.temp_dir)
    # Decoder control tokens mapped onto BERT's unused vocab slots.
    symbols = {'BOS': tokenizer.vocab['[unused0]'], 'EOS': tokenizer.vocab['[unused1]'],
               'PAD': tokenizer.vocab['[PAD]'], 'EOQ': tokenizer.vocab['[unused2]']}
    predictor = build_predictor(args, tokenizer, symbols, model, logger)
    predictor.translate(test_iter, step)
def test_text(args, device_id, pt, step):
    """Decode the test split; the summarizer is built over the vocab of the
    BERT tokenizer loaded from args.bert_dir."""
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    test_from = pt if pt != '' else args.test_from
    logger.info('Loading checkpoint from %s' % test_from)

    checkpoint = torch.load(test_from, map_location=lambda storage, loc: storage)
    saved_opt = vars(checkpoint['opt'])
    for flag in saved_opt:
        if flag in model_flags:
            setattr(args, flag, saved_opt[flag])
    print(args)

    tokenizer = BertTokenizer.from_pretrained(args.bert_dir)

    model = AbsSummarizer(args, device, tokenizer.vocab, checkpoint)
    model.eval()

    dataset = load_dataset(args, 'test', shuffle=False)
    test_iter = data_loader.Dataloader(args, dataset,
                                       args.test_batch_size, args.test_batch_ex_size,
                                       device, shuffle=False, is_test=True)
    predictor = build_predictor(args, tokenizer, model, logger)
    predictor.translate(test_iter, step)
Example #8
0
def test_abs(args, device_id, pt, step):
    """ Implements testing process (meta / non-meta)
    Arguments:
        device_id (int) : the GPU id to be used
        pt() : checkpoint model
        step (int) : checkpoint step
    Process:
        - load checkpoint
        - prepare dataloader class
        - prepare model class
        - prepare predictor
        - predictor.translate()
    """
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    logger.info('Device ID %d', device_id)
    logger.info('Device %s', device)

    # Load checkpoint; prefer the explicitly supplied path `pt`.
    if (pt != ''):
        test_from = pt
    else:
        test_from = args.test_from
    logger.info('Loading checkpoint from %s' % test_from)
    checkpoint = torch.load(test_from,
                            map_location=lambda storage, loc: storage)
    # Restore the architecture flags saved with the checkpoint.
    opt = vars(checkpoint['opt'])
    for k in opt.keys():
        if (k in model_flags):
            setattr(args, k, opt[k])

    # Prepare dataloader
    test_iter = data_loader.Dataloader(args,
                                       load_dataset(args,
                                                    'test',
                                                    shuffle=False),
                                       args.test_batch_size,
                                       device,
                                       shuffle=False,
                                       is_test=True)
    # Prepare model: meta-learning variant vs. the plain summarizer.
    if (args.meta_mode):
        model = MTLAbsSummarizer(args, device, checkpoint)
    else:
        model = AbsSummarizer(args, device, checkpoint)
    model.eval()

    # Prepare predictor
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                              do_lower_case=True,
                                              cache_dir=args.temp_dir)
    # Decoder control tokens mapped onto BERT's unused vocab slots.
    symbols = {
        'BOS': tokenizer.vocab['[unused0]'],
        'EOS': tokenizer.vocab['[unused1]'],
        'PAD': tokenizer.vocab['[PAD]'],
        'EOQ': tokenizer.vocab['[unused2]']
    }

    predictor = build_predictor(args, tokenizer, symbols, model, logger)
    predictor.translate(test_iter, step)  # long time
Example #9
0
    def test_model(self, corpus_type, topn=0):
        """Evaluate the top-n GuidAbs checkpoint on `corpus_type`.

        Skips (returning None) when a touch-file shows the run already
        finished; otherwise returns the average F1 from ``test_abs`` and
        creates the touch-file.

        Args:
            corpus_type: dataset split name passed to the data loader.
            topn: rank of the checkpoint to test (0 = best).
        """
        model_file = _top_model(self.model_path, n=topn)
        logger.info('Test GuidAbs model %s' % model_file)
        fn_touch = path.join(
            self.model_path,
            'finished_%s.test_guidabs_model%s' % (corpus_type, topn))
        if path.exists(fn_touch):
            # A previous run already completed this evaluation.
            return
        args = self._build_abs_args()
        args.mode = 'test'
        args.bert_data_path = path.join(self.data_path, 'cnndm')
        args.model_path = self.model_path
        args.log_file = path.join(
            self.model_path,
            'test_abs_bert_cnndm_%s_top%s.log' % (corpus_type, topn))
        args.result_path = path.join(self.model_path,
                                     'cnndm_%s_top%s' % (corpus_type, topn))
        init_logger(args.log_file)
        # The checkpoint step is encoded in the filename (..._<step>.<ext>).
        # Parse it once; the original computed it twice into an unused `step`
        # plus `step_abs`.
        step_abs = int(model_file.split('.')[-2].split('_')[-1])
        checkpoint = torch.load(model_file,
                                map_location=lambda storage, loc: storage)
        model_abs = model_bld.AbsSummarizer(args, args.device, checkpoint)
        model_abs.eval()
        # init model testers
        tokenizer = BertTokenizer.from_pretrained(path.join(
            args.bert_model_path, model_abs.bert.model_name),
                                                  do_lower_case=True,
                                                  cache_dir=args.temp_dir)
        # Decoder control tokens mapped onto BERT's unused vocab slots.
        symbols = {
            'BOS': tokenizer.vocab['[unused0]'],
            'EOS': tokenizer.vocab['[unused1]'],
            'PAD': tokenizer.vocab['[PAD]'],
            'EOQ': tokenizer.vocab['[unused2]']
        }

        predictor = pred_abs.build_predictor(args, tokenizer, symbols,
                                             model_abs, logger)
        test_iter = data_ldr.Dataloader(args,
                                        data_ldr.load_dataset(args,
                                                              corpus_type,
                                                              shuffle=False),
                                        args.test_batch_size,
                                        args.device,
                                        shuffle=False,
                                        is_test=True,
                                        keep_order=True)

        avg_f1 = test_abs(logger, args, predictor, step_abs, test_iter)
        # Mark this evaluation as done so re-runs can skip it.
        os.system('touch %s' % fn_touch)
        return avg_f1
Example #10
0
    def build_data(self, loader):
        """Generate polluted textgraph data with the adversarially trained
        generator checkpoint stored under args.savepath."""
        print("Build Pollued data by adv model from {}".format(self.args.savepath))
        dataset = TreeDataset(self.args, loader, self.dataname, self.device, self.tokenizer)
        # One batch containing the whole dataset, ordered by source length.
        data_iter = Iterator(dataset, train=False, device=self.device,
                             batch_size=len(dataset),
                             sort_key=lambda ex: len(ex.src),
                             sort_within_batch=False)

        ckpt_path = os.path.join(self.args.savepath, 'best_adv_model.pt')
        state = torch.load(ckpt_path,
                           map_location=lambda storage, loc: storage)
        self.model.load_state_dict(state['model'])
        predictor = build_predictor(self.args, self.model, self.tokenizer, self.symbols, logger)
        predictor.build(data_iter)
Example #11
0
def test_abs(args, device_id, pt, step):
    """Decode the test set with a PhoBERT (fastBPE + fairseq dict) summarizer.

    NOTE(review): BPE codes and the dictionary are read from hard-coded
    /content/... paths (a Google Colab layout) — confirm/parameterize before
    running elsewhere.
    """
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    # Prefer the explicitly supplied checkpoint path over args.test_from.
    if (pt != ''):
        test_from = pt
    else:
        test_from = args.test_from
    logger.info('Loading checkpoint from %s' % test_from)

    checkpoint = torch.load(test_from,
                            map_location=lambda storage, loc: storage)
    # Restore the architecture flags saved with the checkpoint.
    opt = vars(checkpoint['opt'])
    for k in opt.keys():
        if (k in model_flags):
            setattr(args, k, opt[k])
    print(args)

    model = AbsSummarizer(args, device, checkpoint)
    model.eval()

    test_iter = data_loader.Dataloader(args,
                                       load_dataset(args,
                                                    'test',
                                                    shuffle=False),
                                       args.test_batch_size,
                                       device,
                                       shuffle=False,
                                       is_test=True)
    # A private parser so only --bpe-codes is consumed; other CLI flags
    # already parsed into `args` are left untouched.
    parser = argparse.ArgumentParser()
    parser.add_argument('--bpe-codes',
                        default="/content/PhoBERT_base_transformers/bpe.codes",
                        required=False,
                        type=str,
                        help='path to fastBPE BPE')
    args1, unknown = parser.parse_known_args()
    bpe = fastBPE(args1)

    # Load the dictionary
    vocab = Dictionary()
    vocab.add_from_file("/content/PhoBERT_base_transformers/dict.txt")

    tokenizer = bpe
    symbols = {
        'BOS': vocab.indices['[unused0]'],
        'EOS': vocab.indices['[unused1]'],
        'PAD': vocab.indices['[PAD]'],
        'EOQ': vocab.indices['[unused2]']
    }

    # Note: the predictor is given the fairseq `vocab`, not the BPE tokenizer.
    predictor = build_predictor(args, vocab, symbols, model, logger)
    predictor.translate(test_iter, step)
Example #12
0
    def test_detector(self, loader, run_gen=False, portion='all'):
        """Evaluate the rumor detector on `loader`.

        Tests the best stage-1 model without generated responses and, when
        args.test_adv is set, the stage-2 adversarially trained model with
        generated responses, appending results to result_test.csv.
        """
        test = TreeDataset(self.args, loader, self.dataname, self.device, self.tokenizer)

        data_iter = Iterator(test, train=False, device=self.device,
                             batch_size=len(test) if len(test) < self.bs else self.bs,
                             sort_key=lambda x: len(x.src),
                             sort_within_batch=False)
        # Define trainer (train=False: evaluation-mode loss)
        train_loss = abs_loss(self.args.label_num, self.maxepoch, self.device, train=False)
        trainer = build_trainer(self.args, self.model, self.optim, train_loss)

        logger.info('Test on best model (stage-1)')
        best_model = os.path.join(self.args.savepath, 'best_model.pt')

        if os.path.exists(best_model):
            # Load the checkpoint once (the original re-read it from disk in
            # the except branch) and fall back to a non-strict load when the
            # state-dict keys drifted.
            state = torch.load(best_model,
                               map_location=lambda storage, loc: storage)['model']
            try:
                self.model.load_state_dict(state)
            except Exception:
                self.model.load_state_dict(state, strict=False)
                logger.info('[Warning] The keys in state dict do not strictly match')

            test_stat = trainer.testing(data_iter, tokenizer=self.tokenizer, gen_flag=False, info="Without Generated Response >>", write_type="w")
            test_stat.write_results(os.path.join(self.args.savepath, 'result_test.csv'), 'test-'+portion, self.args.label_num)

        if self.args.test_adv:

            logger.info('Test on adversarially-trained model (stage-2)')
            best_model = os.path.join(self.args.savepath, 'best_adv_model.pt')
            if os.path.exists(best_model):
                state = torch.load(best_model,
                                   map_location=lambda storage, loc: storage)['model']
                try:
                    self.model.load_state_dict(state)
                except Exception:
                    self.model.load_state_dict(state, strict=False)
                    logger.info('[Warning] The keys in state dict do not strictly match')

            # BUG FIX: the original passed `write_type=="a"` — an equality
            # comparison on an undefined name (NameError at runtime) used as a
            # positional argument — instead of the keyword argument below.
            test_stat = trainer.testing(data_iter, tokenizer=self.tokenizer, gen_flag=True, info="\nWith Generated Response from {} >>".format(best_model.split("/")[-1]), write_type="a")
            predictor = build_predictor(self.args, self.model, self.tokenizer, self.symbols, logger)
            predictor.translate(data_iter, 'best', have_gold=False)
 def __init__(self, abs_model_file):
     """Load a GuidAbs summarizer checkpoint and build its tokenizer/predictor.

     Args:
         abs_model_file: checkpoint path of the form ``..._<step>.<ext>``;
             the step number is parsed from the filename.
     """
     self.args = self._build_abs_args()
     # load model
     # NOTE(review): step_abs is computed but never used in this method.
     step_abs = int(abs_model_file.split('.')[-2].split('_')[-1])
     checkpoint = torch.load(abs_model_file, map_location=lambda storage, loc: storage)
     self.model_abs = model_bld.AbsSummarizer(self.args, self.args.device, checkpoint)
     self.model_abs.eval()
     # prepare tokenizer and predictor
     self.tokenizer = BertTokenizer.from_pretrained(path.join(self.args.bert_model_path, self.model_abs.bert.model_name), do_lower_case=True)
     # Decoder control tokens mapped onto BERT's unused vocab slots.
     self.symbols = {'BOS': self.tokenizer.vocab['[unused0]'], 'EOS': self.tokenizer.vocab['[unused1]'],
                'PAD': self.tokenizer.vocab['[PAD]'], 'EOQ': self.tokenizer.vocab['[unused2]']}
     self.predictor = pred_abs.build_predictor(self.args, self.tokenizer, self.symbols, self.model_abs, logger)
     # special tokens
     self.sep_token = '[SEP]'
     self.cls_token = '[CLS]'
     self.pad_token = '[PAD]'
     self.sep_vid = self.tokenizer.vocab[self.sep_token]
     self.cls_vid = self.tokenizer.vocab[self.cls_token]
     self.pad_vid = self.tokenizer.vocab[self.pad_token]
Example #14
0
def test_abs(args, device_id, pt, step):
    """Decode the test split with an ETRI koBERT abstractive summarizer."""
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    # args.test_from wins when set; otherwise fall back to `pt`.
    test_from = args.test_from if args.test_from else pt
    logger.info('Loading checkpoint from %s' % test_from)

    checkpoint = torch.load(test_from,
                            map_location=lambda storage, loc: storage)
    saved_opt = vars(checkpoint['opt'])
    for flag in saved_opt:
        if flag in model_flags:
            setattr(args, flag, saved_opt[flag])
    print(args)

    model = AbsSummarizer(args, device, checkpoint)
    model.eval()

    dataset = load_dataset(args, 'test', shuffle=False)
    test_iter = data_loader.Dataloader(args, dataset, args.test_batch_size,
                                       device, shuffle=False, is_test=True)

    tokenizer = BertTokenizer.from_pretrained(
        '../ETRI_koBERT/003_bert_eojeol_pytorch/vocab.txt',
        do_lower_case=False,
        cache_dir=args.temp_dir)
    if not args.share_emb:
        tokenizer = add_tokens(tokenizer)

    symbols = {'BOS': tokenizer.vocab['<S>'],
               'EOS': tokenizer.vocab['<T>'],
               'PAD': tokenizer.vocab['[PAD]']}

    predictor = build_predictor(args, tokenizer, symbols, model, logger)
    predictor.translate(test_iter, step)
def test_abs(args, device_id, pt, step):
    """Decode the test split with a Chinese RoBERTa-wwm-ext summarizer."""
    device = "cpu" if args.visible_gpus == "-1" else "cuda"
    test_from = pt if pt != "" else args.test_from
    logger.info("Loading checkpoint from %s" % test_from)

    checkpoint = torch.load(test_from,
                            map_location=lambda storage, loc: storage)
    saved_opt = vars(checkpoint["opt"])
    for flag, value in saved_opt.items():
        if flag in model_flags:
            setattr(args, flag, value)
    print(args)

    model = AbsSummarizer(args, device, checkpoint)
    model.eval()

    dataset = load_dataset(args, "test", shuffle=False)
    test_iter = data_loader.Dataloader(args, dataset, args.test_batch_size,
                                       device, shuffle=False, is_test=True)
    tokenizer = BertTokenizer.from_pretrained(
        "chinese_roberta_wwm_ext_pytorch",
        do_lower_case=True,
        cache_dir=args.temp_dir)
    vocab = tokenizer.vocab
    # Decoder control tokens mapped onto unused slots of the Chinese vocab.
    symbols = {"BOS": vocab["[unused1]"], "EOS": vocab["[unused2]"],
               "PAD": vocab["[PAD]"], "EOQ": vocab["[unused3]"]}
    predictor = build_predictor(args, tokenizer, symbols, model, logger)
    predictor.translate(test_iter, step)
Example #16
0
def test_abs(args, device_id, pt, step):
    """Decode the test split, reusing the tokenizer configured by BertData
    so decoding shares the exact vocabulary used to build the data."""
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    test_from = pt if pt != '' else args.test_from
    logger.info('Loading checkpoint from %s' % test_from)

    checkpoint = torch.load(test_from, map_location=lambda storage, loc: storage)
    saved_opt = vars(checkpoint['opt'])
    for flag, value in saved_opt.items():
        if flag in model_flags:
            setattr(args, flag, value)
    print(args)

    model = AbsSummarizer(args, device, checkpoint)
    model.eval()

    dataset = load_dataset(args, 'test', shuffle=False)
    test_iter = data_loader.Dataloader(args, dataset, args.test_batch_size,
                                       device, shuffle=False, is_test=True)

    tokenizer = BertData(args).tokenizer

    to_id = tokenizer.convert_tokens_to_ids
    symbols = {'BOS': to_id('[unused0]'), 'EOS': to_id('[unused1]'),
               'PAD': to_id('[PAD]'), 'EOQ': to_id('[unused2]')}
    predictor = build_predictor(args, tokenizer, symbols, model, logger)
    predictor.translate(test_iter, step)
Example #17
0
    def __init__(self, model_path='cache/abs_bert_model.pt'):
        """Load (downloading if needed) a BertSum checkpoint and build its predictor.

        Args:
            model_path: path of the summarizer checkpoint inside the cache dir.
        """
        # One cache directory serves the checkpoint, BERT weights and
        # tokenizer files.  The original created 'cache' twice with a racy
        # exists()+mkdir pair; makedirs(exist_ok=True) does it once, safely.
        cache_dir = 'cache'
        os.makedirs(cache_dir, exist_ok=True)

        # Fetch the summarizer checkpoint on first use.
        if not os.path.exists(model_path):
            print('Model not found in cache')
            self.download_model(model_path)

        checkpoint = torch.load(model_path,
                                map_location=lambda storage, loc: storage)
        device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model = BertSummarizer(checkpoint, device, cache_dir)
        self.model.eval()

        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                                  do_lower_case=True,
                                                  cache_dir=cache_dir)
        self.predictor = build_predictor(tokenizer, self.model)
def main(method, labels, unit_num, conv_layers, class_num, n_layers,
         dropout_ratio, model_path, save_path):
    """Compare saliency methods on pyridine-containing validation molecules.

    Loads a trained graph-conv predictor, selects validation molecules that
    contain a pyridine substructure, computes VanillaGrad / SmoothGrad /
    BayesGrad saliency maps, and plots recall-precision curves (saved to
    `save_path` when given, otherwise shown interactively).

    NOTE(review): reads the module-level `args.calculator` instead of taking
    it as a parameter — confirm `args` exists at module scope.
    """
    # Dataset preparation
    train, val, test, train_smiles, val_smiles, test_smiles = data.load_dataset(method, labels)

    # --- model preparation ---
    model = predictor.build_predictor(
        method, unit_num, conv_layers, class_num, dropout_ratio, n_layers)

    classifier = L.Classifier(model,
                              lossfun=F.sigmoid_cross_entropy,
                              accfun=F.binary_accuracy)

    print('Loading model parameter from ', model_path)
    serializers.load_npz(model_path, model)

    # Saliency is evaluated on the validation split only.
    target_dataset = val
    target_smiles = val_smiles

    val_mols = [Chem.MolFromSmiles(smi) for smi in tqdm(val_smiles)]

    # Keep only molecules containing the pyridine substructure.
    pyridine_mol = Chem.MolFromSmarts(PYRIDINE_SMILES)
    pyridine_index = np.where(np.array([mol.HasSubstructMatch(pyridine_mol) for mol in val_mols]) == True)
    val_pyridine_mols = np.array(val_mols)[pyridine_index]

    # It only extracts one substructure, not expected behavior
    # val_pyridine_pos = [set(mol.GetSubstructMatch(pi)) for mol in val_pyridine_mols]
    def flatten_tuple(x):
        # Flatten a tuple-of-tuples of atom indices into one flat list.
        return [element for tupl in x for element in tupl]

    # Ground-truth atom positions: all atoms of every pyridine match.
    val_pyridine_pos = [flatten_tuple(mol.GetSubstructMatches(pyridine_mol)) for mol in val_pyridine_mols]

    # print('pyridine_index', pyridine_index)
    # print('val_pyridine_mols', val_pyridine_mols.shape)
    # print('val_pyridine_pos', val_pyridine_pos)
    # print('val_pyridine_pos length', [len(k) for k in val_pyridine_pos])

    pyrigine_dataset = NumpyTupleDataset(*target_dataset.features[pyridine_index, :])
    pyrigine_smiles = target_smiles[pyridine_index]
    print('pyrigine_dataset', len(pyrigine_dataset), len(pyrigine_smiles))

    atoms = pyrigine_dataset.features[:, 0]
    num_atoms = [len(a) for a in atoms]

    def clip_original_size(saliency, num_atoms):
        """`saliency` array is 0 padded, this method align to have original
        molecule's length
        """
        assert len(saliency) == len(num_atoms)
        saliency_list = []
        for i in range(len(saliency)):
            saliency_list.append(saliency[i, :num_atoms[i]])
        return saliency_list

    def preprocess_fun(*inputs):
        # Embed atoms before the saliency target so gradients flow to embeddings.
        atom, adj, t = inputs
        # HACKING for now...
        atom_embed = classifier.predictor.graph_conv.embed(atom)
        return atom_embed, adj, t

    def eval_fun(*inputs):
        # Scalar output for gradient-based saliency: sum of predicted logits.
        atom_embed, adj, t = inputs
        prob = classifier.predictor(atom_embed, adj)
        out = F.sum(prob)
        return out

    # NOTE(review): `args` is not a parameter of main — relies on a module
    # global; confirm it is set by the caller.
    calculator_method = args.calculator
    print('calculator method', calculator_method)
    if calculator_method == 'gradient':
        # option1: Gradient
        calculator = GradientCalculator(
            classifier, eval_fun=eval_fun,
            # target_key='embed', eval_key='out',
            target_key=0,
            # multiply_target=True  # this will calculate grad * input
        )
    elif calculator_method == 'integrated_gradients':
        # option2: IntegratedGradients
        calculator = IntegratedGradientsCalculator(
            classifier, eval_fun=eval_fun,
            # target_key='embed', eval_key='out',
            target_key=0, steps=10
        )
    elif calculator_method == 'occlusion':
        # option3: Occlusion
        def eval_fun_occlusion(*inputs):
            atom_embed, adj, t = inputs
            prob = classifier.predictor(atom_embed, adj)
            # Do not take sum, instead return batch-wise score
            out = F.sigmoid(prob)
            return out
        calculator = OcclusionCalculator(
            classifier, eval_fun=eval_fun_occlusion,
            # target_key='embed', eval_key='out',
            target_key=0, slide_axis=1
        )
    else:
        raise ValueError("[ERROR] Unexpected value calculator_method={}".format(calculator_method))

    # M: number of sampling rounds; rates: recall thresholds for the PR curve.
    M = 100
    num = 20
    rates = np.linspace(0.1, 1, num=num)
    print('M', M)

    # --- VanillaGrad ---
    saliency_arrays = calculator.compute_vanilla(
        pyrigine_dataset, converter=concat_mols, preprocess_fn=preprocess_fun)
    saliency = calculator.transform(
        saliency_arrays, ch_axis=3, method='square')
    # saliency_arrays -> M, batch_size, max_atom, ch_dim
    # print('saliency_arrays', saliency_arrays.shape)
    # saliency -> batch_size, max_atom
    # print('saliency', saliency.shape)
    saliency_vanilla = clip_original_size(saliency, num_atoms)

    # recall & precision
    vanilla_recall, vanilla_precision = calc_recall_precision(saliency_vanilla, rates, val_pyridine_pos)
    print('vanilla_recall', vanilla_recall)
    print('vanilla_precision', vanilla_precision)

    # --- SmoothGrad ---
    saliency_arrays = calculator.compute_smooth(
        pyrigine_dataset, converter=concat_mols, preprocess_fn=preprocess_fun,
        M=M,
        mode='absolute', scale=0.15  # previous implementation
        # mode='relative', scale=0.05
    )
    saliency = calculator.transform(
        saliency_arrays, ch_axis=3, method='square')

    saliency_smooth = clip_original_size(saliency, num_atoms)

    # recall & precision
    smooth_recall, smooth_precision = calc_recall_precision(saliency_smooth, rates, val_pyridine_pos)
    print('smooth_recall', smooth_recall)
    print('smooth_precision', smooth_precision)

    # --- BayesGrad ---
    # bayes grad is calculated by compute_vanilla with train=True
    saliency_arrays = calculator.compute_vanilla(
        pyrigine_dataset, converter=concat_mols, preprocess_fn=preprocess_fun,
        M=M, train=True)
    saliency = calculator.transform(
        saliency_arrays, ch_axis=3, method='square', lam=0)
    saliency_bayes = clip_original_size(saliency, num_atoms)

    bayes_recall, bayes_precision = calc_recall_precision(saliency_bayes, rates, val_pyridine_pos)
    print('bayes_recall', bayes_recall)
    print('bayes_precision', bayes_precision)

    # Plot the three recall-precision curves on one figure.
    plt.figure(figsize=(7, 5), dpi=200)
    plt.plot(vanilla_recall, vanilla_precision, 'k-', color='blue', label='VanillaGrad')
    plt.plot(smooth_recall, smooth_precision, 'k-', color='green', label='SmoothGrad')
    plt.plot(bayes_recall, bayes_precision, 'k-', color='red', label='BayesGrad(Ours)')
    plt.axhline(y=vanilla_precision[-1], color='gray', linestyle='--')
    plt.legend()
    plt.xlabel("recall")
    plt.ylabel("precision")
    if save_path:
        print('saved to ', save_path)
        plt.savefig(save_path)
        # plt.savefig('artificial_pr.eps')
    else:
        plt.show()
Example #19
0
    os.environ["CUDA_VISIBLE_DEVICES"] = args.visible_gpus

    init_logger(args.log_file)
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    device_id = 0 if device == "cuda" else -1

    if (args.task == 'abs'):
        if (args.mode == 'train'):
            train_abs(args, device_id)
        elif (args.mode == 'validate'):
            validate_abs(args, device_id)
        elif (args.mode == 'score'):
            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True, cache_dir=args.temp_dir)
            symbols = {'BOS': tokenizer.vocab['[unused0]'], 'EOS': tokenizer.vocab['[unused1]'],
                       'PAD': tokenizer.vocab['[PAD]'], 'EOQ': tokenizer.vocab['[unused2]']}
            predictor = build_predictor(args, tokenizer, symbols, None, logger)

            # step = 30000
            gold_path = args.result_path + '.gold'
            can_path = args.result_path + '.candidate'
            rouges = predictor._report_rouge(gold_path, can_path)
            logger.info('Rouges at step %d \n%s' % (args.result_path, rouge_results_to_str(rouges)))
            # if self.tensorboard_writer is not None:
            #     self.tensorboard_writer.add_scalar('test/rouge1-F', rouges['rouge_1_f_score'], step)
            #     self.tensorboard_writer.add_scalar('test/rouge2-F', rouges['rouge_2_f_score'], step)
            #     self.tensorboard_writer.add_scalar('test/rougeL-F', rouges['rouge_l_f_score'], step)
        elif (args.mode == 'lead'):
            baseline(args, cal_lead=True)
        elif (args.mode == 'oracle'):
            baseline(args, cal_oracle=True)
        if (args.mode == 'test'):
    def test_model(self,
                   extractor,
                   corpus_type='test',
                   block_trigram=True,
                   quick_test=False):
        """Evaluate a sentence extractor together with the guided-abstractive
        summarizer on *corpus_type* and return the average F1 reported by
        ``test_ext_abs``.

        Args:
            extractor: sentence-extraction model; its ``name`` attribute is
                used to tag log and result files.
            corpus_type: dataset split to load (default ``'test'``).
            block_trigram: whether trigram blocking is enabled; also encoded
                into the run name.
            quick_test: forwarded to ``test_ext_abs`` for a shortened run.
        """
        logger.info('Test SentExt model (%s) and GuidAbs model (%s) ...' %
                    (extractor.name, self.model_file))
        trigram_tag = 'blocktrigram' if block_trigram else 'noblocktrigram'
        run_name = '%s_guidabs_%s' % (extractor.name, trigram_tag)

        # Build the argument namespace for the abstractive model.
        args = self._build_abs_args()
        args.mode = 'test'
        args.bert_data_path = path.join(self.data_path, 'cnndm')
        args.model_path = self.result_path
        args.log_file = path.join(self.result_path,
                                  'test_varextabs.%s.log' % run_name)
        args.result_path = path.join(self.result_path, 'cnndm_' + run_name)
        args.block_trigram = block_trigram
        init_logger(args.log_file)

        # Load the abstractive summarizer checkpoint; the training step is
        # parsed from the checkpoint filename (…_<step>.<ext> convention).
        ckpt_file = self.model_file
        logger.info('Loading abs model %s' % ckpt_file)
        step_abs = int(ckpt_file.split('.')[-2].split('_')[-1])
        checkpoint = torch.load(ckpt_file,
                                map_location=lambda storage, loc: storage)
        model_abs = model_bld.AbsSummarizer(args, args.device, checkpoint)
        model_abs.eval()

        # Tokenizer and the special-symbol table the predictor expects.
        tokenizer = BertTokenizer.from_pretrained(
            path.join(args.bert_model_path, model_abs.bert.model_name),
            do_lower_case=True,
            cache_dir=args.temp_dir)
        symbols = {
            'BOS': tokenizer.vocab['[unused0]'],
            'EOS': tokenizer.vocab['[unused1]'],
            'PAD': tokenizer.vocab['[PAD]'],
            'EOQ': tokenizer.vocab['[unused2]'],
        }

        predictor = pred_abs.build_predictor(args, tokenizer, symbols,
                                             model_abs, logger)
        test_iter = data_ldr.Dataloader(
            args,
            data_ldr.load_dataset(args, corpus_type, shuffle=False),
            args.test_batch_size,
            args.device,
            shuffle=False,
            is_test=True,
            keep_order=True)

        logger.info('Generating Ext/GuidAbs results %s ...' % args.result_path)
        avg_f1 = test_ext_abs(logger,
                              args,
                              extractor,
                              predictor,
                              0,
                              step_abs,
                              test_iter,
                              quick_test=quick_test)
        return avg_f1
Example #21
0
def train(gpu, method, epoch, batchsize, n_unit, conv_layers, dataset, smiles,
          M, n_split, split_idx, order):
    """Train a regression predictor on one cross-validation fold.

    The permutation ``order`` is partitioned into ``n_split`` contiguous
    folds; fold ``split_idx`` is the test set and the remainder the training
    set.  Labels are standardized with a scaler fitted on the training
    portion only.  After training, the model, predictions, ground truth,
    test indices, SMILES and a result plot are written under the run
    directory, and saliency maps are computed via ``save_result``.

    Args:
        gpu (int): device id; a negative value runs on CPU.
        method (str): graph-convolution architecture name.
        epoch (int): number of training epochs.
        batchsize (int): minibatch size.
        n_unit (int): hidden-unit count of the predictor.
        conv_layers (int): number of graph-convolution layers.
        dataset: NumpyTupleDataset whose last array holds the labels.
        smiles: SMILES strings aligned with ``dataset``.
        M (int): saliency sample count (used in the output dir name and by
            ``save_result``).
        n_split (int): total number of folds.
        split_idx (int): index of the fold used as test data.
        order: permutation of ``range(len(dataset))`` defining the folds.
    """
    n = len(dataset)
    assert len(order) == n
    fold = n // n_split
    left_idx = fold * split_idx
    if n_split == split_idx + 1:
        # Right-most fold: absorb the remainder up to the end as test data.
        test_order = order[left_idx:]
        train_order = order[:left_idx]
    else:
        right_idx = fold * (split_idx + 1)
        test_order = order[left_idx:right_idx]
        train_order = np.concatenate([order[:left_idx], order[right_idx:]])

    new_order = np.concatenate([train_order, test_order])
    n_train = len(train_order)

    # Standardize labels; fit the scaler on the training portion only to
    # avoid leaking test statistics.
    ss = StandardScaler()
    labels = dataset.get_datasets()[-1]
    ss = ss.fit(labels[new_order[:n_train]])  # fit only by train
    # (was: re-fetched dataset.get_datasets()[-1] — same array as `labels`)
    labels = ss.transform(labels)
    dataset = NumpyTupleDataset(*(dataset.get_datasets()[:-1] + (labels, )))

    dataset_train = SubDataset(dataset, 0, n_train, new_order)
    dataset_test = SubDataset(dataset, n_train, n, new_order)

    # Network
    model = predictor.build_predictor(method,
                                      n_unit,
                                      conv_layers,
                                      1,
                                      dropout_ratio=0.25,
                                      n_layers=1)

    train_iter = I.SerialIterator(dataset_train, batchsize)
    val_iter = I.SerialIterator(dataset_test,
                                batchsize,
                                repeat=False,
                                shuffle=False)

    def scaled_abs_error(x0, x1):
        """Mean absolute error in the original (un-standardized) label scale."""
        if isinstance(x0, Variable):
            x0 = cuda.to_cpu(x0.data)
        if isinstance(x1, Variable):
            x1 = cuda.to_cpu(x1.data)
        scaled_x0 = ss.inverse_transform(cuda.to_cpu(x0))
        scaled_x1 = ss.inverse_transform(cuda.to_cpu(x1))
        return np.mean(np.absolute(scaled_x0 - scaled_x1), axis=0)[0]

    regressor = Regressor(model,
                          lossfun=F.mean_squared_error,
                          metrics_fun={'abs_error': scaled_abs_error},
                          device=gpu)

    optimizer = O.Adam(alpha=0.0005)
    optimizer.setup(regressor)

    updater = training.StandardUpdater(train_iter,
                                       optimizer,
                                       device=gpu,
                                       converter=concat_mols)

    # Output directory: <base>/<split_idx>-<n_split>
    dir_path = get_dir_path(batchsize, n_unit, conv_layers, M, method)
    dir_path = os.path.join(dir_path, str(split_idx) + "-" + str(n_split))
    os.makedirs(dir_path, exist_ok=True)
    print('creating ', dir_path)
    np.save(os.path.join(dir_path, "test_idx"), np.array(test_order))

    trainer = training.Trainer(updater, (epoch, 'epoch'), out=dir_path)
    trainer.extend(
        E.Evaluator(val_iter, regressor, device=gpu, converter=concat_mols))
    trainer.extend(E.LogReport())
    trainer.extend(
        E.PrintReport([
            'epoch', 'main/loss', 'main/abs_error', 'validation/main/loss',
            'validation/main/abs_error', 'elapsed_time'
        ]))
    trainer.extend(E.ProgressBar())
    trainer.run()

    # --- Plot regression evaluation result ---
    # Re-uses dataset_test built above (the original rebuilt it redundantly
    # with identical arguments).
    batch_all = concat_mols(dataset_test, device=gpu)
    serializers.save_npz(os.path.join(dir_path, "model.npz"), model)
    result = model(batch_all[0], batch_all[1])
    result = ss.inverse_transform(cuda.to_cpu(result.data))
    answer = ss.inverse_transform(cuda.to_cpu(batch_all[2]))
    plot_result(result,
                answer,
                save_filepath=os.path.join(dir_path, "result.png"))

    # --- Plot regression evaluation result end ---
    np.save(os.path.join(dir_path, "output.npy"), result)
    np.save(os.path.join(dir_path, "answer.npy"), answer)
    smiles_part = np.array(smiles)[test_order]
    np.save(os.path.join(dir_path, "smiles.npy"), smiles_part)

    # Calculate saliency maps for the trained model and persist them.
    save_result(dataset, model, dir_path, M)
Example #22
0
def main() -> None:
    """Train a graph-convolution classifier on Tox21 (plus the artificial
    'pyridine' label), report ROC-AUC on the train/val splits, and save the
    trained predictor and the run arguments under ``--out``.
    """
    # Supported preprocessing/network list
    method_list = ['nfp', 'ggnn', 'schnet', 'weavenet', 'nfpdrop', 'ggnndrop']
    label_names = D.get_tox21_label_names() + ['pyridine']
    iterator_type = ['serial', 'balanced']

    parser = argparse.ArgumentParser(
        description='Multitask Learning with Tox21.')
    parser.add_argument('--method',
                        '-m',
                        type=str,
                        choices=method_list,
                        default='nfp',
                        help='graph convolution model to use '
                        'as a predictor.')
    parser.add_argument('--label',
                        '-l',
                        type=str,
                        choices=label_names,
                        default='',
                        help='target label for logistic '
                        'regression. Use all labels if this option '
                        'is not specified.')
    parser.add_argument('--iterator-type',
                        type=str,
                        choices=iterator_type,
                        default='serial',
                        help='iterator type. If `balanced` '
                        'is specified, data is sampled to take same number of'
                        'positive/negative labels during training.')
    parser.add_argument('--conv-layers',
                        '-c',
                        type=int,
                        default=4,
                        help='number of convolution layers')
    parser.add_argument('--n-layers',
                        type=int,
                        default=1,
                        help='number of mlp layers')
    parser.add_argument('--batchsize',
                        '-b',
                        type=int,
                        default=32,
                        help='batch size')
    parser.add_argument('--gpu',
                        '-g',
                        type=int,
                        default=-1,
                        help='GPU ID to use. Negative value indicates '
                        'not to use GPU and to run the code in CPU.')
    parser.add_argument('--out',
                        '-o',
                        type=str,
                        default='result',
                        help='path to output directory')
    parser.add_argument('--epoch',
                        '-e',
                        type=int,
                        default=10,
                        help='number of epochs')
    parser.add_argument('--unit-num',
                        '-u',
                        type=int,
                        default=16,
                        help='number of units in one layer of the model')
    parser.add_argument('--resume',
                        '-r',
                        type=str,
                        default='',
                        help='path to a trainer snapshot')
    parser.add_argument('--frequency',
                        '-f',
                        type=int,
                        default=-1,
                        help='Frequency of taking a snapshot')
    parser.add_argument('--dropout-ratio',
                        '-d',
                        type=float,
                        default=0.25,
                        help='dropout_ratio')
    parser.add_argument('--seed', type=int, default=0, help='random seed')
    parser.add_argument('--num-train',
                        type=int,
                        default=-1,
                        help='number of training data to be used, '
                        'negative value indicates use all train data')
    args = parser.parse_args()

    method = args.method
    if args.label:
        labels = args.label
        # NOTE(review): argparse (type=str) always yields a single string
        # here, so the isinstance(..., list) branch cannot fire and
        # class_num is always 1 when --label is given.
        class_num = len(labels) if isinstance(labels, list) else 1
    else:
        # No label given: train on every task simultaneously.
        labels = None
        class_num = len(label_names)

    # Dataset preparation
    train, val, test, train_smiles, val_smiles, test_smiles = data.load_dataset(
        method, labels)

    num_train = args.num_train  # 100
    if num_train > 0:
        # reduce size of train data: draw a random subset of `num_train`
        # examples, seeded for reproducibility.
        seed = args.seed  # 0
        np.random.seed(seed)
        train_selected_label = np.random.permutation(np.arange(
            len(train)))[:num_train]
        print('num_train', num_train, len(train_selected_label),
              train_selected_label)
        train = NumpyTupleDataset(*train.features[train_selected_label, :])
    # Network
    predictor_ = predictor.build_predictor(method, args.unit_num,
                                           args.conv_layers, class_num,
                                           args.dropout_ratio, args.n_layers)

    # Training iterator: `balanced` resamples positives/negatives evenly but
    # only supports single-label classification.
    iterator_type = args.iterator_type
    if iterator_type == 'serial':
        train_iter = I.SerialIterator(train, args.batchsize)
    elif iterator_type == 'balanced':
        if class_num > 1:
            raise ValueError('BalancedSerialIterator can be used with only one'
                             'label classification, please specify label to'
                             'be predicted by --label option.')
        train_iter = BalancedSerialIterator(train,
                                            args.batchsize,
                                            train.features[:, -1],
                                            ignore_labels=-1)
        train_iter.show_label_stats()
    else:
        raise ValueError('Invalid iterator type {}'.format(iterator_type))
    val_iter = I.SerialIterator(val,
                                args.batchsize,
                                repeat=False,
                                shuffle=False)
    # Multi-label sigmoid classifier wrapping the graph-conv predictor.
    classifier = L.Classifier(predictor_,
                              lossfun=F.sigmoid_cross_entropy,
                              accfun=F.binary_accuracy)
    if args.gpu >= 0:
        chainer.cuda.get_device_from_id(args.gpu).use()
        classifier.to_gpu()

    optimizer = O.Adam()
    optimizer.setup(classifier)

    updater = training.StandardUpdater(train_iter,
                                       optimizer,
                                       device=args.gpu,
                                       converter=concat_mols)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)

    trainer.extend(
        E.Evaluator(val_iter,
                    classifier,
                    device=args.gpu,
                    converter=concat_mols))
    trainer.extend(E.LogReport())

    # --- ROCAUC Evaluator ---
    # Separate non-repeating iterator over the train split so ROC-AUC can be
    # reported on training data as well.
    train_eval_iter = I.SerialIterator(train,
                                       args.batchsize,
                                       repeat=False,
                                       shuffle=False)
    trainer.extend(
        ROCAUCEvaluator(train_eval_iter,
                        classifier,
                        eval_func=predictor_,
                        device=args.gpu,
                        converter=concat_mols,
                        name='train'))
    trainer.extend(
        ROCAUCEvaluator(val_iter,
                        classifier,
                        eval_func=predictor_,
                        device=args.gpu,
                        converter=concat_mols,
                        name='val'))
    trainer.extend(
        E.PrintReport([
            'epoch', 'main/loss', 'main/accuracy', 'train/main/roc_auc',
            'validation/main/loss', 'validation/main/accuracy',
            'val/main/roc_auc', 'elapsed_time'
        ]))

    trainer.extend(E.ProgressBar(update_interval=10))
    if args.resume:
        # Resume training from a previously saved trainer snapshot.
        chainer.serializers.load_npz(args.resume, trainer)

    trainer.run()

    # Persist the run configuration and the trained predictor weights.
    with open(os.path.join(args.out, 'args.json'), 'w') as f:
        json.dump(vars(args), f, indent=4)
    chainer.serializers.save_npz(os.path.join(args.out, 'predictor.npz'),
                                 predictor_)
Example #23
0
    def inference_for_demo(self, loader, run_gen=False, portion='all', level=3):
        """Run the detector on demo data against the saved checkpoints.

        Depending on ``level``, different training stages are evaluated:
        ``level == 1`` tests the stage-1 detector and the stage-3 final model
        without generation; ``level >= 2`` additionally tests the stage-2
        adversarial model with and without generated responses; ``level >= 3``
        also evaluates the stage-3 model with generation and records which
        examples the final model fixed relative to the adversarial one.

        Args:
            loader: raw demo data consumed by ``TreeDataset``.
            run_gen: kept for interface compatibility (not read here).
            portion: kept for interface compatibility (not read here).
            level: highest training stage to evaluate (see above).
        """
        test = TreeDataset(self.args, loader, self.dataname, self.device, self.tokenizer)

        # Single batch when the dataset is smaller than the batch size.
        data_iter = Iterator(test, train=False, device=self.device,
                             batch_size=len(test) if len(test) < self.bs else self.bs,
                             sort_key=lambda x: len(x.src),
                             sort_within_batch=False)
        # Define trainer
        train_loss = abs_loss(self.args.label_num, self.maxepoch, self.device, train=False)
        trainer = build_trainer(self.args, self.model, self.optim, train_loss)

        def _load_state(ckpt_path):
            """Load a checkpoint into self.model, falling back to a
            non-strict load when the key sets do not match exactly.
            (Was four copy-pasted bare ``except:`` blocks; narrowed to
            ``Exception`` so KeyboardInterrupt/SystemExit pass through.)"""
            ckpt = torch.load(ckpt_path, map_location=lambda storage, loc: storage)['model']
            try:
                self.model.load_state_dict(ckpt)
            except Exception:
                mismatch = self.model.load_state_dict(ckpt, strict=False)
                print(mismatch)
                logger.info('[Warning] The keys in state dict do not strictly match')

        if level == 1:
            logger.info('Test on detection model (stage-1)')
            best_model = os.path.join(self.args.savepath, 'best_model.pt')
            _load_state(best_model)
            trainer.testing(data_iter, tokenizer=self.tokenizer, gen_flag=False, info="{}".format(best_model.split("/")[-1]), write_type="w")

            logger.info('Test on final model (stage-3)')
            best_model = os.path.join(self.args.savepath, 'best_final_model.pt')
            _load_state(best_model)
            trainer.testing(data_iter, tokenizer=self.tokenizer, gen_flag=False, info="{}".format(best_model.split("/")[-1]), write_type="a")

        if level >= 2:
            logger.info('Test on adv model (stage-2)')
            best_model = os.path.join(self.args.savepath, 'best_adv_model.pt')
            _load_state(best_model)

            # Test without generated response
            trainer.testing(data_iter, tokenizer=self.tokenizer, gen_flag=False, info="{}".format(best_model.split("/")[-1]), write_type="w")

            # Test with generated response
            _, wrongs_before = trainer.testing(data_iter, tokenizer=self.tokenizer, gen_flag=True, info="{} With Generated Response".format(best_model.split("/")[-1]), write_type="a", output_wrong_pred=True)
            predictor = build_predictor(self.args, self.model, self.tokenizer, self.symbols, logger)
            predictor.translate(data_iter, 'best', have_gold=False, info="{} With Generated Response".format(best_model.split("/")[-1]))

        if level >= 3:
            logger.info('Test on final model (stage-3)')
            # NOTE(review): this loads 'best_model.pt' although the level==1
            # branch uses 'best_final_model.pt' for stage-3 — confirm which
            # checkpoint is intended here.
            best_model = os.path.join(self.args.savepath, 'best_model.pt')
            _load_state(best_model)

            # Test without generated response
            trainer.testing(data_iter, tokenizer=self.tokenizer, gen_flag=False, info="{}".format(best_model.split("/")[-1]), write_type="a")

            # Test with generated response
            _, wrongs_after = trainer.testing(data_iter, tokenizer=self.tokenizer, gen_flag=True, info="{} With Generated Response".format(best_model.split("/")[-1]), write_type="a", output_wrong_pred=True)
            predictor = build_predictor(self.args, self.model, self.tokenizer, self.symbols, logger)
            predictor.translate(data_iter, 'best', have_gold=False, info="{} With Generated Response".format(best_model.split("/")[-1]))

            # Examples that were successfully attacked at stage-2 but fixed
            # by the final model.
            fixed = [str(i) for i in wrongs_before if i not in wrongs_after]

            wrongs_before = [str(i) for i in wrongs_before]
            with open(os.path.join(self.args.savepath, "id_fixed.txt"), "w") as f:
                f.write("\n".join(fixed))

            with open(os.path.join(self.args.savepath, "id_attack_success.txt"), "w") as f:
                f.write("\n".join(wrongs_before))