Ejemplo n.º 1
0
    def setup(self, config):
        """Load the models and the corpus to attack, and wire up the hooks.

        Stores ``config``, builds ``self.task`` from the saved vocabulary and
        the best parser checkpoint, loads the corpus into ``self.corpus`` and
        wraps it in ``self.loader``.  It also registers a backward hook on the
        character embedding, builds ``self.embed_searcher`` and seeds the
        ``random`` / ``torch`` generators.

        Args:
            config: configuration object; this method reads ``vocab``,
                ``parser_model``, ``hk_training_set``, ``ftrain`` and
                ``fdata`` from it.
        """
        self.config = config

        print("Load the models")
        vocab = torch.load(config.vocab)  # type: Vocab
        parser = load_parser(fetch_best_ckpt_name(config.parser_model))

        self.task = ParserTask(vocab, parser)

        print("Load the dataset")

        # Read only the corpus that is actually used: the training file is
        # no longer loaded (and thrown away) when the attack targets fdata.
        if config.hk_training_set == 'on':
            corpus_path = config.ftrain
        else:
            corpus_path = config.fdata
        self.corpus = Corpus.load(corpus_path)

        dataset = TextDataset(vocab.numericalize(self.corpus, True))
        # set the data loader
        self.loader = DataLoader(dataset=dataset, collate_fn=collate_fn)

        def embed_backward_hook(module, grad_in, grad_out):
            # Stash the gradient w.r.t. the embedding output under the
            # 'embed_grad' key so it can be read back after backward().
            ram_write('embed_grad', grad_out[0])

        # NOTE(review): self.parser / self.vocab are not assigned in this
        # method; presumably the enclosing class exposes them (e.g. as
        # properties backed by self.task) -- confirm.
        # NOTE(review): recent PyTorch releases deprecate
        # register_backward_hook in favour of register_full_backward_hook --
        # check against the torch version this project pins.
        self.parser.char_lstm.embed.register_backward_hook(embed_backward_hook)
        # self.parser.embed.register_backward_hook(embed_backward_hook)
        self.parser.eval()

        self.embed_searcher = EmbeddingSearcher(
            embed=self.parser.char_lstm.embed.weight,
            idx2word=lambda x: self.vocab.chars[x],
            word2idx=lambda x: self.vocab.char_dict[x])

        random.seed(1)
        torch.manual_seed(1)
Ejemplo n.º 2
0
    def pre_attack(self, config):
        """Load vocabulary, parser and data prior to running an attack.

        Sets ``self.vocab``, ``self.parser`` and ``self.task``.

        Args:
            config: configuration object; ``vocab``, ``parser_model`` and
                ``fdata`` are read from it.

        Returns:
            A ``(corpus, loader)`` pair: the corpus loaded from
            ``config.fdata`` and a ``DataLoader`` over its numericalized
            form.
        """
        print("Load the models")
        self.vocab = torch.load(config.vocab)
        best_ckpt = fetch_best_ckpt_name(config.parser_model)
        self.parser = load_parser(best_ckpt)
        self.task = ParserTask(self.vocab, self.parser)

        print("Load the dataset")
        corpus = Corpus.load(config.fdata)
        numericalized = self.vocab.numericalize(corpus, training=True)
        loader = DataLoader(dataset=TextDataset(numericalized),
                            collate_fn=collate_fn)
        return corpus, loader
Ejemplo n.º 3
0
    def __call__(self, config):
        """Evaluate the best saved parser on ``config.fdata`` and print the result.

        Args:
            config: configuration object; reads ``vocab``, ``parser_model``,
                ``pred_tag``, ``tagger_model``, ``fdata``, ``batch_size``,
                ``buckets`` and ``punct``.
        """
        print("Load the models")
        vocab = torch.load(config.vocab)
        parser = load_parser(fetch_best_ckpt_name(config.parser_model))
        task = ParserTask(vocab, parser)

        # A tagger is only loaded when tag prediction is switched on.
        tagger = None
        if config.pred_tag:
            tagger_ckpt = fetch_best_ckpt_name(config.tagger_model)
            tagger = PosTagger.load(tagger_ckpt)

        print("Load the dataset")
        corpus = Corpus.load(config.fdata)
        numericalized = vocab.numericalize(corpus)
        # set the data loader
        loader = batchify(TextDataset(numericalized),
                          config.batch_size,
                          config.buckets)

        print("Evaluate the dataset")
        loss, metric = task.evaluate(loader, config.punct, tagger, True)
        print(f"Loss: {loss:.4f} {metric}")
Ejemplo n.º 4
0
    def __call__(self, config):
        """Predict tags, heads and relations for ``config.fdata`` and save them.

        The predictions overwrite ``corpus.tags``, ``corpus.heads`` and
        ``corpus.rels``; the corpus is then written to
        ``<config.result_path>/raw_result.conllx``.

        Args:
            config: configuration object; reads ``vocab``, ``parser_model``,
                ``pred_tag``, ``tagger_model``, ``fdata``, ``batch_size`` and
                ``result_path``.
        """
        print("Load the models")
        vocab = torch.load(config.vocab)
        parser = load_parser(fetch_best_ckpt_name(config.parser_model))
        task = ParserTask(vocab, parser)

        # A tagger is only loaded when tag prediction is switched on.
        tagger = None
        if config.pred_tag:
            tagger_ckpt = fetch_best_ckpt_name(config.tagger_model)
            tagger = PosTagger.load(tagger_ckpt)

        print("Load the dataset")
        corpus = Corpus.load(config.fdata)
        numericalized = vocab.numericalize(corpus, training=False)
        # set the data loader
        loader = batchify(TextDataset(numericalized), config.batch_size)

        print("Make predictions on the dataset")
        predictions = task.predict(loader, tagger)
        corpus.tags, corpus.heads, corpus.rels = predictions

        saved_path = f'{config.result_path}/raw_result.conllx'
        print(f"Save the predicted result to {saved_path}")
        corpus.save(saved_path)
Ejemplo n.º 5
0
    def __call__(self, config):
        """Build an augmented training corpus and save it to disk.

        Every sentence of ``config.ftrain`` is copied to the output corpus.
        For a random subset (uniform draw below ``config.augmentation_rate``)
        an extra sentence produced by the attack sequence generator is
        appended right after the original one.

        Args:
            config: configuration object; reads ``vocab``, ``parser_model``,
                ``ftrain``, ``augmentation_rate``, ``mst``, ``input``,
                ``augmentation_dir`` and ``revised_rate``.
        """
        self.vocab = torch.load(config.vocab)
        self.parser = load_parser(fetch_best_ckpt_name(config.parser_model))
        self.task = ParserTask(self.vocab, self.parser)

        # load training data
        corpus = Corpus.load(config.ftrain)
        numericalized = self.vocab.numericalize(corpus, training=True)
        # NOTE(review): the loop below indexes corpus.sentences with the
        # batch index, which relies on DataLoader's defaults (batch size 1,
        # no shuffling) -- confirm collate_fn does not change that.
        loader = DataLoader(dataset=TextDataset(numericalized),
                            collate_fn=collate_fn)
        augmentation_corpus = Corpus([])
        training_data_number = len(corpus.sentences)
        self.attack_seq_generator = self.get_attack_seq_generator(config)

        # One uniform draw per training sentence; a sentence is augmented
        # when its draw falls below the augmentation rate.
        prob = np.random.uniform(0.0, 1.0, size=(training_data_number,))
        for index, (seq_idx, tag_idx, chars, arcs, rels) in enumerate(loader):
            sentence = corpus.sentences[index]
            augmentation_corpus.sentences.append(sentence)
            if not index % 1000:
                print("{} sentences have processed! ".format(index))

            if not prob[index] < config.augmentation_rate:
                continue

            seqs = self.get_seqs_name(seq_idx)
            tags = self.get_tags_name(tag_idx)
            mask = self.get_mask(seq_idx,
                                 self.vocab.pad_index,
                                 punct_list=self.vocab.puncts)
            single_batch = [(seq_idx, tag_idx, chars, arcs, rels)]
            raw_metric = self.task.evaluate(single_batch, mst=config.mst)
            # Only the first of the six returned values is used here.
            augmentation_seq, _, _, _, _, _ = \
                self.attack_seq_generator.generate_attack_seq(
                    ' '.join(seqs[1:]), seq_idx, tags, tag_idx, chars,
                    arcs, rels, mask, raw_metric)
            augmented_sentence = init_sentence(sentence.FORM,
                                               tuple(augmentation_seq[1:]),
                                               sentence.POS,
                                               sentence.HEAD,
                                               sentence.DEPREL)
            augmentation_corpus.sentences.append(augmented_sentence)

        # The output file name depends on the input granularity.
        if config.input == 'char':
            name_template = '{}/ptb_train_typo_only_substitute_{:.0f}%_{}.sd'
        else:
            name_template = '{}/ptb_train_{:.0f}%_{}.sd'
        saved_file = name_template.format(config.augmentation_dir,
                                          config.augmentation_rate*100,
                                          config.revised_rate)

        print("Complete! {} sentences have processed!".format(training_data_number))
        print("Current training data number is {}.".format(len(augmentation_corpus.sentences)))
        print("The augmentation data are saved to file {}".format(saved_file))
        augmentation_corpus.save(saved_file)
Ejemplo n.º 6
0
def compare_idxes(nbr1, nbr2):
    """Return how many distinct elements the two neighbour collections share.

    Both arguments are passed through ``cast_list`` and de-duplicated before
    the overlap is counted.
    """
    first, second = (set(cast_list(nbr)) for nbr in (nbr1, nbr2))
    return len(first & second)


if __name__ == '__main__':
    # Ad-hoc timing script: runs ten neighbour searches over the vocabulary's
    # embeddings inside a time_record() block.
    from dpattack.libs.luna import fetch_best_ckpt_name, cast_list, show_mean_std, time_record
    from dpattack.utils.parser_helper import load_parser
    from dpattack.utils.vocab import Vocab

    # Machine-specific locations of the saved vocabulary and parser models.
    vocab_path = "/disks/sdb/zjiehang/zhou_data/ptb/vocab"
    parser_dir = "/disks/sdb/zjiehang/zhou_data/saved_models/word_tag/lzynb"

    vocab = torch.load(vocab_path)  # type: Vocab
    # The parser is loaded but only referenced by the commented-out
    # experiment at the bottom of this script.
    parser = load_parser(fetch_best_ckpt_name(parser_dir))
    # print(type(vocab))

    def _idx_to_word(idx):
        return vocab.words[idx]

    def _word_to_idx(word):
        return vocab.word_dict[word]

    esglv = EmbeddingSearcher(embed=vocab.embeddings,
                              idx2word=_idx_to_word,
                              word2idx=_word_to_idx)

    with time_record():
        esglv.use_faiss_backend(False, True, 10, 1)
        for _ in range(10):
            esglv.find_neighbours(0, 100)

    # esglv.show_embedding_info()
    # esmdl = EmbeddingSearcher(embed=parser.embed.weight,
    #                           idx2word=lambda x: vocab.words[x],
    #                           word2idx=lambda x: vocab.word_dict[x])
Ejemplo n.º 7
0
    def __call__(self, config):
        """Train a parser or a POS tagger and report its dev/test scores.

        Loads the train/dev/test corpora, loads or builds the vocabulary,
        trains for up to ``config.epochs`` epochs with Adam and an
        exponential learning-rate schedule, saves a checkpoint whenever the
        dev metric improves (after the first ``config.patience`` epochs),
        and finally re-evaluates the best checkpoint on the test set.

        Args:
            config: configuration object.  It is read for file paths and
                hyper-parameters and is also updated in place with
                vocabulary-derived sizes and indices.
        """
        print("Preprocess the data")
        train = Corpus.load(config.ftrain)
        dev = Corpus.load(config.fdev)
        test = Corpus.load(config.ftest)
        # Reuse a previously saved vocabulary when one exists; otherwise
        # build it from the training corpus and save it for later runs.
        if os.path.exists(config.vocab):
            vocab = torch.load(config.vocab)
        else:
            vocab = Vocab.from_corpus(corpus=train, min_freq=2)
            vocab.read_embeddings(Pretrained.load(config.fembed, config.unk))
            torch.save(vocab, config.vocab)
        # Expose vocabulary sizes and special indices through the config.
        config.update({
            'n_words': vocab.n_train_words,
            'n_tags': vocab.n_tags,
            'n_rels': vocab.n_rels,
            'n_chars': vocab.n_chars,
            'pad_index': vocab.pad_index,
            'unk_index': vocab.unk_index
        })
        print(vocab)

        print("Load the dataset")
        trainset = TextDataset(vocab.numericalize(train))
        devset = TextDataset(vocab.numericalize(dev))
        testset = TextDataset(vocab.numericalize(test))
        # set the data loaders
        # Only the training loader is shuffled.
        train_loader = batchify(dataset=trainset,
                                batch_size=config.batch_size,
                                n_buckets=config.buckets,
                                shuffle=True)
        dev_loader = batchify(dataset=devset,
                              batch_size=config.batch_size,
                              n_buckets=config.buckets)
        test_loader = batchify(dataset=testset,
                               batch_size=config.batch_size,
                               n_buckets=config.buckets)
        print(f"{'train:':6} {len(trainset):5} sentences in total, "
              f"{len(train_loader):3} batches provided")
        print(f"{'dev:':6} {len(devset):5} sentences in total, "
              f"{len(dev_loader):3} batches provided")
        print(f"{'test:':6} {len(testset):5} sentences in total, "
              f"{len(test_loader):3} batches provided")

        print("Create the models")
        assert config.train_task in ['parser', 'tagger']
        is_training_parser = config.train_task == 'parser'

        # Optional second test set, evaluated alongside the regular one.
        # NOTE(review): its statistics are printed under the same 'test:'
        # label as the regular test set -- confirm this is intended.
        if config.augmentation_training:
            aug_test = Corpus.load(config.augmentation_test_file)
            aug_testset = TextDataset(vocab.numericalize(aug_test))
            aug_test_loader = batchify(dataset=aug_testset,
                                       batch_size=config.batch_size,
                                       n_buckets=config.buckets)
            print(f"{'test:':6} {len(aug_testset):5} sentences in total, "
                  f"{len(aug_test_loader):3} batches provided")

        # Pick the model, the task wrapper and a fresh metric of the
        # matching kind to compare dev results against.
        if is_training_parser:
            model = init_parser(config, vocab.embeddings)
            task = ParserTask(vocab, model)
            best_e, best_metric = 1, ParserMetric()
        else:
            model = PosTagger(config, vocab.embeddings)
            task = TaggerTask(vocab, model)
            best_e, best_metric = 1, TaggerMetric()

        if torch.cuda.is_available():
            model = model.cuda()
        print(f"{model}\n")
        total_time = timedelta()
        # best_e, best_metric = 1, TaggerMetric()
        task.optimizer = Adam(task.model.parameters(), config.lr,
                              (config.beta_1, config.beta_2), config.epsilon)
        # Per-step decay factor chosen so that the learning rate is
        # multiplied by config.decay after config.steps scheduler steps.
        task.scheduler = ExponentialLR(task.optimizer,
                                       config.decay**(1 / config.steps))
        for epoch in range(1, config.epochs + 1):
            start = datetime.now()
            # train one epoch and update the parameters
            task.train(train_loader)

            print(f"Epoch {epoch} / {config.epochs}:")
            loss, train_metric = task.evaluate(train_loader, config.punct)
            print(f"{'train:':6} Loss: {loss:.4f} {train_metric}")
            loss, dev_metric = task.evaluate(dev_loader, config.punct)
            print(f"{'dev:':6} Loss: {loss:.4f} {dev_metric}")
            loss, test_metric = task.evaluate(test_loader, config.punct)
            print(f"{'test:':6} Loss: {loss:.4f} {test_metric}")
            if config.augmentation_training:
                loss, aug_test_metric = task.evaluate(aug_test_loader,
                                                      config.punct)
                print(f"{'test:':6} Loss: {loss:.4f} {aug_test_metric}")

            # Elapsed time of this epoch, including the evaluations above.
            t = datetime.now() - start

            # Checkpoint only on a dev improvement, and only once more than
            # config.patience epochs have run.
            if dev_metric > best_metric and epoch > config.patience:
                best_e, best_metric = epoch, dev_metric
                if is_training_parser:
                    task.model.save(config.parser_model + f".{best_e}")
                else:
                    task.model.save(config.tagger_model + f".{best_e}")
                print(f"{t}s elapsed (saved)\n")
            else:
                print(f"{t}s elapsed\n")
            sys.stdout.flush()
            total_time += t
            # Early stopping: no new best for config.patience epochs.
            if epoch - best_e >= config.patience:
                break

        # Publish the best checkpoint under the '.best' suffix and reload it
        # for the final evaluation.
        # NOTE(review): if the save branch above never ran, best_e is still 1
        # and no '.1' file was written by this method -- confirm copyfile
        # cannot fail in that case.
        if is_training_parser:
            copyfile(config.parser_model + f'.{best_e}',
                     config.parser_model + '.best')
            task.model = load_parser(config.parser_model + f".{best_e}")
        else:
            copyfile(config.tagger_model + f'.{best_e}',
                     config.tagger_model + '.best')
            task.model = PosTagger.load(config.tagger_model + f".{best_e}")
        loss, metric = task.evaluate(test_loader, config.punct)

        print(f"max score of dev is {best_metric.score:.2%} at epoch {best_e}")
        print(f"the score of test at epoch {best_e} is {metric.score:.2%}")

        if config.augmentation_training:
            loss, metric = task.evaluate(aug_test_loader, config.punct)
            print(
                f"the score of aug test at epoch {best_e} is {metric.score:.2%}"
            )

        # `epoch` is the last epoch that actually ran (the loop may have
        # stopped early).
        # NOTE(review): `epoch` is unbound if config.epochs < 1 -- confirm
        # the config guarantees at least one epoch.
        print(f"average time of each epoch is {total_time / epoch}s")
        print(f"{total_time}s elapsed")