Example #1
    def __call__(self, config):
        random.seed(1)
        np.random.seed(1)
        torch.manual_seed(1)

        #self.distance = F.cosine_similarity
        self.succeed_number = 0

        corpus, loader = self.pre_attack(config)
        # print(self.vocab.char_dict)  # debug leftover: inspect the character vocabulary
        # indices of the punctuation dependency relation and of punctuation-like characters
        self.punct_rel_idx = self.vocab.rel_dict[CONSTANT.PUNCT]
        self.punct_idx = [value for key, value in self.vocab.char_dict.items()
                          if not (key.isdigit() or key.isalpha())]
        # for saving
        attack_corpus = Corpus([])

        # keep a handle on the character embedding matrix and register a hook
        # to capture its gradients during the backward pass
        self.embedding_weight = self.parser.char_lstm.embed.weight
        self.parser.char_lstm.embed.register_backward_hook(self.char_embed_backward_hook)
        # attack seq generator
        # self.attack_seq_generator = self.get_attack_seq_generator(config)
        self.attack(loader, config, attack_corpus)

        # save to file
        if config.save_result_to_file:
            attack_corpus_save_path = self.get_attack_corpus_saving_path(config)
            attack_corpus.save(attack_corpus_save_path)
            print('Results after attacking have been saved to {}'.format(attack_corpus_save_path))
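
Example #1 registers a backward hook on the character embedding so that gradients are available while the attack runs. The snippet below is a minimal, hypothetical sketch of that idea using a tensor-level hook; the names save_grad and captured are placeholders, not the repository's char_embed_backward_hook.

import torch
import torch.nn as nn

# hypothetical sketch, not the repository's hook: capture the gradient flowing
# into the embedded sequence, which gradient-based attacks typically use to
# rank candidate character substitutions
embed = nn.Embedding(100, 16)
captured = {}

def save_grad(grad):
    # grad is d(loss)/d(embedded sequence)
    captured['embed_grad'] = grad.detach().clone()

tokens = torch.tensor([[3, 7, 42]])
embedded = embed(tokens)
embedded.register_hook(save_grad)    # tensor-level hook, fires during backward
loss = embedded.sum()                # stand-in for the parser loss
loss.backward()
print(captured['embed_grad'].shape)  # torch.Size([1, 3, 16])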
Example #2
    def __call__(self, config):
        random.seed(1)
        np.random.seed(1)
        torch.manual_seed(1)

        self.init_config(config)

        corpus, loader = self.pre_attack(config)
        self.parser.eval()

        attack_corpus = Corpus([])

        # attack seq generator
        self.attack_seq_generator = self.get_attack_seq_generator(config)
        #self.attack_index = AttackIndexUnkReplacement(config, self.vocab, self.parser)
        with torch.no_grad():
            self.attack(config, loader, corpus, attack_corpus)

        if config.save_result_to_file:
            # corpus_save_path = '{}/{}'.format(config.result_path,'origin.conllx')
            # corpus.save(corpus_save_path)
            # print('Result before attacking has saved in {}'.format(corpus_save_path))
            attack_corpus_save_path = self.get_attack_corpus_saving_path(
                config)
            attack_corpus.save(attack_corpus_save_path)
            print('Results after attacking have been saved to {}'.format(
                attack_corpus_save_path))
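
Example #2 differs from Example #1 mainly in that the parser is put into eval mode and the whole attack runs under torch.no_grad(), so no gradients are tracked. A small hypothetical helper bundling this setup (seeded_inference is an assumed name, not a repository function) could look like this:

import random

import numpy as np
import torch

# hypothetical helper: seed every RNG in use, switch the model to eval mode,
# and return a context manager that disables gradient tracking
def seeded_inference(model, seed=1):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    model.eval()              # freeze dropout and batch-norm statistics
    return torch.no_grad()    # gradient-free context for the attack loop

# usage, assuming a parser and attack loop as in Example #2:
# with seeded_inference(parser):
#     attack(config, loader, corpus, attack_corpus)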
Example #3
    def __call__(self, config):
        random.seed(1)
        np.random.seed(1)
        torch.manual_seed(1)

        corpus, loader = self.pre_attack(config)
        # for saving
        attack_corpus = Corpus([])

        # attack seq generator
        self.attack_seq_generator = self.get_attack_seq_generator(config)
        self.attack(loader, config, attack_corpus)

        # save to file
        if config.save_result_to_file:
            attack_corpus_save_path = self.get_attack_corpus_saving_path(config)
            attack_corpus.save(attack_corpus_save_path)
            print('Results after attacking have been saved to {}'.format(attack_corpus_save_path))
Example #4
    def __call__(self, config):
        self.vocab = torch.load(config.vocab)
        self.parser = load_parser(fetch_best_ckpt_name(config.parser_model))
        self.task = ParserTask(self.vocab, self.parser)
        # load training data
        corpus = Corpus.load(config.ftrain)
        dataset = TextDataset(self.vocab.numericalize(corpus, training=True))
        loader = DataLoader(dataset=dataset, collate_fn=collate_fn)
        augmentation_corpus = Corpus([])
        training_data_number = len(corpus.sentences)
        self.attack_seq_generator = self.get_attack_seq_generator(config)

        # draw one random probability per training sentence to decide whether to augment it;
        # if prob[index] < augmentation_rate, the sentence is augmented
        prob = np.random.uniform(0.0, 1.0, size=(training_data_number,))
        for index, (seq_idx, tag_idx, chars, arcs, rels) in enumerate(loader):
            # every original sentence is kept; a randomly chosen subset also
            # receives an augmented copy below
            sentence = corpus.sentences[index]
            augmentation_corpus.sentences.append(sentence)
            if index % 1000 == 0:
                print("{} sentences have been processed!".format(index))

            if prob[index] < config.augmentation_rate:
                seqs = self.get_seqs_name(seq_idx)
                tags = self.get_tags_name(tag_idx)
                mask = self.get_mask(seq_idx, self.vocab.pad_index, punct_list=self.vocab.puncts)
                raw_metric = self.task.evaluate([(seq_idx, tag_idx, chars, arcs, rels)], mst=config.mst)
                augmentation_seq, _, _, _, _, _ = self.attack_seq_generator.generate_attack_seq(
                    ' '.join(seqs[1:]), seq_idx, tags, tag_idx, chars, arcs, rels, mask, raw_metric)
                augmentation_corpus.sentences.append(init_sentence(
                    sentence.FORM, tuple(augmentation_seq[1:]), sentence.POS, sentence.HEAD, sentence.DEPREL))

        if config.input == 'char':
            saved_file = '{}/ptb_train_typo_only_substitute_{:.0f}%_{}.sd'.format(config.augmentation_dir, config.augmentation_rate*100, config.revised_rate)
        else:
            saved_file = '{}/ptb_train_{:.0f}%_{}.sd'.format(config.augmentation_dir, config.augmentation_rate*100, config.revised_rate)

        print("Complete! {} sentences have processed!".format(training_data_number))
        print("Current training data number is {}.".format(len(augmentation_corpus.sentences)))
        print("The augmentation data are saved to file {}".format(saved_file))
        augmentation_corpus.save(saved_file)
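
Example #4 augments each training sentence independently with probability config.augmentation_rate by drawing one uniform random number per sentence up front. The following self-contained sketch shows only that selection rule; augmentation_rate and sentences are placeholder values, not the repository's config.

import numpy as np

# standalone sketch of the selection rule in Example #4
np.random.seed(1)
augmentation_rate = 0.15
sentences = ['sentence {}'.format(i) for i in range(10000)]

prob = np.random.uniform(0.0, 1.0, size=(len(sentences),))
selected = [s for s, p in zip(sentences, prob) if p < augmentation_rate]

# in expectation, about augmentation_rate * len(sentences) sentences are chosen
print('{} of {} sentences marked for augmentation'.format(len(selected), len(sentences)))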