Example #1
 def read_from_content_batch(self,
                             text_a_list: List[str],
                             text_b_list: List[str] = None,
                             label_list: List[str] = None,
                             tokenizer: bool = True) -> Dataset:
     '''
     Build a prediction Dataset from a batch of raw texts.

     :param text_a_list: first sentences, one per instance
     :param text_b_list: optional second sentences for sentence-pair tasks
     :param label_list: optional gold labels; left as None for prediction
     :param tokenizer: whether to tokenize while converting instances
     :return: a Dataset built from the generated InputInstances
     '''
     if text_b_list is None:
         text_b_list = [None] * len(text_a_list)
     if label_list is None:
         label_list = [None] * len(text_a_list)
     instances = []
     for index, (text_a, text_b, label) in enumerate(
             zip(text_a_list, text_b_list, label_list)):
         instances.append(
             InputInstance(guid='predict{}'.format(index),
                           text_a=text_a,
                           text_b=text_b,
                           label=label))
     dataset = self.convert_instances_to_dataset(instances,
                                                 tokenizer=tokenizer,
                                                 use_tqdm=False)
     return dataset
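
A minimal usage sketch for read_from_content_batch; `reader` and all values
below are hypothetical, assuming the method lives on a data-reader class as
shown above:

 # Hypothetical usage: `reader` is an instance of the class that defines
 # read_from_content_batch; texts and flags are made up.
 dataset = reader.read_from_content_batch(
     text_a_list=['the film was great', 'the plot dragged on'],
     text_b_list=None,   # single-sentence task, so pairs default to None
     label_list=None,    # no gold labels at prediction time
     tokenizer=True)     # tokenize while building the Dataset
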
Example #2
 def create_instances(self, lines, set_type):
     """Creates instances for the training and dev sets."""
     instances = []
     for (i, line) in enumerate(lines):
         label, text_a, text_b = line
         instances.append(
             InputInstance(guid="{}-{}".format(set_type, i),
                           text_a=text_a,
                           text_b=text_b,
                           label=label))
     return instances
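
A sketch of the input this method expects; `processor` and the rows are
hypothetical, with each line a (label, text_a, text_b) triple as the tuple
unpacking implies:

 # Hypothetical usage: each line is a (label, text_a, text_b) triple,
 # matching the unpacking in create_instances.
 lines = [('1', 'a man plays guitar', 'a person plays an instrument'),
          ('0', 'a dog runs in the park', 'a cat sleeps indoors')]
 instances = processor.create_instances(lines, set_type='train')
 # instances[0].guid == 'train-0', instances[0].label == '1'
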
Example #3
 def read_from_content(self,
                       text_a: str,
                       text_b: str = None,
                       label: str = None,
                       tokenizer: bool = True) -> Dataset:
     instance = InputInstance(guid='predict',
                              text_a=text_a,
                              text_b=text_b,
                              label=label)
     dataset = self.convert_instances_to_dataset([instance],
                                                 tokenizer=tokenizer,
                                                 use_tqdm=False)
     return dataset
Example #4
 def create_instances(self, lines, set_type):
     """Creates instances for the training and dev sets."""
     instances = []
     for (i, line) in enumerate(lines):
         guid = "%s-%s" % (set_type, i)
         text_a, label = line
         label = "10" + label  # prepend "10" to the raw label string
         instances.append(
             InputInstance(guid=guid,
                           text_a=text_a,
                           text_b=None,
                           label=label))
     return instances
Example #5
 def perturb_batch(self,
                   instances: List[InputInstance]) -> List[InputInstance]:
     result_instances = []
     for instance in instances:
         perturb_sentences = self.augmentor.get_perturbed_batch(
             instance.perturbable_sentence().lower())
         tmp_instances = []
         for sentence in perturb_sentences:
             tmp_instances.append(
                 InputInstance.from_instance_and_perturb_sentence(
                     instance, sentence))
         result_instances.extend(tmp_instances)
     return result_instances
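
A short usage sketch for perturb_batch; `classifier`, `instance_a`, and
`instance_b` are hypothetical, and the object is assumed to carry an
`augmentor` exposing get_perturbed_batch, as the method requires:

 # Hypothetical usage: each input instance expands into one output
 # instance per perturbed sentence returned by the augmentor.
 perturbed = classifier.perturb_batch([instance_a, instance_b])
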
Example #6
def mask_instance(instance: InputInstance,
                  rate: float,
                  token: str,
                  nums: int = 1,
                  return_indexes: bool = False,
                  forbidden_indexes: List[int] = None,
                  random_probs: List[float] = None) -> List[InputInstance]:
    # Note: when return_indexes is True, this actually returns a
    # (masked_instances, masked_indexes) tuple, not just the annotated list.
    sentence = instance.perturbable_sentence()
    results = mask_sentence(sentence, rate, token, nums, return_indexes,
                            forbidden_indexes, random_probs)
    if return_indexes:
        mask_sentences_list = results[0]
    else:
        mask_sentences_list = results
    tmp_instances = [
        InputInstance.from_instance_and_perturb_sentence(instance, sent)
        for sent in mask_sentences_list
    ]
    if return_indexes:
        return tmp_instances, results[1]
    else:
        return tmp_instances
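
A usage sketch for mask_instance, assuming an InputInstance built as in the
earlier examples and a BERT-style '[MASK]' token; all values are made up:

 # Hypothetical usage: mask 30% of the tokens, producing 5 masked
 # copies plus the masked positions for each copy.
 instance = InputInstance(guid='demo',
                          text_a='the movie was surprisingly good',
                          text_b=None,
                          label='1')
 masked, indexes = mask_instance(instance, rate=0.3, token='[MASK]',
                                 nums=5, return_indexes=True)
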
Example #7
 def mask_instance_decorator(self,
                             args: ClassifierArgs,
                             instance: InputInstance,
                             numbers: int = 1,
                             return_indexes: bool = False):
     if self.forbidden_words is not None:
         forbidden_index = mask_forbidden_index(
             instance.perturbable_sentence(), self.forbidden_words)
         return mask_instance(instance, args.sparse_mask_rate,
                              self.tokenizer.mask_token, numbers,
                              return_indexes, forbidden_index)
     else:
         return mask_instance(instance, args.sparse_mask_rate,
                              self.tokenizer.mask_token, numbers,
                              return_indexes)
Example #8
    def augmentation(self, args: ClassifierArgs, **kwargs):
        self.loading_model_from_file(
            args.saving_dir, args.build_saving_file_name(description='best'))
        self.model.eval()

        train_instances, _ = self.build_data_loader(args,
                                                    'train',
                                                    tokenizer=False)
        train_dataset_len = len(train_instances.data)
        print('Training set: {} sentences.'.format(train_dataset_len))

        # drop instances with fewer than 3 tokens
        train_instances_deleted = [
            instance for instance in train_instances.data
            if instance.length() >= 3
        ]
        dataset_to_aug = np.random.choice(train_instances_deleted,
                                          size=(int(train_dataset_len *
                                                    0.5), ),
                                          replace=False)

        dataset_to_write = np.random.choice(train_instances.data,
                                            size=(int(train_dataset_len *
                                                      0.5), ),
                                            replace=False).tolist()
        attacker = self.build_attacker(args)
        attacker_log_manager = AttackLogManager()
        dataset = CustomTextAttackDataset.from_instances(
            args.dataset_name, dataset_to_aug, self.data_reader.get_labels())
        results_iterable = attacker.attack_dataset(dataset)
        aug_instances = []
        for result, instance in tqdm(zip(results_iterable, dataset_to_aug),
                                     total=len(dataset)):
            try:
                adv_sentence = result.perturbed_text()
                aug_instances.append(
                    InputInstance.from_instance_and_perturb_sentence(
                        instance, adv_sentence))
            except Exception:
                print('one error happened; dropping one instance')

        dataset_to_write.extend(aug_instances)
        self.data_reader.saving_instances(dataset_to_write, args.dataset_dir,
                                          'aug_{}'.format(args.attack_method))
        print('Writing {} sentences.'.format(len(dataset_to_write)))
        attacker_log_manager.enable_stdout()
        attacker_log_manager.log_summary()