Example #1
    def get_examples(self, filepath):
        """See base class.

        Args:
            filepath: the file of article-category pairs.
        """
        examples = []
        i = 0
        with open(filepath) as fin:
            lines = fin.read().strip().split("\n")
            for line in tqdm(lines):
                line = line.strip().split("\t")

                pos_cat = line[0]
                neg_cats = line[1:-1]
                article = line[-1]
                for neg_cat in neg_cats:
                    examples.append(
                        InputExample(guid=i,
                                     text_a=pos_cat,
                                     text_b=article,
                                     label='1'))
                    i += 1
                    examples.append(
                        InputExample(guid=i,
                                     text_a=neg_cat,
                                     text_b=article,
                                     label='0'))
                    i += 1

        return examples
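
All of these snippets construct InputExample objects. Most target the class shipped with Hugging Face transformers (transformers.data.processors.utils.InputExample), which is essentially the dataclass below; a few repos define their own variant with the same fields, so treat this as a reference sketch rather than the exact class each example imports:

    from dataclasses import dataclass
    from typing import Optional

    @dataclass
    class InputExample:
        guid: str                     # unique id for the example
        text_a: str                   # first (or only) sequence
        text_b: Optional[str] = None  # optional second sequence for pair tasks
        label: Optional[str] = None   # may be None at prediction time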
Example #2
    def get_examples(self, filepath):
        """See base class.

        Args:
            filepath: the file of the evaluation dataset.
        """
        examples = []
        i = 0
        with open(filepath) as fin:
            lines = fin.read().strip().split("\n")
            for line in tqdm(lines):
                line = line.strip().split(",", 1)
                if line[0].startswith("'") or line[0].startswith('"'):
                    line[0] = line[0][1:-1]
                label = int(line[0]) - 1
                text = " ".join(line[1][1:-1].split()[:128])
                if text.strip() == "":
                    text = "N/A"
                for cat in self.cats:
                    i += 1
                    if cat == self.cats[label]:
                        examples.append(InputExample(guid=i, text_a=cat, text_b=text, label=1))
                    else:
                        examples.append(InputExample(guid=i, text_a=cat, text_b=text, label=0))

        return examples
Example #3
    def _create_examples(self, lines_a, lines_b):

        original_examples = []
        pos_examples = []
        neg_examples = []
        for (i, (line_a, line_b)) in enumerate(zip(lines_a, lines_b)):
            if (i + 1) % 5000 == 0:
                print("create examples:{}".format(i))
            original_guid = "%s_%s_%s" % ("train", 'original', i)
            original_examples.append(
                InputExample(guid=original_guid,
                             text_a=line_a,
                             text_b=None,
                             label=1))

            pos_guid = "%s_%s_%s" % ("train", 'pos', i)
            pos_examples.append(
                InputExample(guid=pos_guid,
                             text_a=line_b,
                             text_b=None,
                             label=1))

            neg_guid = "%s_%s_%s" % ("train", 'neg', i)
            neg_line = self.get_neg_sent(line_a)
            neg_examples.append(
                InputExample(guid=neg_guid,
                             text_a=neg_line,
                             text_b=None,
                             label=1))

        return original_examples, pos_examples, neg_examples
Example #4
 def _create_examples(self, path, set_type):
     """
     Build the dataset.
     Args:
         path: path to the train, dev or test data
         set_type: marks the data split: train, dev or test
     Returns:
     """
     examples = []
     # example counter
     count = 0
     dirs = os.listdir(path)
     # Test (or prediction) data sits in a single flat folder rather than one sub-folder per label
     if set_type != 'test':
         for dir in dirs:
             files = os.listdir(os.path.join(path, dir))
             for file in tqdm(files):
                 file_content = self.docx2text(os.path.join(path, dir, file))
                 # skip meaningless documents with fewer than 5 characters
                 if len(file_content) > 5:
                     guid = "%s-%s" % (set_type, count)
                     count += 1
                     text_a = file_content
                     label = dir
                     examples.append(InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
     else:
         # set label to None
         for file in dirs:
             file_content = self.docx2text(os.path.join(path, file))
             guid = "%s-%s" % (set_type, count)
             count += 1
             text_a = file_content
             label = None
             examples.append(InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
     return examples
Example #5
    def _create_examples(self, filename, set_type):
        """Creates examples for the training, dev and test sets."""

        data = pd.read_csv(filename)
        user_review_examples = []
        item_review_examples = []
        for i in range(data.shape[0]):
            guid = "%s-%s" % (set_type, i)
            row = data.iloc[i]
            user_id = str(row["user_id"])
            item_id = str(row["item_id"])
            label = float(row["ratings"])
            rating = row["ratings"]
            user_reviews = self.user_reviews[
                user_id] if user_id in self.user_reviews else ["N/A"]
            item_reviews = self.item_reviews[
                item_id] if item_id in self.user_reviews else ["N/A"]
            random.shuffle(user_reviews)
            random.shuffle(item_reviews)
            user_reviews = " [SEP] ".join(user_reviews)
            item_reviews = " [SEP] ".join(item_reviews)
            user_review_examples.append(
                InputExample(guid=guid, text_a=user_reviews, label=label))
            item_review_examples.append(
                InputExample(guid=guid, text_a=item_reviews, label=label))

        return user_review_examples, item_review_examples
Example #6
def get_FEVER_examples(prefix, hypo_only=False):
    '''
    train_fitems.jsonl, dev_fitems.jsonl, test_fitems.jsonl
    dev_fitems.label.recovered.jsonl
    '''
    examples = []
    path = '/export/home/Dataset/para_entail_datasets/nli_FEVER/nli_fever/'
    filename = path+prefix+'_fitems.jsonl'
    if prefix == 'test' or prefix == 'dev':
        filename = path+'dev_fitems.label.recovered.jsonl'
    print('loading FEVER...', filename)
    guid_id = 0
    pos_size = 0
    with open(filename, 'r') as f:
        for line in json_lines.reader(f):
            guid_id += 1
            premise = line.get('context')
            hypothesis = line.get('query')
            # skip empty pairs before counting, so pos_size matches the examples kept
            if len(premise) == 0 or len(hypothesis) == 0:
                continue
            label = 'entailment' if line.get('label') == 'SUPPORTS' else 'not_entailment'
            if label == 'entailment':
                pos_size += 1

            if hypo_only:
                examples.append(InputExample(guid=str(guid_id), text_a=hypothesis, text_b=None, label=label))
            else:
                examples.append(InputExample(guid=str(guid_id), text_a=premise, text_b=hypothesis, label=label))
    print('FEVER size:', len(examples))
    return examples, pos_size
Example #7
def load_DocNLI(prefix, hypo_only=False):
    # use a context manager so the file handle is closed after loading
    with codecs.open('/export/home/Dataset/para_entail_datasets/'+prefix+'.json', 'r', 'utf-8') as readfile:
        data = json.load(readfile)
    examples = []
    for dic in data:
        premise = dic.get('premise')
        hypothesis = dic.get('hypothesis')
        label = dic.get('label')
        if hypo_only:
            examples.append(InputExample(guid='ex', text_a=hypothesis, text_b=None, label=label))
        else:
            examples.append(InputExample(guid='ex', text_a=premise, text_b=hypothesis, label=label))
    return examples
Example #8
def get_MCTest_examples(prefix, hypo_only=False):
    path = '/export/home/Dataset/para_entail_datasets/MCTest/'
    filename = path+prefix+'_in_entail.txt'
    print('loading MCTest...', filename)
    readfile = codecs.open(filename, 'r', 'utf-8')
    guid_id = 0
    pos_size = 0
    neg_size = 0
    examples = []
    for line in readfile:
        guid_id += 1
        parts = line.strip().split('\t')
        if len(parts) == 3:
            premise = parts[1]
            hypothesis = parts[2]
            label = 'entailment' if parts[0] == 'entailment' else 'not_entailment'
            if len(premise) == 0 or len(hypothesis) == 0:
                continue

            if label == 'entailment':
                pos_size += 1
            else:
                neg_size += 1
            if hypo_only:
                examples.append(InputExample(guid=prefix+str(guid_id), text_a=hypothesis, text_b=None, label=label))
            else:
                examples.append(InputExample(guid=prefix+str(guid_id), text_a=premise, text_b=hypothesis, label=label))
    print('MCTest size:', len(examples))
    return examples, pos_size
Example #9
    def test_mnli_dev_no_contradiction(self):
        processor = MnliNoContradictionProcessor()
        base_name = 'crosslangt.nli.dataprep_nli.MnliProcessor' \
                    '.get_dev_examples'
        examples = [
            InputExample('2', 'text c', 'text e', 'entailment'),
            InputExample('3', 'text d', 'text f', 'neutral'),
            InputExample('1', 'text a', 'text b', 'contradiction')
        ]

        expected = examples[:2]  # All but the contradiction example

        with patch(base_name, return_value=examples):
            actual = processor.get_dev_examples('/some/data/dir')
            self.assertListEqual(expected, list(actual))
Example #10
 def get_example_from_tensor_dict(self, tensor_dict):
     return InputExample(
         tensor_dict["idx"].numpy(),
         tensor_dict["sentence1"].numpy().decode("utf-8"),
         tensor_dict["sentence2"].numpy().decode("utf-8"),
         str(tensor_dict["label"].numpy()),
     )
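
A minimal usage sketch for the method above, assuming the tensor_dict values come from tensorflow_datasets with the GLUE MRPC schema ("idx", "sentence1", "sentence2", "label"); the processor instance is hypothetical:

    import tensorflow_datasets as tfds

    processor = MrpcProcessor()  # hypothetical processor exposing the method above
    data = tfds.load("glue/mrpc", split="validation")
    for tensor_dict in data.take(2):
        example = processor.get_example_from_tensor_dict(tensor_dict)
        print(example.guid, example.text_a, "->", example.label)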
Example #11
 def _create_examples(self, lines, set_type):
     examples = []
     for (i, line) in enumerate(lines):
         guid = "%s-%s" % (set_type, i)
         examples.append(
             InputExample(guid=guid, text_a=line, text_b=None, label=1))
     return examples
Example #12
 def __create_examples(self, pmid, text_li, genes, relations):
     examples = []
     text_li_ori = text_li
     guids = set()
     for g1 in genes:
         for g2 in genes:
             guid = f"{pmid}_{g1}_{g2}"
             if self.testData and f"{pmid}_{g2}_{g1}" in guids:
                 continue
             text_li = text_li_ori.copy()
             if (g1, g2) in relations or (g2, g1) in relations:
                 label = "1"
                 self.label_1_count += 1
             else:
                 label = "0"
                 self.label_0_count += 1
             g1_l = "Gene_S" if g1 == g2 else "Gene_A"
             g2_l = "Gene_S" if g1 == g2 else "Gene_B"
             for i, word in enumerate(text_li):
                 if word[:5] == "Gene_":
                     if word[5:] == g1:
                         text_li[i] = g1_l
                     elif word[5:] == g2:
                         text_li[i] = g2_l
                     else:
                         text_li[i] = "Gene_N"
             text_a = " ".join(text_li)
             if self.testData:
                 guids.add(guid)
             examples.append(
                 InputExample(guid=guid,
                              text_a=text_a,
                              text_b=None,
                              label=label))
     return examples
Example #13
def predict(context, replies, tokenizer, model, label_list, args):
    model.eval()

    results = []
    for index, reply in enumerate(replies):
        example = InputExample(guid=0, text_a=context, text_b=[reply])
        feature = convert_single_example_to_features(example, tokenizer, max_length=512,
                                      pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
                                      pad_token_segment_id=0,
                                      mask_padding_with_zero=True)

        all_input_ids = torch.tensor([feature.input_ids], dtype=torch.long).to(args.device)
        all_attention_mask = torch.tensor([feature.attention_mask], dtype=torch.long).to(args.device)
        all_token_type_ids = torch.tensor([feature.token_type_ids], dtype=torch.long).to(args.device)

        inputs = {'input_ids': all_input_ids, 'attention_mask': all_attention_mask, 'token_type_ids': all_token_type_ids}

        outputs = model(**inputs)

        logits = outputs[0]
        # no label is passed, so the model returns logits only;
        # take the argmax as the predicted class for this reply
        pred = np.argmax(logits.detach().cpu().numpy())
        results.append(pred)

    return results
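
A hedged call sketch for predict(); the model/tokenizer pair and the SimpleNamespace standing in for args are assumptions, not part of the original code:

    from types import SimpleNamespace
    import torch

    args = SimpleNamespace(device=torch.device("cuda" if torch.cuda.is_available() else "cpu"))
    context = "Hi, how are you?"
    replies = ["Fine, thanks!", "Bananas are yellow."]
    preds = predict(context, replies, tokenizer, model.to(args.device), label_list=None, args=args)
    print(preds)  # one predicted class index per candidate reply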
Example #14
    def load_and_cache_examples(self, x1, x2, task, tokenizer):
        processor = processors[task]()
        output_mode = output_modes[task]
        # Load data features from cache or dataset file
        label_list = processor.get_labels()
        examples = []
        for t1, t2 in zip(x1, x2):
            guid = "%s-%s" % ('dev_matched', t1)
            examples.append(InputExample(guid=guid, text_a=t1, text_b=t2, label='negative'))

        features = convert_examples_to_features(examples,
                                                tokenizer,
                                                label_list=label_list,
                                                max_length=128,
                                                output_mode=output_mode,
                                                pad_on_left=False,  # set True to pad on the left (e.g. for XLNet)
                                                pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
                                                pad_token_segment_id=0,
                                                )

        # Convert to Tensors and build dataset
        all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
        all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
        all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
        if output_mode == "classification":
            all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
        elif output_mode == "regression":
            all_labels = torch.tensor([f.label for f in features], dtype=torch.float)

        dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels)
        return dataset
Example #15
 def _create_examples(self, lines, set_type, do_text_b):
     """Creates examples for the training and dev sets."""
     examples = []
     for (i, line) in enumerate(lines):
         # if i == 0:
         #     continue
         guid = "%s-%s" % (set_type, i)
         try:
             text_a = line[0]
             label = line[-1]
         except IndexError:
             # report and skip malformed lines instead of falling through
             print(i, line)
             continue
         if do_text_b:
             text_b = "Climate change and global warming are serious concerns."
         else:
             text_b = None
         examples.append(
             InputExample(guid=guid,
                          text_a=text_a,
                          text_b=text_b,
                          label=label))
     return examples
Example #16
 def get_valid_examples(self, data_dir):
     lg = self.language if self.train_language is None else self.train_language
     lines = self._read_tsv(
         os.path.join(
             data_dir,
             "XNLI-MT-1.0/multinli/multinli.valid.{}.tsv".format(lg)))
     examples = []
     for (i, line) in enumerate(lines):
         if i == 0:
             continue
         guid = "%s-%s" % ("valid", i)
         text_a = line[0]
         text_b = line[1]
         label = "contradiction" if line[2] == "contradictory" else line[2]
         assert isinstance(
             text_a, str), f"Validation input {text_a} is not a string"
         assert isinstance(
             text_b, str), f"Validation input {text_b} is not a string"
         assert isinstance(label,
                           str), f"Validation label {label} is not a string"
         examples.append(
             InputExample(guid=guid,
                          text_a=text_a,
                          text_b=text_b,
                          label=label))
     return examples
Example #17
 def _create_examples(self, lines, set_type):
     """Creates examples for the training and dev sets."""
     examples = []
     for (i, line) in enumerate(lines):
         if i != 0:
             guid = "%s-%s-%s" % (str(i), set_type, line[0])
             text_a = line[1]
             # map each entity placeholder to its configured pattern
             for placeholder, pattern in [
                     ('@problem$', self.problem_pattern),
                     ('@treatment$', self.treatment_pattern),
                     ('@test$', self.test_pattern),
                     ('@problem-problem$', self.problem_problem_pattern),
                     ('@test-problem$', self.test_problem_pattern),
                     ('@test-test$', self.test_test_pattern),
                     ('@treatment-test$', self.treatment_test_pattern),
                     ('@treatment-treatment$', self.treatment_treatment_pattern),
             ]:
                 text_a = text_a.replace(placeholder, pattern)
             label = line[2]
             assert label in self.get_labels()
             examples.append(
                 InputExample(guid=guid,
                              text_a=text_a,
                              text_b=None,
                              label=label))
     return examples
Example #18
 def get_train_examples(self, data_dir):
     """See base class."""
     train_data = self.read_data(join(data_dir, 'snli_1.0_train.jsonl'))
     dev_data = self.read_data(join(data_dir, 'snli_1.0_dev.jsonl'))
     lines = train_data + dev_data
     lines = [x for x in lines if (x['gold_label'] != '-')]
     examples = []
     for (i, line) in enumerate(lines):
         # entries are parsed JSON objects, so there is no header row to skip
         guid = "%s-%s" % ('train', i)
         text_a = line['sentence1']
         text_b = line['sentence2']
         # label = "contradiction" if line[2] == "contradictory" else line[2]
         label = line['gold_label']
         assert isinstance(text_a, str) and isinstance(
             text_b, str) and isinstance(label, str)
         examples.append(
             InputExample(guid=guid,
                          text_a=text_a,
                          text_b=text_b,
                          label=label))
     return examples
Example #19
 def get_example_from_tensor_dict(self, tensor_dict):
     return InputExample(
         tensor_dict['idx'].numpy(),
         tensor_dict['sentence1'].numpy().decode('utf-8'),
         tensor_dict['sentence2'].numpy().decode('utf-8'),
         str(tensor_dict['label'].numpy()),
     )
Example #20
    def create_examples(self, df, set_type):
        """Creates examples for the training and dev sets."""
        examples = []
        for idx, row in df.iterrows():
            guid = "%s-%s" % (set_type, idx)

            default_input_columns = ['query', 'text']
            default_output_columns = ['label']

            input_columns = self.configs.get('input', default_input_columns)
            output_columns = self.configs.get('output', default_output_columns)

            try:
                text_a, text_b = row[input_columns[0]], None
                if len(input_columns) > 1:
                    text_b = row[input_columns[1]]

                label = row[output_columns[0]]

            except KeyError:
                print('No corresponding columns found for config keys {}'.format(input_columns))
                continue

            examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))

        return examples
Example #21
 def load_and_cache_examples(candidates, tokenizer):
     max_length = 128
     examples = [
         InputExample(guid=str(i), text_a=x)
         for i, x in enumerate(candidates)
     ]
     features = glue_convert_examples_to_features(
         examples,
         tokenizer,
         label_list=["0", "1"],
         max_length=max_length,
         output_mode="classification")
     # Convert to Tensors and build dataset
     all_input_ids = torch.tensor([f.input_ids for f in features],
                                  dtype=torch.long)
     all_attention_mask = torch.tensor(
         [f.attention_mask for f in features], dtype=torch.long)
     all_labels = torch.tensor([0 for f in features], dtype=torch.long)
      all_token_type_ids = torch.tensor([[0] * max_length
                                         for f in features],
                                        dtype=torch.long)
     dataset = torch.utils.data.TensorDataset(all_input_ids,
                                              all_attention_mask,
                                              all_token_type_ids,
                                              all_labels)
     return dataset
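
To run inference over the returned TensorDataset, one would typically wrap it in a DataLoader; a minimal sketch, assuming model and tokenizer are a matching transformers pair:

    import torch
    from torch.utils.data import DataLoader

    dataset = load_and_cache_examples(["first candidate", "second candidate"], tokenizer)
    loader = DataLoader(dataset, batch_size=32)
    model.eval()
    with torch.no_grad():
        for input_ids, attention_mask, token_type_ids, labels in loader:
            logits = model(input_ids=input_ids,
                           attention_mask=attention_mask,
                           token_type_ids=token_type_ids)[0]
            scores = logits.softmax(dim=-1)[:, 1]  # probability of class "1"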
Example #22
def load_and_cache_examples(args, task, tokenizer, evaluate=False):
    if args.local_rank not in [-1, 0] and not evaluate:
        # make sure only the first process in distributed training processes
        # the dataset; the others will use the cache
        torch.distributed.barrier()

    cached_features_file = '/home/ray/transformers/trec/cacha_{}'.format(
        'test' if evaluate else 'train')
    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        logger.info("Loading features from cached file %s",
                    cached_features_file)
        features = torch.load(cached_features_file)
    else:
        train_dataset = trec_dataset('/home/ray/transformers/trec',
                                     train=not evaluate,
                                     test=evaluate)
        examples = []
        for i, inst in enumerate(train_dataset):
            examples.append(
                InputExample(guid='', text_a=inst['text'],
                             label=inst['label']))

        features = convert_examples_to_features(
            examples,
            tokenizer,
            label_list=['ABBR', 'DESC', 'ENTY', 'HUM', 'LOC', 'NUM'],
            max_length=args.max_seq_length,
            output_mode="classification",
            pad_on_left=False,  # set True to pad on the left (e.g. for XLNet)
            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token
                                                       ])[0],
            pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0,
        )
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s",
                        cached_features_file)
            torch.save(features, cached_features_file)

    if args.local_rank == 0 and not evaluate:
        # make sure only the first process in distributed training processes
        # the dataset; the others will use the cache
        torch.distributed.barrier()

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features],
                                 dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features],
                                      dtype=torch.long)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features],
                                      dtype=torch.long)
    output_mode = "classification"
    if output_mode == "classification":
        all_labels = torch.tensor([f.label for f in features],
                                  dtype=torch.long)
    elif output_mode == "regression":
        all_labels = torch.tensor([f.label for f in features],
                                  dtype=torch.float)

    dataset = TensorDataset(all_input_ids, all_attention_mask,
                            all_token_type_ids, all_labels)
    return dataset
Example #23
 def get_example_from_tensor_dict(self, tensor_dict):
     """See base class."""
     return InputExample(
         tensor_dict["idx"].numpy(),
         tensor_dict["question"].numpy().decode("utf-8"),
         tensor_dict["sentence"].numpy().decode("utf-8"),
         str(tensor_dict["label"].numpy()),
     )
Example #24
 def get_example_from_tensor_dict(self, tensor_dict):
     """See base class."""
     return InputExample(
         tensor_dict["idx"].numpy(),
         tensor_dict["premise"].numpy().decode("utf-8"),
         tensor_dict["hypothesis"].numpy().decode("utf-8"),
         str(tensor_dict["label"].numpy()),
     )
Example #25
 def _create_pos_examples(self, lines_a, lines_b, set_type):
     """Creates examples for the training and dev sets."""
     examples = []
     for (i, (line_a, line_b)) in enumerate(zip(lines_a, lines_b)):
         guid = "%s-%s" % (set_type, i)
         examples.append(
             InputExample(guid=guid, text_a=line_a, text_b=line_b, label=1))
     return examples
Example #26
 def _create_examples(self, lines, set_type):
     """Creates examples for the training and dev sets."""
     examples = []
     for (i, line) in enumerate(lines):
         guid = "%s-%s" % (set_type, i)
         examples.append(
             InputExample(guid=guid, text_a=line, text_b=None, label=1))
     return examples
Example #27
def create_input_feature(tokenizer, output_mode, example, max_length,
                         mask_padding_with_zero, pad_on_left, pad_token,
                         pad_token_segment_id, label_map):
    example = InputExample(
        example['id'], example['sentence1'],
        example['sentence2'] if 'sentence2' in example else None,
        example['label'])

    inputs = tokenizer.encode_plus(
        example.text_a,
        example.text_b,
        add_special_tokens=True,
        max_length=max_length,
        truncation_strategy='only_first'  # truncate the first sequence in priority
    )
    input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]

    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

    # Zero-pad up to the sequence length.
    padding_length = max_length - len(input_ids)
    if pad_on_left:
        input_ids = ([pad_token] * padding_length) + input_ids
        attention_mask = ([0 if mask_padding_with_zero else 1] *
                          padding_length) + attention_mask
        token_type_ids = ([pad_token_segment_id] *
                          padding_length) + token_type_ids
    else:
        input_ids = input_ids + ([pad_token] * padding_length)
        attention_mask = attention_mask + (
            [0 if mask_padding_with_zero else 1] * padding_length)
        token_type_ids = token_type_ids + ([pad_token_segment_id] *
                                           padding_length)

    assert len(input_ids) == max_length, \
        "Error with input length {} vs {}".format(len(input_ids), max_length)
    assert len(attention_mask) == max_length, \
        "Error with input length {} vs {}".format(len(attention_mask), max_length)
    assert len(token_type_ids) == max_length, \
        "Error with input length {} vs {}".format(len(token_type_ids), max_length)

    if output_mode == "classification":
        label = label_map[example.label]
    elif output_mode == "regression":
        label = float(example.label)
    else:
        raise KeyError(output_mode)

    return InputFeatures(input_ids=input_ids,
                         attention_mask=attention_mask,
                         token_type_ids=token_type_ids,
                         label=label)
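
A hedged invocation of the helper above; the raw dict keys match what the function itself reads, the tokenizer is assumed to be a transformers tokenizer, and the padding values mirror the conventions used elsewhere on this page:

    raw = {"id": "42", "sentence1": "A man is eating.",
           "sentence2": "Someone eats.", "label": "entailment"}
    feature = create_input_feature(
        tokenizer,
        output_mode="classification",
        example=raw,
        max_length=128,
        mask_padding_with_zero=True,
        pad_on_left=False,
        pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
        pad_token_segment_id=0,
        label_map={"entailment": 0, "not_entailment": 1},
    )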
Example #28
def get_ANLI_examples(prefix, hypo_only=False):
    folders = ['R1', 'R2', 'R3']
    examples = []
    guid_id = 0
    pos_size = 0
    neg_size = 0
    path = '/export/home/Dataset/para_entail_datasets/ANLI/anli_v0.1/'
    for folder in folders:
        filename = path+folder+'/'+prefix+'.jsonl'
        print('loading ANLI...', filename)
        with open(filename, 'r') as f:
            for line in json_lines.reader(f):
                guid_id += 1
                premise = line.get('context')
                hypothesis = line.get('hypothesis')
                label = 'entailment' if line.get('label') == 'e' else 'not_entailment'
                if len(premise) == 0 or len(hypothesis) == 0:
                    continue
                if label == 'entailment':
                    pos_size += 1
                else:
                    neg_size += 1
                if hypo_only:
                    examples.append(InputExample(guid=str(guid_id), text_a=hypothesis, text_b=None, label=label))
                else:
                    examples.append(InputExample(guid=str(guid_id), text_a=premise, text_b=hypothesis, label=label))
    print('>>pos:neg: ', pos_size, neg_size)
    print('ANLI size:', len(examples))
    return examples, pos_size
Example #29
def deal_with_block(block_line_list, filter_label_set, hypo_only=False):
    examples = []
    premise = ''

    if not block_line_list[0].startswith('document>>'):
        return [], 0, 0
    first_line_parts = block_line_list[0].strip().split('\t')
    # premise = first_line_parts[1].strip()
    premise = first_line_parts[2].strip()
    if len(premise) == 0:
        return [], 0, 0

    pos_hypo_list = []
    neg_hypo_list = []
    for line in block_line_list[1:]:
        if len(line.strip()) > 0:
            parts = line.strip().split('\t')
            if len(parts) == 3:
                filter_label = parts[1].strip()
                if parts[0] == 'positive>>':
                    pos_hypo = parts[2].strip()  # harsh version
                    if filter_label not in filter_label_set and len(pos_hypo) > 0:
                        pos_hypo_list.append(pos_hypo)
                elif parts[0] == 'negative>>':
                    neg_hypo = parts[2].strip()
                    # no negative summary needs to be filtered in train, dev, or test
                    if len(neg_hypo) > 0:
                        neg_hypo_list.append(neg_hypo)

    for pos_hypo in pos_hypo_list:
        if hypo_only:
            examples.append(InputExample(guid='ex', text_a=pos_hypo, text_b=None, label='entailment'))
        else:
            examples.append(InputExample(guid='ex', text_a=premise, text_b=pos_hypo, label='entailment'))

    for neg_hypo in neg_hypo_list:
        if hypo_only:
            examples.append(InputExample(guid='ex', text_a=neg_hypo, text_b=None, label='not_entailment'))
        else:
            examples.append(InputExample(guid='ex', text_a=premise, text_b=neg_hypo, label='not_entailment'))

    return examples, len(pos_hypo_list), len(neg_hypo_list)
Example #30
 def _create_examples(self, lines, set_type, LABELS=False):
     """Creates examples for the training and dev sets."""
     examples = []
     for (i, line) in enumerate(lines):
         guid = "%s-%s" % (set_type, str(i))
         text_q = line[0]  # question
         text_r = line[2]  # response
         if LABELS:
             label = int(line[-1])  # label
             label = "1" if label > 0 else "0"
             examples.append(
                 InputExample(guid=guid,
                              text_a=text_q,
                              text_b=text_r,
                              label=label))
         else:
             examples.append(
                 InputExample(guid=guid, text_a=text_q, text_b=text_r))
     return examples