Example #1
0
    def _create_examples(path: str, set_type: str) -> List[InputExample]:
        """Load premise/choice examples from a JSON-lines file.

        Each line must be a JSON object with the keys ``idx``, ``premise``,
        ``choice1``, ``choice2`` and ``question``; ``label`` is optional and is
        converted to ``str`` when present. For the ``train`` and ``unlabeled``
        sets every example is additionally mirrored: a copy with both choices
        swapped and the label flipped is appended after the originals.

        :param path: path to the JSON-lines file
        :param set_type: name of the data split, used as guid prefix
        :return: the parsed examples, followed by their mirrored copies (if any)
        """
        examples = []

        with open(path, encoding='utf8') as f:
            for raw_line in f:
                record = json.loads(raw_line)
                label = None
                if 'label' in record:
                    label = str(record['label'])
                idx = record['idx']
                premise = record['premise']
                meta = {key: record[key] for key in ('choice1', 'choice2', 'question')}
                examples.append(InputExample(guid="%s-%s" % (set_type, idx), text_a=premise, label=label,
                                             meta=meta, idx=idx))

        if set_type in ('train', 'unlabeled'):
            # Mirrored copies get the guid suffix 'm' and no idx; every label
            # other than "0" (including a missing one) is mapped to "0".
            mirror_examples = [
                InputExample(
                    guid=original.guid + 'm',
                    text_a=original.text_a,
                    label="1" if original.label == "0" else "0",
                    meta={
                        'choice1': original.meta['choice2'],
                        'choice2': original.meta['choice1'],
                        'question': original.meta['question'],
                    },
                )
                for original in examples
            ]
            examples += mirror_examples
            logger.info(f"Added {len(mirror_examples)} mirror examples, total size is {len(examples)}...")
        return examples
Example #2
0
    def _create_examples(self,
                         path: str,
                         set_type: str,
                         hypothesis_name: str = "hypothesis",
                         premise_name: str = "premise") -> List[InputExample]:
        """Load premise/hypothesis pairs from a JSON-lines file.

        :param path: path to the JSON-lines file
        :param set_type: name of the data split, used as guid prefix
        :param hypothesis_name: JSON key holding the second text (``text_b``)
        :param premise_name: JSON key holding the first text (``text_a``)
        :return: one example per line; ``label`` is ``None`` when the line has
            no ``label`` key
        """
        examples = []

        with open(path, encoding='utf8') as f:
            for line_no, raw_line in enumerate(f):
                record = json.loads(raw_line)

                idx = record['idx']
                if isinstance(idx, str):
                    # String ids are converted to int; ids that are not
                    # numeric are replaced by the zero-based line number.
                    try:
                        idx = int(idx)
                    except ValueError:
                        idx = line_no

                label = record.get('label')
                premise = record[premise_name]
                hypothesis = record[hypothesis_name]

                examples.append(
                    InputExample(guid=f"{set_type}-{idx}", text_a=premise, text_b=hypothesis, label=label, idx=idx))

        return examples
Example #3
0
    def _create_examples(path: str, set_type: str) -> List[InputExample]:
        """Load one example per (passage, question, answer) triple from a JSON-lines file.

        Each line holds a passage with nested questions, and each question
        holds nested answers. The passage text becomes ``text_a``, the question
        ``text_b`` and the answer text is stored in ``meta['answer']``. An
        answer's ``label`` is optional and converted to ``str`` when present.

        :param path: path to the JSON-lines file
        :param set_type: name of the data split, used as guid prefix
        :return: a flat list with one example per answer
        """
        examples = []

        with open(path, encoding='utf8') as f:
            for raw_line in f:
                record = json.loads(raw_line)

                passage_idx = record['idx']
                passage_text = record['passage']['text']
                for question_record in record['passage']['questions']:
                    question_text = question_record["question"]
                    question_idx = question_record['idx']
                    for answer_record in question_record["answers"]:
                        label = None
                        if 'label' in answer_record:
                            label = str(answer_record["label"])
                        answer_idx = answer_record["idx"]
                        examples.append(InputExample(
                            guid=f'{set_type}-p{passage_idx}-q{question_idx}-a{answer_idx}',
                            text_a=passage_text,
                            text_b=question_text,
                            label=label,
                            meta={
                                'passage_idx': passage_idx,
                                'question_idx': question_idx,
                                'answer_idx': answer_idx,
                                'answer': answer_record["text"],
                            },
                            idx=[passage_idx, question_idx, answer_idx],
                        ))

        # Summary statistics for the log: distinct question indices and label counts.
        question_indices = {ex.meta['question_idx'] for ex in examples}
        label_distribution = Counter(ex.label for ex in examples)
        logger.info(f"Returning {len(examples)} examples corresponding to {len(question_indices)} questions with label "
                    f"distribution {list(label_distribution.items())}")
        return examples
Example #4
0
    def _create_examples(path: str, set_type: str) -> List[InputExample]:
        """Load examples with two annotated spans from a JSON-lines file and repair their word indices.

        Each line must be a JSON object with the keys ``idx``, ``text`` and
        ``target`` (holding ``span1_text``, ``span2_text``, ``span1_index`` and
        ``span2_index``); ``label`` is optional and converted to ``str`` when
        present. The span indices are positions in the whitespace-split text.

        :param path: path to the JSON-lines file
        :param set_type: name of the data split, used as guid prefix; for
            ``'train'`` only examples whose label is ``'True'`` are kept
        :return: the list of examples with corrected ``span1_index`` /
            ``span2_index`` in ``meta`` and re-joined ``text_a``
        :raises AssertionError: if the second span cannot be located at its
            (corrected) index
        """
        examples = []

        with open(path, encoding='utf8') as f:
            for line in f:
                example_json = json.loads(line)
                idx = example_json['idx']
                label = str(example_json['label']) if 'label' in example_json else None
                guid = "%s-%s" % (set_type, idx)
                text_a = example_json['text']
                meta = {
                    'span1_text': example_json['target']['span1_text'],
                    'span2_text': example_json['target']['span2_text'],
                    'span1_index': example_json['target']['span1_index'],
                    'span2_index': example_json['target']['span2_index']
                }

                # the indices in the dataset are wrong for some examples, so we manually fix them
                span1_index, span1_text = meta['span1_index'], meta['span1_text']
                span2_index, span2_text = meta['span2_index'], meta['span2_text']
                words_a = text_a.split()
                words_a_lower = text_a.lower().split()
                words_span1_text = span1_text.lower().split()
                span1_len = len(words_span1_text)

                # span 1 is compared case-insensitively and may cover several words;
                # if it is not found at the given index, the neighbouring positions are tried
                if words_a_lower[span1_index:span1_index + span1_len] != words_span1_text:
                    for offset in [-1, +1]:
                        if words_a_lower[span1_index + offset:span1_index + span1_len + offset] == words_span1_text:
                            span1_index += offset

                # a span 1 that still does not match is only reported, not treated as an error
                if words_a_lower[span1_index:span1_index + span1_len] != words_span1_text:
                    logger.warning(f"Got '{words_a_lower[span1_index:span1_index + span1_len]}' but expected "
                                   f"'{words_span1_text}' at index {span1_index} for '{words_a}'")

                # span 2 is compared case-sensitively against a single word
                if words_a[span2_index] != span2_text:
                    for offset in [-1, +1]:
                        if words_a[span2_index + offset] == span2_text:
                            span2_index += offset

                    # if the word merely starts with the span text, split it into the span text and the
                    # remainder so that the span becomes a word of its own
                    if words_a[span2_index] != span2_text and words_a[span2_index].startswith(span2_text):
                        words_a = words_a[:span2_index] \
                                  + [words_a[span2_index][:len(span2_text)], words_a[span2_index][len(span2_text):]] \
                                  + words_a[span2_index + 1:]

                # unlike span 1, a span 2 mismatch aborts processing
                assert words_a[span2_index] == span2_text, \
                    f"Got '{words_a[span2_index]}' but expected '{span2_text}' at index {span2_index} for '{words_a}'"

                # the text is rebuilt from the (possibly split) words and the corrected indices are stored
                text_a = ' '.join(words_a)
                meta['span1_index'], meta['span2_index'] = span1_index, span2_index

                example = InputExample(guid=guid, text_a=text_a, label=label, meta=meta, idx=idx)
                # training examples are dropped unless their label is the string 'True'
                if set_type == 'train' and label != 'True':
                    continue
                examples.append(example)

        return examples
Example #5
0
    def _create_examples(path: str, set_type: str) -> List[InputExample]:
        """Read premise/choice examples from a JSON-lines file and mirror them where applicable.

        Every line is a JSON object providing ``idx``, ``premise``,
        ``choice1``, ``choice2`` and ``question``; an optional ``label`` is
        stored as a string. When ``set_type`` is ``"train"`` or
        ``"unlabeled"``, a mirrored copy of each example (choices swapped,
        label flipped, guid suffixed with ``m``) is appended.

        :param path: path to the JSON-lines file
        :param set_type: name of the data split, used as guid prefix
        :return: the parsed examples plus any mirrored copies
        """

        def _mirror(source):
            # Swap both choices; a label of "0" becomes "1", anything else
            # (including None) becomes "0". The copy carries no idx.
            flipped = "1" if source.label == "0" else "0"
            swapped_meta = {
                "choice1": source.meta["choice2"],
                "choice2": source.meta["choice1"],
                "question": source.meta["question"],
            }
            return InputExample(guid=source.guid + "m",
                                text_a=source.text_a,
                                label=flipped,
                                meta=swapped_meta)

        examples = []

        with open(path, encoding="utf8") as f:
            for raw_line in f:
                record = json.loads(raw_line)
                if "label" in record:
                    label = str(record["label"])
                else:
                    label = None
                idx = record["idx"]
                premise = record["premise"]
                choices = {
                    "choice1": record["choice1"],
                    "choice2": record["choice2"],
                    "question": record["question"],
                }
                examples.append(
                    InputExample(guid="%s-%s" % (set_type, idx),
                                 text_a=premise,
                                 label=label,
                                 meta=choices,
                                 idx=idx))

        if set_type not in ("train", "unlabeled"):
            return examples

        mirror_examples = [_mirror(ex) for ex in examples]
        examples += mirror_examples
        logger.info(
            f"Added {len(mirror_examples)} mirror examples, total size is {len(examples)}..."
        )
        return examples
Example #6
0
    def _create_examples(path: str, set_type: str) -> List[InputExample]:
        """Read single-text examples from a two-column CSV file.

        Every row must consist of exactly two fields, the label followed by
        the text body. In the body, each escaped newline (a backslash followed
        by ``n``) and each remaining backslash is replaced by a space.

        :param path: path to the comma-separated file
        :param set_type: name of the data split, used as guid prefix together
            with the zero-based row number
        :return: one example per row
        :raises ValueError: if a row does not have exactly two fields
        """
        examples = []

        # The encoding is pinned so that parsing does not depend on the
        # platform default, and newline='' is what the csv module requires
        # to handle newlines embedded in quoted fields correctly.
        with open(path, encoding='utf8', newline='') as f:
            reader = csv.reader(f, delimiter=',')
            for idx, row in enumerate(reader):
                label, body = row
                guid = "%s-%s" % (set_type, idx)
                text_a = body.replace('\\n', ' ').replace('\\', ' ')

                example = InputExample(guid=guid, text_a=text_a, label=label)
                examples.append(example)

        return examples
Example #7
0
    def _create_examples(path: str, set_type: str) -> List[InputExample]:
        """Load passage/question pairs from a JSON-lines file.

        :param path: path to the JSON-lines file; every line needs the keys
            ``idx``, ``passage`` and ``question``, ``label`` is optional
        :param set_type: name of the data split, used as guid prefix
        :return: one example per line, with the label converted to ``str``
            (or ``None`` when absent)
        """

        def _to_example(record):
            idx = record['idx']
            label = None
            if 'label' in record:
                label = str(record['label'])
            return InputExample(guid="%s-%s" % (set_type, idx),
                                text_a=record['passage'],
                                text_b=record['question'],
                                label=label,
                                idx=idx)

        with open(path, encoding='utf8') as f:
            return [_to_example(json.loads(raw_line)) for raw_line in f]
Example #8
0
    def _create_examples(lines: List[List[str]], set_type: str) -> List[InputExample]:
        """Convert pre-split rows into text-pair examples.

        The first entry of ``lines`` is skipped. For every other row, column 0
        provides the id for the guid, columns 8 and 9 the two texts and the
        last column the label.

        :param lines: the rows, each a list of column values
        :param set_type: name of the data split, used as guid prefix
        :return: one example per row after the first
        """
        rows = iter(lines)
        next(rows, None)  # drop the first entry

        return [
            InputExample(guid="%s-%s" % (set_type, row[0]), text_a=row[8], text_b=row[9], label=row[-1])
            for row in rows
        ]
Example #9
0
    def _create_examples(self, path, set_type, max_examples=-1, skip_first=0):
        """Creates examples for the training and dev sets.

        Every line of the file is one example whose columns are separated by
        the literal string ``:->``. The label and text columns are selected
        through ``MyTaskDataProcessor.LABEL_COLUMN``, ``TEXT_A_COLUMN`` and
        ``TEXT_B_COLUMN``; a negative ``TEXT_B_COLUMN`` means there is no
        second text.

        :param path: path to the data file
        :param set_type: name of the data split, used as guid prefix together
            with the zero-based line number
        :param max_examples: accepted for interface compatibility; not used here
        :param skip_first: accepted for interface compatibility; not used here
        :return: one example per line
        """
        examples = []

        # ``csv.reader`` only accepts a one-character delimiter and raises a
        # ``TypeError`` for ':->', so the columns are split by hand instead.
        # The encoding is pinned to avoid depending on the platform default.
        with open(path, encoding='utf8') as f:
            for idx, line in enumerate(f):
                row = line.rstrip('\r\n').split(':->')
                guid = "%s-%s" % (set_type, idx)
                label = row[MyTaskDataProcessor.LABEL_COLUMN]
                text_a = row[MyTaskDataProcessor.TEXT_A_COLUMN]
                text_b = row[MyTaskDataProcessor.TEXT_B_COLUMN] if MyTaskDataProcessor.TEXT_B_COLUMN >= 0 else None
                example = InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)
                examples.append(example)

        return examples
Example #10
0
    def _create_examples(path: str, set_type: str) -> List[InputExample]:
        """Read question/answer examples from a four-column CSV file.

        Every row must consist of the label, a question title, a question body
        and an answer. Title and body are joined by a space to form ``text_a``;
        the answer becomes ``text_b``. In all three texts, each escaped newline
        (a backslash followed by ``n``) and each remaining backslash is
        replaced by a space.

        :param path: path to the comma-separated file
        :param set_type: name of the data split, used as guid prefix together
            with the zero-based row number
        :return: one example per row
        """

        def _clean(text):
            return text.replace('\\n', ' ').replace('\\', ' ')

        examples = []

        with open(path, encoding='utf8') as f:
            for idx, row in enumerate(csv.reader(f, delimiter=',')):
                label, question_title, question_body, answer = row
                question = ' '.join([_clean(question_title), _clean(question_body)])
                examples.append(InputExample(guid="%s-%s" % (set_type, idx), text_a=question,
                                             text_b=_clean(answer), label=label))

        return examples
Example #11
0
 def _create_examples(path: str, set_type: str) -> List[InputExample]:
     """Load sentence pairs that share a target word from a JSON-lines file.

     Each line needs the keys ``idx``, ``sentence1``, ``sentence2`` and
     ``word``. A truthy ``label`` value is mapped to ``"T"``; a falsy or
     missing one is mapped to ``"F"``.

     :param path: path to the JSON-lines file
     :param set_type: name of the data split, used as guid prefix
     :return: one example per line, with the word stored in ``meta``
     """
     examples = []
     with open(path, encoding='utf8') as f:
         for raw_line in f:
             record = json.loads(raw_line)
             idx = record['idx']
             # ids given as strings are converted to integers
             idx = int(idx) if isinstance(idx, str) else idx
             label = "F"
             if record.get('label'):
                 label = "T"
             first_sentence = record['sentence1']
             second_sentence = record['sentence2']
             examples.append(InputExample(guid=f"{set_type}-{idx}", text_a=first_sentence, text_b=second_sentence,
                                          label=label, idx=idx, meta={'word': record['word']}))
     return examples
Example #12
0
    def _create_examples(path: str, set_type: str) -> List[InputExample]:
        """Build text-pair examples from a table read with ``pd.read_table``.

        The columns ``prefix``, ``input_text`` and ``target_text`` are
        converted to ``str`` and used as label, ``text_a`` and ``text_b``
        respectively. The row's index value in the data frame serves as the
        example idx and as guid suffix.

        :param path: path to the table file
        :param set_type: name of the data split, used as guid prefix
        :return: one example per table row
        """
        table = pd.read_table(path)

        examples = []
        for row_idx, record in table.iterrows():
            label = str(record['prefix'])
            source_text = str(record['input_text'])
            target_text = str(record['target_text'])
            examples.append(
                InputExample(guid="%s-%s" % (set_type, row_idx),
                             text_a=source_text,
                             text_b=target_text,
                             label=label,
                             idx=row_idx))

        return examples
Example #13
0
    def _create_examples(path: str, set_type: str) -> List[InputExample]:
        """Read headline/body examples from a three-column CSV file.

        Every row must consist of exactly three fields: the label, a headline
        and a body. Backslashes in headline and body are replaced by spaces.

        :param path: path to the comma-separated file
        :param set_type: name of the data split, used as guid prefix together
            with the zero-based row number
        :return: one example per row
        :raises ValueError: if a row does not have exactly three fields
        """
        examples = []

        # The encoding is pinned so that parsing does not depend on the
        # platform default, and newline="" is what the csv module requires
        # to handle newlines embedded in quoted fields correctly.
        with open(path, encoding="utf8", newline="") as f:
            reader = csv.reader(f, delimiter=",")
            for idx, row in enumerate(reader):
                label, headline, body = row
                guid = "%s-%s" % (set_type, idx)
                text_a = headline.replace("\\", " ")
                text_b = body.replace("\\", " ")

                example = InputExample(guid=guid,
                                       text_a=text_a,
                                       text_b=text_b,
                                       label=label)
                examples.append(example)

        return examples
Example #14
0
    def _create_examples(self, path: str) -> List[InputExample]:
        """Load question/comment pairs from a JSON-lines file, optionally filtered by language.

        Each line needs the keys ``label``, ``id``, ``question``, ``comment``
        and ``language``. When ``self.language`` is not ``None``, only lines
        whose ``language`` equals it are kept.

        :param path: path to the JSON-lines file
        :return: the retained examples, using the record ``id`` as guid
        """
        examples = []

        with open(path, encoding='utf8') as f:
            for raw_line in f:
                record = json.loads(raw_line)
                # All fields are read before filtering, so a missing key
                # raises even for lines that would be filtered out.
                label, id_, question, comment, language = (
                    record[key] for key in ('label', 'id', 'question', 'comment', 'language'))

                if self.language is None or language == self.language:
                    examples.append(InputExample(guid=id_, text_a=question, text_b=comment, label=label))

        return examples
Example #15
0
    def _create_examples(path: str, set_type: str) -> List[InputExample]:
        """Read passage/question pairs from a JSON-lines file.

        Each line must be a JSON object with the keys ``idx``, ``passage`` and
        ``question``. An optional ``label`` is stored as a string; without it
        the example's label is ``None``.

        :param path: path to the JSON-lines file
        :param set_type: name of the data split, used as guid prefix
        :return: one example per line
        """
        examples = []

        with open(path, encoding="utf8") as f:
            for raw_line in f:
                record = json.loads(raw_line)
                idx = record["idx"]
                if "label" in record:
                    label = str(record["label"])
                else:
                    label = None
                passage = record["passage"]
                question = record["question"]
                examples.append(
                    InputExample(guid=f"{set_type}-{idx}",
                                 text_a=passage,
                                 text_b=question,
                                 label=label,
                                 idx=idx))

        return examples
Example #16
0
 def _create_examples(path: str, set_type: str) -> List[InputExample]:
     """Read sentence pairs that share a target word from a JSON-lines file.

     Each line must provide ``idx``, ``sentence1``, ``sentence2`` and
     ``word``. The label is ``"T"`` when the line's ``label`` value is
     truthy and ``"F"`` when it is falsy or missing.

     :param path: path to the JSON-lines file
     :param set_type: name of the data split, used as guid prefix
     :return: one example per line, with the word stored in ``meta``
     """

     def _to_example(record):
         idx = record["idx"]
         if isinstance(idx, str):
             # ids given as strings are converted to integers
             idx = int(idx)
         label = "T" if record.get("label") else "F"
         return InputExample(guid="%s-%s" % (set_type, idx),
                             text_a=record["sentence1"],
                             text_b=record["sentence2"],
                             label=label,
                             idx=idx,
                             meta={"word": record["word"]})

     with open(path, encoding="utf8") as f:
         return [_to_example(json.loads(raw_line)) for raw_line in f]
Example #17
0
    def _create_examples_unlabelled(self,
                                    path,
                                    set_type,
                                    max_examples=1,
                                    skip_first=0):
        """Creates examples for the unlabelled set.

        The file is parsed as colon-separated values. The first column of
        every row becomes ``text_a``; ``text_b`` is taken from the column
        ``MyTaskDataProcessor.TEXT_B_COLUMN`` unless that constant is
        negative, in which case it is ``None``. No label is set.

        :param path: path to the colon-separated file
        :param set_type: name of the data split, used as guid prefix together
            with the zero-based row number
        :param max_examples: not used by this method
        :param skip_first: not used by this method
        :return: one unlabelled example per row
        """
        examples = []

        with open(path, encoding="utf8") as f:
            for idx, row in enumerate(csv.reader(f, delimiter=":")):
                first_text = row[0]
                if MyTaskDataProcessor.TEXT_B_COLUMN >= 0:
                    second_text = row[MyTaskDataProcessor.TEXT_B_COLUMN]
                else:
                    second_text = None
                examples.append(
                    InputExample(guid="%s-%s" % (set_type, idx),
                                 text_a=first_text,
                                 text_b=second_text))

        return examples
Example #18
0
File: tasks.py Project: cccntu/pet
    def _create_examples(self, lines: List[List[str]],
                         set_type: str) -> List[InputExample]:
        """Convert premise/hypothesis rows into examples with string labels.

        Each row is indexed by the keys ``idx``, ``premise``, ``hypothesis``
        and ``label``. The row's ``label`` value is used to look up the label
        string in ``self.get_labels()``.

        NOTE(review): the first entry of ``lines`` is always skipped. If the
        rows come from a dataset without a header entry, this drops a real
        example -- confirm against the caller.

        :param lines: the rows to convert (accessed by string key despite the
            annotation)
        :param set_type: name of the data split, used as guid prefix
        :return: one example per row after the first
        """
        label_names = self.get_labels()

        rows = iter(lines)
        next(rows, None)  # skip the first entry

        # need to return string, hf datasets uses int
        return [
            InputExample(guid=f"{set_type}-{row['idx']}",
                         text_a=row['premise'],
                         text_b=row['hypothesis'],
                         label=label_names[row['label']])
            for row in rows
        ]
Example #19
0
    def _create_examples(
            path,
            set_type,
            seed=42,
            max_train_candidates_per_question: int = 10) -> List[InputExample]:
        """Load passage/query examples with entity answer candidates from a JSON-lines file.

        Each line holds a passage (``text`` plus ``entities`` given as
        character spans) and a list of queries under ``qas``. For the
        ``train`` set, one example is created per correct answer, with the
        answer first in the candidate list followed by wrong entities. For
        every other set, one example per query is created that carries all
        entities as candidates and all answers.

        :param path: path to the JSON-lines file
        :param set_type: name of the data split, used as guid prefix
        :param seed: seed of the random generator used to shuffle wrong
            candidates before truncating them
        :param max_train_candidates_per_question: maximum number of candidates
            (including the correct answer) per training example
        :return: the list of examples, all labelled ``"1"``
        """
        examples = []

        rng = random.Random(seed)

        with open(path, encoding='utf8') as f:
            for raw_line in f:
                record = json.loads(raw_line)

                passage_idx = record['idx']
                passage = record['passage']['text']

                # entity spans use an inclusive end offset; duplicates are removed
                entities = list({
                    passage[span['start']:span['end'] + 1]
                    for span in record['passage']['entities']
                })

                # we follow the GPT-3 paper wrt @highlight annotations
                passage = passage.replace("@highlight\n", "- ")

                for qa in record['qas']:
                    query = qa['query']
                    question_idx = qa['idx']
                    answers = list({ans['text'] for ans in qa.get('answers', [])})

                    if set_type != 'train':
                        # create just one example with *all* correct answers and *all* answer candidates
                        examples.append(
                            InputExample(guid=f'{set_type}-p{passage_idx}-q{question_idx}',
                                         text_a=passage,
                                         text_b=query,
                                         label="1",
                                         meta={
                                             'passage_idx': passage_idx,
                                             'question_idx': question_idx,
                                             'candidates': entities,
                                             'answers': answers
                                         }))
                        continue

                    # create a single example per *correct* answer
                    for answer_idx, answer in enumerate(answers):
                        # The wrong candidates are rebuilt and reshuffled for
                        # every answer, so each example gets its own sample.
                        distractors = [ent for ent in entities if ent not in answers]
                        limit = max_train_candidates_per_question - 1
                        if len(distractors) > limit:
                            rng.shuffle(distractors)
                            distractors = distractors[:limit]

                        examples.append(
                            InputExample(guid=f'{set_type}-p{passage_idx}-q{question_idx}-a{answer_idx}',
                                         text_a=passage,
                                         text_b=query,
                                         label="1",
                                         meta={
                                             'passage_idx': passage_idx,
                                             'question_idx': question_idx,
                                             'candidates': [answer] + distractors,
                                             'answers': [answer]
                                         },
                                         idx=[passage_idx, question_idx, answer_idx]))

        # Summary statistics for the log: distinct question indices and label counts.
        question_indices = {ex.meta['question_idx'] for ex in examples}
        label_distribution = Counter(ex.label for ex in examples)
        logger.info(
            f"Returning {len(examples)} examples corresponding to {len(question_indices)} questions with label "
            f"distribution {list(label_distribution.items())}")
        return examples
Example #20
0
    def _create_examples(path: str, set_type: str) -> List[InputExample]:
        """Load examples with two annotated spans from a JSON-lines file and repair their word indices.

        Each line must be a JSON object with the keys ``idx``, ``text`` and
        ``target`` (holding ``span1_text``, ``span2_text``, ``span1_index`` and
        ``span2_index``); ``label`` is optional and converted to ``str`` when
        present. The span indices are positions in the whitespace-split text.

        :param path: path to the JSON-lines file
        :param set_type: name of the data split, used as guid prefix; for
            ``"train"`` only examples whose label is ``"True"`` are kept
        :return: the list of examples with corrected ``span1_index`` /
            ``span2_index`` in ``meta`` and re-joined ``text_a``
        :raises AssertionError: if the second span cannot be located at its
            (corrected) index
        """
        examples = []

        with open(path, encoding="utf8") as f:
            for line in f:
                example_json = json.loads(line)
                idx = example_json["idx"]
                label = str(
                    example_json["label"]) if "label" in example_json else None
                guid = "%s-%s" % (set_type, idx)
                text_a = example_json["text"]
                meta = {
                    "span1_text": example_json["target"]["span1_text"],
                    "span2_text": example_json["target"]["span2_text"],
                    "span1_index": example_json["target"]["span1_index"],
                    "span2_index": example_json["target"]["span2_index"],
                }

                # the indices in the dataset are wrong for some examples, so we manually fix them
                span1_index, span1_text = meta["span1_index"], meta[
                    "span1_text"]
                span2_index, span2_text = meta["span2_index"], meta[
                    "span2_text"]
                words_a = text_a.split()
                words_a_lower = text_a.lower().split()
                words_span1_text = span1_text.lower().split()
                span1_len = len(words_span1_text)

                # span 1 is compared case-insensitively and may cover several words;
                # if it is not found at the given index, the neighbouring positions are tried
                if words_a_lower[span1_index:span1_index +
                                 span1_len] != words_span1_text:
                    for offset in [-1, +1]:
                        if words_a_lower[span1_index + offset:span1_index +
                                         span1_len +
                                         offset] == words_span1_text:
                            span1_index += offset

                # a span 1 that still does not match is only reported, not treated as an error
                if words_a_lower[span1_index:span1_index +
                                 span1_len] != words_span1_text:
                    logger.warning(
                        f"Got '{words_a_lower[span1_index:span1_index + span1_len]}' but expected "
                        f"'{words_span1_text}' at index {span1_index} for '{words_a}'"
                    )

                # span 2 is compared case-sensitively against a single word
                if words_a[span2_index] != span2_text:
                    for offset in [-1, +1]:
                        if words_a[span2_index + offset] == span2_text:
                            span2_index += offset

                    # if the word merely starts with the span text, split it into the span text and the
                    # remainder so that the span becomes a word of its own
                    if words_a[span2_index] != span2_text and words_a[
                            span2_index].startswith(span2_text):
                        words_a = (words_a[:span2_index] + [
                            words_a[span2_index][:len(span2_text)],
                            words_a[span2_index][len(span2_text):]
                        ] + words_a[span2_index + 1:])

                # unlike span 1, a span 2 mismatch aborts processing
                assert (
                    words_a[span2_index] == span2_text
                ), f"Got '{words_a[span2_index]}' but expected '{span2_text}' at index {span2_index} for '{words_a}'"

                # the text is rebuilt from the (possibly split) words and the corrected indices are stored
                text_a = " ".join(words_a)
                meta["span1_index"], meta[
                    "span2_index"] = span1_index, span2_index

                example = InputExample(guid=guid,
                                       text_a=text_a,
                                       label=label,
                                       meta=meta,
                                       idx=idx)
                # training examples are dropped unless their label is the string "True"
                if set_type == "train" and label != "True":
                    continue
                examples.append(example)

        return examples