コード例 #1
0
    def _create_examples(self, lines: List[List[str]], type: str):
        """Creates examples for the training and dev sets."""

        examples = [
            InputExample(
                example_id=line[0],
                question="",  # in the swag dataset, the
                # common beginning of each
                # choice is stored in "sent2".
                contexts=[line[1], line[1], line[1], line[1], line[1]],
                endings=[line[2], line[3], line[4], line[5], line[6]],
                label=line[7],
            ) for line in lines  # we skip the line with the column names
        ]

        return examples
コード例 #2
0
def windowed_tokenization(example: InputExample,
                          label_map: dict,
                          max_window_length: int,
                          max_length: int,
                          stride: int,
                          no_answer_text: str,
                          tokenizer: PreTrainedTokenizer,
                          text_tokenizer: TextTokenizer,
                          window_fn: Callable = None) -> List[InputExample]:
    """Split one example's context into windows, one ``InputExample`` each.

    The first context of ``example`` is cut into (possibly overlapping)
    windows; each window becomes a new example whose id is the original id
    with a zero-padded window index appended.
    """
    # ToDo := Different amount of windows will trigger an error because of
    # different size in input features? sequences should be grouped by
    # size and chopped, padded accordingly
    # ToDo := no_answer_text is not used by now, no label corrected
    able_to_correct_label = no_answer_text is not None and no_answer_text != ""
    if window_fn is None:
        window_fn = create_windows
    window_texts = window_fn(example.contexts[0], tokenizer,
                             max_window_length, stride)

    windowed_examples = []
    for win_idx, win_text in enumerate(window_texts):
        # Zero-pad the index to an even number of digits so ids keep a
        # uniform shape (two digits while there are fewer than 100 windows).
        suffix = str(win_idx)
        if len(suffix) % 2:
            suffix = '0' + suffix

        needs_correction = able_to_correct_label and should_correct_label(
            win_text, example.endings, no_answer_text, text_tokenizer)
        if needs_correction:
            label, endings = correct_label(win_text, example.endings.copy(),
                                           no_answer_text, text_tokenizer)
        else:
            label, endings = label_map[example.label], example.endings

        # maximum 100 windows
        windowed_examples.append(
            InputExample(
                example_id=int(str(example.example_id) + suffix),
                question=example.question,
                contexts=[win_text] * len(endings),
                endings=endings,
                label=label,
            ))

    return windowed_examples
コード例 #3
0
    def _create_examples(self, lines: List[List[str]], type: str):
        """Creates examples for the training and dev sets."""
        if type == "train" and lines[0][-1] != "label":
            raise ValueError(
                "For training, the input file must contain a label column.")

        examples = [
            InputExample(
                example_id=line[2],
                question=line[5],  # in the swag dataset, the
                # common beginning of each
                # choice is stored in "sent2".
                contexts=[line[4], line[4], line[4], line[4]],
                endings=[line[7], line[8], line[9], line[10]],
                label=line[11],
            ) for line in lines[1:]  # we skip the line with the column names
        ]

        return examples
コード例 #4
0
    def _create_examples(self, data, set_type):
        """Creates examples for the training and dev sets."""
        examples = []
        for data_raw in data['data']:
            article = data_raw['article']
            # encode by default. When input_batching with ids, no string tensor is
            # allowed, ensure that example ids are always numeric.
            for i in range(len(data_raw["answers"])):
                example_id = self._encode_id(data_raw['id'], i)
                truth = str(ord(data_raw["answers"][i]) - ord("A"))
                question = data_raw["questions"][i]
                options = data_raw["options"][i]
                self.nof_labels = len(options)

                examples.append(
                    InputExample(
                        example_id=example_id,
                        question=question,
                        contexts=[article] * len(options),
                        endings=options,
                        label=truth,
                    ))
        return examples
コード例 #5
0
    def _create_examples(self, lines, set_type):
        """Creates examples for the training and dev sets."""
        examples = []
        for (_, data_raw) in enumerate(lines):
            article = data_raw["article"]
            for i in range(len(data_raw["answers"])):
                race_id = self._encode_id(data_raw['race_id'], i)
                truth = str(ord(data_raw["answers"][i]) - ord("A"))
                question = data_raw["questions"][i]
                options = data_raw["options"][i]

                examples.append(
                    InputExample(
                        example_id=race_id,
                        question=question,
                        contexts=[article, article, article, article
                                  ],  # this is not efficient but convenient
                        endings=[
                            options[0], options[1], options[2], options[3]
                        ],
                        label=truth,
                    ))
        return examples
コード例 #6
0
    def _create_examples(self, lines, type):
        """Creates examples for the training and dev sets.

        Only four-choice ARC questions are kept; three-, five- and
        other-sized choice sets are counted and dropped.

        Args:
            lines: iterable of JSON-encoded ARC records, one per line.
            type: split name; "train" triggers extra sanity checks.

        Returns:
            A list of four-choice ``InputExample`` instances.

        Raises:
            ValueError: if a kept record has an unrecognized answer key.
        """

        # There are two types of labels. They should be normalized.
        def normalize(truth):
            """Map an answer key ("A"-"D" or "1"-"4") to an index 0-3."""
            if truth in "ABCD":
                return ord(truth) - ord("A")
            elif truth in "1234":
                return int(truth) - 1
            else:
                logger.info("truth ERROR! %s", str(truth))
                return None

        examples = []
        three_choice = 0
        four_choice = 0
        five_choice = 0
        other_choices = 0
        # we delete examples which have more than or fewer than four choices
        for line in tqdm.tqdm(lines, desc="read arc data"):
            data_raw = json.loads(line.strip("\n"))
            n_choices = len(data_raw["question"]["choices"])
            if n_choices == 3:
                three_choice += 1
                continue
            elif n_choices == 5:
                five_choice += 1
                continue
            elif n_choices != 4:
                other_choices += 1
                continue
            four_choice += 1
            truth = str(normalize(data_raw["answerKey"]))
            # fix: was `assert truth != "None"`, which is stripped under
            # `python -O`; raise explicitly instead.
            if truth == "None":
                raise ValueError(
                    "Unrecognized answer key in example %s" % data_raw["id"])
            question_choices = data_raw["question"]
            question = question_choices["stem"]
            example_id = data_raw["id"]  # fix: no longer shadows builtin `id`
            options = question_choices["choices"]
            # len(options) == 4 is guaranteed by the filtering above, so the
            # former `if len(options) == 4` guard was redundant and removed.
            examples.append(
                InputExample(
                    example_id=example_id,
                    question=question,
                    contexts=[
                        options[0]["para"].replace("_", ""),
                        options[1]["para"].replace("_", ""),
                        options[2]["para"].replace("_", ""),
                        options[3]["para"].replace("_", ""),
                    ],
                    endings=[
                        options[0]["text"], options[1]["text"],
                        options[2]["text"], options[3]["text"]
                    ],
                    label=truth,
                ))

        if type == "train":
            assert len(examples) > 1
            assert examples[0].label is not None
        # fix: stray "}" removed from the log format string
        logger.info("len examples: %s", str(len(examples)))
        logger.info("Three choices: %s", str(three_choice))
        logger.info("Five choices: %s", str(five_choice))
        logger.info("Other choices: %s", str(other_choices))
        logger.info("four choices: %s", str(four_choice))

        return examples
コード例 #7
0
def window_examples(
    examples: List[InputExample],
    label_list: List[str],
    max_length: int,
    tokenizer: PreTrainedTokenizer,
    enable_windowing: bool = False,
    stride: int = None,
    no_answer_text: str = None,
    window_fn: Callable = None,
) -> List[InputExample]:
    """Split over-long examples into overlapping windowed examples.

    Each example whose context exceeds the usable length (as decided by
    ``should_window``) is replaced by several windowed copies built by
    ``windowed_tokenization``; all other examples pass through unchanged,
    except that when windowing is enabled their ids get a '00' window
    suffix so every id has the same shape.

    Args:
        examples: input examples to (possibly) window.
        label_list: all label values, mapped to indices for windowed copies.
        max_length: maximum total sequence length in tokens.
        tokenizer: tokenizer used to measure context/ending lengths.
        enable_windowing: master switch for the windowing mechanism.
        stride: overlap stride between consecutive windows; required when
            ``enable_windowing`` is True.
        no_answer_text: ending text used when a window loses the answer.
        window_fn: optional custom window-splitting function.

    Returns:
        A list of ``InputExample``s, possibly longer than the input.

    Raises:
        ValueError: if windowing is enabled but no stride was given.
    """
    if enable_windowing and stride is None:
        # fix: previous message was garbled ('no "stride" or was provided',
        # plus a missing space between 'disable' and 'the').
        raise ValueError(
            'Windowing mechanism is activated, but no "stride" was '
            'provided; please provide it or disable the mechanism '
            'altogether with `enable_windowing=False`')

    windowed_examples = []
    label_map = {label: i for i, label in enumerate(label_list)}
    text_tokenizer = TextTokenizer()
    # Reserve room for the special tokens added at tokenization time.
    # NOTE(review): the old comment said "three special tokens" but six are
    # subtracted — confirm the intended token budget.
    windowing_max_length = max_length - 6
    # fix: dropped an enumerate() whose index was never used
    for example in tqdm.tqdm(examples, desc="windowing examples"):
        if enable_windowing:
            trigger_windowing, max_ending_length = should_window(
                example=example,
                tokenizer=tokenizer,
                max_length=windowing_max_length,
                no_answer_text=no_answer_text,
            )
        else:
            trigger_windowing, max_ending_length = False, None

        if trigger_windowing:
            windowed_examples.extend(
                windowed_tokenization(example=example,
                                      label_map=label_map,
                                      max_window_length=windowing_max_length -
                                      max_ending_length,
                                      max_length=max_length,
                                      stride=stride,
                                      no_answer_text=no_answer_text,
                                      tokenizer=tokenizer,
                                      text_tokenizer=text_tokenizer,
                                      window_fn=window_fn))
        else:
            # Append a '00' window idx to conform to the windowed features.
            example_id = example.example_id
            if enable_windowing:
                example_id = int(str(example_id) + '00')

            windowed_examples.append(
                InputExample(
                    example_id=example_id,
                    question=example.question,
                    contexts=example.contexts,
                    endings=example.endings,
                    label=example.label,
                ))

    return windowed_examples