def test_as_array_produces_token_sequence_bert_cased_sentence_pair(self):
    tokenizer = cached_transformers.get_tokenizer("bert-base-cased")
    allennlp_tokenizer = PretrainedTransformerTokenizer(
        "bert-base-cased", add_special_tokens=False)
    indexer = PretrainedTransformerIndexer(model_name="bert-base-cased")
    default_format = "[CLS] AllenNLP is great! [SEP] Really it is! [SEP]"
    tokens = tokenizer.tokenize(default_format)
    expected_ids = tokenizer.convert_tokens_to_ids(tokens)
    allennlp_tokens = allennlp_tokenizer.add_special_tokens(
        allennlp_tokenizer.tokenize("AllenNLP is great!"),
        allennlp_tokenizer.tokenize("Really it is!"),
    )
    vocab = Vocabulary()
    indexed = indexer.tokens_to_indices(allennlp_tokens, vocab)
    assert indexed["token_ids"] == expected_ids
def test_type_ids_when_folding(self):
    allennlp_tokenizer = PretrainedTransformerTokenizer(
        "bert-base-uncased", add_special_tokens=False)
    indexer = PretrainedTransformerIndexer(model_name="bert-base-uncased",
                                           max_length=6)
    first_string = "How do trees get online?"
    second_string = "They log in!"

    tokens = allennlp_tokenizer.add_special_tokens(
        allennlp_tokenizer.tokenize(first_string),
        allennlp_tokenizer.tokenize(second_string))
    vocab = Vocabulary()
    indexed = indexer.tokens_to_indices(tokens, vocab)
    assert min(indexed["type_ids"]) == 0
    assert max(indexed["type_ids"]) == 1
def test_as_array_produces_token_sequence_roberta_sentence_pair(self):
    tokenizer = AutoTokenizer.from_pretrained("roberta-base")
    allennlp_tokenizer = PretrainedTransformerTokenizer(
        "roberta-base", add_special_tokens=False)
    indexer = PretrainedTransformerIndexer(model_name="roberta-base")
    default_format = "<s> AllenNLP is great! </s> </s> Really it is! </s>"
    tokens = tokenizer.tokenize(default_format)
    expected_ids = tokenizer.convert_tokens_to_ids(tokens)
    allennlp_tokens = allennlp_tokenizer.add_special_tokens(
        allennlp_tokenizer.tokenize("AllenNLP is great!"),
        allennlp_tokenizer.tokenize("Really it is!"),
    )
    vocab = Vocabulary()
    indexed = indexer.tokens_to_indices(allennlp_tokens, vocab)
    assert indexed["token_ids"] == expected_ids
Example #4
class TransformerSuperGlueRteReader(DatasetReader):
    """
    Dataset reader for the SuperGLUE Recognizing Textual Entailment task, to be used with a transformer
    model such as RoBERTa. The dataset is in the JSON Lines format.

    It will generate `Instances` with the following fields:

     * `tokens`, a `TextField` that contains the concatenation of premise and hypothesis,
     * `label`, a `LabelField` containing the label, if one exists.
     * `metadata`, a `MetadataField` that stores the instance's index in the file, the original premise,
       the original hypothesis, both of these in tokenized form, and the gold label, accessible as
       `metadata['index']`, `metadata['premise']`, `metadata['premise_tokens']`, `metadata['hypothesis']`,
       `metadata['hypothesis_tokens']`, and `metadata['label']`.

    # Parameters

    transformer_model_name : `str`, optional (default=`'roberta-base'`)
        This reader chooses the tokenizer and token indexer according to this setting.
    """

    def __init__(
        self,
        transformer_model_name: str = "roberta-base",
        tokenizer_kwargs: Dict[str, Any] = None,
        **kwargs
    ) -> None:
        super().__init__(
            manual_distributed_sharding=True, manual_multiprocess_sharding=True, **kwargs
        )
        self._tokenizer = PretrainedTransformerTokenizer(
            transformer_model_name,
            add_special_tokens=False,
            tokenizer_kwargs=tokenizer_kwargs,
        )
        self._token_indexers = {
            "tokens": PretrainedTransformerIndexer(
                transformer_model_name, tokenizer_kwargs=tokenizer_kwargs, max_length=512
            )
        }

    @overrides
    def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path, extract_archive=True)

        logger.info("Reading file at %s", file_path)
        yielded_relation_count = 0
        from allennlp.common.file_utils import json_lines_from_file

        for relation in self.shard_iterable(json_lines_from_file(file_path)):
            premise = relation["premise"]
            hypothesis = relation["hypothesis"]
            if "label" in relation:
                label = relation["label"]
            else:
                label = None
            index = relation["idx"]

            # todo: see if we even need this to be in a separate method
            instance = self.text_to_instance(index, label, premise, hypothesis)

            yield instance
            yielded_relation_count += 1

    @overrides
    def text_to_instance(
        self,
        index: int,
        label: str,
        premise: str,
        hypothesis: str,
    ) -> Instance:
        tokenized_premise = self._tokenizer.tokenize(premise)
        tokenized_hypothesis = self._tokenizer.tokenize(hypothesis)

        fields = {}

        premise_and_hypothesis = TextField(
            self._tokenizer.add_special_tokens(tokenized_premise, tokenized_hypothesis),
        )
        fields["tokens"] = premise_and_hypothesis

        # make the metadata
        metadata = {
            "premise": premise,
            "premise_tokens": tokenized_premise,
            "hypothesis": hypothesis,
            "hypothesis_tokens": tokenized_hypothesis,
            "index": index,
        }
        if label:
            fields["label"] = LabelField(label)
            metadata["label"] = label

        fields["metadata"] = MetadataField(metadata)

        return Instance(fields)

    @overrides
    def apply_token_indexers(self, instance: Instance) -> None:
        instance["tokens"].token_indexers = self._token_indexers
Example #5
class TransformerMCReader(DatasetReader):
    """
    Read input data for the TransformerMC model. This is the base class for all readers that produce
    data for TransformerMC.

    Instances have three fields:
     * `alternatives`, a `ListField` of `TextField`
     * `correct_alternative`, `IndexField` with the correct answer among `alternatives`
     * `qid`, a `MetadataField` containing question ids

    Parameters
    ----------
    transformer_model_name : `str`, optional (default=`"roberta-large"`)
        This reader chooses tokenizer and token indexer according to this setting.
    length_limit : `int`, optional (default=`512`)
        We will make sure that the length of an alternative never exceeds this many word pieces.
    """
    def __init__(self,
                 transformer_model_name: str = "roberta-large",
                 length_limit: int = 512,
                 **kwargs) -> None:
        super().__init__(**kwargs)
        from allennlp.data.tokenizers import PretrainedTransformerTokenizer

        self._tokenizer = PretrainedTransformerTokenizer(
            transformer_model_name, add_special_tokens=False)
        from allennlp.data.token_indexers import PretrainedTransformerIndexer

        self._token_indexers = {
            "tokens": PretrainedTransformerIndexer(transformer_model_name)
        }
        self.length_limit = length_limit

    def text_to_instance(
        self,  # type: ignore
        qid: str,
        start: str,
        alternatives: List[str],
        label: Optional[int] = None,
    ) -> Instance:
        # tokenize
        start = self._tokenizer.tokenize(start)

        sequences = []
        for alternative in alternatives:
            alternative = self._tokenizer.tokenize(alternative)
            length_for_start = (self.length_limit - len(alternative) -
                                self._tokenizer.num_special_tokens_for_pair())
            if length_for_start < 0:
                # If the alternative is too long by itself, we take the beginning and add no tokens from the start.
                alternative = alternative[:length_for_start]
                length_for_start = 0
            sequences.append(
                self._tokenizer.add_special_tokens(start[:length_for_start],
                                                   alternative))

        # make fields
        from allennlp.data.fields import TextField

        sequences = [
            TextField(sequence, self._token_indexers) for sequence in sequences
        ]
        from allennlp.data.fields import ListField

        sequences = ListField(sequences)

        from allennlp.data.fields import MetadataField

        fields = {
            "alternatives": sequences,
            "qid": MetadataField(qid),
        }

        if label is not None:
            if label < 0 or label >= len(sequences):
                raise ValueError("Alternative %d does not exist", label)
            from allennlp.data.fields import IndexField

            fields["correct_alternative"] = IndexField(label, sequences)

        return Instance(fields)
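A short, hypothetical call to `text_to_instance` (the question and alternatives are invented)
showing the fields this base reader produces:

```
reader = TransformerMCReader(transformer_model_name="roberta-large", length_limit=256)
instance = reader.text_to_instance(
    qid="q1",
    start="How do trees get online?",
    alternatives=["They log in!", "They use the cloud.", "They branch out."],
    label=0,
)
# `alternatives` is a ListField of TextFields; `correct_alternative` is an IndexField.
print(instance["alternatives"].sequence_length(), instance["correct_alternative"].sequence_index)
```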
Example #6
class TweetCandidateSpanDatasetReader(DatasetReader):
    def __init__(
        self,
        lazy: bool = False,
        cache_directory: Optional[str] = None,
        max_instances: Optional[int] = None,
        min_num_candidate: int = 3,
        max_num_candidate: int = 5,
        transformer_model_name_or_archive_path: str = "bert-base-uncased",
    ) -> None:
        super().__init__(lazy=lazy,
                         cache_directory=cache_directory,
                         max_instances=max_instances)
        if "tar.gz" in transformer_model_name_or_archive_path:
            config = extract_config_from_archive(
                transformer_model_name_or_archive_path)
            model_name = config.as_dict()["dataset_reader"]["tokenizer"]["model_name"]
        else:
            model_name = transformer_model_name_or_archive_path
        self._tokenizer = PretrainedTransformerTokenizer(
            model_name=model_name, add_special_tokens=False)
        self._tokenindexer = PretrainedTransformerIndexer(
            model_name=model_name)
        self._min_num_candidate = min_num_candidate
        self._max_num_candidate = max_num_candidate

    def _read(self, file_path: str) -> Iterable[Instance]:
        file_path = cached_path(file_path)
        df = pd.read_json(file_path, lines=True)
        for record in df.to_dict("records"):
            if record["selected_text"]:
                text = record["text"]
                if not isinstance(text, str):
                    continue
                elif text.strip() == "":
                    continue
                elif len(record["candidate_spans"]) < self._min_num_candidate:
                    continue
                else:
                    yield self.text_to_instance(
                        " " + text.strip(),
                        record["sentiment"],
                        record["candidate_spans"],
                        record["textID"],
                        record.get("selected_text"),
                        record.get("selected_text_span"),
                    )

    def text_to_instance(
        self,
        text: str,
        sentiment: str,
        candidate_spans: list,
        text_id: Optional[str] = None,
        selected_text: Optional[str] = None,
        selected_text_span: Optional[tuple] = None,
    ) -> Instance:
        fields = {}
        text_tokens = self._tokenizer.tokenize(text)
        sentiment_tokens = self._tokenizer.tokenize(sentiment)
        text_with_sentiment_tokens = self._tokenizer.add_special_tokens(
            text_tokens, sentiment_tokens)
        fields["text_with_sentiment"] = TextField(
            text_with_sentiment_tokens, {"tokens": self._tokenindexer})
        candidate_spans = [
            tuple(i) for i in candidate_spans[:self._max_num_candidate]
        ]
        additional_metadata = {}
        if selected_text_span is not None:
            selected_text_span = tuple(selected_text_span)
            additional_metadata["selected_text_span"] = selected_text_span
            if selected_text_span not in candidate_spans:
                candidate_spans.append(selected_text_span)
                fields["label"] = LabelField(len(candidate_spans) - 1,
                                             skip_indexing=True)
                have_truth = False
            else:
                fields["label"] = LabelField(
                    candidate_spans.index(selected_text_span),
                    skip_indexing=True)
                have_truth = True
            additional_metadata["have_truth"] = have_truth
            additional_metadata["candidate_num"] = len(candidate_spans)
        fields["candidate_span_pairs"] = SpanPairsField(
            candidate_spans, fields["text_with_sentiment"])
        metadata = {
            "text": text,
            "sentiment": sentiment,
            "selected_text": selected_text,
            "text_with_sentiment_tokens": text_with_sentiment_tokens
        }
        if text_id is not None:
            metadata["text_id"] = text_id
        if additional_metadata:
            metadata.update(additional_metadata)
        fields["metadata"] = MetadataField(metadata)
        return Instance(fields)

    def span_to_str(self, text, span_start, span_end):
        text_tokens = self._tokenizer.tokenize(text)
        text_tokens = self._tokenizer.add_special_tokens(text_tokens)
        return span_tokens_to_text(text, text_tokens, span_start, span_end)
Example #7
class WorldTreeSupportReader(DatasetReader):
    """
    Reads multiple-choice science questions in the WorldTree format together with their
    supporting facts; the top-k supporting facts are concatenated onto the question text.
    """
    def __init__(self,
                 transformer_model_name: str = "roberta-large",
                 topk: int = 5,
                 **kwargs) -> None:
        super().__init__(**kwargs)

        self._tokenizer = PretrainedTransformerTokenizer(
            transformer_model_name, add_special_tokens=False)

        self._token_indexers = {
            "tokens": PretrainedTransformerIndexer(transformer_model_name)
        }

        # Get the topk supporting facts
        self.topk = topk

    @overrides
    def _read(self, file_path: str) -> Iterable[Instance]:
        """


        """
        with open(cached_path(file_path), "r") as data_file:

            logger.info("Reading questions from file at: %s", file_path)

            questions = json.load(data_file)

            for question in questions:
                qid = question["id"]
                question_text = question["question"]
                supporting_facts = question["supports"]
                choices = question["choices"]
                answer = question["answer"]
                yield self.text_to_instance(qid, question_text,
                                            supporting_facts, choices, answer)

    @overrides
    def text_to_instance(
        self,  # type: ignore
        qid: str,
        question: str,
        supporting_facts: List[str],
        choices: List[str],
        answer_idx: Optional[int] = None,
    ) -> Instance:

        # **A hack**
        # We need to make each question have exactly four choices to process them in batches.
        # Either drop the choice or use a non-answer one to fill.
        if len(choices) == 5:
            if answer_idx != 4:  # Just drop the last choice
                choices = choices[:-1]
            elif answer_idx == 4:  # Answer is the last, so drop the first
                choices = choices[1:]
                answer_idx -= 1
        elif len(choices) == 3:
            if answer_idx != 2:  # Use the last to fill
                choices.append(choices[-1])
            else:  # Use the first to fill
                choices.append(choices[0])

        # Base checks
        assert len(choices) == 4
        if answer_idx < 0 or answer_idx >= len(choices):
            raise ValueError(f"Choice {answer_idx} does not exist")

        # Combine the supporting facts with the question by concatenating the
        # top-k facts onto the end of the question text.
        supporting_facts = supporting_facts[:self.topk]
        question = " ".join([question] + supporting_facts)

        # Tokenize the question
        question_tokens = self._tokenizer.tokenize(question)

        # Tokenize the choices and combine each one with the question into question-choice pairs
        qc_pairs = []
        for choice in choices:
            choice_tokens = self._tokenizer.tokenize(choice)
            qc_pair = self._tokenizer.add_special_tokens(
                question_tokens, choice_tokens)
            qc_pairs.append(qc_pair)

        # Wrap them into AllenNLP fields
        qc_pairs = [TextField(pair, self._token_indexers) for pair in qc_pairs]
        qc_pairs = ListField(qc_pairs)
        answer_idx = IndexField(answer_idx, qc_pairs)
        metadata = MetadataField({
            "id": qid,
            "question": question,
            "choices": choices
        })

        return Instance({
            "qc_pairs": qc_pairs,
            "answer_idx": answer_idx,
            "metadata": metadata,
        })
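An illustrative call (the question, facts, and choices are made up) showing the four-choice
normalization: with only three choices, the reader repeats a non-answer choice so that every
instance ends up with exactly four question-choice pairs:

```
reader = WorldTreeSupportReader(transformer_model_name="roberta-large", topk=2)
instance = reader.text_to_instance(
    qid="wt-001",
    question="Which form of energy does a battery store?",
    supporting_facts=["A battery stores chemical energy.", "Energy can change forms."],
    choices=["chemical energy", "light energy", "sound energy"],  # only three choices
    answer_idx=0,
)
print(instance["qc_pairs"].sequence_length())  # 4, after the padding hack above
```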
Example #8
class TransformerSquadReader(DatasetReader):
    """
    Dataset reader suitable for JSON-formatted SQuAD-like datasets to be used with a transformer-based
    QA model, such as [`TransformerQA`](../../models/transformer_qa#TransformerQA).

    It will generate `Instances` with the following fields:

     * `question_with_context`, a `TextField` that contains the concatenation of question and context,
     * `answer_span`, a `SpanField` into the `question` `TextField` denoting the answer.
     * `context_span`, a `SpanField` into the `question` `TextField` denoting the context, i.e., the part of
       the text that potential answers can come from.
     * `cls_index` (optional), an `IndexField` that holds the index of the `[CLS]` token within the
       `question_with_context` field. This is needed because the `[CLS]` token is used to indicate
       an impossible question. Since most tokenizers/models have the `[CLS]` token as the first
       token, this will only be included in the instance if the `[CLS]` token is NOT the first token.
     * `metadata`, a `MetadataField` that stores the instance's ID, the original question, the original
       passage text, both of these in tokenized form, and the gold answer strings, accessible as
       `metadata['id']`, `metadata['question']`, `metadata['context']`, `metadata['question_tokens']`,
       `metadata['context_tokens']`, and `metadata['answers']`. This is so that we can more easily use the
       official SQuAD evaluation script to get metrics.

    For SQuAD v2.0-style datasets that contain impossible questions, we set the gold answer span
    to the span of the `[CLS]` token when there are no answers.

    We also support limiting the maximum length for the question. When the context+question is too long, we run a
    sliding window over the context and emit multiple instances for a single question.
    If `skip_impossible_questions` is `True`, then we only emit instances that contain a gold answer.
    As a result, the per-instance metrics you get during training and evaluation might not correspond
    100% to the SQuAD task.

    To get a final number for SQuAD v1.1, you have to run

    ```
    python -m allennlp_models.rc.tools.transformer_qa_eval
    ```

    # Parameters

    transformer_model_name : `str`, optional (default=`'bert-base-cased'`)
        This reader chooses tokenizer and token indexer according to this setting.

    length_limit : `int`, optional (default=`384`)
        We will make sure that the length of context+question never exceeds this many word pieces.

    stride : `int`, optional (default=`128`)
        When context+question are too long for the length limit, we emit multiple instances for one question,
        where the context is shifted. This parameter specifies the overlap between the shifted context window. It
        is called "stride" instead of "overlap" because that's what it's called in the original huggingface
        implementation.

    skip_impossible_questions : `bool`, optional (default=`False`)
        If this is true, we will skip examples that don't have an answer. This could happen if the question
        is marked impossible in the dataset, or if the question+context is truncated according to `length_limit`
        such that the context no longer contains a gold answer.

        For SQuAD v1.1-style datasets, you should set this to `True` during training, and `False` any other time.

        For SQuAD v2.0-style datasets you should leave this as `False`.

    max_query_length : `int`, optional (default=`64`)
        The maximum number of wordpieces dedicated to the question. If the question is longer than this, it will be
        truncated.

    """

    def __init__(
        self,
        transformer_model_name: str = "bert-base-cased",
        length_limit: int = 384,
        stride: int = 128,
        skip_impossible_questions: bool = False,
        max_query_length: int = 64,
        tokenizer_kwargs: Dict[str, Any] = None,
        **kwargs
    ) -> None:
        if "skip_invalid_examples" in kwargs:
            import warnings

            warnings.warn(
                "'skip_invalid_examples' is deprecated, please use 'skip_impossible_questions' instead",
                DeprecationWarning,
            )
            skip_impossible_questions = kwargs.pop("skip_invalid_examples")

        super().__init__(**kwargs)
        self._tokenizer = PretrainedTransformerTokenizer(
            transformer_model_name,
            add_special_tokens=False,
            tokenizer_kwargs=tokenizer_kwargs,
        )
        self._token_indexers = {
            "tokens": PretrainedTransformerIndexer(
                transformer_model_name, tokenizer_kwargs=tokenizer_kwargs
            )
        }
        self.length_limit = length_limit
        self.stride = stride
        self.skip_impossible_questions = skip_impossible_questions
        self.max_query_length = max_query_length
        self._cls_token = self._tokenizer.tokenizer.cls_token
        # We'll include the `cls_index` IndexField in instances if the CLS token is
        # not always the first token.
        self._include_cls_index = (
            self._find_cls_index(
                self._tokenizer.add_special_tokens(
                    self._tokenizer.tokenize("a"), self._tokenizer.tokenize("a")
                )
            )
            != 0
        )

    @overrides
    def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        logger.info("Reading file at %s", file_path)
        with open_compressed(file_path) as dataset_file:
            dataset_json = json.load(dataset_file)
            dataset = dataset_json["data"]
        logger.info("Reading the dataset")
        yielded_question_count = 0
        questions_with_more_than_one_instance = 0
        for article in dataset:
            for paragraph_json in article["paragraphs"]:
                context = paragraph_json["context"]
                for question_answer in paragraph_json["qas"]:
                    answers = [answer_json["text"] for answer_json in question_answer["answers"]]

                    # Just like huggingface, we only use the first answer for training.
                    if len(answers) > 0:
                        first_answer_offset = int(question_answer["answers"][0]["answer_start"])
                    else:
                        first_answer_offset = None

                    instances = self.make_instances(
                        question_answer.get("id", None),
                        question_answer["question"],
                        answers,
                        context,
                        first_answer_offset=first_answer_offset,
                        always_add_answer_span=True,
                    )
                    instances_yielded = 0
                    for instance in instances:
                        yield instance
                        instances_yielded += 1
                    if instances_yielded > 1:
                        questions_with_more_than_one_instance += 1
                    yielded_question_count += 1

        if questions_with_more_than_one_instance > 0:
            logger.info(
                "%d (%.2f%%) questions have more than one instance",
                questions_with_more_than_one_instance,
                100 * questions_with_more_than_one_instance / yielded_question_count,
            )

    def make_instances(
        self,
        qid: str,
        question: str,
        answers: List[str],
        context: str,
        first_answer_offset: Optional[int],
        always_add_answer_span: bool = False,
    ) -> Iterable[Instance]:
        """
        Create training instances from a SQuAD example.
        """
        # tokenize context by spaces first, and then with the wordpiece tokenizer
        # For RoBERTa, this produces a bug where every token is marked as beginning-of-sentence. To fix it, we
        # detect whether a space comes before a word, and if so, add "a " in front of the word.
        def tokenize_slice(start: int, end: int) -> Iterable[Token]:
            text_to_tokenize = context[start:end]
            if start - 1 >= 0 and context[start - 1].isspace():
                prefix = "a "  # must end in a space, and be short so we can be sure it becomes only one token
                wordpieces = self._tokenizer.tokenize(prefix + text_to_tokenize)
                for wordpiece in wordpieces:
                    if wordpiece.idx is not None:
                        wordpiece.idx -= len(prefix)
                return wordpieces[1:]
            else:
                return self._tokenizer.tokenize(text_to_tokenize)

        tokenized_context = []
        token_start = 0
        for i, c in enumerate(context):
            if c.isspace():
                for wordpiece in tokenize_slice(token_start, i):
                    if wordpiece.idx is not None:
                        wordpiece.idx += token_start
                    tokenized_context.append(wordpiece)
                token_start = i + 1
        for wordpiece in tokenize_slice(token_start, len(context)):
            if wordpiece.idx is not None:
                wordpiece.idx += token_start
            tokenized_context.append(wordpiece)

        if first_answer_offset is None:
            (token_answer_span_start, token_answer_span_end) = (-1, -1)
        else:
            (token_answer_span_start, token_answer_span_end), _ = char_span_to_token_span(
                [
                    (t.idx, t.idx + len(sanitize_wordpiece(t.text))) if t.idx is not None else None
                    for t in tokenized_context
                ],
                (first_answer_offset, first_answer_offset + len(answers[0])),
            )

        # Tokenize the question.
        tokenized_question = self._tokenizer.tokenize(question)
        tokenized_question = tokenized_question[: self.max_query_length]

        # Stride over the context, making instances.
        space_for_context = (
            self.length_limit
            - len(tokenized_question)
            - len(self._tokenizer.sequence_pair_start_tokens)
            - len(self._tokenizer.sequence_pair_mid_tokens)
            - len(self._tokenizer.sequence_pair_end_tokens)
        )
        stride_start = 0
        while True:
            tokenized_context_window = tokenized_context[stride_start:]
            tokenized_context_window = tokenized_context_window[:space_for_context]

            window_token_answer_span = (
                token_answer_span_start - stride_start,
                token_answer_span_end - stride_start,
            )
            if any(i < 0 or i >= len(tokenized_context_window) for i in window_token_answer_span):
                # The answer is not contained in the window.
                window_token_answer_span = None

            if not self.skip_impossible_questions or window_token_answer_span is not None:
                additional_metadata = {"id": qid}
                instance = self.text_to_instance(
                    question,
                    tokenized_question,
                    context,
                    tokenized_context_window,
                    answers=answers,
                    token_answer_span=window_token_answer_span,
                    additional_metadata=additional_metadata,
                    always_add_answer_span=always_add_answer_span,
                )
                yield instance

            stride_start += space_for_context
            if stride_start >= len(tokenized_context):
                break
            stride_start -= self.stride

    @overrides
    def text_to_instance(
        self,  # type: ignore
        question: str,
        tokenized_question: List[Token],
        context: str,
        tokenized_context: List[Token],
        answers: List[str] = None,
        token_answer_span: Optional[Tuple[int, int]] = None,
        additional_metadata: Dict[str, Any] = None,
        always_add_answer_span: bool = False,
    ) -> Instance:
        fields = {}

        # make the question field
        question_field = TextField(
            self._tokenizer.add_special_tokens(tokenized_question, tokenized_context),
            self._token_indexers,
        )
        fields["question_with_context"] = question_field

        cls_index = self._find_cls_index(question_field.tokens)
        if self._include_cls_index:
            fields["cls_index"] = IndexField(cls_index, question_field)

        start_of_context = (
            len(self._tokenizer.sequence_pair_start_tokens)
            + len(tokenized_question)
            + len(self._tokenizer.sequence_pair_mid_tokens)
        )

        # make the answer span
        if token_answer_span is not None:
            assert all(i >= 0 for i in token_answer_span)
            assert token_answer_span[0] <= token_answer_span[1]

            fields["answer_span"] = SpanField(
                token_answer_span[0] + start_of_context,
                token_answer_span[1] + start_of_context,
                question_field,
            )
        elif always_add_answer_span:
            fields["answer_span"] = SpanField(cls_index, cls_index, question_field)

        # make the context span, i.e., the span of text from which possible answers should be drawn
        fields["context_span"] = SpanField(
            start_of_context, start_of_context + len(tokenized_context) - 1, question_field
        )

        # make the metadata
        metadata = {
            "question": question,
            "question_tokens": tokenized_question,
            "context": context,
            "context_tokens": tokenized_context,
            "answers": answers or [],
        }
        if additional_metadata is not None:
            metadata.update(additional_metadata)
        fields["metadata"] = MetadataField(metadata)

        return Instance(fields)

    def _find_cls_index(self, tokens: List[Token]) -> int:
        return next(i for i, t in enumerate(tokens) if t.text == self._cls_token)
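The sliding-window loop in `make_instances` can be hard to follow in isolation, so here is a
standalone sketch of just the stride arithmetic, with toy token counts chosen purely for
illustration (they are not taken from any real example):

```
def window_starts(context_len: int, space_for_context: int, stride: int):
    """Return the context offsets that the reader's while-loop would visit."""
    starts, stride_start = [], 0
    while True:
        starts.append(stride_start)
        stride_start += space_for_context
        if stride_start >= context_len:
            break
        stride_start -= stride  # step back so consecutive windows overlap by `stride` tokens
    return starts

print(window_starts(context_len=1000, space_for_context=300, stride=128))
# [0, 172, 344, 516, 688, 860]
```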
Example #9
File: berty_tsv.py  Project: dugu9sword/dne
class BertyTSVReader(DatasetReader):
    def __init__(
        self,
        sent1_col: str,
        sent2_col: str = None,
        label_col: str = 'label',
        bert_model: str = 'bert-base-uncased',
        max_sequence_length: int = 500,
        skip_label_indexing: bool = False,
        lower: bool = True,
        lazy: bool = False,
    ) -> None:
        super().__init__(lazy=lazy)
        self._sent1_col = sent1_col
        self._sent2_col = sent2_col
        self._label_col = label_col
        self._tokenizer = PretrainedTransformerTokenizer(
            bert_model,
            add_special_tokens=False,
            max_length=max_sequence_length
        )  # type: PretrainedTransformerTokenizer
        self._max_sequence_length = max_sequence_length
        self._skip_label_indexing = skip_label_indexing
        self._lower = lower
        self._token_indexers = {
            "tokens": PretrainedTransformerIndexer(model_name=bert_model)
        }

    @overrides
    def _read(self, file_path):
        with open(cached_path(file_path), "r") as data_file:
            # Without the quoting arg, errors occur on lines containing the quote characters " or '.
            df = pandas.read_csv(data_file, sep='\t', quoting=csv.QUOTE_NONE)
            has_label = self._label_col in df.columns
            for rid in range(0, df.shape[0]):
                sent1 = df.iloc[rid][self._sent1_col]
                if self._lower:
                    sent1 = sent1.lower()

                if self._sent2_col:
                    sent2 = df.iloc[rid][self._sent2_col]
                    if self._lower:
                        sent2 = sent2.lower()
                else:
                    sent2 = None

                if has_label:
                    label = df.iloc[rid][self._label_col]
                    if self._skip_label_indexing:
                        label = int(label)
                else:
                    label = None

                instance = self.text_to_instance(sent1=sent1,
                                                 sent2=sent2,
                                                 label=label)
                if instance is not None:
                    yield instance

    @overrides
    def text_to_instance(
            self,
            sent1: str,
            sent2: str = None,
            label: Optional[str] = None) -> Instance:  # type: ignore
        fields: Dict[str, Field] = {}

        if sent2:
            # tokens = self._tokenizer.tokenize_sentence_pair(sent1, sent2)
            tokens1 = self._tokenizer.tokenize(sent1)
            tokens2 = self._tokenizer.tokenize(sent2)
            tokens = self._tokenizer.add_special_tokens(tokens1, tokens2)
        else:
            tokens = self._tokenizer.tokenize(sent1)
            tokens = self._tokenizer.add_special_tokens(tokens)

        fields['sent'] = TextField(tokens, self._token_indexers)

        if label is not None:
            fields['label'] = LabelField(
                label, skip_indexing=self._skip_label_indexing)
        return Instance(fields)

    def instance_to_text(self, instance: Instance):
        return allenutil.bert_instance_as_json(instance)
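A hedged usage sketch; the column names and TSV path are placeholders for whatever your file uses:

```
reader = BertyTSVReader(sent1_col="sentence1", sent2_col="sentence2", label_col="label")
for instance in reader.read("data/pairs.tsv"):  # tab-separated file with a header row
    print(instance["sent"], instance.fields.get("label"))
    break
```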
Example #10
class TransformerSquadReader(DatasetReader):
    """
    Reads a JSON-formatted SQuAD file and returns a ``Dataset`` where the ``Instances`` have four
    fields:
     * ``question_with_context``, a ``TextField`` that contains the concatenation of question and context,
     * ``answer_span``, a ``SpanField`` into the ``question`` ``TextField`` denoting the answer.
     * ``context_span``, a ``SpanField`` into the ``question`` ``TextField`` denoting the context, i.e., the part of
       the text that potential answers can come from.
     * A ``MetadataField`` that stores the instance's ID, the original question, the original passage text, both of
       these in tokenized form, and the gold answer strings, accessible as ``metadata['id']``,
       ``metadata['question']``, ``metadata['context']``, ``metadata['question_tokens']``,
       ``metadata['context_tokens']``, and ``metadata['answers']``. This is so that we can more easily use the
       official SQuAD evaluation script to get metrics.

    We also support limiting the maximum length for the question. When the context+question is too long, we run a
    sliding window over the context and emit multiple instances for a single question. At training time, we only
    emit instances that contain a gold answer. At test time, we emit all instances. As a result, the per-instance
    metrics you get during training and evaluation don't correspond 100% to the SQuAD task. To get a final number,
    you have to run the script in scripts/transformer_qa_eval.py.

    # Parameters

    transformer_model_name : `str`, optional (default=`'bert-base-cased'`)
        This reader chooses tokenizer and token indexer according to this setting.
    length_limit : `int`, optional (default=`384`)
        We will make sure that the length of context+question never exceeds this many word pieces.
    stride : `int`, optional (default=`128`)
        When context+question are too long for the length limit, we emit multiple instances for one question,
        where the context is shifted. This parameter specifies the overlap between the shifted context window. It
        is called "stride" instead of "overlap" because that's what it's called in the original huggingface
        implementation.
    skip_invalid_examples: `bool`, optional (default=`False`)
        If this is true, we will skip examples that don't have a gold answer. You should set this to True during
        training, and False any other time.
    max_query_length : `int`, optional (default=`64`)
        The maximum number of wordpieces dedicated to the question. If the question is longer than this, it will be
        truncated.
    """
    def __init__(self,
                 transformer_model_name: str = "bert-base-cased",
                 length_limit: int = 384,
                 stride: int = 128,
                 skip_invalid_examples: bool = False,
                 max_query_length: int = 64,
                 **kwargs) -> None:
        super().__init__(**kwargs)
        self._tokenizer = PretrainedTransformerTokenizer(
            transformer_model_name, add_special_tokens=False)
        self._token_indexers = {
            "tokens": PretrainedTransformerIndexer(transformer_model_name)
        }
        self.length_limit = length_limit
        self.stride = stride
        self.skip_invalid_examples = skip_invalid_examples
        self.max_query_length = max_query_length

    @overrides
    def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        logger.info("Reading file at %s", file_path)
        with open_compressed(file_path) as dataset_file:
            dataset_json = json.load(dataset_file)
            dataset = dataset_json["data"]
        logger.info("Reading the dataset")
        yielded_question_count = 0
        questions_with_more_than_one_instance = 0
        for article in dataset:
            for paragraph_json in article["paragraphs"]:
                context = paragraph_json["context"]
                for question_answer in paragraph_json["qas"]:
                    answers = [
                        answer_json["text"]
                        for answer_json in question_answer["answers"]
                    ]

                    # Just like huggingface, we only use the first answer for training.
                    if len(answers) > 0:
                        first_answer_offset = int(
                            question_answer["answers"][0]["answer_start"])
                    else:
                        first_answer_offset = None

                    instances = self.make_instances(
                        question_answer.get("id", None),
                        question_answer["question"],
                        answers,
                        context,
                        first_answer_offset,
                    )
                    instances_yielded = 0
                    for instance in instances:
                        yield instance
                        instances_yielded += 1
                    if instances_yielded > 1:
                        questions_with_more_than_one_instance += 1
                    yielded_question_count += 1

        if questions_with_more_than_one_instance > 0:
            logger.info(
                "%d (%.2f%%) questions have more than one instance",
                questions_with_more_than_one_instance,
                100 * questions_with_more_than_one_instance /
                yielded_question_count,
            )

    def make_instances(
        self,
        qid: str,
        question: str,
        answers: List[str],
        context: str,
        first_answer_offset: Optional[int],
    ) -> Iterable[Instance]:
        # tokenize context by spaces first, and then with the wordpiece tokenizer
        # For RoBERTa, this produces a bug where every token is marked as beginning-of-sentence. To fix it, we
        # detect whether a space comes before a word, and if so, add "a " in front of the word.
        def tokenize_slice(start: int, end: int) -> Iterable[Token]:
            text_to_tokenize = context[start:end]
            if start - 1 >= 0 and context[start - 1].isspace():
                prefix = "a "  # must end in a space, and be short so we can be sure it becomes only one token
                wordpieces = self._tokenizer.tokenize(prefix +
                                                      text_to_tokenize)
                for wordpiece in wordpieces:
                    if wordpiece.idx is not None:
                        wordpiece.idx -= len(prefix)
                return wordpieces[1:]
            else:
                return self._tokenizer.tokenize(text_to_tokenize)

        tokenized_context = []
        token_start = 0
        for i, c in enumerate(context):
            if c.isspace():
                for wordpiece in tokenize_slice(token_start, i):
                    if wordpiece.idx is not None:
                        wordpiece.idx += token_start
                    tokenized_context.append(wordpiece)
                token_start = i + 1
        for wordpiece in tokenize_slice(token_start, len(context)):
            if wordpiece.idx is not None:
                wordpiece.idx += token_start
            tokenized_context.append(wordpiece)

        if first_answer_offset is None:
            (token_answer_span_start, token_answer_span_end) = (-1, -1)
        else:
            (token_answer_span_start,
             token_answer_span_end), _ = char_span_to_token_span(
                 [(t.idx, t.idx + len(sanitize_wordpiece(t.text)))
                  if t.idx is not None else None for t in tokenized_context],
                 (first_answer_offset, first_answer_offset + len(answers[0])),
             )

        # Tokenize the question
        tokenized_question = self._tokenizer.tokenize(question)
        tokenized_question = tokenized_question[:self.max_query_length]

        # Stride over the context, making instances
        # Sequences are [CLS] question [SEP] [SEP] context [SEP], hence the - 4 for four special tokens.
        # This is technically not correct for anything but RoBERTa, but it does not affect the scores.
        space_for_context = (self.length_limit - len(tokenized_question) -
                             len(self._tokenizer.sequence_pair_start_tokens) -
                             len(self._tokenizer.sequence_pair_mid_tokens) -
                             len(self._tokenizer.sequence_pair_end_tokens))
        stride_start = 0
        while True:
            tokenized_context_window = tokenized_context[stride_start:]
            tokenized_context_window = tokenized_context_window[:space_for_context]

            window_token_answer_span = (
                token_answer_span_start - stride_start,
                token_answer_span_end - stride_start,
            )
            if any(i < 0 or i >= len(tokenized_context_window)
                   for i in window_token_answer_span):
                # The answer is not contained in the window.
                window_token_answer_span = None

            if not self.skip_invalid_examples or window_token_answer_span is not None:
                additional_metadata = {"id": qid}
                instance = self.text_to_instance(
                    question,
                    tokenized_question,
                    context,
                    tokenized_context_window,
                    answers,
                    window_token_answer_span,
                    additional_metadata,
                )
                yield instance

            stride_start += space_for_context
            if stride_start >= len(tokenized_context):
                break
            stride_start -= self.stride

    @overrides
    def text_to_instance(
        self,  # type: ignore
        question: str,
        tokenized_question: List[Token],
        context: str,
        tokenized_context: List[Token],
        answers: List[str],
        token_answer_span: Optional[Tuple[int, int]],
        additional_metadata: Dict[str, Any] = None,
    ) -> Instance:
        fields = {}

        # make the question field
        question_field = TextField(
            self._tokenizer.add_special_tokens(tokenized_question,
                                               tokenized_context),
            self._token_indexers,
        )
        fields["question_with_context"] = question_field
        start_of_context = (len(self._tokenizer.sequence_pair_start_tokens) +
                            len(tokenized_question) +
                            len(self._tokenizer.sequence_pair_mid_tokens))

        # make the answer span
        if token_answer_span is not None:
            assert all(i >= 0 for i in token_answer_span)
            assert token_answer_span[0] <= token_answer_span[1]

            fields["answer_span"] = SpanField(
                token_answer_span[0] + start_of_context,
                token_answer_span[1] + start_of_context,
                question_field,
            )
        else:
            # We have to put in something even when we don't have an answer, so that this instance can be batched
            # together with other instances that have answers.
            fields["answer_span"] = SpanField(-1, -1, question_field)

        # make the context span, i.e., the span of text from which possible answers should be drawn
        fields["context_span"] = SpanField(
            start_of_context, start_of_context + len(tokenized_context) - 1,
            question_field)

        # make the metadata
        metadata = {
            "question": question,
            "question_tokens": tokenized_question,
            "context": context,
            "context_tokens": tokenized_context,
            "answers": answers,
        }
        if additional_metadata is not None:
            metadata.update(additional_metadata)
        fields["metadata"] = MetadataField(metadata)

        return Instance(fields)
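The `tokenize_slice` helper above works around a quirk of byte-pair tokenizers like RoBERTa's,
which encode whether a token was preceded by a space. A small sketch of that behavior (the
printed token strings are indicative and depend on the tokenizer version):

```
from allennlp.data.tokenizers import PretrainedTransformerTokenizer

tokenizer = PretrainedTransformerTokenizer("roberta-base", add_special_tokens=False)
with_space = tokenizer.tokenize("a word")[1:]  # throwaway "a " prefix, then drop its token
without_space = tokenizer.tokenize("word")
print([t.text for t in with_space], [t.text for t in without_space])
# e.g. ['Ġword'] vs ['word'] -- the prefix preserves the leading-space marking
```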
Example #11
class RecordTaskReader(DatasetReader):
    """
    Reader for Reading Comprehension with Commonsense Reasoning(ReCoRD) task from SuperGLUE. The
    task is detailed in the paper ReCoRD: Bridging the Gap between Human and Machine Commonsense
    Reading Comprehension (arxiv.org/pdf/1810.12885.pdf) by Zhang et al. Leaderboards and the
    official evaluation script for the ReCoRD task can be found sheng-z.github.io/ReCoRD-explorer/.

    The reader reads a JSON file in the format from
    sheng-z.github.io/ReCoRD-explorer/dataset-readme.txt


    # Parameters

    transformer_model_name : `str`, optional (default=`'bert-base-cased'`)
        This reader chooses tokenizer and token indexer according to this setting.

    length_limit : `int`, optional (default=`384`)
        We will make sure that the length of context+question never exceeds this many word pieces.

    question_length_limit : `int`, optional (default=`64`)
        The maximum number of word pieces dedicated to the question. Longer questions are truncated.

    stride : `int`, optional (default=`128`)
        When context+question are too long for the length limit, we emit multiple instances for one
        question, where the context is shifted. This parameter specifies the overlap between the
        shifted context windows.

    raise_errors : `bool`, optional (default=`False`)
        Whether the reader should raise errors or just continue.

    tokenizer_kwargs : `Dict[str, Any]`, optional
        Additional keyword arguments passed to the tokenizer and token indexer.

    one_instance_per_query : `bool`, optional (default=`False`)
        If `True`, yield at most one instance (context window) per query.

    max_instances : `int`, optional (default=`None`)
        The maximum number of instances to read.

    kwargs : `Dict`
        Keyword arguments to be passed to the DatasetReader parent class constructor.

    """
    def __init__(
        self,
        transformer_model_name: str = "bert-base-cased",
        length_limit: int = 384,
        question_length_limit: int = 64,
        stride: int = 128,
        raise_errors: bool = False,
        tokenizer_kwargs: Dict[str, Any] = None,
        one_instance_per_query: bool = False,
        max_instances: int = None,
        **kwargs,
    ) -> None:
        """
        Initialize the RecordTaskReader.
        """
        super(RecordTaskReader,
              self).__init__(manual_distributed_sharding=True,
                             max_instances=max_instances,
                             **kwargs)

        self._kwargs = kwargs

        self._model_name = transformer_model_name
        self._tokenizer_kwargs = tokenizer_kwargs or {}
        # Save the values passed to __init__ to protected attributes
        self._tokenizer = PretrainedTransformerTokenizer(
            transformer_model_name,
            add_special_tokens=False,
            tokenizer_kwargs=tokenizer_kwargs,
        )
        self._token_indexers = {
            "tokens":
            PretrainedTransformerIndexer(transformer_model_name,
                                         tokenizer_kwargs=tokenizer_kwargs)
        }
        self._length_limit = length_limit
        self._query_len_limit = question_length_limit
        self._stride = stride
        self._raise_errors = raise_errors
        self._cls_token = "@placeholder"
        self._one_instance_per_query = one_instance_per_query

    def _to_params(self) -> Dict[str, Any]:
        """
        Get the configuration dictionary for this class.

        # Returns

        `Dict[str, Any]` The config dict.
        """
        return {
            "type": "superglue_record",
            "transformer_model_name": self._model_name,
            "length_limit": self._length_limit,
            "question_length_limit": self._query_len_limit,
            "stride": self._stride,
            "raise_errors": self._raise_errors,
            "tokenizer_kwargs": self._tokenizer_kwargs,
            "one_instance_per_query": self._one_instance_per_query,
            "max_instances": self.max_instances,
            **self._kwargs,
        }

    def _read(self, file_path: Union[Path, str]) -> Iterable[Instance]:
        # IF `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        # Read the 'data' key from the dataset
        logger.info(f"Reading '{file_path}'")
        with open(file_path) as fp:
            dataset = json.load(fp)["data"]
        logger.info(f"Found {len(dataset)} examples from '{file_path}'")

        # Keep track of certain stats while reading the file
        # examples_multiple_instance_count: The number of questions with more than
        #   one instance. Can happen because there are multiple queries for a
        #   single passage.
        # passages_yielded: The total number of instances found/yielded.
        examples_multiple_instance_count = 0
        examples_no_instance_count = 0
        passages_yielded = 0

        # Iterate through every example from the ReCoRD data file.
        for example in dataset:

            # Get the list of instances for the current example
            instances_for_example = self.get_instances_from_example(example)

            # Keep track of the number of instances yielded for this specific example.
            # Since instances_for_example is a generator, we do not know its length
            # ahead of time, so we keep a simple counter.
            instance_count = 0

            # Iterate through the instances and yield them.
            for instance in instances_for_example:
                yield instance
                instance_count += 1

            if instance_count == 0:
                logger.warning(f"Example '{example['id']}' had no instances.")
                examples_no_instance_count += 1

            # If there was more than one instance for this example, count it toward
            # examples_multiple_instance_count.
            examples_multiple_instance_count += 1 if instance_count > 1 else 0

            passages_yielded += instance_count

            # Check to see if we are over the max_instances to yield.
            if self.max_instances and passages_yielded > self.max_instances:
                logger.info("Passed max instances")
                break

        # Log pertinent information.
        if passages_yielded:
            logger.info(
                f"{examples_multiple_instance_count}/{passages_yielded} "
                f"({examples_multiple_instance_count / passages_yielded * 100:.2f}%) "
                f"examples had more than one instance")
            logger.info(
                f"{examples_no_instance_count}/{passages_yielded} "
                f"({examples_no_instance_count / passages_yielded * 100:.2f}%) "
                f"examples had no instances")
        else:
            logger.warning(f"Could not find any instances in '{file_path}'")

    def get_instances_from_example(
            self,
            example: Dict,
            always_add_answer_span: bool = False) -> Iterable[Instance]:
        """
        Helper function to get instances from an example.

        Much of this comes from `transformer_squad.make_instances`

        # Parameters

        example: `Dict[str,Any]`
            The example dict.

        # Returns:

        `Iterable[Instance]` The instances for each example
        """
        # Get the passage dict from the example; it has the text and entities.
        example_id: str = example["id"]
        passage_dict: Dict = example["passage"]
        passage_text: str = passage_dict["text"]

        # Tokenize the passage
        tokenized_passage: List[Token] = self.tokenize_str(passage_text)

        # TODO: Determine what to do with entities. Superglue marks them
        #   explicitly as input (https://arxiv.org/pdf/1905.00537.pdf)

        # Get the queries from the example dict
        queries: List = example["qas"]
        logger.debug(f"{len(queries)} queries for example {example_id}")

        # Tokenize and get the context windows for each query
        for query in queries:

            # Create the additional metadata dict that will be passed w/ extra
            # data for each query. We store the question & query ids, all
            # answers, and other data following `transformer_qa`.
            additional_metadata = {
                "id": query["id"],
                "example_id": example_id,
            }
            instances_yielded = 0
            # Tokenize, and truncate, the query based on the max set in
            # `__init__`
            tokenized_query = self.tokenize_str(
                query["query"])[:self._query_len_limit]

            # Calculate where the context needs to start and how many tokens we have
            # for it. This is due to the limit on the number of tokens that a
            # transformer can use because they have quadratic memory usage. But if
            # you are reading this code, you probably know that.
            space_for_context = (
                self._length_limit
                - len(list(tokenized_query))
                # Use getattr so this can be tested without having to load a transformer model.
                - len(getattr(self._tokenizer, "sequence_pair_start_tokens", []))
                - len(getattr(self._tokenizer, "sequence_pair_mid_tokens", []))
                - len(getattr(self._tokenizer, "sequence_pair_end_tokens", []))
            )

            # Check if answers exist for this query; if not, skip it. The answer token
            # span defaults to (-1, -1) until a matching answer is found below.
            answers = query.get("answers", [])
            if not answers:
                logger.warning(f"Skipping {query['id']}, no answers")
                continue

            # Create the arguments needed for `char_span_to_token_span`
            token_offsets = [(t.idx, t.idx + len(sanitize_wordpiece(t.text)))
                             if t.idx is not None else None
                             for t in tokenized_passage]

            # Get the token offsets for the answers for this current passage.
            answer_token_start, answer_token_end = (-1, -1)
            for answer in answers:

                # Try to find the offsets.
                offsets, _ = char_span_to_token_span(
                    token_offsets, (answer["start"], answer["end"]))

                # If offsets for an answer were found, it means the answer is in
                # the passage, and thus we can stop looking.
                if offsets != (-1, -1):
                    answer_token_start, answer_token_end = offsets
                    break

            # Go through the context and find the window that has the answer in it.
            stride_start = 0

            while True:
                tokenized_context_window = tokenized_passage[stride_start:]
                tokenized_context_window = tokenized_context_window[:space_for_context]

                # Get the token offsets w.r.t the current window.
                window_token_answer_span = (
                    answer_token_start - stride_start,
                    answer_token_end - stride_start,
                )
                if any(i < 0 or i >= len(tokenized_context_window)
                       for i in window_token_answer_span):
                    # The answer is not contained in the window.
                    window_token_answer_span = None

                # Unlike `transformer_squad`, the `skip_impossible_questions`
                # check is not used here.
                if window_token_answer_span is not None:
                    # The answer WAS found in the context window, and thus we
                    # can make an instance for the answer.
                    instance = self.text_to_instance(
                        query["query"],
                        tokenized_query,
                        passage_text,
                        tokenized_context_window,
                        answers=[answer["text"] for answer in answers],
                        token_answer_span=window_token_answer_span,
                        additional_metadata=additional_metadata,
                        always_add_answer_span=always_add_answer_span,
                    )
                    yield instance
                    instances_yielded += 1

                if instances_yielded == 1 and self._one_instance_per_query:
                    break

                stride_start += space_for_context

                # If we have reached the end of the passage, stop.
                if stride_start >= len(tokenized_passage):
                    break

                # Back the start of the next window up by `self._stride` so
                # consecutive windows overlap; this keeps an answer from being
                # cut off at a window boundary.
                stride_start -= self._stride
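                # For illustration (hypothetical numbers): with
                # space_for_context = 100 and self._stride = 20, the windows
                # cover tokens [0, 100), [80, 180), [160, 260), and so on,
                # each overlapping the previous one by 20 tokens.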

    def tokenize_slice(self,
                       text: str,
                       start: int = None,
                       end: int = None) -> Iterable[Token]:
        """
        Get and tokenize a span from the source text.

        *Originally from the `transformer_squad.py`*

        # Parameters

        text: `str`
            The text to draw from.
        start: `int`
            The start index for the span.
        end: `int`
            The end index for the span (exclusive), as in `text[start:end]`.

        # Returns

        `Iterable[Token]` List of tokens for the retrieved span.
        """
        start = start or 0
        end = end or len(text)
        text_to_tokenize = text[start:end]

        # If the span does not start at the beginning of the text and the
        # preceding character is a space, tokenize in a special way to work
        # around a quirk of the RoBERTa tokenizer (leading whitespace changes
        # how the first wordpiece is produced).
        if start - 1 >= 0 and text[start - 1].isspace():

            # Per the original tokenize_slice function, you need to add a
            # garbage token before the actual text you want to tokenize so that
            # the tokenizer does not add a beginning of sentence token.
            prefix = "a "

            # Tokenize the combined prefix and text
            wordpieces = self._tokenizer.tokenize(prefix + text_to_tokenize)

            # Go through each wordpiece in the tokenized wordpieces.
            for wordpiece in wordpieces:

                # Because we added the garbage prefix before tokenize, we need
                # to adjust the idx such that it accounts for this. Therefore we
                # subtract the length of the prefix from each token's idx.
                if wordpiece.idx is not None:
                    wordpiece.idx -= len(prefix)

            # We do not want the garbage token, so we return all but the first
            # token.
            return wordpieces[1:]
        else:

            # Do not need any sort of prefix, so just return all of the tokens.
            return self._tokenizer.tokenize(text_to_tokenize)
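    # Usage sketch (hypothetical values): tokenize_slice("Hello world", 6, 11)
    # tokenizes "world"; because the character at index 5 is a space, the
    # "a " prefix trick is applied and the garbage wordpiece is dropped.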

    def tokenize_str(self, text: str) -> List[Token]:
        """
        Helper method to tokenize a string.

        Adapted from the `transformer_squad.make_instances`

        # Parameters
            text: `str`
                The string to tokenize.

        # Returns

        `Iterable[Tokens]` The resulting tokens.

        """
        # We need to keep track of the current token index so that we can update
        # the results from self.tokenize_slice such that they reflect their
        # actual position in the string rather than their position in the slice
        # passed to tokenize_slice. Also used to construct the slice.
        token_index = 0

        # Create the output list (can be any iterable) that will store the
        # tokens we found.
        tokenized_str = []

        # Helper function to update the `idx` and add every wordpiece in the
        # `tokenized_slice` to the `tokenized_str`.
        def add_wordpieces(tokenized_slice: Iterable[Token]) -> None:
            for wordpiece in tokenized_slice:
                if wordpiece.idx is not None:
                    wordpiece.idx += token_index
                tokenized_str.append(wordpiece)

        # Iterate through every character and their respective index in the text
        # to create the slices to tokenize.
        for i, c in enumerate(text):

            # Check if the current character is a space. If it is, we tokenize
            # the slice of `text` from `token_index` to `i`.
            if c.isspace():
                add_wordpieces(self.tokenize_slice(text, token_index, i))
                token_index = i + 1

        # Add the end slice that is not collected by the for loop.
        add_wordpieces(self.tokenize_slice(text, token_index, len(text)))

        return tokenized_str
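    # Usage sketch (hypothetical values): tokenize_str("AllenNLP is great")
    # tokenizes the whitespace-delimited slices "AllenNLP", "is", and "great"
    # separately, shifting each wordpiece's `idx` back into the coordinates of
    # the full string.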

    @staticmethod
    def get_spans_from_text(text: str, spans: List[Tuple[int,
                                                         int]]) -> List[str]:
        """
        Helper function to get a span from a string

        # Parameter

        text: `str`
            The source string
        spans: `List[Tuple[int,int]]`
            List of start and end indices for spans.

            Assumes that the end index is inclusive. Therefore, for start
            index `i` and end index `j`, retrieves the span at `text[i:j+1]`.

        # Returns

        `List[str]` The extracted string from text.
        """
        return [text[start:end + 1] for start, end in spans]
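    # Usage sketch (hypothetical values):
    #   get_spans_from_text("The quick brown fox", [(4, 8), (10, 14)])
    #   returns ['quick', 'brown'], since the end indices are inclusive.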

    def text_to_instance(
        self,
        query: str,
        tokenized_query: List[Token],
        passage: str,
        tokenized_passage: List[Token],
        answers: List[str],
        token_answer_span: Optional[Tuple[int, int]] = None,
        additional_metadata: Optional[Dict[str, Any]] = None,
        always_add_answer_span: Optional[bool] = False,
    ) -> Instance:
        """
        Much of this comes directly from `transformer_squad.text_to_instance`.
        """
        fields = {}

        # Create the query field from the tokenized question and context. Use
        # `self._tokenizer.add_special_tokens` function to add the necessary
        # special tokens to the query.
        query_field = TextField(
            self._tokenizer.add_special_tokens(
                # The `add_special_tokens` function automatically adds in the
                # separation token to mark the separation between the two lists of
                # tokens. Therefore, we can create the query field WITH context
                # through passing them both as arguments.
                tokenized_query,
                tokenized_passage,
            ),
            self._token_indexers,
        )

        # Add the query field to the fields dict that will be outputted as an
        # instance. Do it here rather than assign above so that we can use
        # attributes from `query_field` rather than continuously indexing
        # `fields`.
        fields["question_with_context"] = query_field

        # Calculate the index that marks the start of the context.
        # getattr is used so this can be tested without loading a
        # transformer model.
        start_of_context = (
            len(tokenized_query)
            + len(getattr(self._tokenizer, "sequence_pair_start_tokens", []))
            + len(getattr(self._tokenizer, "sequence_pair_mid_tokens", []))
        )

        # make the answer span
        if token_answer_span is not None:
            assert all(i >= 0 for i in token_answer_span)
            assert token_answer_span[0] <= token_answer_span[1]

            fields["answer_span"] = SpanField(
                token_answer_span[0] + start_of_context,
                token_answer_span[1] + start_of_context,
                query_field,
            )
        # make the context span, i.e., the span of text from which possible
        # answers should be drawn
        fields["context_span"] = SpanField(
            start_of_context, start_of_context + len(tokenized_passage) - 1,
            query_field)

        # make the metadata
        metadata = {
            "question": query,
            "question_tokens": tokenized_query,
            "context": passage,
            "context_tokens": tokenized_passage,
            "answers": answers or [],
        }
        if additional_metadata is not None:
            metadata.update(additional_metadata)
        fields["metadata"] = MetadataField(metadata)

        return Instance(fields)

    def _find_cls_index(self, tokens: List[Token]) -> int:
        """
        From transformer_squad
        Args:
            self:
            tokens:

        Returns:

        """
        return next(i for i, t in enumerate(tokens)
                    if t.text == self._cls_token)
Example #12
class FakeReader(DatasetReader):
    """
    Creates fake multiple-choice input. If your model doesn't get 99% on this data, it is broken.

    Instances have two fields:
     * `alternatives`, a ListField of TextField
     * `correct_alternative`, IndexField with the correct answer among `alternatives`

    Parameters
    ----------
    transformer_model_name : `str`, optional (default=`roberta-large`)
        This reader chooses tokenizer and token indexer according to this setting.
    length_limit : `int`, optional (default=512)
        We will make sure that the length of the alternatives never exceeds this many word pieces.
    """
    def __init__(self,
                 transformer_model_name: str = "roberta-large",
                 length_limit: int = 512,
                 **kwargs) -> None:
        super().__init__(**kwargs)

        if self.max_instances is None:
            raise ValueError("FakeReader requires max_instances to be set.")

        from allennlp.data.tokenizers import PretrainedTransformerTokenizer

        self._tokenizer = PretrainedTransformerTokenizer(
            transformer_model_name, add_special_tokens=False)

        from allennlp.data.token_indexers import PretrainedTransformerIndexer

        self._token_indexers = {
            "tokens": PretrainedTransformerIndexer(transformer_model_name)
        }
        self.length_limit = length_limit

    def _read(self, file_path: str):
        logger.info("Ignoring file at %s", file_path)

        for i in range(self.max_instances):
            label = i % 2
            texts = [f"This is the false choice {i}."] * 2
            texts[label] = f"This is the true choice {i}."
            yield self.text_to_instance(texts, label)

    def text_to_instance(
        self,  # type: ignore
        alternatives: List[str],
        correct_alternative: int,
    ) -> Instance:
        # tokenize
        alternatives = [
            self._tokenizer.tokenize(alternative)
            for alternative in alternatives
        ]

        # add special tokens
        alternatives = [
            self._tokenizer.add_special_tokens(alternative)
            for alternative in alternatives
        ]

        # make fields
        from allennlp.data.fields import TextField

        alternatives = [
            TextField(alternative, self._token_indexers)
            for alternative in alternatives
        ]
        if correct_alternative < 0 or correct_alternative >= len(alternatives):
            raise ValueError("Alternative %d does not exist.",
                             correct_alternative)
        from allennlp.data.fields import ListField

        alternatives = ListField(alternatives)

        from allennlp.data.fields import IndexField

        return Instance({
            "alternatives":
            alternatives,
            "correct_alternative":
            IndexField(correct_alternative, alternatives),
        })
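
# A minimal usage sketch (hypothetical): FakeReader refuses to run without
# `max_instances`, so construct it as, e.g.,
#
#     reader = FakeReader(max_instances=16)
#     instances = list(reader.read("ignored.txt"))
#
# Each instance holds an `alternatives` ListField with two TextFields and a
# `correct_alternative` IndexField marking the true choice.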
Example #13
class WorldTreeReader(DatasetReader):
    """

    """
    def __init__(self,
                 transformer_model_name: str = "roberta-large",
                 **kwargs) -> None:
        super().__init__(**kwargs)

        self._tokenizer = PretrainedTransformerTokenizer(
            transformer_model_name, add_special_tokens=False)

        self._token_indexers = {
            "tokens": PretrainedTransformerIndexer(transformer_model_name)
        }

    @overrides
    def _read(self, file_path: str) -> Iterable[Instance]:
        """


        """
        with open(cached_path(file_path), "r") as data_file:

            logger.info("Reading questions from file at: %s", file_path)

            df = pd.read_csv(data_file, delimiter="\t")

            for _, row in df.iterrows():
                qid = row["QuestionID"]
                raw_question = row["question"]
                question, choices = parse_raw_question(raw_question)
                answer = row["AnswerKey"]
                answer_idx = answser_to_index(answer)
                yield self.text_to_instance(qid, question, choices, answer_idx)

    @overrides
    def text_to_instance(
        self,  # type: ignore
        qid: str,
        question: str,
        choices: List[str],
        answer_idx: Optional[int] = None,
    ) -> Instance:

        # **A hack**
        # We need to make each question have exactly four choices to process them in batches.
        # Either drop the choice or use a non-answer one to fill.
        if len(choices) == 5:
            if answer_idx != 4:  # Just drop the last choice
                choices = choices[:-1]
            elif answer_idx == 4:  # Answer is the last, so drop the first
                choices = choices[1:]
                answer_idx -= 1
        elif len(choices) == 3:
            if answer_idx != 2:  # Use the last to fill
                choices.append(choices[-1])
            else:  # Use the first to fill
                choices.append(choices[0])
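        # For illustration (hypothetical values): ["A", "B", "C", "D", "E"]
        # with answer_idx=2 drops "E"; with answer_idx=4 it drops "A" and the
        # answer shifts to index 3. ["A", "B", "C"] with answer_idx=0 appends
        # "C" again to give ["A", "B", "C", "C"].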

        # Base checks
        assert len(choices) == 4
        if answer_idx < 0 or answer_idx >= len(choices):
            raise ValueError(f"Choice {answer_idx} does not exist.")

        # Tokenize the question
        question_tokens = self._tokenizer.tokenize(question)

        # Tokenize the choices and concatenate each with the question into
        # question-choice pairs.
        qc_pairs = []
        for choice in choices:
            choice_tokens = self._tokenizer.tokenize(choice)
            qc_pair = self._tokenizer.add_special_tokens(
                question_tokens, choice_tokens)
            qc_pairs.append(qc_pair)

        # Wrap them into AllenNLP fields
        qc_pairs = [TextField(pair, self._token_indexers) for pair in qc_pairs]
        qc_pairs = ListField(qc_pairs)
        answer_idx = IndexField(answer_idx, qc_pairs)
        metadata = MetadataField({
            "id": qid,
            "question": question,
            "choices": choices
        })

        return Instance({
            "qc_pairs": qc_pairs,
            "answer_idx": answer_idx,
            "metadata": metadata,
        })
Example #14
class BertDatasetReader(DatasetReader):
    """
    Reads a file from ProPara state change dataset.  This data is formatted as TSV, one instance per line.
    Format: "Query \t\t\t step \t\t\t state_change_types"
    state_change_types: string label applicable to this datapoint

    We convert these columns into fields named 
    "tokens", 
    "state_change_types".

    Parameters
    ----------
    token_indexers : ``Dict[str, TokenIndexer]``, optional (default=``{"tokens": BertTokenIndexer()}``)
    IMPORTANT NOTE: All components like tokeniser, token-indexer, token embedder and Seq2VecEncoder should be of Bert-type.
    """

    def __init__(self,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 lazy: bool = False) -> None:
        super().__init__(lazy)
        self.transformer_model = "bert-base-uncased"
        self.tokenizer = PretrainedTransformerTokenizer(
            model_name=self.transformer_model,
            add_special_tokens=False,
            max_length=512)
        self.token_indexer = PretrainedTransformerIndexer(
            model_name=self.transformer_model, max_length=512)
#         self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}

    @overrides
    def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        instances = []
        with open(file_path, 'r') as state_change_file:
            logger.info("Reading state change instances from TSV dataset at: %s", file_path)
            for line in tqdm.tqdm(state_change_file):
                parts: List[str] = line.split('\t\t\t')
                query_text = parts[0].lower()
                query_tokens = self.tokenizer.tokenize(query_text)
                step_text = parts[1].lower()
                step_tokens = self.tokenizer.tokenize(step_text)
                combined_tokens = self.tokenizer.add_special_tokens(query_tokens, step_tokens)

                # parse labels
                state_change_types = parts[2].strip()

                # create instance
                yield self.text_to_instance(
                    combined_tokens=combined_tokens,
                    state_change_types=state_change_types)
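
    # Example input line (hypothetical):
    #   "What happens to the water?\t\t\tThe water evaporates.\t\t\tCREATE"
    # The query and step are tokenized separately, joined with BERT's special
    # tokens ([CLS] ... [SEP] ... [SEP]), and "CREATE" becomes the label.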


    @overrides
    def text_to_instance(self,  # type: ignore
                         combined_tokens: List[Token],
                         state_change_types: Optional[str] = None) -> Instance:
        # pylint: disable=arguments-differ
        fields: Dict[str, Field] = {}
#         print(verb_vector)
        # encode inputs
        token_field = TextField(combined_tokens, {'tokens': self.token_indexer})
#         token_field.index(vocab)
        fields['tokens'] = token_field
#         fields['verb_span'] = SequenceLabelField(verb_vector, token_field, 'indicator_tags')
#         fields['entity_span'] = SequenceLabelField(entity_vector, token_field, 'indicator_tags')

        # encode outputs
        if state_change_types:
            fields['state_change_type_labels'] = LabelField(state_change_types, 'state_change_type_labels')
        return Instance(fields)

    @classmethod
    def from_params(cls, params: Params,constructor_to_call=None, constructor_to_inspect=None) -> 'BertDatasetReader':
#         token_indexers = TokenIndexer()
#         print(params.pop("token_indexer", {}))
#         token_indexers = {'tokens': SingleIdTokenIndexer()}
#         params.assert_empty(cls.__name__)
        return BertDatasetReader()