def text_to_instance(
            self,  # type: ignore
            tokens: List[str],
            lemmas: List[str] = None,
            pos_tags: List[str] = None,
            arc_indices: List[Tuple[int, int]] = None,
            arc_tags: List[str] = None,
            gold_actions: List[str] = None,
            root_id: List[int] = None,
            meta_info: List[str] = None,
            concept_label: List[int] = None,
            tokens_range: List[Tuple[int, int]] = None,
            gold_mrps: List[str] = None) -> Instance:
        # pylint: disable=arguments-differ
        fields: Dict[str, Field] = {}
        token_field = TextField([Token(t) for t in tokens],
                                self._token_indexers)

        fields["tokens"] = token_field
        meta_dict = {"tokens": tokens}

        if lemmas is not None and self._lemma_indexers is not None:
            fields["lemmas"] = TextField([Token(l) for l in lemmas],
                                         self._lemma_indexers)
        if pos_tags is not None:
            fields["pos_tags"] = SequenceLabelField(pos_tags,
                                                    token_field,
                                                    label_namespace="pos")

        if arc_indices is not None and arc_tags is not None:
            meta_dict["arc_indices"] = arc_indices
            meta_dict["arc_tags"] = arc_tags
            fields["arc_tags"] = TextField([Token(a) for a in arc_tags],
                                           self._arc_tag_indexers)

        if gold_actions is not None:
            meta_dict["gold_actions"] = gold_actions
            fields["gold_actions"] = TextField(
                [Token(a) for a in gold_actions], self._action_indexers)

        if meta_info is not None:
            meta_dict["meta_info"] = meta_info[0]

        if gold_mrps is not None:
            meta_dict["gold_mrps"] = gold_mrps[0]

        if tokens_range is not None:
            meta_dict["tokens_range"] = tokens_range

        if concept_label is not None:
            meta_dict["concept_label"] = concept_label
            fields["concept_label"] = TextField(
                [Token(a) for a in concept_label],
                self._concept_label_indexers)

        if root_id is not None:
            meta_dict["root_id"] = root_id[0]

        fields["metadata"] = MetadataField(meta_dict)

        return Instance(fields)
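
For reference, a minimal sketch of assembling such an instance by hand, assuming standard AllenNLP imports and a SingleIdTokenIndexer (an assumption; the reader above is configured with its own indexers):

from allennlp.data import Instance, Token
from allennlp.data.fields import MetadataField, TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer

# Hypothetical indexer config; the reader above injects its own indexers.
indexers = {"tokens": SingleIdTokenIndexer()}
tokens = ["The", "cat", "sat"]
token_field = TextField([Token(t) for t in tokens], indexers)
fields = {"tokens": token_field,
          "metadata": MetadataField({"tokens": tokens})}
instance = Instance(fields)
print(instance)
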
Example #2
    def text_to_instance(
            self,  # type: ignore
            tokens: List[str],
            pos_tags: List[str] = None,
            gold_tree: Tree = None) -> Instance:
        """
        We take `pre-tokenized` input here, because we don't have a tokenizer in this class.

        Parameters
        ----------
        tokens : ``List[str]``, required.
            The tokens in a given sentence.
        pos_tags : ``List[str]``, optional, (default = None).
            The POS tags for the words in the sentence.
        gold_tree : ``Tree``, optional (default = None).
            The gold parse tree to create span labels from.

        Returns
        -------
        An ``Instance`` containing the following fields:
            tokens : ``TextField``
                The tokens in the sentence.
            pos_tags : ``SequenceLabelField``
                The POS tags of the words in the sentence.
                Only returned if ``use_pos_tags`` is ``True``.
            spans : ``ListField[SpanField]``
                A ListField containing all possible subspans of the
                sentence.
            span_labels : ``SequenceLabelField``, optional.
                The constituency tags for each of the possible spans, with
                respect to a gold parse tree. If a span is not contained
                within the tree, a span will have a ``NO-LABEL`` label.
            gold_tree : ``MetadataField(Tree)``
                The gold NLTK parse tree for use in evaluation.
        """
        # pylint: disable=arguments-differ
        text_field = TextField([Token(x) for x in tokens],
                               token_indexers=self._token_indexers)
        fields = {"tokens": text_field}

        if self._use_pos_tags and pos_tags is not None:
            pos_tag_field = SequenceLabelField(pos_tags,
                                               text_field,
                                               label_namespace="pos")
            fields["pos_tags"] = pos_tag_field
        elif self._use_pos_tags:
            raise ConfigurationError(
                "use_pos_tags was set to True but no gold pos"
                " tags were passed to the dataset reader.")
        spans = []
        gold_labels = []

        if gold_tree is not None:
            gold_spans = {}
            self._get_gold_spans(gold_tree, 0, gold_spans)
        else:
            gold_spans = None
        for start, end in enumerate_spans(tokens):
            spans.append(SpanField(start, end, text_field))

            if gold_spans is not None:
                gold_labels.append(gold_spans.get((start, end), "NO-LABEL"))

        metadata = {"tokens": tokens}
        if gold_tree:
            metadata["gold_tree"] = gold_tree
        if self._use_pos_tags:
            metadata["pos_tags"] = pos_tags

        fields["metadata"] = MetadataField(metadata)

        span_list_field = ListField(spans)
        fields["spans"] = span_list_field
        if gold_tree is not None:
            fields["span_labels"] = SequenceLabelField(
                gold_labels, span_list_field)
        return Instance(fields)
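
The span enumeration above comes from AllenNLP's `enumerate_spans` helper; as a plain-Python sketch (no library needed), with no width limit it yields every inclusive (start, end) token span:

# Plain-Python equivalent of enumerate_spans with no max_span_width,
# shown on a made-up 3-token sentence.
tokens = ["The", "cat", "sat"]
spans = [(start, end)
         for start in range(len(tokens))
         for end in range(start, len(tokens))]
print(spans)  # [(0, 0), (0, 1), (0, 2), (1, 1), (1, 2), (2, 2)]
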
Example #3
    def predictions_to_labeled_instances(
            self, instance: Instance,
            outputs: Dict[str, numpy.ndarray]) -> List[Instance]:
        """
        This function currently only handles BIOUL tags.

        Imagine an NER model predicts three named entities (each one with potentially
        multiple tokens). For each individual entity, we create a new Instance that has
        the label set to only that entity and the rest of the tokens are labeled as outside.
        We then return a list of those Instances.

        For example:
        Mary  went to Seattle to visit Microsoft Research
        U-Per  O    O   U-Loc  O   O     B-Org     L-Org

        We create three instances.
        Mary  went to Seattle to visit Microsoft Research
        U-Per  O    O    O     O   O       O         O

        Mary  went to Seattle to visit Microsoft Research
        O      O    O   U-LOC  O   O       O         O

        Mary  went to Seattle to visit Microsoft Research
        O      O    O    O     O   O     B-Org     L-Org
        """
        predicted_tags = outputs["tags"]
        predicted_spans = []

        i = 0
        while i < len(predicted_tags):
            tag = predicted_tags[i]
            # If it's a U tag, the entity is a single token; add it to the list.
            if tag[0] == "U":
                current_tags = [
                    t if idx == i else "O"
                    for idx, t in enumerate(predicted_tags)
                ]
                predicted_spans.append(current_tags)
            # If it's a B tag, keep going until we hit an L tag.
            elif tag[0] == "B":
                begin_idx = i
                while tag[0] != "L":
                    i += 1
                    tag = predicted_tags[i]
                end_idx = i
                current_tags = [
                    t if begin_idx <= idx <= end_idx else "O"
                    for idx, t in enumerate(predicted_tags)
                ]
                predicted_spans.append(current_tags)
            i += 1

        # Create a new instance for each predicted entity.
        instances = []
        for labels in predicted_spans:
            new_instance = deepcopy(instance)
            text_field: TextField = instance["tokens"]  # type: ignore
            new_instance.add_field("tags",
                                   SequenceLabelField(labels, text_field),
                                   self._model.vocab)
            instances.append(new_instance)
        # NER tags are in the opposite order as desired for the interpret UI.
        instances.reverse()

        return instances
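
The tag-splitting loop above can be checked in isolation; here is the same logic as a standalone sketch on the docstring's example tags (made-up data, no AllenNLP needed):

predicted_tags = ["U-Per", "O", "O", "U-Loc", "O", "O", "B-Org", "L-Org"]

predicted_spans = []
i = 0
while i < len(predicted_tags):
    tag = predicted_tags[i]
    if tag[0] == "U":  # single-token entity
        predicted_spans.append(
            [t if idx == i else "O" for idx, t in enumerate(predicted_tags)])
    elif tag[0] == "B":  # multi-token entity: scan forward to the L tag
        begin_idx = i
        while tag[0] != "L":
            i += 1
            tag = predicted_tags[i]
        predicted_spans.append(
            [t if begin_idx <= idx <= i else "O"
             for idx, t in enumerate(predicted_tags)])
    i += 1

for span_tags in predicted_spans:
    print(span_tags)  # one tag sequence per entity, everything else "O"
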
Example #4
    def predictions_to_labeled_instances(
            self, instance: Instance,
            outputs: Dict[str, numpy.ndarray]) -> List[Instance]:
        new_instance = deepcopy(instance)
        # For BiDAF
        if "best_span" in outputs:
            span_start_label = outputs["best_span"][0]
            span_end_label = outputs["best_span"][1]
            passage_field: SequenceField = new_instance[
                "passage"]  # type: ignore
            new_instance.add_field(
                "span_start", IndexField(int(span_start_label), passage_field))
            new_instance.add_field(
                "span_end", IndexField(int(span_end_label), passage_field))

        # For the NAQANet model, which has the fields answer_as_passage_spans,
        # answer_as_question_spans, answer_as_add_sub_expressions, and
        # answer_as_counts. We need labels for all of them.
        elif "answer" in outputs:
            answer_type = outputs["answer"]["answer_type"]

            # When the problem is a counting problem
            if answer_type == "count":
                field = ListField([
                    LabelField(int(outputs["answer"]["count"]),
                               skip_indexing=True)
                ])
                new_instance.add_field("answer_as_counts", field)

            # When the answer is in the passage
            elif answer_type == "passage_span":
                # TODO(mattg): Currently we only handle one predicted span.
                span = outputs["answer"]["spans"][0]

                # Convert character span indices into word span indices
                word_span_start = None
                word_span_end = None
                offsets = new_instance["metadata"].metadata[
                    "passage_token_offsets"]  # type: ignore
                for index, offset in enumerate(offsets):
                    if offset[0] == span[0]:
                        word_span_start = index
                    if offset[1] == span[1]:
                        word_span_end = index

                passage_field: SequenceField = new_instance[
                    "passage"]  # type: ignore
                field = ListField(
                    [SpanField(word_span_start, word_span_end, passage_field)])
                new_instance.add_field("answer_as_passage_spans", field)

            # When the answer is an arithmetic calculation
            elif answer_type == "arithmetic":
                # The different numbers in the passage that the model encounters
                sequence_labels = outputs["answer"]["numbers"]

                numbers_field: ListField = instance[
                    "number_indices"]  # type: ignore

                # The numbers in the passage are given signs; that is what we are
                # labeling here. Negative signs get the class label 2 (for signs
                # 0 and 1, the sign matches the label).
                labels = []
                for label in sequence_labels:
                    if label["sign"] == -1:
                        labels.append(2)
                    else:
                        labels.append(label["sign"])
                # There's a dummy number added in the dataset reader to handle passages with no
                # numbers; it has a label of 0 (not included).
                labels.append(0)

                field = ListField([SequenceLabelField(labels, numbers_field)])
                new_instance.add_field("answer_as_add_sub_expressions", field)

            # When the answer is in the question
            elif answer_type == "question_span":
                span = outputs["answer"]["spans"][0]

                # Convert character span indices into word span indices
                word_span_start = None
                word_span_end = None
                question_offsets = new_instance[
                    "metadata"].metadata[  # type: ignore
                        "question_token_offsets"]
                for index, offset in enumerate(question_offsets):
                    if offset[0] == span[0]:
                        word_span_start = index
                    if offset[1] == span[1]:
                        word_span_end = index

                question_field: SequenceField = new_instance[
                    "question"]  # type: ignore
                field = ListField([
                    SpanField(word_span_start, word_span_end, question_field)
                ])
                new_instance.add_field("answer_as_question_spans", field)

        return [new_instance]
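
The character-to-word span conversion used for both the passage and question answers relies only on token offsets; a small sketch with made-up offsets:

# offsets[i] is the (start_char, end_char) pair of token i, as stored in the
# instance metadata above. The character span (4, 16) covers tokens 1..2.
offsets = [(0, 3), (4, 9), (10, 16), (17, 20)]
char_span = (4, 16)

word_span_start = word_span_end = None
for index, (char_start, char_end) in enumerate(offsets):
    if char_start == char_span[0]:
        word_span_start = index
    if char_end == char_span[1]:
        word_span_end = index

print(word_span_start, word_span_end)  # 1 2
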
Example #5
    def text_to_instance(
        self,  # type: ignore
        tokens: List[str],
        ccg_categories: List[str] = None,
        original_pos_tags: List[str] = None,
        modified_pos_tags: List[str] = None,
        predicate_arg_categories: List[str] = None,
    ) -> Instance:
        """
        We take `pre-tokenized` input here, because we don't have a tokenizer in this class.

        # Parameters

        tokens : `List[str]`, required.
            The tokens in a given sentence.
        ccg_categories : `List[str]`, optional, (default = `None`).
            The CCG categories for the words in the sentence. (e.g. N/N)
        original_pos_tags : `List[str]`, optional, (default = `None`).
            The tag assigned to the word in the Penn Treebank.
        modified_pos_tags : `List[str]`, optional, (default = `None`).
            The POS tag might have changed during the translation to CCG.
        predicate_arg_categories : `List[str]`, optional, (default = `None`).
            Encodes the word-word dependencies in the underlying predicate-
            argument structure.

        # Returns

        An `Instance` containing the following fields:
            tokens : `TextField`
                The tokens in the sentence.
            tags : `SequenceLabelField`
                The tags corresponding to the `tag_label` constructor argument.
            feature_label_tags : `SequenceLabelField`
                Tags corresponding to each feature_label (if any) specified in the
                `feature_labels` constructor argument.
        """

        text_field = TextField([Token(x) for x in tokens],
                               token_indexers=self._token_indexers)
        fields: Dict[str, Field] = {"tokens": text_field}

        # Add "feature labels" to instance
        if "ccg" in self.feature_labels:
            if ccg_categories is None:
                raise ConfigurationError(
                    "Dataset reader was specified to use CCG categories as "
                    "features. Pass them to text_to_instance.")
            fields["ccg_tags"] = SequenceLabelField(ccg_categories, text_field,
                                                    "ccg_tags")
        if "original_pos" in self.feature_labels:
            if original_pos_tags is None:
                raise ConfigurationError(
                    "Dataset reader was specified to use original POS tags as "
                    "features. Pass them to text_to_instance.")
            fields["original_pos_tags"] = SequenceLabelField(
                original_pos_tags, text_field, "original_pos_tags")
        if "modified_pos" in self.feature_labels:
            if modified_pos_tags is None:
                raise ConfigurationError(
                    "Dataset reader was specified to use modified POS tags as "
                    " features. Pass them to text_to_instance.")
            fields["modified_pos_tags"] = SequenceLabelField(
                modified_pos_tags, text_field, "modified_pos_tags")
        if "predicate_arg" in self.feature_labels:
            if predicate_arg_categories is None:
                raise ConfigurationError(
                    "Dataset reader was specified to use predicate arg tags as "
                    " features. Pass them to text_to_instance.")
            fields["predicate_arg_tags"] = SequenceLabelField(
                predicate_arg_categories, text_field, "predicate_arg_tags")

        # Add "tag label" to instance
        if self.tag_label == "ccg" and ccg_categories is not None:
            fields["tags"] = SequenceLabelField(ccg_categories, text_field,
                                                self.label_namespace)
        elif self.tag_label == "original_pos" and original_pos_tags is not None:
            fields["tags"] = SequenceLabelField(original_pos_tags, text_field,
                                                self.label_namespace)
        elif self.tag_label == "modified_pos" and modified_pos_tags is not None:
            fields["tags"] = SequenceLabelField(modified_pos_tags, text_field,
                                                self.label_namespace)
        elif self.tag_label == "predicate_arg" and predicate_arg_categories is not None:
            fields["tags"] = SequenceLabelField(predicate_arg_categories,
                                                text_field,
                                                self.label_namespace)

        return Instance(fields)
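
A minimal sketch of attaching one of these feature-label fields by hand, assuming a SingleIdTokenIndexer (an assumption; the reader's actual indexers come from its constructor):

from allennlp.data import Token
from allennlp.data.fields import SequenceLabelField, TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer

text_field = TextField([Token(w) for w in ["I", "saw", "her"]],
                       {"tokens": SingleIdTokenIndexer()})
# One label per token; the "ccg_tags" namespace keeps CCG categories in a
# vocabulary separate from the main tag namespace.
ccg_field = SequenceLabelField(["NP", "(S\\NP)/NP", "NP"], text_field, "ccg_tags")
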
Example #6
    def text_to_instance(
        self,  # type: ignore
        tokens: List[str],
        pos_tags: List[str] = None,
        gold_tree: Tree = None,
    ) -> Instance:
        """
        We take `pre-tokenized` input here, because we don't have a tokenizer in this class.

        Parameters
        ----------
        tokens : ``List[str]``, required.
            The tokens in a given sentence.
        pos_tags : ``List[str]``, optional, (default = None).
            The POS tags for the words in the sentence.
        gold_tree : ``Tree``, optional (default = None).
            The gold parse tree to create span labels from.

        Returns
        -------
        An ``Instance`` containing the following fields:
            tokens : ``TextField``
                The tokens in the sentence.
            pos_tags : ``SequenceLabelField``
                The POS tags of the words in the sentence.
                Only returned if ``use_pos_tags`` is ``True``.
            spans : ``ListField[SpanField]``
                A ListField containing all possible subspans of the
                sentence.
            span_labels : ``SequenceLabelField``, optional.
                The constituency tags for each of the possible spans, with
                respect to a gold parse tree. If a span is not contained
                within the tree, a span will have a ``NO-LABEL`` label.
            gold_tree : ``MetadataField(Tree)``
                The gold NLTK parse tree for use in evaluation.
        """

        if self._convert_parentheses:
            tokens = [PTB_PARENTHESES.get(token, token) for token in tokens]
        text_field = TextField([Token(x) for x in tokens],
                               token_indexers=self._token_indexers)
        fields: Dict[str, Field] = {"tokens": text_field}

        pos_namespace = self._label_namespace_prefix + self._pos_label_namespace
        if self._use_pos_tags and pos_tags is not None:
            pos_tag_field = SequenceLabelField(pos_tags,
                                               text_field,
                                               label_namespace=pos_namespace)
            fields["pos_tags"] = pos_tag_field
        elif self._use_pos_tags:
            raise ConfigurationError(
                "use_pos_tags was set to True but no gold pos"
                " tags were passed to the dataset reader.")
        spans: List[Field] = []
        gold_labels = []

        if gold_tree is not None:
            gold_spans: Dict[Tuple[int, int], str] = {}
            self._get_gold_spans(gold_tree, 0, gold_spans)

        else:
            gold_spans = None
        for start, end in enumerate_spans(tokens):
            spans.append(SpanField(start, end, text_field))

            if gold_spans is not None:
                gold_labels.append(gold_spans.get((start, end), "NO-LABEL"))

        metadata = {"tokens": tokens}
        if gold_tree:
            metadata["gold_tree"] = gold_tree
        if self._use_pos_tags:
            metadata["pos_tags"] = pos_tags

        fields["metadata"] = MetadataField(metadata)

        span_list_field: ListField = ListField(spans)
        fields["spans"] = span_list_field
        if gold_tree is not None:
            fields["span_labels"] = SequenceLabelField(
                gold_labels,
                span_list_field,
                label_namespace=self._label_namespace_prefix + "labels",
            )
        return Instance(fields)
Example #7
def make_reading_comprehension_instance_quac(
        question_list_tokens: List[List[Token]],
        passage_tokens: List[Token],
        token_indexers: Dict[str, TokenIndexer],
        passage_text: str,
        token_span_lists: List[List[Tuple[int, int]]] = None,
        yesno_list: List[int] = None,
        followup_list: List[int] = None,
        additional_metadata: Dict[str, Any] = None,
        num_context_answers: int = 0) -> Instance:
    """
    Converts a question, a passage, and an optional answer (or answers) to an ``Instance`` for use
    in a reading comprehension model.

    Creates an ``Instance`` with at least these fields: ``question`` and ``passage``, both
    ``TextFields``; and ``metadata``, a ``MetadataField``.  Additionally, if both ``answer_texts``
    and ``char_span_starts`` are given, the ``Instance`` has ``span_start`` and ``span_end``
    fields, which are both ``IndexFields``.

    Parameters
    ----------
    question_list_tokens : ``List[List[Token]]``
        An already-tokenized list of questions. Each dialog has multiple questions.
    passage_tokens : ``List[Token]``
        An already-tokenized passage that contains the answer to the given question.
    token_indexers : ``Dict[str, TokenIndexer]``
        Determines how the question and passage ``TextFields`` will be converted into tensors that
        get input to a model.  See :class:`TokenIndexer`.
    passage_text : ``str``
        The original passage text.  We need this so that we can recover the actual span from the
        original passage that the model predicts as the answer to the question.  This is used in
        official evaluation scripts.
    token_span_lists : ``List[List[Tuple[int, int]]]``, optional
        Indices into ``passage_tokens`` to use as the answer to the question for training.  This is
        a list of lists: first because there are multiple questions per dialog, and
        second because there might be several possible correct answer spans in the passage.
        Currently, we just select the last span in this list (i.e., QuAC has multiple
        annotations on the dev set; this will select the last span, which was given by the original annotator).
    yesno_list : ``List[int]``
        List of the affirmation bit for each question-answer pair.
    followup_list : ``List[int]``
        List of the continuation bit for each question-answer pair.
    num_context_answers : ``int``, optional
        How many answers to encode into the passage.
    additional_metadata : ``Dict[str, Any]``, optional
        The constructed ``metadata`` field will by default contain ``original_passage``,
        ``token_offsets``, ``question_tokens``, ``passage_tokens``, and ``answer_texts`` keys.  If
        you want any other metadata to be associated with each instance, you can pass that in here.
        This dictionary will get added to the ``metadata`` dictionary we already construct.
    """
    additional_metadata = additional_metadata or {}
    fields: Dict[str, Field] = {}
    passage_offsets = [(token.idx, token.idx + len(token.text))
                       for token in passage_tokens]
    # This is separate so we can reference it later with a known type.
    passage_field = TextField(passage_tokens, token_indexers)
    fields['passage'] = passage_field
    fields['question'] = ListField([
        TextField(q_tokens, token_indexers)
        for q_tokens in question_list_tokens
    ])
    metadata = {'original_passage': passage_text,
                'token_offsets': passage_offsets,
                'question_tokens': [[token.text for token in question_tokens] \
                                    for question_tokens in question_list_tokens],
                'passage_tokens': [token.text for token in passage_tokens], }
    p1_answer_marker_list: List[Field] = []
    p2_answer_marker_list: List[Field] = []
    p3_answer_marker_list: List[Field] = []

    def get_tag(i, i_name):
        # Generate a tag to mark previous answer span in the passage.
        return "<{0:d}_{1:s}>".format(i, i_name)

    def mark_tag(span_start, span_end, passage_tags, prev_answer_distance):
        try:
            assert span_start >= 0
            assert span_end >= 0
        except AssertionError:
            raise ValueError(
                "Previous {0:d}th answer span should have been updated!".
                format(prev_answer_distance))
        # Modify "tags" to mark previous answer span.
        if span_start == span_end:
            passage_tags[prev_answer_distance][span_start] = get_tag(
                prev_answer_distance, "")
        else:
            passage_tags[prev_answer_distance][span_start] = get_tag(
                prev_answer_distance, "start")
            passage_tags[prev_answer_distance][span_end] = get_tag(
                prev_answer_distance, "end")
            for passage_index in range(span_start + 1, span_end):
                passage_tags[prev_answer_distance][passage_index] = get_tag(
                    prev_answer_distance, "in")

    if token_span_lists:
        span_start_list: List[Field] = []
        span_end_list: List[Field] = []
        p1_span_start, p1_span_end, p2_span_start = -1, -1, -1
        p2_span_end, p3_span_start, p3_span_end = -1, -1, -1
        # Loop over the answer spans for each question.
        answer_list_tokens = []
        for question_index, answer_span_lists in enumerate(token_span_lists):
            span_start, span_end = answer_span_lists[
                -1]  # Last one is the original answer
            span_start_list.append(IndexField(span_start, passage_field))
            span_end_list.append(IndexField(span_end, passage_field))
            answer_tokens = passage_tokens[span_start:span_end]
            answer_list_tokens.append(answer_tokens)
            prev_answer_marker_lists = [["O"] * len(passage_tokens),
                                        ["O"] * len(passage_tokens),
                                        ["O"] * len(passage_tokens),
                                        ["O"] * len(passage_tokens)]
            if question_index > 0 and num_context_answers > 0:
                mark_tag(p1_span_start, p1_span_end, prev_answer_marker_lists,
                         1)
                if question_index > 1 and num_context_answers > 1:
                    mark_tag(p2_span_start, p2_span_end,
                             prev_answer_marker_lists, 2)
                    if question_index > 2 and num_context_answers > 2:
                        mark_tag(p3_span_start, p3_span_end,
                                 prev_answer_marker_lists, 3)
                    p3_span_start = p2_span_start
                    p3_span_end = p2_span_end
                p2_span_start = p1_span_start
                p2_span_end = p1_span_end
            p1_span_start = span_start
            p1_span_end = span_end
            if num_context_answers > 2:
                p3_answer_marker_list.append(
                    SequenceLabelField(prev_answer_marker_lists[3],
                                       passage_field,
                                       label_namespace="answer_tags"))
            if num_context_answers > 1:
                p2_answer_marker_list.append(
                    SequenceLabelField(prev_answer_marker_lists[2],
                                       passage_field,
                                       label_namespace="answer_tags"))
            if num_context_answers > 0:
                p1_answer_marker_list.append(
                    SequenceLabelField(prev_answer_marker_lists[1],
                                       passage_field,
                                       label_namespace="answer_tags"))
        fields['span_start'] = ListField(span_start_list)
        fields['span_end'] = ListField(span_end_list)

        # NOTE: this block and the next build earlier variants of the dialog
        # history; both results are discarded and rebuilt in the final version below.
        dialog_list_tokens = list()
        #dialog_list_tokens.append([])  # For the first question the dialog is empty.
        dialog_tokens_string_as_list = []
        for diag_idx in range(len(question_list_tokens)):
            t = []
            if diag_idx > num_context_answers:
                t += [passage_tokens[-2]] + question_list_tokens[0] + [
                    passage_tokens[-1]
                ] + answer_list_tokens[0]
            for prev_idx in range(max(diag_idx - num_context_answers, 0),
                                  diag_idx):
                t += [passage_tokens[-2]] + question_list_tokens[prev_idx] + [
                    passage_tokens[-1]
                ] + answer_list_tokens[prev_idx]
            dialog_list_tokens.append(t)
        #fields['dialog'] = ListField([TextField(d_tokens, token_indexers) for d_tokens in dialog_list_tokens])

        dialog_list_tokens = list()
        #dialog_list_tokens.append([])  # For the first question the dialog is empty.
        dialog_tokens_string_as_list = []
        for diag_idx in range(len(question_list_tokens)):
            t = []
            for prev_idx in range(max(diag_idx - num_context_answers, 0),
                                  diag_idx):
                t += [passage_tokens[-2]] + question_list_tokens[prev_idx] + [
                    passage_tokens[-1]
                ] + answer_list_tokens[prev_idx]
            dialog_list_tokens.append(t)
        #fields['dialog'] = ListField([TextField(d_tokens, token_indexers) for d_tokens in dialog_list_tokens])

        # Create the entire dialog field (this final version is the one actually used).

        dialog_list_tokens = list()
        dialog_list_tokens.append([])  # For the first question the dialog is empty.
        dialog_tokens_string_as_list = []
        for ques_tokens, ans_tokens in list(
                zip(question_list_tokens, answer_list_tokens))[:-1]:
            dialog_tokens_string_as_list = dialog_tokens_string_as_list + [
                passage_tokens[-2]
            ] + ques_tokens + [passage_tokens[-1]] + ans_tokens
            dialog_list_tokens.append(dialog_tokens_string_as_list)
        fields['dialog'] = ListField([
            TextField(d_tokens, token_indexers)
            for d_tokens in dialog_list_tokens
        ])
        fields['answer'] = ListField([
            TextField(a_tokens, token_indexers)
            for a_tokens in answer_list_tokens
        ])

        questions_answer_appended_list = list()
        for q_tok, a_tok in zip(question_list_tokens, answer_list_tokens):
            if q_tok == question_list_tokens[0]:
                ques_ans_tokens = q_tok
            else:
                ques_ans_tokens = q_tok + [passage_tokens[-1]
                                           ] + prev_ans_tokens
            questions_answer_appended_list.append(ques_ans_tokens)
            prev_ans_tokens = a_tok

        fields['previous_answer_appended'] = ListField([
            TextField(tok, token_indexers)
            for tok in questions_answer_appended_list
        ])

        #print ("question_list_tokens", question_list_tokens)
        #print ("answer_list_tokens", answer_list_tokens)
        #print ("prev answer list is ", fields['previous_answer_appended'])
        if num_context_answers > 0:
            fields['p1_answer_marker'] = ListField(p1_answer_marker_list)
            if num_context_answers > 1:
                fields['p2_answer_marker'] = ListField(p2_answer_marker_list)
                if num_context_answers > 2:
                    fields['p3_answer_marker'] = ListField(
                        p3_answer_marker_list)
        fields['yesno_list'] = ListField(
            [LabelField(yesno, label_namespace="yesno_labels") for yesno in yesno_list])
        fields['followup_list'] = ListField(
            [LabelField(followup, label_namespace="followup_labels") for followup in followup_list])
    metadata.update(additional_metadata)
    fields['metadata'] = MetadataField(metadata)
    #print ("question_list_tokens", question_list_tokens)
    #print ("answer_list_tokens", answer_list_tokens)
    #print ("\nfields question ", fields['question'])
    #print ("\nfields answer ", fields['answer'])
    #print ("\nfields dialog ", fields['dialog'])
    return Instance(fields)
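
The final dialog-history construction concatenates all previous question-answer pairs, separated by two special tokens (the function reuses the last two passage tokens as separators). A plain-Python sketch with hypothetical separators standing in for them:

questions = [["Who", "?"], ["Where", "?"], ["When", "?"]]
answers = [["Mary"], ["Seattle"], ["2010"]]
q_sep, a_sep = "<q>", "<a>"  # stand-ins for passage_tokens[-2] / passage_tokens[-1]

dialog = [[]]  # the first question has no history
history = []
for q_toks, a_toks in list(zip(questions, answers))[:-1]:
    history = history + [q_sep] + q_toks + [a_sep] + a_toks
    dialog.append(history)
for turn in dialog:
    print(turn)
# []
# ['<q>', 'Who', '?', '<a>', 'Mary']
# ['<q>', 'Who', '?', '<a>', 'Mary', '<q>', 'Where', '?', '<a>', 'Seattle']
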
Example #8
    def text_to_instance(
        self,  # type: ignore
        sentence: List[Token],
        gold_clusters: Optional[List[List[Tuple[int,
                                                int]]]] = None) -> Instance:
        # pylint: disable=arguments-differ
        """
        Parameters
        ----------
        sentence : ``List[Token]``, required.
            The already tokenised sentence to analyse.
        gold_clusters : ``Optional[List[List[Tuple[int, int]]]]``, optional (default = None)
            A list of all clusters in the sentence, represented as word spans. Each cluster
            contains some number of spans, which can be nested and overlap, but will never
            exactly match between clusters.

        Returns
        -------
        An ``Instance`` containing the following ``Fields``:
            text : ``TextField``
                The text of the full sentence.
            spans : ``ListField[SpanField]``
                A ListField containing the spans represented as ``SpanFields``
                with respect to the sentence text.
            span_labels : ``SequenceLabelField``, optional
                The id of the cluster which each possible span belongs to, or -1 if it does
                 not belong to a cluster. As these labels have variable length (it depends on
                 how many spans we are considering), we represent this as a ``SequenceLabelField``
                 with respect to the ``spans`` ``ListField``.
        """
        metadata: Dict[str, Any] = {"original_text": sentence}
        if gold_clusters is not None:
            metadata["clusters"] = gold_clusters

        text_field = TextField(sentence, self._token_indexers)

        cluster_dict = {}
        if gold_clusters is not None:
            for cluster_id, cluster in enumerate(gold_clusters):
                for mention in cluster:
                    cluster_dict[tuple(mention)] = cluster_id

        spans: List[Field] = []
        span_labels: Optional[
            List[int]] = [] if gold_clusters is not None else None

        for start, end in enumerate_spans(sentence,
                                          max_span_width=self._max_span_width):
            if span_labels is not None:
                if (start, end) in cluster_dict:
                    span_labels.append(cluster_dict[(start, end)])
                else:
                    span_labels.append(-1)

            spans.append(SpanField(start, end, text_field))

        span_field = ListField(spans)
        metadata_field = MetadataField(metadata)

        fields: Dict[str, Field] = {
            "text": text_field,
            "spans": span_field,
            "metadata": metadata_field
        }
        if span_labels is not None:
            fields["span_labels"] = SequenceLabelField(span_labels, span_field)

        return Instance(fields)
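
The span-labelling step above reduces to a dictionary lookup from gold mention spans to cluster ids; a standalone sketch with made-up clusters:

gold_clusters = [[(0, 0), (4, 4)], [(2, 3)]]  # two coreference clusters
cluster_dict = {tuple(mention): cluster_id
                for cluster_id, cluster in enumerate(gold_clusters)
                for mention in cluster}

candidate_spans = [(0, 0), (1, 1), (2, 3), (4, 4)]
span_labels = [cluster_dict.get(span, -1) for span in candidate_spans]
print(span_labels)  # [0, -1, 1, 0]  (-1 means "not in any cluster")
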
Example #9

    def test_printing_doesnt_crash(self):
        tags = ["B", "I", "O", "O", "O"]
        sequence_label_field = SequenceLabelField(tags,
                                                  self.text,
                                                  label_namespace="labels")
        print(sequence_label_field)
Example #10
    def text_to_instance(self, # type: ignore
                         tokens: List[Token],
                         ner_tags: List[str] = None) -> Instance:
        """
        We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
        """
        sequence = TextField(tokens, self._token_indexers)
        instance_fields: Dict[str, Field] = {'tokens': sequence}

        def _remove_BI(_one_tag):
            # Strip the B-/I- prefix from a tag; "O" is returned unchanged.
            if _one_tag == 'O':
                return _one_tag
            else:
                return _one_tag[2:]
        
        if self.coding_scheme == "BIOUL":
            coded_ner = to_bioul(ner_tags,
                                 encoding=self._original_coding_scheme) if ner_tags is not None else None
        else:
            # the default IOB1
            coded_ner = ner_tags

        # TODO:
        # ner_tags -> spans of NE
        # return something like spans, span_labels ("O" if span not in golden_spans, "PER", "LOC"... otherwise)
        spans: List[Field] = []
        span_labels: List[str] = []
            
        gold_spans: List[Field] = []
        gold_span_labels: List[str] = []

        assert len(ner_tags) == len(tokens), "sentence:%s but ner_tags:%s"%(str(tokens), str(ner_tags))
        ner_gold_spans = _extract_spans(ner_tags) # ner_gold_spans: Dict[tuple(startid, endid), str(entity_type)]
        for start, end in enumerate_spans(ner_tags, offset=0, max_span_width=self._max_span_width):
            span_labels.append(ner_gold_spans.get((start, end), 'O'))
            spans.append(SpanField(start, end, sequence))
        
        _dict_gold_spans = {}
        for ky, val in ner_gold_spans.items():
            gold_span_labels.append(val)
            gold_spans.append(SpanField(ky[0], ky[1], sequence))
            if val != 'O':
                _dict_gold_spans[ky] = val
        
        instance_fields["metadata"] = MetadataField({"words": [x.text for x in tokens] ,
                                                    "gold_spans": _dict_gold_spans})
        
        assert len(spans) == len(span_labels), "span length not equal to span label length..."
        span_field = ListField(spans)  # a ListField of SpanFields, one per candidate span
        
        # contains all possible spans and their tags
        instance_fields['spans'] = span_field
        instance_fields['span_labels'] = SequenceLabelField(span_labels, span_field, "span_tags")
        
        # only contain gold_spans and their tags
        # e.g. (0,0,O), (1,1,O), (2,3,PER), (4,4,O) for 'I am Donald Trump .'
        gold_span_field = ListField(gold_spans)
        instance_fields['gold_spans'] = gold_span_field
        instance_fields['gold_span_labels'] = SequenceLabelField(gold_span_labels, 
                                                                 gold_span_field, "span_tags")


        # Add "tag label" to instance
        if self.tag_label == 'ner' and coded_ner is not None:
            instance_fields['tags'] = SequenceLabelField(coded_ner, sequence,
                                                         'token_tags')
        return Instance(instance_fields)
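
The `_extract_spans` helper used above is not shown on this page; here is a hedged sketch of what such a helper might look like (an assumption, since the original could handle IOB1 quirks differently). Per the comment in the example, "O" tokens become singleton spans:

def extract_spans(tags):
    # Map each (start, end) token span to its type, e.g. the tags for
    # "I am Donald Trump ." yield (0,0,O), (1,1,O), (2,3,PER), (4,4,O).
    spans, start = {}, None
    for i, tag in enumerate(tags):
        if tag == "O":
            spans[(i, i)] = "O"
            continue
        if tag.startswith("B-"):
            start = i
        if start is not None and (
                i + 1 == len(tags) or not tags[i + 1].startswith("I-")):
            spans[(start, i)] = tags[start][2:]
            start = None
    return spans

print(extract_spans(["O", "O", "B-PER", "I-PER", "O"]))
# {(0, 0): 'O', (1, 1): 'O', (2, 3): 'PER', (4, 4): 'O'}
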
Example #11
    def make_marginal_drop_instance(
        question_tokens: List[Token],
        passage_tokens: List[Token],
        number_tokens: List[Token],
        number_indices: List[int],
        token_indexers: Dict[str, TokenIndexer],
        passage_text: str,
        answer_info: Dict[str, Any] = None,
        additional_metadata: Dict[str, Any] = None,
    ) -> Instance:
        additional_metadata = additional_metadata or {}
        fields: Dict[str, Field] = {}
        passage_offsets = [(token.idx, token.idx + len(token.text))
                           for token in passage_tokens]
        question_offsets = [(token.idx, token.idx + len(token.text))
                            for token in question_tokens]

        # This is separate so we can reference it later with a known type.
        passage_field = TextField(passage_tokens, token_indexers)
        question_field = TextField(question_tokens, token_indexers)
        fields["passage"] = passage_field
        fields["question"] = question_field
        number_index_fields: List[Field] = [
            IndexField(index, passage_field) for index in number_indices
        ]
        fields["number_indices"] = ListField(number_index_fields)
        # This field is not actually required by the model; it is only used to
        # create the `answer_as_add_sub_expressions` field, which is a
        # `SequenceLabelField`. We cannot use the `number_indices` field for that,
        # because that `ListField` will not be empty when we need to create an
        # empty field, which would lead to an error.
        numbers_in_passage_field = TextField(number_tokens, token_indexers)
        metadata = {
            "original_passage": passage_text,
            "passage_token_offsets": passage_offsets,
            "question_token_offsets": question_offsets,
            "question_tokens": [token.text for token in question_tokens],
            "passage_tokens": [token.text for token in passage_tokens],
            "number_tokens": [token.text for token in number_tokens],
            "number_indices": number_indices,
        }
        if answer_info:
            metadata["answer_texts"] = answer_info["answer_texts"]

            passage_span_fields: List[Field] = [
                SpanField(span[0], span[1], passage_field)
                for span in answer_info["answer_passage_spans"]
            ]
            if not passage_span_fields:
                passage_span_fields.append(SpanField(-1, -1, passage_field))
            fields["answer_as_passage_spans"] = ListField(passage_span_fields)

            question_span_fields: List[Field] = [
                SpanField(span[0], span[1], question_field)
                for span in answer_info["answer_question_spans"]
            ]
            if not question_span_fields:
                question_span_fields.append(SpanField(-1, -1, question_field))
            fields["answer_as_question_spans"] = ListField(
                question_span_fields)

            add_sub_signs_field: List[Field] = []
            for signs_for_one_add_sub_expression in answer_info[
                    "signs_for_add_sub_expressions"]:
                add_sub_signs_field.append(
                    SequenceLabelField(signs_for_one_add_sub_expression,
                                       numbers_in_passage_field))
            if not add_sub_signs_field:
                add_sub_signs_field.append(
                    SequenceLabelField([0] * len(number_tokens),
                                       numbers_in_passage_field))
            fields["answer_as_add_sub_expressions"] = ListField(
                add_sub_signs_field)

            count_fields: List[Field] = [
                LabelField(count_label, skip_indexing=True)
                for count_label in answer_info["counts"]
            ]
            if not count_fields:
                count_fields.append(LabelField(-1, skip_indexing=True))
            fields["answer_as_counts"] = ListField(count_fields)

        metadata.update(additional_metadata)
        fields["metadata"] = MetadataField(metadata)
        return Instance(fields)
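
Each answer-type field above gets a placeholder entry when no gold answer of that kind exists, because a `ListField` cannot be empty. A minimal sketch of the (-1, -1) dummy-span convention, assuming a SingleIdTokenIndexer (an assumption; the reader configures its own indexers):

from allennlp.data import Token
from allennlp.data.fields import ListField, SpanField, TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer

passage_field = TextField([Token(w) for w in ["a", "b", "c"]],
                          {"tokens": SingleIdTokenIndexer()})
answer_spans = []  # no gold passage-span answer for this question
span_fields = [SpanField(s, e, passage_field) for s, e in answer_spans]
if not span_fields:
    # (-1, -1) is the "no answer of this kind" placeholder used above.
    span_fields.append(SpanField(-1, -1, passage_field))
answer_field = ListField(span_fields)
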
Example #12
def make_reading_comprehension_instance_quac(
    question_list_tokens: List[List[Token]],
    passage_tokens: List[Token],
    token_indexers: Dict[str, TokenIndexer],
    passage_text: str,
    token_span_lists: List[List[Tuple[int, int]]] = None,
    yesno_list: List[int] = None,
    followup_list: List[int] = None,
    additional_metadata: Dict[str, Any] = None,
    num_context_answers: int = 0,
) -> Instance:
    """
    Converts a question, a passage, and an optional answer (or answers) to an ``Instance`` for use
    in a reading comprehension model.

    Creates an ``Instance`` with at least these fields: ``question`` and ``passage``, both
    ``TextFields``; and ``metadata``, a ``MetadataField``.  Additionally, if both ``answer_texts``
    and ``char_span_starts`` are given, the ``Instance`` has ``span_start`` and ``span_end``
    fields, which are both ``IndexFields``.

    # Parameters

    question_list_tokens : `List[List[Token]]`
        An already-tokenized list of questions. Each dialog has multiple questions.
    passage_tokens : `List[Token]`
        An already-tokenized passage that contains the answer to the given question.
    token_indexers : `Dict[str, TokenIndexer]`
        Determines how the question and passage `TextFields` will be converted into tensors that
        get input to a model.  See :class:`TokenIndexer`.
    passage_text : `str`
        The original passage text.  We need this so that we can recover the actual span from the
        original passage that the model predicts as the answer to the question.  This is used in
        official evaluation scripts.
    token_span_lists : `List[List[Tuple[int, int]]]`, optional
        Indices into `passage_tokens` to use as the answer to the question for training.  This is
        a list of lists: first because there are multiple questions per dialog, and
        second because there might be several possible correct answer spans in the passage.
        Currently, we just select the last span in this list (i.e., QuAC has multiple
        annotations on the dev set; this will select the last span, which was given by the original annotator).
    yesno_list : `List[int]`
        List of the affirmation bit for each question-answer pair.
    followup_list : `List[int]`
        List of the continuation bit for each question-answer pair.
    num_context_answers : `int`, optional
        How many answers to encode into the passage.
    additional_metadata : `Dict[str, Any]`, optional
        The constructed `metadata` field will by default contain `original_passage`,
        `token_offsets`, `question_tokens`, `passage_tokens`, and `answer_texts` keys.  If
        you want any other metadata to be associated with each instance, you can pass that in here.
        This dictionary will get added to the `metadata` dictionary we already construct.
    """
    additional_metadata = additional_metadata or {}
    fields: Dict[str, Field] = {}
    passage_offsets = [(token.idx, token.idx + len(token.text))
                       for token in passage_tokens]
    # This is separate so we can reference it later with a known type.
    passage_field = TextField(passage_tokens, token_indexers)
    fields["passage"] = passage_field
    fields["question"] = ListField([
        TextField(q_tokens, token_indexers)
        for q_tokens in question_list_tokens
    ])
    metadata = {
        "original_passage":
        passage_text,
        "token_offsets":
        passage_offsets,
        "question_tokens": [[token.text for token in question_tokens]
                            for question_tokens in question_list_tokens],
        "passage_tokens": [token.text for token in passage_tokens],
    }
    p1_answer_marker_list: List[Field] = []
    p2_answer_marker_list: List[Field] = []
    p3_answer_marker_list: List[Field] = []

    def get_tag(i, i_name):
        # Generate a tag to mark previous answer span in the passage.
        return "<{0:d}_{1:s}>".format(i, i_name)

    def mark_tag(span_start, span_end, passage_tags, prev_answer_distance):
        try:
            assert span_start >= 0
            assert span_end >= 0
        except AssertionError:
            raise ValueError(
                "Previous {0:d}th answer span should have been updated!".
                format(prev_answer_distance))
        # Modify "tags" to mark previous answer span.
        if span_start == span_end:
            passage_tags[prev_answer_distance][span_start] = get_tag(
                prev_answer_distance, "")
        else:
            passage_tags[prev_answer_distance][span_start] = get_tag(
                prev_answer_distance, "start")
            passage_tags[prev_answer_distance][span_end] = get_tag(
                prev_answer_distance, "end")
            for passage_index in range(span_start + 1, span_end):
                passage_tags[prev_answer_distance][passage_index] = get_tag(
                    prev_answer_distance, "in")

    if token_span_lists:
        span_start_list: List[Field] = []
        span_end_list: List[Field] = []
        p1_span_start, p1_span_end, p2_span_start = -1, -1, -1
        p2_span_end, p3_span_start, p3_span_end = -1, -1, -1
        # Loop over the answer spans for each question.
        for question_index, answer_span_lists in enumerate(token_span_lists):
            span_start, span_end = answer_span_lists[
                -1]  # Last one is the original answer
            span_start_list.append(IndexField(span_start, passage_field))
            span_end_list.append(IndexField(span_end, passage_field))
            prev_answer_marker_lists = [
                ["O"] * len(passage_tokens),
                ["O"] * len(passage_tokens),
                ["O"] * len(passage_tokens),
                ["O"] * len(passage_tokens),
            ]
            if question_index > 0 and num_context_answers > 0:
                mark_tag(p1_span_start, p1_span_end, prev_answer_marker_lists,
                         1)
                if question_index > 1 and num_context_answers > 1:
                    mark_tag(p2_span_start, p2_span_end,
                             prev_answer_marker_lists, 2)
                    if question_index > 2 and num_context_answers > 2:
                        mark_tag(p3_span_start, p3_span_end,
                                 prev_answer_marker_lists, 3)
                    p3_span_start = p2_span_start
                    p3_span_end = p2_span_end
                p2_span_start = p1_span_start
                p2_span_end = p1_span_end
            p1_span_start = span_start
            p1_span_end = span_end
            if num_context_answers > 2:
                p3_answer_marker_list.append(
                    SequenceLabelField(prev_answer_marker_lists[3],
                                       passage_field,
                                       label_namespace="answer_tags"))
            if num_context_answers > 1:
                p2_answer_marker_list.append(
                    SequenceLabelField(prev_answer_marker_lists[2],
                                       passage_field,
                                       label_namespace="answer_tags"))
            if num_context_answers > 0:
                p1_answer_marker_list.append(
                    SequenceLabelField(prev_answer_marker_lists[1],
                                       passage_field,
                                       label_namespace="answer_tags"))
        fields["span_start"] = ListField(span_start_list)
        fields["span_end"] = ListField(span_end_list)
        if num_context_answers > 0:
            fields["p1_answer_marker"] = ListField(p1_answer_marker_list)
            if num_context_answers > 1:
                fields["p2_answer_marker"] = ListField(p2_answer_marker_list)
                if num_context_answers > 2:
                    fields["p3_answer_marker"] = ListField(
                        p3_answer_marker_list)
        fields["yesno_list"] = ListField([
            LabelField(yesno, label_namespace="yesno_labels")
            for yesno in yesno_list
        ])
        fields["followup_list"] = ListField([
            LabelField(followup, label_namespace="followup_labels")
            for followup in followup_list
        ])
    metadata.update(additional_metadata)
    fields["metadata"] = MetadataField(metadata)
    return Instance(fields)
Example #13
    def text_to_instance(
        self,  # type: ignore
        sentences: List[List[str]],
        gold_clusters: Optional[List[List[Tuple[int,
                                                int]]]] = None) -> Instance:
        # pylint: disable=arguments-differ
        """
        Parameters
        ----------
        sentences : ``List[List[str]]``, required.
            A list of lists representing the tokenised words and sentences in the document.
        gold_clusters : ``Optional[List[List[Tuple[int, int]]]]``, optional (default = None)
            A list of all clusters in the document, represented as word spans. Each cluster
            contains some number of spans, which can be nested and overlap, but will never
            exactly match between clusters.
        Returns
        -------
        An ``Instance`` containing the following ``Fields``:
            text : ``TextField``
                The text of the full document.
            spans : ``ListField[SpanField]``
                A ListField containing the spans represented as ``SpanFields``
                with respect to the document text.
            span_labels : ``SequenceLabelField``, optional
                The id of the cluster which each possible span belongs to, or -1 if it does
                 not belong to a cluster. As these labels have variable length (it depends on
                 how many spans we are considering), we represent this as a ``SequenceLabelField``
                 with respect to the ``spans`` ``ListField``.
        """
        flattened_sentences = [
            self._normalize_word(word) for sentence in sentences
            for word in sentence
        ]
        # align clusters
        gold_clusters = self.align_clusters_to_tokens(flattened_sentences,
                                                      gold_clusters)

        def tokenizer(s: str):
            return self.token_indexer.wordpiece_tokenizer(s)

        flattened_sentences = tokenizer(" ".join(flattened_sentences))
        metadata: Dict[str, Any] = {"original_text": flattened_sentences}
        if gold_clusters is not None:
            metadata["clusters"] = gold_clusters
        if len(flattened_sentences) > 512:
            text_field = TextField(
                [Token(word) for word in flattened_sentences[:512]],
                self._token_indexers)
            total_list = [text_field]
            import math
            for i in range(
                    math.ceil(float(len(flattened_sentences[512:])) / 100.0)):
                # slide by 100
                text_field = TextField([
                    Token(word)
                    for word in flattened_sentences[512 + (i * 100):512 +
                                                    ((i + 1) * 100)]
                ], self._token_indexers)
                total_list.append(text_field)
            text_field = ListField(total_list)
            # The capped first chunk plus the 100-token slices become one ListField.

        else:
            text_field = TextField(
                [Token(word) for word in flattened_sentences],
                self._token_indexers)
        cluster_dict = {}
        if gold_clusters is not None:
            for cluster_id, cluster in enumerate(gold_clusters):
                for mention in cluster:
                    cluster_dict[tuple(mention)] = cluster_id

        spans: List[Field] = []
        span_labels: Optional[
            List[int]] = [] if gold_clusters is not None else None
        sentence_offset = 0
        normal = []
        for sentence in sentences:
            # enumerate the spans.
            for start, end in enumerate_spans(
                    sentence,
                    offset=sentence_offset,
                    max_span_width=self._max_span_width):
                if span_labels is not None:
                    if (start, end) in cluster_dict:
                        span_labels.append(cluster_dict[(start, end)])
                    else:
                        span_labels.append(-1)
                # Align the spans to the BERT tokenization.
                normal.append((start, end))
                # The SpanField is built over the flattened-sentence text field.
                span_field = text_field
                spans.append(SpanField(start, end, span_field))
            sentence_offset += len(sentence)

        span_field = ListField(spans)
        metadata_field = MetadataField(metadata)

        fields: Dict[str, Field] = {
            "text": text_field,
            "spans": span_field,
            "metadata": metadata_field
        }
        if span_labels is not None:
            fields["span_labels"] = SequenceLabelField(span_labels, span_field)
        return Instance(fields)
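A minimal, self-contained sketch of the sliding-window chunking used above for inputs longer than BERT's 512-wordpiece limit. The 512/100 constants mirror the reader; the helper name is illustrative only:

import math
from typing import List

def chunk_wordpieces(wordpieces: List[str],
                     window: int = 512,
                     stride: int = 100) -> List[List[str]]:
    """Split a long wordpiece sequence into a first full window
    followed by fixed-size strides, as the reader above does."""
    if len(wordpieces) <= window:
        return [wordpieces]
    chunks = [wordpieces[:window]]
    remainder = len(wordpieces) - window
    for i in range(math.ceil(remainder / stride)):
        chunks.append(wordpieces[window + i * stride:window + (i + 1) * stride])
    return chunks

# 700 wordpieces -> one 512-piece window plus overflow chunks of 100 and 88
assert [len(c) for c in chunk_wordpieces(["wp"] * 700)] == [512, 100, 88]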
Example #14
    def text_to_instance(
            self,  # type: ignore
            tokens: List[str],
            verb_label: List[int],
            tags: List[str] = None,
            pos_tags: List[str] = None,
            gold_tree: Tree = None) -> Instance:
        """
        We take `pre-tokenized` input here, because we don't have a tokenizer in this class.

        Parameters
        ----------
        tokens : ``List[str]``, required.
            The tokens in a given sentence.
        verb_label: ``List[int]``, required
            The verb label should be a one-hot binary vector,
            the same length as the tokens, indicating the position of the verb to find arguments for.
        tags : ``List[str]``, optional (default = None).
            The SRL tags for each token in the sentence.
        pos_tags : ``List[str]``, optional (default = None).
            The POS tags for the words in the sentence.
        gold_tree : ``Tree``, optional (default = None).
            The gold parse tree to create span labels from.

        Returns
        -------
        An ``Instance`` containing the following fields:
            tokens : ``TextField``
                The tokens in the sentence.
            pos_tags : ``SequenceLabelField``
                The pos tags of the words in the sentence.
            spans : ``ListField[SpanField]``
                A ListField containing all possible subspans of the
                sentence.
            span_labels : ``SequenceLabelField``, optional.
                The constituency tags for each of the possible spans, with
                respect to a gold parse tree. If a span is not contained
                within the tree, a span will have a ``NO-LABEL`` label.
        """
        # pylint: disable=arguments-differ

        fields: Dict[str, Field] = {}
        text_field = TextField([Token(t) for t in tokens],
                               token_indexers=self._token_indexers)
        fields['tokens'] = text_field
        fields['verb_indicator'] = SequenceLabelField(verb_label, text_field)

        metadata: Dict[str, Any] = {}

        if tags:
            fields['tags'] = SequenceLabelField(tags, text_field)

        if pos_tags:
            pos_tag_field = SequenceLabelField(pos_tags, text_field,
                                               "pos_tags")
            fields['pos_tags'] = pos_tag_field
            metadata['pos_tags'] = True
        else:
            pos_tags = ['X' for _ in tokens]
            fields['pos_tags'] = SequenceLabelField(pos_tags, text_field,
                                                    "pos_tags")
            metadata['pos_tags'] = False

        spans: List[Field] = []
        gold_labels = []

        if gold_tree is not None:
            gold_spans_with_pos_tags: Dict[Tuple[int, int], str] = {}
            self._get_gold_spans(gold_tree, 0, gold_spans_with_pos_tags)
            gold_spans = {
                span: label
                for (span, label) in gold_spans_with_pos_tags.items()
                if "-POS" not in label
            }
        else:
            gold_spans = None

        for start, end in enumerate_spans(tokens):
            spans.append(SpanField(start, end, text_field))

            if gold_spans is not None:
                if (start, end) in gold_spans.keys():
                    gold_labels.append(gold_spans[(start, end)])
                else:
                    gold_labels.append("NO-LABEL")
            else:
                gold_labels.append("NO-LABEL")

        span_list_field: ListField = ListField(spans)
        fields['spans'] = span_list_field

        # the field is identical in both cases; the metadata flag records
        # whether the labels come from a gold tree or are all NO-LABEL
        fields['span_labels'] = SequenceLabelField(gold_labels,
                                                   span_list_field,
                                                   "constituent_labels")
        metadata['span_labels'] = gold_tree is not None

        metadata_field = MetadataField(metadata)
        fields['metadata'] = metadata_field

        return Instance(fields)
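For reference, a tiny demonstration of how ``enumerate_spans`` (imported here under the path used by the AllenNLP versions these readers target) produces the candidate spans labelled above; indices are inclusive, and any span missing from the gold set falls back to ``NO-LABEL``:

from allennlp.data.dataset_readers.dataset_utils import enumerate_spans

tokens = ["the", "cat", "sat"]
gold_spans = {(0, 1): "NP"}  # inclusive (start, end) -> constituent label

labels = [gold_spans.get((start, end), "NO-LABEL")
          for start, end in enumerate_spans(tokens)]

# three tokens yield six candidate spans; only (0, 1) carries a gold label
assert labels.count("NO-LABEL") == 5 and labels.count("NP") == 1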
    def text_to_instance(
        self,  # type: ignore
        tokens: List[str],
        lemmas: List[str] = None,
        upos_tags: List[str] = None,
        xpos_tags: List[str] = None,
        feats: List[str] = None,
        dependencies: List[Tuple[str, int]] = None,
        deps: List[List[Tuple[str, int]]] = None,
        ids: List[str] = None,
        misc: List[str] = None,
        multiword_ids: List[str] = None,
        multiword_forms: List[str] = None,
        conllu_metadata: List[str] = None,
        contains_elided_token: bool = False,
    ) -> Instance:
        """
        # Parameters
        tokens : ``List[str]``, required.
            The tokens in the sentence to be encoded.
        upos_tags : ``List[str]``, required.
            The universal dependencies POS tags for each word.
        dependencies : ``List[Tuple[str, int]]``, optional (default = None)
            A list of  (head tag, head index) tuples. Indices are 1 indexed,
            meaning an index of 0 corresponds to that word being the root of
            the dependency tree.
        deps : ``List[List[Tuple[str, int]]]``, optional (default = None)
            A list of lists of (head tag, head index) tuples. Indices are 1 indexed,
            meaning an index of 0 corresponds to that word being the root of
            the dependency tree.
        # Returns
        An instance containing tokens, pos tags, basic and enhanced dependency head tags and head
        indices as fields.
        """

        fields: Dict[str, Field] = {}

        token_field = TextField([Token(t) for t in tokens],
                                self._token_indexers)
        fields["tokens"] = token_field
        names = ["upos", "xpos", "lemmas"]
        all_tags = [upos_tags, xpos_tags, lemmas]
        for name, field in zip(names, all_tags):
            if field:
                fields[name] = SequenceLabelField(field,
                                                  token_field,
                                                  label_namespace=name)

        if feats is not None:
            sublist_fields = []
            for atomic_feat in feats:
                feat_fields = ListField([
                    LabelField(feat, label_namespace="feats")
                    for feat in atomic_feat.split("|")
                ])
                sublist_fields.append(feat_fields)
            fields["feats"] = ListField(sublist_fields)

        # defaults, so the metadata below is well-defined even when the
        # optional annotations are absent
        head_tags = None
        head_indices = None
        original_to_new_indices = None
        arc_indices = None
        arc_tags = None
        arc_indices_and_tags = None

        # basic dependency tree: we're not using it in the parse at the
        # moment, so head_tags and head_indices are kept only as metadata
        # rather than added as fields.
        if dependencies is not None:
            head_tags = [x[0] for x in dependencies]
            head_indices = [x[1] for x in dependencies]

        # enhanced dependencies
        if deps is not None:
            enhanced_arc_tags, enhanced_arc_indices = self._convert_deps_to_nested_sequences(
                deps)
            # extra processing is needed if a sentence contains an elided token
            if contains_elided_token:
                original_to_new_indices, augmented_heads = self._process_elided_tokens(
                    ids, enhanced_arc_indices)
                enhanced_arc_indices = augmented_heads
            else:
                original_to_new_indices = None

            assert len(enhanced_arc_tags) == len(
                enhanced_arc_indices), "each arc should have a label"

            arc_indices = []
            arc_tags = []
            arc_indices_and_tags = []

            for modifier, head_list in enumerate(enhanced_arc_indices,
                                                 start=1):
                for head in head_list:
                    arc_indices.append((head, modifier))

            for relation_list in enhanced_arc_tags:
                for relation in relation_list:
                    arc_tags.append(relation)

            assert len(arc_indices) == len(
                arc_tags), "each arc should have a label"

            for arc_index, arc_tag in zip(arc_indices, arc_tags):
                arc_indices_and_tags.append((arc_index, arc_tag))

            if arc_indices is not None and arc_tags is not None:
                token_field_with_root = ['root'] + tokens
                fields["enhanced_tags"] = RootedAdjacencyField(
                    arc_indices,
                    token_field_with_root,
                    arc_tags,
                    label_namespace="deps")

        fields["metadata"] = MetadataField({
            "tokens": tokens,
            "upos_tags": upos_tags,
            "xpos_tags": xpos_tags,
            "feats": feats,
            "lemmas": lemmas,
            "ids": ids,
            "misc": misc,
            "original_to_new_indices": original_to_new_indices,
            "head_tags": head_tags,
            "head_indices": head_indices,
            "arc_indices": arc_indices,
            "arc_tags": arc_tags,
            "labeled_arcs": arc_indices_and_tags,
            "multiword_ids": multiword_ids,
            "multiword_forms": multiword_forms,
            "conllu_metadata": conllu_metadata
        })

        return Instance(fields)
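A hedged sketch of the nested-to-flat conversion the enhanced-dependencies block performs: per-token head lists (1-indexed, 0 = root) become flat ``(head, modifier)`` arcs paired with relation labels. It assumes ``_convert_deps_to_nested_sequences`` has already split the CoNLL-U ``deps`` column into parallel tag and index lists:

from typing import List, Tuple

def flatten_enhanced_deps(arc_tags: List[List[str]],
                          arc_heads: List[List[int]]
                          ) -> List[Tuple[Tuple[int, int], str]]:
    """Pair every (head, modifier) arc with its relation label,
    enumerating modifiers from 1 as the reader above does."""
    arcs = []
    for modifier, (heads, tags) in enumerate(zip(arc_heads, arc_tags), start=1):
        for head, tag in zip(heads, tags):
            arcs.append(((head, modifier), tag))
    return arcs

# token 1 has two heads (root plus a conjoined arc), token 2 has one
arcs = flatten_enhanced_deps([["root", "conj"], ["obj"]], [[0, 2], [1]])
assert arcs == [((0, 1), "root"), ((2, 1), "conj"), ((1, 2), "obj")]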
    def test_human_readable_repr(self):
        tags = ["B", "I", "O", "O", "O"]
        sequence_label_field = SequenceLabelField(tags,
                                                  self.text,
                                                  label_namespace="labels")
        assert sequence_label_field.human_readable_repr() == tags
Example #17
    def text_to_instance(self, source_key: str, target_key: str = None, line_obj: Dict = None) -> Instance:
        """
        Turn json object into an ``Instance``.
        Parameters
        ----------
        source_key : ``str``, required, json object key name of the source sequence
        target_key : ``str``, optional (default = None), json object key name of the target sequence
        line_obj : ``Dict``, required, json object containing the raw instance info
        Returns
        -------
        Instance
            See the above for a description of the fields that the instance will contain.
        """

        # Read source and target
        line_obj = line_obj or {}  # guard against the removed mutable default
        target_sequence = line_obj.get(target_key, None)
        lang_src_token = line_obj["src_lang"].upper()
        lang_tgt_token = line_obj["tgt_lang"].upper()

        # Read Predicate Indicator and make Array
        verb_label = [0, 0] + [1 if label[-2:] == "-V" else 0 for label in line_obj["BIO"]] + [0]

        # Read Language Indicator and make Array
        lang_src_ix = self._available_languages[lang_src_token]
        lang_tgt_ix = self._available_languages[lang_tgt_token]
        # This array goes to the encoder as a whole
        lang_src_ix_arr = [0, 0] + [lang_src_ix for _ in line_obj[source_key]] + [0]
        # This is just an int, passed to each one of the decoder steps
        lang_tgt_ix_arr = lang_tgt_ix

        # Tokenize Source
        tokenized_source = list(map(Token, line_obj[source_key])) # Data comes already tokenized!
        tokenized_source.insert(0, Token(lang_tgt_token))
        tokenized_source.insert(0, Token(START_SYMBOL))
        tokenized_source.append(Token(END_SYMBOL))
        source_field = TextField(tokenized_source, self._source_token_indexers)

        # For each token in the source sentence, we keep track of the matching token
        # in the target sentence (which will be the OOV symbol if there is no match).
        source_to_target_field = NamespaceSwappingField(tokenized_source[1:-1], self._target_namespace)

        meta_fields = {"source_tokens": [x.text for x in tokenized_source[1:-1]]}
        fields_dict = {
                "source_tokens": source_field,
                "source_to_target": source_to_target_field,
        }

        # Process Target info during training...
        if target_sequence is not None:
            tokenized_target = list(map(Token, line_obj[target_key]))
            tokenized_target.insert(0, Token(START_SYMBOL))
            tokenized_target.append(Token(END_SYMBOL))
            target_field = TextField(tokenized_target, self._target_token_indexers)

            fields_dict["target_tokens"] = target_field
            meta_fields["target_tokens"] = [y.text for y in tokenized_target[1:-1]]
            source_and_target_token_ids = self._tokens_to_ids(tokenized_source[1:-1] +
                                                              tokenized_target)
            source_token_ids = source_and_target_token_ids[:len(tokenized_source)-2]
            fields_dict["source_token_ids"] = ArrayField(np.array(source_token_ids))
            target_token_ids = source_and_target_token_ids[len(tokenized_source)-2:]
            fields_dict["target_token_ids"] = ArrayField(np.array(target_token_ids))
        else:
            source_token_ids = self._tokens_to_ids(tokenized_source[1:-1])
            fields_dict["source_token_ids"] = ArrayField(np.array(source_token_ids))

        # Add Verb Indicator to the Fields
        fields_dict['verb_indicator'] = SequenceLabelField(verb_label, source_field)
        if all(x == 0 for x in verb_label):
            verb = None
        else:
            verb = tokenized_source[verb_label.index(1)].text
        meta_fields["verb"] = verb

        # Add Language Indicator to the Fields
        meta_fields["src_lang"] = lang_src_token
        meta_fields["tgt_lang"] = lang_tgt_token
        meta_fields["original_BIO"] = line_obj.get("BIO", [])
        meta_fields["original_predicate_senses"] = line_obj.get("pred_sense_origin", [])
        meta_fields["predicate_senses"] = line_obj.get("pred_sense", [])
        meta_fields["original_target"] = line_obj.get("seq_tag_tokens", [])
        fields_dict['language_enc_indicator'] = ArrayField(np.array(lang_src_ix_arr))
        fields_dict['language_dec_indicator'] = ArrayField(np.array(lang_tgt_ix_arr))

        fields_dict["metadata"] = MetadataField(meta_fields)
        return Instance(fields_dict)
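In isolation, the verb-indicator construction this reader uses: the leading ``[0, 0]`` covers the start symbol and the target-language token prepended to the source, and the trailing ``[0]`` covers the end symbol (a sketch; the helper name is not from the source):

from typing import List

def make_verb_indicator(bio_tags: List[str]) -> List[int]:
    """One-hot predicate vector aligned with the encoder input,
    which is [START, tgt-lang] + source tokens + [END]."""
    return [0, 0] + [1 if tag[-2:] == "-V" else 0 for tag in bio_tags] + [0]

assert make_verb_indicator(["O", "B-V", "B-ARG1"]) == [0, 0, 0, 1, 0, 0]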
    def test_tag_length_mismatch_raises(self):
        with pytest.raises(ConfigurationError):
            wrong_tags = ["B", "O", "O"]
            _ = SequenceLabelField(wrong_tags, self.text)
    def text_to_instance(
        self,  # type: ignore
        tokens: List[str],
        lemmas: List[str] = None,
        upos_tags: List[str] = None,
        xpos_tags: List[str] = None,
        feats: List[str] = None,
        dependencies: List[Tuple[str, int]] = None,
        deps: List[List[Tuple[str, int]]] = None,
        ids: List[str] = None,
        misc: List[str] = None,
        multiword_ids: List[str] = None,
        multiword_forms: List[str] = None,
        conllu_metadata: List[str] = None,
        contains_elided_token: bool = False,
    ) -> Instance:
        """
        # Parameters
        tokens : ``List[str]``, required.
            The tokens in the sentence to be encoded.
        upos_tags : ``List[str]``, required.
            The universal dependencies POS tags for each word.
        dependencies : ``List[Tuple[str, int]]``, optional (default = None)
            A list of  (head tag, head index) tuples. Indices are 1 indexed,
            meaning an index of 0 corresponds to that word being the root of
            the dependency tree.
        deps : ``List[List[Tuple[str, int]]]``, optional (default = None)
            A list of lists of (head tag, head index) tuples. Indices are 1 indexed,
            meaning an index of 0 corresponds to that word being the root of
            the dependency tree.
        # Returns
        An instance containing tokens, pos tags, basic and enhanced dependency head tags and head
        indices as fields.
        """

        fields: Dict[str, Field] = {}

        token_field = TextField([Token(t) for t in tokens],
                                self._token_indexers)
        fields["tokens"] = token_field
        names = ["upos", "xpos", "lemmas"]
        all_tags = [upos_tags, xpos_tags, lemmas]
        for name, field in zip(names, all_tags):
            if field:
                fields[name] = SequenceLabelField(field,
                                                  token_field,
                                                  label_namespace=name)

        if feats is not None:
            sublist_fields = []
            for atomic_feat in feats:
                feat_fields = ListField([
                    LabelField(feat, label_namespace="feats")
                    for feat in atomic_feat.split("|")
                ])
                sublist_fields.append(feat_fields)
            fields["feats"] = ListField(sublist_fields)

        # defaults, so the code and metadata below are well-defined even
        # when the optional annotations are absent
        head_tags = None
        head_indices = None
        original_to_new_indices = None
        arc_indices = None
        arc_tags = None
        arc_indices_and_tags = None

        # basic dependency tree
        if dependencies is not None:
            head_tags = [x[0] for x in dependencies]
            head_indices = [x[1] for x in dependencies]

            fields["deprels"] = SequenceLabelField(
                [x[0] for x in dependencies],
                token_field,
                label_namespace="deprels")
            # head indices will be encoded as direction and distance features instead
            #fields["head_indices"] = SequenceLabelField(
            #    [x[1] for x in dependencies], token_field, label_namespace="head_index_tags"
            #)

        # enhanced dependencies
        # NOTE: we always assume there is something in the edeps column at the moment.
        if deps is not None:
            enhanced_arc_tags, enhanced_arc_indices = self._convert_deps_to_nested_sequences(
                deps)
            # extra processing is needed if a sentence contains an elided token
            if contains_elided_token:
                original_to_new_indices, augmented_heads = self._process_elided_tokens(
                    ids, enhanced_arc_indices)
                enhanced_arc_indices = augmented_heads
            else:
                original_to_new_indices = None

            assert len(enhanced_arc_tags) == len(
                enhanced_arc_indices), "each arc should have a label"

            arc_indices = []
            arc_tags = []
            arc_indices_and_tags = []

            for modifier, head_list in enumerate(enhanced_arc_indices,
                                                 start=1):
                for head in head_list:
                    arc_indices.append((head, modifier))

            for relation_list in enhanced_arc_tags:
                for relation in relation_list:
                    arc_tags.append(relation)

            assert len(arc_indices) == len(
                arc_tags), "each arc should have a label"

            for arc_index, arc_tag in zip(arc_indices, arc_tags):
                arc_indices_and_tags.append((arc_index, arc_tag))

            if arc_indices is not None and arc_tags is not None:
                token_field_with_root = ['root'] + tokens
                fields["enhanced_tags"] = RootedAdjacencyField(
                    arc_indices,
                    token_field_with_root,
                    arc_tags,
                    label_namespace="deps")

        if original_to_new_indices:
            # 1-indexed conllu ids as they appear in the sentence, e.g. 13.1 -> 14.
            offsets = list(original_to_new_indices.values())
            # we start from index 1 as there is a placeholder for root (0) in the above dictionary
            conllu_ids = offsets[1:]

            # change the indices of the heads to reflect the new order
            augmented_heads = []
            for head in head_indices:
                # the "_" head won't be in here
                if head in original_to_new_indices.keys():
                    # take the 1-indexed head based on the order of words in the sentence
                    augmented_head = original_to_new_indices[head]
                    augmented_heads.append(augmented_head)
                else:
                    augmented_heads.append("_")

            basic_heads = augmented_heads

        else:
            conllu_ids = ids
            basic_heads = head_indices

        assert len(conllu_ids) == len(
            basic_heads), "each token should have a head"

        head_information = []
        for dep, head in zip(conllu_ids, basic_heads):
            if head != "_":
                distance = head - dep
                # get a qualitative distance category
                distance_category = self.get_distance_categories(distance)
                # get a qualitative category of whether the head is to the left or right
                # left-headed if the head precedes the dependent, otherwise
                # right-headed; distance is never 0, as no token heads itself
                direction_label = "<L>" if distance < 0 else "<R>"
            else:
                # there is no information from the basic tree for elided tokens
                direction_label = "<NULL_DIR>"
                distance_category = "<NULL_DIST>"

            # join direction and category
            head_direction_and_distance = direction_label + "|" + distance_category
            head_information.append(head_direction_and_distance)

        # embed the head information like a morphological feature, e.g. it is a combination of direction and distance features
        sublist_heads = []
        for full_head_information in head_information:
            head_feats = ListField([
                LabelField(head_metadata, label_namespace="heads")
                for head_metadata in full_head_information.split("|")
            ])
            sublist_heads.append(head_feats)
        fields["heads"] = ListField(sublist_heads)

        fields["metadata"] = MetadataField({
            "tokens": tokens,
            "upos_tags": upos_tags,
            "xpos_tags": xpos_tags,
            "feats": feats,
            "lemmas": lemmas,
            "ids": ids,
            "misc": misc,
            "original_to_new_indices": original_to_new_indices,
            "head_tags": head_tags,
            "head_indices": head_indices,
            "arc_indices": arc_indices,
            "arc_tags": arc_tags,
            "labeled_arcs": arc_indices_and_tags,
            "multiword_ids": multiword_ids,
            "multiword_forms": multiword_forms,
            "conllu_metadata": conllu_metadata
        })

        return Instance(fields)
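A small sketch of the direction-and-distance head encoding built above, with a hypothetical bucketing function standing in for ``get_distance_categories`` (the real bucket boundaries live elsewhere in the reader):

def distance_category(distance: int) -> str:
    """Hypothetical bucketing; the reader's get_distance_categories
    presumably maps raw distances to a small label set like this."""
    magnitude = abs(distance)
    return str(magnitude) if magnitude <= 2 else ("3-5" if magnitude <= 5 else "6+")

def head_feature(dep_id: int, head_id) -> str:
    """Direction and distance joined with '|'; '_' heads (elided
    tokens without basic-tree information) get the NULL labels."""
    if head_id == "_":
        return "<NULL_DIR>" + "|" + "<NULL_DIST>"
    distance = head_id - dep_id
    direction = "<L>" if distance < 0 else "<R>"
    return direction + "|" + distance_category(distance)

assert head_feature(3, 1) == "<L>|2"  # head is two tokens to the left
assert head_feature(2, "_") == "<NULL_DIR>|<NULL_DIST>"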
    def test_sequence_label_field_raises_on_incorrect_type(self):

        with pytest.raises(ConfigurationError):
            _ = SequenceLabelField([[], [], [], [], []], self.text)
Example #21
    def text_to_instance(
        self,
        paragraph_num: int,
        paragraph: List[str],
        ner_dict: Dict[Span, str],
        start_ix: int,
        end_ix: int,
        sentence_indices: List[Span],
        document_metadata: Dict[str, Any],
    ):

        if self.to_scierc_converter:
            return dict(
                paragraph_num=paragraph_num,
                paragraph=paragraph,
                ner_dict=ner_dict,
                start_ix=start_ix,
                end_ix=end_ix,
                sentence_indices=sentence_indices,
                document_metadata=document_metadata,
            )

        text_field = TextField([Token(word) for word in paragraph],
                               self._token_indexers)

        metadata_field = MetadataField(
            dict(
                doc_id=document_metadata["doc_id"],
                paragraph_num=paragraph_num,
                paragraph=paragraph,
                start_pos_in_doc=start_ix,
                end_pos_in_doc=end_ix,
                ner_dict=ner_dict,
                sentence_indices=sentence_indices,
                document_metadata=document_metadata,
                num_spans=len(ner_dict),
            ))

        ner_type_labels = spans_to_bio_tags(
            [(k[0] - start_ix, k[1] - start_ix, v[0])
             for k, v in ner_dict.items()], len(paragraph))

        ner_entity_field = SequenceLabelField(
            ner_type_labels, text_field, label_namespace="ner_type_labels")

        # Pull it all together.
        fields = dict(text=text_field,
                      ner_type_labels=ner_entity_field,
                      metadata=metadata_field)

        spans = []
        span_cluster_labels = []
        span_saliency_labels = []
        span_type_labels = []
        span_features = []

        entities_to_features_map = document_metadata[
            "entities_to_features_map"]
        cluster_name_to_id = document_metadata["cluster_name_to_id"]
        relation_to_cluster_ids = document_metadata["relation_to_cluster_ids"]
        span_to_cluster_ids = document_metadata["span_to_cluster_ids"]

        for (s, e), label in ner_dict.items():
            spans.append(
                SpanField(int(s - start_ix), int(e - start_ix - 1),
                          text_field))
            span_cluster_labels.append(
                MultiLabelField(
                    span_to_cluster_ids.get((s, e), []),
                    label_namespace="cluster_labels",
                    skip_indexing=True,
                    num_labels=len(cluster_name_to_id),
                ))
            span_saliency_labels.append(1 if label[-1] == "True" else 0)
            span_type_labels.append(label[0])
            span_features.append(
                MultiLabelField(entities_to_features_map[(s, e)],
                                label_namespace="section_feature_labels",
                                num_labels=5))

        if len(spans) > 0:
            fields["spans"] = ListField(spans)
            fields["span_cluster_labels"] = ListField(span_cluster_labels)
            fields["span_saliency_labels"] = SequenceLabelField(
                span_saliency_labels,
                fields["spans"],
                label_namespace="span_saliency_labels")
            fields["span_type_labels"] = SequenceLabelField(
                span_type_labels,
                fields["spans"],
                label_namespace="span_type_labels")
            fields["span_features"] = ListField(span_features)
        else:  # Some paragraphs may not have anything !
            fields["spans"] = ListField(
                [SpanField(-1, -1, text_field).empty_field()]).empty_field()
            fields["span_cluster_labels"] = ListField([
                MultiLabelField(
                    [],
                    label_namespace="cluster_labels",
                    skip_indexing=True,
                    num_labels=len(cluster_name_to_id),
                )
            ])
            fields["span_saliency_labels"] = SequenceLabelField(
                [0], fields["spans"], label_namespace="span_saliency_labels")
            fields["span_type_labels"] = SequenceLabelField(
                ["Method"],
                fields["spans"],
                label_namespace="span_type_labels")
            fields["span_features"] = ListField([
                MultiLabelField([],
                                label_namespace="section_feature_labels",
                                num_labels=5)
            ])

        if len(relation_to_cluster_ids) > 0:
            fields["relation_to_cluster_ids"] = ListField([
                MultiLabelField(
                    v,
                    label_namespace="cluster_labels",
                    skip_indexing=True,
                    num_labels=len(cluster_name_to_id),
                ) for k, v in relation_to_cluster_ids.items()
            ])

        return Instance(fields)
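``spans_to_bio_tags`` is project-local; a plausible minimal equivalent is shown below purely to illustrate the span-to-BIO conversion. Exclusive end offsets are an assumption, inferred from the ``e - start_ix - 1`` adjustment the reader applies when building its ``SpanField``s:

from typing import List, Tuple

def spans_to_bio_tags(spans: List[Tuple[int, int, str]], length: int) -> List[str]:
    """Convert (start, end, label) spans into BIO tags over `length`
    tokens, assuming exclusive end offsets."""
    tags = ["O"] * length
    for start, end, label in spans:
        tags[start] = "B-" + label
        for i in range(start + 1, end):
            tags[i] = "I-" + label
    return tags

assert spans_to_bio_tags([(1, 3, "Method")], 4) == ["O", "B-Method", "I-Method", "O"]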
Example #22
    def text_to_instance(self, sentence: List[str],
                         ner_dict: Dict[Tuple[int, int],
                                        str], relation_dict, doc_key: str,
                         dataset: str, sentence_num: int, groups: List[str],
                         start_ix: int, end_ix: int, tree: Dict[str, Any],
                         children_dict: Dict[Tuple[int, int],
                                             List[Tuple[int, int]]],
                         dep_children_dict: Dict[Tuple[int, int],
                                                 List[Tuple[int, int]]],
                         tf_dict: Dict[Tuple[int, int], Any]):

        sentence = [self._normalize_word(word) for word in sentence]

        text_field = TextField([Token(word) for word in sentence],
                               self._token_indexers)
        text_field_with_context = TextField([Token(word) for word in groups],
                                            self._token_indexers)

        # Put together the metadata.
        metadata = dict(sentence=sentence,
                        ner_dict=ner_dict,
                        relation_dict=relation_dict,
                        doc_key=doc_key,
                        dataset=dataset,
                        groups=groups,
                        start_ix=start_ix,
                        end_ix=end_ix,
                        sentence_num=sentence_num,
                        tree=tree,
                        children_dict=children_dict,
                        dep_children_dict=dep_children_dict)
        metadata_field = MetadataField(metadata)

        # Generate fields for text spans, ner labels
        spans = []
        span_ner_labels = []
        span_children_labels = []
        raw_spans = []

        for start, end in enumerate_spans(sentence,
                                          max_span_width=self._max_span_width):
            span_ix = (start, end)
            span_ner_labels.append(ner_dict[span_ix])
            spans.append(SpanField(start, end, text_field))
            raw_spans.append(span_ix)

        span_field = ListField(spans)

        for span in raw_spans:

            if len(children_dict[span]) == 0:
                children_field = ListField([IndexField(-1, span_field)])
            else:
                children_field = []
                for children_span in children_dict[span]:
                    if children_span in raw_spans:
                        children_field.append(
                            IndexField(raw_spans.index(children_span),
                                       span_field))
                    else:
                        children_field.append(IndexField(-1, span_field))
                children_field = ListField(children_field)

            span_children_labels.append(children_field)

        n_tokens = len(sentence)
        candidate_indices = [(i, j) for i in range(n_tokens)
                             for j in range(n_tokens)]
        dep_adjs = []
        dep_adjs_indices = []
        tf_indices = []
        tf_features = []
        for token_pair in candidate_indices:
            dep_adj_label = dep_children_dict[token_pair]
            if dep_adj_label:
                dep_adjs_indices.append(token_pair)
                dep_adjs.append(dep_adj_label)

            feature = tf_dict[token_pair]
            if feature:
                tf_indices.append(token_pair)
                tf_features.append(feature)

        ner_label_field = SequenceLabelField(span_ner_labels,
                                             span_field,
                                             label_namespace="ner_labels")

        n_spans = len(spans)
        span_tuples = [(span.span_start, span.span_end) for span in spans]
        candidate_indices = [(i, j) for i in range(n_spans)
                             for j in range(n_spans)]

        relations = []
        relation_indices = []
        for i, j in candidate_indices:
            span_pair = (span_tuples[i], span_tuples[j])
            relation_label = relation_dict[span_pair]
            if relation_label:
                relation_indices.append((i, j))
                relations.append(relation_label)

        relation_label_field = AdjacencyField(
            indices=relation_indices,
            sequence_field=span_field,
            labels=relations,
            label_namespace="relation_labels")

        # Syntax
        span_children_field = ListField(span_children_labels)
        dep_span_children_field = AdjacencyField(
            indices=dep_adjs_indices,
            sequence_field=text_field,
            labels=dep_adjs,
            label_namespace="dep_adj_labels")

        tf_field = AdjacencyField(indices=tf_indices,
                                  sequence_field=text_field,
                                  labels=tf_features,
                                  label_namespace="tf_labels")

        fields = dict(text=text_field_with_context,
                      spans=span_field,
                      ner_labels=ner_label_field,
                      relation_labels=relation_label_field,
                      metadata=metadata_field,
                      span_children=span_children_field,
                      dep_span_children=dep_span_children_field,
                      tf=tf_field)

        return Instance(fields)
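The sparse-adjacency pattern this reader (and the next one) relies on, in isolation: enumerate all candidate pairs, keep only those with a truthy label, and hand the parallel index/label lists to an ``AdjacencyField``. The toy dict below mirrors a reader that stores '' for absent arcs:

from collections import defaultdict

dep_children_dict = defaultdict(str)  # (head, child) token pairs -> label
dep_children_dict[(0, 1)] = "dep"
dep_children_dict[(1, 2)] = "dep"

n_tokens = 3
indices, labels = [], []
for pair in ((i, j) for i in range(n_tokens) for j in range(n_tokens)):
    label = dep_children_dict[pair]
    if label:  # only non-empty pairs become AdjacencyField entries
        indices.append(pair)
        labels.append(label)

assert indices == [(0, 1), (1, 2)] and labels == ["dep", "dep"]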
Example #23
    def text_to_instance(self, sentence: List[str],
                         ner_dict: Dict[Tuple[int, int],
                                        str], relation_dict, cluster_dict,
                         trigger_dict, argument_dict, doc_key: str,
                         dataset: str, sentence_num: int, groups: List[str],
                         start_ix: int, end_ix: int, tree: Dict[str, Any],
                         syntax_dict: Dict[Tuple[int, int], str],
                         children_dict: Dict[Tuple[int, int],
                                             List[Tuple[int, int]]],
                         dep_children_dict: Dict[Tuple[int, int],
                                                 List[Tuple[int, int]]],
                         tf_dict: Dict[Tuple[int, int], Any]):
        """
        TODO(dwadden) document me.
        """

        sentence = [self._normalize_word(word) for word in sentence]

        text_field = TextField([Token(word) for word in sentence],
                               self._token_indexers)
        text_field_with_context = TextField([Token(word) for word in groups],
                                            self._token_indexers)

        # feili, NER labels. One label per token
        ner_sequence_labels = self._generate_ner_label(sentence, ner_dict)
        ner_sequence_label_field = SequenceLabelField(
            ner_sequence_labels,
            text_field,
            label_namespace="ner_sequence_labels")

        # Put together the metadata.
        metadata = dict(sentence=sentence,
                        ner_dict=ner_dict,
                        relation_dict=relation_dict,
                        cluster_dict=cluster_dict,
                        trigger_dict=trigger_dict,
                        argument_dict=argument_dict,
                        doc_key=doc_key,
                        dataset=dataset,
                        groups=groups,
                        start_ix=start_ix,
                        end_ix=end_ix,
                        sentence_num=sentence_num,
                        seq_dict=ner_sequence_labels,
                        tree=tree,
                        syntax_dict=syntax_dict,
                        children_dict=children_dict,
                        dep_children_dict=dep_children_dict)
        metadata_field = MetadataField(metadata)

        # Trigger labels. One label per token in the input.
        token_trigger_labels = []
        for i in range(len(text_field)):
            token_trigger_labels.append(trigger_dict[i])

        trigger_label_field = SequenceLabelField(
            token_trigger_labels, text_field, label_namespace="trigger_labels")

        # Generate fields for text spans, ner labels, coref labels.
        spans = []
        span_ner_labels = []
        # feili
        span_labels = []
        span_coref_labels = []
        span_syntax_labels = []
        span_children_labels = []
        span_tree_labels = []
        raw_spans = []
        assert len(syntax_dict) == len(children_dict)
        for start, end in enumerate_spans(sentence,
                                          max_span_width=self._max_span_width):
            span_ix = (start, end)
            # here we need to consider how to use the tree info,
            # e.g. when use_tree is set, whether the span is in the tree (match true or false)
            # if self._tree_span_filter and not self._is_span_in_tree(span_ix, syntax_dict, children_dict):
            #     if len(raw_spans) == 0: # in case that there is no span for this instance
            #         pass
            #     else:
            #         continue
            span_tree_labels.append('1' if self._is_span_in_tree(
                span_ix, syntax_dict, children_dict) else '')

            span_ner_labels.append(ner_dict[span_ix])
            span_labels.append('' if ner_dict[span_ix] == '' else '1')
            span_coref_labels.append(cluster_dict[span_ix])
            spans.append(SpanField(start, end, text_field))
            span_syntax_labels.append(syntax_dict[span_ix])
            raw_spans.append(span_ix)

        span_field = ListField(spans)

        for span in raw_spans:

            if len(children_dict[span]) == 0:
                children_field = ListField([IndexField(-1, span_field)])
            else:
                children_field = []
                for children_span in children_dict[span]:
                    if children_span in raw_spans:
                        children_field.append(
                            IndexField(raw_spans.index(children_span),
                                       span_field))
                    else:
                        children_field.append(IndexField(-1, span_field))
                children_field = ListField(children_field)

            span_children_labels.append(children_field)

        n_tokens = len(sentence)
        candidate_indices = [(i, j) for i in range(n_tokens)
                             for j in range(n_tokens)]
        dep_adjs = []
        dep_adjs_indices = []
        tf_indices = []
        tf_features = []
        for token_pair in candidate_indices:
            dep_adj_label = dep_children_dict[token_pair]
            if dep_adj_label:
                dep_adjs_indices.append(token_pair)
                dep_adjs.append(dep_adj_label)

            feature = tf_dict[token_pair]
            if feature:
                tf_indices.append(token_pair)
                tf_features.append(feature)

        ner_label_field = SequenceLabelField(span_ner_labels,
                                             span_field,
                                             label_namespace="ner_labels")
        coref_label_field = SequenceLabelField(span_coref_labels,
                                               span_field,
                                               label_namespace="coref_labels")
        # feili
        span_label_field = SequenceLabelField(span_labels,
                                              span_field,
                                              label_namespace="span_labels")

        # Generate labels for relations and arguments. Only store non-null values.
        # For the arguments, by convention the first span specifies the trigger, and the second
        # specifies the argument. Ideally we'd have an adjacency field between (token, span) pairs
        # for the event arguments field, but AllenNLP doesn't make it possible to express
        # adjacencies between two different sequences.
        n_spans = len(spans)
        span_tuples = [(span.span_start, span.span_end) for span in spans]
        candidate_indices = [(i, j) for i in range(n_spans)
                             for j in range(n_spans)]

        relations = []
        relation_indices = []
        for i, j in candidate_indices:
            span_pair = (span_tuples[i], span_tuples[j])
            relation_label = relation_dict[span_pair]
            if relation_label:
                relation_indices.append((i, j))
                relations.append(relation_label)

        relation_label_field = AdjacencyField(
            indices=relation_indices,
            sequence_field=span_field,
            labels=relations,
            label_namespace="relation_labels")

        arguments = []
        argument_indices = []
        n_tokens = len(sentence)
        candidate_indices = [(i, j) for i in range(n_tokens)
                             for j in range(n_spans)]
        for i, j in candidate_indices:
            token_span_pair = (i, span_tuples[j])
            argument_label = argument_dict[token_span_pair]
            if argument_label:
                argument_indices.append((i, j))
                arguments.append(argument_label)

        argument_label_field = AdjacencyFieldAssym(
            indices=argument_indices,
            row_field=text_field,
            col_field=span_field,
            labels=arguments,
            label_namespace="argument_labels")

        # Syntax
        span_syntax_field = SequenceLabelField(
            span_syntax_labels,
            span_field,
            label_namespace="span_syntax_labels")
        span_children_field = ListField(span_children_labels)
        span_tree_field = SequenceLabelField(
            span_tree_labels, span_field, label_namespace="span_tree_labels")
        dep_span_children_field = AdjacencyField(
            indices=dep_adjs_indices,
            sequence_field=text_field,
            labels=dep_adjs,
            label_namespace="dep_adj_labels")

        tf_field = AdjacencyField(indices=tf_indices,
                                  sequence_field=text_field,
                                  labels=tf_features,
                                  label_namespace="tf_labels")

        # Pull it all together.
        fields = dict(
            text=text_field_with_context,
            spans=span_field,
            ner_labels=ner_label_field,
            coref_labels=coref_label_field,
            trigger_labels=trigger_label_field,
            argument_labels=argument_label_field,
            relation_labels=relation_label_field,
            metadata=metadata_field,
            span_labels=span_label_field,
            ner_sequence_labels=ner_sequence_label_field,
            syntax_labels=span_syntax_field,
            span_children=span_children_field,
            span_tree_labels=span_tree_field,
            dep_span_children=dep_span_children_field,
            tf=tf_field)

        return Instance(fields)
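A sketch of the children-index resolution both span-based readers above perform: each candidate span's children are mapped to positions in the enumerated span list, with -1 (AllenNLP's empty/padding index, matching the ``IndexField(-1, span_field)`` fallback) for children outside the candidate set:

from typing import Dict, List, Tuple

Span = Tuple[int, int]

def resolve_children(raw_spans: List[Span],
                     children_dict: Dict[Span, List[Span]]) -> List[List[int]]:
    """For each candidate span, the indices of its children within
    raw_spans; -1 marks a missing or out-of-candidate child."""
    resolved = []
    for span in raw_spans:
        children = children_dict.get(span, [])
        if not children:
            resolved.append([-1])
        else:
            resolved.append([raw_spans.index(c) if c in raw_spans else -1
                             for c in children])
    return resolved

spans = [(0, 1), (2, 3), (0, 3)]
children = {(0, 3): [(0, 1), (2, 3), (4, 5)]}  # (4, 5) is not a candidate
assert resolve_children(spans, children) == [[-1], [-1], [0, 1, -1]]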
    def text_to_instance(self,
                         question_text: str,
                         passage_text: str,
                         passage_tokens: List[Token],
                         numbers_in_passage: List[Any],
                         number_words: List[str],
                         number_indices: List[int],
                         number_len: List[int],
                         question_id: str = None,
                         passage_id: str = None,
                         answer_annotations: List[Dict] = None,
                         specific_answer_type: str = None) -> Optional[Instance]:
        # Tokenize question and passage

        question_tokens = self.tokenizer.tokenize(question_text)
        question_tokens = fill_token_indices(question_tokens, question_text, self._uncased, self.basic_tokenizer)

        qlen = len(question_tokens)

        qp_tokens = [Token('[CLS]')] + question_tokens + [Token('[SEP]')] + passage_tokens

        # if qp has more than max_pieces tokens (including CLS and SEP), clip the passage
        max_passage_length = -1
        if len(qp_tokens) > self.max_pieces - 1:
            qp_tokens = qp_tokens[:self.max_pieces - 1]
            passage_tokens = passage_tokens[:self.max_pieces - qlen - 3]
            plen = len(passage_tokens)
            number_indices, number_len, numbers_in_passage = \
                clipped_passage_num(number_indices, number_len, numbers_in_passage, plen)
            max_passage_length = token_to_span(passage_tokens[-1])[1] if plen > 0 else 0
        
        qp_tokens += [Token('[SEP]')]
        # update the indices of the numbers with respect to the question.
        # Not done in-place so they won't change the numbers saved for the passage
        number_indices = [index + qlen + 2 for index in number_indices] + [-1]
        number_len = number_len + [1]
        numbers_in_passage = numbers_in_passage + [0]
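        # e.g. with qlen = 3, a passage number at token index 5 moves to
        # 5 + 3 + 2 = 10, since [CLS], three question wordpieces and [SEP]
        # precede the passage; the appended -1 index and dummy 0 number
        # keep the number lists non-empty even for number-free passages.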
        number_tokens = [Token(str(number)) for number in numbers_in_passage]
        extra_number_tokens = [Token(str(num)) for num in self.extra_numbers]
        
        mask_indices = [0, qlen + 1, len(qp_tokens) - 1]
        
        fields: Dict[str, Field] = {}
            
        # Add feature fields
        qp_field = TextField(qp_tokens, self.token_indexers)
        fields["question_passage"] = qp_field
       
        number_token_indices = \
            [ArrayField(np.arange(start_ind, start_ind + number_len[i]), padding_value=-1) 
             for i, start_ind in enumerate(number_indices)]
        fields["number_indices"] = ListField(number_token_indices)
        numbers_in_passage_field = TextField(number_tokens, self.token_indexers)
        extra_numbers_field = TextField(extra_number_tokens, self.token_indexers)
        mask_index_fields: List[Field] = [IndexField(index, qp_field) for index in mask_indices]
        fields["mask_indices"] = ListField(mask_index_fields)

        # Compile question, passage, answer metadata
        metadata = {"original_passage": passage_text,
                    "original_question": question_text,
                    "original_numbers": numbers_in_passage,
                    "original_number_words": number_words,
                    "extra_numbers": self.extra_numbers,
                    "passage_tokens": passage_tokens,
                    "question_tokens": question_tokens,
                    "question_passage_tokens": qp_tokens,
                    "passage_id": passage_id,
                    "question_id": question_id,
                    "max_passage_length": max_passage_length}

        # in a word broken up into pieces, every piece except the first should be ignored when calculating the loss
        wordpiece_mask = [not token.text.startswith('##') for token in qp_tokens]
        wordpiece_mask = np.array(wordpiece_mask)
        fields['bio_wordpiece_mask'] = ArrayField(wordpiece_mask, dtype=np.int64)

        if answer_annotations:            
            # Get answer type, answer text, tokenize
            # For multi-span, remove repeating answers. Although possible, in the dataset it is mostly mistakes.
            if answer_annotations[0]['yesno']:
                answer_type = YESNO_ANSER_TYPE
                answer_texts = 'true' if answer_annotations[0]['yesno'] == '1' else 'false'
            else:
                answer_type, answer_texts = DropReader.extract_answer_info_from_annotation(answer_annotations[0])

            if answer_type == SPAN_ANSWER_TYPE:
                answer_texts = list(OrderedDict.fromkeys(answer_texts))
            tokenized_answer_texts = []
            for answer_text in answer_texts:
                answer_tokens = self.tokenizer.tokenize(answer_text)
                tokenized_answer_text = ' '.join(token.text for token in answer_tokens)
                if tokenized_answer_text not in tokenized_answer_texts and tokenized_answer_text != '':
                    tokenized_answer_texts.append(tokenized_answer_text)

            metadata["answer_annotations"] = answer_annotations
            metadata["answer_texts"] = answer_texts
            metadata["answer_tokens"] = tokenized_answer_texts
        
            # Find unit text in question
            if answer_annotations[0]['unit'] != '':
                valid_unit_spans = DropReader.find_valid_spans(question_tokens, [answer_annotations[0]['unit']])
                # shift by 1 since there is a [CLS] token at the front
                valid_unit_spans = [(unit_span[0]+1, unit_span[1]+1) for unit_span in valid_unit_spans]
            else:
                valid_unit_spans = []
            # Find answer text in question and passage
            valid_question_spans = DropReader.find_valid_spans(question_tokens, tokenized_answer_texts)
            for span_ind, span in enumerate(valid_question_spans):
                valid_question_spans[span_ind] = (span[0] + 1, span[1] + 1)
            valid_passage_spans = DropReader.find_valid_spans(passage_tokens, tokenized_answer_texts)
            for span_ind, span in enumerate(valid_passage_spans):
                valid_passage_spans[span_ind] = (span[0] + qlen + 2, span[1] + qlen + 2)

            # throw away an instance in training if a span appearing in the answer is missing from the question and passage
            if self._is_training:
                if specific_answer_type in SPAN_ANSWER_TYPES:
                    for tokenized_answer_text in tokenized_answer_texts:
                        temp_spans = DropReader.find_valid_spans(qp_field, [tokenized_answer_text])
                        if len(temp_spans) == 0:
                            return None

            # Get target numbers
            target_numbers = []
            if specific_answer_type != MULTIPLE_SPAN or self.multispan_allow_all_heads_to_answer:
                for answer_text in answer_texts:
                    number = self.word_to_num(answer_text, self.improve_number_extraction)
                    if number is not None:
                        target_numbers.append(number)
            
            # Get possible ways to arrive at target numbers with add/sub
            valid_expressions: List[List[int]] = []
            exp_strings = None
            if answer_type in ["number", "date"]:
                if self.target_number_rounding:
                    valid_expressions = \
                        find_valid_add_sub_expressions_with_rounding(
                            self.extra_numbers + numbers_in_passage,
                            target_numbers,
                            self.max_numbers_expression)
                else:
                    valid_expressions = \
                        DropReader.find_valid_add_sub_expressions(self.extra_numbers + numbers_in_passage,
                                                                  target_numbers,
                                                                  self.max_numbers_expression)
                if self.discard_impossible_number_questions:
                    # The train set was verified to have all of its target_numbers lists of length 1.
                    if (answer_type == "number" and
                            len(valid_expressions) == 0 and
                            self._is_training and
                            target_numbers and
                            self.max_count < target_numbers[0]):
                        # The number to predict can't be derived from any head, so we shouldn't train on it.
                        # arithmetic - no expressions that yield the number to predict.
                        # counting - the maximal count is smaller than the number to predict.

                        # However, although the answer is marked in the dataset as a number type answer,
                        # maybe it cannot be found due to a bug in DROP's text parsing.
                        # So in addition, we try to find the answer as a span in the text.
                        # If the answer is indeed a span in the text, we don't discard that question.
                        if len(valid_question_spans) == 0 and len(valid_passage_spans) == 0:
                            return None
                        if not self.keep_impossible_number_questions_which_exist_as_spans:
                            return None

            # Get possible ways to arrive at target numbers with counting
            valid_counts: List[int] = []
            if answer_type in ["number"]:
                numbers_for_count = list(range(self.max_count + 1))
                valid_counts = DropReader.find_valid_counts(numbers_for_count, target_numbers)

            valid_yesno: int = -1
            if answer_type in ["yesno"]:
                valid_yesno = 1 if 'true' in answer_texts else 0

            # Update metadata with answer info
            answer_info = {"answer_passage_spans": valid_passage_spans,
                           "answer_question_spans": valid_question_spans,
                           "expressions": valid_expressions,
                           "counts": valid_counts,
                           "unit": valid_unit_spans,
                           "yesno": valid_yesno}

            metadata["answer_info"] = answer_info
        
            # Add answer fields
            passage_span_fields: List[Field] = []
            if specific_answer_type != MULTIPLE_SPAN or self.multispan_allow_all_heads_to_answer:
                passage_span_fields = [SpanField(span[0], span[1], qp_field) for span in valid_passage_spans]
            if not passage_span_fields:
                passage_span_fields.append(SpanField(-1, -1, qp_field))
            fields["answer_as_passage_spans"] = ListField(passage_span_fields)

            question_span_fields: List[Field] = []
            if specific_answer_type != MULTIPLE_SPAN or self.multispan_allow_all_heads_to_answer:
                question_span_fields = [SpanField(span[0], span[1], qp_field) for span in valid_question_spans]
            if not question_span_fields:
                question_span_fields.append(SpanField(-1, -1, qp_field))
            fields["answer_as_question_spans"] = ListField(question_span_fields)
            
            add_sub_signs_field: List[Field] = []
            extra_signs_field: List[Field] = []
            for signs_for_one_add_sub_expressions in valid_expressions:
                extra_signs = signs_for_one_add_sub_expressions[:len(self.extra_numbers)]
                normal_signs = signs_for_one_add_sub_expressions[len(self.extra_numbers):]
                add_sub_signs_field.append(SequenceLabelField(normal_signs, numbers_in_passage_field))
                extra_signs_field.append(SequenceLabelField(extra_signs, extra_numbers_field))
            if not add_sub_signs_field:
                add_sub_signs_field.append(SequenceLabelField([0] * len(number_tokens), numbers_in_passage_field))
            if not extra_signs_field:
                extra_signs_field.append(SequenceLabelField([0] * len(self.extra_numbers), extra_numbers_field))
            fields["answer_as_expressions"] = ListField(add_sub_signs_field)
            if self.extra_numbers:
                fields["answer_as_expressions_extra"] = ListField(extra_signs_field)


            # Add unit span fields
            unit_span_fields: List[Field] = [SpanField(span[0], span[1], qp_field) for span in valid_unit_spans]
            if not unit_span_fields:
                unit_span_fields.append(SpanField(-1, -1, qp_field))
                
            fields["answer_as_unit_spans"] = ListField(unit_span_fields)

            count_fields: List[Field] = [LabelField(count_label, skip_indexing=True) for count_label in valid_counts]
            if not count_fields:
                count_fields.append(LabelField(-1, skip_indexing=True))
            fields["answer_as_counts"] = ListField(count_fields)
            
            yesno_field: List[Field] = [LabelField(valid_yesno, skip_indexing=True)]
            fields["answer_as_yesno"] = ListField(yesno_field)
            

            no_answer_bios = SequenceLabelField([0] * len(qp_tokens), sequence_field=qp_field)
            if (specific_answer_type in self.bio_types) and (len(valid_passage_spans) > 0 or len(valid_question_spans) > 0):
                
                # Used for flexible BIO loss
                # START
                
                spans_dict = {}
                text_to_disjoint_bios: List[ListField] = []
                flexibility_count = 1
                for tokenized_answer_text in tokenized_answer_texts:
                    spans = DropReader.find_valid_spans(qp_tokens, [tokenized_answer_text])
                    if len(spans) == 0:
                        # possible if the passage was clipped, but not for all of the answers
                        continue
                    spans_dict[tokenized_answer_text] = spans

                    disjoint_bios: List[SequenceLabelField] = []
                    for span in spans:
                        bios = create_bio_labels([span], len(qp_field))
                        disjoint_bios.append(SequenceLabelField(bios, sequence_field=qp_field))

                    text_to_disjoint_bios.append(ListField(disjoint_bios))
                    flexibility_count *= ((2**len(spans)) - 1)

                fields["answer_as_text_to_disjoint_bios"] = ListField(text_to_disjoint_bios)

                if (flexibility_count < self.flexibility_threshold):
                    # generate all non-empty span combinations per each text
                    spans_combinations_dict = {}
                    for key, spans in spans_dict.items():
                        all_combinations = []
                        for i in range(1, len(spans) + 1):
                            all_combinations += list(itertools.combinations(spans, i))
                        spans_combinations_dict[key] = all_combinations

                    # take the cartesian product of the span combinations across the answer texts
                    packed_gold_spans_list = itertools.product(*list(spans_combinations_dict.values()))
                    bios_list: List[SequenceLabelField] = []
                    for packed_gold_spans in packed_gold_spans_list:
                        gold_spans = [s for sublist in packed_gold_spans for s in sublist]
                        bios = create_bio_labels(gold_spans, len(qp_field))
                        bios_list.append(SequenceLabelField(bios, sequence_field=qp_field))
                    
                    fields["answer_as_list_of_bios"] = ListField(bios_list)
                    fields["answer_as_text_to_disjoint_bios"] = ListField([ListField([no_answer_bios])])
                else:
                    fields["answer_as_list_of_bios"] = ListField([no_answer_bios])

                # END

                # Used for both "require-all" BIO loss and flexible loss
                bio_labels = create_bio_labels(valid_question_spans + valid_passage_spans, len(qp_field))
                fields['span_bio_labels'] = SequenceLabelField(bio_labels, sequence_field=qp_field)

                fields["is_bio_mask"] = LabelField(1, skip_indexing=True)
            else:
                fields["answer_as_text_to_disjoint_bios"] = ListField([ListField([no_answer_bios])])
                fields["answer_as_list_of_bios"] = ListField([no_answer_bios])

                # create all 'O' BIO labels for non-span questions
                fields['span_bio_labels'] = no_answer_bios
                fields["is_bio_mask"] = LabelField(0, skip_indexing=True)

        fields["metadata"] = MetadataField(metadata)
        
        return Instance(fields)
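
The create_bio_labels helper called throughout the example above is defined elsewhere in that codebase. A minimal sketch of the behavior the reader relies on, assuming spans are inclusive (start, end) token index pairs and that 0/1/2 encode O/B/I:

from typing import List, Tuple


def create_bio_labels(spans: List[Tuple[int, int]], n_labels: int) -> List[int]:
    """Hypothetical sketch: mark each span's start as B and its interior as I."""
    labels = [0] * n_labels                  # 0 = O, outside any answer span
    for start, end in spans:
        if start >= n_labels:
            continue                         # span fell entirely outside a clipped passage
        labels[start] = 1                    # 1 = B, span start
        for i in range(start + 1, min(end, n_labels - 1) + 1):
            labels[i] = 2                    # 2 = I, span continuation
    return labels
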
Example #25
    def predictions_to_labeled_instances(
            self, instance: Instance,
            outputs: Dict[str, numpy.ndarray]) -> List[Instance]:
        """
        This function currently only handles BIOUL tags.

        Imagine an NER model predicts three named entities (each one with potentially
        multiple tokens). For each individual entity, we create a new Instance that has
        the label set to only that entity and the rest of the tokens are labeled as outside.
        We then return a list of those Instances.

        For example:

        ```text
        Mary  went to Seattle to visit Microsoft Research
        U-Per  O    O   U-Loc  O   O     B-Org     L-Org
        ```

        We create three instances.

        ```text
        Mary  went to Seattle to visit Microsoft Research
        U-Per  O    O    O     O   O       O         O

        Mary  went to Seattle to visit Microsoft Research
        O      O    O   U-Loc  O   O       O         O

        Mary  went to Seattle to visit Microsoft Research
        O      O    O    O     O   O     B-Org     L-Org
        ```

        We additionally add a flag to these instances to tell the model to only compute loss on
        non-O tags, so that we get gradients that are specific to the particular span prediction
        that each instance represents.
        """
        predicted_tags = outputs["tags"]
        predicted_spans = []

        i = 0
        while i < len(predicted_tags):
            tag = predicted_tags[i]
            # if it's a U, the token is a complete span; add it to the list
            if tag[0] == "U":
                current_tags = [
                    t if idx == i else "O"
                    for idx, t in enumerate(predicted_tags)
                ]
                predicted_spans.append(current_tags)
            # if it's a B, keep going until you hit an L.
            elif tag[0] == "B":
                begin_idx = i
                while tag[0] != "L":
                    i += 1
                    tag = predicted_tags[i]
                end_idx = i
                current_tags = [
                    t if begin_idx <= idx <= end_idx else "O"
                    for idx, t in enumerate(predicted_tags)
                ]
                predicted_spans.append(current_tags)
            i += 1

        # Create a new instance for each predicted span
        instances = []
        for labels in predicted_spans:
            new_instance = deepcopy(instance)
            text_field: TextField = instance["tokens"]  # type: ignore
            new_instance.add_field("tags",
                                   SequenceLabelField(labels, text_field),
                                   self._model.vocab)
            new_instance.add_field("ignore_loss_on_o_tags", FlagField(True))
            instances.append(new_instance)

        return instances
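
To make the masking concrete, here is a toy walk-through of the extraction loop above (the tag values are illustrative only):

# Three predicted entities in one BIOUL tag sequence.
predicted_tags = ["U-Per", "O", "O", "U-Loc", "O", "O", "B-Org", "L-Org"]

# Feeding this through the while-loop above yields three tag lists, each
# keeping exactly one entity and resetting every other position to "O":
#   ["U-Per", "O", "O", "O",     "O", "O", "O",     "O"    ]
#   ["O",     "O", "O", "U-Loc", "O", "O", "O",     "O"    ]
#   ["O",     "O", "O", "O",     "O", "O", "B-Org", "L-Org"]
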
Example #26
    def text_to_instance(self,  # type: ignore
                         source_string: str,
                         gold_spans: Dict[Tuple[int, int], str],
                         scene_string: str,
                         answer: str,
                         program: str) -> Instance:
        """Turns raw source string and target string into an ``Instance``."""
        tokens = self.tokenizer.tokenize(source_string)
        word_pieces = self._get_wordpieces(source_string)
        word_pieces_tokens = ([Token('[CLS]')] +
                              [Token(wp) for wp in word_pieces] +
                              [Token('[SEP]')])

        text_field = TextField(tokens, self._token_indexers)
        wp_field = TextField(word_pieces_tokens, self._token_indexers)
        fields: Dict[str, Field] = {"tokens": text_field}

        if gold_spans is None:
            constants = self._domain_utils.get_constants(program)

        spans: List[Field] = []
        gold_labels = []

        for start, end in enumerate_spans(word_pieces):
            # Shift by 1 due to CLS token
            spans.append(SpanField(start + 1, end + 1, wp_field))

            if gold_spans is not None:
                # Shift by 1 due to CLS token
                gold_labels.append(
                    gold_spans.get((start + 1, end + 1), "NO-LABEL"))
            else:
                # Create random labels for each span so that the label vocabulary
                # is still collected. When no more true labels are left, draw between
                # NO-LABEL and span. These randomly assigned labels are ignored
                # during training.
                if constants[0]:
                    gold_labels.append(constants[0].pop())
                else:
                    rand_label = np.random.choice(a=["NO-LABEL", "span"],
                                                  size=1,
                                                  p=[0.7, 0.3])
                    gold_labels.append(rand_label[0])

        span_list_field: ListField = ListField(spans)
        fields["spans"] = span_list_field

        fields["span_labels"] = SequenceLabelField(
            gold_labels,
            span_list_field,
            label_namespace="labels",
        )

        metadata = {
            "tokens": word_pieces,
            "scene_str": scene_string,
            "answer": answer
        }
        if program:
            metadata["program"] = program
        if gold_spans:
            metadata["gold_spans"] = gold_spans

        fields["metadata"] = MetadataField(metadata)

        return Instance(fields)
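
For reference, enumerate_spans yields (start, end) pairs with inclusive end indices, and the + 1 shift above makes room for the prepended [CLS] token. A minimal stand-in sketch (not the AllenNLP function itself):

from typing import Iterator, Sequence, Tuple


def enumerate_spans_sketch(tokens: Sequence[str],
                           max_span_width: int = 3) -> Iterator[Tuple[int, int]]:
    """Yield every span up to max_span_width tokens, with inclusive end indices."""
    for start in range(len(tokens)):
        for end in range(start, min(start + max_span_width, len(tokens))):
            yield start, end


word_pieces = ["what", "color", "is", "the", "cube"]
shifted = [(s + 1, e + 1) for s, e in enumerate_spans_sketch(word_pieces)]
# (0, 0) over "what" becomes (1, 1) once [CLS] occupies position 0.
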
Example #27
    def text_to_instance(
        self,  # type: ignore
        sentences: List[List[str]],
        gold_clusters: Optional[List[List[Tuple[int, int]]]] = None,
        *,
        mention_token_spans: Optional[Sequence[Tuple[int, int]]] = None
    ) -> Instance:  # pylint: disable=arguments-differ
        """
        Parameters
        ----------
        sentences : ``List[List[str]]``, required.
            A list of lists representing the tokenized words and sentences in the document.
        gold_clusters : ``Optional[List[List[Tuple[int, int]]]]``, optional (default = None)
            A list of all clusters in the document, represented as word spans. Each cluster
            contains some number of spans, which can be nested and overlap, but will never
            exactly match between clusters.
        mention_token_spans : ``Sequence[Tuple[int, int]]``, optional (default = None)
            A sequence of spans which should be considered for coref. This overrides
            the usual behavior of including all spans up to the maximum width. The spans
            should be specified as token indices with inclusive end token indices.

        Returns
        -------
        An ``Instance`` containing the following ``Fields``:
            text : ``TextField``
                The text of the full document.
            spans : ``ListField[SpanField]``
                A ListField containing the spans represented as ``SpanFields``
                with respect to the document text.
            span_labels : ``SequenceLabelField``, optional
                The id of the cluster which each possible span belongs to, or -1 if it does
                not belong to a cluster. As these labels have variable length (it depends on
                how many spans we are considering), we represent this as a ``SequenceLabelField``
                with respect to the ``spans`` ``ListField``.
        """
        flattened_sentences = [
            self._normalize_word(word) for sentence in sentences
            for word in sentence
        ]

        metadata: Dict[str, Any] = {"original_text": flattened_sentences}
        if gold_clusters is not None:
            metadata["clusters"] = gold_clusters

        text_field = TextField([Token(word) for word in flattened_sentences],
                               self._token_indexers)

        cluster_dict = {}
        if gold_clusters is not None:
            for cluster_id, cluster in enumerate(gold_clusters):
                for mention in cluster:
                    cluster_dict[tuple(mention)] = cluster_id

        span_fields: List[Field] = []
        span_labels: Optional[List[int]] = [] if gold_clusters is not None else None

        if mention_token_spans is None:
            # every possible span in the document up to a certain maximum size is a
            # mention candidate
            sentence_offset = 0
            for sentence in sentences:
                for start, end in enumerate_spans(
                        sentence,
                        offset=sentence_offset,
                        max_span_width=self._max_span_width):
                    if span_labels is not None:
                        if (start, end) in cluster_dict:
                            span_labels.append(cluster_dict[(start, end)])
                        else:
                            span_labels.append(-1)

                    span_fields.append(SpanField(start, end, text_field))
                sentence_offset += len(sentence)
        else:
            if span_labels is not None:
                raise NotImplementedError(
                    "We currently don't handle known mentions plus "
                    "gold labels")
            # the mentions spans are already known; we just need to make SpanFields for them
            span_fields = [
                SpanField(start, end, text_field)
                for (start, end) in mention_token_spans
            ]

        span_field = ListField(span_fields)
        metadata_field = MetadataField(metadata)

        fields: Dict[str, Field] = {
            "text": text_field,
            "spans": span_field,
            "metadata": metadata_field
        }
        if span_labels is not None:
            fields["span_labels"] = SequenceLabelField(span_labels, span_field)

        return Instance(fields)
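
A toy rendering of the cluster labeling above, with made-up mention spans:

# Each gold mention maps to its cluster id; every other candidate span gets -1.
gold_clusters = [[(0, 0), (5, 6)],   # cluster 0, e.g. "Mary" ... "the girl"
                 [(2, 3)]]           # cluster 1
cluster_dict = {tuple(mention): cluster_id
                for cluster_id, cluster in enumerate(gold_clusters)
                for mention in cluster}
candidate_spans = [(0, 0), (0, 1), (2, 3), (5, 6)]
span_labels = [cluster_dict.get(span, -1) for span in candidate_spans]
assert span_labels == [0, -1, 1, 0]
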
Example #28
    def text_to_instance(
            self,  # type: ignore
            tokens: List[Token],
            pos_tags: List[str] = None,
            chunk_tags: List[str] = None,
            ner_tags: List[str] = None) -> Instance:
        """
        We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
        """
        # pylint: disable=arguments-differ
        sequence = TextField(tokens, self._token_indexers)
        instance_fields: Dict[str, Field] = {'tokens': sequence}
        instance_fields["metadata"] = MetadataField(
            {"words": [x.text for x in tokens]})

        # Recode the labels if necessary.
        if self.coding_scheme == "BIOUL":
            coded_chunks = (to_bioul(chunk_tags, encoding=self._original_coding_scheme)
                            if chunk_tags is not None else None)
            coded_ner = (to_bioul(ner_tags, encoding=self._original_coding_scheme)
                         if ner_tags is not None else None)
        else:
            # the default IOB1
            coded_chunks = chunk_tags
            coded_ner = ner_tags

        # Add "feature labels" to instance
        if 'pos' in self.feature_labels:
            if pos_tags is None:
                raise ConfigurationError(
                    "Dataset reader was specified to use pos_tags as "
                    "features. Pass them to text_to_instance.")
            instance_fields['pos_tags'] = SequenceLabelField(
                pos_tags, sequence, "pos_tags")
        if 'chunk' in self.feature_labels:
            if coded_chunks is None:
                raise ConfigurationError(
                    "Dataset reader was specified to use chunk tags as "
                    "features. Pass them to text_to_instance.")
            instance_fields['chunk_tags'] = SequenceLabelField(
                coded_chunks, sequence, "chunk_tags")
        if 'ner' in self.feature_labels:
            if coded_ner is None:
                raise ConfigurationError(
                    "Dataset reader was specified to use NER tags as "
                    "features. Pass them to text_to_instance.")
            instance_fields['ner_tags'] = SequenceLabelField(
                coded_ner, sequence, "ner_tags")

        # Add "tag label" to instance
        if self.tag_label == 'ner' and coded_ner is not None:
            instance_fields['tags'] = SequenceLabelField(
                coded_ner, sequence, self.label_namespace)
        elif self.tag_label == 'pos' and pos_tags is not None:
            instance_fields['tags'] = SequenceLabelField(
                pos_tags, sequence, self.label_namespace)
        elif self.tag_label == 'chunk' and coded_chunks is not None:
            instance_fields['tags'] = SequenceLabelField(
                coded_chunks, sequence, self.label_namespace)

        return Instance(instance_fields)
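
The BIOUL recoding above makes chunk boundaries explicit. A hedged illustration of to_bioul (the import path varies across AllenNLP versions, so verify both it and the expected output against yours):

from allennlp.data.dataset_readers.dataset_utils import to_bioul

# In IOB1, chunk-initial tokens carry I- unless two same-type chunks are
# adjacent; BIOUL marks begin/inside/last tokens explicitly and tags
# unit-length chunks with U.
tags = ["I-PER", "I-PER", "O", "I-LOC"]
print(to_bioul(tags, encoding="IOB1"))
# expected: ['B-PER', 'L-PER', 'O', 'U-LOC']
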
Example #29
    def text_to_instance(
            self,
            question_text: str,
            passage_text: str,
            passage_tokens: List[Token],
            passage_spans: List[Tuple[int, int]],
            numbers_in_passage: List[Any],
            number_words: List[str],
            number_indices: List[int],
            number_len: List[int],
            question_id: str = None,
            passage_id: str = None,
            answer_annotations: List[Dict] = None) -> Union[Instance, None]:
        # Tokenize question and passage
        question_tokens = self.tokenizer.tokenize(question_text)
        qlen = len(question_tokens)
        plen = len(passage_tokens)

        question_passage_tokens = ([Token('[CLS]')] + question_tokens +
                                   [Token('[SEP]')] + passage_tokens)
        if len(question_passage_tokens) > self.max_pieces - 1:
            question_passage_tokens = question_passage_tokens[:self.max_pieces - 1]
            passage_tokens = passage_tokens[:self.max_pieces - qlen - 3]
            plen = len(passage_tokens)
            number_indices, number_len, numbers_in_passage = \
                clipped_passage_num(number_indices, number_len, numbers_in_passage, plen)

        question_passage_tokens += [Token('[SEP]')]
        number_indices = [index + qlen + 2 for index in number_indices] + [-1]
        # Not done in-place so they won't change the numbers saved for the passage
        number_len = number_len + [1]
        numbers_in_passage = numbers_in_passage + [0]
        number_tokens = [Token(str(number)) for number in numbers_in_passage]
        extra_number_tokens = [Token(str(num)) for num in self.extra_numbers]

        mask_indices = [0, qlen + 1, len(question_passage_tokens) - 1]

        if self.extract_spans:
            # adapt indexes to question_passage_tokens sequence
            passage_spans = [(span[0] + qlen + 2, span[1] + qlen + 2)
                             for span in passage_spans]
            # remove spans of truncated part of passage
            passage_spans = [
                span for span in passage_spans
                if span[1] <= len(question_passage_tokens)
            ]
            # make span indexes inclusive
            passage_spans = [(span[0], span[1] - 1) for span in passage_spans]

        fields: Dict[str, Field] = {}

        # Add feature fields
        question_passage_field = TextField(question_passage_tokens,
                                           self.token_indexers)
        fields["question_passage"] = question_passage_field

        if self.extract_spans:
            passage_span_fields = [
                SpanField(span[0], span[1], question_passage_field)
                for span in passage_spans
            ]
            fields["passage_spans"] = ListField(passage_span_fields)

        number_token_indices = \
            [ArrayField(np.arange(start_ind, start_ind + number_len[i]), padding_value=-1)
             for i, start_ind in enumerate(number_indices)]
        fields["number_indices"] = ListField(number_token_indices)
        numbers_in_passage_field = TextField(number_tokens,
                                             self.token_indexers)
        extra_numbers_field = TextField(extra_number_tokens,
                                        self.token_indexers)
        all_numbers_field = TextField(extra_number_tokens + number_tokens,
                                      self.token_indexers)
        mask_index_fields: List[Field] = [
            IndexField(index, question_passage_field) for index in mask_indices
        ]
        fields["mask_indices"] = ListField(mask_index_fields)

        # Compile question, passage, answer metadata
        metadata = {
            "original_passage": passage_text,
            "original_question": question_text,
            "original_numbers": numbers_in_passage,
            "original_number_words": number_words,
            "extra_numbers": self.extra_numbers,
            "passage_tokens": passage_tokens,
            "question_tokens": question_tokens,
            "question_passage_tokens": question_passage_tokens,
            "passage_id": passage_id,
            "question_id": question_id
        }

        if answer_annotations:
            for annotation in answer_annotations:
                tokenized_spans = [[
                    token.text for token in self.tokenizer.tokenize(answer)
                ] for answer in annotation['spans']]
                annotation['spans'] = [
                    tokenlist_to_passage(token_list)
                    for token_list in tokenized_spans
                ]

            # Get answer type, answer text, tokenize
            answer_type, answer_texts = DropReader.extract_answer_info_from_annotation(
                answer_annotations[0])
            tokenized_answer_texts = []
            num_spans = min(len(answer_texts), self.max_spans)
            for answer_text in answer_texts:
                answer_tokens = self.tokenizer.tokenize(answer_text)
                tokenized_answer_texts.append(' '.join(
                    token.text for token in answer_tokens))

            metadata["answer_annotations"] = answer_annotations
            metadata["answer_texts"] = answer_texts
            metadata["answer_tokens"] = tokenized_answer_texts

            # Find answer text in question and passage
            valid_question_spans = DropReader.find_valid_spans(
                question_tokens, tokenized_answer_texts)
            for span_ind, span in enumerate(valid_question_spans):
                valid_question_spans[span_ind] = (span[0] + 1, span[1] + 1)
            valid_passage_spans = DropReader.find_valid_spans(
                passage_tokens, tokenized_answer_texts)
            for span_ind, span in enumerate(valid_passage_spans):
                valid_passage_spans[span_ind] = (span[0] + qlen + 2,
                                                 span[1] + qlen + 2)

            # Get target numbers
            target_numbers = []
            for answer_text in answer_texts:
                number = self.word_to_num(answer_text)
                if number is not None:
                    target_numbers.append(number)

            # Get possible ways to arrive at target numbers with add/sub

            valid_expressions: List[List[int]] = []
            exp_strings = None
            if answer_type in ["number", "date"]:
                if self.exp_search == 'full':
                    expressions = get_full_exp(
                        list(enumerate(self.extra_numbers +
                                       numbers_in_passage)), target_numbers,
                        self.operations, self.op_dict, self.max_depth)
                    zipped = list(zip(*expressions))
                    if zipped:
                        valid_expressions = list(zipped[0])
                        exp_strings = list(zipped[1])
                elif self.exp_search == 'add_sub':
                    valid_expressions = \
                        DropReader.find_valid_add_sub_expressions(self.extra_numbers + numbers_in_passage,
                                                                  target_numbers,
                                                                  self.max_numbers_expression)
                elif self.exp_search == 'template':
                    valid_expressions, exp_strings = \
                        get_template_exp(self.extra_numbers + numbers_in_passage,
                                         target_numbers,
                                         self.templates,
                                         self.template_strings)
                    exp_strings = sum(exp_strings, [])

            # Get possible ways to arrive at target numbers with counting
            valid_counts: List[int] = []
            if answer_type in ["number"]:
                numbers_for_count = list(range(self.max_count + 1))
                valid_counts = DropReader.find_valid_counts(
                    numbers_for_count, target_numbers)

            # Update metadata with answer info
            answer_info = {
                "answer_passage_spans": valid_passage_spans,
                "answer_question_spans": valid_question_spans,
                "num_spans": num_spans,
                "expressions": valid_expressions,
                "counts": valid_counts
            }
            if self.exp_search in ['template', 'full']:
                answer_info['expr_text'] = exp_strings
            metadata["answer_info"] = answer_info

            # Add answer fields
            passage_span_fields: List[Field] = [
                SpanField(span[0], span[1], question_passage_field)
                for span in valid_passage_spans
            ]
            if not passage_span_fields:
                passage_span_fields.append(
                    SpanField(-1, -1, question_passage_field))
            fields["answer_as_passage_spans"] = ListField(passage_span_fields)

            question_span_fields: List[Field] = [
                SpanField(span[0], span[1], question_passage_field)
                for span in valid_question_spans
            ]
            if not question_span_fields:
                question_span_fields.append(
                    SpanField(-1, -1, question_passage_field))
            fields["answer_as_question_spans"] = ListField(
                question_span_fields)

            if self.exp_search == 'add_sub':
                add_sub_signs_field: List[Field] = []
                extra_signs_field: List[Field] = []
                for signs_for_one_add_sub_expressions in valid_expressions:
                    extra_signs = signs_for_one_add_sub_expressions[:len(
                        self.extra_numbers)]
                    normal_signs = signs_for_one_add_sub_expressions[
                        len(self.extra_numbers):]
                    add_sub_signs_field.append(
                        SequenceLabelField(normal_signs,
                                           numbers_in_passage_field))
                    extra_signs_field.append(
                        SequenceLabelField(extra_signs, extra_numbers_field))
                if not add_sub_signs_field:
                    add_sub_signs_field.append(
                        SequenceLabelField([0] * len(number_tokens),
                                           numbers_in_passage_field))
                if not extra_signs_field:
                    extra_signs_field.append(
                        SequenceLabelField([0] * len(self.extra_numbers),
                                           extra_numbers_field))
                fields["answer_as_expressions"] = ListField(
                    add_sub_signs_field)
                if self.extra_numbers:
                    fields["answer_as_expressions_extra"] = ListField(
                        extra_signs_field)
            elif self.exp_search in ['template', 'full']:
                expression_indices = []
                for expression in valid_expressions:
                    if not expression:
                        expression.append(3 * [-1])
                    expression_indices.append(
                        ArrayField(np.array(expression), padding_value=-1))
                if not expression_indices:
                    expression_indices = \
                        [ArrayField(np.array([3 * [-1]]), padding_value=-1) for _ in range(len(self.templates))]
                fields["answer_as_expressions"] = ListField(expression_indices)

            count_fields: List[Field] = [
                LabelField(count_label, skip_indexing=True)
                for count_label in valid_counts
            ]
            if not count_fields:
                count_fields.append(LabelField(-1, skip_indexing=True))
            fields["answer_as_counts"] = ListField(count_fields)

            fields["num_spans"] = LabelField(num_spans, skip_indexing=True)

        fields["metadata"] = MetadataField(metadata)

        return Instance(fields)
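
The sign vectors stored in "answer_as_expressions" carry one label per number: 0 = unused, 1 = plus, 2 = minus. A hand-worked illustration with made-up values:

# Reconstructing an expression from a sign vector, mirroring what an
# add/sub answer head would score.
numbers = [100.0, 7.0, 5.0, 30.0]   # extra_numbers + numbers_in_passage
signs = [0, 0, 2, 1]                # encodes 30 - 5
value = sum(n if s == 1 else -n
            for n, s in zip(numbers, signs) if s != 0)
assert value == 25.0
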
Example #30
    def text_to_instance(self, # type: ignore
                         tokens: List[str],
                         pos_tags: List[str] = None,
                         gold_tree: Tree = None) -> Instance:
        """
        We take `pre-tokenized` input here, because we don't have a tokenizer in this class.

        Parameters
        ----------
        tokens : ``List[str]``, required.
            The tokens in a given sentence.
        pos_tags : ``List[str]``, optional (default = None).
            The POS tags for the words in the sentence.
        gold_tree : ``Tree``, optional (default = None).
            The gold parse tree to create span labels from.

        Returns
        -------
        An ``Instance`` containing the following fields:
            tokens : ``TextField``
                The tokens in the sentence.
            pos_tags : ``SequenceLabelField``
                The POS tags of the words in the sentence.
                Only returned if ``use_pos_tags`` is ``True``.
            spans : ``ListField[SpanField]``
                A ListField containing all possible subspans of the
                sentence.
            span_labels : ``SequenceLabelField``, optional.
                The constituency tags for each of the possible spans, with
                respect to a gold parse tree. If a span is not contained
                within the tree, a span will have a ``NO-LABEL`` label.
        """
        # pylint: disable=arguments-differ
        text_field = TextField([Token(x) for x in tokens], token_indexers=self._token_indexers)
        fields: Dict[str, Field] = {"tokens": text_field}

        if self._use_pos_tags and pos_tags is not None:
            pos_tag_field = SequenceLabelField(pos_tags, text_field, "pos_tags")
            fields["pos_tags"] = pos_tag_field
        elif self._use_pos_tags:
            raise ConfigurationError("use_pos_tags was set to True but no gold pos"
                                     " tags were passed to the dataset reader.")
        spans: List[Field] = []
        gold_labels = []

        if gold_tree is not None:
            gold_spans_with_pos_tags: Dict[Tuple[int, int], str] = {}
            self._get_gold_spans(gold_tree, 0, gold_spans_with_pos_tags)
            gold_spans = {span: label for (span, label)
                          in gold_spans_with_pos_tags.items() if "-POS" not in label}
        else:
            gold_spans = None
        for start, end in enumerate_spans(tokens):
            spans.append(SpanField(start, end, text_field))

            if gold_spans is not None:
                if (start, end) in gold_spans:
                    gold_labels.append(gold_spans[(start, end)])
                else:
                    gold_labels.append("NO-LABEL")

        span_list_field: ListField = ListField(spans)
        fields["spans"] = span_list_field
        if gold_tree is not None:
            fields["span_labels"] = SequenceLabelField(gold_labels, span_list_field)

        return Instance(fields)
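
A toy sketch of how a gold tree becomes span labels (the real _get_gold_spans helper is not shown in this snippet; this stand-in uses nltk.Tree and inclusive token spans):

from nltk import Tree


def gold_spans_sketch(tree: Tree, offset: int = 0) -> dict:
    """Collect each subtree's (start, end) token span with its label."""
    spans = {}
    start = offset
    for child in tree:
        if isinstance(child, Tree):
            spans.update(gold_spans_sketch(child, start))
            start += len(child.leaves())
        else:
            start += 1
    spans[(offset, start - 1)] = tree.label()
    return spans


tree = Tree.fromstring("(S (NP (DT the) (NN dog)) (VP (VBD barked)))")
print(gold_spans_sketch(tree))
# {(0, 0): 'DT', (1, 1): 'NN', (0, 1): 'NP', (2, 2): 'VP', (0, 2): 'S'}
# Note the (2, 2) collision between VBD and VP: this is why the real helper
# suffixes POS-level labels with "-POS" before filtering them out.
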