Example #1
    def test_nested_list_fields_are_padded_correctly(self):
        nested_field1 = ListField(
            [LabelField(c) for c in ['a', 'b', 'c', 'd', 'e']])
        nested_field2 = ListField(
            [LabelField(c) for c in ['f', 'g', 'h', 'i', 'j', 'k']])
        list_field = ListField(
            [nested_field1.empty_field(), nested_field1, nested_field2])
        list_field.index(self.vocab)
        padding_lengths = list_field.get_padding_lengths()
        assert padding_lengths == {'num_fields': 3, 'list_num_fields': 6}
        tensor = list_field.as_tensor(padding_lengths).detach().cpu().numpy()
        numpy.testing.assert_almost_equal(
            tensor, [[-1, -1, -1, -1, -1, -1], [0, 1, 2, 3, 4, -1],
                     [5, 6, 7, 8, 9, 10]])
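
The nested keys above come from how ``ListField`` aggregates its children's padding lengths: ``num_fields`` counts the outer list, and every key reported by a child field is re-emitted with a ``list_`` prefix. A minimal sketch of the flat case, assuming the AllenNLP 0.x field API used throughout these examples:

from allennlp.data import Vocabulary
from allennlp.data.fields import LabelField, ListField

vocab = Vocabulary()
for label in ['a', 'b', 'c']:
    vocab.add_token_to_namespace(label, namespace='labels')

# A flat list of LabelFields has no nested 'list_*' keys, only the outer count.
flat_field = ListField([LabelField(c) for c in ['a', 'b', 'c']])
flat_field.index(vocab)
print(flat_field.get_padding_lengths())  # expected: {'num_fields': 3}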
Example #2
    def test_padding_handles_list_fields_with_padding_values(self):
        array1 = ArrayField(numpy.ones([2, 3]), padding_value=-1)
        array2 = ArrayField(numpy.ones([1, 5]), padding_value=-1)
        empty_array = array1.empty_field()
        list_field = ListField([array1, array2, empty_array])

        returned_tensor = list_field.as_tensor(
            list_field.get_padding_lengths()).detach().cpu().numpy()
        correct_tensor = numpy.array([[[1., 1., 1., -1., -1.],
                                       [1., 1., 1., -1., -1.]],
                                      [[1., 1., 1., 1., 1.],
                                       [-1., -1., -1., -1., -1.]],
                                      [[-1., -1., -1., -1., -1.],
                                       [-1., -1., -1., -1., -1.]]])
        numpy.testing.assert_array_equal(returned_tensor, correct_tensor)
Example #3
    def test_list_field_can_handle_empty_index_fields(self):
        list_field = ListField(
            [self.index_field, self.index_field, self.empty_index_field])
        list_field.index(self.vocab)
        tensor = list_field.as_tensor(list_field.get_padding_lengths())
        numpy.testing.assert_array_equal(tensor.detach().cpu().numpy(),
                                         numpy.array([[1], [1], [-1]]))
Example #4
    def test_list_field_can_handle_empty_text_fields(self):
        list_field = ListField(
            [self.field1, self.field2, self.empty_text_field])
        list_field.index(self.vocab)
        tensor_dict = list_field.as_tensor(list_field.get_padding_lengths())
        numpy.testing.assert_array_equal(
            tensor_dict["words"].detach().cpu().numpy(),
            numpy.array([[2, 3, 4, 5, 0], [2, 3, 4, 1, 5], [0, 0, 0, 0, 0]]))
Example #5
    def test_doubly_nested_field_works(self):
        field1 = ProductionRuleField('S -> [NP, VP]', is_global_rule=True)
        field2 = ProductionRuleField('NP -> test', is_global_rule=True)
        field3 = ProductionRuleField('VP -> eat', is_global_rule=False)
        list_field = ListField([ListField([field1, field2, field3]),
                                ListField([field1, field2])])
        list_field.index(self.vocab)
        padding_lengths = list_field.get_padding_lengths()
        tensors = list_field.as_tensor(padding_lengths)
        assert isinstance(tensors, list)
        assert len(tensors) == 2
        assert isinstance(tensors[0], list)
        assert len(tensors[0]) == 3
        assert isinstance(tensors[1], list)
        assert len(tensors[1]) == 3

        tensor_tuple = tensors[0][0]
        assert tensor_tuple[0] == 'S -> [NP, VP]'
        assert tensor_tuple[1] is True
        assert_almost_equal(tensor_tuple[2].detach().cpu().numpy(), [self.s_rule_index])

        tensor_tuple = tensors[0][1]
        assert tensor_tuple[0] == 'NP -> test'
        assert tensor_tuple[1] is True
        assert_almost_equal(tensor_tuple[2].detach().cpu().numpy(), [self.np_index])

        tensor_tuple = tensors[0][2]
        assert tensor_tuple[0] == 'VP -> eat'
        assert tensor_tuple[1] is False
        assert tensor_tuple[2] is None

        tensor_tuple = tensors[1][0]
        assert tensor_tuple[0] == 'S -> [NP, VP]'
        assert tensor_tuple[1] is True
        assert_almost_equal(tensor_tuple[2].detach().cpu().numpy(), [self.s_rule_index])

        tensor_tuple = tensors[1][1]
        assert tensor_tuple[0] == 'NP -> test'
        assert tensor_tuple[1] is True
        assert_almost_equal(tensor_tuple[2].detach().cpu().numpy(), [self.np_index])

        # This item was just padding.
        tensor_tuple = tensors[1][2]
        assert tensor_tuple[0] == ''
        assert tensor_tuple[1] is False
        assert tensor_tuple[2] is None
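
Unlike the tensor-producing fields, ``ProductionRuleField`` pads to plain Python objects, which is why Example #5 walks a nested list instead of indexing into one big tensor. A minimal sketch for a single non-global rule, hedged against minor differences across 0.x releases (the result may be a plain tuple or a namedtuple, but the first three slots are as asserted above):

from allennlp.data.fields import ProductionRuleField

field = ProductionRuleField('VP -> eat', is_global_rule=False)
# Non-global rules are never indexed against the vocabulary, so the
# rule-id slot comes back as None.
result = field.as_tensor(field.get_padding_lengths())
print(result[0], result[1], result[2])  # VP -> eat False None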
Example #6
    def test_all_fields_padded_to_max_length(self):
        list_field = ListField([self.field1, self.field2, self.field3])
        list_field.index(self.vocab)
        tensor_dict = list_field.as_tensor(list_field.get_padding_lengths())
        numpy.testing.assert_array_almost_equal(
            tensor_dict["words"][0].detach().cpu().numpy(),
            numpy.array([2, 3, 4, 5, 0]))
        numpy.testing.assert_array_almost_equal(
            tensor_dict["words"][1].detach().cpu().numpy(),
            numpy.array([2, 3, 4, 1, 5]))
        numpy.testing.assert_array_almost_equal(
            tensor_dict["words"][2].detach().cpu().numpy(),
            numpy.array([2, 3, 1, 5, 0]))
Example #7
    def test_fields_can_pad_to_greater_than_max_length(self):
        list_field = ListField([self.field1, self.field2, self.field3])
        list_field.index(self.vocab)
        padding_lengths = list_field.get_padding_lengths()
        padding_lengths["list_num_tokens"] = 7
        padding_lengths["num_fields"] = 5
        tensor_dict = list_field.as_tensor(padding_lengths)
        numpy.testing.assert_array_almost_equal(
            tensor_dict["words"][0].detach().cpu().numpy(),
            numpy.array([2, 3, 4, 5, 0, 0, 0]))
        numpy.testing.assert_array_almost_equal(
            tensor_dict["words"][1].detach().cpu().numpy(),
            numpy.array([2, 3, 4, 1, 5, 0, 0]))
        numpy.testing.assert_array_almost_equal(
            tensor_dict["words"][2].detach().cpu().numpy(),
            numpy.array([2, 3, 1, 5, 0, 0, 0]))
        numpy.testing.assert_array_almost_equal(
            tensor_dict["words"][3].detach().cpu().numpy(),
            numpy.array([0, 0, 0, 0, 0, 0, 0]))
        numpy.testing.assert_array_almost_equal(
            tensor_dict["words"][4].detach().cpu().numpy(),
            numpy.array([0, 0, 0, 0, 0, 0, 0]))
Example #8
    def test_as_tensor_can_handle_multiple_token_indexers(self):
        # pylint: disable=protected-access
        self.field1._token_indexers = self.words_and_characters_indexers
        self.field2._token_indexers = self.words_and_characters_indexers
        self.field3._token_indexers = self.words_and_characters_indexers

        list_field = ListField([self.field1, self.field2, self.field3])
        list_field.index(self.vocab)
        padding_lengths = list_field.get_padding_lengths()
        tensor_dict = list_field.as_tensor(padding_lengths)
        words = tensor_dict["words"].detach().cpu().numpy()
        characters = tensor_dict["characters"].detach().cpu().numpy()
        numpy.testing.assert_array_almost_equal(
            words,
            numpy.array([[2, 3, 4, 5, 0], [2, 3, 4, 1, 5], [2, 3, 1, 5, 0]]))

        numpy.testing.assert_array_almost_equal(
            characters[0],
            numpy.array([[5, 1, 1, 2, 0, 0, 0, 0, 0],
                         [1, 2, 0, 0, 0, 0, 0, 0, 0],
                         [1, 0, 0, 0, 0, 0, 0, 0, 0],
                         [2, 3, 4, 5, 3, 4, 6, 3, 0],
                         [0, 0, 0, 0, 0, 0, 0, 0, 0]]))

        numpy.testing.assert_array_almost_equal(
            characters[1],
            numpy.array([[5, 1, 1, 2, 0, 0, 0, 0, 0],
                         [1, 2, 0, 0, 0, 0, 0, 0, 0],
                         [1, 0, 0, 0, 0, 0, 0, 0, 0],
                         [1, 1, 1, 1, 3, 1, 3, 4, 5],
                         [2, 3, 4, 5, 3, 4, 6, 3, 0]]))

        numpy.testing.assert_array_almost_equal(
            characters[2],
            numpy.array([[5, 1, 1, 2, 0, 0, 0, 0, 0],
                         [1, 2, 0, 0, 0, 0, 0, 0, 0],
                         [1, 4, 1, 5, 1, 3, 1, 0, 0],
                         [2, 3, 4, 5, 3, 4, 6, 3, 0],
                         [0, 0, 0, 0, 0, 0, 0, 0, 0]]))
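
Example #8 swaps in ``self.words_and_characters_indexers`` so that each ``TextField`` produces both a ``words`` and a ``characters`` tensor. A minimal sketch of such an indexer dict, assuming the stock AllenNLP 0.x indexers; the dict keys here are taken from the tensor keys asserted above:

from allennlp.data.token_indexers import SingleIdTokenIndexer, TokenCharactersIndexer

# The dict keys become the keys of the tensors returned by as_tensor:
# one word-id tensor and one character-id tensor per token.
words_and_characters_indexers = {
    'words': SingleIdTokenIndexer(namespace='words'),
    'characters': TokenCharactersIndexer(namespace='characters'),
}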
Example #9
    def text_to_instance(
            self,  # type: ignore
            tokens: List[str],
            pos_tags: List[str] = None,
            gold_tree: Tree = None) -> Instance:
        """
        We take ``pre-tokenized`` input here, because we don't have a tokenizer in this class.

        Parameters
        ----------
        tokens : ``List[str]``, required.
            The tokens in a given sentence.
        pos_tags : ``List[str]``, optional (default = None).
            The POS tags for the words in the sentence.
        gold_tree : ``Tree``, optional (default = None).
            The gold parse tree to create span labels from.

        Returns
        -------
        An ``Instance`` containing the following fields:
            tokens : ``TextField``
                The tokens in the sentence.
            pos_tags : ``SequenceLabelField``
                The POS tags of the words in the sentence.
                Only returned if ``use_pos_tags`` is ``True``.
            spans : ``ListField[SpanField]``
                A ListField containing all possible subspans of the
                sentence.
            span_labels : ``SequenceLabelField``, optional
                The constituency tags for each of the possible spans, with
                respect to a gold parse tree. If a span is not contained
                within the tree, it will have a ``NO-LABEL`` label.
            gold_tree : ``MetadataField(Tree)``
                The gold NLTK parse tree for use in evaluation.
        """
        # pylint: disable=arguments-differ
        text_field = TextField([Token(x) for x in tokens],
                               token_indexers=self._token_indexers)
        fields: Dict[str, Field] = {"tokens": text_field}

        pos_namespace = self._label_namespace_prefix + self._pos_label_namespace
        if self._use_pos_tags and pos_tags is not None:
            pos_tag_field = SequenceLabelField(pos_tags,
                                               text_field,
                                               label_namespace=pos_namespace)
            fields["pos_tags"] = pos_tag_field
        elif self._use_pos_tags:
            raise ConfigurationError(
                "use_pos_tags was set to True but no gold pos"
                " tags were passed to the dataset reader.")
        spans: List[Field] = []
        gold_labels = []

        if gold_tree is not None:
            gold_spans: Dict[Tuple[int, int], str] = {}
            self._get_gold_spans(gold_tree, 0, gold_spans)
        else:
            gold_spans = None
        for start, end in enumerate_spans(tokens):
            spans.append(SpanField(start, end, text_field))

            if gold_spans is not None:
                if (start, end) in gold_spans:
                    gold_labels.append(gold_spans[(start, end)])
                else:
                    gold_labels.append("NO-LABEL")

        metadata = {"tokens": tokens}
        if gold_tree:
            metadata["gold_tree"] = gold_tree
        if self._use_pos_tags:
            metadata["pos_tags"] = pos_tags

        fields["metadata"] = MetadataField(metadata)

        span_list_field: ListField = ListField(spans)
        fields["spans"] = span_list_field
        if gold_tree is not None:
            fields["span_labels"] = SequenceLabelField(
                gold_labels,
                span_list_field,
                label_namespace=self._label_namespace_prefix + "labels")
        return Instance(fields)
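
Example #9 leans on ``enumerate_spans`` to generate every candidate constituent before labeling them against the gold tree. A small sketch of what it yields, assuming the 0.x helper from ``allennlp.data.dataset_readers.dataset_utils``, which returns inclusive ``(start, end)`` token indices:

from allennlp.data.dataset_readers.dataset_utils import enumerate_spans

tokens = ['The', 'dog', 'barked']
# All inclusive (start, end) index pairs; max_span_width can cap the length.
print(enumerate_spans(tokens))
# expected: [(0, 0), (0, 1), (0, 2), (1, 1), (1, 2), (2, 2)]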
Example #10
    def text_to_instance(self,  # type: ignore
                         sentences: List[List[str]],
                         gold_clusters: Optional[List[List[Tuple[int, int]]]] = None) -> Instance:
        # pylint: disable=arguments-differ
        """
        Parameters
        ----------
        sentences : ``List[List[str]]``, required.
            A list of lists representing the tokenised words and sentences in the document.
        gold_clusters : ``Optional[List[List[Tuple[int, int]]]]``, optional (default = None)
            A list of all clusters in the document, represented as word spans. Each cluster
            contains some number of spans, which can be nested and overlap, but will never
            exactly match between clusters.

        Returns
        -------
        An ``Instance`` containing the following ``Fields``:
            text : ``TextField``
                The text of the full document.
            spans : ``ListField[SpanField]``
                A ListField containing the spans represented as ``SpanFields``
                with respect to the document text.
            span_labels : ``SequenceLabelField``, optional
                The id of the cluster which each possible span belongs to, or -1 if it does
                not belong to a cluster. As these labels have variable length (it depends on
                how many spans we are considering), we represent this as a
                ``SequenceLabelField`` with respect to the ``spans`` ``ListField``.
        """
        flattened_sentences = [self._normalize_word(word)
                               for sentence in sentences
                               for word in sentence]

        metadata: Dict[str, Any] = {"original_text": flattened_sentences}
        if gold_clusters is not None:
            metadata["clusters"] = gold_clusters

        text_field = TextField([Token(word) for word in flattened_sentences], self._token_indexers)

        cluster_dict = {}
        if gold_clusters is not None:
            for cluster_id, cluster in enumerate(gold_clusters):
                for mention in cluster:
                    cluster_dict[tuple(mention)] = cluster_id

        spans: List[Field] = []
        span_labels: Optional[List[int]] = [] if gold_clusters is not None else None

        sentence_offset = 0
        for sentence in sentences:
            for start, end in enumerate_spans(sentence,
                                              offset=sentence_offset,
                                              max_span_width=self._max_span_width):
                if span_labels is not None:
                    if (start, end) in cluster_dict:
                        span_labels.append(cluster_dict[(start, end)])
                    else:
                        span_labels.append(-1)

                spans.append(SpanField(start, end, text_field))
            sentence_offset += len(sentence)

        span_field = ListField(spans)
        metadata_field = MetadataField(metadata)

        fields: Dict[str, Field] = {"text": text_field,
                                    "spans": span_field,
                                    "metadata": metadata_field}
        if span_labels is not None:
            fields["span_labels"] = SequenceLabelField(span_labels, span_field)

        return Instance(fields)
Example #11
    def _json_blob_to_instance(self, json_obj: JsonDict) -> Instance:
        question_tokens = self._read_tokens_from_json_list(
            json_obj['question_tokens'])
        question_field = TextField(question_tokens,
                                   self._question_token_indexers)
        question_metadata = MetadataField(
            {"question_tokens": [x.text for x in question_tokens]})
        table_knowledge_graph = TableQuestionKnowledgeGraph.read_from_lines(
            json_obj['table_lines'], question_tokens)
        entity_tokens = [
            self._read_tokens_from_json_list(token_list)
            for token_list in json_obj['entity_texts']
        ]
        table_field = KnowledgeGraphField(
            table_knowledge_graph,
            question_tokens,
            tokenizer=None,
            token_indexers=self._table_token_indexers,
            entity_tokens=entity_tokens,
            linking_features=json_obj['linking_features'],
            include_in_vocab=self._use_table_for_vocab,
            max_table_tokens=self._max_table_tokens)
        world = WikiTablesWorld(table_knowledge_graph)
        world_field = MetadataField(world)

        production_rule_fields: List[Field] = []
        for production_rule in world.all_possible_actions():
            _, rule_right_side = production_rule.split(' -> ')
            is_global_rule = not world.is_table_entity(rule_right_side)
            field = ProductionRuleField(production_rule, is_global_rule)
            production_rule_fields.append(field)
        action_field = ListField(production_rule_fields)

        example_string_field = MetadataField(json_obj['example_lisp_string'])

        fields = {
            'question': question_field,
            'metadata': question_metadata,
            'table': table_field,
            'world': world_field,
            'actions': action_field,
            'example_lisp_string': example_string_field
        }

        if 'target_action_sequences' in json_obj or 'agenda' in json_obj:
            action_map = {
                action.rule: i
                for i, action in enumerate(action_field.field_list)
            }  # type: ignore
        if 'target_action_sequences' in json_obj:
            action_sequence_fields: List[Field] = []
            for sequence in json_obj['target_action_sequences']:
                index_fields: List[Field] = []
                for production_rule in sequence:
                    index_fields.append(
                        IndexField(action_map[production_rule], action_field))
                action_sequence_fields.append(ListField(index_fields))
            fields['target_action_sequences'] = ListField(
                action_sequence_fields)
        if 'agenda' in json_obj:
            agenda_index_fields: List[Field] = []
            for agenda_action in json_obj['agenda']:
                agenda_index_fields.append(
                    IndexField(action_map[agenda_action], action_field))
            fields['agenda'] = ListField(agenda_index_fields)
        return Instance(fields)
Example #12
    def text_to_instance(
            self,  # type: ignore
            question: str,
            table_lines: List[str],
            example_lisp_string: str = None,
            dpd_output: List[str] = None,
            tokenized_question: List[Token] = None) -> Instance:
        """
        Reads text inputs and makes an instance. WikitableQuestions dataset provides tables as TSV
        files, which we use for training.

        Parameters
        ----------
        question : ``str``
            Input question
        table_lines : ``List[str]``
            The table content itself, as a list of rows. See
            ``TableQuestionKnowledgeGraph.read_from_lines`` for the expected format.
        example_lisp_string : ``str``, optional
            The original (lisp-formatted) example string in the WikiTableQuestions dataset.  This
            comes directly from the ``.examples`` file provided with the dataset.  We pass this to
            SEMPRE for evaluating logical forms during training.  It isn't otherwise used for
            anything.
        dpd_output : ``List[str]``, optional
            List of logical forms, produced by dynamic programming on denotations. Not required
            during test.
        tokenized_question : ``List[Token]``, optional
            If you have already tokenized the question, you can pass that in here, so we don't
            duplicate that work.  You might, for example, do batch processing on the questions in
            the whole dataset, then pass the result in here.
        """
        # pylint: disable=arguments-differ
        tokenized_question = tokenized_question or self._tokenizer.tokenize(
            question.lower())
        question_field = TextField(tokenized_question,
                                   self._question_token_indexers)
        metadata: Dict[str, Any] = {
            "question_tokens": [x.text for x in tokenized_question]
        }
        metadata["original_table"] = "".join(table_lines)
        table_knowledge_graph = TableQuestionKnowledgeGraph.read_from_lines(
            table_lines, tokenized_question)
        table_metadata = MetadataField(table_lines)
        table_field = KnowledgeGraphField(
            table_knowledge_graph,
            tokenized_question,
            self._table_token_indexers,
            tokenizer=self._tokenizer,
            feature_extractors=self._linking_feature_extractors,
            include_in_vocab=self._use_table_for_vocab,
            max_table_tokens=self._max_table_tokens)
        world = WikiTablesWorld(table_knowledge_graph)
        world_field = MetadataField(world)

        production_rule_fields: List[Field] = []
        for production_rule in world.all_possible_actions():
            _, rule_right_side = production_rule.split(' -> ')
            is_global_rule = not world.is_table_entity(rule_right_side)
            field = ProductionRuleField(production_rule, is_global_rule)
            production_rule_fields.append(field)
        action_field = ListField(production_rule_fields)

        fields = {
            'question': question_field,
            'metadata': MetadataField(metadata),
            'table': table_field,
            'world': world_field,
            'actions': action_field
        }
        if self._include_table_metadata:
            fields['table_metadata'] = table_metadata
        if example_lisp_string:
            fields['example_lisp_string'] = MetadataField(example_lisp_string)

        # We'll make each target action sequence a List[IndexField], where the index is into
        # the action list we made above.  We need to ignore the type here because mypy doesn't
        # like `action.rule` - it's hard to tell mypy that the ListField is made up of
        # ProductionRuleFields.
        action_map = {
            action.rule: i
            for i, action in enumerate(action_field.field_list)
        }  # type: ignore
        if dpd_output:
            action_sequence_fields: List[Field] = []
            for logical_form in dpd_output:
                if not self._should_keep_logical_form(logical_form):
                    logger.debug(f'Question was: {question}')
                    logger.debug(f'Table info was: {table_lines}')
                    continue
                try:
                    expression = world.parse_logical_form(logical_form)
                except ParsingError as error:
                    logger.debug(
                        f'Parsing error: {error.message}, skipping logical form'
                    )
                    logger.debug(f'Question was: {question}')
                    logger.debug(f'Logical form was: {logical_form}')
                    logger.debug(f'Table info was: {table_lines}')
                    continue
                except:
                    logger.error(logical_form)
                    raise
                action_sequence = world.get_action_sequence(expression)
                try:
                    index_fields: List[Field] = []
                    for production_rule in action_sequence:
                        index_fields.append(
                            IndexField(action_map[production_rule],
                                       action_field))
                    action_sequence_fields.append(ListField(index_fields))
                except KeyError as error:
                    logger.debug(
                        f'Missing production rule: {error.args}, skipping logical form'
                    )
                    logger.debug(f'Question was: {question}')
                    logger.debug(f'Table info was: {table_lines}')
                    logger.debug(f'Logical form was: {logical_form}')
                    continue
                if len(action_sequence_fields) >= self._max_dpd_logical_forms:
                    break

            if not action_sequence_fields:
                # This is not great, but we're only doing it when we're passed logical form
                # supervision, so we're expecting labeled logical forms, but we can't actually
                # produce the logical forms.  We should skip this instance.  Note that this affects
                # _dev_ and _test_ instances, too, so your metrics could be over-estimates on the
                # full test data.
                return None
            fields['target_action_sequences'] = ListField(
                action_sequence_fields)
        if self._output_agendas:
            agenda_index_fields: List[Field] = []
            for agenda_string in world.get_agenda():
                agenda_index_fields.append(
                    IndexField(action_map[agenda_string], action_field))
            if not agenda_index_fields:
                agenda_index_fields = [IndexField(-1, action_field)]
            fields['agenda'] = ListField(agenda_index_fields)
        return Instance(fields)
Example #13
def make_reading_comprehension_instance_quac(
        question_list_tokens: List[List[Token]],
        passage_tokens: List[Token],
        token_indexers: Dict[str, TokenIndexer],
        passage_text: str,
        token_span_lists: List[List[Tuple[int, int]]] = None,
        yesno_list: List[int] = None,
        followup_list: List[int] = None,
        additional_metadata: Dict[str, Any] = None,
        num_context_answers: int = 0) -> Instance:
    """
    Converts a question, a passage, and an optional answer (or answers) to an ``Instance`` for use
    in a reading comprehension model.

    Creates an ``Instance`` with at least these fields: ``question``, a ``ListField`` of
    ``TextFields``; ``passage``, a ``TextField``; and ``metadata``, a ``MetadataField``.
    Additionally, if ``token_span_lists`` is given, the ``Instance`` has ``span_start`` and
    ``span_end`` fields, which are both ``ListFields`` of ``IndexFields``.

    Parameters
    ----------
    question_list_tokens : ``List[List[Token]]``
        An already-tokenized list of questions. Each dialog has multiple questions.
    passage_tokens : ``List[Token]``
        An already-tokenized passage that contains the answer to the given question.
    token_indexers : ``Dict[str, TokenIndexer]``
        Determines how the question and passage ``TextFields`` will be converted into tensors that
        get input to a model.  See :class:`TokenIndexer`.
    passage_text : ``str``
        The original passage text.  We need this so that we can recover the actual span from the
        original passage that the model predicts as the answer to the question.  This is used in
        official evaluation scripts.
    token_span_lists : ``List[List[Tuple[int, int]]]``, optional
        Indices into ``passage_tokens`` to use as the answer to the question for training.  This is
        a list of lists, first because there are multiple questions per dialog, and second
        because there might be several possible correct answer spans in the passage.
        Currently, we just select the last span in this list (i.e., QuAC has multiple
        annotations on the dev set; this will select the last span, which was given by the
        original annotator).
    yesno_list : ``List[int]``
        List of the affirmation bit for each question-answer pair.
    followup_list : ``List[int]``
        List of the continuation bit for each question-answer pair.
    num_context_answers : ``int``, optional
        How many answers to encode into the passage.
    additional_metadata : ``Dict[str, Any]``, optional
        The constructed ``metadata`` field will by default contain ``original_passage``,
        ``token_offsets``, ``question_tokens``, ``passage_tokens``, and ``answer_texts`` keys.  If
        you want any other metadata to be associated with each instance, you can pass that in here.
        This dictionary will get added to the ``metadata`` dictionary we already construct.
    """
    additional_metadata = additional_metadata or {}
    fields: Dict[str, Field] = {}
    passage_offsets = [(token.idx, token.idx + len(token.text))
                       for token in passage_tokens]
    # This is separate so we can reference it later with a known type.
    passage_field = TextField(passage_tokens, token_indexers)
    fields['passage'] = passage_field
    fields['question'] = ListField([
        TextField(q_tokens, token_indexers)
        for q_tokens in question_list_tokens
    ])
    metadata = {'original_passage': passage_text,
                'token_offsets': passage_offsets,
                'question_tokens': [[token.text for token in question_tokens]
                                    for question_tokens in question_list_tokens],
                'passage_tokens': [token.text for token in passage_tokens]}
    p1_answer_marker_list: List[Field] = []
    p2_answer_marker_list: List[Field] = []
    p3_answer_marker_list: List[Field] = []

    def get_tag(i, i_name):
        # Generate a tag to mark previous answer span in the passage.
        return "<{0:d}_{1:s}>".format(i, i_name)

    def mark_tag(span_start, span_end, passage_tags, prev_answer_distance):
        # Raise explicitly instead of asserting, so the check survives python -O.
        if span_start <= 0 or span_end <= 0:
            raise ValueError(
                "Previous {0:d}th answer span should have been updated!".
                format(prev_answer_distance))
        # Modify "tags" to mark previous answer span.
        if span_start == span_end:
            passage_tags[prev_answer_distance][span_start] = get_tag(
                prev_answer_distance, "")
        else:
            passage_tags[prev_answer_distance][span_start] = get_tag(
                prev_answer_distance, "start")
            passage_tags[prev_answer_distance][span_end] = get_tag(
                prev_answer_distance, "end")
            for passage_index in range(span_start + 1, span_end):
                passage_tags[prev_answer_distance][passage_index] = get_tag(
                    prev_answer_distance, "in")

    if token_span_lists:
        span_start_list: List[Field] = []
        span_end_list: List[Field] = []
        p1_span_start, p1_span_end, p2_span_start = -1, -1, -1
        p2_span_end, p3_span_start, p3_span_end = -1, -1, -1
        # Loop over the answers for each question.
        for question_index, answer_span_lists in enumerate(token_span_lists):
            # The last span in the list is the original answer given by the annotator.
            span_start, span_end = answer_span_lists[-1]
            span_start_list.append(IndexField(span_start, passage_field))
            span_end_list.append(IndexField(span_end, passage_field))
            prev_answer_marker_lists = [["O"] * len(passage_tokens),
                                        ["O"] * len(passage_tokens),
                                        ["O"] * len(passage_tokens),
                                        ["O"] * len(passage_tokens)]
            if question_index > 0 and num_context_answers > 0:
                mark_tag(p1_span_start, p1_span_end, prev_answer_marker_lists,
                         1)
                if question_index > 1 and num_context_answers > 1:
                    mark_tag(p2_span_start, p2_span_end,
                             prev_answer_marker_lists, 2)
                    if question_index > 2 and num_context_answers > 2:
                        mark_tag(p3_span_start, p3_span_end,
                                 prev_answer_marker_lists, 3)
                    p3_span_start = p2_span_start
                    p3_span_end = p2_span_end
                p2_span_start = p1_span_start
                p2_span_end = p1_span_end
            p1_span_start = span_start
            p1_span_end = span_end
            if num_context_answers > 2:
                p3_answer_marker_list.append(
                    SequenceLabelField(prev_answer_marker_lists[3],
                                       passage_field,
                                       label_namespace="answer_tags"))
            if num_context_answers > 1:
                p2_answer_marker_list.append(
                    SequenceLabelField(prev_answer_marker_lists[2],
                                       passage_field,
                                       label_namespace="answer_tags"))
            if num_context_answers > 0:
                p1_answer_marker_list.append(
                    SequenceLabelField(prev_answer_marker_lists[1],
                                       passage_field,
                                       label_namespace="answer_tags"))
        fields['span_start'] = ListField(span_start_list)
        fields['span_end'] = ListField(span_end_list)
        if num_context_answers > 0:
            fields['p1_answer_marker'] = ListField(p1_answer_marker_list)
            if num_context_answers > 1:
                fields['p2_answer_marker'] = ListField(p2_answer_marker_list)
                if num_context_answers > 2:
                    fields['p3_answer_marker'] = ListField(
                        p3_answer_marker_list)
        fields['yesno_list'] = ListField(
            [LabelField(yesno, label_namespace="yesno_labels") for yesno in yesno_list])
        fields['followup_list'] = ListField(
            [LabelField(followup, label_namespace="followup_labels") for followup in followup_list])
    metadata.update(additional_metadata)
    fields['metadata'] = MetadataField(metadata)
    return Instance(fields)
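
The marker tags built by ``get_tag`` in Example #13 encode both how far back in the dialog the previous answer occurred and the token's position within that span. A quick sketch using the same format string (the distances and position names below are illustrative):

def get_tag(i, i_name):
    # Same format string as in the example above.
    return "<{0:d}_{1:s}>".format(i, i_name)

print(get_tag(1, ""))       # '<1_>'   single-token previous answer
print(get_tag(1, "start"))  # '<1_start>'
print(get_tag(2, "in"))     # '<2_in>'
print(get_tag(3, "end"))    # '<3_end>'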
Example #14
    def text_to_instance(
            self,  # type: ignore
            sentence: str,
            structured_representations: List[List[List[JsonDict]]],
            labels: List[str] = None,
            target_sequences: List[List[str]] = None,
            identifier: str = None) -> Instance:
        """
        Parameters
        ----------
        sentence : ``str``
            The query sentence.
        structured_representations : ``List[List[List[JsonDict]]]``
            A list of JSON representations of all the worlds. See the expected format in this
            class's docstring.
        labels : ``List[str]`` (optional)
            List of string representations of the labels (true or false) corresponding to the
            ``structured_representations``. Not required while testing.
        target_sequences : ``List[List[str]]`` (optional)
            List of target action sequences, one for each element, each of which leads to the
            correct denotation in the world corresponding to its structured representation.
        identifier : ``str`` (optional)
            The identifier from the dataset if available.
        """
        # pylint: disable=arguments-differ
        worlds = [NlvrWorld(data) for data in structured_representations]
        tokenized_sentence = self._tokenizer.tokenize(sentence)
        sentence_field = TextField(tokenized_sentence,
                                   self._sentence_token_indexers)
        production_rule_fields: List[Field] = []
        instance_action_ids: Dict[str, int] = {}
        # TODO(pradeep): Assuming that possible actions are the same in all worlds. This may change
        # later.
        for production_rule in worlds[0].all_possible_actions():
            instance_action_ids[production_rule] = len(instance_action_ids)
            field = ProductionRuleField(production_rule, is_global_rule=True)
            production_rule_fields.append(field)
        action_field = ListField(production_rule_fields)
        worlds_field = ListField([MetadataField(world) for world in worlds])
        fields: Dict[str, Field] = {
            "sentence": sentence_field,
            "worlds": worlds_field,
            "actions": action_field
        }
        if identifier is not None:
            fields["identifier"] = MetadataField(identifier)
        # Depending on the type of supervision used for training the parser, we may want either
        # target action sequences or an agenda in our instance. We check if target sequences are
        # provided, and include them if they are. If not, we'll get an agenda for the sentence, and
        # include that in the instance.
        if target_sequences:
            action_sequence_fields: List[Field] = []
            for target_sequence in target_sequences:
                index_fields = ListField([
                    IndexField(instance_action_ids[action], action_field)
                    for action in target_sequence
                ])
                action_sequence_fields.append(index_fields)
                # TODO(pradeep): Define a max length for this field.
            fields["target_action_sequences"] = ListField(
                action_sequence_fields)
        elif self._output_agendas:
            # TODO(pradeep): Assuming every world gives the same agenda for a sentence. This is true
            # now, but may change later too.
            agenda = worlds[0].get_agenda_for_sentence(
                sentence, add_paths_to_agenda=False)
            assert agenda, "No agenda found for sentence: %s" % sentence
            # agenda_field contains indices into actions.
            agenda_field = ListField([
                IndexField(instance_action_ids[action], action_field)
                for action in agenda
            ])
            fields["agenda"] = agenda_field
        if labels:
            labels_field = ListField([
                LabelField(label, label_namespace='denotations')
                for label in labels
            ])
            fields["labels"] = labels_field

        return Instance(fields)
Example #15
    def text_to_instance(self,  # type: ignore
                         utterances: List[str],
                         sql_query: str = None) -> Instance:
        # pylint: disable=arguments-differ
        """
        Parameters
        ----------
        utterances: ``List[str]``, required.
            List of utterances in the interaction, the last element is the current utterance.
        sql_query: ``str``, optional
            The SQL query, given as label during training or validation.
        """
        utterance = utterances[-1]
        action_sequence: List[str] = []

        if not utterance:
            return None

        world = AtisWorld(utterances)

        if sql_query:
            try:
                action_sequence = world.get_action_sequence(sql_query)
            except ParseError:
                logger.debug('Parsing error')

        tokenized_utterance = self._tokenizer.tokenize(utterance.lower())
        utterance_field = TextField(tokenized_utterance, self._token_indexers)

        production_rule_fields: List[Field] = []

        for production_rule in world.all_possible_actions():
            lhs, _ = production_rule.split(' ->')
            is_global_rule = lhs not in ['number', 'string']
            # The whitespaces are not semantically meaningful, so we filter them out.
            production_rule = ' '.join([token for token in production_rule.split(' ') if token != 'ws'])
            field = ProductionRuleField(production_rule, is_global_rule)
            production_rule_fields.append(field)

        action_field = ListField(production_rule_fields)
        action_map = {action.rule: i # type: ignore
                      for i, action in enumerate(action_field.field_list)}
        index_fields: List[Field] = []
        world_field = MetadataField(world)
        fields = {'utterance' : utterance_field,
                  'actions' : action_field,
                  'world' : world_field,
                  'linking_scores' : ArrayField(world.linking_scores)}

        if sql_query:
            if action_sequence:
                for production_rule in action_sequence:
                    index_fields.append(IndexField(action_map[production_rule], action_field))

                action_sequence_field: List[Field] = []
                action_sequence_field.append(ListField(index_fields))
                fields['target_action_sequence'] = ListField(action_sequence_field)
            else:
                # If we are given a SQL query, but we are unable to parse it, then we will skip it.
                return None

        return Instance(fields)
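
Example #15 decides whether a rule is global by looking only at its left-hand side: rules headed by ``number`` or ``string`` are treated as instance-specific. A tiny sketch of that string handling (the rule string below is hypothetical):

production_rule = 'string -> ["flight"]'
lhs, _ = production_rule.split(' ->')
is_global_rule = lhs not in ['number', 'string']
print(lhs, is_global_rule)  # string False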
Example #16
    def test_get_padding_lengths(self):
        list_field = ListField([self.field1, self.field2, self.field3])
        list_field.index(self.vocab)
        lengths = list_field.get_padding_lengths()
        assert lengths == {"num_fields": 3, "list_num_tokens": 5}
Example #17
    def test_printing_doesnt_crash(self):
        list_field = ListField([self.field1, self.field2])
        print(list_field)