def test_empty_list_can_be_tensorized(self):
     tokenizer = SpacyTokenizer()
     tokens = tokenizer.tokenize("Foo")
     text_field = TextField(tokens, self.word_indexer)
     list_field = ListField([text_field.empty_field()])
     fields = {
         "list": list_field,
         "bar": TextField(tokenizer.tokenize("BAR"), self.word_indexer),
     }
     instance = Instance(fields)
     instance.index_fields(self.vocab)
     instance.as_tensor_dict()
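All of the snippets on this page build AllenNLP Instances out of ListFields. As a rough sketch, they assume imports along these lines (module paths differ slightly across AllenNLP releases, and names such as LabelsField, SequentialArrayField, and get_token_lookup_pointers are project-specific helpers from the individual repositories, not part of AllenNLP itself):

from typing import Any, Dict, List, Optional, Tuple, Union

import numpy as np

from allennlp.common.util import START_SYMBOL, END_SYMBOL
from allennlp.data import Instance, Token, Vocabulary
from allennlp.data.fields import (ArrayField, Field, IndexField, LabelField, ListField,
                                  MetadataField, MultiLabelField, SequenceLabelField,
                                  SpanField, TextField)
from allennlp.data.token_indexers import SingleIdTokenIndexer, TokenIndexer
from allennlp.data.tokenizers import SpacyTokenizer  # WordTokenizer in older releases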
Example #2
    def test_all_fields_padded_to_max_length(self):
        field1 = TextField(["this", "is", "a", "sentence"], self.word_indexer)
        field2 = TextField(["this", "is", "a", "different", "sentence"], self.word_indexer)
        field3 = TextField(["this", "is", "another", "sentence"], self.word_indexer)

        list_field = ListField([field1, field2, field3])
        list_field.index(self.vocab)

        array_dict = list_field.as_array(list_field.get_padding_lengths())
        numpy.testing.assert_array_almost_equal(array_dict["words"][0], numpy.array([2, 3, 4, 5, 0]))
        numpy.testing.assert_array_almost_equal(array_dict["words"][1], numpy.array([2, 3, 4, 1, 5]))
        numpy.testing.assert_array_almost_equal(array_dict["words"][2], numpy.array([2, 3, 1, 5, 0]))
Example #3
    def text_to_instance(
            self,  # type: ignore
            qid: str,
            question: str,
            choices: List[str],
            choice_evidences: List[Union[str, List[str]]] = None,
            answer: str = None) -> Instance:
        # pylint: disable=arguments-differ
        fields: Dict[str, Field] = {}
        question_tokens = self._tokenizer.tokenize(question)
        choice_tokens = self._tokenizer.batch_tokenize(choices)

        qa_pair_tokens = []
        for i, c_tokens in enumerate(choice_tokens):
            qa_pair = question_tokens + [Token("[SEP]")] + c_tokens
            evidence_tokens = []
            if choice_evidences and choice_evidences[i]:
                choice_evidence_sents = [
                    evi for evi, _ in choice_evidences[i][:self.num_evidences]
                ]
                evidence_tokens = self._tokenizer.batch_tokenize(
                    choice_evidence_sents)
                evidence_tokens_flat = [
                    t for evi in evidence_tokens for t in evi
                ]
            else:
                evidence_tokens_flat = []
            if evidence_tokens_flat:
                qa_pair += [Token("[SEP]")] + evidence_tokens_flat
            qa_pair_tokens.append(qa_pair)

        qa_pairs_field = ListField([
            TextField(tokens, self._token_indexers)
            for tokens in qa_pair_tokens
        ])
        if answer:
            fields['answer_index'] = IndexField(self.LABELS.index(answer),
                                                qa_pairs_field)
        fields['qa_pairs'] = qa_pairs_field

        metadata = {
            "qid": qid,
            "question": question,
            "choices": choices,
            "question_tokens": [x.text for x in question_tokens],
            "choices_tokens": [[x.text for x in tokens] for tokens in choice_tokens],
        }
        fields["metadata"] = MetadataField(metadata)
        return Instance(fields)
    def text_to_instance(
            self,
            context_parse: Dict[str, Any],
            question_parse: Dict[str, Any],
            span_start: int,
            span_end: int,
            metadata: Dict[str, Any] = None) -> Instance:  # type: ignore
        # pylint: disable=arguments-differ
        fields = {}

        # Create the instance fields
        if metadata is not None:
            fields["metadata"] = MetadataField(metadata)

        # context
        tokenized_context = self._context_tokenizer.tokenize(context_parse)
        if self._source_add_start_token:
            tokenized_context.insert(0, Token(START_SYMBOL))
        tokenized_context.append(Token(END_SYMBOL))

        if self._return_context_tokens_pointers:
            lowercase_tokens = False
            if "tokens" in self._context_token_indexers:
                lowercase_tokens = self._context_token_indexers[
                    "tokens"].lowercase_tokens

            context_tokens_text = [x.text for x in tokenized_context]
            _, unique_tokens_pointers, unique_tokens_list_lens = get_token_lookup_pointers(
                context_tokens_text, lowercase_tokens)

            context_tokens_pointers = ListField([
                ArrayField(np.asarray(x, dtype=np.int32), padding_value=-1)
                for x in unique_tokens_pointers
            ])
            fields["context_tokens_pointers"] = context_tokens_pointers

        context_field = TextField(tokenized_context,
                                  self._context_token_indexers)
        fields["passage"] = context_field

        # question
        tokenized_question = self._question_tokenizer.tokenize(question_parse)
        if self._source_add_start_token:
            tokenized_question.insert(0, Token(START_SYMBOL))
        tokenized_question.append(Token(END_SYMBOL))
        question_field = TextField(tokenized_question,
                                   self._question_token_indexers)
        fields["question"] = question_field

        fields['span_start'] = IndexField(span_start, context_field)
        fields['span_end'] = IndexField(span_end, context_field)

        return Instance(fields)
Example #5
    def test_get_padding_lengths(self):
        field1 = TextField(["this", "is", "a", "sentence"], self.word_indexer)
        field2 = TextField(["this", "is", "a", "different", "sentence"],
                           self.word_indexer)
        field3 = TextField(["this", "is", "another", "sentence"],
                           self.word_indexer)

        list_field = ListField([field1, field2, field3])
        list_field.index(self.vocab)
        lengths = list_field.get_padding_lengths()

        assert lengths == {"num_fields": 3, "num_tokens": 5}
    def text_to_instance(self,
                         sents: List[str],
                         labels: List[str] = None) -> Instance:
        fields: Dict[str, Field] = {}
        tokenized_sents = [self._tokenizer.tokenize(sent) for sent in sents]
        sentence_sequence = ListField(
            [TextField(tk, self._token_indexers) for tk in tokenized_sents])
        fields['sentences'] = sentence_sequence

        if labels is not None:
            fields['labels'] = SequenceLabelField(labels, sentence_sequence)
        return Instance(fields)
    def _split(self, instance: Instance) -> Tuple[List[Instance], int]:
        # Determine the size of the sequence inside the instance.
        true_length = len(instance['source'])
        padded_length = self._split_size * (true_length // self._split_size)

        # Determine the split indices.
        split_indices = list(range(0, true_length, self._split_size))
        if true_length > split_indices[-1]:
            split_indices.append(true_length)

        # Determine which fields are not going to be split
        constant_fields = [
            x for x in instance.fields if x not in self._splitting_keys
        ]

        # Create the list of chunks
        chunks: List[Instance] = []

        for i, (start,
                end) in enumerate(zip(split_indices[:-1], split_indices[1:])):

            # Copy all of the constant fields from the instance to the chunk.
            chunk_fields = {key: instance[key] for key in constant_fields}

            # Determine whether or not to signal model to reset.
            if i == 0:
                reset = SequentialArrayField(np.array(1), dtype=np.uint8)
            else:
                reset = SequentialArrayField(np.array(0), dtype=np.uint8)
            chunk_fields['reset'] = reset

            # Obtain splits derived from sequence fields.
            for key in self._splitting_keys:
                source_field = instance[key]
                # pylint: disable=protected-access
                if isinstance(source_field, TextField):
                    split_field = TextField(source_field.tokens[start:end],
                                            source_field._token_indexers)
                elif isinstance(source_field, SequentialArrayField):
                    # TODO: Figure out how to use sequence dim here...
                    split_field = SequentialArrayField(
                        source_field.array[start:end],
                        dtype=source_field._dtype)
                elif isinstance(source_field, ListField):
                    split_field = ListField(source_field.field_list[start:end])
                else:
                    raise NotImplementedError(
                        'FancyIterator currently only supports splitting '
                        '`TextField`s, `SequentialArrayField`s, or `ListField`s.')
                chunk_fields[key] = split_field
            chunks.append(Instance(chunk_fields))

        return chunks, padded_length
Example #8
    def text_to_instance(self, doc, doc_id, rating) -> Instance:
        fields = {}

        fields['rating'] = LabelField(rating)
        fields['doc'] = ListField([TextField(sent, self._indexer) for sent in doc])

        nsents = len(doc)
        ntokens = sum(len(i) for i in doc)
        fields['meta'] = MetadataField({'doc_id': doc_id,
                                        'sentences': nsents,
                                        'tokens': ntokens})
        return Instance(fields)
Example #9
    def text_to_instance(self, data: dict, relation_type: int = None) -> Instance:  # type: ignore
        # pylint: disable=arguments-differ
        N_relations = []
        all_tokens_sentences = []
        for i, K_examples in enumerate(data[self.TRAIN_DATA]):
            tokenized_sentences = []
            clean_text_for_debug = []
            for relation in K_examples:
                head_tail = self.create_head_tail_sentence(relation)
                tokenized_tokens = self._tokenizer.tokenize(head_tail)

                field_of_tokens = TextField(tokenized_tokens, self._token_indexers)
                clean_text_for_debug.append(MetadataField(tokenized_tokens))

                tokenized_sentences.append(field_of_tokens)
            assert len(tokenized_sentences) == len(clean_text_for_debug)

            clean_text_for_debug = ListField(clean_text_for_debug)
            tokenized_sentences = ListField(tokenized_sentences)

            all_tokens_sentences.append(clean_text_for_debug)
            N_relations.append(tokenized_sentences)

        assert len(N_relations) == len(all_tokens_sentences)
        N_relations = ListField(N_relations)
        all_tokens_sentences = ListField(all_tokens_sentences)
        fields = {'sentences': N_relations, "clean_tokens": all_tokens_sentences}

        test_dict = data[self.TEST_DATA]
        head_tail = self.create_head_tail_sentence(test_dict)
        tokenized_tokens = self._tokenizer.tokenize(head_tail)
        test_clean_text_for_debug = MetadataField(tokenized_tokens)
        field_of_tokens = TextField(tokenized_tokens, self._token_indexers)

        fields['test'] = field_of_tokens
        fields['test_clean_text'] = test_clean_text_for_debug

        if relation_type is not None:
            fields['label'] = IndexField(relation_type, N_relations)
        return Instance(fields)
    def get_answer_fields(
            self, **kwargs: Dict[str, Any]) -> Tuple[Dict[str, Field], bool]:
        number_occurrences_in_passage: List[Dict[
            str, Any]] = kwargs['number_occurrences_in_passage']
        answer_texts: List[str] = kwargs['answer_texts']

        fields: Dict[str, Field] = {}

        target_numbers = get_target_numbers(answer_texts)

        # Get possible ways to arrive at target numbers with add/sub
        valid_expressions: List[List[int]] = \
            self._find_valid_add_sub_expressions_with_rounding(
                self._special_numbers + [number_occurrence['value'] for number_occurrence in number_occurrences_in_passage],
                target_numbers,
                self._max_numbers_expression)

        if len(valid_expressions) > 0:
            has_answer = True

            add_sub_signs_field: List[Field] = []
            special_signs_field: List[Field] = []

            for signs_for_one_add_sub_expressions in valid_expressions:
                special_signs = signs_for_one_add_sub_expressions[:len(
                    self._special_numbers)]
                normal_signs = signs_for_one_add_sub_expressions[
                    len(self._special_numbers):]
                add_sub_signs_field.append(LabelsField(normal_signs))
                special_signs_field.append(LabelsField(special_signs))

            fields['answer_as_expressions'] = ListField(add_sub_signs_field)
            if self._special_numbers:
                fields['answer_as_expressions_extra'] = ListField(
                    special_signs_field)
        else:
            has_answer = False
            fields.update(self.get_empty_answer_fields(**kwargs))

        return fields, has_answer
Example #11
    def text_to_instance(self,
                         tokens: List[Token],
                         entities: List = None,
                         relations: List = None) -> Instance:
        sequence = TextField(tokens, self._token_indexers)
        instance_fields: Dict[str, Field] = {"tokens": sequence}
        words = [x.text for x in tokens]
        spans = []
        for start, end in enumerate_spans(words,
                                          max_span_width=self._max_span_width):
            assert start >= 0
            assert end >= 0
            spans.append(SpanField(start, end, sequence))

        span_field = ListField(spans)
        span_tuples = [(span.span_start, span.span_end) for span in spans]
        instance_fields["spans"] = span_field

        ner_labels = [[] for i in span_tuples]

        ner_list = [((e.start, e.end), e.role) for e in entities]

        for span, label in ner_list:
            if self._too_long(span):
                continue
            ix = span_tuples.index(span)
            # if "" in ner_labels[ix]:
            #     ner_labels[ix].remove("")

            ner_labels[ix] += [label]

        instance_fields["ner_labels"] = ListField([
            MultiLabelField(entry, label_namespace=self.label_namespace)
            for entry in ner_labels
        ])

        metadata = {"words": words, "relations": relations}
        instance_fields["metadata"] = MetadataField(metadata)

        return Instance(instance_fields)
Example #12
    def text_to_instance(
            self,  # type: ignore
            question: List[Token],
            entity: List[str],
            entity_surface: List[List[Token]],
            e_type: List[List[Token]] = None,
            e_descr: List[List[Token]] = None,
            e_detail: List[List[Token]] = None,
            logical_form: List[List[str]] = None) -> Instance:
        """
        We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
        """
        # pylint: disable=arguments-differ
        sequence = TextField(question, self._token_indexers)
        entity_sequence = ListField(
            [TextField(x, self._token_indexers) for x in entity_surface])
        description = ListField(
            [TextField(x, self._token_indexers) for x in e_descr])
        detail = ListField(
            [TextField(x, self._token_indexers) for x in e_detail])

        instance_fields: Dict[str, Field] = {
            'question': sequence,
            'entity_surface': entity_sequence,
            "entity_description": description,
            "entity_detail": detail
        }

        instance_fields["metadata"] = MetadataField({
            "question_words": [x.text for x in question],
            "entity_surface": [x.text for y in entity_sequence for x in y],
            "entity_description": [x.text for y in description for x in y],
            "entity_detail": [x.text for y in detail for x in y]
        })

        instance_fields['entity_type'] = ListField(
            [TextField(x, self._token_indexers) for x in e_type])
        instance_fields['entity'] = ListField(
            [LabelField(x, "entity") for x in entity])
        if logical_form:
            instance_fields['logical_form_1'] = ListField(
                [LabelField(x, "logical_form") for x in logical_form[0]])
            instance_fields['logical_form_2'] = ListField(
                [LabelField(x, "logical_form") for x in logical_form[1]])
            instance_fields['logical_form_both'] = ListField(
                [LabelField(x, "logical_form") for x in logical_form[2]])

        return Instance(instance_fields)
Example #13
    def _get_author_field(self,
                          authors: List[str]) -> Tuple[ListField, ListField]:
        """
        Get a Label field associated with authors along with their position
        Args:
            authors: list of authors

        Returns:
            authors and their positions
        """
        if authors == []:
            authors = ['##']
        authors = [self._tokenizer.tokenize(author) for author in authors]
        if len(authors) > self.max_num_authors:
            authors = authors[:self.max_num_authors - 1] + [authors[-1]]
        author_field = ListField([
            TextField(author, token_indexers=self._token_indexer_author_id)
            for author in authors
        ])

        author_positions = []
        for i, _ in enumerate(authors):
            if i == 0:
                author_positions.append(
                    TextField(
                        self._tokenizer.tokenize('00'),
                        token_indexers=self._token_indexer_author_position))
            elif i < len(authors) - 1:
                author_positions.append(
                    TextField(
                        self._tokenizer.tokenize('01'),
                        token_indexers=self._token_indexer_author_position))
            else:
                author_positions.append(
                    TextField(
                        self._tokenizer.tokenize('02'),
                        token_indexers=self._token_indexer_author_position))
        position_field = ListField(author_positions)
        return author_field, position_field
Example #14
    def text_to_instance(
            self,  # type: ignore
            premises: Union[List[str], List[List[str]]],
            choices: List[str],
            label: int = None) -> Instance:

        number_of_choices = len(choices)
        if isinstance(premises[0], str):
            premises = [premises] * number_of_choices

        # create an empty dictionary to store the input
        fields: Dict[str, Field] = {}
        all_premises = []
        all_choices = []
        for premise, hypothesis in zip(premises, choices):

            # hypothesis is a sentence, tokenize it to get List[Token]
            tokenized_hypothesis = self._tokenizer.tokenize(hypothesis)

            # create a ListField for premise since it is a list of sentences
            tokenized_premises_field = []
            for premise_sentence in premise:
                tokenized_premises_field.append(
                    TextField(self._tokenizer.tokenize(premise_sentence),
                              self._token_indexers))

            all_premises.append(ListField(tokenized_premises_field))

            # create a simple TextField for the hypothesis
            all_choices.append(
                TextField(tokenized_hypothesis, self._token_indexers))

        if label is not None:
            fields['label'] = LabelField(label, skip_indexing=True)

        fields['premises'] = ListField(all_premises)
        fields['choices'] = ListField(all_choices)

        return Instance(fields)
Example #15
 def test_all_fields_padded_to_max_length(self):
     list_field = ListField([self.field1, self.field2, self.field3])
     list_field.index(self.vocab)
     tensor_dict = list_field.as_tensor(list_field.get_padding_lengths())
     numpy.testing.assert_array_almost_equal(
         tensor_dict["words"]["tokens"][0].detach().cpu().numpy(),
         numpy.array([2, 3, 4, 5, 0]))
     numpy.testing.assert_array_almost_equal(
         tensor_dict["words"]["tokens"][1].detach().cpu().numpy(),
         numpy.array([2, 3, 4, 1, 5]))
     numpy.testing.assert_array_almost_equal(
         tensor_dict["words"]["tokens"][2].detach().cpu().numpy(),
         numpy.array([2, 3, 1, 5, 0]))
Example #16
    def text_to_instance(self,
                         label,
                         response=None,
                         original_post=None,
                         weakpoints=None,
                         op_features=None,
                         response_features=None,
                         op_doc_features=None,
                         response_doc_features=None,
                         goodpoints=None) -> Instance:

        fields: Dict[str, Field] = {}

        if original_post is not None:
            fields['original_post'] = ListField([
                TextField(
                    self._tokenizer.tokenize(s)[:self.max_sentence_len],
                    self._token_indexers)
                for s in original_post[:self.max_post_len]
            ])
            if weakpoints is not None:
                fields['weakpoints'] = ListField([
                    IndexField(wp, fields['original_post'])
                    for wp in weakpoints
                ])

        if response is not None:
            fields['response'] = ListField([
                TextField(
                    self._tokenizer.tokenize(s)[:self.max_sentence_len],
                    self._token_indexers) for s in response[:self.max_post_len]
            ])

            if goodpoints is not None:
                fields['goodpoints'] = ListField(
                    [IndexField(gp, fields['response']) for gp in goodpoints])

        if op_features is not None:
            fields['op_features'] = ListField([
                ArrayField(np.array(f))
                for f in op_features[:self.max_post_len]
            ])

        if response_features is not None:
            fields['response_features'] = ListField([
                ArrayField(np.array(f))
                for f in response_features[:self.max_post_len]
            ])

        if op_doc_features is not None:
            fields['op_doc_features'] = ArrayField(np.array(op_doc_features))

        if response_doc_features is not None:
            fields['response_doc_features'] = ArrayField(
                np.array(response_doc_features))

        fields['label'] = LabelField(label, skip_indexing=True)

        return Instance(fields)
Example #17
    def text_to_instance(
        self,  # type: ignore
        candidates: List[str],
        query: str,
        supports: List[str],
        _id: str = None,
        answer: str = None,
        annotations: List[List[str]] = None,
    ) -> Instance:

        fields: Dict[str, Field] = {}

        candidates_field = ListField([
            TextField(candidate, self._token_indexers)
            for candidate in self._tokenizer.batch_tokenize(candidates)
        ])

        fields["query"] = TextField(self._tokenizer.tokenize(query),
                                    self._token_indexers)

        fields["supports"] = ListField([
            TextField(support, self._token_indexers)
            for support in self._tokenizer.batch_tokenize(supports)
        ])

        fields["answer"] = TextField(self._tokenizer.tokenize(answer),
                                     self._token_indexers)

        fields["answer_index"] = IndexField(candidates.index(answer),
                                            candidates_field)

        fields["candidates"] = candidates_field

        fields["metadata"] = MetadataField({
            "annotations": annotations,
            "id": _id
        })

        return Instance(fields)
    def text_to_instance(
            self,  # type: ignore
            candidates: List[str],
            query: str,
            supports: List[str],
            _id: str = None,
            answer: str = None,
            annotations: List[List[str]] = None) -> Instance:

        # pylint: disable=arguments-differ
        fields: Dict[str, Field] = {}

        candidates_field = ListField([
            TextField(candidate, self._token_indexers)
            for candidate in self._tokenizer.batch_tokenize(candidates)
        ])

        fields['query'] = TextField(self._tokenizer.tokenize(query),
                                    self._token_indexers)

        fields['supports'] = ListField([
            TextField(support, self._token_indexers)
            for support in self._tokenizer.batch_tokenize(supports)
        ])

        fields['answer'] = TextField(self._tokenizer.tokenize(answer),
                                     self._token_indexers)

        fields['answer_index'] = IndexField(candidates.index(answer),
                                            candidates_field)

        fields['candidates'] = candidates_field

        fields['metadata'] = MetadataField({
            'annotations': annotations,
            'id': _id
        })

        return Instance(fields)
Example #19
    def tokens_to_user_field(self, tokens) -> Optional[ListField]:
        doc_list = []
        if self.overflow_doc_strategy == 'latest':
            docs = tokens[-self.max_doc:]
        elif self.overflow_doc_strategy == 'earliest':
            docs = tokens[:self.max_doc]
        elif self.overflow_doc_strategy == 'all':
            docs = tokens
        elif self.overflow_doc_strategy == 'random':
            if len(tokens) > self.max_doc:
                doc_indexes = range(len(tokens))
                selected_doc_indexes = np.sort(
                    np.random.choice(doc_indexes, self.max_doc, replace=False))
                docs = [tokens[i] for i in selected_doc_indexes]
            else:
                docs = tokens
        else:
            raise ValueError(
                '{} is not a valid docs overflow strategy; choose from '
                'latest, earliest, all, or random'.format(
                    self.overflow_doc_strategy))

        for doc in docs:
            sent_list = []
            for sentence in doc[:self.max_sent]:
                word_list = []
                for word in sentence[:self.max_word]:
                    if len(word) < self.max_word_len:
                        word_list.append(Token(word))
                    else:
                        word_list.append(Token(word[:self.max_word_len]))
                if len(word_list) > 0:
                    sent_list.append(TextField(word_list, self.token_indexers))

            if len(sent_list) > 0:
                doc_list.append(ListField(sent_list))

        if len(doc_list) > 0:
            return ListField(doc_list)
        else:
            return None
 def text_to_instance(self, article_paragraphs: List[List[str]], label: str,
                      evidence_spans: List[int], outcome: List[str],
                      intervention: List[str], comparator: List[str]):
     article = ListField([
         TextField([Token(x) for x in para[:100]], self.token_indexers)
         for para in article_paragraphs
     ])
     fields = {
         'article': article,
         'outcome': TextField([Token(x) for x in outcome], self.token_indexers),
         'intervention': TextField([Token(x) for x in intervention], self.token_indexers),
         'comparator': TextField([Token(x) for x in comparator], self.token_indexers),
         'labels': LabelField(label),
         'evidence': ListField([IndexField(item, article) for item in evidence_spans]),
     }
     return Instance(fields)
Example #21
    def text_to_instance(self,
                         triple,
                         predicate,
                         draft,
                         revised=None,
                         action=None) -> Instance:
        triple_field = ListField([TextField(t, self.token_indexers) for t in triple])
        predicate_field = ListField([TextField(p, self.token_indexers) for p in predicate])
        draft.insert(0, Token(START_SYMBOL))
        draft.append(Token(END_SYMBOL))
        draft_field = TextField(draft, self.token_indexers)
        fields = {
            "triple_tokens": triple_field,
            "predicate_tokens": predicate_field,
            "draft_tokens": draft_field
        }
        meta_fields = {"draft": [w.text for w in draft[1:-1]], "triple": [t[-1].text for t in triple]}

        if revised is not None:
            meta_fields["revised"] = [w.text for w in revised]
            revised.insert(0, Token(START_SYMBOL))
            revised.append(Token(END_SYMBOL))

            action.insert(0, Token(START_SYMBOL))
            action.append(Token(END_SYMBOL))

            triple_revised_ids = self._tokens_to_ids([t[-1] for t in triple] + action)
            fields["triple_token_ids"] = ArrayField(np.array(triple_revised_ids[:len(triple)]))
            fields["action_token_ids"] = ArrayField(np.array(triple_revised_ids[len(triple):]))

            fields.update({"revised_tokens": TextField(revised, self.token_indexers),
                           "action_tokens": TextField(action, self.token_indexers)})
        else:
            fields["triple_token_ids"] = ArrayField(np.array(self._tokens_to_ids([t[-1] for t in triple])))

        fields["metadata"] = MetadataField(meta_fields)

        return Instance(fields)
Example #22
def make_multiqa_instance(question_tokens: List[Token],
                          tokenized_paragraph: List[List[Token]],
                          token_indexers: Dict[str, TokenIndexer],
                          paragraph: List[str],
                          answers_list: List[Tuple[int, int, str]] = None,
                          additional_metadata: Dict[str, Any] = None) -> AllenInstance:

    additional_metadata = additional_metadata or {}
    fields: Dict[str, Field] = {}

    passage_offsets = [(token.idx, token.idx + len(token.text)) for token in tokenized_paragraph]
    # This is separate so we can reference it later with a known type.
    passage_field = TextField(tokenized_paragraph, token_indexers)
    fields['passage'] = passage_field
    fields['question'] = TextField(question_tokens, token_indexers)
    metadata = {'original_passage': paragraph,
                'answers_list': answers_list,
                'token_offsets': passage_offsets,
                'question_tokens': [token.text for token in question_tokens],
                'passage_tokens': [token.text for token in tokenized_paragraph]}

    if answers_list is not None:
        span_start_list: List[Field] = []
        span_end_list: List[Field] = []
        if answers_list == []:
            span_start, span_end = -1, -1
        else:
            span_start, span_end, text = answers_list[0]

        span_start_list.append(IndexField(span_start, passage_field))
        span_end_list.append(IndexField(span_end, passage_field))

        fields['span_start'] = ListField(span_start_list)
        fields['span_end'] = ListField(span_end_list)

    metadata.update(additional_metadata)
    fields['metadata'] = MetadataField(metadata)
    return AllenInstance(fields)
    def text_to_instance(self,  # type: ignore
                         item_id: Any,
                         question_text: str,
                         choice_text_list: List[str],
                         facts_text_list: List[str],
                         answer_id: int,
                         meta_fields: Dict = None) -> Instance:
        # pylint: disable=arguments-differ
        fields: Dict[str, Field] = {}

        question_tokens = self.tokenize(question_text, "question")
        choices_tokens_list = [self.tokenize(x, "choice") for x in choice_text_list]
        facts_tokens_list = [self.tokenize(x, "fact") for x in facts_text_list]

        fields['question'] = TextField(question_tokens, self._token_indexers)
        fields['choices_list'] = ListField([TextField(x, self._token_indexers) for x in choices_tokens_list])
        fields['facts_list'] = ListField([TextField(x, self._token_indexers) for x in facts_tokens_list])

        fields['label'] = LabelField(answer_id, skip_indexing=True)

        metadata = {
            "id": item_id,
            "question_text": question_text,
            "choice_text_list": choice_text_list,
            "facts_text_list": facts_text_list,
            "question_tokens": [x.text for x in question_tokens],
            "choice_tokens_list": [[x.text for x in ct] for ct in choices_tokens_list],
            "facts_tokens_list": [[x.text for x in ct] for ct in facts_tokens_list],
            "label_gold": answer_id,
        }

        if meta_fields is not None:
            for k, v in meta_fields.items():
                metadata[k] = v

        fields["metadata"] = MetadataField(metadata)

        return Instance(fields)
Example #24
 def text_to_instance(
         self,  # type: ignore
         premise: str,
         hypotheses: List[str],
         labels: List[str] = None) -> Instance:
     # pylint: disable=arguments-differ
     fields: Dict[str, Field] = {}
     premise_tokens = self._tokenizer.tokenize(premise)
     fields['premise'] = TextField(premise_tokens, self._token_indexers)
     all_hypotheses_fields = list()
     for hypothesis in hypotheses:
         hypothesis_tokens = self._tokenizer.tokenize(hypothesis)
         all_hypotheses_fields.append(
             TextField(hypothesis_tokens, self._token_indexers))
     fields['hypotheses'] = ListField(all_hypotheses_fields)
     if labels:
         all_labels_fields = list()
         for label in labels:
             all_labels_fields.append(LabelField(label))
         fields['labels'] = ListField(all_labels_fields)
         metadata = {"labels": all_labels_fields}
         fields["metadata"] = MetadataField(metadata)
     return Instance(fields)
    def line_to_instance(self, query: List[Token], docs: List[List[List[Token]]],
                         relevant_ix: int = None, scores: List[float] = None,
                         dataset: Optional[str] = None) -> Instance:
        query_field = TextField(query, self.q_token_indexers)
        doc_fields = [ListField([TextField(sentence, self.d_token_indexers) for sentence in doc]) for doc in docs]

        fields = {
            'query': query_field,
            'docs': ListField(doc_fields)
        }

        if scores is not None:
            scores_field = ArrayField(np.array(scores))
            fields['scores'] = scores_field

        if relevant_ix is not None:
            label_field = LabelField(int(relevant_ix), skip_indexing=True)
            fields['labels'] = label_field

        if dataset is not None:
            fields[self.dataset_name_field] = MetadataField(dataset)

        return Instance(fields)
Example #26
    def test_padding_handles_list_fields_with_padding_values(self):
        array1 = ArrayField(numpy.ones([2, 3]), padding_value=-1)
        array2 = ArrayField(numpy.ones([1, 5]), padding_value=-1)
        empty_array = array1.empty_field()
        list_field = ListField([array1, array2, empty_array])

        returned_tensor = (list_field.as_tensor(
            list_field.get_padding_lengths()).detach().cpu().numpy())
        correct_tensor = numpy.array([
            [[1.0, 1.0, 1.0, -1.0, -1.0], [1.0, 1.0, 1.0, -1.0, -1.0]],
            [[1.0, 1.0, 1.0, 1.0, 1.0], [-1.0, -1.0, -1.0, -1.0, -1.0]],
            [[-1.0, -1.0, -1.0, -1.0, -1.0], [-1.0, -1.0, -1.0, -1.0, -1.0]],
        ])
        numpy.testing.assert_array_equal(returned_tensor, correct_tensor)
def text_to_instance(sents: List[str], labels: List[str] = None):
    """
    Convert a list of sentences (and optional labels) into an ``Instance``.
    """
    fields = {}
    tokenized_sents = [WordTokenizer().tokenize(sent) for sent in sents]
    sentence_sequence = ListField([
        TextField(tk, {'tokens': SingleIdTokenIndexer()})
        for tk in tokenized_sents
    ])
    fields['sentences'] = sentence_sequence
    if labels is not None:
        fields['labels'] = SequenceLabelField(labels, sentence_sequence)
    return Instance(fields)
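A short, hypothetical usage sketch for the helper above (the sentences and labels are invented for illustration):

# Build an Instance from two made-up sentences with made-up sentence labels.
instance = text_to_instance(
    ["The drug reduced blood pressure.", "Side effects were mild."],
    labels=["finding", "finding"],
)
print(list(instance.fields.keys()))  # ['sentences', 'labels']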
Example #28
def test_extract_tokens_listfield(task_head):
    tokenizer = Tokenizer(Vocab())
    input_tokens = list(tokenizer("test this sentence."))

    tf = TextField(input_tokens, None)
    instance = Instance({"test": ListField([tf, tf])})

    tokens = task_head._extract_tokens(instance)

    assert len(tokens) == 2 and len(tokens[0]) == 3 and len(tokens[1]) == 3
    assert all(
        isinstance(tok, Token)
        for tf_tokens in tokens
        for tok in tf_tokens
    )
Example #29
    def text_to_instance(self, document: str, label: str = None) -> Instance:
        sentences: List[str] = self._sentence_splitter.split_sentences(
            document)
        tokenized_sents: List[List[Token]] = [self._tokenizer.tokenize(sent)
                                              for sent in sentences]

        fields = {
            'tokens':
            ListField(
                [TextField(s, self._token_indexers) for s in tokenized_sents])
        }
        if label:
            fields['label'] = LabelField(int(label), skip_indexing=True)
        return Instance(fields)
    def test_padding_handles_list_fields(self):
        array1 = ArrayField(numpy.ones([2, 3]))
        array2 = ArrayField(numpy.ones([1, 5]))
        empty_array = array1.empty_field()
        list_field = ListField([array1, array2, empty_array])

        returned_tensor = list_field.as_tensor(list_field.get_padding_lengths()).detach().cpu().numpy()
        correct_tensor = numpy.array([[[1., 1., 1., 0., 0.],
                                       [1., 1., 1., 0., 0.]],
                                      [[1., 1., 1., 1., 1.],
                                       [0., 0., 0., 0., 0.]],
                                      [[0., 0., 0., 0., 0.],
                                       [0., 0., 0., 0., 0.]]])
        numpy.testing.assert_array_equal(returned_tensor, correct_tensor)