def test_empty_list_can_be_tensorized(self):
    tokenizer = SpacyTokenizer()
    tokens = tokenizer.tokenize("Foo")
    text_field = TextField(tokens, self.word_indexer)
    list_field = ListField([text_field.empty_field()])
    fields = {
        "list": list_field,
        "bar": TextField(tokenizer.tokenize("BAR"), self.word_indexer),
    }
    instance = Instance(fields)
    instance.index_fields(self.vocab)
    instance.as_tensor_dict()

def test_all_fields_padded_to_max_length(self):
    field1 = TextField(["this", "is", "a", "sentence"], self.word_indexer)
    field2 = TextField(["this", "is", "a", "different", "sentence"], self.word_indexer)
    field3 = TextField(["this", "is", "another", "sentence"], self.word_indexer)
    list_field = ListField([field1, field2, field3])
    list_field.index(self.vocab)
    array_dict = list_field.as_array(list_field.get_padding_lengths())
    numpy.testing.assert_array_almost_equal(array_dict["words"][0],
                                            numpy.array([2, 3, 4, 5, 0]))
    numpy.testing.assert_array_almost_equal(array_dict["words"][1],
                                            numpy.array([2, 3, 4, 1, 5]))
    numpy.testing.assert_array_almost_equal(array_dict["words"][2],
                                            numpy.array([2, 3, 1, 5, 0]))

def text_to_instance(self,  # type: ignore
                     qid: str,
                     question: str,
                     choices: List[str],
                     choice_evidences: List[Union[str, List[str]]] = None,
                     answer: str = None) -> Instance:
    # pylint: disable=arguments-differ
    fields: Dict[str, Field] = {}
    question_tokens = self._tokenizer.tokenize(question)
    choice_tokens = self._tokenizer.batch_tokenize(choices)
    qa_pair_tokens = []
    for i, c_tokens in enumerate(choice_tokens):
        qa_pair = question_tokens + [Token("[SEP]")] + c_tokens
        if choice_evidences and choice_evidences[i]:
            choice_evidence_sents = [
                evi for evi, _ in choice_evidences[i][:self.num_evidences]
            ]
            evidence_tokens = self._tokenizer.batch_tokenize(choice_evidence_sents)
            evidence_tokens_flat = [t for evi in evidence_tokens for t in evi]
        else:
            evidence_tokens_flat = []
        if evidence_tokens_flat:
            qa_pair += [Token("[SEP]")] + evidence_tokens_flat
        qa_pair_tokens.append(qa_pair)
    qa_pairs_field = ListField([
        TextField(tokens, self._token_indexers) for tokens in qa_pair_tokens
    ])
    if answer:
        fields['answer_index'] = IndexField(self.LABELS.index(answer),
                                            qa_pairs_field)
    fields['qa_pairs'] = qa_pairs_field
    metadata = {
        "qid": qid,
        "question": question,
        "choices": choices,
        "question_tokens": [x.text for x in question_tokens],
        "choices_tokens": [[x.text for x in tokens] for tokens in choice_tokens],
    }
    fields["metadata"] = MetadataField(metadata)
    return Instance(fields)

def text_to_instance(self,  # type: ignore
                     context_parse: Dict[str, Any],
                     question_parse: Dict[str, Any],
                     span_start: int,
                     span_end: int,
                     metadata: Dict[str, Any] = None) -> Instance:
    # pylint: disable=arguments-differ
    fields = {}
    if metadata is not None:
        fields["metadata"] = MetadataField(metadata)

    # Context.
    tokenized_context = self._context_tokenizer.tokenize(context_parse)
    if self._source_add_start_token:
        tokenized_context.insert(0, Token(START_SYMBOL))
    tokenized_context.append(Token(END_SYMBOL))
    if self._return_context_tokens_pointers:
        lowercase_tokens = False
        if "tokens" in self._context_token_indexers:
            lowercase_tokens = self._context_token_indexers["tokens"].lowercase_tokens
        context_tokens_text = [x.text for x in tokenized_context]
        _, unique_tokens_pointers, unique_tokens_list_lens = get_token_lookup_pointers(
            context_tokens_text, lowercase_tokens)
        context_tokens_pointers = ListField([
            ArrayField(np.asarray(x, dtype=np.int32), padding_value=-1)
            for x in unique_tokens_pointers
        ])
        fields["context_tokens_pointers"] = context_tokens_pointers
    context_field = TextField(tokenized_context, self._context_token_indexers)
    fields["passage"] = context_field

    # Question.
    tokenized_question = self._question_tokenizer.tokenize(question_parse)
    if self._source_add_start_token:
        tokenized_question.insert(0, Token(START_SYMBOL))
    tokenized_question.append(Token(END_SYMBOL))
    question_field = TextField(tokenized_question, self._question_token_indexers)
    fields["question"] = question_field

    fields['span_start'] = IndexField(span_start, context_field)
    fields['span_end'] = IndexField(span_end, context_field)
    return Instance(fields)

def test_get_padding_lengths(self):
    field1 = TextField(["this", "is", "a", "sentence"], self.word_indexer)
    field2 = TextField(["this", "is", "a", "different", "sentence"], self.word_indexer)
    field3 = TextField(["this", "is", "another", "sentence"], self.word_indexer)
    list_field = ListField([field1, field2, field3])
    list_field.index(self.vocab)
    lengths = list_field.get_padding_lengths()
    assert lengths == {"num_fields": 3, "num_tokens": 5}

def text_to_instance(self, sents: List[str], labels: List[str] = None) -> Instance:
    fields: Dict[str, Field] = {}
    tokenized_sents = [self._tokenizer.tokenize(sent) for sent in sents]
    sentence_sequence = ListField(
        [TextField(tk, self._token_indexers) for tk in tokenized_sents])
    fields['sentences'] = sentence_sequence
    if labels is not None:
        fields['labels'] = SequenceLabelField(labels, sentence_sequence)
    return Instance(fields)

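# --- Illustrative sketch (ours, not part of the reader above): a minimal,
# standalone example of the ListField/SequenceLabelField pairing that the
# previous function builds, runnable against the AllenNLP 1.x API.
from allennlp.data import Instance, Token, Vocabulary
from allennlp.data.fields import ListField, SequenceLabelField, TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer

def _listfield_labels_sketch() -> None:
    indexers = {"tokens": SingleIdTokenIndexer()}
    sentences = ListField([
        TextField([Token("a"), Token("cat")], indexers),
        TextField([Token("it"), Token("sat"), Token("down")], indexers),
    ])
    # One label per sentence; the ListField is the sequence being labeled.
    labels = SequenceLabelField(["background", "method"], sentences)
    instance = Instance({"sentences": sentences, "labels": labels})
    vocab = Vocabulary.from_instances([instance])
    instance.index_fields(vocab)
    # Both sentences are padded to the longest one (3 tokens here).
    tensors = instance.as_tensor_dict()
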
def _split(self, instance: Instance) -> Tuple[List[Instance], int]:
    # Determine the size of the sequence inside the instance.
    true_length = len(instance['source'])
    padded_length = self._split_size * (true_length // self._split_size)

    # Determine the split indices.
    split_indices = list(range(0, true_length, self._split_size))
    if true_length > split_indices[-1]:
        split_indices.append(true_length)

    # Determine which fields are not going to be split.
    constant_fields = [x for x in instance.fields if x not in self._splitting_keys]

    # Create the list of chunks.
    chunks: List[Instance] = []
    for i, (start, end) in enumerate(zip(split_indices[:-1], split_indices[1:])):
        # Copy all of the constant fields from the instance to the chunk.
        chunk_fields = {key: instance[key] for key in constant_fields}

        # Signal the model to reset its state on the first chunk only.
        if i == 0:
            reset = SequentialArrayField(np.array(1), dtype=np.uint8)
        else:
            reset = SequentialArrayField(np.array(0), dtype=np.uint8)
        chunk_fields['reset'] = reset

        # Obtain splits derived from sequence fields.
        for key in self._splitting_keys:
            source_field = instance[key]
            # pylint: disable=protected-access
            if isinstance(source_field, TextField):
                split_field = TextField(source_field.tokens[start:end],
                                        source_field._token_indexers)
            elif isinstance(source_field, SequentialArrayField):
                # TODO: Figure out how to use sequence dim here...
                split_field = SequentialArrayField(source_field.array[start:end],
                                                   dtype=source_field._dtype)
            elif isinstance(source_field, ListField):
                split_field = ListField(source_field.field_list[start:end])
            else:
                raise NotImplementedError(
                    'FancyIterator currently only supports splitting '
                    '`TextField`s, `ListField`s, or `SequentialArrayField`s.')
            chunk_fields[key] = split_field
        chunks.append(Instance(chunk_fields))
    return chunks, padded_length

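# --- Illustrative sketch (ours, not part of `_split` above): the same
# fixed-size windowing applied to a bare TextField, to make the index
# arithmetic concrete. Assumes the AllenNLP 1.x API.
from allennlp.data import Token
from allennlp.data.fields import TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer

def _chunking_sketch() -> None:
    indexers = {"tokens": SingleIdTokenIndexer()}
    source = TextField([Token(str(i)) for i in range(7)], indexers)
    split_size = 3
    bounds = list(range(0, len(source.tokens), split_size))
    if len(source.tokens) > bounds[-1]:
        bounds.append(len(source.tokens))
    # Rebuild one TextField per window; the last chunk may be shorter.
    chunks = [TextField(source.tokens[s:e], indexers)
              for s, e in zip(bounds[:-1], bounds[1:])]
    assert [len(c.tokens) for c in chunks] == [3, 3, 1]
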
def text_to_instance(self, doc, doc_id, rating) -> Instance:
    fields = {}
    fields['rating'] = LabelField(rating)
    fields['doc'] = ListField([TextField(sent, self._indexer) for sent in doc])
    nsents = len(doc)
    ntokens = sum(len(i) for i in doc)
    fields['meta'] = MetadataField({'doc_id': doc_id,
                                    'sentences': nsents,
                                    'tokens': ntokens})
    return Instance(fields)

def text_to_instance(self, data: dict, relation_type: int = None) -> Instance:  # type: ignore
    # pylint: disable=arguments-differ
    N_relations = []
    all_tokens_sentences = []
    for i, K_examples in enumerate(data[self.TRAIN_DATA]):
        tokenized_sentences = []
        clean_text_for_debug = []
        for relation in K_examples:
            head_tail = self.create_head_tail_sentence(relation)
            tokenized_tokens = self._tokenizer.tokenize(head_tail)
            field_of_tokens = TextField(tokenized_tokens, self._token_indexers)
            clean_text_for_debug.append(MetadataField(tokenized_tokens))
            tokenized_sentences.append(field_of_tokens)
        assert len(tokenized_sentences) == len(clean_text_for_debug)
        all_tokens_sentences.append(ListField(clean_text_for_debug))
        N_relations.append(ListField(tokenized_sentences))
    assert len(N_relations) == len(all_tokens_sentences)
    N_relations = ListField(N_relations)
    all_tokens_sentences = ListField(all_tokens_sentences)
    fields = {'sentences': N_relations, 'clean_tokens': all_tokens_sentences}

    test_dict = data[self.TEST_DATA]
    head_tail = self.create_head_tail_sentence(test_dict)
    tokenized_tokens = self._tokenizer.tokenize(head_tail)
    fields['test'] = TextField(tokenized_tokens, self._token_indexers)
    fields['test_clean_text'] = MetadataField(tokenized_tokens)
    if relation_type is not None:
        fields['label'] = IndexField(relation_type, N_relations)
    return Instance(fields)

def get_answer_fields(self,
                      **kwargs: Dict[str, Any]) -> Tuple[Dict[str, Field], bool]:
    number_occurrences_in_passage: List[Dict[str, Any]] = \
        kwargs['number_occurrences_in_passage']
    answer_texts: List[str] = kwargs['answer_texts']
    fields: Dict[str, Field] = {}

    target_numbers = get_target_numbers(answer_texts)

    # Get possible ways to arrive at target numbers with add/sub.
    valid_expressions: List[List[int]] = \
        self._find_valid_add_sub_expressions_with_rounding(
            self._special_numbers + [number_occurrence['value']
                                     for number_occurrence in number_occurrences_in_passage],
            target_numbers,
            self._max_numbers_expression)

    if len(valid_expressions) > 0:
        has_answer = True
        add_sub_signs_field: List[Field] = []
        special_signs_field: List[Field] = []
        for signs_for_one_add_sub_expressions in valid_expressions:
            special_signs = signs_for_one_add_sub_expressions[:len(self._special_numbers)]
            normal_signs = signs_for_one_add_sub_expressions[len(self._special_numbers):]
            add_sub_signs_field.append(LabelsField(normal_signs))
            special_signs_field.append(LabelsField(special_signs))
        fields['answer_as_expressions'] = ListField(add_sub_signs_field)
        if self._special_numbers:
            fields['answer_as_expressions_extra'] = ListField(special_signs_field)
    else:
        has_answer = False
        fields.update(self.get_empty_answer_fields(**kwargs))

    return fields, has_answer

def text_to_instance(self,
                     tokens: List[Token],
                     entities: List = None,
                     relations: List = None) -> Instance:
    sequence = TextField(tokens, self._token_indexers)
    instance_fields: Dict[str, Field] = {"tokens": sequence}
    words = [x.text for x in tokens]

    spans = []
    for start, end in enumerate_spans(words, max_span_width=self._max_span_width):
        assert start >= 0
        assert end >= 0
        spans.append(SpanField(start, end, sequence))
    span_field = ListField(spans)
    span_tuples = [(span.span_start, span.span_end) for span in spans]
    instance_fields["spans"] = span_field

    # Guard against the None default so a missing `entities` list simply
    # yields empty label sets.
    ner_labels = [[] for _ in span_tuples]
    ner_list = [((e.start, e.end), e.role) for e in entities or []]
    for span, label in ner_list:
        if self._too_long(span):
            continue
        ix = span_tuples.index(span)
        ner_labels[ix] += [label]

    instance_fields["ner_labels"] = ListField([
        MultiLabelField(entry, label_namespace=self.label_namespace)
        for entry in ner_labels
    ])

    metadata = {"words": words, "relations": relations}
    instance_fields["metadata"] = MetadataField(metadata)
    return Instance(instance_fields)

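# --- Illustrative check (ours) of what `enumerate_spans` yields: inclusive
# (start, end) pairs up to the maximum width, ordered by start index.
# Assumes the AllenNLP 1.x import path below.
from allennlp.data.dataset_readers.dataset_utils import enumerate_spans

def _enumerate_spans_sketch() -> None:
    spans = enumerate_spans(["a", "b", "c"], max_span_width=2)
    assert spans == [(0, 0), (0, 1), (1, 1), (1, 2), (2, 2)]
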
def text_to_instance(self,  # type: ignore
                     question: List[Token],
                     entity: List[str],
                     entity_surface: List[List[Token]],
                     e_type: List[List[Token]] = None,
                     e_descr: List[List[Token]] = None,
                     e_detail: List[List[Token]] = None,
                     logical_form: List[List[str]] = None) -> Instance:
    """
    We take ``pre-tokenized`` input here, because we don't have a tokenizer
    in this class. Note that ``e_type``, ``e_descr``, and ``e_detail`` are
    expected to be provided despite their ``None`` defaults.
    """
    # pylint: disable=arguments-differ
    sequence = TextField(question, self._token_indexers)
    entity_sequence = ListField(
        [TextField(x, self._token_indexers) for x in entity_surface])
    description = ListField(
        [TextField(x, self._token_indexers) for x in e_descr])
    detail = ListField(
        [TextField(x, self._token_indexers) for x in e_detail])
    instance_fields: Dict[str, Field] = {
        'question': sequence,
        'entity_surface': entity_sequence,
        'entity_description': description,
        'entity_detail': detail,
    }
    instance_fields["metadata"] = MetadataField({
        "question_words": [x.text for x in question],
        "entity_surface": [x.text for y in entity_sequence for x in y],
        "entity_description": [x.text for y in description for x in y],
        "entity_detail": [x.text for y in detail for x in y],
    })
    instance_fields['entity_type'] = ListField(
        [TextField(x, self._token_indexers) for x in e_type])
    instance_fields['entity'] = ListField(
        [LabelField(x, "entity") for x in entity])
    if logical_form:
        instance_fields['logical_form_1'] = ListField(
            [LabelField(x, "logical_form") for x in logical_form[0]])
        instance_fields['logical_form_2'] = ListField(
            [LabelField(x, "logical_form") for x in logical_form[1]])
        instance_fields['logical_form_both'] = ListField(
            [LabelField(x, "logical_form") for x in logical_form[2]])
    return Instance(instance_fields)

def _get_author_field(self, authors: List[str]) -> Tuple[ListField, ListField]:
    """
    Get a label field for the authors along with their positions.

    Args:
        authors: list of authors

    Returns:
        authors and their positions
    """
    if not authors:
        authors = ['##']
    authors = [self._tokenizer.tokenize(author) for author in authors]
    if len(authors) > self.max_num_authors:
        # Keep the first authors and always retain the last author.
        authors = authors[:self.max_num_authors - 1] + [authors[-1]]
    author_field = ListField([
        TextField(author, token_indexers=self._token_indexer_author_id)
        for author in authors
    ])
    author_positions = []
    for i, _ in enumerate(authors):
        if i == 0:  # first author
            author_positions.append(
                TextField(self._tokenizer.tokenize('00'),
                          token_indexers=self._token_indexer_author_position))
        elif i < len(authors) - 1:  # middle author
            author_positions.append(
                TextField(self._tokenizer.tokenize('01'),
                          token_indexers=self._token_indexer_author_position))
        else:  # last author
            author_positions.append(
                TextField(self._tokenizer.tokenize('02'),
                          token_indexers=self._token_indexer_author_position))
    position_field = ListField(author_positions)
    return author_field, position_field

def text_to_instance(self,  # type: ignore
                     premises: Union[List[str], List[List[str]]],
                     choices: List[str],
                     label: int = None) -> Instance:
    number_of_choices = len(choices)
    if isinstance(premises[0], str):
        premises = [premises] * number_of_choices

    # Create an empty dictionary to store the input.
    fields: Dict[str, Field] = {}
    all_premises = []
    all_choices = []
    for premise, hypothesis in zip(premises, choices):
        # The hypothesis is a sentence; tokenize it to get a List[Token].
        tokenized_hypothesis = self._tokenizer.tokenize(hypothesis)

        # Create a ListField for the premise, since it is a list of sentences.
        tokenized_premises_field = []
        for premise_sentence in premise:
            tokenized_premises_field.append(
                TextField(self._tokenizer.tokenize(premise_sentence),
                          self._token_indexers))
        all_premises.append(ListField(tokenized_premises_field))

        # Create a simple TextField for the hypothesis.
        all_choices.append(TextField(tokenized_hypothesis, self._token_indexers))
    if label is not None:
        fields['label'] = LabelField(label, skip_indexing=True)
    fields['premises'] = ListField(all_premises)
    fields['choices'] = ListField(all_choices)
    return Instance(fields)

def test_all_fields_padded_to_max_length(self):
    list_field = ListField([self.field1, self.field2, self.field3])
    list_field.index(self.vocab)
    tensor_dict = list_field.as_tensor(list_field.get_padding_lengths())
    numpy.testing.assert_array_almost_equal(
        tensor_dict["words"]["tokens"][0].detach().cpu().numpy(),
        numpy.array([2, 3, 4, 5, 0]))
    numpy.testing.assert_array_almost_equal(
        tensor_dict["words"]["tokens"][1].detach().cpu().numpy(),
        numpy.array([2, 3, 4, 1, 5]))
    numpy.testing.assert_array_almost_equal(
        tensor_dict["words"]["tokens"][2].detach().cpu().numpy(),
        numpy.array([2, 3, 1, 5, 0]))

def text_to_instance(self,
                     label,
                     response=None,
                     original_post=None,
                     weakpoints=None,
                     op_features=None,
                     response_features=None,
                     op_doc_features=None,
                     response_doc_features=None,
                     goodpoints=None) -> Instance:
    fields: Dict[str, Field] = {}
    if original_post is not None:
        fields['original_post'] = ListField([
            TextField(self._tokenizer.tokenize(s)[:self.max_sentence_len],
                      self._token_indexers)
            for s in original_post[:self.max_post_len]
        ])
        if weakpoints is not None:
            fields['weakpoints'] = ListField(
                [IndexField(wp, fields['original_post']) for wp in weakpoints])
    if response is not None:
        fields['response'] = ListField([
            TextField(self._tokenizer.tokenize(s)[:self.max_sentence_len],
                      self._token_indexers)
            for s in response[:self.max_post_len]
        ])
        if goodpoints is not None:
            fields['goodpoints'] = ListField(
                [IndexField(gp, fields['response']) for gp in goodpoints])
    if op_features is not None:
        fields['op_features'] = ListField(
            [ArrayField(np.array(f)) for f in op_features[:self.max_post_len]])
    if response_features is not None:
        fields['response_features'] = ListField(
            [ArrayField(np.array(f)) for f in response_features[:self.max_post_len]])
    if op_doc_features is not None:
        fields['op_doc_features'] = ArrayField(np.array(op_doc_features))
    if response_doc_features is not None:
        fields['response_doc_features'] = ArrayField(np.array(response_doc_features))
    fields['label'] = LabelField(label, skip_indexing=True)
    return Instance(fields)

def text_to_instance(self,  # type: ignore
                     candidates: List[str],
                     query: str,
                     supports: List[str],
                     _id: str = None,
                     answer: str = None,
                     annotations: List[List[str]] = None) -> Instance:
    fields: Dict[str, Field] = {}
    candidates_field = ListField([
        TextField(candidate, self._token_indexers)
        for candidate in self._tokenizer.batch_tokenize(candidates)
    ])
    fields["query"] = TextField(self._tokenizer.tokenize(query),
                                self._token_indexers)
    fields["supports"] = ListField([
        TextField(support, self._token_indexers)
        for support in self._tokenizer.batch_tokenize(supports)
    ])
    # Only build answer fields when an answer is given; tokenizing the
    # default None would fail.
    if answer is not None:
        fields["answer"] = TextField(self._tokenizer.tokenize(answer),
                                     self._token_indexers)
        fields["answer_index"] = IndexField(candidates.index(answer),
                                            candidates_field)
    fields["candidates"] = candidates_field
    fields["metadata"] = MetadataField({"annotations": annotations, "id": _id})
    return Instance(fields)

def tokens_to_user_field(self, tokens) -> Optional[ListField]:
    doc_list = []
    if self.overflow_doc_strategy == 'latest':
        docs = tokens[-self.max_doc:]
    elif self.overflow_doc_strategy == 'earliest':
        docs = tokens[:self.max_doc]
    elif self.overflow_doc_strategy == 'all':
        docs = tokens
    elif self.overflow_doc_strategy == 'random':
        if len(tokens) > self.max_doc:
            doc_indexes = range(len(tokens))
            selected_doc_indexes = np.sort(
                np.random.choice(doc_indexes, self.max_doc, replace=False))
            docs = [tokens[i] for i in selected_doc_indexes]
        else:
            docs = tokens
    else:
        raise ValueError(
            '{} is not a valid doc-overflow strategy; choose from latest, '
            'earliest, all, or random'.format(self.overflow_doc_strategy))
    for doc in docs:
        sent_list = []
        for sentence in doc[:self.max_sent]:
            word_list = []
            for word in sentence[:self.max_word]:
                if len(word) < self.max_word_len:
                    word_list.append(Token(word))
                else:
                    word_list.append(Token(word[:self.max_word_len]))
            if len(word_list) > 0:
                sent_list.append(TextField(word_list, self.token_indexers))
        if len(sent_list) > 0:
            doc_list.append(ListField(sent_list))
    if len(doc_list) > 0:
        return ListField(doc_list)
    return None

def text_to_instance(self,
                     article_paragraphs: List[List[str]],
                     label: str,
                     evidence_spans: List[int],
                     outcome: List[str],
                     intervention: List[str],
                     comparator: List[str]) -> Instance:
    article = ListField([
        TextField([Token(x) for x in para[:100]], self.token_indexers)
        for para in article_paragraphs
    ])
    fields = {
        'article': article,
        'outcome': TextField([Token(x) for x in outcome], self.token_indexers),
        'intervention': TextField([Token(x) for x in intervention],
                                  self.token_indexers),
        'comparator': TextField([Token(x) for x in comparator],
                                self.token_indexers),
        'labels': LabelField(label),
        'evidence': ListField(
            [IndexField(item, article) for item in evidence_spans]),
    }
    return Instance(fields)

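# --- Illustrative sketch (ours): IndexField can point into any SequenceField,
# including a ListField of paragraphs, which is how `evidence_spans` marks
# evidence paragraphs above. Assumes the AllenNLP 1.x API.
from allennlp.data import Token
from allennlp.data.fields import IndexField, ListField, TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer

def _evidence_index_sketch() -> None:
    indexers = {"tokens": SingleIdTokenIndexer()}
    paragraphs = ListField([
        TextField([Token("first"), Token("para")], indexers),
        TextField([Token("second"), Token("para")], indexers),
    ])
    evidence = IndexField(1, paragraphs)  # the second paragraph is evidence
    assert evidence.sequence_index == 1
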
def text_to_instance(self,
                     triple,
                     predicate,
                     draft,
                     revised=None,
                     action=None) -> Instance:
    triple_field = ListField(
        [TextField(t, self.token_indexers) for t in triple])
    predicate_field = ListField(
        [TextField(p, self.token_indexers) for p in predicate])
    draft.insert(0, Token(START_SYMBOL))
    draft.append(Token(END_SYMBOL))
    draft_field = TextField(draft, self.token_indexers)
    fields = {
        "triple_tokens": triple_field,
        "predicate_tokens": predicate_field,
        "draft_tokens": draft_field,
    }
    meta_fields = {"draft": [w.text for w in draft[1:-1]],
                   "triple": [t[-1].text for t in triple]}
    if revised is not None:
        meta_fields["revised"] = [w.text for w in revised]
        revised.insert(0, Token(START_SYMBOL))
        revised.append(Token(END_SYMBOL))
        action.insert(0, Token(START_SYMBOL))
        action.append(Token(END_SYMBOL))
        triple_revised_ids = self._tokens_to_ids(
            [t[-1] for t in triple] + action)
        fields["triple_token_ids"] = ArrayField(
            np.array(triple_revised_ids[:len(triple)]))
        fields["action_token_ids"] = ArrayField(
            np.array(triple_revised_ids[len(triple):]))
        fields.update({
            "revised_tokens": TextField(revised, self.token_indexers),
            "action_tokens": TextField(action, self.token_indexers),
        })
    else:
        fields["triple_token_ids"] = ArrayField(
            np.array(self._tokens_to_ids([t[-1] for t in triple])))
    fields["metadata"] = MetadataField(meta_fields)
    return Instance(fields)

def make_multiqa_instance(question_tokens: List[Token],
                          tokenized_paragraph: List[Token],
                          token_indexers: Dict[str, TokenIndexer],
                          paragraph: List[str],
                          answers_list: List[Tuple[int, int, str]] = None,
                          additional_metadata: Dict[str, Any] = None) -> AllenInstance:
    additional_metadata = additional_metadata or {}
    fields: Dict[str, Field] = {}

    passage_offsets = [(token.idx, token.idx + len(token.text))
                       for token in tokenized_paragraph]
    # This is separate so we can reference it later with a known type.
    passage_field = TextField(tokenized_paragraph, token_indexers)
    fields['passage'] = passage_field
    fields['question'] = TextField(question_tokens, token_indexers)
    metadata = {'original_passage': paragraph,
                'answers_list': answers_list,
                'token_offsets': passage_offsets,
                'question_tokens': [token.text for token in question_tokens],
                'passage_tokens': [token.text for token in tokenized_paragraph]}

    if answers_list is not None:
        span_start_list: List[Field] = []
        span_end_list: List[Field] = []
        if answers_list == []:
            span_start, span_end = -1, -1
        else:
            span_start, span_end, _text = answers_list[0]
        span_start_list.append(IndexField(span_start, passage_field))
        span_end_list.append(IndexField(span_end, passage_field))
        fields['span_start'] = ListField(span_start_list)
        fields['span_end'] = ListField(span_end_list)

    metadata.update(additional_metadata)
    fields['metadata'] = MetadataField(metadata)
    return AllenInstance(fields)

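# --- Illustrative sketch (ours): the character-offset computation used above,
# shown with spaCy-backed tokens whose `idx` is the character start offset.
# Assumes AllenNLP 1.x, where the tokenizer class is named SpacyTokenizer.
from allennlp.data.tokenizers import SpacyTokenizer

def _offsets_sketch() -> None:
    tokens = SpacyTokenizer().tokenize("a cat sat")
    offsets = [(t.idx, t.idx + len(t.text)) for t in tokens]
    assert offsets == [(0, 1), (2, 5), (6, 9)]
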
def text_to_instance(self,  # type: ignore
                     item_id: Any,
                     question_text: str,
                     choice_text_list: List[str],
                     facts_text_list: List[str],
                     answer_id: int,
                     meta_fields: Dict = None) -> Instance:
    # pylint: disable=arguments-differ
    fields: Dict[str, Field] = {}
    question_tokens = self.tokenize(question_text, "question")
    choices_tokens_list = [self.tokenize(x, "choice") for x in choice_text_list]
    facts_tokens_list = [self.tokenize(x, "fact") for x in facts_text_list]
    fields['question'] = TextField(question_tokens, self._token_indexers)
    fields['choices_list'] = ListField(
        [TextField(x, self._token_indexers) for x in choices_tokens_list])
    fields['facts_list'] = ListField(
        [TextField(x, self._token_indexers) for x in facts_tokens_list])
    fields['label'] = LabelField(answer_id, skip_indexing=True)

    metadata = {
        "id": item_id,
        "question_text": question_text,
        "choice_text_list": choice_text_list,
        "facts_text_list": facts_text_list,
        "question_tokens": [x.text for x in question_tokens],
        "choice_tokens_list": [[x.text for x in ct] for ct in choices_tokens_list],
        "facts_tokens_list": [[x.text for x in ct] for ct in facts_tokens_list],
        "label_gold": answer_id,
    }
    if meta_fields is not None:
        for k, v in meta_fields.items():
            metadata[k] = v
    fields["metadata"] = MetadataField(metadata)
    return Instance(fields)

def text_to_instance(self,  # type: ignore
                     premise: str,
                     hypotheses: List[str],
                     labels: List[str] = None) -> Instance:
    # pylint: disable=arguments-differ
    fields: Dict[str, Field] = {}
    premise_tokens = self._tokenizer.tokenize(premise)
    fields['premise'] = TextField(premise_tokens, self._token_indexers)

    all_hypotheses_fields = []
    for hypothesis in hypotheses:
        hypothesis_tokens = self._tokenizer.tokenize(hypothesis)
        all_hypotheses_fields.append(
            TextField(hypothesis_tokens, self._token_indexers))
    fields['hypotheses'] = ListField(all_hypotheses_fields)

    # Initialize up front so the metadata below is well-defined even when no
    # labels are given (the original raised a NameError in that case).
    all_labels_fields: List[Field] = []
    if labels:
        for label in labels:
            all_labels_fields.append(LabelField(label))
        fields['labels'] = ListField(all_labels_fields)
    metadata = {"labels": all_labels_fields}
    fields["metadata"] = MetadataField(metadata)
    return Instance(fields)

def line_to_instance(self,
                     query: List[Token],
                     docs: List[List[List[Token]]],
                     relevant_ix: int = None,
                     scores: List[float] = None,
                     dataset: Optional[str] = None) -> Instance:
    query_field = TextField(query, self.q_token_indexers)
    doc_fields = [
        ListField([TextField(sentence, self.d_token_indexers)
                   for sentence in doc])
        for doc in docs
    ]
    fields = {
        'query': query_field,
        'docs': ListField(doc_fields),
    }
    if scores is not None:
        # ArrayField expects a numpy array, not a plain list.
        fields['scores'] = ArrayField(np.asarray(scores))
    if relevant_ix is not None:
        fields['labels'] = LabelField(int(relevant_ix), skip_indexing=True)
    if dataset is not None:
        fields[self.dataset_name_field] = MetadataField(dataset)
    return Instance(fields)

def test_padding_handles_list_fields_with_padding_values(self):
    array1 = ArrayField(numpy.ones([2, 3]), padding_value=-1)
    array2 = ArrayField(numpy.ones([1, 5]), padding_value=-1)
    empty_array = array1.empty_field()
    list_field = ListField([array1, array2, empty_array])
    returned_tensor = (list_field.as_tensor(list_field.get_padding_lengths())
                       .detach().cpu().numpy())
    correct_tensor = numpy.array([
        [[1.0, 1.0, 1.0, -1.0, -1.0], [1.0, 1.0, 1.0, -1.0, -1.0]],
        [[1.0, 1.0, 1.0, 1.0, 1.0], [-1.0, -1.0, -1.0, -1.0, -1.0]],
        [[-1.0, -1.0, -1.0, -1.0, -1.0], [-1.0, -1.0, -1.0, -1.0, -1.0]],
    ])
    numpy.testing.assert_array_equal(returned_tensor, correct_tensor)

def text_to_instance(sents: List[str], labels: List[str] = None) -> Instance:
    """
    Convert a list of sentences (and, optionally, labels) into an ``Instance``.
    """
    fields = {}
    tokenized_sents = [WordTokenizer().tokenize(sent) for sent in sents]
    sentence_sequence = ListField([
        TextField(tk, {'tokens': SingleIdTokenIndexer()})
        for tk in tokenized_sents
    ])
    fields['sentences'] = sentence_sequence
    if labels is not None:
        fields['labels'] = SequenceLabelField(labels, sentence_sequence)
    return Instance(fields)

def test_extract_tokens_listfield(task_head):
    tokenizer = Tokenizer(Vocab())
    input_tokens = list(tokenizer("test this sentence."))
    tf = TextField(input_tokens, None)
    instance = Instance({"test": ListField([tf, tf])})
    tokens = task_head._extract_tokens(instance)
    assert len(tokens) == 2 and len(tokens[0]) == 3 and len(tokens[1]) == 3
    # The original bracket nesting made the inner check vacuous (it tested
    # the truthiness of lists); flatten the generator to test every token.
    assert all(isinstance(tok, Token)
               for tf_tokens in tokens
               for tok in tf_tokens)

def text_to_instance(self, document: str, label: str = None) -> Instance:
    sentences: List[str] = self._sentence_splitter.split_sentences(document)
    # Materialize a list of token lists (the original used a generator, which
    # contradicts the annotation and is consumed on first use).
    tokenized_sents: List[List[Token]] = [
        self._tokenizer.tokenize(sent) for sent in sentences
    ]
    fields = {
        'tokens': ListField(
            [TextField(s, self._token_indexers) for s in tokenized_sents])
    }
    if label:
        fields['label'] = LabelField(int(label), skip_indexing=True)
    return Instance(fields)

def test_padding_handles_list_fields(self):
    array1 = ArrayField(numpy.ones([2, 3]))
    array2 = ArrayField(numpy.ones([1, 5]))
    empty_array = array1.empty_field()
    list_field = ListField([array1, array2, empty_array])
    returned_tensor = (list_field.as_tensor(list_field.get_padding_lengths())
                       .detach().cpu().numpy())
    correct_tensor = numpy.array([
        [[1., 1., 1., 0., 0.], [1., 1., 1., 0., 0.]],
        [[1., 1., 1., 1., 1.], [0., 0., 0., 0., 0.]],
        [[0., 0., 0., 0., 0.], [0., 0., 0., 0., 0.]],
    ])
    numpy.testing.assert_array_equal(returned_tensor, correct_tensor)