def test_as_tensor_can_handle_multiple_token_indexers_and_empty_fields(self):
    # pylint: disable=protected-access
    self.field1._token_indexers = self.words_and_characters_indexers
    self.field2._token_indexers = self.words_and_characters_indexers
    self.field3._token_indexers = self.words_and_characters_indexers
    list_field = ListField([self.field1.empty_field(), self.field1, self.field2])
    list_field.index(self.vocab)
    padding_lengths = list_field.get_padding_lengths()
    tensor_dict = list_field.as_tensor(padding_lengths)
    words = tensor_dict["words"].detach().cpu().numpy()
    characters = tensor_dict["characters"].detach().cpu().numpy()
    numpy.testing.assert_array_almost_equal(words, numpy.array([[0, 0, 0, 0, 0],
                                                                [2, 3, 4, 5, 0],
                                                                [2, 3, 4, 1, 5]]))
    numpy.testing.assert_array_almost_equal(characters[0], numpy.zeros([5, 9]))
    numpy.testing.assert_array_almost_equal(characters[1], numpy.array([[5, 1, 1, 2, 0, 0, 0, 0, 0],
                                                                        [1, 2, 0, 0, 0, 0, 0, 0, 0],
                                                                        [1, 0, 0, 0, 0, 0, 0, 0, 0],
                                                                        [2, 3, 4, 5, 3, 4, 6, 3, 0],
                                                                        [0, 0, 0, 0, 0, 0, 0, 0, 0]]))
    numpy.testing.assert_array_almost_equal(characters[2], numpy.array([[5, 1, 1, 2, 0, 0, 0, 0, 0],
                                                                        [1, 2, 0, 0, 0, 0, 0, 0, 0],
                                                                        [1, 0, 0, 0, 0, 0, 0, 0, 0],
                                                                        [1, 1, 1, 1, 3, 1, 3, 4, 5],
                                                                        [2, 3, 4, 5, 3, 4, 6, 3, 0]]))
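# A minimal sketch of the fixtures the ListField tests here appear to assume
# (self.field1/2/3, self.vocab, and the token indexers). The exact tokens and
# namespaces are assumptions inferred from the expected index values in the
# assertions (padding = 0, OOV = 1, then the in-vocabulary words), not the
# original setUp.
from allennlp.data import Vocabulary
from allennlp.data.fields import TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer, TokenCharactersIndexer
from allennlp.data.tokenizers import Token


def setUp(self):
    self.vocab = Vocabulary()
    # "words" namespace: 0 = padding, 1 = OOV, then the added tokens (2, 3, 4, 5).
    for word in ["this", "is", "a", "sentence"]:
        self.vocab.add_token_to_namespace(word, namespace="words")
    self.word_indexer = {"words": SingleIdTokenIndexer(namespace="words")}
    self.words_and_characters_indexers = {
        "words": SingleIdTokenIndexer(namespace="words"),
        "characters": TokenCharactersIndexer(namespace="characters"),
    }
    # field2 and field3 each contain one out-of-vocabulary word, which indexes to 1.
    self.field1 = TextField([Token(t) for t in ["this", "is", "a", "sentence"]],
                            self.word_indexer)
    self.field2 = TextField([Token(t) for t in ["this", "is", "a", "different", "sentence"]],
                            self.word_indexer)
    self.field3 = TextField([Token(t) for t in ["this", "is", "another", "sentence"]],
                            self.word_indexer)
    self.empty_text_field = self.field1.empty_field()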
def test_list_field_can_handle_empty_text_fields(self):
    list_field = ListField([self.field1, self.field2, self.empty_text_field])
    list_field.index(self.vocab)
    tensor_dict = list_field.as_tensor(list_field.get_padding_lengths())
    numpy.testing.assert_array_equal(tensor_dict["words"].detach().cpu().numpy(),
                                     numpy.array([[2, 3, 4, 5, 0],
                                                  [2, 3, 4, 1, 5],
                                                  [0, 0, 0, 0, 0]]))
def test_list_field_can_handle_empty_sequence_label_fields(self):
    list_field = ListField([self.sequence_label_field,
                            self.sequence_label_field,
                            self.empty_sequence_label_field])
    list_field.index(self.vocab)
    tensor = list_field.as_tensor(list_field.get_padding_lengths())
    numpy.testing.assert_array_equal(tensor.detach().cpu().numpy(),
                                     numpy.array([[1, 1, 0, 1],
                                                  [1, 1, 0, 1],
                                                  [0, 0, 0, 0]]))
def test_all_fields_padded_to_max_length(self):
    list_field = ListField([self.field1, self.field2, self.field3])
    list_field.index(self.vocab)
    tensor_dict = list_field.as_tensor(list_field.get_padding_lengths())
    numpy.testing.assert_array_almost_equal(tensor_dict["words"][0].detach().cpu().numpy(),
                                            numpy.array([2, 3, 4, 5, 0]))
    numpy.testing.assert_array_almost_equal(tensor_dict["words"][1].detach().cpu().numpy(),
                                            numpy.array([2, 3, 4, 1, 5]))
    numpy.testing.assert_array_almost_equal(tensor_dict["words"][2].detach().cpu().numpy(),
                                            numpy.array([2, 3, 1, 5, 0]))
def test_padding_handles_list_fields_with_padding_values(self):
    array1 = ArrayField(numpy.ones([2, 3]), padding_value=-1)
    array2 = ArrayField(numpy.ones([1, 5]), padding_value=-1)
    empty_array = array1.empty_field()
    list_field = ListField([array1, array2, empty_array])
    returned_tensor = list_field.as_tensor(list_field.get_padding_lengths()).detach().cpu().numpy()
    correct_tensor = numpy.array([[[1., 1., 1., -1., -1.],
                                   [1., 1., 1., -1., -1.]],
                                  [[1., 1., 1., 1., 1.],
                                   [-1., -1., -1., -1., -1.]],
                                  [[-1., -1., -1., -1., -1.],
                                   [-1., -1., -1., -1., -1.]]])
    numpy.testing.assert_array_equal(returned_tensor, correct_tensor)
def test_fields_can_pad_to_greater_than_max_length(self):
    list_field = ListField([self.field1, self.field2, self.field3])
    list_field.index(self.vocab)
    padding_lengths = list_field.get_padding_lengths()
    padding_lengths["list_words_length"] = 7
    padding_lengths["num_fields"] = 5
    tensor_dict = list_field.as_tensor(padding_lengths)
    numpy.testing.assert_array_almost_equal(tensor_dict["words"][0].detach().cpu().numpy(),
                                            numpy.array([2, 3, 4, 5, 0, 0, 0]))
    numpy.testing.assert_array_almost_equal(tensor_dict["words"][1].detach().cpu().numpy(),
                                            numpy.array([2, 3, 4, 1, 5, 0, 0]))
    numpy.testing.assert_array_almost_equal(tensor_dict["words"][2].detach().cpu().numpy(),
                                            numpy.array([2, 3, 1, 5, 0, 0, 0]))
    numpy.testing.assert_array_almost_equal(tensor_dict["words"][3].detach().cpu().numpy(),
                                            numpy.array([0, 0, 0, 0, 0, 0, 0]))
    numpy.testing.assert_array_almost_equal(tensor_dict["words"][4].detach().cpu().numpy(),
                                            numpy.array([0, 0, 0, 0, 0, 0, 0]))
def test_doubly_nested_field_works(self):
    field1 = ProductionRuleField('S -> [NP, VP]', is_global_rule=True)
    field2 = ProductionRuleField('NP -> test', is_global_rule=True)
    field3 = ProductionRuleField('VP -> eat', is_global_rule=False)
    list_field = ListField([ListField([field1, field2, field3]),
                            ListField([field1, field2])])
    list_field.index(self.vocab)
    padding_lengths = list_field.get_padding_lengths()
    tensors = list_field.as_tensor(padding_lengths)
    assert isinstance(tensors, list)
    assert len(tensors) == 2
    assert isinstance(tensors[0], list)
    assert len(tensors[0]) == 3
    assert isinstance(tensors[1], list)
    assert len(tensors[1]) == 3

    tensor_tuple = tensors[0][0]
    assert tensor_tuple[0] == 'S -> [NP, VP]'
    assert tensor_tuple[1] is True
    assert_almost_equal(tensor_tuple[2].detach().cpu().numpy(), [self.s_rule_index])

    tensor_tuple = tensors[0][1]
    assert tensor_tuple[0] == 'NP -> test'
    assert tensor_tuple[1] is True
    assert_almost_equal(tensor_tuple[2].detach().cpu().numpy(), [self.np_index])

    tensor_tuple = tensors[0][2]
    assert tensor_tuple[0] == 'VP -> eat'
    assert tensor_tuple[1] is False
    assert tensor_tuple[2] is None

    tensor_tuple = tensors[1][0]
    assert tensor_tuple[0] == 'S -> [NP, VP]'
    assert tensor_tuple[1] is True
    assert_almost_equal(tensor_tuple[2].detach().cpu().numpy(), [self.s_rule_index])

    tensor_tuple = tensors[1][1]
    assert tensor_tuple[0] == 'NP -> test'
    assert tensor_tuple[1] is True
    assert_almost_equal(tensor_tuple[2].detach().cpu().numpy(), [self.np_index])

    # This item was just padding.
    tensor_tuple = tensors[1][2]
    assert tensor_tuple[0] == ''
    assert tensor_tuple[1] is False
    assert tensor_tuple[2] is None
def test_nested_list_fields_are_padded_correctly(self):
    nested_field1 = ListField([LabelField(c) for c in ['a', 'b', 'c', 'd', 'e']])
    nested_field2 = ListField([LabelField(c) for c in ['f', 'g', 'h', 'i', 'j', 'k']])
    list_field = ListField([nested_field1.empty_field(), nested_field1, nested_field2])
    list_field.index(self.vocab)
    padding_lengths = list_field.get_padding_lengths()
    assert padding_lengths == {'num_fields': 3, 'list_num_fields': 6}
    tensor = list_field.as_tensor(padding_lengths).detach().cpu().numpy()
    numpy.testing.assert_almost_equal(tensor, [[-1, -1, -1, -1, -1, -1],
                                               [0, 1, 2, 3, 4, -1],
                                               [5, 6, 7, 8, 9, 10]])
def test_nested_list_fields_are_padded_correctly(self):
    nested_field1 = ListField([LabelField(c) for c in ['a', 'b', 'c', 'd', 'e']])
    nested_field2 = ListField([LabelField(c) for c in ['f', 'g', 'h', 'i', 'j', 'k']])
    list_field = ListField([nested_field1.empty_field(), nested_field1, nested_field2])
    list_field.index(self.vocab)
    padding_lengths = list_field.get_padding_lengths()
    assert padding_lengths == {'num_fields': 3, 'list_num_fields': 6}
    tensor = list_field.as_tensor(padding_lengths).data.cpu().numpy()
    numpy.testing.assert_almost_equal(tensor, [[[-1], [-1], [-1], [-1], [-1], [-1]],
                                               [[0], [1], [2], [3], [4], [-1]],
                                               [[5], [6], [7], [8], [9], [10]]])
def text_to_instance(self,  # type: ignore
                     sentences: List[str],
                     passage: str,
                     columns: str,
                     column_start_spans,
                     column_end_spans,
                     value_start_spans,
                     value_end_spans,
                     sqls,
                     passage_tokens: List[Token] = None,
                     column_tokens: List[Token] = None,
                     sentence_tokens: List[List[Token]] = None,
                     yesno_list: List[int] = None,
                     metadata: Dict[str, Any] = None) -> Instance:
    passage_field = TextField(passage_tokens, self._token_indexers)
    columns_field = TextField(column_tokens, self._token_indexers)
    sentences_field = ListField([TextField(s_tokens, self._token_indexers)
                                 for s_tokens in sentence_tokens])
    fields = {'passage': passage_field,
              'sentence': sentences_field,
              'column': columns_field}

    col_start_list = []
    col_end_list = []
    for s, e in zip(column_start_spans, column_end_spans):
        col_start_list.append(IndexField(s, passage_field))
        col_end_list.append(IndexField(e, passage_field))
    fields['col_start_idx'] = ListField(col_start_list)
    fields['col_end_idx'] = ListField(col_end_list)

    val_start_list = []
    val_end_list = []
    for s, e in zip(value_start_spans, value_end_spans):
        val_start_list.append(IndexField(s, passage_field))
        val_end_list.append(IndexField(e, passage_field))
    fields['val_start_idx'] = ListField(val_start_list)
    fields['val_end_idx'] = ListField(val_end_list)

    metadata['origin_passage'] = passage
    metadata['passage_tokens'] = passage_tokens
    metadata['column_tokens'] = column_tokens
    metadata['sentence_tokens'] = sentence_tokens
    metadata['sqls'] = sqls
    fields['yesno_list'] = ListField([LabelField(yesno, label_namespace="yesno_labels")
                                      for yesno in yesno_list])
    fields['metadata'] = MetadataField(metadata)
    return Instance(fields)
def test_list_field_can_handle_empty_text_fields(self):
    list_field = ListField([self.field1, self.field2, self.empty_text_field])
    list_field.index(self.vocab)
    tensor_dict = list_field.as_tensor(list_field.get_padding_lengths())
    numpy.testing.assert_array_equal(
        tensor_dict["words"].detach().cpu().numpy(),
        numpy.array([[2, 3, 4, 5, 0], [2, 3, 4, 1, 5], [0, 0, 0, 0, 0]]),
    )
def test_list_field_can_handle_empty_sequence_label_fields(self):
    list_field = ListField(
        [self.sequence_label_field, self.sequence_label_field, self.empty_sequence_label_field]
    )
    list_field.index(self.vocab)
    tensor = list_field.as_tensor(list_field.get_padding_lengths())
    numpy.testing.assert_array_equal(
        tensor.detach().cpu().numpy(), numpy.array([[1, 1, 0, 1], [1, 1, 0, 1], [0, 0, 0, 0]])
    )
def _json_to_instance(self, json_dict: JsonDict) -> Instance:
    if "text" in json_dict:
        text = json_dict["text"]
        words = [text[t["span"]["start"]:t["span"]["end"]] for t in json_dict["tokens"]]
    else:
        words = json_dict["tokens"]
    tokens = [Token(w) for w in words]

    # The attribute (_dataset_reader._token_indexers) is added by our DataReader.
    # This does not hold for arbitrary dataset readers.
    token_indexers = self._dataset_reader._token_indexers
    sequence = TextField(tokens, token_indexers=token_indexers)

    context_size = len(words) + 1
    spans = []
    span_masks = []
    for start, end in enumerate_spans(tokens,
                                      max_span_width=self._dataset_reader._max_span_width):
        spans.append(SpanField(start, end, sequence))
        span_masks.append(create_mask(start, end, context_size))
    span_field = ListField(spans)
    # span_tuples = [(span.span_start, span.span_end) for span in spans]
    span_mask_field = ListField([ArrayField(np.array(si, dtype=np.int), dtype=np.int)
                                 for si in span_masks])

    instance_fields: Dict[str, Field] = {
        "tokens": sequence,
        "metadata": MetadataField({"words": [x.text for x in tokens]}),
        "spans": span_field,
        "span_masks": span_mask_field,
    }
    return Instance(instance_fields)
def _make_instance_from_text(self, sent_tokens, pred_index, annotations=None, sent_id=None):
    instance_dict = {}
    if isinstance(sent_tokens, str):
        sent_tokens = sent_tokens.split()
    sent_tokens = cleanse_sentence_text(sent_tokens)
    text_field = TextField([Token(t) for t in sent_tokens], self._token_indexers)
    instance_dict['text'] = text_field
    instance_dict['predicate_indicator'] = SequenceLabelField(
        [1 if i == pred_index else 0 for i in range(len(sent_tokens))], text_field)

    if annotations is not None:
        for i, slot_name in enumerate(self._slot_labels):
            span_slot = ListField([LabelField(ann.slots[i], label_namespace="slot_%s" % slot_name)
                                   for ann in annotations
                                   for span in ann.all_spans])
            instance_dict['span_slot_%s' % slot_name] = span_slot
        labeled_span_field = ListField([SpanField(span.start(), span.end(), text_field)
                                        for ann in annotations
                                        for span in ann.all_spans])
        instance_dict['labeled_spans'] = labeled_span_field

        if self._bio_labels:
            bio_labels = ["O"] * len(sent_tokens)
            bio_labels[pred_index] = "B-V"
            for span in self._resolve_spans(annotations, pred_index):
                bio_labels[span.start()] = "B-ARG"
                for i in range(span.start() + 1, span.end() + 1):
                    bio_labels[i] = "I-ARG"
            instance_dict["bio_label"] = SequenceLabelField(bio_labels, text_field,
                                                            label_namespace="bio_labels")
        instance_dict['annotations'] = MetadataField({'annotations': annotations})

    metadata = {'pred_index': pred_index, 'sent_text': " ".join(sent_tokens)}
    if sent_id is not None:
        metadata['sent_id'] = sent_id
    instance_dict['metadata'] = MetadataField(metadata)
    return Instance(instance_dict)
def text_to_instance(self,  # type: ignore
                     query: List[str],
                     prelinked_entities: Dict[str, Dict[str, str]] = None,
                     sql: List[str] = None) -> Instance:
    # pylint: disable=arguments-differ
    fields: Dict[str, Field] = {}
    tokens = TextField([Token(t) for t in query], self._token_indexers)
    fields["tokens"] = tokens

    if sql is not None:
        try:
            action_sequence, all_actions = self._world.get_action_sequence_and_all_actions(
                sql, prelinked_entities)
        except ParseError:
            return None

        index_fields: List[Field] = []
        production_rule_fields: List[Field] = []
        for production_rule in all_actions:
            nonterminal, _ = production_rule.split(' ->')
            production_rule = ' '.join(production_rule.split(' '))
            field = ProductionRuleField(production_rule, self._world.is_global_rule(nonterminal))
            production_rule_fields.append(field)

        valid_actions_field = ListField(production_rule_fields)
        fields["valid_actions"] = valid_actions_field

        action_map = {action.rule: i  # type: ignore
                      for i, action in enumerate(valid_actions_field.field_list)}
        for production_rule in action_sequence:
            index_fields.append(IndexField(action_map[production_rule], valid_actions_field))

        action_sequence_field = ListField(index_fields)
        fields["action_sequence"] = action_sequence_field
    return Instance(fields)
def text_to_instance(self, tokens: List[Token], relations=None) -> Instance:
    # pylint: disable=arguments-differ
    fields: Dict[str, Field] = {}
    text_field = TextField(tokens, token_indexers=self._token_indexers)
    fields["text"] = text_field
    if relations is not None:
        field_list = []
        for relation in relations:
            field_list.append(
                SequenceLabelField(
                    labels=relation,
                    sequence_field=text_field,
                    label_namespace=self._label_namespace,
                )
            )
        fields["relations"] = ListField(field_list=field_list)
    return Instance(fields)
def text_to_instance(self,  # type: ignore
                     premise: str,
                     hypotheses: List[str],
                     labels: List[str] = None) -> Instance:
    # pylint: disable=arguments-differ
    fields: Dict[str, Field] = {}
    premise_tokens = self._tokenizer.tokenize(premise)
    fields['premise'] = TextField(premise_tokens, self._token_indexers)

    all_hypotheses_fields = list()
    for hypothesis in hypotheses:
        hypothesis_tokens = self._tokenizer.tokenize(hypothesis)
        all_hypotheses_fields.append(TextField(hypothesis_tokens, self._token_indexers))
    fields['hypotheses'] = ListField(all_hypotheses_fields)

    if labels:
        all_labels_fields = list()
        for label in labels:
            all_labels_fields.append(LabelField(label))
        fields['labels'] = ListField(all_labels_fields)
        metadata = {"labels": all_labels_fields}
        fields["metadata"] = MetadataField(metadata)
    return Instance(fields)
def text_to_instance(self,
                     image: np.ndarray,
                     label_box: List[List[float]] = list(),
                     label_class: List[str] = list(),
                     keypoints: List[List[Tuple[float, float, float]]] = list()) -> Instance:
    if self._keypoint_name in self._include_fields:
        # protect against some augmentations not supporting keypoints
        img, _, label_box, label_class, keypoints = self.augment(
            image,
            boxes=[np.array(b) for b in label_box],
            category_id=label_class,
            keypoints=keypoints)
    else:
        img, _, label_box, label_class, _ = self.augment(
            image,
            boxes=[np.array(b) for b in label_box],
            category_id=label_class)
    h, w, c = img.shape

    fields: Dict[str, Field] = {}
    fields['image'] = ImageField(img.transpose(2, 0, 1), channels_first=False)
    fields['image_sizes'] = ArrayField(np.array([w, h]))
    if self._bbox_name in self._include_fields and len(label_box) > 0:
        box_fields = [BoundingBoxField(x) for x in label_box]
        fields['boxes'] = ListField(box_fields)
    if self._bbox_class_name in self._include_fields and len(label_class) > 0:
        fields['box_classes'] = ListField([LabelField(idx) for idx in label_class])
    if self._keypoint_name in self._include_fields and len(keypoints) > 0:
        assert all([len(kp) == len(keypoints[0]) for kp in keypoints])
        fields['keypoint_positions'] = ListField([KeypointField(kp) for kp in keypoints])
    return Instance(fields)
def text_to_instance(self,
                     saifa_text_list: List[ArrayField],
                     raisha_text_list: List[ArrayField],
                     labels: List[str] = None,
                     metadata: Dict = None) -> Instance:
    raisha_text_list = ListField(raisha_text_list)
    fields = {'source': raisha_text_list}
    saifa_text_list = ListField(saifa_text_list)
    fields['target'] = saifa_text_list
    if labels:
        seq_labels_field = SequenceLabelField(labels=labels, sequence_field=saifa_text_list)
        fields['seq_labels'] = seq_labels_field
        reg_labels = [0 if label == 'hotel' else 1 for label in labels]
        reg_label_field = FloatLabelField(sum(reg_labels) / len(reg_labels))
        fields['reg_labels'] = reg_label_field
    if metadata is not None:
        fields['metadata'] = MetadataField(metadata)
    return Instance(fields)
def text_to_instance(self,
                     text: str,
                     labels: List[str] = None,
                     header: List[str] = None) -> Instance:  # type: ignore
    # pylint: disable=arguments-differ
    tokenized_text = self._tokenizer.tokenize(text)
    text_field = TextField(tokenized_text, self._token_indexers)
    fields = {'text': text_field}
    if not labels:
        labels = [0 for i in range(237)]
    fields['labels'] = ListField([LabelField(int(l), skip_indexing=True) for l in labels])
    fields['metadata'] = MetadataField(header)
    return Instance(fields)
def text_to_instance(self,
                     source_string: str,
                     target_string: str = None,
                     alignment: str = None) -> Instance:  # type: ignore
    # pylint: disable=arguments-differ
    tokenized_source = self._source_tokenizer.tokenize(source_string)
    if self._source_add_start_token:
        tokenized_source.insert(0, Token(START_SYMBOL))
    tokenized_source.append(Token(END_SYMBOL))
    source_field = TextField(tokenized_source, self._source_token_indexers)

    if target_string is not None:
        tokenized_target = self._target_tokenizer.tokenize(target_string)
        if self._remove_unneeded_aliases:
            new_target = tu.clean_unneeded_aliases([token.text for token in tokenized_target])
            tokenized_target = [Token(t) for t in new_target]
        tokenized_target.insert(0, Token(START_SYMBOL))
        tokenized_target.append(Token(END_SYMBOL))
        target_field = TextField(tokenized_target, self._target_token_indexers)

        alignment_index_fields: List[IndexField] = []
        if alignment:
            # Only tokenize a real alignment string; a None alignment falls through
            # to the dummy alignments below.
            tokenized_alignment = self._source_tokenizer.tokenize(alignment)
            tmp_source_tokenized_strings = [t.text for t in tokenized_source]
            for aligned_token in tokenized_alignment:
                try:
                    aligned_token_index = int(
                        tmp_source_tokenized_strings.index(aligned_token.text))
                except ValueError:
                    # Since START_TOKEN is added, no step should be aligned to it,
                    # so it can be used as a special "no alignment" index.
                    aligned_token_index = 0
                alignment_index_fields.append(IndexField(aligned_token_index, source_field))
        if not alignment_index_fields:
            # If there was no alignment (it was None or ""), add dummy alignments.
            for _ in range(len(tokenized_target) - 2):
                alignment_index_fields.append(IndexField(0, source_field))
        alignment_field = ListField(alignment_index_fields)

        return Instance({"source_tokens": source_field,
                         "target_tokens": target_field,
                         "alignment_sequence": alignment_field})
    else:
        return Instance({'source_tokens': source_field})
def test_all_fields_padded_to_max_length(self):
    list_field = ListField([self.field1, self.field2, self.field3])
    list_field.index(self.vocab)
    tensor_dict = list_field.as_tensor(list_field.get_padding_lengths())
    numpy.testing.assert_array_almost_equal(tensor_dict[u"words"][0].detach().cpu().numpy(),
                                            numpy.array([2, 3, 4, 5, 0]))
    numpy.testing.assert_array_almost_equal(tensor_dict[u"words"][1].detach().cpu().numpy(),
                                            numpy.array([2, 3, 4, 1, 5]))
    numpy.testing.assert_array_almost_equal(tensor_dict[u"words"][2].detach().cpu().numpy(),
                                            numpy.array([2, 3, 1, 5, 0]))
def _read(self, file_path):
    # if `file_path` is a URL, redirect to the cache
    # file_path = cached_path(file_path)
    for filename in os.listdir(file_path):
        filename_splitted = filename.split('_')
        task_name = filename_splitted[-3]
        domain_name = filename_splitted[-2]
        if task_name not in self._tasks or domain_name not in self._domains:
            continue
        with open(os.path.join(file_path, filename), "r") as data_file:
            logger.info("Reading instances from lines in file at: %s", filename)
            for line in Tqdm.tqdm(data_file):
                line = line.strip("\n")
                # skip blank lines
                if not line:
                    continue
                tokens_and_tags = [pair.rsplit(self._word_tag_delimiter, 1)
                                   for pair in line.split(self._token_delimiter)]
                tokens = [Token(token) for token, tag in tokens_and_tags]
                tags = [tag for token, tag in tokens_and_tags]
                sequence = TextField(tokens, self._token_indexers)
                sequence_tags = SequenceLabelField(tags, sequence,
                                                   label_namespace=task_name + '_labels')
                task_field = LabelField(task_name, label_namespace="task_labels")
                domain_field = LabelField(domain_name, label_namespace="domain_labels")
                input_dict = {'task_token': task_field,
                              'domain_token': domain_field,
                              'tokens': sequence}
                all_tags = []
                empty_tags = ['O'] * len(tags)
                for tsk in self._tasks:
                    if tsk != task_name:
                        empty_sequence_tags = SequenceLabelField(empty_tags, sequence,
                                                                 label_namespace=tsk + '_labels')
                        all_tags.append(empty_sequence_tags)
                    else:
                        all_tags.append(sequence_tags)
                input_dict['all_tags'] = ListField(all_tags)
                yield Instance(input_dict)
def text_to_instance(self, data: Dict[str, Any]) -> Instance:
    # Tokenize input sentence
    input = data['input']
    if self._prev:
        input = ' '.join((data['previous_sentence'], input))
    tokenized_input = self._tokenizer.tokenize(input)
    input_field = TextField(tokenized_input, self._token_indexers)

    # Combine and tokenize claims
    properties = data['properties']
    values = data['values']
    qualifiers = data['qualifiers']
    claims_list = []
    for prop, val, quals in zip(properties, values, qualifiers):
        substrings = []
        substrings.extend(['<prop>', prop, '</prop>'])
        substrings.extend(['<val>', val, '</val>'])
        if len(quals) > 0:
            for qp, qv in quals:
                substrings.extend(['<qual_prop>', qp, '</qual_prop>'])
                substrings.extend(['<qual_val>', qv, '</qual_val>'])
        claim_string = ' '.join(substrings)
        tokenized_claim = self._tokenizer.tokenize(claim_string)
        claim_field = TextField(tokenized_claim, self._token_indexers)
        claims_list.append(claim_field)
    claims_field = ListField(claims_list)

    # Stuff everything in a dict
    fields = {
        'inputs': input_field,
        'claims': claims_field,
    }

    # If target labels are provided add as SequenceLabelField
    if 'used' in data:
        labels = ['used' if x else 'not used' for x in data['used']]
        label_field = SequenceLabelField(labels=labels, sequence_field=claims_field)
        fields['labels'] = label_field

    # If target output sequence is provided add as TextField
    if 'target' in data:
        target = data['target']
        tokenized_target = self._tokenizer.tokenize(target)
        fields['targets'] = TextField(tokenized_target, self._token_indexers)

    return Instance(fields)
def text_to_instance(self,  # type: ignore
                     qid: str,
                     start: str,
                     alternatives: List[str],
                     label: Optional[int] = None) -> Instance:
    from allennlp.data.fields import TextField, ListField, MetadataField, IndexField

    # tokenize
    start = self._tokenizer.tokenize(start)

    sequences = []
    for alternative in alternatives:
        alternative = self._tokenizer.tokenize(alternative)
        length_for_start = (self.length_limit - len(alternative)
                            - self._tokenizer.num_special_tokens_for_pair())
        if length_for_start < 0:
            # If the alternative is too long by itself, we take the beginning
            # and add no tokens from the start.
            alternative = alternative[:length_for_start]
            length_for_start = 0
        sequences.append(self._tokenizer.add_special_tokens(start[:length_for_start], alternative))

    # make fields
    sequences = [TextField(sequence, self._token_indexers) for sequence in sequences]
    sequences = ListField(sequences)

    fields = {
        "alternatives": sequences,
        "qid": MetadataField(qid),
    }

    if label is not None:
        if label < 0 or label >= len(sequences):
            raise ValueError(f"Alternative {label} does not exist")
        fields["correct_alternative"] = IndexField(label, sequences)

    return Instance(fields)
def test_all_fields_padded_to_max_length(self):
    list_field = ListField([self.field1, self.field2, self.field3])
    list_field.index(self.vocab)
    array_dict = list_field.as_array(list_field.get_padding_lengths())
    numpy.testing.assert_array_almost_equal(array_dict["words"][0], numpy.array([2, 3, 4, 5, 0]))
    numpy.testing.assert_array_almost_equal(array_dict["words"][1], numpy.array([2, 3, 4, 1, 5]))
    numpy.testing.assert_array_almost_equal(array_dict["words"][2], numpy.array([2, 3, 1, 5, 0]))
def text_to_instance(self,
                     tokenized_sentence: List[str],
                     spans: List[List[int]]) -> Instance:
    allennlp_sentence_tokens = [Token(text=t) for t in tokenized_sentence]
    sentence_token_indexes = TextField(allennlp_sentence_tokens, self._token_indexers)

    span_fields = []
    for span_start, span_end_exclusive in spans:
        span_field = SpanField(span_start, span_end_exclusive - 1, sentence_token_indexes)
        span_fields.append(span_field)

    fields: Dict[str, Field] = {}
    fields["tokens"] = sentence_token_indexes
    fields["spans"] = ListField(span_fields)
    return Instance(fields)
def tokens_to_empath(self, tokens: List[List[List[str]]]) -> ListField:
    def doc_to_empath(doc_str) -> ArrayField:
        results = self.empath_lexicon.analyze(doc_str)
        return ArrayField(
            np.array([results[category] for category in self.lexicon_categories]))

    doc_list = [
        doc_to_empath(" ".join([word
                                for sentence in doc[:self.max_sent]
                                for word in sentence[:self.max_word]]))
        for doc in tokens[-self.max_doc:]
    ]
    return ListField(doc_list)
def text_to_instance(self,  # type: ignore
                     question: str,
                     image: Union[str, Tuple[Tensor, Tensor, Optional[Tensor], Optional[Tensor]]],
                     answer_counts: Optional[MutableMapping[str, int]] = None,
                     *,
                     use_cache: bool = True) -> Optional[Instance]:
    tokenized_question = self._tokenizer.tokenize(question)
    question_field = TextField(tokenized_question, None)

    fields: Dict[str, Field] = {
        "question": question_field,
    }

    if image is not None:
        if isinstance(image, str):
            features, coords, _, _ = next(
                self._process_image_paths([image], use_cache=use_cache))
        else:
            features, coords, _, _ = image

        fields["box_features"] = ArrayField(features)
        fields["box_coordinates"] = ArrayField(coords)
        fields["box_mask"] = ArrayField(
            features.new_ones((features.shape[0],), dtype=torch.bool),
            padding_value=False,
            dtype=torch.bool,
        )

    if answer_counts is not None:
        answer_fields = []
        weights = []
        for answer, count in answer_counts.items():
            if self.answer_vocab is None or answer in self.answer_vocab:
                answer_fields.append(LabelField(answer, label_namespace="answers"))
                weights.append(get_score(count))
        if len(answer_fields) <= 0:
            return None
        fields["labels"] = ListField(answer_fields)
        fields["label_weights"] = ArrayField(torch.tensor(weights))

    return Instance(fields)
def text_to_instance(self,
                     passage_id: str,
                     question_id: str,
                     question_type: str,
                     passage: str,
                     question: str,
                     answer0: str,
                     answer1: str,
                     label0: Optional[str] = None) -> Instance:
    metadata = {
        'passage_id': passage_id,
        'question_id': question_id,
        'question_type': question_type,
    }

    passage_sentences = util.split_sentences(self.sentenciser, passage)
    passage_sentences_tokens = [self.tokeniser.tokenize(sentence)
                                for sentence in passage_sentences]
    passage_fields = [TextField(tokens, self.word_indexers)
                      for tokens in passage_sentences_tokens]

    question_tokens = self.tokeniser.tokenize(question)
    answer0_tokens = self.tokeniser.tokenize(answer0)
    answer1_tokens = self.tokeniser.tokenize(answer1)

    fields = {
        "metadata": MetadataField(metadata),
        "sentences": ListField(passage_fields),
        "question": TextField(question_tokens, self.word_indexers),
        "answer0": TextField(answer0_tokens, self.word_indexers),
        "answer1": TextField(answer1_tokens, self.word_indexers),
    }

    if label0 is not None:
        if label0 == "True":
            label = 0
        elif label0 == 'False':
            label = 1
        else:
            raise ValueError('Wrong value for Answer::correct')
        fields["label"] = LabelField(label=label, skip_indexing=True)

    return Instance(fields)
def text_to_instance(self,  # type: ignore
                     question_text: str,
                     passage_text: str,
                     options_text: List[str],
                     qa_id: str,
                     passage_tokens: List[Token] = None,
                     answer_index: int = None,
                     debate_mode: List[str] = None) -> Instance:
    # pylint: disable=arguments-differ
    additional_metadata = {'id': qa_id, 'debate_mode': debate_mode}
    fields: Dict[str, Field] = {}

    # Tokenize and calculate token offsets (i.e., for wordpiece)
    question_tokens = self._tokenizer.tokenize(question_text)
    if passage_tokens is None:
        passage_tokens = self._tokenizer.tokenize(passage_text)
    options_tokens = self._tokenizer.batch_tokenize(options_text)
    passage_offsets = [(token.idx, token.idx + len(token.text)) for token in passage_tokens]

    # This is separate so we can reference it later with a known type.
    options_field = ListField([TextField(option_tokens, self._token_indexers)
                               for option_tokens in options_tokens])
    fields['passage'] = TextField(passage_tokens, self._token_indexers)
    fields['question'] = TextField(question_tokens, self._token_indexers)
    fields['options'] = options_field
    metadata = {'original_passage': passage_text,
                'token_offsets': passage_offsets,
                'question_tokens': [token.text for token in question_tokens],
                'passage_tokens': [token.text for token in passage_tokens],
                'options_tokens': [[token.text for token in option_tokens]
                                   for option_tokens in options_tokens]}
    if answer_index is not None:
        metadata['answer_texts'] = options_text[answer_index]
        fields['answer_index'] = IndexField(answer_index, options_field)
    metadata.update(additional_metadata)
    fields['metadata'] = MetadataField(metadata)
    return Instance(fields)
def test_as_tensor_can_handle_multiple_token_indexers(self):
    self.field1._token_indexers = self.words_and_characters_indexers
    self.field2._token_indexers = self.words_and_characters_indexers
    self.field3._token_indexers = self.words_and_characters_indexers

    list_field = ListField([self.field1, self.field2, self.field3])
    list_field.index(self.vocab)
    padding_lengths = list_field.get_padding_lengths()
    tensor_dict = list_field.as_tensor(padding_lengths)
    words = tensor_dict["words"].detach().cpu().numpy()
    characters = tensor_dict["characters"].detach().cpu().numpy()
    numpy.testing.assert_array_almost_equal(
        words, numpy.array([[2, 3, 4, 5, 0], [2, 3, 4, 1, 5], [2, 3, 1, 5, 0]])
    )
    numpy.testing.assert_array_almost_equal(
        characters[0],
        numpy.array(
            [
                [5, 1, 1, 2, 0, 0, 0, 0, 0],
                [1, 2, 0, 0, 0, 0, 0, 0, 0],
                [1, 0, 0, 0, 0, 0, 0, 0, 0],
                [2, 3, 4, 5, 3, 4, 6, 3, 0],
                [0, 0, 0, 0, 0, 0, 0, 0, 0],
            ]
        ),
    )
    numpy.testing.assert_array_almost_equal(
        characters[1],
        numpy.array(
            [
                [5, 1, 1, 2, 0, 0, 0, 0, 0],
                [1, 2, 0, 0, 0, 0, 0, 0, 0],
                [1, 0, 0, 0, 0, 0, 0, 0, 0],
                [1, 1, 1, 1, 3, 1, 3, 4, 5],
                [2, 3, 4, 5, 3, 4, 6, 3, 0],
            ]
        ),
    )
    numpy.testing.assert_array_almost_equal(
        characters[2],
        numpy.array(
            [
                [5, 1, 1, 2, 0, 0, 0, 0, 0],
                [1, 2, 0, 0, 0, 0, 0, 0, 0],
                [1, 4, 1, 5, 1, 3, 1, 0, 0],
                [2, 3, 4, 5, 3, 4, 6, 3, 0],
                [0, 0, 0, 0, 0, 0, 0, 0, 0],
            ]
        ),
    )
def text_to_instance(self,  # type: ignore
                     qid: str,
                     question: str,
                     choices: List[str],
                     evidence_top: List[str] = None,
                     answer: str = None) -> Instance:
    # pylint: disable=arguments-differ
    fields: Dict[str, Field] = {}
    question_tokens = self._tokenizer.tokenize(question)
    choice_tokens = self._tokenizer.batch_tokenize(choices)

    if evidence_top:
        evidence_tokens = self._tokenizer.batch_tokenize(evidence_top)
        evidence_tokens_flat = [t for evi in evidence_tokens for t in evi]
    else:
        evidence_tokens_flat = []

    qa_pair_tokens = []
    for c_tokens in choice_tokens:
        qa_pair = question_tokens + evidence_tokens_flat + [Token("[SEP]")] + c_tokens
        qa_pair_tokens.append(qa_pair)

    qa_pairs_field = ListField([TextField(tokens, self._token_indexers)
                                for tokens in qa_pair_tokens])
    if answer:
        fields['answer_index'] = IndexField(self.LABELS.index(answer), qa_pairs_field)
    fields['qa_pairs'] = qa_pairs_field

    metadata = {
        "qid": qid,
        "question": question,
        "choices": choices,
        "question_tokens": [x.text for x in question_tokens],
        "choices_tokens": [[x.text for x in tokens] for tokens in choice_tokens],
    }
    fields["metadata"] = MetadataField(metadata)
    return Instance(fields)
def target_short_sequence_instance(self,  # type: ignore
                                   inst_tokens: Iterable,
                                   inst_arcs: Iterable,
                                   context_length: int = 15,
                                   metadata={"tokens": []}) -> List[Instance]:
    """Generate instances as short sequences (of a given length) before a target turn,
    with labels being the dependency between the target and its head.

    If the head is out of context ? -> force previous turn ? other heuristics ?
    Could also be used to refactor full-chat instances, by providing start="first" line,
    end=end of chat.
    """
    instances = []
    for i, turn in enumerate(inst_tokens):
        start = max(0, i - context_length)
        context_turns = inst_tokens[start:i + 1]
        data = self.extract_data(inst_arcs, metadata, start, i)
        fields: Dict[str, Field] = {}
        seq_field = ListField([TextField(tokenized_line, self._token_indexers)
                               for tokenized_line in context_turns])
        data["tokens"] = context_turns
        data["file_source"] = metadata["file_source"]
        fields["lines"] = seq_field
        try:
            fields["arcs"] = AdjacencyField(data["arcs"], seq_field)
        except:
            print("error at index", i, "interval should be (%d,%d)" % (start, i))
            breakpoint()
        # example additional features, should be a separate function
        features = data["features"]
        # does src address target ?
        fields["rel_features"] = ArrayField(
            target_address_src_matrix(features["speaker"], features["addressee"]))
        # distance between src and target
        fields["offsets"] = ArrayField(turn_distance_matrix(context_turns), dtype=int)
        # is the turn the server ?
        fields["is_server"] = ArrayField(np.array(features["is_server"]))
        # should be listfield too ? one for each line ?
        fields["metadata"] = MetadataField(data)
        instances.append(Instance(fields))
    return instances
def _get_wordpiece_indices_field(wordpieces: List[List[int]]):
    wordpiece_token_indices = []
    ingested_indices = []
    i = 0
    while i < len(wordpieces):
        current_wordpieces = wordpieces[i]
        if len(current_wordpieces) > 1:
            wordpiece_token_indices.append(LabelsField(current_wordpieces, padding_value=-1))
            i = current_wordpieces[-1] + 1
        else:
            i += 1
    # Hack to guarantee a minimal length for the padded field
    # (following dataset_readers.reading_comprehension.drop from allennlp).
    wordpiece_token_indices.append(LabelsField([-1], padding_value=-1))
    return ListField(wordpiece_token_indices)
def text_to_instance(self,  # type: ignore
                     qa_id: int,
                     question: str,
                     answer: Optional[str],
                     image: Union[str, Tuple[Tensor, Tensor, Optional[Tensor], Optional[Tensor]]],
                     use_cache: bool = True,
                     keep_impossible_questions: bool = True) -> Optional[Instance]:
    question_field = TextField(self._tokenizer.tokenize(question), None)

    fields: Dict[str, Field] = {
        "question": question_field,
    }

    if isinstance(image, str):
        features, coords, _, _ = next(
            self._process_image_paths([image], use_cache=use_cache))
    else:
        features, coords, _, _ = image

    fields["box_features"] = ArrayField(features)
    fields["box_coordinates"] = ArrayField(coords)
    fields["box_mask"] = ArrayField(
        features.new_ones((features.shape[0],), dtype=torch.bool),
        padding_value=False,
        dtype=torch.bool,
    )

    if answer is not None:
        labels_fields = []
        weights = []
        if (not self.answer_vocab or answer in self.answer_vocab) or keep_impossible_questions:
            labels_fields.append(LabelField(answer, label_namespace="answers"))
            weights.append(1.0)

        if len(labels_fields) <= 0:
            return None

        fields["label_weights"] = ArrayField(torch.tensor(weights))
        fields["labels"] = ListField(labels_fields)

    return Instance(fields)
def process_evidence_chains(evd_possible_chains, sent_labels_, fields):
    evd_possible_chains_ = []
    if evd_possible_chains is not None:
        for chain in evd_possible_chains:
            if len(chain) == 0 or any([s_idx >= len(sent_labels_) for s_idx in chain]):
                # If there is no possible chain, or any selected sentence in the chain
                # exceeds para_limit, ignore the instance.
                # A chain that starts with 0 will be filtered out in the
                # RLBidirectionalAttentionFlow module.
                chain = [0]
            else:
                # Since index 0 is reserved for eos, shift every index by one,
                # and also add eos at the end.
                chain = [s_idx + 1 for s_idx in chain] + [0]
            evd_possible_chains_.append(chain)
        fields['evd_chain_labels'] = ListField([ArrayField(np.array(ch), padding_value=0)
                                                for ch in evd_possible_chains_])
    return evd_possible_chains_
def text_to_instance(self,
                     image_feature: np.ndarray,
                     captions: List[str] = None,
                     image_name: str = None):
    fields = {
        "image_feature": ArrayField(image_feature),
    }
    if captions is not None:
        text_field_list = []
        for caption in captions:
            tokens = [Token(t) for t in caption.split()]
            text_field_list.append(TextField(tokens, self._token_indexers))
        fields["target_tokens"] = ListField(text_field_list)
    if image_name:
        fields["image_name"] = MetadataField(image_name)
    return Instance(fields)
def test_list_field_can_handle_empty_index_fields(self):
    list_field = ListField([self.index_field, self.index_field, self.empty_index_field])
    list_field.index(self.vocab)
    tensor = list_field.as_tensor(list_field.get_padding_lengths())
    numpy.testing.assert_array_equal(tensor.data.cpu().numpy(),
                                     numpy.array([[1], [1], [-1]]))
def test_get_padding_lengths(self):
    list_field = ListField([self.field1, self.field2, self.field3])
    list_field.index(self.vocab)
    lengths = list_field.get_padding_lengths()
    assert lengths == {"num_fields": 3, "list_words_length": 5, "list_num_tokens": 5}
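# A short end-to-end sketch of the ListField life cycle that the snippets above exercise:
# build per-item fields, wrap them in a ListField, index against a Vocabulary, compute
# padding lengths, and convert to padded tensors. This assumes the same pre-1.0-style
# AllenNLP API used by the tests (flat tensor_dict["words"]); the sentences and namespace
# names here are illustrative only.
from allennlp.data import Vocabulary
from allennlp.data.fields import ListField, TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import Token

indexers = {"words": SingleIdTokenIndexer(namespace="words")}
sentences = [["this", "is", "a", "sentence"],
             ["this", "is", "a", "longer", "sentence"]]
fields = [TextField([Token(t) for t in sent], indexers) for sent in sentences]

vocab = Vocabulary()
for sent in sentences:
    for word in sent:
        vocab.add_token_to_namespace(word, namespace="words")

list_field = ListField(fields)
list_field.index(vocab)
padding_lengths = list_field.get_padding_lengths()   # num_fields plus per-field token lengths
tensor_dict = list_field.as_tensor(padding_lengths)  # {"words": LongTensor of shape (2, 5)},
                                                     # with the shorter sentence zero-padded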