Example #1
    def test_as_tensor_can_handle_multiple_token_indexers_and_empty_fields(self):
        # pylint: disable=protected-access
        self.field1._token_indexers = self.words_and_characters_indexers
        self.field2._token_indexers = self.words_and_characters_indexers
        self.field3._token_indexers = self.words_and_characters_indexers

        list_field = ListField([self.field1.empty_field(), self.field1, self.field2])
        list_field.index(self.vocab)
        padding_lengths = list_field.get_padding_lengths()
        tensor_dict = list_field.as_tensor(padding_lengths)
        words = tensor_dict["words"].detach().cpu().numpy()
        characters = tensor_dict["characters"].detach().cpu().numpy()

        numpy.testing.assert_array_almost_equal(words, numpy.array([[0, 0, 0, 0, 0],
                                                                    [2, 3, 4, 5, 0],
                                                                    [2, 3, 4, 1, 5]]))

        numpy.testing.assert_array_almost_equal(characters[0], numpy.zeros([5, 9]))

        numpy.testing.assert_array_almost_equal(characters[1], numpy.array([[5, 1, 1, 2, 0, 0, 0, 0, 0],
                                                                            [1, 2, 0, 0, 0, 0, 0, 0, 0],
                                                                            [1, 0, 0, 0, 0, 0, 0, 0, 0],
                                                                            [2, 3, 4, 5, 3, 4, 6, 3, 0],
                                                                            [0, 0, 0, 0, 0, 0, 0, 0, 0]]))

        numpy.testing.assert_array_almost_equal(characters[2], numpy.array([[5, 1, 1, 2, 0, 0, 0, 0, 0],
                                                                            [1, 2, 0, 0, 0, 0, 0, 0, 0],
                                                                            [1, 0, 0, 0, 0, 0, 0, 0, 0],
                                                                            [1, 1, 1, 1, 3, 1, 3, 4, 5],
                                                                            [2, 3, 4, 5, 3, 4, 6, 3, 0]]))
Example #2
 def test_list_field_can_handle_empty_text_fields(self):
     list_field = ListField([self.field1, self.field2, self.empty_text_field])
     list_field.index(self.vocab)
     tensor_dict = list_field.as_tensor(list_field.get_padding_lengths())
     numpy.testing.assert_array_equal(tensor_dict["words"].detach().cpu().numpy(),
                                      numpy.array([[2, 3, 4, 5, 0],
                                                   [2, 3, 4, 1, 5],
                                                   [0, 0, 0, 0, 0]]))
Example #3
 def test_list_field_can_handle_empty_sequence_label_fields(self):
     list_field = ListField([self.sequence_label_field,
                             self.sequence_label_field,
                             self.empty_sequence_label_field])
     list_field.index(self.vocab)
     tensor = list_field.as_tensor(list_field.get_padding_lengths())
     numpy.testing.assert_array_equal(tensor.detach().cpu().numpy(),
                                      numpy.array([[1, 1, 0, 1],
                                                   [1, 1, 0, 1],
                                                   [0, 0, 0, 0]]))
Example #4
 def test_all_fields_padded_to_max_length(self):
     list_field = ListField([self.field1, self.field2, self.field3])
     list_field.index(self.vocab)
     tensor_dict = list_field.as_tensor(list_field.get_padding_lengths())
     numpy.testing.assert_array_almost_equal(tensor_dict["words"][0].detach().cpu().numpy(),
                                             numpy.array([2, 3, 4, 5, 0]))
     numpy.testing.assert_array_almost_equal(tensor_dict["words"][1].detach().cpu().numpy(),
                                             numpy.array([2, 3, 4, 1, 5]))
     numpy.testing.assert_array_almost_equal(tensor_dict["words"][2].detach().cpu().numpy(),
                                             numpy.array([2, 3, 1, 5, 0]))
Example #5
    def test_padding_handles_list_fields_with_padding_values(self):
        array1 = ArrayField(numpy.ones([2, 3]), padding_value=-1)
        array2 = ArrayField(numpy.ones([1, 5]), padding_value=-1)
        empty_array = array1.empty_field()
        list_field = ListField([array1, array2, empty_array])

        returned_tensor = list_field.as_tensor(list_field.get_padding_lengths()).detach().cpu().numpy()
        correct_tensor = numpy.array([[[1., 1., 1., -1., -1.],
                                       [1., 1., 1., -1., -1.]],
                                      [[1., 1., 1., 1., 1.],
                                       [-1., -1., -1., -1., -1.]],
                                      [[-1., -1., -1., -1., -1.],
                                       [-1., -1., -1., -1., -1.]]])
        numpy.testing.assert_array_equal(returned_tensor, correct_tensor)
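
The padding_value behaviour exercised above also holds for a single ArrayField outside a ListField. A minimal sketch, assuming the 0.9-era allennlp API that these examples target:

    import numpy
    from allennlp.data.fields import ArrayField

    # An ArrayField remembers its padding_value; padding beyond the array's own
    # shape fills the new cells with that value instead of zeros.
    field = ArrayField(numpy.ones([2, 3]), padding_value=-1)
    lengths = field.get_padding_lengths()   # {'dimension_0': 2, 'dimension_1': 3}
    lengths['dimension_1'] = 5              # request two extra columns
    tensor = field.as_tensor(lengths)       # shape (2, 5); the last two columns are -1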
Example #6
 def test_fields_can_pad_to_greater_than_max_length(self):
     list_field = ListField([self.field1, self.field2, self.field3])
     list_field.index(self.vocab)
     padding_lengths = list_field.get_padding_lengths()
     padding_lengths["list_words_length"] = 7
     padding_lengths["num_fields"] = 5
     tensor_dict = list_field.as_tensor(padding_lengths)
     numpy.testing.assert_array_almost_equal(tensor_dict["words"][0].detach().cpu().numpy(),
                                             numpy.array([2, 3, 4, 5, 0, 0, 0]))
     numpy.testing.assert_array_almost_equal(tensor_dict["words"][1].detach().cpu().numpy(),
                                             numpy.array([2, 3, 4, 1, 5, 0, 0]))
     numpy.testing.assert_array_almost_equal(tensor_dict["words"][2].detach().cpu().numpy(),
                                             numpy.array([2, 3, 1, 5, 0, 0, 0]))
     numpy.testing.assert_array_almost_equal(tensor_dict["words"][3].detach().cpu().numpy(),
                                             numpy.array([0, 0, 0, 0, 0, 0, 0]))
     numpy.testing.assert_array_almost_equal(tensor_dict["words"][4].detach().cpu().numpy(),
                                             numpy.array([0, 0, 0, 0, 0, 0, 0]))
Example #7
    def test_doubly_nested_field_works(self):
        field1 = ProductionRuleField('S -> [NP, VP]', is_global_rule=True)
        field2 = ProductionRuleField('NP -> test', is_global_rule=True)
        field3 = ProductionRuleField('VP -> eat', is_global_rule=False)
        list_field = ListField([ListField([field1, field2, field3]),
                                ListField([field1, field2])])
        list_field.index(self.vocab)
        padding_lengths = list_field.get_padding_lengths()
        tensors = list_field.as_tensor(padding_lengths)
        assert isinstance(tensors, list)
        assert len(tensors) == 2
        assert isinstance(tensors[0], list)
        assert len(tensors[0]) == 3
        assert isinstance(tensors[1], list)
        assert len(tensors[1]) == 3

        tensor_tuple = tensors[0][0]
        assert tensor_tuple[0] == 'S -> [NP, VP]'
        assert tensor_tuple[1] is True
        assert_almost_equal(tensor_tuple[2].detach().cpu().numpy(), [self.s_rule_index])

        tensor_tuple = tensors[0][1]
        assert tensor_tuple[0] == 'NP -> test'
        assert tensor_tuple[1] is True
        assert_almost_equal(tensor_tuple[2].detach().cpu().numpy(), [self.np_index])

        tensor_tuple = tensors[0][2]
        assert tensor_tuple[0] == 'VP -> eat'
        assert tensor_tuple[1] is False
        assert tensor_tuple[2] is None

        tensor_tuple = tensors[1][0]
        assert tensor_tuple[0] == 'S -> [NP, VP]'
        assert tensor_tuple[1] is True
        assert_almost_equal(tensor_tuple[2].detach().cpu().numpy(), [self.s_rule_index])

        tensor_tuple = tensors[1][1]
        assert tensor_tuple[0] == 'NP -> test'
        assert tensor_tuple[1] is True
        assert_almost_equal(tensor_tuple[2].detach().cpu().numpy(), [self.np_index])

        # This item was just padding.
        tensor_tuple = tensors[1][2]
        assert tensor_tuple[0] == ''
        assert tensor_tuple[1] is False
        assert tensor_tuple[2] is None
Example #8
 def test_nested_list_fields_are_padded_correctly(self):
     nested_field1 = ListField([LabelField(c) for c in ['a', 'b', 'c', 'd', 'e']])
     nested_field2 = ListField([LabelField(c) for c in ['f', 'g', 'h', 'i', 'j', 'k']])
     list_field = ListField([nested_field1.empty_field(), nested_field1, nested_field2])
     list_field.index(self.vocab)
     padding_lengths = list_field.get_padding_lengths()
     assert padding_lengths == {'num_fields': 3, 'list_num_fields': 6}
     tensor = list_field.as_tensor(padding_lengths).detach().cpu().numpy()
     numpy.testing.assert_almost_equal(tensor, [[-1, -1, -1, -1, -1, -1],
                                                [0, 1, 2, 3, 4, -1],
                                                [5, 6, 7, 8, 9, 10]])
Example #9
 def test_nested_list_fields_are_padded_correctly(self):
     nested_field1 = ListField([LabelField(c) for c in ['a', 'b', 'c', 'd', 'e']])
     nested_field2 = ListField([LabelField(c) for c in ['f', 'g', 'h', 'i', 'j', 'k']])
     list_field = ListField([nested_field1.empty_field(), nested_field1, nested_field2])
     list_field.index(self.vocab)
     padding_lengths = list_field.get_padding_lengths()
     assert padding_lengths == {'num_fields': 3, 'list_num_fields': 6}
     tensor = list_field.as_tensor(padding_lengths).data.cpu().numpy()
     numpy.testing.assert_almost_equal(tensor, [[[-1], [-1], [-1], [-1], [-1], [-1]],
                                                [[0], [1], [2], [3], [4], [-1]],
                                                [[5], [6], [7], [8], [9], [10]]])
Example #10
    def text_to_instance(
            self,  # type: ignore
            sentences: List[str],
            passage: str,
            columns: str,
            column_start_spans,
            column_end_spans,
            value_start_spans,
            value_end_spans,
            sqls,
            passage_tokens: List[Token] = None,
            column_tokens: List[Token] = None,
            sentence_tokens: List[List[Token]] = None,
            yesno_list: List[int] = None,
            metadata: Dict[str, Any] = None) -> Instance:

        passage_field = TextField(passage_tokens, self._token_indexers)
        columns_field = TextField(column_tokens, self._token_indexers)
        sentences_field = ListField([
            TextField(s_tokens, self._token_indexers)
            for s_tokens in sentence_tokens
        ])

        fields = {
            'passage': passage_field,
            'sentence': sentences_field,
            'column': columns_field
        }
        col_start_list = []
        col_end_list = []
        for s, e in zip(column_start_spans, column_end_spans):
            col_start_list.append(IndexField(s, passage_field))
            col_end_list.append(IndexField(e, passage_field))
        fields['col_start_idx'] = ListField(col_start_list)
        fields['col_end_idx'] = ListField(col_end_list)

        val_start_list = []
        val_end_list = []
        for s, e in zip(value_start_spans, value_end_spans):
            val_start_list.append(IndexField(s, passage_field))
            val_end_list.append(IndexField(e, passage_field))

        fields['val_start_idx'] = ListField(val_start_list)
        fields['val_end_idx'] = ListField(val_end_list)

        # Guard against the default of None before writing into the metadata dict.
        if metadata is None:
            metadata = {}
        metadata['origin_passage'] = passage
        metadata['passage_tokens'] = passage_tokens
        metadata['column_tokens'] = column_tokens
        metadata['sentence_tokens'] = sentence_tokens
        metadata['sqls'] = sqls
        fields['yesno_list'] = ListField([
            LabelField(yesno, label_namespace="yesno_labels")
            for yesno in yesno_list
        ])
        fields['metadata'] = MetadataField(metadata)

        return Instance(fields)
Example #11
 def test_list_field_can_handle_empty_text_fields(self):
     list_field = ListField(
         [self.field1, self.field2, self.empty_text_field])
     list_field.index(self.vocab)
     tensor_dict = list_field.as_tensor(list_field.get_padding_lengths())
     numpy.testing.assert_array_equal(
         tensor_dict["words"].detach().cpu().numpy(),
         numpy.array([[2, 3, 4, 5, 0], [2, 3, 4, 1, 5], [0, 0, 0, 0, 0]]),
     )
Example #12
 def test_list_field_can_handle_empty_sequence_label_fields(self):
     list_field = ListField(
         [self.sequence_label_field, self.sequence_label_field, self.empty_sequence_label_field]
     )
     list_field.index(self.vocab)
     tensor = list_field.as_tensor(list_field.get_padding_lengths())
     numpy.testing.assert_array_equal(
         tensor.detach().cpu().numpy(), numpy.array([[1, 1, 0, 1], [1, 1, 0, 1], [0, 0, 0, 0]])
     )
Example #13
    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        if "text" in json_dict:
            text = json_dict["text"]
            words = [
                text[t["span"]["start"]:t["span"]["end"]]
                for t in json_dict["tokens"]
            ]
        else:
            words = json_dict["tokens"]

        tokens = [Token(w) for w in words]
        # The attribute (_dataset_reader._token_indexers) is added by our DataReader!
        # This is not generally applicable...
        token_indexers = self._dataset_reader._token_indexers
        sequence = TextField(tokens, token_indexers=token_indexers)

        context_size = len(words) + 1
        spans = []
        span_masks = []
        for start, end in enumerate_spans(
                tokens, max_span_width=self._dataset_reader._max_span_width):
            spans.append(SpanField(start, end, sequence))
            span_masks.append(create_mask(start, end, context_size))

        span_field = ListField(spans)
        # span_tuples = [(span.span_start, span.span_end) for span in spans]
        span_mask_field = ListField([
            ArrayField(np.array(si, dtype=np.int64), dtype=np.int64)
            for si in span_masks
        ])
        instance_fields: Dict[str, Field] = {
            "tokens": sequence,
            "metadata": MetadataField({"words": [x.text for x in tokens]}),
            "spans": span_field,
            "span_masks": span_mask_field
        }
        return Instance(instance_fields)
Beispiel #14
0
    def _make_instance_from_text(self, sent_tokens, pred_index, annotations = None, sent_id = None):
        instance_dict = {}

        if isinstance(sent_tokens, str):
            sent_tokens = sent_tokens.split()
        sent_tokens = cleanse_sentence_text(sent_tokens)
        text_field = TextField([Token(t) for t in sent_tokens], self._token_indexers)
        instance_dict['text'] = text_field
        instance_dict['predicate_indicator'] = SequenceLabelField([1 if i == pred_index else 0 for i in range(len(sent_tokens))], text_field)

        if annotations is not None:
            for i, slot_name in enumerate(self._slot_labels):
                span_slot = ListField([LabelField(ann.slots[i], label_namespace="slot_%s"%slot_name) for ann in annotations for span in ann.all_spans])
                instance_dict['span_slot_%s'%slot_name] = span_slot

            labeled_span_field = ListField([SpanField(span.start(), span.end(), text_field) for ann in annotations for span in ann.all_spans])
            instance_dict['labeled_spans'] = labeled_span_field

            if self._bio_labels:
                bio_labels = ["O"] * len(sent_tokens)

                bio_labels[pred_index] = "B-V"

                for span in self._resolve_spans(annotations, pred_index):
                    bio_labels[span.start()] = "B-ARG"
                    for i in range(span.start()+1, span.end()+1):
                        bio_labels[i] = "I-ARG"
                instance_dict["bio_label"] = SequenceLabelField(bio_labels, text_field, label_namespace="bio_labels")

            instance_dict['annotations'] = MetadataField({'annotations':annotations})

        metadata = {'pred_index' : pred_index, 'sent_text': " ".join(sent_tokens)}
        if sent_id is not None:
            metadata['sent_id'] = sent_id
        instance_dict['metadata'] = MetadataField(metadata)

        return Instance(instance_dict)
Example #15
    def text_to_instance(self,  # type: ignore
                         query: List[str],
                         prelinked_entities: Dict[str, Dict[str, str]] = None,
                         sql: List[str] = None) -> Instance:
        # pylint: disable=arguments-differ
        fields: Dict[str, Field] = {}
        tokens = TextField([Token(t) for t in query], self._token_indexers)
        fields["tokens"] = tokens

        if sql is not None:
            try:
                action_sequence, all_actions = self._world.get_action_sequence_and_all_actions(sql,
                                                                                               prelinked_entities)
            except ParseError:
                return None

        index_fields: List[Field] = []
        production_rule_fields: List[Field] = []

        for production_rule in all_actions:
            nonterminal, _ = production_rule.split(' ->')
            production_rule = ' '.join(production_rule.split(' '))
            field = ProductionRuleField(production_rule, self._world.is_global_rule(nonterminal))
            production_rule_fields.append(field)

        valid_actions_field = ListField(production_rule_fields)
        fields["valid_actions"] = valid_actions_field

        action_map = {action.rule: i # type: ignore
                      for i, action in enumerate(valid_actions_field.field_list)}

        for production_rule in action_sequence:
            index_fields.append(IndexField(action_map[production_rule], valid_actions_field))

        action_sequence_field = ListField(index_fields)
        fields["action_sequence"] = action_sequence_field
        return Instance(fields)
Example #16
 def text_to_instance(self, tokens: List[Token], relations=None) -> Instance:
     # pylint: disable=arguments-differ
     fields: Dict[str, Field] = {}
     text_field = TextField(tokens, token_indexers=self._token_indexers)
     fields["text"] = text_field
     if relations is not None:
         field_list = []
         for relation in relations:
             field_list.append(
                 SequenceLabelField(
                     labels=relation, sequence_field=text_field, label_namespace=self._label_namespace
                 )
             )
         fields["relations"] = ListField(field_list=field_list)
     return Instance(fields)
Example #17
    def text_to_instance(
            self,  # type: ignore
            premise: str,
            hypotheses: List[str],
            labels: List[str] = None) -> Instance:
        # pylint: disable=arguments-differ
        fields: Dict[str, Field] = {}
        premise_tokens = self._tokenizer.tokenize(premise)
        fields['premise'] = TextField(premise_tokens, self._token_indexers)
        all_hypotheses_fields = list()
        for hypothesis in hypotheses:
            hypothesis_tokens = self._tokenizer.tokenize(hypothesis)
            all_hypotheses_fields.append(
                TextField(hypothesis_tokens, self._token_indexers))
        fields['hypotheses'] = ListField(all_hypotheses_fields)
        # Initialise outside the branch so the metadata below is defined even
        # when no labels are given.
        all_labels_fields = list()
        if labels:
            for label in labels:
                all_labels_fields.append(LabelField(label))
            fields['labels'] = ListField(all_labels_fields)

        metadata = {"labels": all_labels_fields}
        fields["metadata"] = MetadataField(metadata)
        return Instance(fields)
Example #18
 def text_to_instance(
     self,
     image: np.ndarray,
     label_box: List[List[float]] = list(),
     label_class: List[str] = list(),
     keypoints: List[List[Tuple[float, float, float]]] = list()
 ) -> Instance:
     if self._keypoint_name in self._include_fields:
         # protect against some augmentations not supporting keypoints
         img, _, label_box, label_class, keypoints = self.augment(
             image,
             boxes=[np.array(b) for b in label_box],
             category_id=label_class,
             keypoints=keypoints)
     else:
         img, _, label_box, label_class, _ = self.augment(
             image,
             boxes=[np.array(b) for b in label_box],
             category_id=label_class)
     h, w, c = img.shape
     fields: Dict[str, Field] = {}
     fields['image'] = ImageField(img.transpose(2, 0, 1),
                                  channels_first=False)
     fields['image_sizes'] = ArrayField(np.array([w, h]))
     if self._bbox_name in self._include_fields and len(label_box) > 0:
         box_fields = [BoundingBoxField(x) for x in label_box]
         fields['boxes'] = ListField(box_fields)
     if self._bbox_class_name in self._include_fields and len(
             label_class) > 0:
         fields['box_classes'] = ListField(
             [LabelField(idx) for idx in label_class])
     if self._keypoint_name in self._include_fields and len(keypoints) > 0:
         assert all([len(kp) == len(keypoints[0]) for kp in keypoints])
         fields['keypoint_positions'] = ListField(
             [KeypointField(kp) for kp in keypoints])
     return Instance(fields)
Example #19
    def text_to_instance(self,
                         saifa_text_list: List[ArrayField],
                         raisha_text_list: List[ArrayField],
                         labels: List[str] = None,
                         metadata: Dict = None) -> Instance:
        raisha_text_list = ListField(raisha_text_list)
        fields = {'source': raisha_text_list}

        saifa_text_list = ListField(saifa_text_list)
        fields['target'] = saifa_text_list

        if labels:
            seq_labels_field = SequenceLabelField(
                labels=labels, sequence_field=saifa_text_list)
            fields['seq_labels'] = seq_labels_field
            reg_labels = [0 if label == 'hotel' else 1 for label in labels]
            reg_label_field = FloatLabelField(
                sum(reg_labels) / len(reg_labels))
            fields['reg_labels'] = reg_label_field

        if metadata is not None:
            fields['metadata'] = MetadataField(metadata)

        return Instance(fields)
Example #21
    def text_to_instance(self,
                         text: str,
                         labels: List[str] = None,
                         header: List[str] = None) -> Instance:  # type: ignore
        # pylint: disable=arguments-differ
        tokenized_text = self._tokenizer.tokenize(text)
        text_field = TextField(tokenized_text, self._token_indexers)
        fields = {'text': text_field}
        if not labels:
            labels = [0 for i in range(237)]

        fields['labels'] = ListField(
            [LabelField(int(l), skip_indexing=True) for l in labels])
        fields['metadata'] = MetadataField(header)
        return Instance(fields)
Example #22
    def text_to_instance(self,
                         source_string: str,
                         target_string: str = None,
                         alignment: str = None) -> Instance:  # type: ignore
        # pylint: disable=arguments-differ
        tokenized_source = self._source_tokenizer.tokenize(source_string)
        if self._source_add_start_token:
            tokenized_source.insert(0, Token(START_SYMBOL))
        tokenized_source.append(Token(END_SYMBOL))
        source_field = TextField(tokenized_source, self._source_token_indexers)

        if target_string is not None:
            tokenized_target = self._target_tokenizer.tokenize(target_string)
            if self._remove_unneeded_aliases:
                new_target = tu.clean_unneeded_aliases(
                    [token.text for token in tokenized_target])
                tokenized_target = [Token(t) for t in new_target]
            tokenized_target.insert(0, Token(START_SYMBOL))
            tokenized_target.append(Token(END_SYMBOL))
            target_field = TextField(tokenized_target,
                                     self._target_token_indexers)

            alignment_index_fields: List[IndexField] = []
            if alignment:
                tokenized_alignment = self._source_tokenizer.tokenize(alignment)
                tmp_source_tokenized_strings = [t.text for t in tokenized_source]
                for aligned_token in tokenized_alignment:
                    try:
                        aligned_token_index = int(
                            tmp_source_tokenized_strings.index(aligned_token.text))
                    except ValueError:
                        # Since START_SYMBOL is added, no step should be aligned to it,
                        # so index 0 can be used as a special "no alignment" index.
                        aligned_token_index = 0
                    alignment_index_fields.append(
                        IndexField(aligned_token_index, source_field))
            if not alignment_index_fields:
                # If there was no alignment (it was None or ""), add dummy alignments.
                for _ in range(len(tokenized_target) - 2):
                    alignment_index_fields.append(IndexField(0, source_field))
            alignment_field = ListField(alignment_index_fields)

            return Instance({
                "source_tokens": source_field,
                "target_tokens": target_field,
                "alignment_sequence": alignment_field
            })
        else:
            return Instance({'source_tokens': source_field})
Example #23
 def test_all_fields_padded_to_max_length(self):
     list_field = ListField([self.field1, self.field2, self.field3])
     list_field.index(self.vocab)
     tensor_dict = list_field.as_tensor(list_field.get_padding_lengths())
     numpy.testing.assert_array_almost_equal(tensor_dict[u"words"][0].detach().cpu().numpy(),
                                             numpy.array([2, 3, 4, 5, 0]))
     numpy.testing.assert_array_almost_equal(tensor_dict[u"words"][1].detach().cpu().numpy(),
                                             numpy.array([2, 3, 4, 1, 5]))
     numpy.testing.assert_array_almost_equal(tensor_dict[u"words"][2].detach().cpu().numpy(),
                                             numpy.array([2, 3, 1, 5, 0]))
Example #24
 def _read(self, file_path):
     # if `file_path` is a URL, redirect to the cache
     # file_path = cached_path(file_path)
     for filename in os.listdir(file_path):
         filename_splitted = filename.split('_')
         task_name = filename_splitted[-3]
         domain_name = filename_splitted[-2]
         if task_name not in self._tasks or domain_name not in self._domains:
             continue
         with open(os.path.join(file_path, filename), "r") as data_file:
             logger.info("Reading instances from lines in file at: %s",
                         filename)
             for line in Tqdm.tqdm(data_file):
                 line = line.strip("\n")
                 # skip blank lines
                 if not line:
                     continue
                 tokens_and_tags = [
                     pair.rsplit(self._word_tag_delimiter, 1)
                     for pair in line.split(self._token_delimiter)
                 ]
                 tokens = [Token(token) for token, tag in tokens_and_tags]
                 tags = [tag for token, tag in tokens_and_tags]
                 sequence = TextField(tokens, self._token_indexers)
                 sequence_tags = SequenceLabelField(
                     tags, sequence, label_namespace=task_name + '_labels')
                 task_field = LabelField(task_name,
                                         label_namespace="task_labels")
                 domain_field = LabelField(domain_name,
                                           label_namespace="domain_labels")
                 input_dict = {
                     'task_token': task_field,
                     'domain_token': domain_field,
                     'tokens': sequence
                 }
                 all_tags = []
                 empty_tags = ['O'] * len(tags)
                 for tsk in self._tasks:
                     if tsk != task_name:
                         empty_sequence_tags = SequenceLabelField(
                             empty_tags,
                             sequence,
                             label_namespace=tsk + '_labels')
                         all_tags.append(empty_sequence_tags)
                     else:
                         all_tags.append(sequence_tags)
                 input_dict['all_tags'] = ListField(all_tags)
                 yield Instance(input_dict)
Example #25
    def text_to_instance(self, data: Dict[str, Any]) -> Instance:
        # Tokenize input sentence
        input = data['input']
        if self._prev:
            input = ' '.join((data['previous_sentence'], input))
        tokenized_input = self._tokenizer.tokenize(input)
        input_field = TextField(tokenized_input, self._token_indexers)

        # Combine and tokenize claims
        properties = data['properties']
        values = data['values']
        qualifiers = data['qualifiers']
        claims_list = []
        for prop, val, quals in zip(properties, values, qualifiers):
            substrings = []
            substrings.extend(['<prop>', prop, '</prop>'])
            substrings.extend(['<val>', val, '</val>'])
            if len(quals) > 0:
                for qp, qv in quals:
                    substrings.extend(['<qual_prop>', qp, '</qual_prop>'])
                    substrings.extend(['<qual_val>', qv, '</qual_val>'])
            claim_string = ' '.join(substrings)
            tokenized_claim = self._tokenizer.tokenize(claim_string)
            claim_field = TextField(tokenized_claim, self._token_indexers)
            claims_list.append(claim_field)
        claims_field = ListField(claims_list)

        # Stuff everything in a dict
        fields = {
            'inputs': input_field,
            'claims': claims_field,
        }

        # If target labels are provided add as SequenceLabelField
        if 'used' in data:
            labels = ['used' if x else 'not used' for x in data['used']]
            label_field = SequenceLabelField(labels=labels,
                                             sequence_field=claims_field)
            fields['labels'] = label_field

        # If target output sequence is provided add as TextField
        if 'target' in data:
            target = data['target']
            tokenized_target = self._tokenizer.tokenize(target)
            fields['targets'] = TextField(tokenized_target,
                                          self._token_indexers)

        return Instance(fields)
Example #26
    def text_to_instance(
        self,  # type: ignore
        qid: str,
        start: str,
        alternatives: List[str],
        label: Optional[int] = None,
    ) -> Instance:
        # tokenize
        start = self._tokenizer.tokenize(start)

        sequences = []
        for alternative in alternatives:
            alternative = self._tokenizer.tokenize(alternative)
            length_for_start = (self.length_limit - len(alternative) -
                                self._tokenizer.num_special_tokens_for_pair())
            if length_for_start < 0:
                # If the alternative is too long by itself, we take the beginning and add no tokens from the start.
                alternative = alternative[:length_for_start]
                length_for_start = 0
            sequences.append(
                self._tokenizer.add_special_tokens(start[:length_for_start],
                                                   alternative))

        # make fields
        from allennlp.data.fields import TextField

        sequences = [
            TextField(sequence, self._token_indexers) for sequence in sequences
        ]
        from allennlp.data.fields import ListField

        sequences = ListField(sequences)

        from allennlp.data.fields import MetadataField

        fields = {
            "alternatives": sequences,
            "qid": MetadataField(qid),
        }

        if label is not None:
            if label < 0 or label >= len(sequences):
                raise ValueError(f"Alternative {label} does not exist")
            from allennlp.data.fields import IndexField

            fields["correct_alternative"] = IndexField(label, sequences)

        return Instance(fields)
Example #27
 def test_all_fields_padded_to_max_length(self):
     list_field = ListField([self.field1, self.field2, self.field3])
     list_field.index(self.vocab)
     array_dict = list_field.as_array(list_field.get_padding_lengths())
     numpy.testing.assert_array_almost_equal(array_dict["words"][0],
                                             numpy.array([2, 3, 4, 5, 0]))
     numpy.testing.assert_array_almost_equal(array_dict["words"][1],
                                             numpy.array([2, 3, 4, 1, 5]))
     numpy.testing.assert_array_almost_equal(array_dict["words"][2],
                                             numpy.array([2, 3, 1, 5, 0]))
Example #28
    def text_to_instance(self, tokenized_sentence: List[str],
                         spans: List[List[int]]) -> Instance:
        allennlp_sentence_tokens = [Token(text=t) for t in tokenized_sentence]
        sentence_token_indexes = TextField(allennlp_sentence_tokens,
                                           self._token_indexers)

        span_fields = []
        for span_start, span_end_exclusive in spans:
            span_field = SpanField(span_start, span_end_exclusive - 1,
                                   sentence_token_indexes)
            span_fields.append(span_field)

        fields: Dict[str, Field] = {}
        fields["tokens"] = sentence_token_indexes
        fields["spans"] = ListField(span_fields)
        return Instance(fields)
Example #29
    def tokens_to_empath(self, tokens: List[List[List[str]]]) -> ListField:
        def doc_to_empath(doc_str) -> ArrayField:
            results = self.empath_lexicon.analyze(doc_str)
            return ArrayField(
                np.array([
                    results[category] for category in self.lexicon_categories
                ]))

        doc_list = [
            doc_to_empath(" ".join([
                word for sentence in doc[:self.max_sent]
                for word in sentence[:self.max_word]
            ])) for doc in tokens[-self.max_doc:]
        ]

        return ListField(doc_list)
Example #30
    def text_to_instance(
        self,  # type: ignore
        question: str,
        image: Union[str, Tuple[Tensor, Tensor, Optional[Tensor], Optional[Tensor]]],
        answer_counts: Optional[MutableMapping[str, int]] = None,
        *,
        use_cache: bool = True,
    ) -> Optional[Instance]:
        tokenized_question = self._tokenizer.tokenize(question)
        question_field = TextField(tokenized_question, None)

        fields: Dict[str, Field] = {
            "question": question_field,
        }

        if image is not None:
            if isinstance(image, str):
                features, coords, _, _ = next(
                    self._process_image_paths([image], use_cache=use_cache)
                )
            else:
                features, coords, _, _ = image

            fields["box_features"] = ArrayField(features)
            fields["box_coordinates"] = ArrayField(coords)
            fields["box_mask"] = ArrayField(
                features.new_ones((features.shape[0],), dtype=torch.bool),
                padding_value=False,
                dtype=torch.bool,
            )

        if answer_counts is not None:
            answer_fields = []
            weights = []

            for answer, count in answer_counts.items():
                if self.answer_vocab is None or answer in self.answer_vocab:
                    answer_fields.append(LabelField(answer, label_namespace="answers"))
                    weights.append(get_score(count))

            if len(answer_fields) <= 0:
                return None

            fields["labels"] = ListField(answer_fields)
            fields["label_weights"] = ArrayField(torch.tensor(weights))

        return Instance(fields)
Example #31
    def text_to_instance(self,
                         passage_id: str,
                         question_id: str,
                         question_type: str,
                         passage: str,
                         question: str,
                         answer0: str,
                         answer1: str,
                         label0: Optional[str] = None) -> Instance:
        metadata = {
            'passage_id': passage_id,
            'question_id': question_id,
            'question_type': question_type,
        }

        passage_sentences = util.split_sentences(self.sentenciser, passage)
        passage_sentences_tokens = [
            self.tokeniser.tokenize(sentence) for sentence in passage_sentences
        ]
        passage_fields = [
            TextField(tokens, self.word_indexers)
            for tokens in passage_sentences_tokens
        ]

        question_tokens = self.tokeniser.tokenize(question)
        answer0_tokens = self.tokeniser.tokenize(answer0)
        answer1_tokens = self.tokeniser.tokenize(answer1)

        fields = {
            "metadata": MetadataField(metadata),
            "sentences": ListField(passage_fields),
            "question": TextField(question_tokens, self.word_indexers),
            "answer0": TextField(answer0_tokens, self.word_indexers),
            "answer1": TextField(answer1_tokens, self.word_indexers),
        }

        if label0 is not None:
            if label0 == "True":
                label = 0
            elif label0 == 'False':
                label = 1
            else:
                raise ValueError('Wrong value for Answer::correct')

            fields["label"] = LabelField(label=label, skip_indexing=True)

        return Instance(fields)
Example #32
    def text_to_instance(
            self,  # type: ignore
            question_text: str,
            passage_text: str,
            options_text: List[str],
            qa_id: str,
            passage_tokens: List[Token] = None,
            answer_index: int = None,
            debate_mode: List[str] = None) -> Instance:
        # pylint: disable=arguments-differ
        additional_metadata = {'id': qa_id, 'debate_mode': debate_mode}
        fields: Dict[str, Field] = {}

        # Tokenize and calculate token offsets (i.e., for wordpiece)
        question_tokens = self._tokenizer.tokenize(question_text)
        if passage_tokens is None:
            passage_tokens = self._tokenizer.tokenize(passage_text)
        options_tokens = self._tokenizer.batch_tokenize(options_text)
        passage_offsets = [(token.idx, token.idx + len(token.text))
                           for token in passage_tokens]

        # This is separate so we can reference it later with a known type.
        options_field = ListField([
            TextField(option_tokens, self._token_indexers)
            for option_tokens in options_tokens
        ])
        fields['passage'] = TextField(passage_tokens, self._token_indexers)
        fields['question'] = TextField(question_tokens, self._token_indexers)
        fields['options'] = options_field
        metadata = {
            'original_passage': passage_text,
            'token_offsets': passage_offsets,
            'question_tokens': [token.text for token in question_tokens],
            'passage_tokens': [token.text for token in passage_tokens],
            'options_tokens': [[token.text for token in option_tokens]
                               for option_tokens in options_tokens]
        }
        if answer_index is not None:
            metadata['answer_texts'] = options_text[answer_index]
            # Build the IndexField only when an answer exists; IndexField expects
            # an integer index.
            fields['answer_index'] = IndexField(answer_index, options_field)

        metadata.update(additional_metadata)
        fields['metadata'] = MetadataField(metadata)
        return Instance(fields)
Example #33
    def test_as_tensor_can_handle_multiple_token_indexers(self):

        self.field1._token_indexers = self.words_and_characters_indexers
        self.field2._token_indexers = self.words_and_characters_indexers
        self.field3._token_indexers = self.words_and_characters_indexers

        list_field = ListField([self.field1, self.field2, self.field3])
        list_field.index(self.vocab)
        padding_lengths = list_field.get_padding_lengths()
        tensor_dict = list_field.as_tensor(padding_lengths)
        words = tensor_dict["words"].detach().cpu().numpy()
        characters = tensor_dict["characters"].detach().cpu().numpy()
        numpy.testing.assert_array_almost_equal(
            words, numpy.array([[2, 3, 4, 5, 0], [2, 3, 4, 1, 5], [2, 3, 1, 5, 0]])
        )

        numpy.testing.assert_array_almost_equal(
            characters[0],
            numpy.array(
                [
                    [5, 1, 1, 2, 0, 0, 0, 0, 0],
                    [1, 2, 0, 0, 0, 0, 0, 0, 0],
                    [1, 0, 0, 0, 0, 0, 0, 0, 0],
                    [2, 3, 4, 5, 3, 4, 6, 3, 0],
                    [0, 0, 0, 0, 0, 0, 0, 0, 0],
                ]
            ),
        )

        numpy.testing.assert_array_almost_equal(
            characters[1],
            numpy.array(
                [
                    [5, 1, 1, 2, 0, 0, 0, 0, 0],
                    [1, 2, 0, 0, 0, 0, 0, 0, 0],
                    [1, 0, 0, 0, 0, 0, 0, 0, 0],
                    [1, 1, 1, 1, 3, 1, 3, 4, 5],
                    [2, 3, 4, 5, 3, 4, 6, 3, 0],
                ]
            ),
        )

        numpy.testing.assert_array_almost_equal(
            characters[2],
            numpy.array(
                [
                    [5, 1, 1, 2, 0, 0, 0, 0, 0],
                    [1, 2, 0, 0, 0, 0, 0, 0, 0],
                    [1, 4, 1, 5, 1, 3, 1, 0, 0],
                    [2, 3, 4, 5, 3, 4, 6, 3, 0],
                    [0, 0, 0, 0, 0, 0, 0, 0, 0],
                ]
            ),
        )
Example #34
    def text_to_instance(
            self,  # type: ignore
            qid: str,
            question: str,
            choices: List[str],
            evidence_top: List[str] = None,
            answer: str = None) -> Instance:
        # pylint: disable=arguments-differ
        fields: Dict[str, Field] = {}
        question_tokens = self._tokenizer.tokenize(question)
        choice_tokens = self._tokenizer.batch_tokenize(choices)

        if evidence_top:
            evidence_tokens = self._tokenizer.batch_tokenize(evidence_top)
            evidence_tokens_flat = [t for evi in evidence_tokens for t in evi]
        else:
            evidence_tokens_flat = []

        qa_pair_tokens = []
        for c_tokens in choice_tokens:
            qa_pair = question_tokens + evidence_tokens_flat + \
                [Token("[SEP]")] + c_tokens
            qa_pair_tokens.append(qa_pair)

        qa_pairs_field = ListField([
            TextField(tokens, self._token_indexers)
            for tokens in qa_pair_tokens
        ])
        if answer:
            fields['answer_index'] = IndexField(self.LABELS.index(answer),
                                                qa_pairs_field)
        fields['qa_pairs'] = qa_pairs_field

        metadata = {
            "qid": qid,
            "question": question,
            "choices": choices,
            "question_tokens": [x.text for x in question_tokens],
            "choices_tokens": [[x.text for x in tokens] for tokens in choice_tokens]
        }
        fields["metadata"] = MetadataField(metadata)
        return Instance(fields)
Example #35
 def target_short_sequence_instance(
         self,  # type: ignore
         inst_tokens: Iterable,
         inst_arcs: Iterable,
         context_length: int = 15,
         metadata={"tokens": []}) -> Instance:
     """generate instances as short sequences (of given length) before a target turn
     with labels being dependency between target and its head;
     if head is out of context ? -> force previous turn ? other heuristics ?
     
     could also be used to refactor full-chat instances, by providing start="first" line, end=end of chat
     """
     all = []
     for i, turn in enumerate(inst_tokens):
         start = max(0, i - context_length)
         context_turns = inst_tokens[start:i + 1]
         data = self.extract_data(inst_arcs, metadata, start, i)
         fields: Dict[str, Field] = {}
         seq_field = ListField([
             TextField(tokenized_line, self._token_indexers)
             for tokenized_line in context_turns
         ])
         data["tokens"] = context_turns
         data["file_source"] = metadata["file_source"]
         fields["lines"] = seq_field
         try:
             fields["arcs"] = AdjacencyField(data["arcs"], seq_field)
         except:
             print("error at index", i,
                   "interval should be (%d,%d)" % (start, i))
             breakpoint()
         # example additional features, should be a separate function
         features = data["features"]
         # does src address target ?
         fields["rel_features"] = ArrayField(
             target_address_src_matrix(features["speaker"],
                                       features["addressee"]))
         # distance between src and target
         fields["offsets"] = ArrayField(turn_distance_matrix(context_turns),
                                        dtype=int)
         # is the turn the server ?
         fields["is_server"] = ArrayField(np.array(features["is_server"]))
         # should be listfield too ? one for each line ?
         fields["metadata"] = MetadataField(data)
         all.append(Instance(fields))
     return all
Example #36
    def _get_wordpiece_indices_field(wordpieces: List[List[int]]):
        wordpiece_token_indices = []
        ingested_indices = []
        i = 0
        while i < len(wordpieces):
            current_wordpieces = wordpieces[i]
            if len(current_wordpieces) > 1:
                wordpiece_token_indices.append(LabelsField(current_wordpieces, padding_value=-1))
                i = current_wordpieces[-1] + 1
            else:
                i += 1

        # Hack to guarantee a minimal length for the padded field
        # (following dataset_readers.reading_comprehension.drop from allennlp).
        wordpiece_token_indices.append(LabelsField([-1], padding_value=-1))

        return ListField(wordpiece_token_indices)
Example #37
    def text_to_instance(
        self,  # type: ignore
        qa_id: int,
        question: str,
        answer: Optional[str],
        image: Union[str, Tuple[Tensor, Tensor, Optional[Tensor],
                                Optional[Tensor]]],
        use_cache: bool = True,
        keep_impossible_questions: bool = True,
    ) -> Optional[Instance]:
        question_field = TextField(self._tokenizer.tokenize(question), None)

        fields: Dict[str, Field] = {
            "question": question_field,
        }

        if isinstance(image, str):
            features, coords, _, _ = next(
                self._process_image_paths([image], use_cache=use_cache))
        else:
            features, coords, _, _ = image

        fields["box_features"] = ArrayField(features)
        fields["box_coordinates"] = ArrayField(coords)
        fields["box_mask"] = ArrayField(
            features.new_ones((features.shape[0], ), dtype=torch.bool),
            padding_value=False,
            dtype=torch.bool,
        )

        if answer is not None:
            labels_fields = []
            weights = []
            if (not self.answer_vocab or answer
                    in self.answer_vocab) or keep_impossible_questions:
                labels_fields.append(
                    LabelField(answer, label_namespace="answers"))
                weights.append(1.0)

            if len(labels_fields) <= 0:
                return None

            fields["label_weights"] = ArrayField(torch.tensor(weights))
            fields["labels"] = ListField(labels_fields)

        return Instance(fields)
Example #38
def process_evidence_chains(evd_possible_chains, sent_labels_, fields):
    evd_possible_chains_ = []
    if evd_possible_chains is not None:
        for chain in evd_possible_chains:
            if len(chain) == 0 or any([s_idx >= len(sent_labels_) for s_idx in chain]):
                # If there is no possible chain, or any selected sentence in the
                # chain exceeds para_limit, ignore the instance.
                # Chains starting with 0 will be filtered out in the RLBidirectionalAttentionFlow module.
                chain = [0]
            else:
                # Since index 0 is for eos, shift by one and add eos at the end.
                chain = [s_idx + 1 for s_idx in chain] + [0]
            evd_possible_chains_.append(chain)
        fields['evd_chain_labels'] = ListField([ArrayField(np.array(ch), padding_value=0)
                                                for ch in evd_possible_chains_])
    return evd_possible_chains_
Example #39
    def text_to_instance(self, image_feature: np.ndarray, captions: List[str] = None, image_name: str = None):

        fields = {
            "image_feature": ArrayField(image_feature),
        }

        if captions is not None:
            text_field_list = []
            for caption in captions:
                tokens = [Token(t) for t in caption.split()]
                text_field_list.append(TextField(tokens, self._token_indexers))
            fields["target_tokens"] = ListField(text_field_list)

        if image_name:
            fields["image_name"] = MetadataField(image_name)

        return Instance(fields)
Example #40
 def test_list_field_can_handle_empty_index_fields(self):
     list_field = ListField([self.index_field, self.index_field, self.empty_index_field])
     list_field.index(self.vocab)
     tensor = list_field.as_tensor(list_field.get_padding_lengths())
     numpy.testing.assert_array_equal(tensor.data.cpu().numpy(), numpy.array([[1], [1], [-1]]))
Example #41
 def test_get_padding_lengths(self):
     list_field = ListField([self.field1, self.field2, self.field3])
     list_field.index(self.vocab)
     lengths = list_field.get_padding_lengths()
     assert lengths == {"num_fields": 3, "list_words_length": 5, "list_num_tokens": 5}