Example #1
    def text_to_instance(self, sample) -> Instance:
        fields: Dict[str, Field] = {}
        # print(sample)
        tailored_history = sample['history']
        tailored_tags = sample['tags'][-10:]
        context = '。'.join(tailored_history)

        # history = ' '.join(list(''.join(context)))
        # context = '[CLS] ' + context[-512:]
        text_tokens = self._tokenizer.tokenize(context[-510:])
        fields['text'] = TextField(text_tokens, self._token_indexers)

        fields_list = []
        for sen in tailored_history:
            sen = ' '.join(sen)
            txt_token = self._tokenizer.tokenize(sen)
            ff = TextField(txt_token, self._token_indexers)
            fields_list.append(ff)
        fields["label"] = MultiLabelField(list(sample['next_symp']),
                                          skip_indexing=True,
                                          num_labels=sym_size)
        # fields['symptoms'] = MultiLabelField(list(sample['his_symp']), skip_indexing=True, num_labels=sym_size)
        # fields['tags'] = MetadataField(tailored_tags)
        # fields['history'] = ListField(fields_list)
        fields["future"] = MultiLabelField(list(sample['future_symp']),
                                           skip_indexing=True,
                                           num_labels=sym_size)

        return Instance(fields)
Example #2
 def test_as_tensor_returns_integer_tensor(self):
     f = MultiLabelField([2, 3],
                         skip_indexing=True,
                         label_namespace="test1",
                         num_labels=5)
     tensor = f.as_tensor(f.get_padding_lengths()).data.cpu().numpy()
     numpy.testing.assert_array_almost_equal(tensor,
                                             numpy.array([0, 0, 1, 1, 0]))
Example #3
 def test_as_tensor_returns_integer_tensor(self):
     f = MultiLabelField([2, 3],
                         skip_indexing=True,
                         label_namespace="test1",
                         num_labels=5)
     tensor = f.as_tensor(f.get_padding_lengths()).detach().cpu().tolist()
     assert tensor == [0, 0, 1, 1, 0]
     assert {type(item) for item in tensor} == {int}
Example #4
    def test_multilabel_field_empty_field_works(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace("label1", namespace="test_empty_labels")
        vocab.add_token_to_namespace("label2", namespace="test_empty_labels")

        f = MultiLabelField([], label_namespace="test_empty_labels")
        f.index(vocab)
        tensor = f.as_tensor(f.get_padding_lengths()).detach().cpu().numpy()
        numpy.testing.assert_array_almost_equal(tensor, numpy.array([0, 0]))
Example #5
    def test_multilabel_field_can_index_with_vocab(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace("rel0", namespace="rel_labels")
        vocab.add_token_to_namespace("rel1", namespace="rel_labels")
        vocab.add_token_to_namespace("rel2", namespace="rel_labels")

        f = MultiLabelField(["rel1", "rel0"], label_namespace="rel_labels")
        f.index(vocab)
        tensor = f.as_tensor(f.get_padding_lengths()).detach().cpu().numpy()
        numpy.testing.assert_array_almost_equal(tensor, numpy.array([1, 1, 0]))
Example #6
    def text_to_instance(self, tokens: List[Token],
                         text: str, ID: Optional[str] = None, 
                         labels: Optional[List[str]] = None) -> Instance:
        '''
        The tokens are expected to be pre-tokenised.

        :param tokens: The text that has been tokenised
        :param text: The text from the sample
        :param ID: The ID of the sample
        :param labels: A list of labels (can be an empty list, which is
                       implicitly associated with the neutral class)
        :returns: An Instance object with all of the above encoded for a
                  PyTorch model.
        '''
        token_sequence = TextField(tokens, self._token_indexers)
        instance_fields: Dict[str, Field] = {'tokens': token_sequence}

        meta_fields = {}
        meta_fields["words"] = [x.text for x in tokens]
        meta_fields["text"] = text
        if ID is not None:
            meta_fields["ID"] = ID
        instance_fields["metadata"] = MetadataField(meta_fields)

        if labels is not None:
            instance_fields['labels'] = MultiLabelField(labels, 
                                                        label_namespace="labels")

        return Instance(instance_fields)
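For context, here is a self-contained sketch of the fields this method assembles; the indexer choice and label namespace are illustrative, not taken from the reader above:

from typing import Dict
from allennlp.data import Instance
from allennlp.data.fields import Field, MetadataField, MultiLabelField, TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import Token

indexers = {"tokens": SingleIdTokenIndexer()}
tokens = [Token("good"), Token("food")]
fields: Dict[str, Field] = {
    "tokens": TextField(tokens, indexers),
    "metadata": MetadataField({"words": [t.text for t in tokens],
                               "text": "good food"}),
    # an empty label list stands in for the implicit neutral class
    "labels": MultiLabelField([], label_namespace="labels"),
}
instance = Instance(fields)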
Example #7
    def text_to_instance(self, index: int, field_type: str):  # type: ignore
        field = TextField(
            [Token(t) for t in ["The", "number", "is",
                                str(index), "."]],
            token_indexers={"words": SingleIdTokenIndexer("words")},
        )

        return Instance({
            "text": field,
            "label": LabelField(index, skip_indexing=True),
            "flag": FlagField(23),
            "index": IndexField(index % self.batch_size, field),
            "metadata": MetadataField(
                {"some_key": "This will not be logged as a histogram."}),
            "adjacency": AdjacencyField([(0, 1), (1, 2)], field),
            "multilabel": MultiLabelField(["l1", "l2"]),
            "span": SpanField(2, 3, field),
            "tensor": TensorField(torch.randn(2, 3)),
        })
Example #8
 def text_to_instance(
         self,
         context_tokens: List[Token],
         tokens: List[Token],
         tags: List[str] = None,
         intents: List[str] = None,
         dialog_act: Dict[str, Any] = None) -> Instance:  # type: ignore
     """
     We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
     """
     # pylint: disable=arguments-differ
     fields: Dict[str, Field] = {}
     # print([t.text for t in context_tokens])
     fields["context_tokens"] = TextField(context_tokens,
                                          self._token_indexers)
     fields["tokens"] = TextField(tokens, self._token_indexers)
     fields["metadata"] = MetadataField({"words": [x.text for x in tokens]})
     if tags is not None:
         fields["tags"] = SequenceLabelField(tags, fields["tokens"])
     if intents is not None:
         fields["intents"] = MultiLabelField(
             intents, label_namespace="intent_labels")
     if dialog_act is not None:
         fields["metadata"] = MetadataField({
             "words": [x.text for x in tokens],
             'dialog_act':
             dialog_act
         })
     else:
         fields["metadata"] = MetadataField({
             "words": [x.text for x in tokens],
             'dialog_act': {}
         })
     return Instance(fields)
Example #9
    def test_class_variables_for_namespace_warnings_work_correctly(self):

        assert "text" not in MultiLabelField._already_warned_namespaces
        with self.assertLogs(logger="allennlp.data.fields.multilabel_field", level="WARNING"):
            _ = MultiLabelField(["test"], label_namespace="text")

        # We've warned once, so the namespace should now be recorded in the class variable.
        assert "text" in MultiLabelField._already_warned_namespaces
        with pytest.raises(AssertionError):
            with self.assertLogs(logger="allennlp.data.fields.multilabel_field", level="WARNING"):
                _ = MultiLabelField(["test2"], label_namespace="text")

        # ... but a new namespace should still log a warning.
        assert "text2" not in MultiLabelField._already_warned_namespaces
        with self.assertLogs(logger="allennlp.data.fields.multilabel_field", level="WARNING"):
            _ = MultiLabelField(["test"], label_namespace="text2")
Example #10
    def add_label(
        self,
        instance: Instance,
        label: Union[List[str], List[int], str, int],
        to_field: str = "label",
    ) -> Optional[Instance]:
        """Includes the label field for classification into the instance data"""
        # "if not label:" fails for ndarrays this is why we explicitly check None
        if label is None:
            return instance

        field = None
        # check if multilabel and if adequate type
        if self._multilabel and isinstance(label, (list, numpy.ndarray)):
            label = label.tolist() if isinstance(label, numpy.ndarray) else label
            field = MultiLabelField(label, label_namespace=vocabulary.LABELS_NAMESPACE)
        # check if not multilabel and adequate type + check for empty strings
        if not self._multilabel and isinstance(label, (str, int)) and label:
            field = LabelField(label, label_namespace=vocabulary.LABELS_NAMESPACE)
        if not field:
            # We have label info but we cannot build the label field --> discard the instance
            return None

        instance.add_field(to_field, field)
        return instance
Example #11
    def index(self, ner_tags: List[str],
              as_label_field: bool) -> Union[List[int], MultiLabelField]:
        """
        Takes in a list of tags ([B-PER, I-PER, O, O, B-LOC, I-LOC]),
        performs a regex match against the ner tags (.*-TAG), and
        generates the label accordingly

        Arguments:
            ner_tags (List[str]): The list of NER Tags
            as_label_field (bool): If True, returns a MultiLabelField,
                otherwise returns a list of tag indices

        Returns:
            indices (Union[List[int], MultiLabelField]): Returns
                either a list of indexed labels, or a MultiLabelField
                instance

        """
        indices = set()
        for gold_tag in ner_tags:
            for tag in self.tags2ix:
                if re.match(f".*-{tag}", gold_tag) is not None:
                    indices.add(self.tags2ix[tag])
        if len(indices) > 0:
            indices = list(indices)
        else:
            indices = [len(self.tags2ix)]
        if as_label_field:
            indices = MultiLabelField(labels=indices,
                                      label_namespace=self.label_namespace,
                                      skip_indexing=True,
                                      num_labels=self.get_num_tags())
        return indices
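The regex-matching core of this method can be tried in isolation; a minimal standalone sketch, with tags2ix as a stand-in for the example's attribute:

import re
from typing import Dict, List

def bio_tags_to_label_indices(ner_tags: List[str],
                              tags2ix: Dict[str, int]) -> List[int]:
    # Collect the index of every tag type appearing in the BIO sequence,
    # e.g. "B-PER" and "I-PER" both match the type "PER".
    indices = {ix for tag, ix in tags2ix.items()
               for gold_tag in ner_tags
               if re.match(f".*-{tag}", gold_tag)}
    # Fall back to a dedicated "no entity" index when nothing matched.
    return sorted(indices) if indices else [len(tags2ix)]

# bio_tags_to_label_indices(["B-PER", "I-PER", "O"], {"PER": 0, "LOC": 1}) == [0]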
Example #12
 def text_to_instance(self,
                      tokens: List[str],
                      features: List[List[str]],
                      tags: Optional[List[str]] = None,
                      tag_label_namespace: Optional[str] = None):
     # pylint: disable=arguments-differ
     tokens: List[Token] = [Token(x) for x in tokens]
     sequence = TextField(tokens, self._token_indexers)
     instance_fields: Dict[str, Field] = {"tokens": sequence}
     metadata = {"words": [x.text for x in tokens]}
     if self._use_sentence_markers:
         sentence_markers = get_sentence_markers_from_tokens(tokens)
         metadata["sentence_markers"] = sentence_markers
     instance_fields["metadata"] = MetadataField(metadata)
     # now encode the features
     feature_list: List[MultiLabelField] = []
     for feature in features:
         indexed_feature: List[int] = [
             self._features_index_map[x] for x in feature
             if x in self._features_index_map
         ]
         feature_list.append(
             MultiLabelField(indexed_feature,
                             label_namespace=self.feature_label_namespace,
                             skip_indexing=True,
                             num_labels=len(self._features_index_map)))
     instance_fields["features"] = ListField(feature_list)
     if tags:
         tag_label_namespace = tag_label_namespace or self.label_namespace
         converted_tags: List[str] = self.convert_tags(tags)
         instance_fields["tags"] = SequenceLabelField(
             converted_tags, sequence, tag_label_namespace)
     return Instance(instance_fields)
Example #13
    def _build_instance(self,
                        tokens: List[Token],
                        trigger_labels: List[str] = None,
                        trigger_token_seqs: Dict[Tuple[int, int], str] = None,
                        **metadata) -> Instance:
        if len(tokens) < 3 and self._filter is True:
            return None

        # Translate sentence if translation_service is provided
        if self._translation_service is not None:
            source_snt = self._detokenize(tokens)
            target_snt = self._translation_service(source_snt)
            tokens = self._tokenize(target_snt)

        fields: Dict[str, Field] = dict()

        # First, populate fields with provided metadata
        for key, value in metadata.items():
            fields[key] = MetadataField(value)

        if trigger_token_seqs is not None:
            fields['trigger_token_seqs'] = MetadataField(trigger_token_seqs)

        # Building different discrete representations for text embedders.
        text_field = TextField(tokens, self._token_indexers)
        fields['text'] = text_field
        # Additionally, raw tokens are also stored for reverse mapping
        fields['tokens'] = MetadataField(tokens)

        # Build an Instance without annotations to use in inference phase.
        if trigger_labels is None:
            return Instance(fields)

        if self._translation_service is None:
            # Token-level labels are kept only when the sentence was not
            # translated, since translation loses the token alignments.

            if len(trigger_labels) > len(tokens):
            if len(trigger_labels) > len(tokens):
                truncate_len = len(tokens)
                trigger_labels = trigger_labels[:truncate_len]
                logger.warning('Truncated tokens detected. Truncating labels as well.')

            # Token-level trigger labels
            trigger_labels_field = SequenceLabelField(trigger_labels,
                                                      text_field,
                                                      self._trigger_label_namespace)
            if not self._sentence_level_only:
                fields['trigger_labels'] = trigger_labels_field

        # Sentence-level trigger label(s)
        # if not self._multi_label:
        #     raise NotImplementedError

        token_tags = set(trigger_labels)
        sentence_trigger_labels = [tag for tag in token_tags if tag != 'O']
        if not sentence_trigger_labels and self._null_label:
            sentence_trigger_labels = ['O']
        fields['sentence_trigger_labels'] = MultiLabelField(sentence_trigger_labels,
                                                            self._trigger_label_namespace)
        return Instance(fields)
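The projection from token-level to sentence-level labels above boils down to a few lines; a standalone sketch:

trigger_labels = ["O", "B-Attack", "I-Attack", "O"]
sentence_trigger_labels = [tag for tag in set(trigger_labels) if tag != "O"]
# contains "B-Attack" and "I-Attack" (set order is arbitrary); an all-"O"
# sentence yields [], which the reader replaces with ["O"] when _null_label is set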
Example #14
    def test_class_variables_for_namespace_warnings_work_correctly(self, caplog):
        with caplog.at_level(logging.WARNING, logger="allennlp.data.fields.multilabel_field"):
            assert "text" not in MultiLabelField._already_warned_namespaces
            _ = MultiLabelField(["test"], label_namespace="text")
            assert caplog.records

            # We've warned once, so the namespace should now be recorded in the class variable.
            assert "text" in MultiLabelField._already_warned_namespaces
            caplog.clear()
            _ = MultiLabelField(["test2"], label_namespace="text")
            assert not caplog.records

            # ... but a new namespace should still log a warning.
            assert "text2" not in MultiLabelField._already_warned_namespaces
            caplog.clear()
            _ = MultiLabelField(["test"], label_namespace="text2")
            assert caplog.records
Example #15
    def text_to_instance(self, sentence: str, category_tag: str) -> Instance:
        categories = self.category_mapping.get(category_tag)

        tokenized_sentence = self.tokenizer.tokenize(sentence)
        sent_field = TextField(tokenized_sentence, self.token_indexers)

        return Instance({
            'sentences': sent_field,
            'categories': MultiLabelField(categories)
        })
Example #16
    def test_multilabel_field_empty_field_works(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace("label1", namespace="test_empty_labels")
        vocab.add_token_to_namespace("label2", namespace="test_empty_labels")

        f = MultiLabelField([], label_namespace="test_empty_labels")
        f.index(vocab)
        tensor = f.as_tensor(f.get_padding_lengths()).detach().cpu().numpy()
        numpy.testing.assert_array_almost_equal(tensor, numpy.array([0, 0]))
        g = f.empty_field()
        g.index(vocab)
        tensor = g.as_tensor(g.get_padding_lengths()).detach().cpu().numpy()
        numpy.testing.assert_array_almost_equal(tensor, numpy.array([0, 0]))

        h = MultiLabelField(
            [0, 0, 1], label_namespace="test_empty_labels", num_labels=3, skip_indexing=True
        )
        tensor = h.empty_field().as_tensor(None).detach().cpu().numpy()
        numpy.testing.assert_array_almost_equal(tensor, numpy.array([0, 0, 0]))
Example #17
 def text_to_instance(self,
                      text: str,
                      labels: List[str] = None) -> Instance:
     tokenized_text = self._tokenizer.tokenize(text)
     tokenized_text = TextField(tokenized_text, self._token_indexers)
     fields = {'text': tokenized_text}
     if labels:
         label_field = MultiLabelField(labels=labels)
         fields["labels"] = label_field
     return Instance(fields)
Example #18
 def modify_batch_instances(self, batch_instances):
     batch_instances = list(batch_instances)
     batch_size = len(batch_instances)
     negative_token_contexts = self.get_negative_contexts(batch_size)
     token_namespace = 'token_context'
     for instance, negs in zip(batch_instances, negative_token_contexts):
         instance.add_field(
             'negative_context',
             MultiLabelField(negs, label_namespace=token_namespace))
     return batch_instances
Example #19
    def text_to_instance(self, sample) -> Instance:
        fields: Dict[str, Field] = {}
        sen_num = self.pre_sen
        context = ' '.join(sample['history'][-sen_num:])
        all_sentence = sample['history'][-sen_num:]
        # history = ' '.join(list(''.join(context)))
        history = ' '.join(self.seg.cut(context))

        text_tokens = self._source_tokenizer.tokenize(history)
        text_tokens = text_tokens[-self._source_max_tokens:]
        text_tokens.insert(0, Token(START_SYMBOL))
        text_tokens.append(Token(END_SYMBOL))

        # response = ' '.join(sample['response'])
        response = ' '.join(self.seg.cut(sample['response']))
        response_tokens = self._target_tokenizer.tokenize(response)
        response_tokens = response_tokens[:self._target_max_tokens]
        response_tokens.insert(0, Token(START_SYMBOL))
        response_tokens.append(Token(END_SYMBOL))

        fields_list = []
        for sen in all_sentence:
            sen = ' '.join(self.seg.cut(sen))
            # sen = ' '.join(sen)
            txt_token = self._source_tokenizer.tokenize(sen)
            ff = TextField(txt_token, self._source_token_indexers)
            fields_list.append(ff)
        fields['source_tokens'] = TextField(text_tokens,
                                            self._source_token_indexers)
        fields["next_sym"] = MultiLabelField(list(sample['next_symp']),
                                             skip_indexing=True,
                                             num_labels=total_entity + sen_num)
        fields['target_tokens'] = TextField(response_tokens,
                                            self._target_token_indexers)
        fields['his_symptoms'] = MultiLabelField(list(sample['his_symp']),
                                                 skip_indexing=True,
                                                 num_labels=total_entity +
                                                 sen_num)
        fields['tags'] = MetadataField(sample['tags'][-sen_num:])
        fields['history'] = ListField(fields_list)
        # fields['dialog_index'] = MetadataField(sample['dialog_index'])

        return Instance(fields)
Example #20
    def text_to_instance(
        self,
        sentences: List[str],
        labels: List[str] = None,
        confidences: List[float] = None,
        additional_features: List[float] = None,
    ) -> Instance:
        if not self.predict:
            assert len(sentences) == len(labels)
        if confidences is not None:
            assert len(sentences) == len(confidences)
        if additional_features is not None:
            assert len(sentences) == len(additional_features)

        if self.use_sep:
            tokenized_sentences = [
                self._tokenizer.tokenize(s)[:self.sent_max_len] +
                [Token("[SEP]")] for s in sentences
            ]
            sentences = [
                list(itertools.chain.from_iterable(tokenized_sentences))[:-1]
            ]
        else:
            # Tokenize the sentences
            sentences = [
                self._tokenizer.tokenize(sentence_text)[:self.sent_max_len]
                for sentence_text in sentences
            ]

        fields: Dict[str, Field] = {}
        fields["sentences"] = ListField([
            TextField(sentence, self._token_indexers) for sentence in sentences
        ])

        if labels is not None:
            if isinstance(labels[0], list):
                fields["labels"] = ListField(
                    [MultiLabelField(label) for label in labels])
            else:
                # make the labels strings for easier identification of the neutral label
                # probably not strictly necessary
                if self.sci_sum:
                    fields["labels"] = ArrayField(np.array(labels))
                else:
                    fields["labels"] = ListField([
                        LabelField(str(label) + "_label") for label in labels
                    ])

        if confidences is not None:
            fields['confidences'] = ArrayField(np.array(confidences))
        if additional_features is not None:
            fields["additional_features"] = ArrayField(
                np.array(additional_features))

        return Instance(fields)
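The use_sep branch flattens the per-sentence token lists into one [SEP]-joined sequence and drops the trailing separator; a minimal sketch with plain strings standing in for Token objects:

import itertools

tokenized_sentences = [["a", "b", "[SEP]"], ["c", "d", "[SEP]"]]
flat = list(itertools.chain.from_iterable(tokenized_sentences))[:-1]
assert flat == ["a", "b", "[SEP]", "c", "d"]  # final [SEP] removed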
Example #21
    def _add_label(
        self,
        instance: Instance,
        label: Union[List[str], List[int], str, int],
        to_field: str = "label",
    ) -> Instance:
        """Adds the label field for classification into the instance data

        Helper function for the child's `self.featurize` method.

        Parameters
        ----------
        instance
            Add a label field to this instance
        label
            The label data
        to_field
            Name of the field to add

        Returns
        -------
        instance
            If `label` is not None, return `instance` with a label field added.
            Otherwise return just the given `instance`.

        Raises
        ------
        FeaturizeError
            If the label is an empty string or does not match the type:
            - (str, int) for single label
            - (list, np.array) for multi label
        """
        # "if not label:" fails for ndarrays, this is why we explicitly check for None
        if label is None:
            return instance

        field = None
        # check if multilabel and if adequate type
        if self._multilabel and isinstance(label, (list, numpy.ndarray)):
            label = label.tolist() if isinstance(label,
                                                 numpy.ndarray) else label
            field = MultiLabelField(
                label, label_namespace=vocabulary.LABELS_NAMESPACE)
        # check if not multilabel and adequate type + check for empty strings
        if not self._multilabel and isinstance(label, (str, int)) and label:
            field = LabelField(label,
                               label_namespace=vocabulary.LABELS_NAMESPACE)
        if not field:
            # We have label info but we cannot build the label field --> discard the instance
            raise FeaturizeError(
                f"Cannot create label field for `label={label}`!")

        instance.add_field(to_field, field)

        return instance
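The type dispatch in _add_label reduces to a small helper; a sketch assuming string labels (allennlp would need skip_indexing=True for raw integer labels):

import numpy
from allennlp.data.fields import LabelField, MultiLabelField

def build_label_field(label, multilabel: bool, namespace: str = "labels"):
    # Multi-label data arrives as a list or ndarray; normalise to a list.
    if multilabel and isinstance(label, (list, numpy.ndarray)):
        labels = label.tolist() if isinstance(label, numpy.ndarray) else label
        return MultiLabelField(labels, label_namespace=namespace)
    # Single-label data must be a non-empty scalar.
    if not multilabel and isinstance(label, (str, int)) and label:
        return LabelField(label, label_namespace=namespace)
    raise ValueError(f"Cannot create a label field for `label={label}`!")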
Example #22
    def text_to_instance(self, text: str, label: str = None) -> Instance:  # type: ignore
        # pylint: disable=arguments-differ
        tokenized = self._tokenizer.tokenize(text)
        if not (text and tokenized):
            return None

        fields = {'tokens': TextField(tokenized, self._token_indexers)}
        if label is not None:
            label = label.split(', ')
            fields['label'] = MultiLabelField(label)
        return Instance(fields)
Example #23
    def text_to_instance(self,
                         tokens: List[Token],
                         labels: Sequence[int] = None,
                         labels_aspect: Sequence[int] = None,
                         domain: str = None) -> Instance:
        sentence_field = TextField(tokens,
                                   self.token_indexers)  # sentence and indexer
        fields = {'sentence': sentence_field}

        if domain is not None:
            fields['domain'] = LabelField(label=DOMAIN.index(domain),
                                          label_namespace='domain-labels',
                                          skip_indexing=True)
            in_domain = domain == self.target_domain
            fields['sample_weight'] = ArrayField(
                np.array([1.0 if in_domain else self.out_domain_weight]))

        if labels:
            label_field = MultiLabelField(labels=labels,
                                          label_namespace='motive-labels',
                                          skip_indexing=True,
                                          num_labels=NUM_MOTIVES + 1)
            fields['labels'] = label_field

        if self.multitask and labels_aspect:
            num_aspects = max(
                [c.shape[0] for c in self.label_counts_aspect.values()])
            label_field = MultiLabelField(labels=labels_aspect,
                                          label_namespace='aspect-labels',
                                          skip_indexing=True,
                                          num_labels=num_aspects)
            fields['labels_aspect'] = label_field

        fields['metadata'] = MetadataField({
            'label_prior': self.label_prior,
            'label_prior_aspect': self.label_prior_aspect[domain]
        })

        return Instance(fields)
Example #24
    def text_to_instance(self, sample) -> Instance:  # the arrow annotation gives the return type
        fields: Dict[str, Field] = {}
        tailored_history = sample['history']
        context = '。'.join(tailored_history)

        text_tokens = self._tokenizer.tokenize(context[-510:])
        fields['text'] = TextField(text_tokens, self._token_indexers)

        fields_list = []
        for sen in tailored_history:
            sen = ' '.join(sen)
            txt_token = self._tokenizer.tokenize(sen)
            ff = TextField(txt_token, self._token_indexers)
            fields_list.append(ff)
        fields["label"] = MultiLabelField(list(sample['next_sym']),
                                          skip_indexing=True,
                                          num_labels=sym_size)
        fields["future"] = MultiLabelField(list(sample['future']),
                                           skip_indexing=True,
                                           num_labels=sym_size)

        return Instance(fields)
Example #25
    def text_to_instance(self,
                         graf_tokens: List[Token],
                         labels: List[str] = None) -> Instance:
        graf_field = TextField(graf_tokens, self.token_indexers)

        metadata = MetadataField({"graf_words": graf_tokens})

        fields = {"graf": graf_field, "metadata": metadata}

        if labels is not None:
            label_field = MultiLabelField(labels)
            fields["label"] = label_field

        return Instance(fields)
Example #26
    def test_multilabel_field_empty_field_works(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace("label1", namespace="test_empty_labels")
        vocab.add_token_to_namespace("label2", namespace="test_empty_labels")

        f = MultiLabelField([], label_namespace="test_empty_labels")
        f.index(vocab)
        tensor = f.as_tensor(f.get_padding_lengths()).data.cpu().numpy()
        numpy.testing.assert_array_almost_equal(tensor, numpy.array([0, 0]))
Example #27
    def test_multilabel_field_can_index_with_vocab(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace("rel0", namespace="rel_labels")
        vocab.add_token_to_namespace("rel1", namespace="rel_labels")
        vocab.add_token_to_namespace("rel2", namespace="rel_labels")

        f = MultiLabelField(["rel1", "rel0"], label_namespace="rel_labels")
        f.index(vocab)
        tensor = f.as_tensor(f.get_padding_lengths()).detach().cpu().numpy()
        numpy.testing.assert_array_almost_equal(tensor, numpy.array([1, 1, 0]))
Example #28
def get_new_instance(instance: Instance, tags: List[str], reader: DatasetReader) -> Instance:
    # first copy over the tokens
    new_instance: Dict[str, Field] = {}
    tokens = instance.fields["tokens"].tokens
    sequence = TextField(tokens, reader._token_indexers)
    new_instance["tokens"] = sequence
    # now copy the tags
    new_instance["tags"] = SequenceLabelField(tags, sequence, reader.label_namespace)
    # now copy the handcrafted features
    feature_list: List[MultiLabelField] = []
    for feature in instance.fields["features"]:
        labels: List[int] = feature.labels
        feature_list.append(MultiLabelField(
            labels, label_namespace=reader.feature_label_namespace,
            skip_indexing=True, num_labels=len(reader._features_index_map))
        )
    new_instance["features"] = ListField(feature_list)
    return Instance(new_instance)
Example #29
    def text_to_instance(self,
                         sentences: List[str],
                         category_tag: str = None) -> Instance:

        sentence_fields = []
        for sentence in sentences:
            sent_field = LazyTextFiled(text=sentence,
                                       tokenizer_name="sentences",
                                       token_indexers=self.token_indexers)
            sentence_fields.append(sent_field)

        dt = {
            'sentences': ListField(sentence_fields),
        }

        if category_tag is not None:
            categories = self.category_mapping.get(category_tag)
            dt['categories'] = MultiLabelField(categories)

        return Instance(dt)
Example #30
    def text_to_instance(self,
                         text: Union[str, List[Token]],
                         label: str = None) -> Instance:
        if isinstance(text, str):
            tokenized_text = self.tokenize(text)
        else:
            tokenized_text = text

        if len(tokenized_text) > self._max_length:
            tokenized_text = tokenized_text[:self._max_length]

        text_field = TextField(tokenized_text, self._token_indexers)

        fields = {'text': text_field}
        if label is not None:
            if self._multi_label:
                fields['label'] = MultiLabelField(label)
            else:
                fields['label'] = LabelField(label)

        return Instance(fields)
Example #31
    def text_to_instance(
        self, text: str, labels: List[Union[str, int]] = None
    ) -> Instance:  # type: ignore
        """
        # Parameters

        text : `str`, required.
            The text to classify
        labels : `List[Union[str, int]]`, optional, (default = `None`).
            The labels for this text.

        # Returns

        An `Instance` containing the following fields:
            - tokens (`TextField`) :
              The tokens in the sentence or phrase.
            - label (`MultiLabelField`) :
              The labels of the sentence or phrase.
        """

        fields: Dict[str, Field] = {}
        if self._segment_sentences:
            sentences: List[Field] = []
            sentence_splits = self._sentence_segmenter.split_sentences(text)
            for sentence in sentence_splits:
                word_tokens = self._tokenizer.tokenize(sentence)
                if self._max_sequence_length is not None:
                    word_tokens = self._truncate(word_tokens)
                sentences.append(TextField(word_tokens, self._token_indexers))
            fields["tokens"] = ListField(sentences)
        else:
            tokens = self._tokenizer.tokenize(text)
            if self._max_sequence_length is not None:
                tokens = self._truncate(tokens)
            fields["tokens"] = TextField(tokens, self._token_indexers)
        if labels is not None:
            fields["labels"] = MultiLabelField(
                labels, skip_indexing=self._skip_label_indexing, num_labels=self._num_labels
            )
        return Instance(fields)
Example #32
    def text_to_instance(self,
                         tokens: List[Token],
                         entities: List = None,
                         relations: List = None) -> Instance:
        sequence = TextField(tokens, self._token_indexers)
        instance_fields: Dict[str, Field] = {"tokens": sequence}
        words = [x.text for x in tokens]
        spans = []
        for start, end in enumerate_spans(words,
                                          max_span_width=self._max_span_width):
            assert start >= 0
            assert end >= 0
            spans.append(SpanField(start, end, sequence))

        span_field = ListField(spans)
        span_tuples = [(span.span_start, span.span_end) for span in spans]
        instance_fields["spans"] = span_field

        ner_labels = [[] for _ in span_tuples]

        ner_list = [((e.start, e.end), e.role) for e in entities]

        for span, label in ner_list:
            if self._too_long(span):
                continue
            ix = span_tuples.index(span)
            # if "" in ner_labels[ix]:
            #     ner_labels[ix].remove("")

            ner_labels[ix] += [label]

        instance_fields["ner_labels"] = ListField([
            MultiLabelField(entry, label_namespace=self.label_namespace)
            for entry in ner_labels
        ])

        metadata = {"words": words, "relations": relations}
        instance_fields["metadata"] = MetadataField(metadata)

        return Instance(instance_fields)
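The span-labelling step above pairs every enumerated span with a (possibly empty) label set; a condensed sketch assuming allennlp's enumerate_spans:

from allennlp.data.dataset_readers.dataset_utils import enumerate_spans

words = ["John", "lives", "here"]
span_tuples = list(enumerate_spans(words, max_span_width=2))
ner_labels = [[] for _ in span_tuples]            # one label list per span
ner_labels[span_tuples.index((0, 0))] += ["PER"]  # label the span "John"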
Example #33
 def test_as_tensor_returns_integer_tensor(self):
     f = MultiLabelField([2, 3], skip_indexing=True, label_namespace="test1", num_labels=5)
     tensor = f.as_tensor(f.get_padding_lengths()).detach().cpu().tolist()
     assert tensor == [0, 0, 1, 1, 0]
     assert set([type(item) for item in tensor]) == set([int])
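Across all of these examples the MultiLabelField lifecycle is the same: construct with labels, index against a Vocabulary, then tensorize into a multi-hot vector. A minimal end-to-end sketch (label names illustrative):

from allennlp.data import Vocabulary
from allennlp.data.fields import MultiLabelField

vocab = Vocabulary()
vocab.add_token_to_namespace("sports", namespace="labels")
vocab.add_token_to_namespace("politics", namespace="labels")

field = MultiLabelField(["sports"], label_namespace="labels")
field.index(vocab)  # resolve label strings to vocabulary indices
tensor = field.as_tensor(field.get_padding_lengths())  # tensor([1., 0.])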