Example 1
    def test_instance_implements_mutable_mapping(self):
        words_field = TextField([Token("hello")], {})
        label_field = LabelField(1, skip_indexing=True)
        instance = Instance({"words": words_field, "labels": label_field})

        assert instance["words"] == words_field
        assert instance["labels"] == label_field
        assert len(instance) == 2

        keys = {k for k, v in instance.items()}
        assert keys == {"words", "labels"}

        values = [v for k, v in instance.items()]
        assert words_field in values
        assert label_field in values
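A minimal, hypothetical sketch of the same dict-like behaviour outside a test, assuming a standard AllenNLP install; the SingleIdTokenIndexer and field names here are illustrative only.

from allennlp.data import Instance
from allennlp.data.fields import TextField, LabelField
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import Token

# Instance behaves like a Mapping over its fields.
words = TextField([Token("hello")], {"tokens": SingleIdTokenIndexer()})
instance = Instance({"words": words})
instance.add_field("labels", LabelField(1, skip_indexing=True))

print(list(instance.keys()))  # ['words', 'labels']
print(len(instance))          # 2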
Example 2
    def text_to_instance(self,
                         comment_text: str,
                         sentiment: int = None) -> Instance:

        tokens = self._tokenizer.tokenize(comment_text)
        if self._max_seq_len is not None:
            tokens = tokens[:self._max_seq_len]

        sentence_field = TextField(tokens, self._token_indexers)
        fields = {"tokens": sentence_field}

        label_field = LabelField(str(sentiment))
        fields["label"] = label_field

        return Instance(fields)
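A hypothetical call to the reader above (`reader` and the review text are illustrative, not part of the original snippet); note that the label is stored as the string form of the sentiment.

instance = reader.text_to_instance("great movie, would watch again", sentiment=1)
print(instance["label"].label)  # '1' -- LabelField stores str(sentiment)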
Example 3
    def text_to_instance(self,
                         tokens: List[Token],
                         id: str = None,
                         labels: np.ndarray = None) -> Instance:
        sentence_field = TextField(tokens, self.token_indexers)
        fields = {"tokens": sentence_field}
        id_field = MetadataField(id)
        fields["id"] = id_field

        if labels is None:
            # label_cols is assumed to be the list of label column names
            # defined elsewhere in the reader's module.
            labels = np.zeros(len(label_cols))
        label_field = ArrayField(array=labels)
        fields["label"] = label_field

        return Instance(fields)
Example 4
    def text_to_instance(self,
                         text: str,
                         hypothesis: str,
                         label: str = None) -> Instance:
        fields = {}
        # Make it look like two instances: [CLS] sent1 [SEP] and [CLS] sent2 [SEP]
        tokens1 = self._tokenizer.tokenize(text)
        tokens2 = self._tokenizer.tokenize(hypothesis)
        fields["tokens1"] = TextField(tokens1, self._token_indexers)
        fields["tokens2"] = TextField(tokens2, self._token_indexers)

        if label is not None:
            fields["labels"] = LabelField(label)

        return Instance(fields)
Example 5
    def _sentences_to_ids(self, sentences):
        indexer = ELMoTokenCharactersIndexer()

        # For each sentence, first create a TextField, then create an instance
        instances = []
        for sentence in sentences:
            tokens = [Token(token) for token in sentence]
            field = TextField(tokens, {'character_ids': indexer})
            instance = Instance({'elmo': field})
            instances.append(instance)

        dataset = Batch(instances)
        vocab = Vocabulary()
        dataset.index_instances(vocab)
        return dataset.as_tensor_dict()['elmo']['character_ids']
Example 6
    def text_to_instance(self, source_tokens: List[Token],
                         target_tokens: List[Token]) -> Optional[Instance]:
        source_tokens.insert(0, Token(START_SYMBOL))
        source_tokens.append(Token(END_SYMBOL))

        target_tokens.insert(0, Token(START_SYMBOL))
        target_tokens.append(Token(END_SYMBOL))

        fields = {
            'source_tokens': TextField(source_tokens, self.token_indexers),
            'target_tokens': TextField(target_tokens,
                                       self.target_token_indexers),
        }

        return Instance(fields)
Example 7
    def test_registrability(self):
        @Vocabulary.register('my-vocabulary')
        class MyVocabulary:
            @classmethod
            def from_params(cls, params, instances=None):
                # pylint: disable=unused-argument
                return MyVocabulary()

        params = Params({'type': 'my-vocabulary'})

        instance = Instance(fields={})

        vocab = Vocabulary.from_params(params=params, instances=[instance])

        assert isinstance(vocab, MyVocabulary)
Example 8
    def predict_batch(self, texts):
        instances = []
        for text in texts:
            tokens = self._tokenizer.tokenize(text)
            instance = Instance(
                {'tokens': TextField(tokens, self._token_indexers)})
            instances.append(instance)

        result = self.model.forward_on_instances(instances)

        results = []
        for instance_result in result:
            results.append(self._format_instance_result(instance_result))

        return results
Example 9
    def toInstance(self,
                   names: List[str],
                   categories: List[str] = None) -> Instance:
        token_field = TextField([Token(nm) for nm in names],
                                self.token_indexers)

        fields = {"tokens": token_field}

        fields["token_characters"] = TextField([Token(nm) for nm in names],
                                               self.char_indexers)

        if categories:
            fields["labels"] = SequenceLabelField(labels=categories,
                                                  sequence_field=token_field)
        return Instance(fields)
Example 10
    def test_registrability(self):
        @Vocabulary.register("my-vocabulary")
        class MyVocabulary:
            @classmethod
            def from_params(cls, params, instances=None):

                return MyVocabulary()

        params = Params({"type": "my-vocabulary"})

        instance = Instance(fields={})

        vocab = Vocabulary.from_params(params=params, instances=[instance])

        assert isinstance(vocab, MyVocabulary)
Example 11
 def get_dataset(self):
     field1 = TextField(
         [Token(t) for t in ["this", "is", "a", "sentence", "."]],
         self.token_indexer)
     field2 = TextField([
         Token(t)
         for t in ["this", "is", "a", "different", "sentence", "."]
     ], self.token_indexer)
     field3 = TextField(
         [Token(t) for t in ["here", "is", "a", "sentence", "."]],
         self.token_indexer)
     field4 = TextField([Token(t) for t in ["this", "is", "short"]],
                        self.token_indexer)
     instances = [
         Instance({
             "text1": field1,
             "text2": field2
         }),
         Instance({
             "text1": field3,
             "text2": field4
         })
     ]
     return Dataset(instances)
Example 12
 def text_to_instance(self, src_seq: Iterable[Token], tgt_seq: str, salience_seq: Iterable[float]) -> Instance:
     indexer = SingleIdTokenIndexer(lowercase_tokens=True)
     tokenized_src = src_seq[:self._source_max_tokens]
     tokenized_tgt = self._tokenizer.tokenize(tgt_seq)[:self._target_max_tokens]
     source_field = TextField(tokenized_src, {'tokens': indexer})
     target_field = TextField(tokenized_tgt, {'tokens': indexer})
     if self._interpolation:
         saliency_field = ArrayField(np.array(self.smooth_and_norm(salience_seq)[:self._source_max_tokens]))
     else:
         saliency_field = ArrayField(np.array(salience_seq[:self._source_max_tokens]))
     return Instance({
         'source_tokens': source_field,
         'target_tokens': target_field,
         'salience_values': saliency_field
     })
Example 13
 def text_to_instance(self, string: str,
                      lang2_lang: str) -> Instance:  # type: ignore
     # pylint: disable=arguments-differ
     """
     Used at prediction time.
     """
     lang_pair = self._undefined_lang_id + '-' + lang2_lang
     tokenized_string = self._tokenizer.tokenize(string)
     string_field = TextField(tokenized_string, self._token_indexers)
     return Instance({
         self._mingler.dataset_name_field:
         MetadataField(lang_pair),
         'lang1_tokens':
         string_field
     })
Example 14
    def test_elmo_bilm(self):
        # get the raw data
        sentences, expected_lm_embeddings = self._load_sentences_embeddings()

        # load the test model
        elmo_bilm = _ElmoBiLm(self.options_file, self.weight_file)

        # Deal with the data.
        indexer = ELMoTokenCharactersIndexer()

        # For each sentence, first create a TextField, then create an instance
        instances = []
        for batch in zip(*sentences):
            for sentence in batch:
                tokens = [Token(token) for token in sentence.split()]
                field = TextField(tokens, {"character_ids": indexer})
                instance = Instance({"elmo": field})
                instances.append(instance)

        vocab = Vocabulary()
        dataset = AllennlpDataset(instances, vocab)
        # Now finally we can iterate through batches.
        loader = DataLoader(dataset, 3)
        for i, batch in enumerate(loader):
            lm_embeddings = elmo_bilm(
                batch["elmo"]["character_ids"]["elmo_tokens"])
            top_layer_embeddings, mask = remove_sentence_boundaries(
                lm_embeddings["activations"][2], lm_embeddings["mask"])

            # check the mask lengths
            lengths = mask.data.numpy().sum(axis=1)
            batch_sentences = [sentences[k][i] for k in range(3)]
            expected_lengths = [
                len(sentence.split()) for sentence in batch_sentences
            ]
            self.assertEqual(lengths.tolist(), expected_lengths)

            # get the expected embeddings and compare!
            expected_top_layer = [
                expected_lm_embeddings[k][i] for k in range(3)
            ]
            for k in range(3):
                self.assertTrue(
                    numpy.allclose(
                        top_layer_embeddings[k, :lengths[k], :].data.numpy(),
                        expected_top_layer[k],
                        atol=1.0e-6,
                    ))
Example 15
    def text_to_instance(
        self,  # type: ignore
        qid: str,
        start: str,
        alternatives: List[str],
        label: Optional[int] = None,
    ) -> Instance:
        # tokenize
        start = self._tokenizer.tokenize(start)

        sequences = []
        for alternative in alternatives:
            alternative = self._tokenizer.tokenize(alternative)
            length_for_start = (self.length_limit - len(alternative) -
                                self._tokenizer.num_special_tokens_for_pair())
            if length_for_start < 0:
                # If the alternative is too long by itself, we take the beginning and add no tokens from the start.
                alternative = alternative[:length_for_start]
                length_for_start = 0
            sequences.append(
                self._tokenizer.add_special_tokens(start[:length_for_start],
                                                   alternative))

        # make fields
        from allennlp.data.fields import TextField

        sequences = [
            TextField(sequence, self._token_indexers) for sequence in sequences
        ]
        from allennlp.data.fields import ListField

        sequences = ListField(sequences)

        from allennlp.data.fields import MetadataField

        fields = {
            "alternatives": sequences,
            "qid": MetadataField(qid),
        }

        if label is not None:
            if label < 0 or label >= len(sequences):
                raise ValueError("Alternative %d does not exist", label)
            from allennlp.data.fields import IndexField

            fields["correct_alternative"] = IndexField(label, sequences)

        return Instance(fields)
Example 16
    def text_to_instance(self,
                         sup_sents: List[str],
                         sup_labels: List[str] = None,
                         ori_unsup_sents = None,
                         aug_unsup_sents = None) -> Instance:
        fields: Dict[str, Field] = {}
        tokenized_sents = [self._tokenizer.tokenize(sent) for sent in sup_sents]
        sentence_sequence = ListField([TextField(tk, self._token_indexers) for tk in tokenized_sents])
        fields['sentences'] = sentence_sequence
        
        if sup_labels is not None:
            fields['labels'] = SequenceLabelField(sup_labels, sentence_sequence)
        
        ori_name = 'ori_unsup_sentences_'
        aug_name = 'aug_unsup_sentences_'

        if ori_unsup_sents is not None and aug_unsup_sents is not None:
            # Create TextField for each in ori_unsup_sentences
            # ori_unsup_tokenized_sents = [self._tokenizer.tokenize(sent) for sent in ori_unsup_sents]
            # ori_unsup_sentence_sequence = ListField([TextField(tk, self._token_indexers) for tk in ori_unsup_tokenized_sents])
            ori_unsup_sentence_sequences = []
            for ori_unsup_sent in ori_unsup_sents:
                ori_unsup_tokenized_sent = [self._tokenizer.tokenize(sent) for sent in ori_unsup_sent]
                ori_unsup_sentence_sequence = ListField([TextField(tk, self._token_indexers) for tk in ori_unsup_tokenized_sent])
                ori_unsup_sentence_sequences.append(ori_unsup_sentence_sequence)

            # Create TextField for aug_unsup_sentences
            # aug_unsup_tokenized_sents = [self._tokenizer.tokenize(sent) for sent in aug_unsup_sents]
            # aug_unsup_sentence_sequence = ListField([TextField(tk, self._token_indexers) for tk in aug_unsup_tokenized_sents])

            aug_unsup_sentence_sequences = []
            for aug_unsup_sent in aug_unsup_sents:
                aug_unsup_tokenized_sent = [self._tokenizer.tokenize(sent) for sent in aug_unsup_sent]
                aug_unsup_sentence_sequence = ListField([TextField(tk, self._token_indexers) for tk in aug_unsup_tokenized_sent])
                aug_unsup_sentence_sequences.append(aug_unsup_sentence_sequence)

            for i in range(len(ori_unsup_sentence_sequences)):
                fields[ori_name + str(i)] = ori_unsup_sentence_sequences[i]
                fields[aug_name + str(i)] = aug_unsup_sentence_sequences[i]

            # This won't work because the AllenNLP iterator requires these fields
            # to be ListFields in order to apply the BERT vocab to them.
            # fields['ori_unsup_sentences'] = ori_unsup_sentence_sequences # List
            # fields['aug_unsup_sentences'] = aug_unsup_sentence_sequences # List
        # Fake data
        # fields['ori_unsup_sentences'] = sentence_sequence
        # fields['aug_unsup_sentences'] = sentence_sequence

        return Instance(fields)
Example 17
    def test_elmo_bilm(self):
        # get the raw data
        sentences, expected_lm_embeddings = self._load_sentences_embeddings()

        # load the test model
        elmo_bilm = _ElmoBiLm(self.options_file, self.weight_file)

        # Deal with the data.
        indexer = ELMoTokenCharactersIndexer()

        # For each sentence, first create a TextField, then create an instance
        instances = []
        for batch in zip(*sentences):
            for sentence in batch:
                tokens = [Token(token) for token in sentence.split()]
                field = TextField(tokens, {'character_ids': indexer})
                instance = Instance({"elmo": field})
                instances.append(instance)

        vocab = Vocabulary()

        # Now finally we can iterate through batches.
        iterator = BasicIterator(3)
        iterator.index_with(vocab)
        for i, batch in enumerate(
                iterator(instances, num_epochs=1, shuffle=False)):
            lm_embeddings = elmo_bilm(batch['elmo']['character_ids'])
            top_layer_embeddings, mask = remove_sentence_boundaries(
                lm_embeddings['activations'][2], lm_embeddings['mask'])

            # check the mask lengths
            lengths = mask.data.numpy().sum(axis=1)
            batch_sentences = [sentences[k][i] for k in range(3)]
            expected_lengths = [
                len(sentence.split()) for sentence in batch_sentences
            ]
            self.assertEqual(lengths.tolist(), expected_lengths)

            # get the expected embeddings and compare!
            expected_top_layer = [
                expected_lm_embeddings[k][i] for k in range(3)
            ]
            for k in range(3):
                self.assertTrue(
                    numpy.allclose(
                        top_layer_embeddings[k, :lengths[k], :].data.numpy(),
                        expected_top_layer[k],
                        atol=1.0e-6))
Example 18
    def text_to_instance(self, mention_uniq_id, data=None) -> Instance:
        l_tokenized = [Token('[CLS]')]
        l_tokenized += [
            Token(split_token)
            for split_token in self.custom_tokenizer_class.tokenize(
                txt=data['l'])
        ][:self.config.max_mention_length]
        l_tokenized.append(Token('[SEP]'))
        r_tokenized = [Token('[CLS]')]
        r_tokenized += [
            Token(split_token)
            for split_token in self.custom_tokenizer_class.tokenize(
                txt=data['r'])
        ][:self.config.max_mention_length]
        r_tokenized.append(Token('[SEP]'))

        l_plus_r = [Token('[CLS]')]
        l_plus_r += [
            Token(split_token)
            for split_token in self.custom_tokenizer_class.tokenize(
                txt=data['l'])
        ][:self.config.max_mention_length]
        l_plus_r += [Token(BOND_TOKEN)]
        l_plus_r += [
            Token(split_token)
            for split_token in self.custom_tokenizer_class.tokenize(
                txt=data['r'])
        ][:self.config.max_mention_length]
        l_plus_r += [Token('[SEP]')]

        context_field = TextField(l_tokenized, self.token_indexers)
        fields = {"l": context_field}
        fields['lev'] = ArrayField(
            np.array(Levenshtein.distance(data['l'], data['r'])))
        # /  (max(len(data['l']), len(data['r'])))))

        fields['r'] = TextField(r_tokenized, self.token_indexers)
        fields['l_plus_r'] = TextField(l_plus_r, self.token_indexers)
        fields['label'] = ArrayField(np.array(data['label']))
        fields['mention_uniq_id'] = ArrayField(np.array(mention_uniq_id))
        fields['subword_match_num'] = ArrayField(
            np.array(
                len(
                    set(self.custom_tokenizer_class.tokenize(txt=data['l']))
                    & set(self.custom_tokenizer_class.tokenize(
                        txt=data['r'])))))

        return Instance(fields)
Example 19
    def test_invalid_vocab_extension(self):
        vocab_dir = self.TEST_DIR / "vocab_save"
        original_vocab = Vocabulary(non_padded_namespaces=["tokens1"])
        original_vocab.add_tokens_to_namespace(["a", "b"], namespace="tokens1")
        original_vocab.add_token_to_namespace("p", namespace="tokens2")
        original_vocab.save_to_files(vocab_dir)
        text_field1 = TextField(
            [Token(t) for t in ["a", "c"]], {"tokens1": SingleIdTokenIndexer("tokens1")}
        )
        text_field2 = TextField(
            [Token(t) for t in ["p", "q", "r"]], {"tokens2": SingleIdTokenIndexer("tokens2")}
        )
        instances = Batch([Instance({"text1": text_field1, "text2": text_field2})])

        # The following should raise an error: tokens1 is non-padded in original_vocab but not in instances
        params = Params(
            {
                "type": "extend",
                "directory": vocab_dir,
                "non_padded_namespaces": [],
                "tokens_to_add": {"tokens1": ["a"], "tokens2": ["p"]},
            }
        )
        with pytest.raises(ConfigurationError):
            _ = Vocabulary.from_params(params, instances=instances)

        # The following should not raise: overlapping namespaces have the same padding setting
        params = Params(
            {
                "type": "extend",
                "directory": vocab_dir,
                "non_padded_namespaces": ["tokens1"],
                "tokens_to_add": {"tokens1": ["a"], "tokens2": ["p"]},
            }
        )
        Vocabulary.from_params(params, instances=instances)

        # The following should raise an error: tokens2 is padded in instances but not in original_vocab
        params = Params(
            {
                "type": "extend",
                "directory": vocab_dir,
                "non_padded_namespaces": ["tokens1", "tokens2"],
                "tokens_to_add": {"tokens1": ["a"], "tokens2": ["p"]},
            }
        )
        with pytest.raises(ConfigurationError):
            _ = Vocabulary.from_params(params, instances=instances)
Example 20
    def text_to_instance(self, prompt: List[List[str]], evidence: List[str],
                         non_evidence: List[str]):

        fields = {
            'comb_evidence':
            TextField([
                Token(x) for x in (['[CLS]'] + prompt[0] + prompt[1] +
                                   prompt[2] + ['[SEP]'] + evidence)
            ], self.token_indexers),
            'comb_non_evidence':
            TextField([
                Token(x) for x in (['[CLS]'] + prompt[0] + prompt[1] +
                                   prompt[2] + ['[SEP]'] + non_evidence)
            ], self.token_indexers)
        }
        return Instance(fields)
Example 21
        def text_to_instance(x: Tuple[str]) -> Instance:
            source_string = x[0]
            target_string = x[1]

            tokenized_source = source_tokenizer.tokenize(source_string)[:source_length_limit]
            tokenized_source.insert(0, Token(START_SYMBOL))
            tokenized_source.append(Token(END_SYMBOL))
            source_field = TextField(tokenized_source, source_token_indexers)

            tokenized_target = target_tokenizer.tokenize(target_string)[:target_length_limit]
            tokenized_target.insert(0, Token(START_SYMBOL))
            tokenized_target.append(Token(END_SYMBOL))
            target_field = TextField(tokenized_target, target_token_indexers)

            return Instance({source_field_name: source_field,
                             target_field_name: target_field})
Example 22
    def get_vocab_and_both_elmo_indexed_ids(batch: List[List[str]]):
        instances = []
        indexer = ELMoTokenCharactersIndexer()
        indexer2 = SingleIdTokenIndexer()
        for sentence in batch:
            tokens = [Token(token) for token in sentence]
            field = TextField(tokens,
                              {'character_ids': indexer,
                               'tokens': indexer2})
            instance = Instance({"elmo": field})
            instances.append(instance)

        dataset = Batch(instances)
        vocab = Vocabulary.from_instances(instances)
        dataset.index_instances(vocab)
        return vocab, dataset.as_tensor_dict()["elmo"]
Example 23
def batch_to_ids(batch):
    """
    Given a batch (as list of tokenized sentences), return a batch
    of padded character ids.
    """
    instances = []
    indexer = ELMoTokenCharactersIndexer()
    for sentence in batch:
        tokens = [Token(token) for token in sentence]
        field = TextField(tokens, {'character_ids': indexer})
        instance = Instance({"elmo": field})
        instances.append(instance)

    dataset = Batch(instances)
    vocab = Vocabulary()
    dataset.index_instances(vocab)
    return dataset.as_tensor_dict()['elmo']['character_ids']
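A quick usage sketch for batch_to_ids, assuming the AllenNLP classes used above are imported; each inner list is an already-tokenized sentence.

character_ids = batch_to_ids([["First", "sentence", "."], ["Another", "one"]])
print(character_ids.shape)  # torch.Size([2, 3, 50]): batch x max tokens x 50 character ids per token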
Example 24
    def text_to_instance(self,
                         tokens: List[Token],
                         tags: List[str] = None,
                         title: str = None) -> Instance:
        tokens_field = TextField(tokens, self.token_indexers)
        fields = {"tokens": tokens_field}

        if tags:
            tags_field = SequenceLabelField(labels=tags,
                                            sequence_field=tokens_field)
            fields["tags"] = tags_field

        if title:
            fields["title"] = MetadataField(title)

        return Instance(fields)
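A hedged example of feeding the reader above a tagged sentence; `reader` and the BIO tags are illustrative, not part of the original snippet.

tokens = [Token(t) for t in ["John", "lives", "in", "Berlin"]]
tags = ["B-PER", "O", "O", "B-LOC"]
instance = reader.text_to_instance(tokens, tags, title="toy example")
print(instance["tags"].labels)  # ['B-PER', 'O', 'O', 'B-LOC'], aligned with the "tokens" field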
Example 25
 def _make_instance(sent_):
     """ Forward targs adds <s> as a target for input </s>
     and bwd targs adds </s> as a target for input <s>
     to avoid issues with needing to strip extra tokens
     in the input for each direction """
     d = {
         "input":
         sentence_to_text_field(sent_, indexers),
         "targs":
         sentence_to_text_field(sent_[1:] + [sent_[0]],
                                self.target_indexer),
         "targs_b":
         sentence_to_text_field([sent_[-1]] + sent_[:-1],
                                self.target_indexer),
     }
     return Instance(d)
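The shifting described in the docstring above, illustrated on a toy sentence (pure Python, no AllenNLP required):

sent = ["<s>", "the", "cat", "</s>"]
fwd_targs = sent[1:] + [sent[0]]     # ['the', 'cat', '</s>', '<s>'] -- predict the next token
bwd_targs = [sent[-1]] + sent[:-1]   # ['</s>', '<s>', 'the', 'cat'] -- predict the previous token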
Example 26
    def predictions_to_labeled_instances(self, instance: Instance,
                                         outputs: Dict[str, numpy.ndarray]):
        new_instance = instance.duplicate()
        token_field: TextField = instance["tokens"]  # type: ignore
        mask_targets = [
            Token(target_top_k_text[0], text_id=target_top_id_id)
            for (target_top_k_text, target_top_id_id
                 ) in zip(outputs["words"], outputs["token_ids"])
        ]

        new_instance.add_field(
            "target_ids",
            TextField(mask_targets, token_field._token_indexers),
            vocab=self._model.vocab,
        )
        return [new_instance]
Example 27
    def text_to_instance(self,
                         tokens: List[Token],
                         instance_id: int = -1,
                         labels: int = 0) -> Instance:
        sentence_field = TextField(tokens, self.token_indexers)

        # skip_indexing=True because the label is already an integer id
        label_field = LabelField(labels, skip_indexing=True)
        id_field = MetadataField(instance_id)

        fields = {
            "tokens": sentence_field,
            'label': label_field,
            "instance_id": id_field
        }

        return Instance(fields)
Example 28
    def text_to_instance(self,
                         tokens: List[Token],
                         label: int = None,
                         id: str = None) -> Instance:
        fields = {}
        sentence_field = TextField(tokens=tokens,
                                   token_indexers=self.token_indexers)
        fields['sentence'] = sentence_field

        id_field = MetadataField(id)
        fields['id'] = id_field

        label_field = LabelField(label=label, skip_indexing=True)
        fields['labels'] = label_field

        return Instance(fields=fields)
Example 29
    def text_to_instance(self, tokens, label):
        """Build text and label field and convert tokens
        to an ``Instance``.

        Arguments:
            tokens {List[str]} -- tokens
            label {str} -- a label

        Returns:
            {Instance} -- a data instance
        """

        sentence_field = TextField(tokens, self.token_indexers)
        label_field = LabelField(label=label)
        fields = {"text": sentence_field, "labels": label_field}
        return Instance(fields)
Example 30
    def text_to_instance(self, tokenized_sentence: List[str],
                         spans: List[List[int]]) -> Instance:
        allennlp_sentence_tokens = [Token(text=t) for t in tokenized_sentence]
        sentence_token_indexes = TextField(allennlp_sentence_tokens,
                                           self._token_indexers)

        span_fields = []
        for span_start, span_end_exclusive in spans:
            span_field = SpanField(span_start, span_end_exclusive - 1,
                                   sentence_token_indexes)
            span_fields.append(span_field)

        fields: Dict[str, Field] = {}
        fields["tokens"] = sentence_token_indexes
        fields["spans"] = ListField(span_fields)
        return Instance(fields)
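A small sketch of the span convention used above: the input spans are end-exclusive, while SpanField expects inclusive end indices (hence the `- 1`).

tokenized_sentence = ["The", "quick", "brown", "fox"]
spans = [[0, 2], [2, 4]]                    # "The quick", "brown fox" (end-exclusive)
inclusive = [(s, e - 1) for s, e in spans]  # [(0, 1), (2, 3)] as stored in SpanField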
Example 31
    def text_to_instance(self, tokens: List[Token], tags: List[str]=None) -> Instance:

        if len(tokens) > self._max_token_len:
            tokens = tokens[:self._max_token_len]
            print(f'Length of tokens exceeded the limit {self._max_token_len}. Truncating...')
            if tags:
                tags = tags[:self._max_token_len]

        fields = {}

        text_field = TextField(tokens, self._token_indexers)
        fields['tokens'] = text_field
        if tags:
            fields['tags'] = SequenceLabelField(tags, text_field)

        return Instance(fields)
Example 32
 def create_instance(self, str_tokens: List[str]):
     tokens = [Token(t) for t in str_tokens]
     instance = Instance({'text': TextField(tokens, self.token_indexers)})
     instance.index_fields(self.vocab)
     return instance