Example #1
    def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        with open(file_path, "r") as text_file:
            instance_strings = text_file.readlines()

        if self._tokens_per_instance is not None:
            all_text = " ".join(
                [x.replace("\n", " ").strip() for x in instance_strings])
            tokenized_text = self._tokenizer.tokenize(all_text)
            num_tokens = self._tokens_per_instance + 1
            tokenized_strings = []
            logger.info("Creating dataset from all text in file: %s",
                        file_path)
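            # Each slice below contains tokens_per_instance + 1 tokens, and the
            # stride is tokens_per_instance, so consecutive slices overlap by
            # exactly one token; the input/output fields yielded below are the
            # same slice shifted by one.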
            for index in Tqdm.tqdm(
                    range(0,
                          len(tokenized_text) - num_tokens, num_tokens - 1)):
                tokenized_strings.append(tokenized_text[index:(index +
                                                               num_tokens)])
        else:
            tokenized_strings = [
                self._tokenizer.tokenize(s) for s in instance_strings
            ]

        for tokenized_string in tokenized_strings:
            input_field = TextField(tokenized_string[:-1],
                                    self._token_indexers)
            output_field = TextField(tokenized_string[1:],
                                     self._output_indexer)
            yield Instance({
                'input_tokens': input_field,
                'output_tokens': output_field
            })
Example #2
    def setUp(self):
        super().setUp()
        token_indexer = {"tokens": SingleIdTokenIndexer()}

        field1 = TextField(
            [Token(t) for t in ["this", "is", "a", "sentence", "."]],
            token_indexer)
        field2 = TextField([
            Token(t)
            for t in ["this", "is", "a", "different", "sentence", "."]
        ], token_indexer)
        field3 = TextField(
            [Token(t) for t in ["here", "is", "a", "sentence", "."]],
            token_indexer)
        field4 = TextField([Token(t) for t in ["this", "is", "short"]],
                           token_indexer)
        self.instances = [
            Instance({
                "text1": field1,
                "text2": field2
            }),
            Instance({
                "text1": field3,
                "text2": field4
            })
        ]
Example #3
 def text_to_instance(self, sentence: str) -> Instance:  # type: ignore
     # pylint: disable=arguments-differ
     tokenized_string = self._tokenizer.tokenize(sentence)
     input_field = TextField(tokenized_string[:-1], self._token_indexers)
     output_field = TextField(tokenized_string[1:], self._output_indexer)
     return Instance({
         'input_tokens': input_field,
         'output_tokens': output_field
     })
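
The one-token shift above is the standard language-modeling setup: every input token is paired with the token that follows it. A minimal illustration with plain token texts (tokenizer and indexers omitted):

    tokens = ["the", "cat", "sat"]
    input_tokens = tokens[:-1]   # ["the", "cat"]
    output_tokens = tokens[1:]   # ["cat", "sat"]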
Example #4
 def get_instances(self):
     field1 = TextField([Token(t) for t in ["this", "is", "a", "sentence", "."]],
                        self.token_indexer)
     field2 = TextField([Token(t) for t in ["this", "is", "a", "different", "sentence", "."]],
                        self.token_indexer)
     field3 = TextField([Token(t) for t in ["here", "is", "a", "sentence", "."]],
                        self.token_indexer)
     field4 = TextField([Token(t) for t in ["this", "is", "short"]],
                        self.token_indexer)
     instances = [Instance({"text1": field1, "text2": field2}),
                  Instance({"text1": field3, "text2": field4})]
     return instances
Example #5
    def test_invalid_vocab_extension(self):
        vocab_dir = self.TEST_DIR / 'vocab_save'
        original_vocab = Vocabulary(non_padded_namespaces=["tokens1"])
        original_vocab.add_token_to_namespace("a", namespace="tokens1")
        original_vocab.add_token_to_namespace("b", namespace="tokens1")
        original_vocab.add_token_to_namespace("p", namespace="tokens2")
        original_vocab.save_to_files(vocab_dir)
        text_field1 = TextField([Token(t) for t in ["a", "c"]],
                                {"tokens1": SingleIdTokenIndexer("tokens1")})
        text_field2 = TextField([Token(t) for t in ["p", "q", "r"]],
                                {"tokens2": SingleIdTokenIndexer("tokens2")})
        instances = Batch([Instance({"text1": text_field1, "text2": text_field2})])

        # The following three should give an error: tokens1 is non-padded in
        # original_vocab but padded in the extension
        params = Params({"directory_path": vocab_dir, "extend": True,
                         "non_padded_namespaces": []})
        with pytest.raises(ConfigurationError):
            _ = Vocabulary.from_params(params, instances)
        with pytest.raises(ConfigurationError):
            extended_vocab = copy.copy(original_vocab)
            params = Params({"non_padded_namespaces": []})
            extended_vocab.extend_from_instances(params, instances)
        with pytest.raises(ConfigurationError):
            extended_vocab = copy.copy(original_vocab)
            extended_vocab._extend(non_padded_namespaces=[],
                                   tokens_to_add={"tokens1": ["a"], "tokens2": ["p"]})

        # The following should not give an error: the overlapping namespaces have
        # the same padding setting
        params = Params({"directory_path": vocab_dir, "extend": True,
                         "non_padded_namespaces": ["tokens1"]})
        Vocabulary.from_params(params, instances)
        extended_vocab = copy.copy(original_vocab)
        params = Params({"non_padded_namespaces": ["tokens1"]})
        extended_vocab.extend_from_instances(params, instances)
        extended_vocab = copy.copy(original_vocab)
        extended_vocab._extend(non_padded_namespaces=["tokens1"],
                               tokens_to_add={"tokens1": ["a"], "tokens2": ["p"]})

        # The following three should give an error: tokens2 is padded in
        # original_vocab but non-padded in the extension
        params = Params({"directory_path": vocab_dir, "extend": True,
                         "non_padded_namespaces": ["tokens1", "tokens2"]})
        with pytest.raises(ConfigurationError):
            _ = Vocabulary.from_params(params, instances)
        with pytest.raises(ConfigurationError):
            extended_vocab = copy.copy(original_vocab)
            params = Params({"non_padded_namespaces": ["tokens1", "tokens2"]})
            extended_vocab.extend_from_instances(params, instances)
        with pytest.raises(ConfigurationError):
            extended_vocab = copy.copy(original_vocab)
            extended_vocab._extend(non_padded_namespaces=["tokens1", "tokens2"],
                                   tokens_to_add={"tokens1": ["a"], "tokens2": ["p"]})
Example #6
    def text_to_instance(
            self,  # type: ignore
            premise: str,
            hypothesis: str,
            label: str = None) -> Instance:
        # pylint: disable=arguments-differ
        fields: Dict[str, Field] = {}
        tokenized_premise = self._tokenizer.tokenize(premise)
        tokenized_hypothesis = self._tokenizer.tokenize(hypothesis)
        fields["premise"] = TextField(tokenized_premise, self._token_indexers)
        fields["hypothesis"] = TextField(tokenized_hypothesis,
                                         self._token_indexers)
        if label is not None:
            fields['label'] = LabelField(label)

        return Instance(fields)
Example #7
    def text_to_instance(
            self,  # type: ignore
            tokens: List[Token],
            pos_tags: List[str] = None,
            chunk_tags: List[str] = None,
            ner_tags: List[str] = None) -> Instance:
        """
        We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
        """
        # pylint: disable=arguments-differ
        sequence = TextField(tokens, self._token_indexers)
        instance_fields: Dict[str, Field] = {'tokens': sequence}
        instance_fields["metadata"] = MetadataField(
            {"words": [x.text for x in tokens]})

        # Recode the labels if necessary.
        if self.coding_scheme == "BIOUL":
            coded_chunks = to_bioul(
                chunk_tags) if chunk_tags is not None else None
            coded_ner = to_bioul(ner_tags) if ner_tags is not None else None
        else:
            # the default IOB1
            coded_chunks = chunk_tags
            coded_ner = ner_tags

        # Add "feature labels" to instance
        if 'pos' in self.feature_labels:
            if pos_tags is None:
                raise ConfigurationError(
                    "Dataset reader was specified to use pos_tags as "
                    "features. Pass them to text_to_instance.")
            instance_fields['pos_tags'] = SequenceLabelField(
                pos_tags, sequence, "pos_tags")
        if 'chunk' in self.feature_labels:
            if coded_chunks is None:
                raise ConfigurationError(
                    "Dataset reader was specified to use chunk tags as "
                    "features. Pass them to text_to_instance.")
            instance_fields['chunk_tags'] = SequenceLabelField(
                coded_chunks, sequence, "chunk_tags")
        if 'ner' in self.feature_labels:
            if coded_ner is None:
                raise ConfigurationError(
                    "Dataset reader was specified to use NER tags as "
                    " features. Pass them to text_to_instance.")
            instance_fields['ner_tags'] = SequenceLabelField(
                coded_ner, sequence, "ner_tags")

        # Add "tag label" to instance
        if self.tag_label == 'ner' and coded_ner is not None:
            instance_fields['tags'] = SequenceLabelField(
                coded_ner, sequence, self.label_namespace)
        elif self.tag_label == 'pos' and pos_tags is not None:
            instance_fields['tags'] = SequenceLabelField(
                pos_tags, sequence, self.label_namespace)
        elif self.tag_label == 'chunk' and coded_chunks is not None:
            instance_fields['tags'] = SequenceLabelField(
                coded_chunks, sequence, self.label_namespace)

        return Instance(instance_fields)
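
The BIOUL recoding above uses AllenNLP's ``to_bioul`` helper. A hedged sketch of its behavior (import path as in AllenNLP 0.x; the tag sequence is illustrative):

    from allennlp.data.dataset_readers.dataset_utils import to_bioul

    # IOB1 tags recoded to BIOUL: a two-token entity becomes B-/L-,
    # and a single-token entity becomes U-.
    print(to_bioul(["I-ORG", "I-ORG", "O", "I-PER"]))
    # ["B-ORG", "L-ORG", "O", "U-PER"]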
Example #8
def batch_to_ids(batch: List[List[str]]) -> torch.Tensor:
    """
    Converts a batch of tokenized sentences to a tensor representing the sentences with encoded
    characters, of shape (len(batch), max sentence length, max word length).

    Parameters
    ----------
    batch : ``List[List[str]]``, required
        A list of tokenized sentences.

    Returns
    -------
        A tensor of padded character ids.
    """
    instances = []
    indexer = ELMoTokenCharactersIndexer()
    for sentence in batch:
        tokens = [Token(token) for token in sentence]
        field = TextField(tokens, {'character_ids': indexer})
        instance = Instance({"elmo": field})
        instances.append(instance)

    dataset = Batch(instances)
    vocab = Vocabulary()
    dataset.index_instances(vocab)
    return dataset.as_tensor_dict()['elmo']['character_ids']
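
A hedged usage sketch (the sentences are illustrative; ELMo's character indexer pads every word to a fixed width of 50 character ids):

    character_ids = batch_to_ids([["I", "like", "dogs"], ["Hello"]])
    print(character_ids.shape)  # torch.Size([2, 3, 50])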
Example #9
 def setUp(self):
     token_indexer = SingleIdTokenIndexer("tokens")
     text_field = TextField([Token(t) for t in ["a", "a", "a", "a", "b", "b", "c", "c", "c"]],
                            {"tokens": token_indexer})
     self.instance = Instance({"text": text_field})
     self.dataset = Batch([self.instance])
     super(TestVocabulary, self).setUp()
Example #10
    def text_to_instance(
            self,  # type: ignore
            tokens: List[Token],
            verb_label: List[int],
            tags: List[str] = None) -> Instance:
        """
        We take `pre-tokenized` input here, along with a verb label.  The verb label should be a
        one-hot binary vector, the same length as the tokens, indicating the position of the verb
        to find arguments for.
        """
        # pylint: disable=arguments-differ
        fields: Dict[str, Field] = {}
        text_field = TextField(tokens, token_indexers=self._token_indexers)
        fields['tokens'] = text_field
        fields['verb_indicator'] = SequenceLabelField(verb_label, text_field)
        if tags:
            fields['tags'] = SequenceLabelField(tags, text_field)

        if all([x == 0 for x in verb_label]):
            verb = None
        else:
            verb = tokens[verb_label.index(1)].text
        fields["metadata"] = MetadataField({
            "words": [x.text for x in tokens],
            "verb": verb
        })
        return Instance(fields)
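
The metadata block above recovers the predicate from the one-hot ``verb_label``. A runnable illustration of just that logic:

    tokens = ["The", "dog", "barked"]
    verb_label = [0, 0, 1]
    verb = None if all(x == 0 for x in verb_label) else tokens[verb_label.index(1)]
    # verb == "barked"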
Example #11
    def text_to_instance(
            self,  # type: ignore
            tokens: List[str],
            ccg_categories: List[str] = None,
            original_pos_tags: List[str] = None,
            modified_pos_tags: List[str] = None,
            predicate_arg_categories: List[str] = None) -> Instance:
        """
        We take `pre-tokenized` input here, because we don't have a tokenizer in this class.

        Parameters
        ----------
        tokens : ``List[str]``, required.
            The tokens in a given sentence.
        ccg_categories : ``List[str]``, optional, (default = None).
            The CCG categories for the words in the sentence. (e.g. N/N)
        original_pos_tags : ``List[str]``, optional, (default = None).
            The tag assigned to the word in the Penn Treebank.
        modified_pos_tags : ``List[str]``, optional, (default = None).
            The POS tag might have changed during the translation to CCG.
        predicate_arg_categories : ``List[str]``, optional, (default = None).
            Encodes the word-word dependencies in the underlying predicate-
            argument structure.

        Returns
        -------
        An ``Instance`` containing the following fields:
            tokens : ``TextField``
                The tokens in the sentence.
            ccg_categories : ``SequenceLabelField``
                The CCG categories (only if supplied)
            original_pos_tags : ``SequenceLabelField``
                Original POS tag (only if supplied)
            modified_pos_tags : ``SequenceLabelField``
                Modified POS tag (only if supplied)
            predicate_arg_categories : ``SequenceLabelField``
                Predicate-argument categories (only if supplied)
        """
        # pylint: disable=arguments-differ
        text_field = TextField([Token(x) for x in tokens],
                               token_indexers=self._token_indexers)
        fields: Dict[str, Field] = {"tokens": text_field}

        for field_name, labels in (('ccg_categories', ccg_categories),
                                   ('original_pos_tags', original_pos_tags),
                                   ('modified_pos_tags', modified_pos_tags),
                                   ('predicate_arg_categories',
                                    predicate_arg_categories)):
            if labels is not None:
                # End the namespace with "labels" so the Vocabulary treats it as
                # non-padded and doesn't add PAD and UNK tokens to it.
                namespace = self._label_namespace_prefix + field_name + '_labels'
                fields[field_name] = SequenceLabelField(
                    labels, text_field, namespace)

        return Instance(fields)
Example #12
 def text_to_instance(self, tokens: List[Token], tags: List[str] = None) -> Instance:  # type: ignore
     """
     We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
     """
     # pylint: disable=arguments-differ
     fields: Dict[str, Field] = {}
     sequence = TextField(tokens, self._token_indexers)
     fields["tokens"] = sequence
     fields["metadata"] = MetadataField({"words": [x.text for x in tokens]})
     if tags is not None:
         fields["tags"] = SequenceLabelField(tags, sequence)
     return Instance(fields)
Example #13
    def text_to_instance(
            self,  # type: ignore
            premise: str,
            hypothesis: str,
            label: str = None) -> Instance:
        # pylint: disable=arguments-differ
        fields: Dict[str, Field] = {}
        premise_tokens = self._tokenizer.tokenize(premise)
        hypothesis_tokens = self._tokenizer.tokenize(hypothesis)
        fields['premise'] = TextField(premise_tokens, self._token_indexers)
        fields['hypothesis'] = TextField(hypothesis_tokens,
                                         self._token_indexers)
        if label:
            fields['label'] = LabelField(label)

        metadata = {
            "premise_tokens": [x.text for x in premise_tokens],
            "hypothesis_tokens": [x.text for x in hypothesis_tokens]
        }
        fields["metadata"] = MetadataField(metadata)
        return Instance(fields)
Example #14
 def text_to_instance(
         self,
         source_string: str,
         target_string: str = None) -> Instance:  # type: ignore
     # pylint: disable=arguments-differ
     tokenized_source = self._source_tokenizer.tokenize(source_string)
     if self._source_add_start_token:
         tokenized_source.insert(0, Token(START_SYMBOL))
     tokenized_source.append(Token(END_SYMBOL))
     source_field = TextField(tokenized_source, self._source_token_indexers)
     if target_string is not None:
         tokenized_target = self._target_tokenizer.tokenize(target_string)
         tokenized_target.insert(0, Token(START_SYMBOL))
         tokenized_target.append(Token(END_SYMBOL))
         target_field = TextField(tokenized_target,
                                  self._target_token_indexers)
         return Instance({
             "source_tokens": source_field,
             "target_tokens": target_field
         })
     else:
         return Instance({'source_tokens': source_field})
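
A hedged sketch of the sentinel wrapping (START_SYMBOL and END_SYMBOL come from ``allennlp.common.util``; in 0.x their values are "@start@" and "@end@"):

    from allennlp.common.util import START_SYMBOL, END_SYMBOL

    tokenized = ["look", "up"]
    tokenized = [START_SYMBOL] + tokenized + [END_SYMBOL]
    # ["@start@", "look", "up", "@end@"]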
Example #15
 def test_as_tensor_handles_words(self):
     field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]],
                       token_indexers={"words": SingleIdTokenIndexer("words")})
     field.index(self.vocab)
     padding_lengths = field.get_padding_lengths()
     tensor_dict = field.as_tensor(padding_lengths)
     numpy.testing.assert_array_almost_equal(tensor_dict["words"].detach().cpu().numpy(),
                                             numpy.array([1, 1, 1, 2, 1]))
Example #16
    def setUp(self):
        self.vocab = Vocabulary()
        self.vocab.add_token_to_namespace("this", "words")
        self.vocab.add_token_to_namespace("is", "words")
        self.vocab.add_token_to_namespace("a", "words")
        self.vocab.add_token_to_namespace("sentence", 'words')
        self.vocab.add_token_to_namespace("s", 'characters')
        self.vocab.add_token_to_namespace("e", 'characters')
        self.vocab.add_token_to_namespace("n", 'characters')
        self.vocab.add_token_to_namespace("t", 'characters')
        self.vocab.add_token_to_namespace("c", 'characters')
        for label in ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k']:
            self.vocab.add_token_to_namespace(label, 'labels')

        self.word_indexer = {"words": SingleIdTokenIndexer("words")}
        self.words_and_characters_indexers = {
            "words": SingleIdTokenIndexer("words"),
            "characters": TokenCharactersIndexer("characters")
        }
        self.field1 = TextField(
            [Token(t) for t in ["this", "is", "a", "sentence"]],
            self.word_indexer)
        self.field2 = TextField(
            [Token(t) for t in ["this", "is", "a", "different", "sentence"]],
            self.word_indexer)
        self.field3 = TextField(
            [Token(t) for t in ["this", "is", "another", "sentence"]],
            self.word_indexer)

        self.empty_text_field = self.field1.empty_field()
        self.index_field = IndexField(1, self.field1)
        self.empty_index_field = self.index_field.empty_field()
        self.sequence_label_field = SequenceLabelField([1, 1, 0, 1],
                                                       self.field1)
        self.empty_sequence_label_field = self.sequence_label_field.empty_field()

        super(TestListField, self).setUp()
Example #17
    def test_max_vocab_size_partial_dict(self):
        indexers = {"tokens": SingleIdTokenIndexer(), "token_characters": TokenCharactersIndexer()}
        instance = Instance({
                'text': TextField([Token(w) for w in 'Abc def ghi jkl mno pqr stu vwx yz'.split(' ')], indexers)
        })
        dataset = Batch([instance])
        params = Params({
                "max_vocab_size": {
                        "tokens": 1
                }
        })

        vocab = Vocabulary.from_params(params=params, instances=dataset)
        assert len(vocab.get_index_to_token_vocabulary("tokens").values()) == 3  # 1 kept token + padding + OOV
        assert len(vocab.get_index_to_token_vocabulary("token_characters").values()) == 28  # 26 characters + padding + OOV
Example #18
    def _sentences_to_ids(self, sentences):
        indexer = ELMoTokenCharactersIndexer()

        # For each sentence, first create a TextField, then create an instance
        instances = []
        for sentence in sentences:
            tokens = [Token(token) for token in sentence]
            field = TextField(tokens, {'character_ids': indexer})
            instance = Instance({'elmo': field})
            instances.append(instance)

        dataset = Batch(instances)
        vocab = Vocabulary()
        dataset.index_instances(vocab)
        return dataset.as_tensor_dict()['elmo']['character_ids']
Example #19
    def test_elmo_bilm(self):
        # get the raw data
        sentences, expected_lm_embeddings = self._load_sentences_embeddings()

        # load the test model
        elmo_bilm = _ElmoBiLm(self.options_file, self.weight_file)

        # Deal with the data.
        indexer = ELMoTokenCharactersIndexer()

        # For each sentence, first create a TextField, then create an instance
        instances = []
        for batch in zip(*sentences):
            for sentence in batch:
                tokens = [Token(token) for token in sentence.split()]
                field = TextField(tokens, {'character_ids': indexer})
                instance = Instance({"elmo": field})
                instances.append(instance)

        vocab = Vocabulary()

        # Now finally we can iterate through batches.
        iterator = BasicIterator(3)
        iterator.index_with(vocab)
        for i, batch in enumerate(
                iterator(instances, num_epochs=1, shuffle=False)):
            lm_embeddings = elmo_bilm(batch['elmo']['character_ids'])
            top_layer_embeddings, mask = remove_sentence_boundaries(
                lm_embeddings['activations'][2], lm_embeddings['mask'])

            # check the mask lengths
            lengths = mask.data.numpy().sum(axis=1)
            batch_sentences = [sentences[k][i] for k in range(3)]
            expected_lengths = [
                len(sentence.split()) for sentence in batch_sentences
            ]
            self.assertEqual(lengths.tolist(), expected_lengths)

            # get the expected embeddings and compare!
            expected_top_layer = [
                expected_lm_embeddings[k][i] for k in range(3)
            ]
            for k in range(3):
                self.assertTrue(
                    numpy.allclose(
                        top_layer_embeddings[k, :lengths[k], :].data.numpy(),
                        expected_top_layer[k],
                        atol=1.0e-6))
Example #20
    def text_to_instance(self,
                         tokens: List[str],
                         sentiment: str = None) -> Instance:  # type: ignore
        """
        We take `pre-tokenized` input here, because we don't have a tokenizer in this class.

        Parameters
        ----------
        tokens : ``List[str]``, required.
            The tokens in a given sentence.
        sentiment : ``str``, optional, (default = None).
            The sentiment for this sentence.

        Returns
        -------
        An ``Instance`` containing the following fields:
            tokens : ``TextField``
                The tokens in the sentence or phrase.
            label : ``LabelField``
                The sentiment label of the sentence or phrase.
        """
        # pylint: disable=arguments-differ
        text_field = TextField([Token(x) for x in tokens],
                               token_indexers=self._token_indexers)
        fields: Dict[str, Field] = {"tokens": text_field}
        if sentiment is not None:
            # 0 and 1 are negative sentiment, 2 is neutral, and 3 and 4 are positive sentiment
            # In 5-class, we use labels as is.
            # 3-class reduces the granularity, and only asks the model to predict
            # negative, neutral, or positive.
            # 2-class further reduces the granularity by only asking the model to
            # predict whether an instance is negative or positive.
            if self._granularity == "3-class":
                if int(sentiment) < 2:
                    sentiment = "0"
                elif int(sentiment) == 2:
                    sentiment = "1"
                else:
                    sentiment = "2"
            elif self._granularity == "2-class":
                if int(sentiment) < 2:
                    sentiment = "0"
                elif int(sentiment) == 2:
                    return None
                else:
                    sentiment = "1"
            fields['label'] = LabelField(sentiment)
        return Instance(fields)
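
A runnable sketch mirroring the 3-class branch above (the 2-class mapping is analogous, except neutral instances are dropped):

    def three_class(sentiment: str) -> str:
        value = int(sentiment)
        if value < 2:
            return "0"  # negative
        if value == 2:
            return "1"  # neutral
        return "2"      # positive

    assert [three_class(s) for s in "01234"] == ["0", "0", "1", "2", "2"]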
Example #21
    def get_vocab_and_both_elmo_indexed_ids(batch: List[List[str]]):
        instances = []
        indexer = ELMoTokenCharactersIndexer()
        indexer2 = SingleIdTokenIndexer()
        for sentence in batch:
            tokens = [Token(token) for token in sentence]
            field = TextField(tokens, {
                'character_ids': indexer,
                'tokens': indexer2
            })
            instance = Instance({"elmo": field})
            instances.append(instance)

        dataset = Batch(instances)
        vocab = Vocabulary.from_instances(instances)
        dataset.index_instances(vocab)
        return vocab, dataset.as_tensor_dict()["elmo"]
Example #22
 def test_as_tensor_handles_characters(self):
     field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]],
                       token_indexers={"characters": TokenCharactersIndexer("characters")})
     field.index(self.vocab)
     padding_lengths = field.get_padding_lengths()
     tensor_dict = field.as_tensor(padding_lengths)
     expected_character_array = numpy.array([[1, 1, 1, 3, 0, 0, 0, 0],
                                             [1, 3, 0, 0, 0, 0, 0, 0],
                                             [1, 0, 0, 0, 0, 0, 0, 0],
                                             [3, 4, 5, 6, 4, 5, 7, 4],
                                             [1, 0, 0, 0, 0, 0, 0, 0]])
     numpy.testing.assert_array_almost_equal(tensor_dict["characters"].detach().cpu().numpy(),
                                             expected_character_array)
Example #23
    def text_to_instance(
            self,  # type: ignore
            words: List[str],
            upos_tags: List[str],
            dependencies: List[Tuple[str, int]] = None) -> Instance:
        # pylint: disable=arguments-differ
        """
        Parameters
        ----------
        words : ``List[str]``, required.
            The words in the sentence to be encoded.
        upos_tags : ``List[str]``, required.
            The universal dependencies POS tags for each word.
        dependencies : ``List[Tuple[str, int]]``, optional (default = None)
            A list of (head tag, head index) tuples. Indices are 1-indexed,
            meaning an index of 0 corresponds to that word being the root of
            the dependency tree.

        Returns
        -------
        An instance containing words, upos tags, dependency head tags and head
        indices as fields.
        """
        fields: Dict[str, Field] = {}

        tokens = TextField([Token(w) for w in words], self._token_indexers)
        fields["words"] = tokens
        fields["pos_tags"] = SequenceLabelField(upos_tags,
                                                tokens,
                                                label_namespace="pos")
        if dependencies is not None:
            # We don't want to expand the label namespace with an additional dummy token, so we'll
            # always give the 'ROOT_HEAD' token a label of 'root'.
            fields["head_tags"] = SequenceLabelField(
                [x[0] for x in dependencies],
                tokens,
                label_namespace="head_tags")
            fields["head_indices"] = SequenceLabelField(
                [int(x[1]) for x in dependencies],
                tokens,
                label_namespace="head_index_tags")

        fields["metadata"] = MetadataField({"words": words, "pos": upos_tags})
        return Instance(fields)
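
A hedged illustration of the expected ``dependencies`` format (the parse and relation names are illustrative):

    words = ["Economic", "news", "rattled", "markets"]
    upos_tags = ["ADJ", "NOUN", "VERB", "NOUN"]
    # (head tag, 1-indexed head position); a head index of 0 marks the root.
    dependencies = [("amod", 2), ("nsubj", 3), ("root", 0), ("obj", 3)]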
Example #24
    def test_saving_and_loading_works_with_byte_encoding(self):
        # We're going to set a vocabulary from a TextField using byte encoding, index it, save the
        # vocab, load the vocab, then index the text field again, and make sure we get the same
        # result.
        tokenizer = CharacterTokenizer(byte_encoding='utf-8')
        token_indexer = TokenCharactersIndexer(character_tokenizer=tokenizer)
        tokens = [Token(t) for t in ["Øyvind", "für", "汉字"]]
        text_field = TextField(tokens, {"characters": token_indexer})
        dataset = Batch([Instance({"sentence": text_field})])
        vocab = Vocabulary.from_instances(dataset)
        text_field.index(vocab)
        indexed_tokens = deepcopy(text_field._indexed_tokens)  # pylint: disable=protected-access

        vocab_dir = self.TEST_DIR / 'vocab_save'
        vocab.save_to_files(vocab_dir)
        vocab2 = Vocabulary.from_files(vocab_dir)
        text_field2 = TextField(tokens, {"characters": token_indexer})
        text_field2.index(vocab2)
        indexed_tokens2 = deepcopy(text_field2._indexed_tokens)  # pylint: disable=protected-access
        assert indexed_tokens == indexed_tokens2
Example #25
    def test_from_params_extend_config(self):

        vocab_dir = self.TEST_DIR / 'vocab_save'
        original_vocab = Vocabulary(non_padded_namespaces=["tokens"])
        original_vocab.add_token_to_namespace("a", namespace="tokens")
        original_vocab.save_to_files(vocab_dir)

        text_field = TextField([Token(t) for t in ["a", "b"]],
                               {"tokens": SingleIdTokenIndexer("tokens")})
        instances = Batch([Instance({"text": text_field})])

        # If you ask to extend vocab from `directory_path`, instances must be passed
        # in Vocabulary constructor, or else there is nothing to extend to.
        params = Params({"directory_path": vocab_dir, "extend": True})
        with pytest.raises(ConfigurationError):
            _ = Vocabulary.from_params(params)

        # If you ask to extend vocab, `directory_path` key must be present in params,
        # or else there is nothing to extend from.
        params = Params({"extend": True})
        with pytest.raises(ConfigurationError):
            _ = Vocabulary.from_params(params, instances)
Example #26
    def test_as_tensor_handles_words_and_characters_with_longer_lengths(self):
        field = TextField([Token(t) for t in ["a", "sentence", "."]],
                          token_indexers={"words": SingleIdTokenIndexer("words"),
                                          "characters": TokenCharactersIndexer("characters")})
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        padding_lengths["num_tokens"] = 5
        padding_lengths["num_token_characters"] = 10
        tensor_dict = field.as_tensor(padding_lengths)

        numpy.testing.assert_array_almost_equal(tensor_dict["words"].detach().cpu().numpy(),
                                                numpy.array([1, 2, 1, 0, 0]))
        numpy.testing.assert_array_almost_equal(tensor_dict["characters"].detach().cpu().numpy(),
                                                numpy.array([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                                                             [3, 4, 5, 6, 4, 5, 7, 4, 0, 0],
                                                             [1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                                                             [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                                                             [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]))
Example #27
 def test_token_embedder_returns_dict(self):
     field = TextField([Token(t) for t in ["A", "sentence"]],
                       token_indexers={"field_with_dict": DictReturningTokenIndexer(),
                                       "words": SingleIdTokenIndexer("words"),
                                       "characters": TokenCharactersIndexer("characters")})
     field.index(self.vocab)
     padding_lengths = field.get_padding_lengths()
     assert padding_lengths == {
             'token_ids': 5,
             'additional_key': 2,
             'words': 2,
             'characters': 2,
             'num_token_characters': 8
     }
     padding_lengths['token_ids'] = 7
     padding_lengths['additional_key'] = 3
     padding_lengths['words'] = 4
     padding_lengths['characters'] = 4
     tensors = field.as_tensor(padding_lengths)
     assert list(tensors['token_ids'].shape) == [7]
     assert list(tensors['additional_key'].shape) == [3]
     assert list(tensors['words'].shape) == [4]
     assert list(tensors['characters'].shape) == [4, 8]
Example #28
    def text_to_instance(
            self,  # type: ignore
            tokens: List[str],
            pos_tags: List[str] = None,
            gold_tree: Tree = None) -> Instance:
        """
        We take `pre-tokenized` input here, because we don't have a tokenizer in this class.

        Parameters
        ----------
        tokens : ``List[str]``, required.
            The tokens in a given sentence.
        pos_tags : ``List[str]``, optional, (default = None).
            The POS tags for the words in the sentence.
        gold_tree : ``Tree``, optional (default = None).
            The gold parse tree to create span labels from.

        Returns
        -------
        An ``Instance`` containing the following fields:
            tokens : ``TextField``
                The tokens in the sentence.
            pos_tags : ``SequenceLabelField``
                The POS tags of the words in the sentence.
                Only returned if ``use_pos_tags`` is ``True``
            spans : ``ListField[SpanField]``
                A ListField containing all possible subspans of the
                sentence.
            span_labels : ``SequenceLabelField``, optional.
                The constituency tags for each of the possible spans, with
                respect to a gold parse tree. If a span is not contained
                within the tree, a span will have a ``NO-LABEL`` label.
            gold_tree : ``MetadataField(Tree)``
                The gold NLTK parse tree for use in evaluation.
        """
        # pylint: disable=arguments-differ
        text_field = TextField([Token(x) for x in tokens],
                               token_indexers=self._token_indexers)
        fields: Dict[str, Field] = {"tokens": text_field}

        pos_namespace = self._label_namespace_prefix + self._pos_label_namespace
        if self._use_pos_tags and pos_tags is not None:
            pos_tag_field = SequenceLabelField(pos_tags,
                                               text_field,
                                               label_namespace=pos_namespace)
            fields["pos_tags"] = pos_tag_field
        elif self._use_pos_tags:
            raise ConfigurationError(
                "use_pos_tags was set to True but no gold pos"
                " tags were passed to the dataset reader.")
        spans: List[Field] = []
        gold_labels = []

        if gold_tree is not None:
            gold_spans: Dict[Tuple[int, int], str] = {}
            self._get_gold_spans(gold_tree, 0, gold_spans)
        else:
            gold_spans = None
        for start, end in enumerate_spans(tokens):
            spans.append(SpanField(start, end, text_field))

            if gold_spans is not None:
                if (start, end) in gold_spans.keys():
                    gold_labels.append(gold_spans[(start, end)])
                else:
                    gold_labels.append("NO-LABEL")

        metadata = {"tokens": tokens}
        if gold_tree:
            metadata["gold_tree"] = gold_tree
        if self._use_pos_tags:
            metadata["pos_tags"] = pos_tags

        fields["metadata"] = MetadataField(metadata)

        span_list_field: ListField = ListField(spans)
        fields["spans"] = span_list_field
        if gold_tree is not None:
            fields["span_labels"] = SequenceLabelField(
                gold_labels,
                span_list_field,
                label_namespace=self._label_namespace_prefix + "labels")
        return Instance(fields)
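
``enumerate_spans`` yields every inclusive (start, end) token span, which is why the ``NO-LABEL`` padding above is needed for spans outside the gold tree. A hedged sketch (import path as in AllenNLP 0.x):

    from allennlp.data.dataset_readers.dataset_utils import enumerate_spans

    print(list(enumerate_spans(["a", "b", "c"])))
    # [(0, 0), (0, 1), (0, 2), (1, 1), (1, 2), (2, 2)]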
Example #29
    def test_field_counts_vocab_items_correctly(self):
        field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]],
                          token_indexers={"words": SingleIdTokenIndexer("words")})
        namespace_token_counts = defaultdict(lambda: defaultdict(int))
        field.count_vocab_items(namespace_token_counts)

        assert namespace_token_counts["words"]["This"] == 1
        assert namespace_token_counts["words"]["is"] == 1
        assert namespace_token_counts["words"]["a"] == 1
        assert namespace_token_counts["words"]["sentence"] == 1
        assert namespace_token_counts["words"]["."] == 1
        assert list(namespace_token_counts.keys()) == ["words"]

        field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]],
                          token_indexers={"characters": TokenCharactersIndexer("characters")})
        namespace_token_counts = defaultdict(lambda: defaultdict(int))
        field.count_vocab_items(namespace_token_counts)

        assert namespace_token_counts["characters"]["T"] == 1
        assert namespace_token_counts["characters"]["h"] == 1
        assert namespace_token_counts["characters"]["i"] == 2
        assert namespace_token_counts["characters"]["s"] == 3
        assert namespace_token_counts["characters"]["a"] == 1
        assert namespace_token_counts["characters"]["e"] == 3
        assert namespace_token_counts["characters"]["n"] == 2
        assert namespace_token_counts["characters"]["t"] == 1
        assert namespace_token_counts["characters"]["c"] == 1
        assert namespace_token_counts["characters"]["."] == 1
        assert list(namespace_token_counts.keys()) == ["characters"]

        field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]],
                          token_indexers={"words": SingleIdTokenIndexer("words"),
                                          "characters": TokenCharactersIndexer("characters")})
        namespace_token_counts = defaultdict(lambda: defaultdict(int))
        field.count_vocab_items(namespace_token_counts)
        assert namespace_token_counts["characters"]["T"] == 1
        assert namespace_token_counts["characters"]["h"] == 1
        assert namespace_token_counts["characters"]["i"] == 2
        assert namespace_token_counts["characters"]["s"] == 3
        assert namespace_token_counts["characters"]["a"] == 1
        assert namespace_token_counts["characters"]["e"] == 3
        assert namespace_token_counts["characters"]["n"] == 2
        assert namespace_token_counts["characters"]["t"] == 1
        assert namespace_token_counts["characters"]["c"] == 1
        assert namespace_token_counts["characters"]["."] == 1
        assert namespace_token_counts["words"]["This"] == 1
        assert namespace_token_counts["words"]["is"] == 1
        assert namespace_token_counts["words"]["a"] == 1
        assert namespace_token_counts["words"]["sentence"] == 1
        assert namespace_token_counts["words"]["."] == 1
        assert set(namespace_token_counts.keys()) == {"words", "characters"}
Example #30
 def test_printing_doesnt_crash(self):
     field = TextField([Token(t) for t in ["A", "sentence"]],
                       {"words": SingleIdTokenIndexer(namespace="words")})
     print(field)