def test_initializes_from_legacy_word_tokenizer_params(self):
    params = Params({
        "type": "word",
        "word_splitter": {
            "type": "spacy",
            "pos_tags": True
        },
        "start_tokens": ["<s>"],
        "end_tokens": ["</s>"],
    })
    tokenizer = Tokenizer.from_params(params)
    assert isinstance(tokenizer, SpacyTokenizer)
    # Compare against literals: `from_params` pops keys out of `params`,
    # so the original entries are no longer readable afterwards.
    assert tokenizer._start_tokens == ["<s>"]
    assert tokenizer._end_tokens == ["</s>"]
    assert "tagger" in tokenizer.spacy.pipe_names

    # "word_splitter" given without a "type" key
    params = Params({"type": "word", "word_splitter": {"pos_tags": True}})
    tokenizer = Tokenizer.from_params(params)
    assert isinstance(tokenizer, SpacyTokenizer)

    # "word_splitter" given as a plain string
    params = Params({"type": "word", "word_splitter": "just_spaces"})
    tokenizer = Tokenizer.from_params(params)
    assert isinstance(tokenizer, WhitespaceTokenizer)

    # Legacy "type": "word" key omitted entirely
    params = Params({"word_splitter": "spacy"})
    tokenizer = Tokenizer.from_params(params)
    assert isinstance(tokenizer, SpacyTokenizer)

def test_raises_exception_for_invalid_legacy_params(self):
    params = Params({"type": "word", "word_stemmer": "porter"})
    with self.assertRaises(ConfigurationError):
        Tokenizer.from_params(params)
    params = Params({"type": "word", "word_filter": "regex"})
    with self.assertRaises(ConfigurationError):
        Tokenizer.from_params(params)
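
For reference, a minimal self-contained sketch of the modern, non-legacy configurations these legacy tests map onto (assuming the standard AllenNLP 1.x registered names "spacy" and "whitespace"):

from allennlp.common import Params
from allennlp.data.tokenizers import Tokenizer

# The legacy {"type": "word", "word_splitter": ...} configs above are
# converted by Tokenizer.from_params into these registered types.
spacy_tokenizer = Tokenizer.from_params(Params({"type": "spacy", "pos_tags": True}))
whitespace_tokenizer = Tokenizer.from_params(Params({"type": "whitespace"}))
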
Example #3
@classmethod
def from_params(cls, params: Params) -> 'LanguageModelingReader':
    tokens_per_instance = params.pop_int('tokens_per_instance', None)
    tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
    token_indexers = TokenIndexer.dict_from_params(params.pop('token_indexers', {}))
    params.assert_empty(cls.__name__)
    return LanguageModelingReader(tokens_per_instance=tokens_per_instance,
                                  tokenizer=tokenizer,
                                  token_indexers=token_indexers)
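
A hypothetical Params blob this method would consume; the "single_id" indexer and the concrete values are illustrative, not taken from the source:

params = Params({
    'tokens_per_instance': 512,
    'tokenizer': {'type': 'word'},
    'token_indexers': {'tokens': {'type': 'single_id'}},
})
# Every key is popped during construction, so params.assert_empty(...) passes.
reader = LanguageModelingReader.from_params(params)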
Example #4
@classmethod
def from_params(cls, params: Params) -> 'LanguageModelingReader':
    tokens_per_instance = params.pop_int('tokens_per_instance', None)
    tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
    token_indexers = TokenIndexer.dict_from_params(params.pop('token_indexers', {}))
    lazy = params.pop('lazy', False)
    params.assert_empty(cls.__name__)
    return LanguageModelingReader(tokens_per_instance=tokens_per_instance,
                                  tokenizer=tokenizer,
                                  token_indexers=token_indexers,
                                  lazy=lazy)
Example #5
@classmethod
def from_params(cls, params: Params) -> 'SquadSentenceSelectionReader':
    negative_sentence_selection = params.pop('negative_sentence_selection', 'paragraph')
    tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
    token_indexers = TokenIndexer.dict_from_params(params.pop('token_indexers', {}))
    params.assert_empty(cls.__name__)
    return SquadSentenceSelectionReader(
        negative_sentence_selection=negative_sentence_selection,
        tokenizer=tokenizer,
        token_indexers=token_indexers)
Example #6
@classmethod
def from_params(cls, params: Params) -> 'SnliReader':
    tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
    token_indexers = {}
    token_indexer_params = params.pop('token_indexers', Params({}))
    for name, indexer_params in token_indexer_params.items():
        token_indexers[name] = TokenIndexer.from_params(indexer_params)
    # The default indexers live in the class itself, so if no parameters
    # are given we must pass None to let those defaults apply.
    if token_indexers == {}:
        token_indexers = None
    params.assert_empty(cls.__name__)
    return SnliReader(tokenizer=tokenizer, token_indexers=token_indexers)
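
The manual loop above (repeated in Examples #7-#9) does the same work as the TokenIndexer.dict_from_params helper used in Examples #3-#5. As a sketch, it collapses to a dict comprehension, with `or None` covering the empty-config case:

token_indexer_params = params.pop('token_indexers', Params({}))
token_indexers = {name: TokenIndexer.from_params(indexer_params)
                  for name, indexer_params in token_indexer_params.items()} or None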
Example #7
@classmethod
def from_params(cls, params: Params) -> 'SquadReader':
    """
    Parameters
    ----------
    tokenizer : ``Params``, optional (default=``{}``)
    token_indexers : ``Dict[str, Params]``, optional (default=``{}``)
    """
    tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
    token_indexers = {}
    token_indexer_params = params.pop('token_indexers', Params({}))
    for name, indexer_params in token_indexer_params.items():
        token_indexers[name] = TokenIndexer.from_params(indexer_params)
    # The default indexers live in the class itself, so if no parameters
    # are given we must pass None to let those defaults apply.
    if token_indexers == {}:
        token_indexers = None
    params.assert_empty(cls.__name__)
    return cls(tokenizer=tokenizer, token_indexers=token_indexers)
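
Because of the None fallback, an entirely empty config defers to the defaults defined on the class itself. Illustratively (assuming Params.pop wraps the {} default so the nested from_params calls still work, as in AllenNLP 0.x):

reader = SquadReader.from_params(Params({}))  # tokenizer and token_indexers use class defaults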
Example #8
@classmethod
def from_params(cls, params: Params) -> 'LanguageModelingReader':
    """
    Parameters
    ----------
    tokens_per_instance : ``int``, optional (default=``None``)
    tokenizer : ``Params``, optional
    token_indexers : ``Dict[str, Params]``, optional
    """
    tokens_per_instance = params.pop('tokens_per_instance', None)
    tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
    token_indexers = {}
    token_indexer_params = params.pop('token_indexers', Params({}))
    for name, indexer_params in token_indexer_params.items():
        token_indexers[name] = TokenIndexer.from_params(indexer_params)
    # The default indexers live in the class itself, so if no parameters
    # are given we must pass None to let those defaults apply.
    if token_indexers == {}:
        token_indexers = None
    params.assert_empty(cls.__name__)
    return LanguageModelingReader(tokens_per_instance=tokens_per_instance,
                                  tokenizer=tokenizer,
                                  token_indexers=token_indexers)
Example #9
@classmethod
def from_params(cls, params: Params) -> 'SquadSentenceSelectionReader':
    """
    Parameters
    ----------
    negative_sentence_selection : ``str``, optional (default=``"paragraph"``)
    tokenizer : ``Params``, optional
    token_indexers : ``Dict[str, Params]``, optional
    """
    negative_sentence_selection = params.pop('negative_sentence_selection', 'paragraph')
    tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
    token_indexers = {}
    token_indexer_params = params.pop('token_indexers', Params({}))
    for name, indexer_params in token_indexer_params.items():
        token_indexers[name] = TokenIndexer.from_params(indexer_params)
    # The default indexers live in the class itself, so if no parameters
    # are given we must pass None to let those defaults apply.
    if token_indexers == {}:
        token_indexers = None
    params.assert_empty(cls.__name__)
    return SquadSentenceSelectionReader(
        negative_sentence_selection=negative_sentence_selection,
        tokenizer=tokenizer,
        token_indexers=token_indexers)
Example #10
@classmethod
def from_params(cls, params: Params) -> 'SnliReader':
    tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
    token_indexers = TokenIndexer.dict_from_params(params.pop('token_indexers', {}))
    params.assert_empty(cls.__name__)
    return SnliReader(tokenizer=tokenizer,
                      token_indexers=token_indexers)