def test_initializes_from_legacy_word_tokenizer_params(self):
    """Legacy ``"word"`` tokenizer configs should map onto the new tokenizer classes."""
    legacy_params = Params({
        "type": "word",
        "word_splitter": {"type": "spacy", "pos_tags": True},
        "start_tokens": ["<s>"],
        "end_tokens": ["</s>"],
    })
    built = Tokenizer.from_params(legacy_params)
    assert isinstance(built, SpacyTokenizer)
    assert built._start_tokens == legacy_params["start_tokens"]
    assert built._end_tokens == legacy_params["end_tokens"]
    assert "tagger" in built.spacy.pipe_names

    # Same config, but the word_splitter has no explicit "type" key.
    built = Tokenizer.from_params(Params({"type": "word", "word_splitter": {"pos_tags": True}}))
    assert isinstance(built, SpacyTokenizer)

    # The word_splitter may also be given as a bare string.
    built = Tokenizer.from_params(Params({"type": "word", "word_splitter": "just_spaces"}))
    assert isinstance(built, WhitespaceTokenizer)

    # The legacy top-level "word" type itself may be omitted.
    built = Tokenizer.from_params(Params({"word_splitter": "spacy"}))
    assert isinstance(built, SpacyTokenizer)
def test_raises_exception_for_invalid_legacy_params(self):
    """Unsupported legacy keys must raise ``ConfigurationError``."""
    # Neither word stemming nor word filtering survived the legacy API.
    for bad_config in ({"type": "word", "word_stemmer": "porter"},
                       {"type": "word", "word_filter": "regex"}):
        with self.assertRaises(ConfigurationError):
            Tokenizer.from_params(Params(bad_config))
def from_params(cls, params: Params) -> 'LanguageModelingReader':
    """Build a ``LanguageModelingReader`` from a ``Params`` configuration object."""
    # The pops below read independent keys, so their order does not matter.
    tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
    token_indexers = TokenIndexer.dict_from_params(params.pop('token_indexers', {}))
    tokens_per_instance = params.pop_int('tokens_per_instance', None)
    # Fail loudly on any configuration keys we did not consume.
    params.assert_empty(cls.__name__)
    return LanguageModelingReader(tokens_per_instance=tokens_per_instance,
                                  tokenizer=tokenizer,
                                  token_indexers=token_indexers)
def from_params(cls, params: Params) -> 'LanguageModelingReader':
    """Build a ``LanguageModelingReader`` (optionally lazy) from a ``Params`` object."""
    # Each pop consumes a distinct key; order is irrelevant.
    tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
    token_indexers = TokenIndexer.dict_from_params(params.pop('token_indexers', {}))
    tokens_per_instance = params.pop_int('tokens_per_instance', None)
    lazy = params.pop('lazy', False)
    # Any leftover keys indicate a configuration mistake.
    params.assert_empty(cls.__name__)
    return LanguageModelingReader(tokens_per_instance=tokens_per_instance,
                                  tokenizer=tokenizer,
                                  token_indexers=token_indexers,
                                  lazy=lazy)
def from_params(cls, params: Params) -> 'SquadSentenceSelectionReader':
    """Build a ``SquadSentenceSelectionReader`` from a ``Params`` object."""
    tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
    token_indexers = TokenIndexer.dict_from_params(params.pop('token_indexers', {}))
    negative_sentence_selection = params.pop('negative_sentence_selection', 'paragraph')
    # Reject any configuration keys that were not consumed above.
    params.assert_empty(cls.__name__)
    return SquadSentenceSelectionReader(
            negative_sentence_selection=negative_sentence_selection,
            tokenizer=tokenizer,
            token_indexers=token_indexers)
def from_params(cls, params: Params) -> 'SnliReader':
    """
    Build an ``SnliReader`` from a ``Params`` configuration object.

    Parameters
    ----------
    tokenizer : ``Params``, optional (default=``{}``)
    token_indexers : ``Params``, optional (default=``Params({})``)
    """
    tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
    token_indexer_params = params.pop('token_indexers', Params({}))
    # Build the indexer dict in one pass.  An empty result becomes None so
    # that the defaults contained within the class are used instead.
    token_indexers = {name: TokenIndexer.from_params(indexer_params)
                      for name, indexer_params in token_indexer_params.items()} or None
    params.assert_empty(cls.__name__)
    return SnliReader(tokenizer=tokenizer, token_indexers=token_indexers)
def from_params(cls, params: Params) -> 'SquadReader':
    """
    Build a ``SquadReader`` from a ``Params`` configuration object.

    Parameters
    ----------
    tokenizer : ``Params``, optional (default=``{}``)
    token_indexers: ``Params``, optional (default=``{}``)
    """
    tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
    token_indexer_params = params.pop('token_indexers', {})
    # Build the indexer dict in one pass.  The default parameters are
    # contained within the class, so an empty config becomes None.
    token_indexers = {name: TokenIndexer.from_params(indexer_params)
                      for name, indexer_params in token_indexer_params.items()} or None
    params.assert_empty(cls.__name__)
    return cls(tokenizer=tokenizer, token_indexers=token_indexers)
def from_params(cls, params: Params) -> 'LanguageModelingReader':
    """
    Build a ``LanguageModelingReader`` from a ``Params`` configuration object.

    Parameters
    ----------
    tokens_per_instance : ``int``, optional (default=``None``)
    tokenizer : ``Params``, optional
    token_indexers: ``Params``, optional (default=``Params({})``)
    """
    # Use pop_int (as the sibling from_params variants do) so a string
    # value from the config is coerced to int; None stays None.
    tokens_per_instance = params.pop_int('tokens_per_instance', None)
    tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
    token_indexer_params = params.pop('token_indexers', Params({}))
    # Build the indexer dict in one pass.  The default parameters are
    # contained within the class, so an empty config becomes None.
    token_indexers = {name: TokenIndexer.from_params(indexer_params)
                      for name, indexer_params in token_indexer_params.items()} or None
    params.assert_empty(cls.__name__)
    return LanguageModelingReader(tokens_per_instance=tokens_per_instance,
                                  tokenizer=tokenizer,
                                  token_indexers=token_indexers)
def from_params(cls, params: Params) -> 'SquadSentenceSelectionReader':
    """
    Build a ``SquadSentenceSelectionReader`` from a ``Params`` object.

    Parameters
    ----------
    negative_sentence_selection : ``str``, optional (default="paragraph")
    tokenizer : ``Params``, optional
    token_indexers: ``Params``, optional (default=``Params({})``)
    """
    negative_sentence_selection = params.pop('negative_sentence_selection', 'paragraph')
    tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
    token_indexer_params = params.pop('token_indexers', Params({}))
    # Build the indexer dict in one pass.  The default parameters are
    # contained within the class, so an empty config becomes None.
    token_indexers = {name: TokenIndexer.from_params(indexer_params)
                      for name, indexer_params in token_indexer_params.items()} or None
    params.assert_empty(cls.__name__)
    return SquadSentenceSelectionReader(
            negative_sentence_selection=negative_sentence_selection,
            tokenizer=tokenizer,
            token_indexers=token_indexers)
def from_params(cls, params: Params) -> 'SnliReader':
    """Build an ``SnliReader`` from a ``Params`` configuration object."""
    # The two pops consume distinct keys, so order is irrelevant.
    token_indexers = TokenIndexer.dict_from_params(params.pop('token_indexers', {}))
    tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
    # Fail loudly on any configuration keys we did not consume.
    params.assert_empty(cls.__name__)
    return SnliReader(tokenizer=tokenizer, token_indexers=token_indexers)