def test_initializes_from_legacy_word_tokenizer_params(self):
    """Legacy ``word``-tokenizer configs should build the modern tokenizer classes."""
    legacy = Params({
        "type": "word",
        "word_splitter": {"type": "spacy", "pos_tags": True},
        "start_tokens": ["<s>"],
        "end_tokens": ["</s>"],
    })
    built = Tokenizer.from_params(legacy)
    assert isinstance(built, SpacyTokenizer)
    assert built._start_tokens == legacy["start_tokens"]
    assert built._end_tokens == legacy["end_tokens"]
    # pos_tags=True should have enabled the POS tagger pipe in spaCy.
    assert "tagger" in built.spacy.pipe_names

    # Splitter params without an inner "type" key.
    built = Tokenizer.from_params(Params({"type": "word", "word_splitter": {"pos_tags": True}}))
    assert isinstance(built, SpacyTokenizer)

    # Splitter given as a bare string.
    built = Tokenizer.from_params(Params({"type": "word", "word_splitter": "just_spaces"}))
    assert isinstance(built, WhitespaceTokenizer)

    # No legacy "type": "word" key at all.
    built = Tokenizer.from_params(Params({"word_splitter": "spacy"}))
    assert isinstance(built, SpacyTokenizer)
def test_raises_exception_for_invalid_legacy_params(self):
    """Legacy keys with no modern equivalent must raise ``ConfigurationError``."""
    for bad_key, bad_value in (("word_stemmer", "porter"), ("word_filter", "regex")):
        params = Params({"type": "word", bad_key: bad_value})
        with self.assertRaises(ConfigurationError):
            Tokenizer.from_params(params)
def from_params(cls, params: Params) -> 'LanguageModelingReader':
    """Construct a ``LanguageModelingReader`` from a ``Params`` object."""
    tokens_per_instance = params.pop_int('tokens_per_instance', None)
    tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
    indexers = TokenIndexer.dict_from_params(params.pop('token_indexers', {}))
    # Fail loudly on any unrecognized keys so config typos do not pass silently.
    params.assert_empty(cls.__name__)
    return LanguageModelingReader(tokens_per_instance=tokens_per_instance,
                                  tokenizer=tokenizer,
                                  token_indexers=indexers)
def from_params(cls, params: Params) -> 'LanguageModelingReader':
    """Construct a ``LanguageModelingReader``, including the ``lazy`` flag, from params."""
    tokens_per_instance = params.pop_int('tokens_per_instance', None)
    tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
    indexers = TokenIndexer.dict_from_params(params.pop('token_indexers', {}))
    lazy = params.pop('lazy', False)
    # Fail loudly on any unrecognized keys so config typos do not pass silently.
    params.assert_empty(cls.__name__)
    return LanguageModelingReader(tokens_per_instance=tokens_per_instance,
                                  tokenizer=tokenizer,
                                  token_indexers=indexers,
                                  lazy=lazy)
def from_params(cls, params: Params) -> 'SquadSentenceSelectionReader':
    """Construct a ``SquadSentenceSelectionReader`` from a ``Params`` object."""
    selection = params.pop('negative_sentence_selection', 'paragraph')
    tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
    indexers = TokenIndexer.dict_from_params(params.pop('token_indexers', {}))
    # Fail loudly on any unrecognized keys so config typos do not pass silently.
    params.assert_empty(cls.__name__)
    return SquadSentenceSelectionReader(negative_sentence_selection=selection,
                                        tokenizer=tokenizer,
                                        token_indexers=indexers)
def from_params(cls, params: Params) -> 'SnliReader':
    """Construct an ``SnliReader`` from a ``Params`` object."""
    tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
    indexer_params = params.pop('token_indexers', Params({}))
    token_indexers = {name: TokenIndexer.from_params(subparams)
                      for name, subparams in indexer_params.items()}
    # The class carries its own defaults, so an empty mapping must become None.
    if not token_indexers:
        token_indexers = None
    params.assert_empty(cls.__name__)
    return SnliReader(tokenizer=tokenizer, token_indexers=token_indexers)
def from_params(cls, params: Params) -> 'SquadReader':
    """
    Parameters
    ----------
    tokenizer : ``Params``, optional (default=``{}``)
    token_indexers : ``Params``, optional (default=``{}``)
    """
    tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
    indexer_params = params.pop('token_indexers', {})
    token_indexers = {name: TokenIndexer.from_params(subparams)
                      for name, subparams in indexer_params.items()}
    # The class carries its own defaults, so an empty mapping must become None.
    if not token_indexers:
        token_indexers = None
    params.assert_empty(cls.__name__)
    return cls(tokenizer=tokenizer, token_indexers=token_indexers)
def _char_span_to_token_span(sentence: str,
                             tokenized_sentence: List[str],
                             span: Tuple[int, int],
                             tokenizer: Tokenizer,
                             slack: int = 3) -> Tuple[int, int]:
    """
    Converts a character span from a sentence into the corresponding token span in the
    tokenized version of the sentence.  If you pass in a character span that does not
    correspond to complete tokens in the tokenized version, we'll do our best, but the
    behavior is officially undefined.

    We first estimate the starting token by counting characters (plus one separator per
    token) up to the span's start offset, then check up to ``slack`` tokens on either
    side of that estimate for an exact token-sequence match.

    The returned ``(begin, end)`` indices are `inclusive` for ``begin``, and `exclusive`
    for ``end``.  So, for example, ``(2, 2)`` is an empty span, ``(2, 3)`` is the
    one-word span beginning at token index 2, and so on.
    """
    # Tokenize just the span text so we can compare token sequences.
    tokenized_span = tokenizer.tokenize(sentence[span[0]:span[1]])
    # Estimate the start token index, assuming one separating character per token.
    chars_seen = 0
    start = 0
    while start < len(tokenized_sentence) and chars_seen < span[0]:
        chars_seen += len(tokenized_sentence[start]) + 1
        start += 1
    # Is the estimate already an exact match?
    if _spans_match(tokenized_sentence, tokenized_span, start):
        return start, start + len(tokenized_span)
    # Search outward from the estimate, up to `slack` tokens in each direction
    # (forward checked before backward at each distance, matching the estimate bias).
    for offset in range(1, slack + 1):
        for candidate in (start + offset, start - offset):
            if _spans_match(tokenized_sentence, tokenized_span, candidate):
                return candidate, candidate + len(tokenized_span)
    # No match anywhere nearby; fall back to the initial estimate.
    return start, start + len(tokenized_span)
def from_params(cls, params: Params) -> 'LanguageModelingReader':
    """
    Construct a ``LanguageModelingReader`` from a ``Params`` object.

    Parameters
    ----------
    tokens_per_instance : ``int``, optional (default=``None``)
    tokenizer : ``Params``, optional (default=``{}``)
    token_indexers : ``Dict[str, Params]``, optional
        A mapping from indexer name to the params for that ``TokenIndexer``.
        If empty, ``None`` is passed through so the class-level defaults apply.
    """
    tokens_per_instance = params.pop('tokens_per_instance', None)
    tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
    token_indexers = {}
    token_indexer_params = params.pop('token_indexers', Params({}))
    for name, indexer_params in token_indexer_params.items():
        token_indexers[name] = TokenIndexer.from_params(indexer_params)
    # The default parameters are contained within the class,
    # so if no parameters are given we must pass None.
    if token_indexers == {}:
        token_indexers = None
    # Fail loudly on any unrecognized keys so config typos do not pass silently.
    params.assert_empty(cls.__name__)
    return LanguageModelingReader(tokens_per_instance=tokens_per_instance,
                                  tokenizer=tokenizer,
                                  token_indexers=token_indexers)
def from_params(cls, params: Params) -> 'SquadSentenceSelectionReader':
    """
    Construct a ``SquadSentenceSelectionReader`` from a ``Params`` object.

    Parameters
    ----------
    negative_sentence_selection : ``str``, optional (default=``"paragraph"``)
    tokenizer : ``Params``, optional
    token_indexers : ``Dict[str, Params]``, optional
        A mapping from indexer name to the params for that ``TokenIndexer``.
        If empty, ``None`` is passed through so the class-level defaults apply.
    """
    negative_sentence_selection = params.pop('negative_sentence_selection', 'paragraph')
    tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
    token_indexers = {}
    token_indexer_params = params.pop('token_indexers', Params({}))
    for name, indexer_params in token_indexer_params.items():
        token_indexers[name] = TokenIndexer.from_params(indexer_params)
    # The default parameters are contained within the class,
    # so if no parameters are given we must pass None.
    if token_indexers == {}:
        token_indexers = None
    # Fail loudly on any unrecognized keys so config typos do not pass silently.
    params.assert_empty(cls.__name__)
    return SquadSentenceSelectionReader(
        negative_sentence_selection=negative_sentence_selection,
        tokenizer=tokenizer,
        token_indexers=token_indexers)
def test_registry_has_builtin_tokenizers(self):
    """The registry should resolve these names to the built-in tokenizer classes."""
    expected = {'word': 'WordTokenizer', 'character': 'CharacterTokenizer'}
    for registered_name, class_name in expected.items():
        assert Tokenizer.by_name(registered_name).__name__ == class_name
        return [self.tokenize(text) for text in texts]

    #overrides
    def tokenize(self, text):
        # Split ``text`` into one Token per character (or per byte when
        # ``_byte_encoding`` is set), then wrap with start/end tokens.
        if self._lowercase_characters:
            text = text.lower()
        if self._byte_encoding is not None:
            # We add 1 here so that we can still use 0 for masking, no matter what bytes we get out
            # of this.
            tokens = [
                Token(text_id=c + 1) for c in text.encode(self._byte_encoding)
            ]
        else:
            tokens = [Token(t) for t in list(text)]
        for start_token in self._start_tokens:
            # Integer start tokens are treated as pre-computed ids; strings as text.
            if isinstance(start_token, int):
                token = Token(text_id=start_token, idx=0)
            else:
                token = Token(text=start_token, idx=0)
            tokens.insert(0, token)
        for end_token in self._end_tokens:
            # Same int-vs-string handling as for start tokens.
            if isinstance(end_token, int):
                token = Token(text_id=end_token, idx=0)
            else:
                token = Token(text=end_token, idx=0)
            tokens.append(token)
        return tokens


# Register under "character" (py2-backport style: decorator applied explicitly).
CharacterTokenizer = Tokenizer.register(u"character")(CharacterTokenizer)
def __init__(self,
             knowledge_graph: KnowledgeGraph,
             utterance_tokens: List[Token],
             token_indexers: Dict[str, TokenIndexer],
             tokenizer: Tokenizer = None,
             feature_extractors: List[str] = None,
             entity_tokens: List[List[Token]] = None,
             linking_features: List[List[List[float]]] = None,
             include_in_vocab: bool = True,
             max_table_tokens: int = None) -> None:
    """
    Stores a knowledge graph alongside the utterance tokens, tokenizes the
    graph's entity texts (unless pre-tokenized ``entity_tokens`` are given),
    resolves the named feature extractors to bound methods, and computes
    entity-linking features (unless pre-computed ``linking_features`` are given).
    """
    self.knowledge_graph = knowledge_graph
    if not entity_tokens:
        entity_texts = [
            knowledge_graph.entity_text[entity].lower()
            for entity in knowledge_graph.entities
        ]
        # TODO(mattg): Because we do tagging on each of these entities in addition to just
        # tokenizations, this is quite slow, and about half of our data processing time just
        # goes to this (~15 minutes when there are 7k instances).  The reason we do tagging is
        # so that we can add lemma features.  If we can remove the need for lemma / other
        # hand-written features, like with a CNN, we can cut down our data processing time by a
        # factor of 2.
        self.entity_texts = tokenizer.batch_tokenize(entity_texts)
    else:
        self.entity_texts = entity_tokens
    self.utterance_tokens = utterance_tokens
    self._token_indexers: Dict[str, TokenIndexer] = token_indexers
    self._include_in_vocab = include_in_vocab
    # Filled in lazily when the field is indexed.
    self._indexed_entity_texts: Dict[str, TokenList] = None
    self._max_table_tokens = max_table_tokens
    # Default to the full set of linking feature extractors when none are named.
    feature_extractors = feature_extractors if feature_extractors is not None else [
        'number_token_match',
        'exact_token_match',
        'contains_exact_token_match',
        'lemma_match',
        'contains_lemma_match',
        'edit_distance',
        'related_column',
        'related_column_lemma',
        'span_overlap_fraction',
        'span_lemma_overlap_fraction',
    ]
    self._feature_extractors: List[Callable[
        [str, List[Token], Token, int, List[Token]], float]] = []
    for feature_extractor_name in feature_extractors:
        # Each name must correspond to a method ``_<name>`` on this class.
        extractor = getattr(self, '_' + feature_extractor_name, None)
        if not extractor:
            raise ConfigurationError(
                f"Invalid feature extractor name: {feature_extractor_name}"
            )
        self._feature_extractors.append(extractor)
    if not linking_features:
        # For quicker lookups in our feature functions, we'll additionally store some
        # dictionaries that map entity strings to useful information about the entity.
        self._entity_text_map: Dict[str, List[Token]] = {}
        for entity, entity_text in zip(knowledge_graph.entities, self.entity_texts):
            self._entity_text_map[entity] = entity_text
        self._entity_text_exact_text: Dict[str, Set[str]] = {}
        for entity, entity_text in zip(knowledge_graph.entities, self.entity_texts):
            self._entity_text_exact_text[entity] = set(
                e.text for e in entity_text)
        self._entity_text_lemmas: Dict[str, Set[str]] = {}
        for entity, entity_text in zip(knowledge_graph.entities, self.entity_texts):
            self._entity_text_lemmas[entity] = set(e.lemma_ for e in entity_text)
        self.linking_features = self._compute_linking_features()
    else:
        self.linking_features = linking_features
def from_params(cls, params: Params) -> 'SnliReader':
    """Construct an ``SnliReader`` from a ``Params`` object."""
    tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
    indexers = TokenIndexer.dict_from_params(params.pop('token_indexers', {}))
    # Fail loudly on any unrecognized keys so config typos do not pass silently.
    params.assert_empty(cls.__name__)
    return SnliReader(tokenizer=tokenizer, token_indexers=indexers)
def test_registry_has_builtin_tokenizers(self):
    """The registry should resolve these names to the built-in tokenizer classes."""
    expected = {"spacy": "SpacyTokenizer", "character": "CharacterTokenizer"}
    for registered_name, class_name in expected.items():
        assert Tokenizer.by_name(registered_name).__name__ == class_name
def __init__(self,
             knowledge_graph: KnowledgeGraph,
             utterance_tokens: List[Token],
             token_indexers: Dict[str, TokenIndexer],
             tokenizer: Tokenizer = None,
             feature_extractors: List[str] = None,
             entity_tokens: List[List[Token]] = None,
             linking_features: List[List[List[float]]] = None,
             include_in_vocab: bool = True,
             max_table_tokens: int = None) -> None:
    """
    Stores a knowledge graph alongside the utterance tokens, tokenizes the
    graph's entity texts (unless pre-tokenized ``entity_tokens`` are given),
    resolves the named feature extractors to bound methods, and computes
    entity-linking features (unless pre-computed ``linking_features`` are given).
    """
    self.knowledge_graph = knowledge_graph
    if not entity_tokens:
        entity_texts = [knowledge_graph.entity_text[entity].lower()
                        for entity in knowledge_graph.entities]
        # TODO(mattg): Because we do tagging on each of these entities in addition to just
        # tokenizations, this is quite slow, and about half of our data processing time just
        # goes to this (~15 minutes when there are 7k instances).  The reason we do tagging is
        # so that we can add lemma features.  If we can remove the need for lemma / other
        # hand-written features, like with a CNN, we can cut down our data processing time by a
        # factor of 2.
        self.entity_texts = tokenizer.batch_tokenize(entity_texts)
    else:
        self.entity_texts = entity_tokens
    self.utterance_tokens = utterance_tokens
    self._token_indexers: Dict[str, TokenIndexer] = token_indexers
    self._include_in_vocab = include_in_vocab
    # Filled in lazily when the field is indexed.
    self._indexed_entity_texts: Dict[str, TokenList] = None
    self._max_table_tokens = max_table_tokens
    # Default to the full set of linking feature extractors when none are named.
    feature_extractors = feature_extractors if feature_extractors is not None else [
        'number_token_match',
        'exact_token_match',
        'contains_exact_token_match',
        'lemma_match',
        'contains_lemma_match',
        'edit_distance',
        'related_column',
        'related_column_lemma',
        'span_overlap_fraction',
        'span_lemma_overlap_fraction',
    ]
    self._feature_extractors: List[Callable[[str, List[Token], Token, int, List[Token]], float]] = []
    for feature_extractor_name in feature_extractors:
        # Each name must correspond to a method ``_<name>`` on this class.
        extractor = getattr(self, '_' + feature_extractor_name, None)
        if not extractor:
            raise ConfigurationError(f"Invalid feature extractor name: {feature_extractor_name}")
        self._feature_extractors.append(extractor)
    if not linking_features:
        # For quicker lookups in our feature functions, we'll additionally store some
        # dictionaries that map entity strings to useful information about the entity.
        self._entity_text_map: Dict[str, List[Token]] = {}
        for entity, entity_text in zip(knowledge_graph.entities, self.entity_texts):
            self._entity_text_map[entity] = entity_text
        self._entity_text_exact_text: Dict[str, Set[str]] = {}
        for entity, entity_text in zip(knowledge_graph.entities, self.entity_texts):
            self._entity_text_exact_text[entity] = set(e.text for e in entity_text)
        self._entity_text_lemmas: Dict[str, Set[str]] = {}
        for entity, entity_text in zip(knowledge_graph.entities, self.entity_texts):
            self._entity_text_lemmas[entity] = set(e.lemma_ for e in entity_text)
        self.linking_features = self._compute_linking_features()
    else:
        self.linking_features = linking_features
def test_registry_has_builtin_tokenizers(self):
    """The registry should resolve these names to the built-in tokenizer classes."""
    expected = {"word": "WordTokenizer", "character": "CharacterTokenizer"}
    for registered_name, class_name in expected.items():
        assert Tokenizer.by_name(registered_name).__name__ == class_name
#overrides def tokenize(self, text): u""" Does whatever processing is required to convert a string of text into a sequence of tokens. At a minimum, this uses a ``WordSplitter`` to split words into text. It may also do stemming or stopword removal, depending on the parameters given to the constructor. """ words = self._word_splitter.split_words(text) return self._filter_and_stem(words) #overrides def batch_tokenize(self, texts): batched_words = self._word_splitter.batch_split_words(texts) return [self._filter_and_stem(words) for words in batched_words] def _filter_and_stem(self, words): filtered_words = self._word_filter.filter_words(words) stemmed_words = [ self._word_stemmer.stem_word(word) for word in filtered_words ] for start_token in self._start_tokens: stemmed_words.insert(0, Token(start_token, 0)) for end_token in self._end_tokens: stemmed_words.append(Token(end_token, -1)) return stemmed_words WordTokenizer = Tokenizer.register(u"word")(WordTokenizer)