Example #1
    def test_initializes_from_legacy_word_tokenizer_params(self):
        params = Params({
            "type": "word",
            "word_splitter": {
                "type": "spacy",
                "pos_tags": True
            },
            "start_tokens": ["<s>"],
            "end_tokens": ["</s>"],
        })
        tokenizer = Tokenizer.from_params(params)
        assert isinstance(tokenizer, SpacyTokenizer)
        assert tokenizer._start_tokens == ["<s>"]
        assert tokenizer._end_tokens == ["</s>"]
        assert "tagger" in tokenizer.spacy.pipe_names

        # "word_splitter" given without an inner "type" key
        params = Params({"type": "word", "word_splitter": {"pos_tags": True}})
        tokenizer = Tokenizer.from_params(params)
        assert isinstance(tokenizer, SpacyTokenizer)

        # Splitter is a string
        params = Params({"type": "word", "word_splitter": "just_spaces"})
        tokenizer = Tokenizer.from_params(params)
        assert isinstance(tokenizer, WhitespaceTokenizer)

        # Remove legacy tokenizer type
        params = Params({"word_splitter": "spacy"})
        tokenizer = Tokenizer.from_params(params)
        assert isinstance(tokenizer, SpacyTokenizer)
Example #2
 def test_raises_exception_for_invalid_legacy_params(self):
     params = Params({"type": "word", "word_stemmer": "porter"})
     with self.assertRaises(ConfigurationError):
         Tokenizer.from_params(params)
     params = Params({"type": "word", "word_filter": "regex"})
     with self.assertRaises(ConfigurationError):
         Tokenizer.from_params(params)
Example #3
 def from_params(cls, params: Params) -> 'LanguageModelingReader':
     tokens_per_instance = params.pop_int('tokens_per_instance', None)
     tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
     token_indexers = TokenIndexer.dict_from_params(
         params.pop('token_indexers', {}))
     params.assert_empty(cls.__name__)
     return LanguageModelingReader(tokens_per_instance=tokens_per_instance,
                                   tokenizer=tokenizer,
                                   token_indexers=token_indexers)
 def from_params(cls, params: Params) -> 'LanguageModelingReader':
     tokens_per_instance = params.pop_int('tokens_per_instance', None)
     tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
     token_indexers = TokenIndexer.dict_from_params(params.pop('token_indexers', {}))
     lazy = params.pop('lazy', False)
     params.assert_empty(cls.__name__)
     return LanguageModelingReader(tokens_per_instance=tokens_per_instance,
                                   tokenizer=tokenizer,
                                   token_indexers=token_indexers,
                                   lazy=lazy)
Example #5
 def from_params(cls, params: Params) -> 'SquadSentenceSelectionReader':
     negative_sentence_selection = params.pop('negative_sentence_selection',
                                              'paragraph')
     tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
     token_indexers = TokenIndexer.dict_from_params(
         params.pop('token_indexers', {}))
     params.assert_empty(cls.__name__)
     return SquadSentenceSelectionReader(
         negative_sentence_selection=negative_sentence_selection,
         tokenizer=tokenizer,
         token_indexers=token_indexers)
Example #6
 def from_params(cls, params: Params) -> 'SnliReader':
     tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
     token_indexers = {}
     token_indexer_params = params.pop('token_indexers', Params({}))
     for name, indexer_params in token_indexer_params.items():
         token_indexers[name] = TokenIndexer.from_params(indexer_params)
     # The default parameters are contained within the class,
     # so if no parameters are given we must pass None.
     if token_indexers == {}:
         token_indexers = None
     params.assert_empty(cls.__name__)
     return SnliReader(tokenizer=tokenizer, token_indexers=token_indexers)
Example #7
 def from_params(cls, params: Params) -> 'SquadReader':
     """
     Parameters
     ----------
     tokenizer : ``Params``, optional (default=``{}``)
      token_indexers : ``Params``, optional (default=``{}``)
     """
     tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
     token_indexers = {}
     token_indexer_params = params.pop('token_indexers', {})
     for name, indexer_params in token_indexer_params.items():
         token_indexers[name] = TokenIndexer.from_params(indexer_params)
     # The default parameters are contained within the class, so if no parameters are given we
     # must pass None.
     if token_indexers == {}:
         token_indexers = None
     params.assert_empty(cls.__name__)
     return cls(tokenizer=tokenizer, token_indexers=token_indexers)
Example #8
def _char_span_to_token_span(sentence: str,
                             tokenized_sentence: List[str],
                             span: Tuple[int, int],
                             tokenizer: Tokenizer,
                             slack: int = 3) -> Tuple[int, int]:
    """
    Converts a character span from a sentence into the corresponding token span in the
    tokenized version of the sentence.  If you pass in a character span that does not
    correspond to complete tokens in the tokenized version, we'll do our best, but the behavior
    is officially undefined.

    The basic outline of this method is to find the token that starts the same number of
    characters into the sentence as the given character span.  We try to handle a bit of error
    in the tokenization by checking `slack` tokens in either direction from that initial
    estimate.

    The returned ``(begin, end)`` indices are `inclusive` for ``begin``, and `exclusive` for
    ``end``.  So, for example, ``(2, 2)`` is an empty span, ``(2, 3)`` is the one-word span
    beginning at token index 2, and so on.
    """
    # First we'll tokenize the span and the sentence, so we can count tokens and check for
    # matches.
    span_chars = sentence[span[0]:span[1]]
    tokenized_span = tokenizer.tokenize(span_chars)
    # Then we'll find what we think is the first token in the span
    chars_seen = 0
    index = 0
    while index < len(tokenized_sentence) and chars_seen < span[0]:
        chars_seen += len(tokenized_sentence[index]) + 1
        index += 1
    # index is now the span start index.  Is it a match?
    if _spans_match(tokenized_sentence, tokenized_span, index):
        return (index, index + len(tokenized_span))
    for i in range(1, slack + 1):
        if _spans_match(tokenized_sentence, tokenized_span, index + i):
            return (index + i, index + i + len(tokenized_span))
        if _spans_match(tokenized_sentence, tokenized_span, index - i):
            return (index - i, index - i + len(tokenized_span))
    # No match; we'll just return our best guess.
    return (index, index + len(tokenized_span))
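
As a rough, self-contained illustration of the matching strategy described in the docstring above (not the library's implementation: the whitespace split stands in for the ``tokenizer`` argument, and ``spans_match`` is a hypothetical stand-in for ``_spans_match``):

from typing import List, Tuple

def spans_match(sentence_tokens: List[str], span_tokens: List[str], index: int) -> bool:
    # Stand-in for _spans_match: does the window starting at `index` reproduce the span tokens?
    if index < 0:
        return False
    return sentence_tokens[index:index + len(span_tokens)] == span_tokens

def char_span_to_token_span(sentence: str, span: Tuple[int, int], slack: int = 3) -> Tuple[int, int]:
    sentence_tokens = sentence.split()               # whitespace stand-in for the Tokenizer
    span_tokens = sentence[span[0]:span[1]].split()
    # Walk tokens until roughly span[0] characters are consumed (token length + one space each).
    chars_seen, index = 0, 0
    while index < len(sentence_tokens) and chars_seen < span[0]:
        chars_seen += len(sentence_tokens[index]) + 1
        index += 1
    # Try the initial guess, then up to `slack` tokens in either direction.
    for offset in [0] + [o for i in range(1, slack + 1) for o in (i, -i)]:
        if spans_match(sentence_tokens, span_tokens, index + offset):
            return (index + offset, index + offset + len(span_tokens))
    return (index, index + len(span_tokens))         # no match; best guess

# "fox" starts at character 16 and is token 3, so the (inclusive, exclusive) result is (3, 4).
print(char_span_to_token_span("the quick brown fox jumps", (16, 19)))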
Example #9
 def from_params(cls, params: Params) -> 'LanguageModelingReader':
     """
     Parameters
     ----------
     tokens_per_instance : ``int``, optional (default=``None``)
     tokenizer : ``Params``, optional
      token_indexers : ``Dict[str, Params]``, optional
     """
     tokens_per_instance = params.pop('tokens_per_instance', None)
     tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
     token_indexers = {}
     token_indexer_params = params.pop('token_indexers', Params({}))
     for name, indexer_params in token_indexer_params.items():
         token_indexers[name] = TokenIndexer.from_params(indexer_params)
     # The default parameters are contained within the class,
     # so if no parameters are given we must pass None.
     if token_indexers == {}:
         token_indexers = None
     params.assert_empty(cls.__name__)
     return LanguageModelingReader(tokens_per_instance=tokens_per_instance,
                                   tokenizer=tokenizer,
                                   token_indexers=token_indexers)
Example #10
 def from_params(cls, params: Params) -> 'SquadSentenceSelectionReader':
     """
     Parameters
     ----------
      negative_sentence_selection : ``str``, optional (default=``"paragraph"``)
      tokenizer : ``Params``, optional
      token_indexers : ``Dict[str, Params]``, optional
     """
     negative_sentence_selection = params.pop('negative_sentence_selection',
                                              'paragraph')
     tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
     token_indexers = {}
     token_indexer_params = params.pop('token_indexers', Params({}))
     for name, indexer_params in token_indexer_params.items():
         token_indexers[name] = TokenIndexer.from_params(indexer_params)
     # The default parameters are contained within the class,
     # so if no parameters are given we must pass None.
     if token_indexers == {}:
         token_indexers = None
     params.assert_empty(cls.__name__)
     return SquadSentenceSelectionReader(
         negative_sentence_selection=negative_sentence_selection,
         tokenizer=tokenizer,
         token_indexers=token_indexers)
 def test_registry_has_builtin_tokenizers(self):
     assert Tokenizer.by_name('word').__name__ == 'WordTokenizer'
     assert Tokenizer.by_name('character').__name__ == 'CharacterTokenizer'
Example #12
        return [self.tokenize(text) for text in texts]

    @overrides
    def tokenize(self, text):
        if self._lowercase_characters:
            text = text.lower()
        if self._byte_encoding is not None:
            # We add 1 here so that we can still use 0 for masking, no matter what bytes we get out
            # of this.
            tokens = [
                Token(text_id=c + 1) for c in text.encode(self._byte_encoding)
            ]
        else:
            tokens = [Token(t) for t in list(text)]
        for start_token in self._start_tokens:
            if isinstance(start_token, int):
                token = Token(text_id=start_token, idx=0)
            else:
                token = Token(text=start_token, idx=0)
            tokens.insert(0, token)
        for end_token in self._end_tokens:
            if isinstance(end_token, int):
                token = Token(text_id=end_token, idx=0)
            else:
                token = Token(text=end_token, idx=0)
            tokens.append(token)
        return tokens


CharacterTokenizer = Tokenizer.register(u"character")(CharacterTokenizer)
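
A quick standalone sketch of what the two branches of ``tokenize`` above produce (not the class itself); the ``+ 1`` shift is what keeps id 0 free for padding/masking:

text = "héllo"
byte_ids = [c + 1 for c in text.encode("utf-8")]   # byte values shifted by one
print(byte_ids)       # [105, 196, 170, 109, 109, 112] -- "é" becomes two byte ids, none of them 0
char_tokens = list(text)                            # the non-byte branch keeps unicode characters
print(char_tokens)    # ['h', 'é', 'l', 'l', 'o']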
Example #13
    def __init__(self,
                 knowledge_graph: KnowledgeGraph,
                 utterance_tokens: List[Token],
                 token_indexers: Dict[str, TokenIndexer],
                 tokenizer: Tokenizer = None,
                 feature_extractors: List[str] = None,
                 entity_tokens: List[List[Token]] = None,
                 linking_features: List[List[List[float]]] = None,
                 include_in_vocab: bool = True,
                 max_table_tokens: int = None) -> None:
        self.knowledge_graph = knowledge_graph
        if not entity_tokens:
            entity_texts = [
                knowledge_graph.entity_text[entity].lower()
                for entity in knowledge_graph.entities
            ]
            # TODO(mattg): Because we do tagging on each of these entities in addition to just
            # tokenizations, this is quite slow, and about half of our data processing time just
            # goes to this (~15 minutes when there are 7k instances).  The reason we do tagging is
            # so that we can add lemma features.  If we can remove the need for lemma / other
            # hand-written features, like with a CNN, we can cut down our data processing time by a
            # factor of 2.
            self.entity_texts = tokenizer.batch_tokenize(entity_texts)
        else:
            self.entity_texts = entity_tokens
        self.utterance_tokens = utterance_tokens
        self._token_indexers: Dict[str, TokenIndexer] = token_indexers
        self._include_in_vocab = include_in_vocab
        self._indexed_entity_texts: Dict[str, TokenList] = None
        self._max_table_tokens = max_table_tokens

        feature_extractors = feature_extractors if feature_extractors is not None else [
            'number_token_match',
            'exact_token_match',
            'contains_exact_token_match',
            'lemma_match',
            'contains_lemma_match',
            'edit_distance',
            'related_column',
            'related_column_lemma',
            'span_overlap_fraction',
            'span_lemma_overlap_fraction',
        ]
        self._feature_extractors: List[Callable[
            [str, List[Token], Token, int, List[Token]], float]] = []
        for feature_extractor_name in feature_extractors:
            extractor = getattr(self, '_' + feature_extractor_name, None)
            if not extractor:
                raise ConfigurationError(
                    f"Invalid feature extractor name: {feature_extractor_name}"
                )
            self._feature_extractors.append(extractor)

        if not linking_features:
            # For quicker lookups in our feature functions, we'll additionally store some
            # dictionaries that map entity strings to useful information about the entity.
            self._entity_text_map: Dict[str, List[Token]] = {}
            for entity, entity_text in zip(knowledge_graph.entities,
                                           self.entity_texts):
                self._entity_text_map[entity] = entity_text

            self._entity_text_exact_text: Dict[str, Set[str]] = {}
            for entity, entity_text in zip(knowledge_graph.entities,
                                           self.entity_texts):
                self._entity_text_exact_text[entity] = set(
                    e.text for e in entity_text)

            self._entity_text_lemmas: Dict[str, Set[str]] = {}
            for entity, entity_text in zip(knowledge_graph.entities,
                                           self.entity_texts):
                self._entity_text_lemmas[entity] = set(e.lemma_
                                                       for e in entity_text)
            self.linking_features = self._compute_linking_features()
        else:
            self.linking_features = linking_features
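
The getattr lookup of ``'_' + feature_extractor_name`` above is a small configuration-by-name pattern; here is a minimal standalone sketch of the same idea (hypothetical class with toy features, not the field's real extractors):

class FeatureBank:
    def _exact_token_match(self, a: str, b: str) -> float:
        return 1.0 if a == b else 0.0

    def _length_difference(self, a: str, b: str) -> float:
        # Toy feature standing in for something like edit distance.
        return float(abs(len(a) - len(b)))

    def resolve(self, names):
        # Mirrors the loop above: look the method up by name, fail loudly on typos.
        extractors = []
        for name in names:
            extractor = getattr(self, "_" + name, None)
            if not extractor:
                raise ValueError(f"Invalid feature extractor name: {name}")
            extractors.append(extractor)
        return extractors

bank = FeatureBank()
features = [f("fox", "foxes") for f in bank.resolve(["exact_token_match", "length_difference"])]
print(features)  # [0.0, 2.0]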
Example #14
 def from_params(cls, params: Params) -> 'SnliReader':
     tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
     token_indexers = TokenIndexer.dict_from_params(params.pop('token_indexers', {}))
     params.assert_empty(cls.__name__)
     return SnliReader(tokenizer=tokenizer,
                       token_indexers=token_indexers)
Example #15
 def test_registry_has_builtin_tokenizers(self):
     assert Tokenizer.by_name("spacy").__name__ == "SpacyTokenizer"
     assert Tokenizer.by_name("character").__name__ == "CharacterTokenizer"
    def __init__(self,
                 knowledge_graph: KnowledgeGraph,
                 utterance_tokens: List[Token],
                 token_indexers: Dict[str, TokenIndexer],
                 tokenizer: Tokenizer = None,
                 feature_extractors: List[str] = None,
                 entity_tokens: List[List[Token]] = None,
                 linking_features: List[List[List[float]]] = None,
                 include_in_vocab: bool = True,
                 max_table_tokens: int = None) -> None:
        self.knowledge_graph = knowledge_graph
        if not entity_tokens:
            entity_texts = [knowledge_graph.entity_text[entity].lower()
                            for entity in knowledge_graph.entities]
            # TODO(mattg): Because we do tagging on each of these entities in addition to just
            # tokenizations, this is quite slow, and about half of our data processing time just
            # goes to this (~15 minutes when there are 7k instances).  The reason we do tagging is
            # so that we can add lemma features.  If we can remove the need for lemma / other
            # hand-written features, like with a CNN, we can cut down our data processing time by a
            # factor of 2.
            self.entity_texts = tokenizer.batch_tokenize(entity_texts)
        else:
            self.entity_texts = entity_tokens
        self.utterance_tokens = utterance_tokens
        self._token_indexers: Dict[str, TokenIndexer] = token_indexers
        self._include_in_vocab = include_in_vocab
        self._indexed_entity_texts: Dict[str, TokenList] = None
        self._max_table_tokens = max_table_tokens

        feature_extractors = feature_extractors if feature_extractors is not None else [
                'number_token_match',
                'exact_token_match',
                'contains_exact_token_match',
                'lemma_match',
                'contains_lemma_match',
                'edit_distance',
                'related_column',
                'related_column_lemma',
                'span_overlap_fraction',
                'span_lemma_overlap_fraction',
                ]
        self._feature_extractors: List[Callable[[str, List[Token], Token, int, List[Token]], float]] = []
        for feature_extractor_name in feature_extractors:
            extractor = getattr(self, '_' + feature_extractor_name, None)
            if not extractor:
                raise ConfigurationError(f"Invalid feature extractor name: {feature_extractor_name}")
            self._feature_extractors.append(extractor)

        if not linking_features:
            # For quicker lookups in our feature functions, we'll additionally store some
            # dictionaries that map entity strings to useful information about the entity.
            self._entity_text_map: Dict[str, List[Token]] = {}
            for entity, entity_text in zip(knowledge_graph.entities, self.entity_texts):
                self._entity_text_map[entity] = entity_text

            self._entity_text_exact_text: Dict[str, Set[str]] = {}
            for entity, entity_text in zip(knowledge_graph.entities, self.entity_texts):
                self._entity_text_exact_text[entity] = set(e.text for e in entity_text)

            self._entity_text_lemmas: Dict[str, Set[str]] = {}
            for entity, entity_text in zip(knowledge_graph.entities, self.entity_texts):
                self._entity_text_lemmas[entity] = set(e.lemma_ for e in entity_text)
            self.linking_features = self._compute_linking_features()
        else:
            self.linking_features = linking_features
 def test_registry_has_builtin_tokenizers(self):
     assert Tokenizer.by_name("word").__name__ == "WordTokenizer"
     assert Tokenizer.by_name("character").__name__ == "CharacterTokenizer"
Example #19
    @overrides
    def tokenize(self, text):
        u"""
        Does whatever processing is required to convert a string of text into a sequence of tokens.

        At a minimum, this uses a ``WordSplitter`` to split the text into words.  It may also do
        stemming or stopword removal, depending on the parameters given to the constructor.
        """
        words = self._word_splitter.split_words(text)
        return self._filter_and_stem(words)

    @overrides
    def batch_tokenize(self, texts):
        batched_words = self._word_splitter.batch_split_words(texts)
        return [self._filter_and_stem(words) for words in batched_words]

    def _filter_and_stem(self, words):
        filtered_words = self._word_filter.filter_words(words)
        stemmed_words = [
            self._word_stemmer.stem_word(word) for word in filtered_words
        ]
        for start_token in self._start_tokens:
            stemmed_words.insert(0, Token(start_token, 0))
        for end_token in self._end_tokens:
            stemmed_words.append(Token(end_token, -1))
        return stemmed_words


WordTokenizer = Tokenizer.register(u"word")(WordTokenizer)
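
The docstring above describes a split, filter, stem pipeline; a minimal sketch of that flow with stand-in components (hypothetical helpers, not the library's ``WordSplitter``/``WordFilter``/``WordStemmer`` classes):

STOPWORDS = {"the", "a", "an"}

def split_words(text):
    return text.lower().split()

def filter_words(words):
    return [w for w in words if w not in STOPWORDS]

def stem_word(word):
    # Toy "stemmer": strip a plural s.
    return word[:-1] if word.endswith("s") else word

def tokenize(text):
    # Same order as WordTokenizer.tokenize above: split, then filter and stem.
    return [stem_word(word) for word in filter_words(split_words(text))]

print(tokenize("The cats chase a dog"))  # ['cat', 'chase', 'dog']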