from abc import ABCMeta, abstractmethod

from nltk.lm.counter import NgramCounter
from nltk.lm.util import log_base2
from nltk.lm.vocabulary import Vocabulary

# `_mean`, `_random_generator` and `_weighted_choice` are module-level helpers
# (as in nltk.lm.api) and are assumed to be defined alongside this class.


class LanguageModel(metaclass=ABCMeta):
    """ABC for Language Models.

    Cannot be directly instantiated itself.
    """

    def __init__(self, order, vocabulary=None, counter=None):
        """Creates a new LanguageModel.

        :param int order: Highest ngram order of the model.
        :param vocabulary: If provided, this vocabulary will be used instead
            of creating a new one when training.
        :type vocabulary: `nltk.lm.Vocabulary` or None
        :param counter: If provided, use this object to count ngrams.
        :type counter: `nltk.lm.NgramCounter` or None
        """
        self.order = order
        self.vocab = Vocabulary() if vocabulary is None else vocabulary
        self.counts = NgramCounter() if counter is None else counter

    def fit(self, text, vocabulary_text=None):
        """Trains the model on a text.

        :param text: Training text as a sequence of sentences.
        """
        if not self.vocab:
            if vocabulary_text is None:
                raise ValueError(
                    "Cannot fit without a vocabulary or text to create it from."
                )
            self.vocab.update(vocabulary_text)
        self.counts.update(self.vocab.lookup(sent) for sent in text)

    def score(self, word, context=None):
        """Masks out-of-vocabulary (OOV) words and computes their model score.

        For model-specific logic of calculating scores, see the
        `unmasked_score` method.
        """
        return self.unmasked_score(
            self.vocab.lookup(word), self.vocab.lookup(context) if context else None
        )

    @abstractmethod
    def unmasked_score(self, word, context=None):
        """Score a word given some optional context.

        Concrete models are expected to provide an implementation.
        Note that this method does not mask its arguments with the OOV label.
        Use the `score` method for that.

        :param str word: Word for which we want the score.
        :param context: Context the word is in.
            If `None`, compute unigram score.
        :type context: tuple(str) or None
        :rtype: float
        """
        raise NotImplementedError()

    def logscore(self, word, context=None):
        """Evaluate the log score of this word in this context.

        The arguments are the same as for `score` and `unmasked_score`.
        """
        return log_base2(self.score(word, context))

    def context_counts(self, context):
        """Helper method for retrieving counts for a given context.

        Assumes context has been checked and OOV words in it masked.

        :type context: tuple(str) or None
        """
        return (
            self.counts[len(context) + 1][context] if context else self.counts.unigrams
        )

    def entropy(self, text_ngrams):
        """Calculate cross-entropy of model for given evaluation text.

        :param Iterable(tuple(str)) text_ngrams: A sequence of ngram tuples.
        :rtype: float
        """
        return -1 * _mean(
            [self.logscore(ngram[-1], ngram[:-1]) for ngram in text_ngrams]
        )

    def perplexity(self, text_ngrams):
        """Calculates the perplexity of the given text.

        This is simply 2 ** cross-entropy for the text, so the arguments are
        the same.
        """
        return pow(2.0, self.entropy(text_ngrams))

    def generate(self, num_words=1, text_seed=None, random_seed=None):
        """Generate words from the model.

        :param int num_words: How many words to generate. By default 1.
        :param text_seed: Generation can be conditioned on preceding context.
        :param random_seed: A random seed or an instance of `random.Random`.
            If provided, makes the random sampling part of generation
            reproducible.
        :return: One (str) word or a list of words generated from the model.

        Examples:

        >>> from nltk.lm import MLE
        >>> lm = MLE(2)
        >>> lm.fit([[("a", "b"), ("b", "c")]], vocabulary_text=['a', 'b', 'c'])
        >>> lm.fit([[("a",), ("b",), ("c",)]])
        >>> lm.generate(random_seed=3)
        'a'
        >>> lm.generate(text_seed=['a'])
        'b'
        """
        text_seed = [] if text_seed is None else list(text_seed)
        random_generator = _random_generator(random_seed)
        # Base recursion case.
        if num_words == 1:
            context = (
                text_seed[-self.order + 1 :]
                if len(text_seed) >= self.order
                else text_seed
            )
            samples = self.context_counts(self.vocab.lookup(context))
            # Back off to shorter contexts until one with counts is found.
            while context and not samples:
                context = context[1:] if len(context) > 1 else []
                samples = self.context_counts(self.vocab.lookup(context))
            # Sorting samples achieves two things:
            # - reproducible randomness when sampling
            # - turns the Mapping into a Sequence, which `_weighted_choice` expects
            samples = sorted(samples)
            return _weighted_choice(
                samples,
                tuple(self.score(w, context) for w in samples),
                random_generator,
            )
        # Build up text one word at a time, conditioning on what was generated so far.
        generated = []
        for _ in range(num_words):
            generated.append(
                self.generate(
                    num_words=1,
                    text_seed=text_seed + generated,
                    random_seed=random_generator,
                )
            )
        return generated
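# Usage sketch for the class above, assuming the concrete `MLE` subclass and
# the `padded_everygram_pipeline` helper from nltk.lm / nltk.lm.preprocessing;
# the toy corpus and variable names are illustrative only. It trains a bigram
# model and exercises the scoring and generation API defined above.
if __name__ == "__main__":
    from nltk.lm import MLE
    from nltk.lm.preprocessing import padded_everygram_pipeline

    corpus = [["a", "b", "c"], ["a", "c", "b"]]
    # Yields padded ngrams up to order 2 plus the flattened text for the vocabulary.
    train_ngrams, vocab_text = padded_everygram_pipeline(2, corpus)

    lm = MLE(2)
    lm.fit(train_ngrams, vocab_text)      # builds the vocabulary, then counts ngrams
    print(lm.score("b", ("a",)))          # P(b | a), with OOV masking applied
    print(lm.logscore("b", ("a",)))       # log2 of the same probability
    print(lm.generate(5, random_seed=3))  # a list of 5 sampled words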
# Alternative version of the same class. The scoring API is identical; the
# main behavioural difference is in `generate`, which threads `random_seed`
# itself through the recursion instead of converting it to a `random.Random`
# up front, so `_weighted_choice` here is expected to accept a raw seed (or
# None) as its third argument. It relies on the same imports and module-level
# helpers (`_mean`, `_weighted_choice`) as the version above.


class LanguageModel(metaclass=ABCMeta):
    """ABC for Language Models.

    Cannot be directly instantiated itself.
    """

    def __init__(self, order, vocabulary=None, counter=None):
        """Creates a new LanguageModel.

        :param int order: Highest ngram order of the model.
        :param vocabulary: If provided, this vocabulary will be used instead
            of creating a new one when training.
        :type vocabulary: `nltk.lm.Vocabulary` or None
        :param counter: If provided, use this object to count ngrams.
        :type counter: `nltk.lm.NgramCounter` or None
        """
        self.order = order
        self.vocab = Vocabulary() if vocabulary is None else vocabulary
        self.counts = NgramCounter() if counter is None else counter

    def fit(self, text, vocabulary_text=None):
        """Trains the model on a text.

        :param text: Training text as a sequence of sentences.
        """
        if not self.vocab:
            if vocabulary_text is None:
                raise ValueError("Cannot fit without a vocabulary or text to "
                                 "create it from.")
            self.vocab.update(vocabulary_text)
        self.counts.update(self.vocab.lookup(sent) for sent in text)

    def score(self, word, context=None):
        """Masks out-of-vocabulary (OOV) words and computes their model score.

        For model-specific logic of calculating scores, see the
        `unmasked_score` method.
        """
        return self.unmasked_score(
            self.vocab.lookup(word),
            self.vocab.lookup(context) if context else None)

    @abstractmethod
    def unmasked_score(self, word, context=None):
        """Score a word given some optional context.

        Concrete models are expected to provide an implementation.
        Note that this method does not mask its arguments with the OOV label.
        Use the `score` method for that.

        :param str word: Word for which we want the score.
        :param context: Context the word is in.
            If `None`, compute unigram score.
        :type context: tuple(str) or None
        :rtype: float
        """
        raise NotImplementedError()

    def logscore(self, word, context=None):
        """Evaluate the log score of this word in this context.

        The arguments are the same as for `score` and `unmasked_score`.
        """
        return log_base2(self.score(word, context))

    def context_counts(self, context):
        """Helper method for retrieving counts for a given context.

        Assumes context has been checked and OOV words in it masked.

        :type context: tuple(str) or None
        """
        return (self.counts[len(context) + 1][context]
                if context else self.counts.unigrams)

    def entropy(self, text_ngrams):
        """Calculate cross-entropy of model for given evaluation text.

        :param Iterable(tuple(str)) text_ngrams: A sequence of ngram tuples.
        :rtype: float
        """
        return -1 * _mean(
            [self.logscore(ngram[-1], ngram[:-1]) for ngram in text_ngrams])

    def perplexity(self, text_ngrams):
        """Calculates the perplexity of the given text.

        This is simply 2 ** cross-entropy for the text, so the arguments are
        the same.
        """
        return pow(2.0, self.entropy(text_ngrams))

    def generate(self, num_words=1, text_seed=None, random_seed=None):
        """Generate words from the model.

        :param int num_words: How many words to generate. By default 1.
        :param text_seed: Generation can be conditioned on preceding context.
        :param random_seed: If provided, makes the random sampling part of
            generation reproducible.
        :return: One (str) word or a list of words generated from the model.

        Examples:

        >>> from nltk.lm import MLE
        >>> lm = MLE(2)
        >>> lm.fit([[("a", "b"), ("b", "c")]], vocabulary_text=['a', 'b', 'c'])
        >>> lm.fit([[("a",), ("b",), ("c",)]])
        >>> lm.generate(random_seed=3)
        'a'
        >>> lm.generate(text_seed=['a'])
        'b'
        """
        text_seed = [] if text_seed is None else list(text_seed)
        # Base recursion case.
        if num_words == 1:
            context = (text_seed[-self.order + 1:]
                       if len(text_seed) >= self.order else text_seed)
            samples = self.context_counts(self.vocab.lookup(context))
            # Back off to shorter contexts until one with counts is found.
            while context and not samples:
                context = context[1:] if len(context) > 1 else []
                samples = self.context_counts(self.vocab.lookup(context))
            # Sorting samples achieves two things:
            # - reproducible randomness when sampling
            # - turns the Mapping into a Sequence, which `_weighted_choice` expects
            samples = sorted(samples)
            return _weighted_choice(
                samples,
                tuple(self.score(w, context) for w in samples),
                random_seed)
        # Build up text one word at a time, conditioning on what was generated so far.
        generated = []
        for _ in range(num_words):
            generated.append(
                self.generate(
                    num_words=1,
                    text_seed=text_seed + generated,
                    random_seed=random_seed,
                ))
        return generated
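# Evaluation sketch for the variant above, under the same assumptions as the
# previous example (the `MLE` subclass and preprocessing helpers from nltk.lm;
# the toy corpus is illustrative only). It shows the shape of input that
# `entropy` and `perplexity` expect: entropy averages -log2 P(word | context)
# over the supplied ngram tuples, and perplexity is 2 ** entropy on the same input.
if __name__ == "__main__":
    from nltk.lm import MLE
    from nltk.lm.preprocessing import pad_both_ends, padded_everygram_pipeline
    from nltk.util import bigrams

    corpus = [["a", "b", "c"], ["a", "c", "b"]]
    train_ngrams, vocab_text = padded_everygram_pipeline(2, corpus)
    lm = MLE(2)
    lm.fit(train_ngrams, vocab_text)

    # A held-out sentence turned into padded bigram tuples, which is what entropy() consumes.
    held_out = list(bigrams(pad_both_ends(["a", "b", "c"], n=2)))
    print(lm.entropy(held_out))
    print(lm.perplexity(held_out))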
import random
from abc import ABCMeta, abstractmethod
from collections import defaultdict
from functools import partial
from sys import getsizeof

from nltk.lm.counter import NgramCounter
from nltk.lm.util import log_base2
from nltk.lm.vocabulary import Vocabulary

# This variant additionally assumes module-level helpers `_mean`,
# `_random_generator`, a tqdm-style `progress` wrapper and a `greedy_decoding`
# sampler (see the sketch after the class).


class LanguageModel(metaclass=ABCMeta):
    """ABC for Language Models.

    Cannot be directly instantiated itself.
    """

    def __init__(
        self,
        order,
        vocabulary=None,
        counter=None,
        verbose=True,
    ):
        """Creates a new LanguageModel.

        :param int order: Highest ngram order of the model.
        :param vocabulary: If provided, this vocabulary will be used instead
            of creating a new one when training.
        :type vocabulary: `nltk.lm.Vocabulary` or None
        :param counter: If provided, use this object to count ngrams.
        :type counter: `nltk.lm.NgramCounter` or None
        :param bool verbose: If True, wrap long-running loops in a `progress` bar.
        """
        self.order = order
        self.vocab = Vocabulary() if vocabulary is None else vocabulary
        self.counts = NgramCounter() if counter is None else counter
        def_dict_callable = partial(defaultdict, float)
        self._cache = defaultdict(def_dict_callable)
        self.verbose = verbose

    def _update_cache(self, word):
        # Expects `word` to be an (index, word) pair; the index is unused.
        # Assumes a `cache_limit` attribute (minimum number of observations a
        # context needs before it is worth caching) has been set elsewhere.
        i, word = word
        ret_list = []
        for order in range(2, self.order + 1):
            for context in self.counts[order].keys():
                if self.counts[order][context].N() > self.cache_limit:
                    ret_list.append((context, word, self.score(word, context)))
        return ret_list

    def _check_cache_size(self):
        # Rough size of the cache dict in megabytes; note that `getsizeof`
        # does not include the cached values themselves.
        return getsizeof(self._cache) / 1e6

    def fit(self, text, vocabulary_text=None, verbose=True):
        """Trains the model on a text.

        :param text: Training text as a sequence of sentences.
        """
        if not self.vocab:
            if vocabulary_text is None:
                raise ValueError(
                    "Cannot fit without a vocabulary or text to create it from."
                )
            self.vocab.update(vocabulary_text)
        _iter = (self.vocab.lookup(sent) for sent in text)
        self.counts.update(
            progress(_iter, desc="Fitting the model") if self.verbose else _iter)

    def score(self, word, context=None):
        """Masks out-of-vocabulary (OOV) words and computes their model score.

        For model-specific logic of calculating scores, see the
        `unmasked_score` method.
        """
        return self.unmasked_score(
            self.vocab.lookup(word),
            self.vocab.lookup(context) if context else None)

    @abstractmethod
    def unmasked_score(self, word, context=None):
        """Score a word given some optional context.

        Concrete models are expected to provide an implementation.
        Note that this method does not mask its arguments with the OOV label.
        Use the `score` method for that.

        :param str word: Word for which we want the score.
        :param context: Context the word is in.
            If `None`, compute unigram score.
        :type context: tuple(str) or None
        :rtype: float
        """
        raise NotImplementedError()

    def logscore(self, word, context=None):
        """Evaluate the log score of this word in this context.

        The arguments are the same as for `score` and `unmasked_score`.
        """
        return log_base2(self.score(word, context))

    def context_counts(self, context):
        """Helper method for retrieving counts for a given context.

        Assumes context has been checked and OOV words in it masked.

        :type context: tuple(str) or None
        """
        return (self.counts[len(context) + 1][context]
                if context else self.counts.unigrams)

    def entropy(self, text_ngrams):
        """Calculate cross-entropy of model for given evaluation text.

        :param Iterable(tuple(str)) text_ngrams: A sequence of ngram tuples.
        :rtype: float
        """
        return -1 * _mean(
            [self.logscore(ngram[-1], ngram[:-1]) for ngram in text_ngrams])

    def perplexity(self, text_ngrams):
        """Calculates the perplexity of the given text.

        This is simply 2 ** cross-entropy for the text, so the arguments are
        the same.
        """
        return pow(
            2.0,
            self.entropy(
                progress(text_ngrams, desc="Calculating Perplexity")
                if self.verbose else text_ngrams))

    def context_probabilities(self, context):
        """Helper method for retrieving probabilities for a given context,
        including all the words in the vocabulary.

        Assumes context has been checked and OOV words in it masked.

        :type context: tuple(str) or None
        """
        if context not in self._cache.keys():
            self._cache[context] = {
                word: self.score(word, context)
                for word in self.vocab.counts.keys()
            }
        return self._cache[context]

    def _generate_single_word(self, sampler_func, text_seed, random_generator,
                              sampler_kwargs):
        context = tuple(
            text_seed[-self.order + 1:]
            if len(text_seed) >= self.order else text_seed)
        distribution = self.context_probabilities(context)
        # Sorting the distribution achieves two things:
        # - reproducible randomness when sampling
        # - turns the dictionary into a sequence, which the sampler expects
        distribution = sorted(distribution.items(),
                              key=lambda x: x[1],
                              reverse=True)
        return sampler_func(distribution,
                            random_generator=random_generator,
                            **sampler_kwargs)

    def generate(
        self,
        sampler_func=greedy_decoding,
        num_words=1,
        text_seed=None,
        random_seed=None,
        sampler_kwargs=None,
        EOS=None,
    ):
        """Generate words from the model using `sampler_func`.

        The sampler receives the (word, probability) pairs for the current
        context, sorted by descending probability. Generation stops early if
        the `EOS` token is produced.
        """
        # NOTE: the initial seed word is drawn with the module-level RNG, not
        # with `random_generator`; `random.sample` requires a sequence.
        text_seed = (random.sample(list(self.vocab.counts.keys()), 1)
                     if text_seed is None else list(text_seed))
        random_generator = _random_generator(random_seed)
        # Copy the kwargs so a shared (or default) dict is never mutated.
        sampler_kwargs = dict(sampler_kwargs) if sampler_kwargs else {}
        if EOS:
            sampler_kwargs["EOS"] = EOS
        # We build up text one word at a time using the preceding context.
        generated = []
        _iter = range(num_words)
        for _ in (progress(_iter, desc="Generating words")
                  if self.verbose else _iter):
            token = self._generate_single_word(
                sampler_func=sampler_func,
                text_seed=text_seed + generated,
                random_generator=random_generator,
                sampler_kwargs=sampler_kwargs,
            )
            generated.append(token)
            if token == EOS:
                break
        return generated