def n_gramas_texto(self,texto,N):
    """Build the list of n-grams found in a text.

    A byte string is first decoded as UTF-8. The text is then passed through
    ``self.filtro_caracteres_especiales`` and ``self.filtro_palabras_cerradas``
    (the latter yields the token sequence) before the n-grams are generated.

    :param texto: (text) text string
    :param N: (int) size of the n-gram; 2 is a bigram, 3 a trigram and so on
    :return: (list) the n-grams, each joined with single spaces; an empty
        list when ``N`` is not an int or the text cannot be processed
    """
    if not isinstance(N, int):
        return []

    listaNgrama = []
    try:
        if isinstance(texto, str):
            texto = unicode(texto, "utf-8", "xmlcharrefreplace")
        texto = self.filtro_caracteres_especiales(texto)
        lista = self.filtro_palabras_cerradas(texto)
        for Ngramas in ingrams(lista, N, pad_right=True):
            # Right padding produces trailing windows whose last slot is
            # None; those incomplete windows are skipped.
            if (str(Ngramas[N-1]) != "None"):
                listaNgrama.append(" ".join(Ngramas))
        return listaNgrama
    except (TypeError, UnicodeDecodeError):
        return []
    except Exception:
        # NOTE(review): the source carried a third handler that also returned
        # [], so this method is deliberately best-effort -- confirm.
        return []
Example #2
    def __init__(self, n, train, estimator=None, *estimator_args, **estimator_kw_args):
        """
        Creates an ngram language model to capture patterns in n consecutive
        words of training text.  An estimator smooths the probabilities derived
        from the text and may allow generation of ngrams not seen during
        training.

            >>> from nltk.corpus import brown
            >>> from nltk.probability import LidstoneProbDist
            >>> estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
            >>> lm = NgramModel(3, brown.words(categories='news'), estimator)
            >>> lm.entropy(['The', 'Fulton', 'County', 'Grand', 'Jury', 'said',
            ... 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent',
            ... 'primary', 'election', 'produced', '``', 'no', 'evidence',
            ... "''", 'that', 'any', 'irregularities', 'took', 'place', '.'])
            ... # doctest: +ELLIPSIS

        :param n: the order of the language model (ngram size)
        :type n: int
        :param train: the training text
        :type train: list of string
        :param estimator: a function for generating a probability distribution
        :type estimator: a function that takes a ConditionalFreqDist and
              returns a ConditionalProbDist
        :param estimator_args: Extra arguments for estimator.
            These arguments are usually used to specify extra
            properties for the probability distributions of individual
            conditions, such as the number of bins they contain.
            Note: For backward-compatibility, if no arguments are specified, the
            number of bins in the underlying ConditionalFreqDist are passed to
            the estimator as an argument.
        :type estimator_args: (any)
        :param estimator_kw_args: Extra keyword arguments for estimator.
        :type estimator_kw_args: (any)
        """

        self._n = n

        if estimator is None:
            estimator = _estimator

        cfd = ConditionalFreqDist()
        self._ngrams = set()
        self._prefix = ("",) * (n - 1)

        for ngram in ingrams(chain(self._prefix, train), n):
            # Remember every ngram seen and count its last token under the
            # context formed by the preceding n-1 tokens.
            self._ngrams.add(ngram)
            context = tuple(ngram[:-1])
            token = ngram[-1]
            cfd[context][token] += 1

        if (not estimator_args) and (not estimator_kw_args):
            # Backward-compatible call: pass the number of bins.
            self._model = ConditionalProbDist(cfd, estimator, len(cfd))
        else:
            self._model = ConditionalProbDist(cfd, estimator, *estimator_args, **estimator_kw_args)

        # recursively construct the lower-order models
        self._backoff = None
        if n > 1:
            self._backoff = NgramModel(n - 1, train, estimator, *estimator_args, **estimator_kw_args)
Example #3
    def eval_word(self, word):
        """
        Score a proposed word with the ngram model.

        The per-ngram probabilities of the delimited word are merged by
        ``self._score_combiner`` and multiplied by an adjustment factor.

        @param word: the proposed word, as a sequence of symbols
        @return: probability that proposed word is a word.
        """
        if self._hypothetical_phonotactics:
            # Make a deep copy of the ngram model so that we can update it without modifying the original
            word_ngram_model = copy.deepcopy(self._ngram_model)
        else:
            word_ngram_model = self._ngram_model
        # NOTE(review): word_ngram_model is not used below; a statement that
        # updates it may have been lost from this copy -- confirm.

        # If I want to duplicate functionality of OCaml code, should probably make a ProbDist that inherits from Lidstone,
        # otherwise this won't do the denominator adjustments and other stuff.

        if self._n > 1:
            adjustment = Fraction(1)
        else:
            # Need to compensate for the lack of an empty word by getting rel freq of word_delimiter somehow
            adjustment = Fraction(1) / (Fraction(1) - self._ngram_model.prob(self._word_delimiter, self._ngram_model._padding))

        # Higher-order models get a delimiter on both sides of the word,
        # unigram models only a trailing one.
        if self._n > 1:
            pieces = [self._word_delimiter, word, self._word_delimiter]
        else:
            pieces = [word, self._word_delimiter]

        raw_score = self._score_combiner([self._ngram_model.prob(ngram[-1], tuple(ngram[:-1]))
                                          for ngram in ingrams(chain(*pieces), self._n)])

        return adjustment * raw_score
Example #4
 def N_gramas_lista(self,lista,N):
     """Build the list of n-grams of a list of words.

     Each word is decoded with the ``codificacion`` encoding, stripped of
     punctuation, lower-cased and passed through ``self.elimina_tildes``;
     words that end up empty or blank are dropped.

     :param lista: (list of str) the words
     :param N: (int) size of the n-gram
     :return: (list) the n-grams, each joined with single spaces; an empty
         list when the arguments have the wrong type or processing fails
     """
     if not (isinstance(lista, list) and isinstance(N, int)):
         return []
     if not all(type(x) is str for x in lista):
         return []

     listaNgrama = []
     try:
         remove_punctuation_map = dict((ord(char), None) for char in string.punctuation)
         lista = [unicode(s, codificacion) for s in lista]
         lista = [s.translate(remove_punctuation_map).lower() for s in lista]
         lista = [str(self.elimina_tildes(s)) for s in lista]
         # A word whose stripped form is empty is also empty or blank itself,
         # so one pass replaces the two chained filter() calls.
         lista = [s for s in lista if s.strip()]
         for Ngramas in ingrams(lista, N, pad_right=True):
             # Skip right-padded windows whose last slot is None.
             if (str(Ngramas[N-1]) != "None"):
                 listaNgrama.append(" ".join(Ngramas))
         return listaNgrama
     except (TypeError, UnicodeDecodeError):
         return []
     except Exception:
         # NOTE(review): the source carried a third handler that also
         # returned [], so this method is deliberately best-effort -- confirm.
         return []
Example #5
    def __init__(self, n, train, estimator=None):
        """
        Creates an ngram language model to capture patterns in n consecutive
        words of training text.  An estimator smooths the probabilities derived
        from the text and may allow generation of ngrams not seen during
        training.

        :param n: the order of the language model (ngram size)
        :type n: int
        :param train: the training text
        :type train: list(str)
        :param estimator: a function for generating a probability distribution
        :type estimator: function(ConditionalFreqDist) -> ConditionalProbDist
        """

        self._n = n

        if estimator is None:
            estimator = _estimator
        cfd = ConditionalFreqDist()
        self._ngrams = set()
        self._prefix = ('',) * (n - 1)

        for ngram in ingrams(chain(self._prefix, train), n):
            # Remember every ngram seen and count its last token under the
            # context formed by the preceding n-1 tokens.
            self._ngrams.add(ngram)
            context = tuple(ngram[:-1])
            token = ngram[-1]
            cfd[context][token] += 1

        self._model = ConditionalProbDist(cfd, estimator, len(cfd))

        # recursively construct the lower-order models
        self._backoff = None
        if n > 1:
            self._backoff = NgramModel(n-1, train, estimator)
    @classmethod
    def from_words(cls, words):
        """Construct a QuadgramCollocationFinder for all quadgrams in the given
        sequence.

        Six frequency distributions are filled from every window of four
        consecutive words and handed to the class constructor.
        """
        wfd = FreqDist()
        bfd = FreqDist()
        wild2fd = FreqDist()
        tfd = FreqDist()
        wild3fd = FreqDist()
        qfd = FreqDist()

        # NOTE(review): this copy had lost the receiver of every counter
        # update.  They are reassigned by tuple shape: contiguous tuples go
        # to bfd/tfd/qfd, gapped pairs to wild2fd, gapped triples to wild3fd.
        # Confirm against the original source.
        for w1, w2, w3, w4 in ingrams(words, 4, pad_right=True):
            wfd[w1] += 1
            if w2 is None:
                continue
            bfd[(w1, w2)] += 1
            if w3 is None:
                continue
            wild2fd[(w1, w3)] += 1
            tfd[(w1, w2, w3)] += 1
            if w4 is None:
                continue
            wild2fd[(w1, w4)] += 1
            wild3fd[(w1, w2, w4)] += 1
            wild3fd[(w1, w3, w4)] += 1
            qfd[(w1, w2, w3, w4)] += 1
        return cls(wfd, bfd, wild2fd, tfd, wild3fd, qfd)
def big_test(version="3.0", max_length=3):
    """Count Brown-corpus n-grams that WordNet lists as nouns, then set up an
    ontology writer for the chosen WordNet version.

    :param version: WordNet version handed to ``load_wn``; any value other
        than "3.0" also changes the output file name.
    :param max_length: exclusive upper bound of the n-gram lengths counted.
    """
    from topicmod.util.wordnet import load_wn
    from nltk.corpus import brown
    from nltk.util import ingrams

    wn = load_wn(version)

    # Occurrence count per term, keyed by the result of wn.morphy().
    term_counts = defaultdict(int)

    # NOTE(review): xrange(max_length) yields 0 .. max_length-1, so length 0
    # is requested and max_length itself is never reached -- confirm intent.
    for ngram_length in xrange(max_length):
        token = 0
        for w in ingrams(brown.words(), ngram_length):
            token += 1
            # Multi-word terms are joined with underscores before lookup.
            normalized = "_".join(w).lower()
            if wn.synsets(normalized, 'n'):
                term_counts[wn.morphy(normalized)] += 1

    filename = "wn/wordnet.wn"
    if version != "3.0":
        filename = "wn/wordnet_%s.wn" % version
    o = OntologyWriter(filename)
    for ii in orderedTraversal(wn):
        # NOTE(review): the statement that opens this call is missing and the
        # tuple below has lost two fields; the loop body starts mid-argument
        # list and is not valid Python.  Restore from the original source.
                    [x.offset for x in ii.hyponyms() + ii.instance_hyponyms()],
                    [(0,, term_counts[] + 1)
                     for x in ii.lemmas])
    def __init__(self, n, train, pad_left=True, pad_right=False,estimator=None, *estimator_args, **estimator_kwargs):
        """
        Build an ngram model with optional left/right padding and per-context
        backoff weights.

        :param n: the order of the language model (ngram size)
        :param train: the training text: a list of sentences (each a list of
            strings) or a single list of strings
        :param pad_left: whether each sentence is padded on the left with
            n-1 empty strings
        :param pad_right: whether each sentence is padded on the right with
            n-1 empty strings
        :param estimator: function building a probability distribution from
            a frequency distribution; ``_estimator`` is used when None
        :param estimator_args: extra positional arguments for the estimator
        :param estimator_kwargs: extra keyword arguments for the estimator
        """
        super(MyNgramModel,self).__init__(n,train,pad_left,pad_right,estimator,*estimator_args, **estimator_kwargs)
        assert(isinstance(pad_left, bool))
        assert(isinstance(pad_right, bool))
        self._n = n
        self._lpad = ('',) * (n - 1) if pad_left else ()
        self._rpad = ('',) * (n - 1) if pad_right else ()

        if estimator is None:
            estimator = _estimator

        self._cfd = ConditionalFreqDist()
        self._ngrams = set()
        if train is None:
            # Nothing to count; previously this crashed on train[0].
            train = []
        elif train and isinstance(train[0], basestring):
            # If given a list of strings instead of a list of lists, create enclosing list
            train = [train]

        for sent in train:
            for ngram in ingrams(chain(self._lpad, sent, self._rpad), n):
                # Remember every ngram seen and count its last token under
                # the context formed by the preceding n-1 tokens.
                self._ngrams.add(ngram)
                context = tuple(ngram[:-1])
                token = ngram[-1]
                self._cfd[context][token] += 1

        if not estimator_args and not estimator_kwargs:
            self._model = ConditionalProbDist(self._cfd, estimator, len(self._cfd))
        else:
            self._model = ConditionalProbDist(self._cfd, estimator, *estimator_args, **estimator_kwargs)

        # recursively construct the lower-order models
        self._backoff = None
        if n > 1:
            self._backoff = MyNgramModel(n-1, train, pad_left, pad_right, estimator, *estimator_args, **estimator_kwargs)
            self._backoff_alphas = dict()
            # For each condition (or context)
            for ctxt in self._cfd.conditions():
                pd = self._model[ctxt] # prob dist for this context
                backoff_ctxt = ctxt[1:]
                backoff_total_pr = 0
                total_observed_pr = 0
                for word in self._cfd[ctxt].keys(): # this is the subset of words that we OBSERVED
                    backoff_total_pr += self._backoff.prob(word,backoff_ctxt)
                    total_observed_pr += pd.prob(word)
                assert total_observed_pr <= 1 and total_observed_pr > 0
                assert backoff_total_pr <= 1 and backoff_total_pr > 0
                # NOTE(review): backoff_total_pr == 1 passes the assertion
                # above but makes this division raise ZeroDivisionError.
                alpha_ctxt = (1.0-total_observed_pr) / (1.0-backoff_total_pr)
                self._backoff_alphas[ctxt] = alpha_ctxt
Example #9
    def __init__(self, n, train, estimator=None, *estimator_args, **estimator_kw_args):
        """
        Creates an ngram language model to capture patterns in n consecutive
        words of training text.  An estimator smooths the probabilities derived
        from the text and may allow generation of ngrams not seen during
        training.

        @param n: the order of the language model (ngram size)
        @type n: C{int}
        @param train: the training text
        @type train: C{list} of C{string} (or C{list} of C{string} C{list}s)
        @param estimator: a function for generating a probability distribution
        @type estimator: a function that takes a C{ConditionalFreqDist} and
              returns a C{ConditionalProbDist}
        @param estimator_args: Extra arguments for C{estimator}.
            These arguments are usually used to specify extra
            properties for the probability distributions of individual
            conditions, such as the number of bins they contain.
            Note: For backward-compatibility, if no arguments are specified, the
            number of bins in the underlying C{ConditionalFreqDist} are passed to
            the estimator as an argument.
        @type estimator_args: (any)
        @param estimator_kw_args: Extra keyword arguments for C{estimator}.
        @type estimator_kw_args: (any)
        """

        self._n = n

        if estimator is None:
            estimator = _estimator

        cfd = ConditionalFreqDist()
        self._ngrams = set()
        self._padding = ('',) * (n - 1)

        # If given a list of strings instead of a list of lists, create enclosing list
        if isinstance(train[0], basestring):
            train = [train]

        for utterance in train:
            # Every utterance is padded on both sides with n-1 empty strings.
            for ngram in ingrams(chain(self._padding, utterance, self._padding), n):
                self._ngrams.add(ngram)
                context = tuple(ngram[:-1])
                token = ngram[-1]
                cfd[context][token] += 1

        if (not estimator_args) and (not estimator_kw_args):
            # Backward-compatible call: pass the number of bins.
            self._model = ConditionalProbDist(cfd, estimator, len(cfd))
        else:
            self._model = ConditionalProbDist(cfd, estimator, *estimator_args, **estimator_kw_args)

        # recursively construct the lower-order models
        self._backoff = None
        if n > 1:
            self._backoff = NgramModel(n-1, train, estimator, *estimator_args, **estimator_kw_args)
Example #10
def quantify_variant(analysis, variant):
    """Count the part-of-speech n-grams of an analysis.

    The n-gram size is ``variant + 1``.  Only the tag of each pair returned
    by ``analysis.pos_tags()`` is used.  The result maps each n-gram, passed
    through ``output_filter_ngram``, to its count.
    """
    order = variant + 1
    tags = [tag for (_, tag) in analysis.pos_tags()]

    tally = {}
    for gram in ingrams(tags, order):
        sparse_dict_increment(tally, gram)

    return {output_filter_ngram(gram): count for (gram, count) in tally.items()}
Example #11
    def classify(self, sentence, tokenizer_lang, ngram_length=3):
        """Classify every token of a sentence and tally the predicted labels.

        Each token produced by the tokenizer registered for
        ``tokenizer_lang`` becomes one feature dict of n-gram counts.  The
        dicts are wrapped in a ``SparseDataSet`` and classified one by one.

        :param sentence: the text to classify
        :param tokenizer_lang: key into ``self.tokenizers``
        :param ngram_length: size of the n-grams taken from each token
        :return: a ``FreqDist`` counting how often each label was predicted
        """
        features = []
        for ii in self.tokenizers[tokenizer_lang].tokenize(sentence):
            d = {}
            for jj in ingrams(ii, ngram_length):
                d[jj] = d.get(jj, 0) + 1
            # One feature dict per token; without this the data set is empty.
            features.append(d)
        data = SparseDataSet(features)

        f = FreqDist()
        for ii in [self._labels[self._classifier.classify(data, x)[0]] for x in xrange(len(features))]:
            f[ii] += 1
        return f
Example #12
    def update(self, samples, increase_amount=1):
        """
        Update the underlying frequency distributions given the current list of samples.

        Each utterance is padded on both sides with ``self._padding`` and
        split into ngrams; every (context, token) pair is passed to the
        model, and the same samples are passed on to the backoff model if
        there is one.

        @param samples: the utterances to add
        @param increase_amount: amount passed to the model's update for
            every observed pair
        """
        cond_samples = [(tuple(ngram[:-1]), ngram[-1])
                        for utterance in samples
                        for ngram in ingrams(chain(self._padding, utterance, self._padding), self._n)]
        self._model.update(cond_samples, increase_amount)

        # Recursively update lower-order models
        if self._backoff:
            self._backoff.update(samples, increase_amount)
Example #13
    def AddInstance(self, lang, line, ngram_length):
        # Registers one tab-separated "<id>\t<sentence>" line under the label
        # `lang` and counts its n-grams.
        # NOTE(review): several statements are missing from this copy (see
        # the notes below); the method is not valid Python as it stands.
        if not lang in self.label_names:
            # NOTE(review): the body of this `if` is missing -- presumably it
            # registered `lang` in self.label_names; confirm.

        # Everything before the first tab is the id, the rest is the sentence.
        id = line[:line.find("\t")]
        sentence = line[line.find("\t")+1:].strip()

        # n-gram -> number of occurrences.
        # NOTE(review): n-grams are taken from the whole `line` (id and tab
        # included), not from `sentence`, and `d` is never stored afterwards.
        d = {}
        for ii in ingrams(line, ngram_length):
            d[ii] = d.get(ii, 0) + 1

        # NOTE(review): the next line looks like two statements fused
        # together with part of the second one lost.
        self.labels.append(self.label_names.index(lang))"%s-%s" % (lang, id))
    def GetNGrams(self,tokenized_sentences):
        """Return all n-grams of the given tokenized sentences as one flat list.

        Inputs:
            tokenized_sentences = the sentences after tokenizing the file.
        Outputs:
            n_grams = every n-gram found, in order of appearance; duplicates
                      are kept.

        The n-gram size and the padding flags come from ``self.N``,
        ``self.leftpad`` and ``self.rightpad``.
        """
        n_grams = []
        for token_sent in tokenized_sentences:
            # Previously each sentence's n-grams were computed and then
            # dropped, so the method always returned an empty list.
            n_grams.extend(ingrams(token_sent,self.N,self.leftpad,self.rightpad))
        return n_grams
Example #15
 def prepareBigrams(self, window_size, word):
     """Count words and windowed bigrams over the stored sentences.

     Windows starting with an ignored word are skipped, as are partner words
     that are ignored or are right-padding (None).  The counts are stored as
     a ``MyBigramCollFinder`` in ``self.__bigrams``.

     :param window_size: number of consecutive words per window
     :param word: when '', every bigram is counted; otherwise only bigrams
         containing ``word``, and a first word is counted only if at least
         one such bigram was found in its window
     """
     wfd = FreqDist()
     bfd = FreqDist()
     any_word = (word == '')
     for sentence in self.__sentences:
         if len(sentence) <= 1:
             continue
         for window in ingrams(sentence, window_size, pad_right=True):
             w1 = window[0]
             if w1 in self.__ignoredColl:
                 continue
             try:
                 # Cut the window just before w1 occurs again.
                 window = window[:list(window).index(w1, 1)]
             except ValueError:
                 pass
             bigramOK = False
             for w2 in set(window[1:]):
                 if w2 is None or w2 in self.__ignoredColl:
                     continue
                 if any_word or w1 == word or w2 == word:
                     bfd[(w1, w2)] += 1
                     bigramOK = True
             if any_word or bigramOK:
                 wfd[w1] += 1
     self.__bigrams = MyBigramCollFinder(wfd, bfd)
Example #16
    def __init__(self, n, train, estimator, freqtype, padding, backoff, *estimator_args, **estimator_kw_args):
        """
        Creates an ngram language model to capture patterns in n consecutive
        words of training text.  An estimator smooths the probabilities derived
        from the text and may allow generation of ngrams not seen during
        training.

        @param n: the order of the language model (ngram size)
        @type n: L{int}
        @param train: the training text
        @type train: L{list} of L{str} (or L{list} of L{str} L{list}s)
        @param estimator: a function for generating a probability distribution (must take FreqDist as first argument, and n as second)
        @type estimator: a function that takes a L{ConditionalFreqDist} and returns a L{ConditionalProbDist}
        @param freqtype: the type to use to store the counts in the underlying frequency distribution
        @type freqtype: any numeric type
        @param padding: the symbol repeated n-1 times on both sides of every utterance
        @param backoff: whether or not we should use Katz back-off
        @type backoff: L{bool}
        @param estimator_args: Extra arguments for L{estimator}.
            These arguments are usually used to specify extra
            properties for the probability distributions of individual
            conditions, such as the number of bins they contain.
        @type estimator_args: (any)
        @param estimator_kw_args: Extra keyword arguments for L{estimator}.
        @type estimator_kw_args: (any)
        """

        self._n = n

        cfd = ConditionalFreqDist(counttype=freqtype)
        self._ngrams = set()
        self._padding = (padding,) * (n - 1)
        self._estimator = estimator
        self._freqtype = freqtype
        self._estimator_args = estimator_args
        self._estimator_kw_args = estimator_kw_args

        if train:
            for utterance in train:
                for ngram in ingrams(chain(self._padding, utterance, self._padding), n):
                    self._ngrams.add(ngram)
                    context = tuple(ngram[:-1])
                    token = ngram[-1]
                    # NOTE(review): inc() is used so the project's own
                    # count type decides how counts are stored -- confirm.
                    cfd[context].inc(token)

        self._model = ConditionalProbDist(cfd, estimator, self._freqtype, n, *estimator_args, **estimator_kw_args)

        # recursively construct the lower-order models
        self._backoff = PartialCountNgramModel(n - 1, train, estimator, freqtype, padding, backoff, *estimator_args, **estimator_kw_args) if (backoff and n > 1) else None
Example #17
    @classmethod
    def from_words(cls, words, window_size=2):
        """Construct a BigramCollocationFinder for all bigrams in the given
        sequence.  By default, bigrams must be contiguous.

        :param words: the sequence of words
        :param window_size: number of consecutive words in which a pair may
            co-occur; must be at least 2
        :raises ValueError: if ``window_size`` is smaller than 2
        """
        wfd = FreqDist()
        bfd = FreqDist()

        if window_size < 2:
            raise ValueError("Specify window_size at least 2")

        for window in ingrams(words, window_size, pad_right=True):
            w1 = window[0]
            wfd[w1] += 1
            for w2 in window[1:]:
                # None marks right padding at the end of the sequence.
                if w2 is not None:
                    bfd[(w1, w2)] += 1
        return cls(wfd, bfd)
Example #18
def big_test(hyperparam, treefilename, hyperfilename, vocab, version="3.0",
    # NOTE(review): the rest of the signature and the docstring delimiters
    # are missing from this copy; the description lines below are bare text
    # and the function is not valid Python as it stands.
    @param hyperparam A function that, given a synset, returns a hyperparameter

    @param version The version of WordNet we use

    @param max_length The maximum length of n-grams we'll use for computing
    from python_lib.wordnet import load_wn
    from nltk.corpus import brown
    from nltk.util import ingrams

    wn = load_wn(version)

    # Occurrence count per term, keyed by the result of wn.morphy().
    term_counts = defaultdict(int)

    # NOTE(review): xrange(max_length) yields 0 .. max_length-1, so length 0
    # is requested and max_length itself is never reached -- confirm intent.
    for ngram_length in xrange(max_length):
        token = 0
        for w in ingrams(brown.words(), ngram_length):
            token += 1
            # Multi-word terms are joined with underscores before lookup.
            normalized = "_".join(w).lower()
            if wn.synsets(normalized, 'n'):
                term_counts[wn.morphy(normalized)] += 1

    print("Done collecting counts")

    # Fall back to a default file name that depends on the WordNet version.
    if not treefilename:
        treefilename = "wn/wordnet.wn"
        if version != "3.0":
            treefilename = "wn/wordnet_%s.wn" % version

    o = OntologyWriter(treefilename, vocab=vocab, max_leaves=-1)
    for ii in wn.all_synsets('n'):
        # NOTE(review): the statement that opens this call is missing and the
        # tuple below has lost two fields; restore from the original source.
                    [x.offset for x in ii.hyponyms() + ii.instance_hyponyms()],
                    [(0,, term_counts[] + 1)
                     for x in ii.lemmas], hyperparameter=hyperparam(ii))

    #hyperparam.dump(filename + "_hyp.lookup")
Example #19
    @classmethod
    def from_words(cls, words, window_size=2):
        """Construct a BigramCollocationFinder for all bigrams in the given
        sequence.  When window_size > 2, count non-contiguous bigrams, in the
        style of Church and Hanks's (1990) association ratio.

        :param words: the sequence of words
        :param window_size: number of consecutive words in which a pair may
            co-occur; must be at least 2
        :raises ValueError: if ``window_size`` is smaller than 2
        """
        wfd = FreqDist()
        bfd = FreqDist()

        if window_size < 2:
            raise ValueError("Specify window_size at least 2")

        for window in ingrams(words, window_size, pad_right=True):
            w1 = window[0]
            wfd[w1] += 1
            for w2 in window[1:]:
                # None marks right padding at the end of the sequence.
                if w2 is not None:
                    bfd[(w1, w2)] += 1
        return cls(wfd, bfd, window_size=window_size)
Example #20
 def N_gramas_texto(self,texto,N):
     """Build the list of n-grams of a text.

     The text is decoded with the ``codificacion`` encoding and tokenized
     with ``nltk.word_tokenize``; every token is stripped of punctuation,
     lower-cased and passed through ``self.elimina_tildes``.  Tokens that
     end up empty or blank are dropped.

     :param texto: (str) the text
     :param N: (int) size of the n-gram
     :return: (list) the n-grams, each joined with single spaces; an empty
         list when the arguments have the wrong type or processing fails
     """
     if not (isinstance(texto, str) and isinstance(N, int)):
         return []

     listaNgrama = []
     try:
         remove_punctuation_map = dict((ord(char), None) for char in string.punctuation)
         lista = [s.translate(remove_punctuation_map).lower() for s in nltk.word_tokenize(unicode(texto, codificacion))]
         lista = [str(self.elimina_tildes(s)) for s in lista]
         # A token whose stripped form is empty is also empty or blank
         # itself, so one pass replaces the two chained filter() calls.
         lista = [s for s in lista if s.strip()]
         for Ngramas in ingrams(lista, N, pad_right=True):
             # Skip right-padded windows whose last slot is None.
             if (str(Ngramas[N-1]) != "None"):
                 listaNgrama.append(" ".join(Ngramas))
         return listaNgrama
     except (TypeError, UnicodeDecodeError):
         return []
     except Exception:
         # NOTE(review): the source carried a third handler that also
         # returned [], so this method is deliberately best-effort -- confirm.
         return []
Example #21
    @classmethod
    def from_words(cls, words, window_size=2):
        """Construct a BigramCollocationFinder for all bigrams in the given
        sequence.  By default, bigrams must be contiguous.

        Each window is cut just before its first word occurs again, and each
        distinct partner word is counted once per window.

        :param words: the sequence of words
        :param window_size: number of consecutive words in which a pair may
            co-occur; must be at least 2
        :raises ValueError: if ``window_size`` is smaller than 2
        """
        wfd = FreqDist()
        bfd = FreqDist()

        if window_size < 2:
            raise ValueError("Specify window_size at least 2")

        for window in ingrams(words, window_size, pad_right=True):
            w1 = window[0]
            try:
                window = window[:list(window).index(w1, 1)]
            except ValueError:
                # w1 does not repeat inside the window; keep it whole.
                pass
            wfd[w1] += 1
            for w2 in set(window[1:]):
                # None marks right padding at the end of the sequence.
                if w2 is not None:
                    bfd[(w1, w2)] += 1
        return cls(wfd, bfd)
Example #22
    def __init__(self, n, train, estimator=None):
        """
        Creates an ngram language model to capture patterns in n consecutive
        words of training text.  An estimator smooths the probabilities derived
        from the text and may allow generation of ngrams not seen during
        training.

        @param n: the order of the language model (ngram size)
        @type n: C{int}
        @param train: the training text
        @type train: C{list} of C{string}
        @param estimator: a function for generating a probability distribution
        @type estimator: a function that takes a C{ConditionalFreqDist} and
              returns a C{ConditionalProbDist}
        """
        self._n = n
        self._N = 1 + len(train) - n

        if estimator is None:
            def estimator(fdist, bins): return MLEProbDist(fdist)

        if n == 1:
            # Unigram model: one plain distribution, no contexts, no backoff.
            fd = FreqDist(train)
            self._model = estimator(fd, fd.B())
        else:
            cfd = ConditionalFreqDist()
            self._ngrams = set()
            self._prefix = ('',) * (n - 1)

            for ngram in ingrams(chain(self._prefix, train), n):
                # NOTE(review): _ngrams was initialised but never filled in
                # this copy; the add() is assumed to have been lost.
                self._ngrams.add(ngram)
                context = tuple(ngram[:-1])
                token = ngram[-1]
                cfd[context][token] += 1

            self._model = ConditionalProbDist(cfd, estimator, len(cfd))

            # recursively construct the lower-order models
            self._backoff = NgramModel(n - 1, train, estimator)
    @classmethod
    def from_words(cls, words):
        """Construct a collocation finder for all 5-grams in the given
        sequence.

        Eight frequency distributions are filled from every window of five
        consecutive words and handed to the class constructor.
        """
        wfd = FreqDist()
        bfd = FreqDist()
        wild2fd = FreqDist()
        tfd = FreqDist()
        wild3fd = FreqDist()
        qfd = FreqDist()
        wild4fd = FreqDist()
        pfd = FreqDist()

        # NOTE(review): this copy had lost the receiver of every counter
        # update.  They are reassigned by tuple shape: contiguous tuples go
        # to bfd/tfd/qfd/pfd, gapped pairs to wild2fd, gapped triples to
        # wild3fd, gapped 4-tuples to wild4fd.  Confirm against the original.
        for w1, w2, w3, w4, w5 in ingrams(words, 5, pad_right=True):
            wfd[w1] += 1
            if w2 is None:
                continue
            bfd[(w1, w2)] += 1
            if w3 is None:
                continue
            wild2fd[(w1, w3)] += 1
            tfd[(w1, w2, w3)] += 1
            if w4 is None:
                continue
            wild2fd[(w1, w4)] += 1
            wild3fd[(w1, w2, w4)] += 1
            wild3fd[(w1, w3, w4)] += 1
            qfd[(w1, w2, w3, w4)] += 1
            if w5 is None:
                continue
            wild2fd[(w1, w5)] += 1
            wild3fd[(w1, w2, w5)] += 1
            wild3fd[(w1, w3, w5)] += 1
            wild3fd[(w1, w4, w5)] += 1
            wild4fd[(w1, w3, w4, w5)] += 1
            wild4fd[(w1, w2, w4, w5)] += 1
            wild4fd[(w1, w2, w3, w5)] += 1
            pfd[(w1, w2, w3, w4, w5)] += 1
        return cls(wfd, bfd, wild2fd, tfd, wild3fd, qfd, wild4fd, pfd)
Example #24
    def prob_classify(self, document):
        """
        Score the document under the ngram model of every label.

        @param document: the words of the document; only alphabetic words
            are used, lower-cased
        @return: a probability distribution over labels for the given document.
        @rtype: L{ProbDistI <nltk.probability.ProbDistI>}
        """
        # Find the log probabilty of each label, given the features.
        # Start with the log probability of the label itself.
        logprob = {}

        # Prepare words once; they are the same for every label.
        words = [w.lower() for w in document if w.isalpha()]

        # Loop through each possible label and calculate the
        # (log) probability of document under that label
        for label in self.labels():
            # Probability of category
            # NOTE(review): this seeds a log-domain sum with prob(), not
            # logprob() -- confirm whether that is intended.
            logprob[label] = self.model.category_probdist.prob(label)

            # Extract ngram model
            ngram_model = self.model.ngrams[label]

            prefix = ('',) * (ngram_model._n - 1)

            # Go through each word and calculate P(w | context)
            for ngram in ingrams(chain(prefix, words), ngram_model._n):
                context = tuple(ngram[:-1])
                token = ngram[-1]

                try:
                    logprob[label] += -ngram_model.logprob(token, context)
                except RuntimeError:
                    # Unknown word, skip it
                    pass

        # Return probability for each label
        return DictionaryProbDist(logprob, normalize=True, log=True)
Example #25
    def eval_word(self, word):
        """
        Score a proposed word with the ngram model.

        The per-ngram probabilities of the delimited word are merged by
        ``self._score_combiner`` and multiplied by an adjustment factor.

        @param word: the proposed word, as a sequence of symbols
        @return: probability that proposed word is a word.
        """
        if self._hypothetical_phonotactics:
            # Make a deep copy of the ngram model so that we can update it without modifying the original
            word_ngram_model = copy.deepcopy(self._ngram_model)
        else:
            word_ngram_model = self._ngram_model
        # NOTE(review): word_ngram_model is not used below; a statement that
        # updates it may have been lost from this copy -- confirm.

        # If I want to duplicate functionality of OCaml code, should probably make a ProbDist that inherits from Lidstone,
        # otherwise this won't do the denominator adjustments and other stuff.

        if self._n > 1:
            adjustment = Fraction(1)
        else:
            # Need to compensate for the lack of an empty word by getting rel freq of word_delimiter somehow
            adjustment = Fraction(1) / (Fraction(1) - self._ngram_model.prob(
                self._word_delimiter, self._ngram_model._padding))

        # Higher-order models get a delimiter on both sides of the word,
        # unigram models only a trailing one.
        if self._n > 1:
            pieces = [self._word_delimiter, word, self._word_delimiter]
        else:
            pieces = [word, self._word_delimiter]

        raw_score = self._score_combiner([
            self._ngram_model.prob(ngram[-1], tuple(ngram[:-1]))
            for ngram in ingrams(chain(*pieces), self._n)])

        return adjustment * raw_score
def get_nested_noun_phrase_chunks(noun_phrase_chunks):
    """Collect the noun phrases nested inside the given noun-phrase chunks.

    Helpful when an original noun phrase includes tokens which shouldn't
    really be part of the noun phrase,
    e.g. [('use', 'V'), ('firewalls', 'N')] should just be 'firewalls'.

    Every shorter n-gram of each chunk is tried, longest first, and kept if
    NP_PARSER finds a 'NounPhrase' subtree in it.
    """
    def is_nounphrase(tagged_phrase):
        # Determine whether a tagged phrase matches the
        # noun-phrase regular expression tag sequence
        return any(subtree.node == 'NounPhrase'
                   for subtree in NP_PARSER.parse(tagged_phrase).subtrees())

    from nltk.util import ingrams

    nested_noun_phrase_chunks = []
    for chunk in noun_phrase_chunks:
        for ngramSize in xrange(len(chunk)-1,0,-1):
            for ngram in ingrams(chunk,ngramSize):
                if is_nounphrase(ngram):
                    # NOTE(review): the body of this `if` was missing;
                    # collecting the match is assumed -- confirm.
                    nested_noun_phrase_chunks.append(ngram)
    return nested_noun_phrase_chunks
def get_nested_noun_phrase_chunks(noun_phrase_chunks):
    """Return the noun phrases nested inside each given noun-phrase chunk.

    Helpful when an original noun phrase includes tokens which should not
    really be part of it, e.g. ``[('use', 'V'), ('firewalls', 'N')]`` should
    just yield ``firewalls``.

    Every contiguous sub-sequence strictly shorter than its chunk is parsed
    with ``NP_PARSER``; the ones recognised as noun phrases are returned,
    longer sub-sequences before shorter ones within each chunk.

    :param noun_phrase_chunks: iterable of tagged phrases, each supporting
        ``len()`` (presumably lists of (word, tag) pairs -- confirm against
        the caller).
    :return: list of nested n-grams (tuples produced by ``ingrams``) that the
        parser accepts as noun phrases.
    """

    def is_nounphrase(tagged_phrase):
        # Determine whether the tagged phrase matches the noun-phrase
        # regular expression tag sequence.
        for subtree in NP_PARSER.parse(tagged_phrase).subtrees():
            if subtree.node == 'NounPhrase':
                return True
        return False

    from nltk.util import ingrams

    nested_noun_phrase_chunks = []
    for chunk in noun_phrase_chunks:
        # Sizes run from len(chunk) - 1 down to 1, so the chunk itself is
        # never re-tested.
        for ngram_size in xrange(len(chunk) - 1, 0, -1):
            nested_noun_phrase_chunks.extend(
                ngram for ngram in ingrams(chunk, ngram_size)
                # The original `if` had an empty body (syntax error) and
                # nothing was ever collected; keep the n-grams that match.
                if is_nounphrase(ngram))
    return nested_noun_phrase_chunks
Example #28
    def __init__(self, n, train, estimator=None, factor=0.77):
        """
        Creates an ngram language model to capture patterns in n consecutive
        words of training text.  An estimator smooths the probabilities derived
        from the text and may allow generation of ngrams not seen during
        training.

        Words are lowercased before counting, and the text is prefixed with
        n-1 empty strings so that the first words also have a full context.

        @param n: the order of the language model (ngram size)
        @type n: C{int}
        @param train: the training text
        @type train: C{list} of C{string}
        @param estimator: a function for generating a probability distribution.
              The number of bins, the number of training ngrams and the number
              of empty bins are passed to C{ConditionalProbDist} as extra
              factory arguments.  When omitted, C{NeyProbDist} (absolute
              discounting) is used for n > 1 and C{LaplaceProbDist} for
              unigrams.
        @type estimator: a function that takes a C{ConditionalFreqDist} and
              returns a C{ConditionalProbDist}
        @param factor: value handed to C{NeyProbDist} by the default
              estimator when n > 1
        @type factor: C{float}
        """

        self._n = n

        # The branch structure was lost in the original, which left
        # probdist_factory unconditionally equal to `estimator` (None by
        # default).
        if estimator is not None:
            probdist_factory = estimator
        elif n > 1:
            # Use smoothing based on Ney et al
            probdist_factory = lambda fdist, bins, n_train, n_0: \
                NeyProbDist(fdist, bins, n_train, n_0, factor, NeyProbDist.ABSOLUTE)
        else:
            # Use simple add-1 smoothing for unigrams
            probdist_factory = lambda fdist, bins, *args: LaplaceProbDist(fdist, bins)

        # Initialize conditional frequency distribution
        cfd = ConditionalFreqDist()

        # Initialize set of ngrams
        self._ngrams = set()
        self._ngram_count = 0

        # Prefix beginning of document with empty strings
        self._prefix = ('',) * (n - 1)

        # Count the number of training examples
        num_training = 0

        # Loop through each ngram and add to CFD
        for ngram in ingrams(chain(self._prefix, train), n):
            # Lowercase words
            ngram = tuple(w.lower() for w in ngram)

            # Add to known ngrams
            self._ngrams.add(ngram)

            # Add to CFD: the last word is the outcome, the preceding n-1
            # words are its condition.  Item assignment is used instead of
            # FreqDist.inc() so this works on FreqDist versions without inc().
            context = ngram[:-1]
            token = ngram[-1]
            cfd[context][token] += 1

            num_training += 1

        # Calculate vocabulary size (for NeyProbDist)
        # NOTE(review): the vocabulary is taken from the raw training words
        # while the ngrams above are lowercased -- confirm this is intended.
        v = len(set(train))
        bins = v ** n

        # Number of bins with a count > 0
        self._ngram_count = len(self._ngrams)

        # Gives us number of bins with count = 0
        n_0 = bins - self._ngram_count

        # Create CPD model
        self._model = ConditionalProbDist(cfd, probdist_factory, bins, num_training, n_0)

        # recursively construct the lower-order models; forward `factor` so
        # the backoff models use the same value as this one instead of
        # silently falling back to the default.
        if n > 1:
            self._backoff = SLINgramModel(n - 1, train, estimator, factor)
Example #29
    def __init__(self, n, train, k=5, v=None,
                 liveDangerously=False, quiet=False):
        # NOTE(review): the prose that follows reads like this method's
        # docstring, but this copy has no triple-quote delimiters around it,
        # so the block does not parse as written -- restore them from the
        # upstream source.
        Creates an Katz-threshholded Ngram language model to capture
        patterns in n consecutive words of training text.
        Uses the KGoodTuringProbDist to estimate the conditional and unigram probabilities,
        to provide coverage of Ngrams not seen during training.

        @param n: the order of the language model (ngram size)
        @type n: C{int}
        @param train: the training text
        @type train: C{list} of C{string}
        @param k: The threshhold above which counts are assumed
                  to be reliable.  Defaults to 5.
        @type  k: C{Int}
        @param v: The number of unseens of degree 1.  Defaults to the
                  number of types in the training set
        @type  v: C{Int}
        @param liveDangerously: If False, for each model check that
                                the total probability mass after all
                                adjustments is close to 1.  Defaults
                                to False.
        @type  liveDangerously: C{Boolean}
        @param quiet: Various information will be printed during model
                       construction unless this is True.  Defaults to False.
        @type  quiet: C{Boolean}
        self._n = n
        # 1 + len(train) - n is the number of length-n windows in train.
        self._N = 1 + len(train) - n
        fd = FreqDist(train)
        # Default v to the bin count reported by the unigram distribution.
        if v is None:
            v = fd.B()
        # This diagnostic is printed even when quiet is True.
        print(('v', v))
        if n == 1:
            # Treat this case specially
            self._model = KGoodTuringProbDist(fd, k, v, liveDangerously, ())
            if not quiet:
                # NOTE(review): the print call below and the estimator
                # definition after it are both cut off in the middle of their
                # argument lists, and the `else:` introducing the n > 1 branch
                # appears to be missing -- restore from the upstream source.
                print("%s entries for %s tokens at degree 1, %s" % (len(fd),
            def estimator(fdist, ctxt): return KGoodTuringProbDist(fdist, k, v,

            cfd = ConditionalFreqDist()

            # NOTE(review): context and token are computed but never recorded
            # in cfd here; presumably the line updating cfd was lost.
            for ngram in ingrams(train, n):
                # self._ngrams.add(ngram)
                context = tuple(ngram[:-1])
                token = ngram[-1]

            self._model = ConditionalProbDist(cfd, estimator, True)
            if not quiet:
                # Tally how many conditions ended up in each status.
                # NOTE(review): LowHacked is a bare name while the other keys
                # are string literals -- presumably it should be quoted, or it
                # is a constant defined elsewhere; confirm.
                statuses = {'normal': 0, 'bigSkewed': 0,
                            'weak': 0, LowHacked: 0}
                # self[ctx] relies on an item-access method defined elsewhere
                # in the class.
                for ctx in cfd.conditions():
                    statuses[self[ctx].status] += 1
                print("%s conditions at degree %s" %
                      (len(cfd.conditions()), n))
                for s in list(statuses.keys()):
                    print(" %s %6d" % (s, statuses[s]))

            # recursively construct the lower-order models
            # `quiet` is not forwarded here, so the callee uses its own
            # default for that parameter.
            self._backoff = KBNgramModel(n - 1, train, k, v, liveDangerously)
 def __add_token_ngrams(self, token):
     """Record every (k+1)-gram of ``token`` in ``self.histogram``.

     Each window of ``self.k + 1`` consecutive items yielded by ``ingrams``
     is joined into a single string, and ``sparse_dict_increment`` is called
     with the histogram and that string.
     """
     window_size = self.k + 1
     joined_ngrams = (''.join(gram) for gram in ingrams(token, window_size))
     for key in joined_ngrams:
         sparse_dict_increment(self.histogram, key)
Example #31
    def __init__(self, n, train, pad_left=True, pad_right=False,
                 estimator=None, *estimator_args, **estimator_kwargs):
        """
        Create an ngram language model to capture patterns in n consecutive
        words of training text.  An estimator smooths the probabilities derived
        from the text and may allow generation of ngrams not seen during
        training.

            >>> from nltk.corpus import brown
            >>> from nltk.probability import LidstoneProbDist
            >>> est = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
            >>> lm = NgramModel(3, brown.words(categories='news'), estimator=est)
            >>> lm
            <NgramModel with 91603 3-grams>
            >>> lm._backoff
            <NgramModel with 62888 2-grams>
            >>> lm.entropy(['The', 'Fulton', 'County', 'Grand', 'Jury', 'said',
            ... 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent',
            ... 'primary', 'election', 'produced', '``', 'no', 'evidence',
            ... "''", 'that', 'any', 'irregularities', 'took', 'place', '.'])
            ... # doctest: +SKIP

        (The expected entropy value is not recorded here, so that example is
        skipped when doctests are run.)

        :param n: the order of the language model (ngram size)
        :type n: int
        :param train: the training text
        :type train: list(str) or list(list(str))
        :param pad_left: whether to pad the left of each sentence with an (n-1)-gram of empty strings
        :type pad_left: bool
        :param pad_right: whether to pad the right of each sentence with an (n-1)-gram of empty strings
        :type pad_right: bool
        :param estimator: a function for generating a probability distribution
        :type estimator: a function that takes a ConditionalFreqDist and
            returns a ConditionalProbDist
        :param estimator_args: Extra arguments for estimator.
            These arguments are usually used to specify extra
            properties for the probability distributions of individual
            conditions, such as the number of bins they contain.
            Note: For backward-compatibility, if no arguments are specified, the
            number of bins in the underlying ConditionalFreqDist are passed to
            the estimator as an argument.
        :type estimator_args: (any)
        :param estimator_kwargs: Extra keyword arguments for the estimator
        :type estimator_kwargs: (any)
        :raises AssertionError: if pad_left or pad_right is not a bool
        """

        # protection from cryptic behavior for calling programs
        # that use the pre-2.0.2 interface (where the third positional
        # argument was the estimator rather than pad_left)
        assert isinstance(pad_left, bool)
        assert isinstance(pad_right, bool)

        self._n = n
        self._lpad = ('',) * (n - 1) if pad_left else ()
        self._rpad = ('',) * (n - 1) if pad_right else ()

        if estimator is None:
            estimator = _estimator

        cfd = ConditionalFreqDist()
        self._ngrams = set()

        # If given a list of strings instead of a list of lists, create enclosing list
        if (train is not None) and isinstance(train[0], compat.string_types):
            train = [train]

        for sent in train:
            for ngram in ingrams(chain(self._lpad, sent, self._rpad), n):
                # Remember the ngram and count its last word under the
                # context formed by the preceding n-1 words.  (These updates
                # were missing, leaving the model empty.)  Item assignment is
                # used so this works whether or not FreqDist offers inc().
                self._ngrams.add(ngram)
                context = tuple(ngram[:-1])
                token = ngram[-1]
                cfd[context][token] += 1

        if not estimator_args and not estimator_kwargs:
            # Backward-compatible default: pass the number of conditions.
            self._model = ConditionalProbDist(cfd, estimator, len(cfd))
        else:
            self._model = ConditionalProbDist(cfd, estimator, *estimator_args, **estimator_kwargs)

        # recursively construct the lower-order models
        if n > 1:
            self._backoff = NgramModel(n-1, train, pad_left, pad_right,
                                       estimator, *estimator_args, **estimator_kwargs)
def frequency_of_term_in_article(term, tokens):
    """Count the occurrences of ``term`` as a run of consecutive words.

    Matches are looked for inside each sentence separately, so a term never
    matches across a sentence boundary.  Overlapping occurrences are all
    counted.

    The sliding window is implemented with the standard library so the
    function does not depend on ``nltk.util.ingrams``, which NLTK 3 no
    longer provides.

    :param term: the words of the term, in order (a tuple of strings; any
        sequence of words is accepted and compared as a tuple).
    :param tokens: iterable of sentences, where each sentence is an iterable
        of words.
    :return: number of positions at which ``term`` occurs; 0 for an empty
        term.
    """
    from collections import deque

    target = tuple(term)
    width = len(target)
    if width == 0:
        # An empty term cannot match any window of words.
        return 0

    occurrences = 0
    for sent in tokens:
        # A bounded deque keeps only the last `width` words seen.
        window = deque(maxlen=width)
        for word in sent:
            window.append(word)
            if len(window) == width and tuple(window) == target:
                occurrences += 1
    return occurrences
 def __add_token_ngrams(self, token):
     """Add all (k+1)-grams of ``token`` to ``self.histogram``.

     ``ingrams`` is asked for windows of ``self.k + 1`` consecutive items of
     ``token``; every window is concatenated into one string which is then
     handed, together with the histogram, to ``sparse_dict_increment``.
     """
     order = self.k + 1
     for pieces in ingrams(token, order):
         histogram_key = ''.join(pieces)
         sparse_dict_increment(self.histogram, histogram_key)
Example #34
    def __init__(self, n, train, estimator=None, *estimator_args, **estimator_kw_args):
        """
        Creates an ngram language model to capture patterns in n consecutive
        words of training text.  An estimator smooths the probabilities derived
        from the text and may allow generation of ngrams not seen during
        training.

            >>> from nltk.corpus import brown
            >>> from nltk.probability import LidstoneProbDist
            >>> estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
            >>> lm = NgramModel(3, brown.words(categories='news'), estimator)
            >>> lm.entropy(['The', 'Fulton', 'County', 'Grand', 'Jury', 'said',
            ... 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent',
            ... 'primary', 'election', 'produced', '``', 'no', 'evidence',
            ... "''", 'that', 'any', 'irregularities', 'took', 'place', '.'])
            ... # doctest: +SKIP

        (The expected entropy value is not recorded here, so that example is
        skipped when doctests are run.)

        :param n: the order of the language model (ngram size)
        :type n: int
        :param train: the training text
        :type train: list of string
        :param estimator: a function for generating a probability distribution
        :type estimator: a function that takes a ConditionalFreqDist and
              returns a ConditionalProbDist
        :param estimator_args: Extra arguments for estimator.
            These arguments are usually used to specify extra
            properties for the probability distributions of individual
            conditions, such as the number of bins they contain.
            Note: For backward-compatibility, if no arguments are specified, the
            number of bins in the underlying ConditionalFreqDist are passed to
            the estimator as an argument.
        :type estimator_args: (any)
        :param estimator_kw_args: Extra keyword arguments for estimator.
        :type estimator_kw_args: (any)
        """

        self._n = n

        if estimator is None:
            estimator = _estimator

        cfd = ConditionalFreqDist()
        self._ngrams = set()
        # n-1 empty strings give the first words of the text a full context.
        self._prefix = ('', ) * (n - 1)

        for ngram in ingrams(chain(self._prefix, train), n):
            # Remember the ngram and count its last word under the context
            # formed by the preceding n-1 words.  (These updates were
            # missing, leaving the model empty.)  Item assignment is used so
            # this works whether or not FreqDist offers inc().
            self._ngrams.add(ngram)
            context = tuple(ngram[:-1])
            token = ngram[-1]
            cfd[context][token] += 1

        if (not estimator_args) and (not estimator_kw_args):
            # Backward-compatible default: pass the number of conditions.
            self._model = ConditionalProbDist(cfd, estimator, len(cfd))
        else:
            self._model = ConditionalProbDist(cfd, estimator, *estimator_args,
                                              **estimator_kw_args)

        # recursively construct the lower-order models
        if n > 1:
            self._backoff = NgramModel(n - 1, train, estimator,
                                       *estimator_args, **estimator_kw_args)
def frequency_of_term_in_article(term, tokens):
    """Count the occurrences of ``term`` as a run of consecutive words.

    Matches are looked for inside each sentence separately, so a term never
    matches across a sentence boundary.  Overlapping occurrences are all
    counted.

    The sliding window is implemented with the standard library so the
    function does not depend on ``nltk.util.ingrams``, which NLTK 3 no
    longer provides.

    :param term: the words of the term, in order (a tuple of strings; any
        sequence of words is accepted and compared as a tuple).
    :param tokens: iterable of sentences, where each sentence is an iterable
        of words.
    :return: number of positions at which ``term`` occurs; 0 for an empty
        term.
    """
    from collections import deque

    target = tuple(term)
    width = len(target)
    if width == 0:
        # An empty term cannot match any window of words.
        return 0

    occurrences = 0
    for sent in tokens:
        # A bounded deque keeps only the last `width` words seen.
        window = deque(maxlen=width)
        for word in sent:
            window.append(word)
            if len(window) == width and tuple(window) == target:
                occurrences += 1
    return occurrences
Example #36
    def __init__(self,
                 char_set=string.lowercase + string.punctuation + string.digits,
        # NOTE(review): this copy of the constructor is damaged.  The
        # signature is missing the parameters the body reads (n, train,
        # smoothing, maximum_length, minimum_length, patch_char) and its
        # closing "):", and the descriptive text below has no triple-quote
        # delimiters, so the block does not parse as written.  Restore it
        # from the upstream source before relying on it.
        Creates an nchar language model to capture patterns in n consecutive
        words of training text.  An estimator smooths the probabilities derived
        from the text and may allow generation of ngrams not seen during

        @param n: the order of the language model (nchar size)
        @type n: C{int}
        @param train: the training text
        @type train: C{list} of C{string}
        @param estimator: a function for generating a probability distribution
        @type estimator: a function that takes a C{ConditionalFreqDist} and
              returns a C{ConditionalProbDist}

        self._smoothing = smoothing;
        #self.lagrangian_parameter = lagrangian_parameter;

        self._n = n

        self._maximum_length = maximum_length;
        self._minimum_length = minimum_length;
        self._char_set = char_set;
        # The active estimator ignores its `bins` argument and always uses
        # len(self._char_set) + 1 bins with self._smoothing as the Lidstone
        # parameter.
        #estimator = lambda fdist, bins: nltk.probability.WittenBellProbDist(fdist, len(char_set));
        estimator = lambda fdist, bins: nltk.probability.LidstoneProbDist(fdist, self._smoothing, len(self._char_set)+1);
        #estimator = lambda fdist, bins: nltk.probability.LidstoneProbDist(fdist, 1e-9, len(self._char_set));
        #estimator = lambda fdist, bins: nltk.probability.GoodTuringProbDist(fdist, len(self._char_set));
        #estimator = lambda fdist, bins: nltk.probability.SimpleGoodTuringProbDist(fdist, len(self._char_set));

        cfd = ConditionalFreqDist()
        self._ngrams = set()
        self._patch_char = patch_char;
        self._prefix = (self._patch_char,) * (n - 1)
        length = nltk.probability.FreqDist();
        word_freq_dist = nltk.probability.FreqDist();
        char_list = [];
        # NOTE(review): in the loop below the body of the length check is
        # missing and the ", 1);" line is the tail of a statement whose start
        # was lost (presumably updates of `length` / `word_freq_dist`) --
        # restore from upstream.
        for word in train:
            word = word.strip().lower();
            if len(word)<self._minimum_length or len(word)>self._maximum_length:
  , 1);
            char_list.extend([char for char in word if char in self._char_set]);
        self._length = nltk.probability.WittenBellProbDist(length, length.B()+1);
        #self._length = nltk.probability.WittenBellProbDist(length, self._maximum_length);
        #context_freq_dist = nltk.probability.FreqDist();
        #for nchar in ingrams(chain(self._prefix, train), n):
        # NOTE(review): context and token are computed but never stored;
        # presumably the lines updating cfd / self._ngrams were lost.
        for nchar in ingrams(char_list, n):
            context = tuple(nchar[:-1])
            token = nchar[-1]
        #self._context = nltk.probability.WittenBellProbDist(context_freq_dist, len(self._char_set)**(n-1)+1);

        # NOTE(review): this debug block reads context_freq_dist and
        # self._context, whose only assignments in this method are commented
        # out above, so it fails for n == 3 unless they are defined elsewhere.
        if n==3:
            cond = 0;
            for x in self._char_set:
                for y in self._char_set:
                    print (x, y), context_freq_dist[(x, y)], self._context.prob((x, y));
                    cond += self._context.prob((x, y));
            print 'cond is', cond
        #self._model = ConditionalProbDist(cfd, estimator, len(cfd));
        #print self._char_set;
        self._model = ConditionalProbDist(cfd, estimator, len(self._char_set) ** (n - 1));

        #========== ========== ========== ========== ========== ========== ========== ========== ========== ========== ========== ========== ========== ==========
        consonant_freq_dist = nltk.probability.FreqDist();
        consonant_condition_freq_dist = nltk.probability.ConditionalFreqDist();
        # NOTE(review): the pattern 'aeiouy' matches only that literal
        # six-letter substring; presumably a character class such as
        # '[aeiouy]' was intended.  The innermost loop body is also truncated
        # to ", 1);".
        for word in train:
            #word = re.sub(r'aeiou', ' ', word);
            word = word[0] + re.sub('aeiouy', ' ', word[1:]);
            consonant_list = word.split();
            #consonant_list = ['#', '#'] + consonant_list;
            for temp in consonant_list:
      , 1);
        #========== ========== ========== ========== ========== ========== ========== ========== ========== ========== ========== ========== ========== ==========        
        word_prob_dist = nltk.probability.MLEProbDist(word_freq_dist);

        # One slot per word length from minimum_length to maximum_length; the
        # 1e-300 offset keeps every entry of the later divisor non-zero.
        word_model_empirical_frequency = numpy.zeros((1, self._maximum_length - self._minimum_length + 1)) + 1e-300;
        word_model_square = numpy.zeros((1, self._maximum_length - self._minimum_length + 1)) + 1e-300;
        #word_model_empirical_frequency_old = numpy.zeros((1, self._maximum_length - self._minimum_length + 1));
        #word_model_square_old = numpy.zeros((1, self._maximum_length - self._minimum_length + 1));
        # total_outcomes is only referenced by commented-out code below.
        total_outcomes = 0;
        for x in xrange(self._minimum_length, self._maximum_length+1):
            total_outcomes += len(self._char_set) ** x;

        # Accumulate, per word length, the empirical-probability-weighted
        # model probability and the squared model probability of each word;
        # probability_without_length is a method defined elsewhere.
        for word in word_freq_dist.keys():
            word_model_empirical_frequency[0, len(word)-self._minimum_length] += word_prob_dist.prob(word) * self.probability_without_length(word);
            #word_model_empirical_frequency[0, len(word)-self._minimum_length] += 1.0/total_outcomes * self.probability_without_length(word);
            word_model_square[0, len(word)-self._minimum_length] += self.probability_without_length(word) ** 2;
            #word_model_empirical_frequency_old[0, len(word)-self._minimum_length] += word_prob_dist.prob(word) * self.probability_without_length(word);
            #word_model_square_old[0, len(word)-self._minimum_length] += self.probability_without_length(word) ** 2;
        #print "alpha is", 2 * (1-numpy.sum(word_model_empirical_frequency / word_model_square))/numpy.sum(1.0/word_model_square)
        #print word_model_empirical_frequency, word_model_square

        #sum_word_model_square_inverse = numpy.sum(1.0 / word_model_square);
        #sum_word_model_empirical_frequency_over_word_model_square = numpy.sum(word_model_empirical_frequency / word_model_square);
        #self._multinomial_length = (word_model_empirical_frequency * sum_word_model_square_inverse - sum_word_model_empirical_frequency_over_word_model_square + 1) / (word_model_square * sum_word_model_square_inverse);
        #print sum_word_model_square_inverse, sum_word_model_empirical_frequency_over_word_model_square;
        #print self._multinomial_length, numpy.sum(self._multinomial_length);
        # NOTE(review): as written the computed value is immediately
        # overwritten with 1.0; presumably an `else:` between the two
        # assignments was lost.
        if True:
            lagrangian_parameter = 2 * (1-numpy.sum(word_model_empirical_frequency / word_model_square))/numpy.sum(1.0/word_model_square)
            lagrangian_parameter = 1.;
        #print "lagrangian parameter is", lagrangian_parameter
        # Per-length weights, normalised to sum to 1 and required to be
        # non-negative.
        self._multinomial_length = (word_model_empirical_frequency - lagrangian_parameter / 2) / word_model_square;
        self._multinomial_length /= numpy.sum(self._multinomial_length);
        #print self._multinomial_length, numpy.sum(self._multinomial_length);
        assert numpy.all(self._multinomial_length>=0), self._multinomial_length;

        # recursively construct the lower-order models
        if n > 1:
            self._backoff = NcharModel(n-1, train, self._smoothing, maximum_length,
                 minimum_length, self._char_set, self._patch_char);
Example #37
t = time() - t
print str(t) + 's'

# Test generation of CFD
print 'Creating CFD...',
t = time()

cat = cr.categories()[0]

n = 3

cfd = ConditionalFreqDist()
prefix = ('',) * (n - 1)

for ngram in ingrams(chain(prefix, cr.words(categories=[cat])), n):
    context = tuple(ngram[:-1])
    token = ngram[-1]

t = time() - t
print str(t) + 's'

t = time()
print 'Pickling CFD...',

pickle.dump(cfd, open('cfd.p', 'w'), protocol=1)

t = time() - t
Example #38
    def __init__(self, n, train, pad_left=True, pad_right=False,
                 estimator=None, *estimator_args, **estimator_kwargs):
        """
        Create an ngram language model to capture patterns in n consecutive
        words of training text.  An estimator smooths the probabilities derived
        from the text and may allow generation of ngrams not seen during
        training.

            >>> from nltk.corpus import brown
            >>> from nltk.probability import LidstoneProbDist
            >>> est = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
            >>> lm = NgramModel(3, brown.words(categories='news'), estimator=est)
            >>> lm
            <NgramModel with 91603 3-grams>
            >>> lm._backoff
            <NgramModel with 62888 2-grams>
            >>> lm.entropy(['The', 'Fulton', 'County', 'Grand', 'Jury', 'said',
            ... 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent',
            ... 'primary', 'election', 'produced', '``', 'no', 'evidence',
            ... "''", 'that', 'any', 'irregularities', 'took', 'place', '.'])
            ... # doctest: +SKIP

        (The expected entropy value is not recorded here, so that example is
        skipped when doctests are run.)

        :param n: the order of the language model (ngram size)
        :type n: int
        :param train: the training text
        :type train: list(str) or list(list(str))
        :param pad_left: whether to pad the left of each sentence with an (n-1)-gram of empty strings
        :type pad_left: bool
        :param pad_right: whether to pad the right of each sentence with an (n-1)-gram of empty strings
        :type pad_right: bool
        :param estimator: a function for generating a probability distribution
        :type estimator: a function that takes a ConditionalFreqDist and
            returns a ConditionalProbDist
        :param estimator_args: Extra arguments for estimator.
            These arguments are usually used to specify extra
            properties for the probability distributions of individual
            conditions, such as the number of bins they contain.
            Note: For backward-compatibility, if no arguments are specified, the
            number of bins in the underlying ConditionalFreqDist are passed to
            the estimator as an argument.
        :type estimator_args: (any)
        :param estimator_kwargs: Extra keyword arguments for the estimator
        :type estimator_kwargs: (any)
        :raises AssertionError: if pad_left or pad_right is not a bool
        """

        # protection from cryptic behavior for calling programs
        # that use the pre-2.0.2 interface (where the third positional
        # argument was the estimator rather than pad_left)
        assert isinstance(pad_left, bool)
        assert isinstance(pad_right, bool)

        self._n = n
        self._lpad = ('',) * (n - 1) if pad_left else ()
        self._rpad = ('',) * (n - 1) if pad_right else ()

        if estimator is None:
            estimator = _estimator

        cfd = ConditionalFreqDist()
        self._ngrams = set()

        # If given a list of strings instead of a list of lists, create enclosing list
        if (train is not None) and isinstance(train[0], str):
            train = [train]

        for sent in train:
            for ngram in ingrams(chain(self._lpad, sent, self._rpad), n):
                # Remember the ngram and count its last word under the
                # context formed by the preceding n-1 words.  (These updates
                # were missing, leaving the model empty.)  Item assignment is
                # used so this works whether or not FreqDist offers inc().
                self._ngrams.add(ngram)
                context = tuple(ngram[:-1])
                token = ngram[-1]
                cfd[context][token] += 1

        if not estimator_args and not estimator_kwargs:
            # Backward-compatible default: pass the number of conditions.
            self._model = ConditionalProbDist(cfd, estimator, len(cfd))
        else:
            self._model = ConditionalProbDist(cfd, estimator, *estimator_args, **estimator_kwargs)

        # recursively construct the lower-order models
        if n > 1:
            self._backoff = NgramModel(n - 1, train, pad_left, pad_right,
                                       estimator, *estimator_args, **estimator_kwargs)
Example #39
    def __init__(
            char_set=string.lowercase + string.punctuation + string.digits,
        # NOTE(review): this copy of the constructor is damaged.  The
        # signature is missing `self`, the parameters the body reads (n,
        # train, smoothing, maximum_length, minimum_length, patch_char) and
        # its closing "):", and the descriptive text below has no
        # triple-quote delimiters, so the block does not parse as written.
        # Restore it from the upstream source before relying on it.
        Creates an nchar language model to capture patterns in n consecutive
        words of training text.  An estimator smooths the probabilities derived
        from the text and may allow generation of ngrams not seen during

        @param n: the order of the language model (nchar size)
        @type n: C{int}
        @param train: the training text
        @type train: C{list} of C{string}
        @param estimator: a function for generating a probability distribution
        @type estimator: a function that takes a C{ConditionalFreqDist} and
              returns a C{ConditionalProbDist}

        self._smoothing = smoothing
        #self.lagrangian_parameter = lagrangian_parameter;

        self._n = n

        self._maximum_length = maximum_length
        self._minimum_length = minimum_length
        self._char_set = char_set

        # The active estimator ignores its `bins` argument and always uses
        # len(self._char_set) + 1 bins with self._smoothing as the Lidstone
        # parameter.
        #estimator = lambda fdist, bins: nltk.probability.WittenBellProbDist(fdist, len(char_set));
        estimator = lambda fdist, bins: nltk.probability.LidstoneProbDist(
            fdist, self._smoothing,
            len(self._char_set) + 1)
        #estimator = lambda fdist, bins: nltk.probability.LidstoneProbDist(fdist, 1e-9, len(self._char_set));
        #estimator = lambda fdist, bins: nltk.probability.GoodTuringProbDist(fdist, len(self._char_set));
        #estimator = lambda fdist, bins: nltk.probability.SimpleGoodTuringProbDist(fdist, len(self._char_set));

        cfd = ConditionalFreqDist()
        self._ngrams = set()
        self._patch_char = patch_char
        self._prefix = (self._patch_char, ) * (n - 1)

        length = nltk.probability.FreqDist()
        word_freq_dist = nltk.probability.FreqDist()
        char_list = []
        # NOTE(review): in the loop below the body of the length check is
        # missing and the ", 1)" line is the tail of a statement whose start
        # was lost (presumably updates of `length` / `word_freq_dist`) --
        # restore from upstream.
        for word in train:
            word = word.strip().lower()
            if len(word) < self._minimum_length or len(
                    word) > self._maximum_length:
  , 1)
            char_list.extend([char for char in word if char in self._char_set])
        # NOTE(review): the first argument of this call appears to have been
        # lost; the commented-out variant below passes `length` first.
        self._length = nltk.probability.WittenBellProbDist(
            length.B() + 1)
        #self._length = nltk.probability.WittenBellProbDist(length, self._maximum_length);

        #context_freq_dist = nltk.probability.FreqDist();
        #for nchar in ingrams(chain(self._prefix, train), n):
        # NOTE(review): context and token are computed but never stored;
        # presumably the lines updating cfd / self._ngrams were lost.
        for nchar in ingrams(char_list, n):
            context = tuple(nchar[:-1])
            token = nchar[-1]
        #self._context = nltk.probability.WittenBellProbDist(context_freq_dist, len(self._char_set)**(n-1)+1);
        # NOTE(review): this debug block reads context_freq_dist and
        # self._context, whose only assignments in this method are commented
        # out above, so it fails for n == 3 unless they are defined elsewhere.
        if n==3:
            cond = 0;
            for x in self._char_set:
                for y in self._char_set:
                    print (x, y), context_freq_dist[(x, y)], self._context.prob((x, y));
                    cond += self._context.prob((x, y));
            print 'cond is', cond

        #self._model = ConditionalProbDist(cfd, estimator, len(cfd));
        #print self._char_set;
        self._model = ConditionalProbDist(cfd, estimator,
                                          len(self._char_set)**(n - 1))

        #========== ========== ========== ========== ========== ========== ========== ========== ========== ========== ========== ========== ========== ==========
        consonant_freq_dist = nltk.probability.FreqDist();
        consonant_condition_freq_dist = nltk.probability.ConditionalFreqDist();
        # NOTE(review): the pattern 'aeiouy' matches only that literal
        # six-letter substring; presumably a character class such as
        # '[aeiouy]' was intended.  The innermost loop body is also truncated
        # to ", 1);".
        for word in train:
            #word = re.sub(r'aeiou', ' ', word);
            word = word[0] + re.sub('aeiouy', ' ', word[1:]);
            consonant_list = word.split();
            #consonant_list = ['#', '#'] + consonant_list;
            for temp in consonant_list:
      , 1);
        #========== ========== ========== ========== ========== ========== ========== ========== ========== ========== ========== ========== ========== ==========
        word_prob_dist = nltk.probability.MLEProbDist(word_freq_dist)

        # One slot per word length from minimum_length to maximum_length; the
        # 1e-300 offset keeps every entry of the later divisor non-zero.
        word_model_empirical_frequency = numpy.zeros(
            (1, self._maximum_length - self._minimum_length + 1)) + 1e-300
        word_model_square = numpy.zeros(
            (1, self._maximum_length - self._minimum_length + 1)) + 1e-300

        #word_model_empirical_frequency_old = numpy.zeros((1, self._maximum_length - self._minimum_length + 1));
        #word_model_square_old = numpy.zeros((1, self._maximum_length - self._minimum_length + 1));

        # total_outcomes is only referenced by commented-out code below.
        total_outcomes = 0
        for x in xrange(self._minimum_length, self._maximum_length + 1):
            total_outcomes += len(self._char_set)**x

        # NOTE(review): both update statements in this loop are truncated;
        # the array names before "[" and the tail of the second expression
        # are missing -- restore from upstream.
        for word in word_freq_dist.keys():
                0, len(word) - self._minimum_length] += word_prob_dist.prob(
                    word) * self.probability_without_length(word)
            #word_model_empirical_frequency[0, len(word)-self._minimum_length] += 1.0/total_outcomes * self.probability_without_length(word);
                0, len(word) -
                self._minimum_length] += self.probability_without_length(

            #word_model_empirical_frequency_old[0, len(word)-self._minimum_length] += word_prob_dist.prob(word) * self.probability_without_length(word);
            #word_model_square_old[0, len(word)-self._minimum_length] += self.probability_without_length(word) ** 2;

        #print "alpha is", 2 * (1-numpy.sum(word_model_empirical_frequency / word_model_square))/numpy.sum(1.0/word_model_square)
        #print word_model_empirical_frequency, word_model_square

        #sum_word_model_square_inverse = numpy.sum(1.0 / word_model_square);
        #sum_word_model_empirical_frequency_over_word_model_square = numpy.sum(word_model_empirical_frequency / word_model_square);
        #self._multinomial_length = (word_model_empirical_frequency * sum_word_model_square_inverse - sum_word_model_empirical_frequency_over_word_model_square + 1) / (word_model_square * sum_word_model_square_inverse);
        #print sum_word_model_square_inverse, sum_word_model_empirical_frequency_over_word_model_square;
        #print self._multinomial_length, numpy.sum(self._multinomial_length);

        # NOTE(review): as written the computed value is immediately
        # overwritten with 1.0; presumably an `else:` between the two
        # assignments was lost.
        if True:
            lagrangian_parameter = 2 * (1 - numpy.sum(
                word_model_empirical_frequency /
                word_model_square)) / numpy.sum(1.0 / word_model_square)
            lagrangian_parameter = 1.
        #print "lagrangian parameter is", lagrangian_parameter
        # Per-length weights, normalised to sum to 1 and required to be
        # non-negative.
        self._multinomial_length = (
            word_model_empirical_frequency -
            lagrangian_parameter / 2) / word_model_square
        self._multinomial_length /= numpy.sum(self._multinomial_length)

        #print self._multinomial_length, numpy.sum(self._multinomial_length);
        assert numpy.all(
            self._multinomial_length >= 0), self._multinomial_length

        # recursively construct the lower-order models
        if n > 1:
            self._backoff = NcharModel(n - 1, train, self._smoothing,
                                       maximum_length, minimum_length,
                                       self._char_set, self._patch_char)
Example #40
from nltk.corpus import genesis
from nltk.probability import LidstoneProbDist
from nltk.model import NgramModel
from nltk.util import ingrams

# Demo: train a trigram language model whose "words" are the overlapping
# character trigrams of a short sentence, then build the same kind of tokens
# for a second sentence.
# (A corpus word list, e.g. genesis.words('english-kjv.txt'), could be used
# as the training tokens instead of the sample sentence.)


def estimator(fdist, bins):
    """Smooth ``fdist`` with a Lidstone estimate using gamma = 0.2.

    ``bins`` is accepted so the function keeps the two-argument signature
    of the original estimator, but it is not used.
    """
    return LidstoneProbDist(fdist, 0.2)


sent = "abraham lincoln be bear feb 12 1809"
# list(sent) explodes the sentence into characters, so each n-gram is a
# tuple of three characters (spaces included).
splitNgrams = list(ingrams(list(sent), 3))

# Join each character tuple back into a 3-character string token.
tokens = ["".join(gram) for gram in splitNgrams]

# N-gram language model with 3-grams
# Without an estimator, it assumes Good-Turing.
model = NgramModel(3, tokens, estimator)
# Single-argument parenthesized print gives the same output on Python 2
# and is also valid Python 3 syntax.
print("Model: " + str(model))

sent2 = "abe lincoln was born in 1809"

splitNgrams2 = list(ingrams(list(sent2), 3))
tokens2 = ["".join(gram) for gram in splitNgrams2]

# Show the last character trigram of the second sentence.
print("Word: " + tokens2[-1])
Example #41
    def __init__(self,
                 n,
                 train,
                 estimator=None,
                 *estimator_args,
                 **estimator_kw_args):
        """
        Creates an ngram language model to capture patterns in n consecutive
        words of training text.  An estimator smooths the probabilities derived
        from the text and may allow generation of ngrams not seen during
        training.

        @param n: the order of the language model (ngram size)
        @type n: C{int}
        @param train: the training text
        @type train: C{list} of C{string} (or C{list} of C{string} C{list}s)
        @param estimator: a function for generating a probability distribution
        @type estimator: a function that takes a C{ConditionalFreqDist} and
              returns a C{ConditionalProbDist}
        @param estimator_args: Extra arguments for C{estimator}.
            These arguments are usually used to specify extra
            properties for the probability distributions of individual
            conditions, such as the number of bins they contain.
            Note: For backward-compatibility, if no arguments are specified, the
            number of bins in the underlying C{ConditionalFreqDist} are passed to
            the estimator as an argument.
        @type estimator_args: (any)
        @param estimator_kw_args: Extra keyword arguments for C{estimator}.
        @type estimator_kw_args: (any)
        """

        self._n = n

        # Fall back to the module-level default estimator.
        if estimator is None:
            estimator = _estimator

        cfd = ConditionalFreqDist()
        self._ngrams = set()
        # (n - 1) empty-string symbols are placed on both sides of every
        # utterance so that the first and last tokens get full contexts.
        self._padding = ('', ) * (n - 1)

        # If given a list of strings instead of a list of lists, create enclosing list
        if isinstance(train[0], basestring):
            train = [train]

        for utterance in train:
            for ngram in ingrams(
                    chain(self._padding, utterance, self._padding), n):
                # Remember every n-gram seen and count its last token under
                # the context formed by the preceding (n - 1) tokens.
                self._ngrams.add(ngram)
                context = tuple(ngram[:-1])
                token = ngram[-1]
                # NOTE(review): FreqDist.inc() is the NLTK 2.x counting call,
                # assumed here because the code targets Python 2
                # (basestring) -- confirm against the installed NLTK.
                cfd[context].inc(token)

        if (not estimator_args) and (not estimator_kw_args):
            # Backward-compatible default: pass the number of conditions
            # as the single extra estimator argument.
            self._model = ConditionalProbDist(cfd, estimator, len(cfd))
        else:
            self._model = ConditionalProbDist(cfd, estimator, *estimator_args,
                                              **estimator_kw_args)

        # recursively construct the lower-order models
        if n > 1:
            self._backoff = NgramModel(n - 1, train, estimator,
                                       *estimator_args, **estimator_kw_args)
Example #42
    def __init__(self, n, train, pad_left=False, pad_right=False,
                 estimator=None, *estimator_args, **estimator_kwargs):
        Creates an ngram language model to capture patterns in n consecutive
        words of training text.  An estimator smooths the probabilities derived
        from the text and may allow generation of ngrams not seen during

        :param n: the order of the language model (ngram size)
        :type n: C{int}
        :param train: the training text
        :type train: C{iterable} of C{string} or C{iterable} of C{iterable} of C{string} 
        :param estimator: a function for generating a probability distribution---defaults to MLEProbDist
        :type estimator: a function that takes a C{ConditionalFreqDist} and
              returns a C{ConditionalProbDist}
        :param pad_left: whether to pad the left of each sentence with an (n-1)-gram of <s>
        :type pad_left: bool
        :param pad_right: whether to pad the right of each sentence with </s>
        :type pad_right: bool
        :param estimator_args: Extra arguments for estimator.
            These arguments are usually used to specify extra
            properties for the probability distributions of individual
            conditions, such as the number of bins they contain.
            Note: For backward-compatibility, if no arguments are specified, the
            number of bins in the underlying ConditionalFreqDist are passed to
            the estimator as an argument.
        :type estimator_args: (any)
        :param estimator_kwargs: Extra keyword arguments for the estimator
        :type estimator_kwargs: (any)

        # protection from cryptic behavior for calling programs
        # that use the pre-2.0.2 interface
        assert(isinstance(pad_left, bool))
        assert(isinstance(pad_right, bool))

        # make sure n is greater than zero, otherwise print it
        assert (n > 0), n

        # For explicitness save the check whether this is a unigram model
        self.is_unigram_model = (n == 1)
        # save the ngram order number
        self._n = n
        # save left and right padding
        self._lpad = ('<s>',) * (n - 1) if pad_left else ()
        # Need _rpad even for unigrams or padded entropy will give
        #  wrong answer because '</s>' will be treated as unseen...
        self._rpad = ('</s>',) if pad_right else ()
        self._padLen = len(self._lpad)+len(self._rpad)

        delta = 1+self._padLen-n        # len(sent)+delta == ngrams in sent

        if estimator is None:
            assert (estimator_args is ()) and (estimator_kwargs=={}),\
                   "estimator_args (%s) or _kwargs supplied (%s), but no estimator"%(estimator_args,estimator_kwargs)
            estimator = lambda fdist, bins: MLEProbDist(fdist)

        # Given backoff, a generator isn't acceptable
        if not isinstance(train,
        self._W = len(train)
        # Coerce to list of list -- note that this means to train charGrams,
        #  requires exploding the words ahead of time 
        if train is not None:
            if isinstance(train[0], compat.string_types):
                train = [train]
            elif not isinstance(train[0],
                # if you mix strings and generators, you have only yourself
                #  to blame!
                for i in range(len(train)):

        if n == 1:
            if pad_right:
                sents=(chain(s,self._rpad) for s in train)
            for s in sents:
            if not estimator_args and not estimator_kwargs:
                self._model = estimator(fd,fd.B())
                self._model = estimator(fd,fd.B(),
                                        *estimator_args, **estimator_kwargs)
            cfd = ConditionalFreqDist()
            self._ngrams = set()

            for sent in train:
                for ngram in ingrams(chain(self._lpad, sent, self._rpad), n):
                    context = tuple(ngram[:-1])
                    token = ngram[-1]
            if not estimator_args and not estimator_kwargs:
                self._model = ConditionalProbDist(cfd, estimator, len(cfd))
                self._model = ConditionalProbDist(cfd, estimator, *estimator_args, **estimator_kwargs)

        # recursively construct the lower-order models
        if not self.is_unigram_model:
            self._backoff = NgramModel(n-1, train,
                                        pad_left, pad_right,

            # Code below here in this method, and the _words_following and _alpha method, are from
            # "Last updated on Feb 26, 2015"
            self._backoff_alphas = dict()
            # For each condition (or context)
            for ctxt in cfd.conditions():
                backoff_ctxt = ctxt[1:]
                backoff_total_pr = 0.0
                total_observed_pr = 0.0

                # this is the subset of words that we OBSERVED following
                # this context.
                # i.e. Count(word | context) > 0
                for word in self._words_following(ctxt, cfd):
                    total_observed_pr += self.prob(word, ctxt)
                    # we also need the total (n-1)-gram probability of
                    # words observed in this n-gram context
                    backoff_total_pr += self._backoff.prob(word, backoff_ctxt)
                if isclose(total_observed_pr,1.0):
                    assert 0.0 <= total_observed_pr <= 1.0,\
                           "sum of probs for %s out of bounds: %.10g"%(ctxt,total_observed_pr)
                # beta is the remaining probability weight after we factor out
                # the probability of observed words.
                # As a sanity check, both total_observed_pr and backoff_total_pr
                # must be GE 0, since probabilities are never negative
                beta = 1.0 - total_observed_pr

                if beta!=0.0:
                    assert (0.0 <= backoff_total_pr < 1.0), \
                           "sum of backoff probs for %s out of bounds: %s"%(ctxt,backoff_total_pr)
                    alpha_ctxt = beta / (1.0 - backoff_total_pr)
                    assert ((0.0 <= backoff_total_pr < 1.0) or
                            isclose(1.0,backoff_total_pr)), \
                           "sum of backoff probs for %s out of bounds: %s"%(ctxt,backoff_total_pr)
                    alpha_ctxt = 0.0

                self._backoff_alphas[ctxt] = alpha_ctxt