Example #1
 def __init__(self, order=3, sb="<s>", se="</s>", raw=False, ml=False):
     self.sb = sb
     self.se = se
     self.raw = raw
     self.ml = ml
     self.order = order
     self.ngrams = NGramStack(order=order)
     self.counts = [defaultdict(float) for i in xrange(order)]
Example #2
 def __init__(self, order=3, sb="<s>", se="</s>"):
     self.sb = sb
     self.se = se
     self.order = order
     self.ngrams = NGramStack(order=order)
     self.denominators = [defaultdict(float) for i in xrange(order - 1)]
     self.numerators = [defaultdict(float) for i in xrange(order - 1)]
     self.nonZeros = [defaultdict(float) for i in xrange(order - 1)]
     self.CoC = [[0.0 for j in xrange(4)] for i in xrange(order)]
     self.discounts = [0.0 for i in xrange(order - 1)]
     self.UD = 0.
     self.UN = defaultdict(float)
Example #3
 def __init__(self, order=3, sb="<s>", se="</s>"):
     self.sb = sb
     self.se = se
     self.order = order
     self.ngrams = NGramStack(order=order)
     self.denominators = [defaultdict(float) for i in xrange(order - 1)]
     self.numerators = [defaultdict(float) for i in xrange(order - 1)]
     #Modified Kneser-Ney requires that we track the individual N_i
     # in contrast to Kneser-Ney, which just requires the sum-total.
     self.nonZeros = [
         defaultdict(lambda: defaultdict(float)) for i in xrange(order - 1)
     ]
     self.CoC = [[0.0 for j in xrange(4)] for i in xrange(order)]
     self.discounts = [[0.0 for j in xrange(3)] for i in xrange(order - 1)]
     self.UD = 0.
     self.UN = defaultdict(float)
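
The nested defaultdict above is the one structural change Modified
Kneser-Ney needs: instead of a single running total per history, it
keeps a per-suffix tally so the N_1, N_2 and N_3+ bins can be
recovered later. A minimal sketch of the idea (toy data, standard
library only):

    from collections import defaultdict

    nonzeros = defaultdict(lambda: defaultdict(float))
    for history, word in [("a b", "c"), ("a b", "c"), ("a b", "d")]:
        nonzeros[history][word] += 1.0

    #Per-suffix counts survive: {'c': 2.0, 'd': 1.0}
    print(dict(nonzeros["a b"]))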
Example #4
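#Assumed imports from the original module (not shown in this excerpt):
# import re
# from math import log
# from collections import defaultdict
#plus the project-local NGramStack helper class.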
class KNSmoother( ):
    """
      Stand-alone python implementation of interpolated Fixed Kneser-Ney discounting.

      Intended for educational purposes, this should produce results identical
       to mitlm's 'estimate-ngram' utility,
         mitlm:
          $ estimate-ngram -o 3 -t train.corpus -s FixKN
         SimpleKN.py:
          $ SimpleKN.py -t train.corpus
    
      WARNING: This program has not been optimized in any way and will almost 
       surely be extremely slow for anything larger than a small toy corpus.

    """

    def __init__( self, order=3, sb="<s>", se="</s>" ):
        self.sb        = sb
        self.se        = se
        self.order     = order
        self.ngrams    = NGramStack(order=order)
        self.denominators = [ defaultdict(float) for i in xrange(order-1) ]
        self.numerators   = [ defaultdict(float) for i in xrange(order-1) ]
        self.nonZeros     = [ defaultdict(float) for i in xrange(order-1) ]
        self.CoC          = [ [ 0.0 for j in xrange(4) ] for i in xrange(order) ]
        self.discounts    = [ 0.0 for i in xrange(order-1) ]
        self.UD = 0.
        self.UN = defaultdict(float)

    def _compute_counts_of_counts( self ):
        """
          Compute counts-of-counts (CoC) for each N-gram order.
          Only CoC<=4 are relevant to the computation of
          either ModKNFix or KNFix.
        """

        for k in self.UN:
            if self.UN[k] <= 4:
                self.CoC[0][int(self.UN[k]-1)] += 1.

        for i,dic in enumerate(self.numerators):
            for k in dic:
                if dic[k]<=4:
                    self.CoC[i+1][int(dic[k]-1)] += 1.
        return

    def _compute_discounts( self ):
        """
          Compute the discount parameters. Note that unigram counts
          are not discounted in either FixKN or FixModKN.

          ---------------------------------
          Fixed Kneser-Ney smoothing: FixKN
          ---------------------------------
          This is based on the solution described in Kneser-Ney '95, 
          and reformulated in Chen&Goodman '98.  
          
             D = N_1 / ( N_1 + 2(N_2) )

          where N_1 refers to the # of N-grams that appear exactly
          once, and N_2 refers to the number of N-grams that appear
          exactly twice.  This is computed for each order.

          NOTE: The discount formula for FixKN is identical to the
                one used for Absolute discounting.
        """

        #Uniform discount for each N-gram order
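        #e.g. N_1=200, N_2=70 gives D = 200/(200+140) ~= 0.588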
        for o in xrange(self.order-1):
            self.discounts[o] = self.CoC[o+1][0] / (self.CoC[o+1][0]+2*self.CoC[o+1][1])

        return 

    def _get_discount( self, order, ngram ):
        """
          Retrieve the pre-computed discount for this N-gram.
        """

        return self.discounts[order]

    def kneser_ney_from_counts( self, arpa_file ):
        """
          Train the KN-discount language model from an ARPA format 
          file containing raw count data.  This can be generated with,
            $ ./SimpleCount.py --train train.corpus -r > counts.arpa

        """

        m_ord = c_ord = 0

        for line in open(arpa_file, "r"):
            ngram, count = line.strip().split("\t")
            count = float(count)

            ngram = ngram.split(" ")
            if len(ngram)==2:
                self.UD += 1.0
                self.UN[" ".join(ngram[1:])] += 1.0
                self.nonZeros[len(ngram)-2][" ".join(ngram[:-1])] += 1.0
                if ngram[0]==self.sb:
                    self.numerators[len(ngram)-2][" ".join(ngram)] += count
                    self.denominators[len(ngram)-2][" ".join(ngram[:-1])] += count

            if len(ngram)>2 and len(ngram)<self.order:
                self.numerators[len(ngram)-3][" ".join(ngram[1:])] += 1.0
                self.denominators[len(ngram)-3][" ".join(ngram[1:-1])] += 1.0
                self.nonZeros[len(ngram)-2][" ".join(ngram[:-1])] += 1.0
                if ngram[0]==self.sb:
                    self.numerators[len(ngram)-2][" ".join(ngram)] += count
                    self.denominators[len(ngram)-2][" ".join(ngram[:-1])] += count

            if len(ngram)==self.order:
                self.numerators[len(ngram)-3][" ".join(ngram[1:])] += 1.0
                self.numerators[len(ngram)-2][" ".join(ngram)] = count
                self.denominators[len(ngram)-3][" ".join(ngram[1:-1])] += 1.0
                self.denominators[len(ngram)-2][" ".join(ngram[:-1])] += count
                self.nonZeros[len(ngram)-2][" ".join(ngram[:-1])] += 1.0

        self._compute_counts_of_counts ( )
        self._compute_discounts( )

        #self._print_raw_counts( )
        return

    def kneser_ney_discounting( self, training_file ):
        """
          Iterate through the training data using a FIFO stack or 
           'window' of max-length equal to the specified N-gram order.

          Each time a new word is pushed onto the N-gram stack call
           the _kn_recurse() subroutine to increment the N-gram 
           contexts in the current window / on the stack. 

          If pushing a word onto the stack makes len(stack)>max-order, 
           then the word at the bottom (stack[0]) is popped off.
        """
        for line in open(training_file,"r"):
            #Split the current line into words.
            words = re.split(r"\s+",line.strip())

            #Push a sentence-begin token onto the stack
            self.ngrams.push(self.sb)

            for word in words:
                #Get the current 'window' of N-grams
                ngram = self.ngrams.push(word)

                #Now count all N-grams in the current window
                #These will be of span <= self.order
                self._kn_recurse( ngram, len(ngram)-2 )

            #Now push the sentence-end token onto the stack
            ngram = self.ngrams.push(self.se)
            self._kn_recurse( ngram, len(ngram)-2 )

            #Clear the stack for the next sentence
            self.ngrams.clear()
        self._compute_counts_of_counts ( )
        self._compute_discounts( )

        #self._print_raw_counts( )
        return

    def _print_raw_counts( self ):
        """
          Convenience function for sanity checking the history counts.
        """
        print "NUMERATORS:"
        for key in sorted(self.UN.iterkeys()):
            print " ", key, self.UN[key]
        for o in xrange(len(self.numerators)):
            print "ORD",o
            for key in sorted(self.numerators[o].iterkeys()):
                print " ", key, self.numerators[o][key]
        print "DENOMINATORS:"
        print self.UD
        for o in xrange(len(self.denominators)):
            print "DORD", o
            for key in sorted(self.denominators[o].iterkeys()):
                print " ", key, self.denominators[o][key]
        print "NONZEROS:"
        for o in xrange(len(self.nonZeros)):
            print "ZORD", o
            for key in sorted(self.nonZeros[o].iterkeys()):
                print " ", key, self.nonZeros[o][key]

    def _kn_recurse( self, ngram_stack, i ):
        """
         Kneser-Ney discount calculation recursion.
        """
        if i==-1 and ngram_stack[0]==self.sb:
            return
        o     = len(ngram_stack)
        numer = " ".join(ngram_stack[o-(i+2):])
        denom = " ".join(ngram_stack[o-(i+2):o-1])
        self.numerators[  i][numer] += 1.
        self.denominators[i][denom] += 1.
        if self.numerators[i][numer]==1.:
            self.nonZeros[i][denom]  += 1.
            if i>0:
                self._kn_recurse( ngram_stack, i-1 )
            else:
                #The <s> (sentence-begin) token is
                # NOT counted as a unigram event
                if not ngram_stack[-1]==self.sb:
                    self.UN[ngram_stack[-1]] += 1.
                    self.UD += 1.
        return  
        
    def print_ARPA( self ):
        """
          Print the interpolated Kneser-Ney LM out in ARPA format,
           computing the interpolated probabilities and back-off
           weights for each N-gram on-demand.  The format:
           ----------------------------
             \data\
             ngram 1=NUM_1GRAMS
             ngram 2=NUM_2GRAMS
             ...
             ngram N=NUM_NGRAMS (max order)
            
             \1-grams:
             p(a_z)  a_z  bow(a_z)
             ...
            
             \2-grams:
             p(a_z)  a_z  bow(a_z)
             ...
            
             \N-grams:
             p(a_z)  a_z
             ...

             \end\
           ----------------------------

        """

        #Handle the header info
        print "\\data\\"
        print "ngram 1=%d" % (len(self.UN)+1)
        for o in xrange(0,self.order-1):
            print "ngram %d=%d" % (o+2,len(self.numerators[o]) )

        #Handle the Unigrams
        print "\n\\1-grams:"
        d    = self.discounts[0]
        #KN discount
        lmda = self.nonZeros[0][self.sb] * d / self.denominators[0][self.sb]
        print "-99.00000\t%s\t%0.6f"   % ( self.sb, log(lmda, 10.) )

        for key in sorted(self.UN.iterkeys()):
            if key==self.se:
                print "%0.6f\t%s\t-99"   % ( log(self.UN[key]/self.UD, 10.), key )
                continue

            d    = self.discounts[0]
            #KN discount
            lmda = self.nonZeros[0][key] * d / self.denominators[0][key]
            print "%0.6f\t%s\t%0.6f" % ( log(self.UN[key]/self.UD, 10.), key, log(lmda, 10.) )

        #Handle the middle-order N-grams
        for o in xrange(0,self.order-2):
            print "\n\\%d-grams:" % (o+2)
            for key in sorted(self.numerators[o].iterkeys()):
                if key.endswith(self.se):
                    #No back-off prob for N-grams ending in </s>
                    prob = self._compute_interpolated_prob( key )
                    print "%0.6f\t%s" % ( log(prob, 10.), key )
                    continue
                d = self.discounts[o+1]
                #Compute the back-off weight
                #KN discount
                lmda = self.nonZeros[o+1][key] * d / self.denominators[o+1][key]
                #Compute the interpolated N-gram probability
                prob = self._compute_interpolated_prob( key )
                print "%0.6f\t%s\t%0.6f" % ( log(prob, 10.), key, log(lmda, 10.))

        #Handle the N-order N-grams
        print "\n\\%d-grams:" % (self.order)
        for key in sorted(self.numerators[self.order-2].iterkeys()):
            #Compute the interpolated N-gram probability
            prob = self._compute_interpolated_prob( key )
            print "%0.6f\t%s" % ( log(prob, 10.), key )

        print "\n\\end\\"
        return


    def _compute_interpolated_prob( self, ngram ):
        """
          Compute the interpolated probability for the input ngram.
          Cribbing the notation from the SRILM webpages,

             a_z    = An N-gram where a is the first word, z is the 
                       last word, and "_" represents 0 or more words in between.
             p(a_z) = The estimated conditional probability of the 
                       nth word z given the first n-1 words (a_) of an N-gram.
             a_     = The n-1 word prefix of the N-gram a_z.
             _z     = The n-1 word suffix of the N-gram a_z.

          Then we have, 
             f(a_z) = g(a_z) + bow(a_) p(_z)
             p(a_z) = (c(a_z) > 0) ? f(a_z) : bow(a_) p(_z)

          The ARPA format is generated by writing, for each N-gram
          with 1 < order < max_order:
             p(a_z)    a_z   bow(a_z)

          and for the maximum order:
             p(a_z)    a_z

          Special care must be taken for certain N-grams containing
           the <s> (sentence-begin) and </s> (sentence-end) tokens.  
          See the implementation for details on how to do this correctly.

          The formulation is based on the seminal Chen&Goodman '98 paper.

          SRILM notation-cribbing from:
             http://www.speech.sri.com/projects/srilm/manpages/ngram-discount.7.html
        """
        probability = 0.0
        ngram_stack = ngram.split(" ")
        probs       = [ 1e-99 for i in xrange(len(ngram_stack)) ]
        o           = len(ngram_stack)

        if not ngram_stack[-1]==self.sb:
            probs[0] = self.UN[ngram_stack[-1]] / self.UD

        for i in xrange(o-1):
            dID = " ".join(ngram_stack[o-(i+2):o-1])
            nID = " ".join(ngram_stack[o-(i+2):])
            if dID in self.denominators[i]:
                d = self.discounts[i]
                if nID in self.numerators[i]:
                    #We have an actual N-gram probability for this N-gram
                    #KN discount
                    probs[i+1] = (self.numerators[i][nID]-d)/self.denominators[i][dID]
                else:
                    #No actual N-gram prob, we will have to back-off
                    probs[i+1] = 1e-99
                #This break-down takes the following form:
                #  probs[i+1]:  The interpolated N-gram probability, p(a_z)
                #  lmda:        The un-normalized 'back-off' weight, bow(a_)
                #  probs[i]:    The next lower-order, interpolated N-gram 
                #               probability corresponding to p(_z)
                #KN discount
                lmda = self.nonZeros[i][dID] * d / self.denominators[i][dID]
                probs[i+1]  = probs[i+1] + lmda * probs[i]
                probability = probs[i+1]

        if probability == 0.0:
            #If we still have nothing, return the unigram probability
            probability = probs[0]

        return probability
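
The fixed discount in _compute_discounts is easy to verify by hand.
A minimal sketch, using invented counts-of-counts (the CoC values
below are illustrative, not taken from any corpus):

    #CoC[o][k] = number of (o+1)-grams seen exactly k+1 times
    CoC = [[120.0, 40.0, 15.0, 8.0],   #unigrams (never discounted)
           [200.0, 70.0, 30.0, 12.0],  #bigrams
           [350.0, 90.0, 40.0, 20.0]]  #trigrams

    order = 3
    discounts = [0.0 for i in range(order - 1)]
    for o in range(order - 1):
        #D = N_1 / (N_1 + 2*N_2), one uniform discount per order
        discounts[o] = CoC[o + 1][0] / (CoC[o + 1][0] + 2 * CoC[o + 1][1])

    print(discounts)  #[0.5882..., 0.6603...]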
Example #5
class KNSmoother():
    """
      Stand-alone python implementation of interpolated Fixed Kneser-Ney discounting.

      Intended for educational purposes, this should produce results identical
       to mitlm's 'estimate-ngram' utility,
         mitlm:
          $ estimate-ngram -o 3 -t train.corpus -s FixKN
         SimpleKN.py:
          $ SimpleKN.py -t train.corpus
    
      WARNING: This program has not been optimized in any way and will almost 
       surely be extremely slow for anything larger than a small toy corpus.

    """
    def __init__(self, order=3, sb="<s>", se="</s>"):
        self.sb = sb
        self.se = se
        self.order = order
        self.ngrams = NGramStack(order=order)
        self.denominators = [defaultdict(float) for i in xrange(order - 1)]
        self.numerators = [defaultdict(float) for i in xrange(order - 1)]
        self.nonZeros = [defaultdict(float) for i in xrange(order - 1)]
        self.CoC = [[0.0 for j in xrange(4)] for i in xrange(order)]
        self.discounts = [0.0 for i in xrange(order - 1)]
        self.UD = 0.
        self.UN = defaultdict(float)

    def _compute_counts_of_counts(self):
        """
          Compute counts-of-counts (CoC) for each N-gram order.
          Only CoC<=4 are relevant to the computation of
          either ModKNFix or KNFix.
        """

        for k in self.UN:
            if self.UN[k] <= 4:
                self.CoC[0][int(self.UN[k] - 1)] += 1.

        for i, dic in enumerate(self.numerators):
            for k in dic:
                if dic[k] <= 4:
                    self.CoC[i + 1][int(dic[k] - 1)] += 1.
        return

    def _compute_discounts(self):
        """
          Compute the discount parameters. Note that unigram counts
          are not discounted in either FixKN or FixModKN.

          ---------------------------------
          Fixed Kneser-Ney smoothing: FixKN
          ---------------------------------
          This is based on the solution described in Kneser-Ney '95, 
          and reformulated in Chen&Goodman '98.  
          
             D = N_1 / ( N_1 + 2(N_2) )

          where N_1 refers to the # of N-grams that appear exactly
          once, and N_2 refers to the number of N-grams that appear
          exactly twice.  This is computed for each order.

          NOTE: The discount formula for FixKN is identical to the
                one used for Absolute discounting.
        """

        #Uniform discount for each N-gram order
        for o in xrange(self.order - 1):
            self.discounts[o] = self.CoC[o + 1][0] / (self.CoC[o + 1][0] +
                                                      2 * self.CoC[o + 1][1])

        return

    def _get_discount(self, order, ngram):
        """
          Retrieve the pre-computed discount for this N-gram.
        """

        return self.discounts[order]

    def kneser_ney_from_counts(self, arpa_file):
        """
          Train the KN-discount language model from an ARPA format 
          file containing raw count data.  This can be generated with,
            $ ./SimpleCount.py --train train.corpus -r > counts.arpa

        """

        m_ord = c_ord = 0

        for line in open(arpa_file, "r"):
            ngram, count = line.strip().split("\t")
            #In this version, possibly fractional counts
            # are simply rounded to the nearest integer.
            #The result is a "poor-man's" fractional Kneser-Ney
            # which actually works quite well in practice.
            count = round(float(count))
            if count == 0.0: continue

            ngram = ngram.split(" ")
            if len(ngram) == 2:
                self.UD += 1.0
                self.UN[" ".join(ngram[1:])] += 1.0
                self.nonZeros[len(ngram) - 2][" ".join(ngram[:-1])] += 1.0
                if ngram[0] == self.sb:
                    self.numerators[len(ngram) - 2][" ".join(ngram)] += count
                    self.denominators[len(ngram) - 2][" ".join(
                        ngram[:-1])] += count

            if len(ngram) > 2 and len(ngram) < self.order:
                self.numerators[len(ngram) - 3][" ".join(ngram[1:])] += 1.0
                self.denominators[len(ngram) - 3][" ".join(ngram[1:-1])] += 1.0
                self.nonZeros[len(ngram) - 2][" ".join(ngram[:-1])] += 1.0
                if ngram[0] == self.sb:
                    self.numerators[len(ngram) - 2][" ".join(ngram)] += count
                    self.denominators[len(ngram) - 2][" ".join(
                        ngram[:-1])] += count

            if len(ngram) == self.order:
                self.numerators[len(ngram) - 3][" ".join(ngram[1:])] += 1.0
                self.numerators[len(ngram) - 2][" ".join(ngram)] = count
                self.denominators[len(ngram) - 3][" ".join(ngram[1:-1])] += 1.0
                self.denominators[len(ngram) - 2][" ".join(
                    ngram[:-1])] += count
                self.nonZeros[len(ngram) - 2][" ".join(ngram[:-1])] += 1.0

        self._compute_counts_of_counts()
        self._compute_discounts()

        #self._print_raw_counts( )
        return

    def kneser_ney_discounting(self, training_file):
        """
          Iterate through the training data using a FIFO stack or 
           'window' of max-length equal to the specified N-gram order.

          Each time a new word is pushed onto the N-gram stack call
           the _kn_recurse() subroutine to increment the N-gram 
           contexts in the current window / on the stack. 

          If pushing a word onto the stack makes len(stack)>max-order, 
           then the word at the bottom (stack[0]) is popped off.
        """
        for line in open(training_file, "r"):
            #Split the current line into words.
            words = re.split(r"\s+", line.strip())

            #Push a sentence-begin token onto the stack
            self.ngrams.push(self.sb)

            for word in words:
                #Get the current 'window' of N-grams
                ngram = self.ngrams.push(word)

                #Now count all N-grams in the current window
                #These will be of span <= self.order
                self._kn_recurse(ngram, len(ngram) - 2)

            #Now push the sentence-end token onto the stack
            ngram = self.ngrams.push(self.se)
            self._kn_recurse(ngram, len(ngram) - 2)

            #Clear the stack for the next sentence
            self.ngrams.clear()
        self._compute_counts_of_counts()
        self._compute_discounts()

        #self._print_raw_counts()
        return

    def _print_raw_counts(self):
        """
          Convenience function for sanity checking the history counts.
        """
        print "NUMERATORS:"
        for key in sorted(self.UN.iterkeys()):
            print " ", key, self.UN[key]
        for o in xrange(len(self.numerators)):
            print "ORD", o
            for key in sorted(self.numerators[o].iterkeys()):
                print " ", key, self.numerators[o][key]
        print "DENOMINATORS:"
        print self.UD
        for o in xrange(len(self.denominators)):
            print "DORD", o
            for key in sorted(self.denominators[o].iterkeys()):
                print " ", key, self.denominators[o][key]
        print "NONZEROS:"
        for o in xrange(len(self.nonZeros)):
            print "ZORD", o
            for key in sorted(self.nonZeros[o].iterkeys()):
                print " ", key, self.nonZeros[o][key]

    def _kn_recurse(self, ngram_stack, i):
        """
         Kneser-Ney discount calculation recursion.
        """
        if i == -1 and ngram_stack[0] == self.sb:
            return
        o = len(ngram_stack)
        numer = " ".join(ngram_stack[o - (i + 2):])
        denom = " ".join(ngram_stack[o - (i + 2):o - 1])
        self.numerators[i][numer] += 1.
        self.denominators[i][denom] += 1.
        if self.numerators[i][numer] == 1.:
            self.nonZeros[i][denom] += 1.
            if i > 0:
                self._kn_recurse(ngram_stack, i - 1)
            else:
                #The <s> (sentence-begin) token is
                # NOT counted as a unigram event
                if not ngram_stack[-1] == self.sb:
                    self.UN[ngram_stack[-1]] += 1.
                    self.UD += 1.
        return

    def print_ARPA(self):
        """
          Print the interpolated Kneser-Ney LM out in ARPA format,
           computing the interpolated probabilities and back-off
           weights for each N-gram on-demand.  The format:
           ----------------------------
             \data\
             ngram 1=NUM_1GRAMS
             ngram 2=NUM_2GRAMS
             ...
             ngram N=NUM_NGRAMS (max order)
            
             \1-grams:
             p(a_z)  a_z  bow(a_z)
             ...
            
             \2-grams:
             p(a_z)  a_z  bow(a_z)
             ...
            
             \N-grams:
             p(a_z)  a_z
             ...

             \end\
           ----------------------------

           NOTE: The try:/except: blocks are not necessary for normal 
                 circumstances.  Neither raw text nor correct counts will
                 fire the exception blocks.  These are needed only when 
                 using "poor-man's" fractional Kneser-Ney, in which case
                 the rounded counts may not agree.
                 
        """

        #Handle the header info
        print "\\data\\"
        print "ngram 1=%d" % (len(self.UN) + 1)
        for o in xrange(0, self.order - 1):
            print "ngram %d=%d" % (o + 2, len(self.numerators[o]))

        #Handle the Unigrams
        print "\n\\1-grams:"
        d = self.discounts[0]
        #KN discount
        try:
            lmda = self.nonZeros[0][self.sb] * d / self.denominators[0][
                self.sb]
        except:
            lmda = 1e-99
        print "-99.00000\t%s\t%0.6f" % (self.sb, log(lmda, 10.))

        for key in sorted(self.UN.iterkeys()):
            if key == self.se:
                print "%0.6f\t%s\t-99" % (log(self.UN[key] / self.UD,
                                              10.), key)
                continue

            d = self.discounts[0]
            #KN discount
            try:
                lmda = self.nonZeros[0][key] * d / self.denominators[0][key]
            except:
                lmda = 1e-99
            print "%0.6f\t%s\t%0.6f" % (log(self.UN[key] / self.UD,
                                            10.), key, log(lmda, 10.))

        #Handle the middle-order N-grams
        for o in xrange(0, self.order - 2):
            print "\n\\%d-grams:" % (o + 2)
            for key in sorted(self.numerators[o].iterkeys()):
                if key.endswith(self.se):
                    #No back-off prob for N-grams ending in </s>
                    prob = self._compute_interpolated_prob(key)
                    try:
                        print "%0.6f\t%s" % (log(prob, 10.), key)
                    except:
                        print "%0.6f\t%s" % (log(1e-99, 10.), key)
                    continue
                d = self.discounts[o + 1]
                #Compute the back-off weight
                #KN discount
                try:
                    lmda = self.nonZeros[o + 1][key] * d / self.denominators[
                        o + 1][key]
                except:
                    lmda = 1e-99
                #Compute the interpolated N-gram probability
                prob = self._compute_interpolated_prob(key)
                if lmda == 0.: lmda = 1e-99
                prob = max(prob, 1e-99)
                try:
                    print "%0.6f\t%s\t%0.6f" % (log(prob,
                                                    10.), key, log(lmda, 10.))
                except:
                    raise ValueError, "ERROR %0.6f\t%0.6f" % (prob, lmda)

        #Handle the N-order N-grams
        print "\n\\%d-grams:" % (self.order)
        for key in sorted(self.numerators[self.order - 2].iterkeys()):
            #Compute the interpolated N-gram probability
            prob = self._compute_interpolated_prob(key)
            try:
                print "%0.6f\t%s" % (log(prob, 10.), key)
            except:
                print "%0.6f\t%s" % (log(1e-99, 10.), key)

        print "\n\\end\\"
        return

    def _compute_interpolated_prob(self, ngram):
        """
          Compute the interpolated probability for the input ngram.
          Cribbing the notation from the SRILM webpages,

             a_z    = An N-gram where a is the first word, z is the 
                       last word, and "_" represents 0 or more words in between.
             p(a_z) = The estimated conditional probability of the 
                       nth word z given the first n-1 words (a_) of an N-gram.
             a_     = The n-1 word prefix of the N-gram a_z.
             _z     = The n-1 word suffix of the N-gram a_z.

          Then we have, 
             f(a_z) = g(a_z) + bow(a_) p(_z)
             p(a_z) = (c(a_z) > 0) ? f(a_z) : bow(a_) p(_z)

          The ARPA format is generated by writing, for each N-gram
          with 1 < order < max_order:
             p(a_z)    a_z   bow(a_z)

          and for the maximum order:
             p(a_z)    a_z

          Special care must be taken for certain N-grams containing
           the <s> (sentence-begin) and </s> (sentence-end) tokens.  
          See the implementation for details on how to do this correctly.

          The formulation is based on the seminal Chen&Goodman '98 paper.

          SRILM notation-cribbing from:
             http://www.speech.sri.com/projects/srilm/manpages/ngram-discount.7.html
        """
        probability = 0.0
        ngram_stack = ngram.split(" ")
        probs = [1e-99 for i in xrange(len(ngram_stack))]
        o = len(ngram_stack)

        if not ngram_stack[-1] == self.sb:
            probs[0] = self.UN[ngram_stack[-1]] / self.UD

        for i in xrange(o - 1):
            dID = " ".join(ngram_stack[o - (i + 2):o - 1])
            nID = " ".join(ngram_stack[o - (i + 2):])
            if dID in self.denominators[i]:
                d = self.discounts[i]
                if nID in self.numerators[i]:
                    #We have an actual N-gram probability for this N-gram
                    #KN discount
                    try:
                        probs[i + 1] = (self.numerators[i][nID] -
                                        d) / self.denominators[i][dID]
                    except:
                        probs[i + 1] = 1e-99
                else:
                    #No actual N-gram prob, we will have to back-off
                    probs[i + 1] = 1e-99
                #This break-down takes the following form:
                #  probs[i+1]:  The interpolated N-gram probability, p(a_z)
                #  lmda:        The un-normalized 'back-off' weight, bow(a_)
                #  probs[i]:    The next lower-order, interpolated N-gram
                #               probability corresponding to p(_z)
                #KN discount
                try:
                    lmda = self.nonZeros[i][dID] * d / self.denominators[i][dID]
                except:
                    lmda = 1e-99
                probs[i + 1] = probs[i + 1] + lmda * probs[i]
                probability = probs[i + 1]

        if probability == 0.0:
            #If we still have nothing, return the unigram probability
            probability = probs[0]

        return probability
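
A hypothetical driver for the class above.  NGramStack and the imports
(re, log, defaultdict) come from the original module and are assumed to
be on hand; 'train.corpus' is a placeholder path:

    smoother = KNSmoother(order=3)
    #Count N-grams sentence by sentence, then estimate the discounts
    smoother.kneser_ney_discounting("train.corpus")
    #Emit the interpolated model in ARPA format on stdout
    smoother.print_ARPA()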
Example #6
class ModKNSmoother():
    """
      Stand-alone python implementation of Fixed Modified Kneser-Ney discounting.

      Intended for educational purposes, this should produce results identical
       to Google NGramLibrary tools with ngrammake --bins=3.  See the included 
       run-NGramLibrary.sh script to train a model for comparison.
    
      WARNING: This may be slow for very large corpora.

    """
    def __init__(self, order=3, sb="<s>", se="</s>"):
        self.sb = sb
        self.se = se
        self.order = order
        self.ngrams = NGramStack(order=order)
        self.denominators = [defaultdict(float) for i in xrange(order - 1)]
        self.numerators = [defaultdict(float) for i in xrange(order - 1)]
        #Modified Kneser-Ney requires that we track the individual N_i
        # in contrast to Kneser-Ney, which just requires the sum-total.
        self.nonZeros = [
            defaultdict(lambda: defaultdict(float)) for i in xrange(order - 1)
        ]
        self.CoC = [[0.0 for j in xrange(4)] for i in xrange(order)]
        self.discounts = [[0.0 for j in xrange(3)] for i in xrange(order - 1)]
        self.UD = 0.
        self.UN = defaultdict(float)

    def _compute_counts_of_counts(self):
        """
          Compute counts-of-counts (CoC) for each N-gram order.
          Only CoC<=4 are relevant to the computation of
          either ModKNFix or KNFix.
        """

        for k in self.UN:
            if self.UN[k] <= 4:
                self.CoC[0][int(self.UN[k] - 1)] += 1.

        for i, dic in enumerate(self.numerators):
            for k in dic:
                if dic[k] <= 4:
                    self.CoC[i + 1][int(dic[k] - 1)] += 1.
        return

    def _compute_discounts(self):
        """
          Compute the discount parameters. Note that unigram counts
          are not discounted in either FixKN or FixModKN.

          ---------------------------------
          Fixed Modified Kneser-Ney: FixModKN
          ---------------------------------
          This is the solution proposed by Chen&Goodman '98

             Y    = N_1 / (N_1 + 2*N_2)
             D_1  = 1 - 2*Y * (N_2 / N_1)
             D_2  = 2 - 3*Y * (N_3 / N_2)
             D_3+ = 3 - 4*Y * (N_4 / N_3)

          where N_i again refers to the number of N-grams that appear
          exactly 'i' times in the training data.  Which discount D_i
          applies is selected by the count of the current N-gram: if
          the N-gram 'a b c' was seen exactly two times in the
          training corpus, then discount D_2 would be applied.

        """

        for o in xrange(self.order - 1):
            Y = self.CoC[o + 1][0] / (self.CoC[o + 1][0] +
                                      2 * self.CoC[o + 1][1])
            #Compute all the D_i based on the formula
            for i in xrange(3):
                if self.CoC[o + 1][i] > 0:
                    self.discounts[o][i] = (i + 1) - (i + 2) * Y * (
                        self.CoC[o + 1][i + 1] / self.CoC[o + 1][i])
                else:
                    self.discounts[o][i] = (i + 1)

        return

    def _get_discount(self, order, ngram):
        """
          Compute the discount mass for this N-gram, based on 
           the precomputed D_i and individual N_i.
        """

        c = [0.0, 0.0, 0.0]
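        #c[k] bins this history's continuations by how many times each
        # was seen: exactly once, exactly twice, or three or more times.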

        for key in self.nonZeros[order][ngram]:
            if int(self.nonZeros[order][ngram][key]) == 1:
                c[0] += 1.
            elif int(self.nonZeros[order][ngram][key]) == 2:
                c[1] += 1.
            else:
                c[2] += 1.

        #Compute the discount mass by summing over the D_i*N_i
        d = sum([self.discounts[order][i] * c[i] for i in xrange(len(c))])
        return d

    def kneser_ney_from_counts(self, arpa_file):
        """
          Train the KN-discount language model from an ARPA format 
          file containing raw count data.  This can be generated with,
            $ ./SimpleCount.py --train train.corpus -r > counts.arpa

        """

        m_ord = c_ord = 0

        for line in open(arpa_file, "r"):
            ngram, count = line.strip().split("\t")
            count = float(count)
            ngram = ngram.split(" ")
            if len(ngram) == 2:
                self.UD += 1.0
                self.UN[" ".join(ngram[1:])] += 1.0
                #Nonzeros based on suffixes
                if ngram[0] == self.sb:
                    self.nonZeros[len(ngram) - 2][" ".join(
                        ngram[:-1])][ngram[-1]] += count
                    self.numerators[len(ngram) - 2][" ".join(ngram)] += count
                    self.denominators[len(ngram) - 2][" ".join(
                        ngram[:-1])] += count

            if len(ngram) > 2 and len(ngram) < self.order:
                self.numerators[len(ngram) - 3][" ".join(ngram[1:])] += 1.0
                self.denominators[len(ngram) - 3][" ".join(ngram[1:-1])] += 1.0
                self.nonZeros[len(ngram) - 3][" ".join(
                    ngram[1:-1])][ngram[-1]] += 1.0
                if ngram[0] == self.sb:
                    self.numerators[len(ngram) - 2][" ".join(ngram)] += count
                    self.denominators[len(ngram) - 2][" ".join(
                        ngram[:-1])] += count
                    self.nonZeros[len(ngram) - 2][" ".join(
                        ngram[:-1])][ngram[-1]] += count

            if len(ngram) == self.order:
                self.numerators[len(ngram) - 3][" ".join(ngram[1:])] += 1.0
                self.numerators[len(ngram) - 2][" ".join(ngram)] = count
                self.denominators[len(ngram) - 3][" ".join(ngram[1:-1])] += 1.0
                self.denominators[len(ngram) - 2][" ".join(
                    ngram[:-1])] += count
                self.nonZeros[len(ngram) - 3][" ".join(
                    ngram[1:-1])][ngram[-1]] += 1.0
                self.nonZeros[len(ngram) - 2][" ".join(
                    ngram[:-1])][ngram[-1]] += count

        self._compute_counts_of_counts()
        self._compute_discounts()

        #self._print_raw_counts( )
        return

    def kneser_ney_discounting(self, training_file):
        """
          Iterate through the training data using a FIFO stack or 
           'window' of max-length equal to the specified N-gram order.

          Each time a new word is pushed onto the N-gram stack call
           the _kn_recurse() subroutine to increment the N-gram 
           contexts in the current window / on the stack. 

          If pushing a word onto the stack makes len(stack)>max-order, 
           then the word at the bottom (stack[0]) is popped off.
        """

        for line in open(training_file, "r"):
            #Split the current line into words.
            words = re.split(r"\s+", line.strip())

            #Push a sentence-begin token onto the stack
            self.ngrams.push(self.sb)

            for word in words:
                #Get the current 'window' of N-grams
                ngram = self.ngrams.push(word)

                #Now count all N-grams in the current window
                #These will be of span <= self.order
                self._kn_recurse(ngram, len(ngram) - 2)

            #Now push the sentence-end token onto the stack
            ngram = self.ngrams.push(self.se)
            self._kn_recurse(ngram, len(ngram) - 2)

            #Clear the stack for the next sentence
            self.ngrams.clear()
        self._compute_counts_of_counts()
        self._compute_discounts()

        return

    def print_raw_counts(self):
        """
          Convenience function for sanity checking the history counts.
        """
        #print "NUMERATORS:"
        #for key in sorted(self.UN.iterkeys()):
        #    print " ", key, self.UN[key]
        #for o in xrange(len(self.numerators)):
        #    print "ORD",o
        #    for key in sorted(self.numerators[o].iterkeys()):
        #        print " ", key, self.numerators[o][key]
        #print "DENOMINATORS:"
        #print self.UD
        #for o in xrange(len(self.denominators)):
        #    print "DORD", o
        #    for key in sorted(self.denominators[o].iterkeys()):
        #        print " ", key, self.denominators[o][key]
        print "NONZEROS:"
        for o in xrange(len(self.nonZeros)):
            print "ZORD", o
            for denom in sorted(self.nonZeros[o].iterkeys()):
                print " Den:", denom
                for key in sorted(self.nonZeros[o][denom].iterkeys()):
                    print "   ", key, self.nonZeros[o][denom][key]

    def _kn_recurse(self, ngram_stack, i):
        """
         Kneser-Ney discount calculation recursion.
        """

        if i == -1 and ngram_stack[0] == self.sb:
            return

        o = len(ngram_stack)
        numer = " ".join(ngram_stack[o - (i + 2):])
        denom = " ".join(ngram_stack[o - (i + 2):o - 1])
        self.numerators[i][numer] += 1.
        self.denominators[i][denom] += 1.

        #For Modified Kneser-Ney we need to track
        # individual nonZeros based on their suffixes
        self.nonZeros[i][denom][ngram_stack[-1]] += 1.
        if self.numerators[i][numer] == 1.:
            if i > 0:
                self._kn_recurse(ngram_stack, i - 1)
            else:
                #The <s> (sentence-begin) token is
                # NOT counted as a unigram event
                if not ngram_stack[-1] == self.sb:
                    self.UN[ngram_stack[-1]] += 1.
                    self.UD += 1.
        return

    def print_ARPA(self):
        """
          Print the interpolated Kneser-Ney LM out in ARPA format,
           computing the interpolated probabilities and back-off
           weights for each N-gram on-demand.  The format:
           ----------------------------
             \data\
             ngram 1=NUM_1GRAMS
             ngram 2=NUM_2GRAMS
             ...
             ngram N=NUM_NGRAMS (max order)
            
             \1-grams:
             p(a_z)  a_z  bow(a_z)
             ...
            
             \2-grams:
             p(a_z)  a_z  bow(a_z)
             ...
            
             \N-grams:
             p(a_z)  a_z
             ...

             \end\
           ----------------------------
        """

        #Handle the header info
        print "\\data\\"
        print "ngram 1=%d" % (len(self.UN) + 1)
        for o in xrange(0, self.order - 1):
            print "ngram %d=%d" % (o + 2, len(self.numerators[o]))

        #Handle the Unigrams
        print "\n\\1-grams:"
        d = self._get_discount(0, self.sb)
        #ModKN discount
        lmda = d / self.denominators[0][self.sb]
        print "-99.00000\t%s\t%0.7f" % (self.sb, log(lmda, 10.))

        for key in sorted(self.UN.iterkeys()):
            if key == self.se:
                print "%0.7f\t%s\t-99" % (log(self.UN[key] / self.UD,
                                              10.), key)
                continue

            d = self._get_discount(0, key)
            #ModKN discount
            lmda = d / self.denominators[0][key]
            print "%0.7f\t%s\t%0.7f" % (log(self.UN[key] / self.UD,
                                            10.), key, log(lmda, 10.))

        #Handle the middle-order N-grams
        for o in xrange(0, self.order - 2):
            print "\n\\%d-grams:" % (o + 2)
            for key in sorted(self.numerators[o].iterkeys()):
                if key.endswith(self.se):
                    #No back-off prob for N-grams ending in </s>
                    prob = self._compute_interpolated_prob(key)
                    print "%0.7f\t%s" % (log(prob, 10.), key)
                    continue
                d = self._get_discount(o + 1, key)
                #Compute the back-off weight
                #ModKN discount
                lmda = d / self.denominators[o + 1][key]
                #Compute the interpolated N-gram probability
                prob = self._compute_interpolated_prob(key)
                print "%0.7f\t%s\t%0.7f" % (log(prob, 10.), key, log(
                    lmda, 10.))

        #Handle the N-order N-grams
        print "\n\\%d-grams:" % (self.order)
        for key in sorted(self.numerators[self.order - 2].iterkeys()):
            #Compute the interpolated N-gram probability
            prob = self._compute_interpolated_prob(key)
            print "%0.7f\t%s" % (log(prob, 10.), key)

        print "\n\\end\\"
        return

    def _compute_interpolated_prob(self, ngram):
        """
          Compute the interpolated probability for the input ngram.
          Cribbing the notation from the SRILM webpages,

             a_z    = An N-gram where a is the first word, z is the 
                       last word, and "_" represents 0 or more words in between.
             p(a_z) = The estimated conditional probability of the 
                       nth word z given the first n-1 words (a_) of an N-gram.
             a_     = The n-1 word prefix of the N-gram a_z.
             _z     = The n-1 word suffix of the N-gram a_z.

          Then we have, 
             f(a_z) = g(a_z) + bow(a_) p(_z)
             p(a_z) = (c(a_z) > 0) ? f(a_z) : bow(a_) p(_z)

          The ARPA format is generated by writing, for each N-gram
          with 1 < order < max_order:
             p(a_z)    a_z   bow(a_z)

          and for the maximum order:
             p(a_z)    a_z

          Special care must be taken for certain N-grams containing
           the <s> (sentence-begin) and </s> (sentence-end) tokens.  
          See the implementation for details on how to do this correctly.

          The formulation is based on the seminal Chen&Goodman '98 paper.

          SRILM notation-cribbing from:
             http://www.speech.sri.com/projects/srilm/manpages/ngram-discount.7.html
        """
        probability = 0.0
        ngram_stack = ngram.split(" ")
        probs = [1e-99 for i in xrange(len(ngram_stack))]
        o = len(ngram_stack)

        if not ngram_stack[-1] == self.sb:
            probs[0] = self.UN[ngram_stack[-1]] / self.UD

        for i in xrange(o - 1):
            dID = " ".join(ngram_stack[o - (i + 2):o - 1])
            nID = " ".join(ngram_stack[o - (i + 2):])
            if dID in self.denominators[i]:
                count = int(self.numerators[i][nID])
                d_i = min(count, 3)
                probs[i + 1] = (count - self.discounts[i][d_i - 1]
                                ) / self.denominators[i][dID]

                #This break-down takes the following form:
                #  probs[i+1]:  The interpolated N-gram probability, p(a_z)
                #  d:           The discount mass for a_: \Sum_i D_i*N_i
                #  lmda:        The un-normalized 'back-off' weight, bow(a_)
                #  probs[i]:    The next lower-order, interpolated N-gram
                #               probability corresponding to p(_z)
                #ModKN discount
                d = self._get_discount(i, dID)
                lmda = d / self.denominators[i][dID]
                probs[i + 1] = probs[i + 1] + lmda * probs[i]
                probability = probs[i + 1]

        if probability == 0.0:
            #If we still have nothing, return the unigram probability
            probability = probs[0]

        return probability
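
A small sketch of the per-history discount mass computed by
_get_discount above, again with invented counts:

    #Continuation counts for one toy history 'a b': suffix -> count
    nonzeros_ab = {"c": 1.0, "d": 2.0, "e": 5.0, "f": 1.0}

    #Illustrative discounts D_1, D_2, D_3+ for this order
    D = [0.588, 1.244, 2.059]

    #Bin the continuations into N_1, N_2 and N_3+ ...
    c = [0.0, 0.0, 0.0]
    for count in nonzeros_ab.values():
        c[min(int(count), 3) - 1] += 1.0

    #... then sum the mass: d = D_1*N_1 + D_2*N_2 + D_3*N_3
    d = sum(D[i] * c[i] for i in range(3))
    print(d)  #2*0.588 + 1*1.244 + 1*2.059 = 4.479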
Example #7
class MLCounter( ):
    """
      Stand-alone python implementation of a simple Maximum Likelihood LM.

      This class simply counts NGrams in a training corpus and either,
        * Dumps the raw, log_10 counts into ARPA format
        * Computes an unsmoothed Maximum Likelihood LM 
    """

    def __init__( self, order=3, sb="<s>", se="</s>", raw=False, ml=False ):
        self.sb        = sb
        self.se        = se
        self.raw       = raw
        self.ml        = ml
        self.order     = order
        self.ngrams    = NGramStack(order=order)
        self.counts    = [ defaultdict(float) for i in xrange(order) ]

    def maximum_likelihood( self, training_file ):
        """
          Iterate through the training data using a FIFO stack or 
           'window' of max-length equal to the specified N-gram order.

          Each time a new word is pushed onto the N-gram stack call
           the _ml_count() subroutine to increment the N-gram counts.

          If pushing a word onto the stack makes len(stack)>max-order, 
           then the word at the bottom (stack[0]) is popped off.
        """

        for line in open(training_file,"r"):
            #Split the current line into words.
            words = re.split(r"\s+",line.strip())

            #Push a sentence-begin token onto the stack
            ngram = self.ngrams.push(self.sb)
            self._ml_count( ngram )

            for word in words:
                #Get the current 'window' of N-grams
                ngram = self.ngrams.push(word)

                #Now count all N-grams in the current window
                #These will be of span <= self.order
                self._ml_count( ngram )

            #Now push the sentence-end token onto the stack
            ngram = self.ngrams.push(self.se)
            self._ml_count( ngram )

            #Clear the stack for the next sentence
            self.ngrams.clear()
        
        return

    def _ml_count( self, ngram_stack ):
        """
          Just count NGrams.  The only slightly confusing thing here
          is the sentence-begin (<s>).  It does NOT count as a
          unigram event and thus does not contribute to the unigram tally.
          It IS however used as a history denominator.
        """

        #Iterate backwards through the stack
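        # so that every suffix of the current window, each ending in
        # the newest word, is counted exactly once per order.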
        for o in xrange(len(ngram_stack),0,-1):
            start = len(ngram_stack)-o
            self.counts[o-1][" ".join(ngram_stack[start:])] += 1.0

        return  
        
    def print_ARPA( self ):
        """
          Print the raw counts or ML LM out in ARPA format,
           ARPA format:
           ----------------------------
             \data\
             ngram 1=NUM_1GRAMS
             ngram 2=NUM_2GRAMS
             ...
             ngram N=NUM_NGRAMS (max order)
            
             \1-grams:
             p(a_z)  a_z  bow(a_z)
             ...
            
             \2-grams:
             p(a_z)  a_z  bow(a_z)
             ...
            
             \N-grams:
             p(a_z)  a_z
             ...

             \end\
           ----------------------------
          NOTE: Neither the ML model nor the raw counts
                will ever have a 'backoff weight'.
        """

        #Handle the header info
        print "\\data\\"
        for o in xrange(0,self.order):
            print "ngram %d=%d" % (o+1,len(self.counts[o]) )

        #Handle the Unigrams
        print "\n\\1-grams:"
        for key in sorted(self.counts[0].iterkeys()):
            if key==self.sb:
                if self.raw:
                    print "0.00000\t%s"   % ( self.sb )
                else:
                    print "-99.00000\t%s" % ( self.sb )
            else:
                if self.ml:
                    denom   = sum(self.counts[0].values()) - self.counts[0][self.sb]
                    ml_prob = self.counts[0][key] / denom
                    if self.raw:
                        print "%0.6f\t%s" % ( ml_prob, key )
                    else:
                        print "%0.6f\t%s" % ( log(ml_prob,10.), key )
                elif self.raw:
                    print "%0.6f\t%s" % (self.counts[0][key], key)
                else:
                    print "%0.6f\t%s" % (log(self.counts[0][key],10.), key)

        #Handle the middle-order N-grams
        for o in xrange(1,self.order):
            print "\n\\%d-grams:" % (o+1)
            for key in sorted(self.counts[o].iterkeys()):
                if self.ml:
                    hist = key[:key.rfind(" ")]
                    ml_prob = self.counts[o][key] / self.counts[o-1][hist]
                    if self.raw:
                        print "%0.6f\t%s" % ( ml_prob, key )
                    else:
                        print "%0.6f\t%s" % ( log(ml_prob,10.), key )
                elif self.raw:
                    print "%0.6f\t%s" % (self.counts[o][key], key)
                else:
                    print "%0.6f\t%s" % (log(self.counts[o][key],10.), key)

        print "\n\\end\\"
        return
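
A hypothetical invocation of MLCounter, under the same assumptions as
the KNSmoother driver above (NGramStack available, 'train.corpus' a
placeholder):

    counter = MLCounter(order=3, ml=True)  #unsmoothed ML probabilities
    counter.maximum_likelihood("train.corpus")
    counter.print_ARPA()                   #ARPA model on stdout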
Example #8
class AbsSmoother( ):
    """
      Stand-alone python implementation of interpolated Absolute discounting.

      Intended for educational purposes, this should produce results identical
       to mitlm's 'estimate-ngram' utility,
         mitlm:
          $ estimate-ngram -o 3 -t train.corpus -s FixKN
         SimpleKN.py:
          $ SimpleKN.py -t train.corpus
    
      WARNING: This program has not been optimized in any way and will almost 
       surely be extremely slow for anything larger than a small toy corpus.

    """

    def __init__( self, order=3, sb="<s>", se="</s>" ):
        self.sb        = sb
        self.se        = se
        self.order     = order
        self.ngrams    = NGramStack(order=order)
        self.denominators = [ defaultdict(float) for i in xrange(order-1) ]
        self.numerators   = [ defaultdict(float) for i in xrange(order-1) ]
        self.nonZeros     = [ defaultdict(float) for i in xrange(order-1) ]
        self.CoC          = [ [ 0.0 for j in xrange(4) ] for i in xrange(order) ]
        self.discounts    = [ 0.0 for i in xrange(order-1) ]
        self.UD = 0.
        self.UN = defaultdict(float)

    def _compute_counts_of_counts( self ):
        """
          Compute counts-of-counts (CoC) for each N-gram order.
          Only CoC<=4 are relevant to the computation of
          ModKNFix or KNFix or Absolute discounting.
        """

        for k in self.UN:
            if self.UN[k] <= 4:
                self.CoC[0][int(self.UN[k]-1)] += 1.

        for i,dic in enumerate(self.numerators):
            for k in dic:
                if dic[k]<=4:
                    self.CoC[i+1][int(dic[k]-1)] += 1.
        return

    def _compute_discounts( self ):
        """
          Compute the discount parameters. Note that unigram counts
          are not discounted in Absolute, FixKN or FixModKN.

          ---------------------------------
          Fixed Kneser-Ney smoothing: FixKN
          ---------------------------------
          This is based on the solution described in Kneser-Ney '95, 
          and reformulated in Chen&Goodman '98.  
          
             D = N_1 / ( N_1 + 2(N_2) )

          where N_1 refers to the # of N-grams that appear exactly
          once, and N_2 refers to the number of N-grams that appear
          exactly twice.  This is computed for each order.

          NOTE: The discount formula for FixKN is identical to the
                one used for Absolute discounting.
        """

        #Uniform discount for each N-gram order
        for o in xrange(self.order-1):
            self.discounts[o] = self.CoC[o+1][0] / (self.CoC[o+1][0]+2*self.CoC[o+1][1])

        return 

    def _get_discount( self, order, ngram ):
        """
          Retrieve the pre-computed discount for this N-gram.
        """

        return self.discounts[order]

    def absolute_discounting( self, training_file ):
        """
          Iterate through the training data using a FIFO stack or 
           'window' of max-length equal to the specified N-gram order.

          Each time a new word is pushed onto the N-gram stack call
           the _abs_count() subroutine to increment the N-gram 
           contexts in the current window / on the stack. 

          If pushing a word onto the stack makes len(stack)>max-order, 
           then the word at the bottom (stack[0]) is popped off.
        """
        for line in open(training_file,"r"):
            #Split the current line into words.
            words = re.split(r"\s+",line.strip())

            #Push a sentence-begin token onto the stack
            self.ngrams.push(self.sb)

            for word in words:
                #Get the current 'window' of N-grams
                ngram = self.ngrams.push(word)

                #Now count all N-grams in the current window
                #These will be of span <= self.order
                self._abs_count( ngram, len(ngram)-2 )

            #Now push the sentence-end token onto the stack
            ngram = self.ngrams.push(self.se)
            self._abs_count( ngram, len(ngram)-2 )

            #Clear the stack for the next sentence
            self.ngrams.clear()
        self._compute_counts_of_counts ( )
        self._compute_discounts( )
        return
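    # NGramStack is not defined in this example; a minimal sketch of the
    #  assumed behavior (a bounded FIFO window over the most recent words):
    #
    #    class NGramStack():
    #        def __init__(self, order=3):
    #            self.order = order
    #            self.stack = []
    #        def push(self, word):
    #            #Append the new word; drop the oldest word if the
    #            # window now exceeds the max N-gram order
    #            self.stack.append(word)
    #            if len(self.stack) > self.order:
    #                self.stack.pop(0)
    #            return self.stack[:]
    #        def clear(self):
    #            self.stack = []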

    def _abs_count( self, ngram_stack, i ):
        """
         Absolute discount calculation recursion.
        """
        if i==-1 and ngram_stack[0]==self.sb:
            return
        o     = len(ngram_stack)
        numer = " ".join(ngram_stack[o-(i+2):])
        denom = " ".join(ngram_stack[o-(i+2):o-1])
        self.numerators[  i][numer] += 1.
        self.denominators[i][denom] += 1.
        if self.numerators[i][numer]==1.:
            self.nonZeros[i][denom]  += 1.
        #The ONLY difference in terms of implementation
        #  between Kneser-Ney and Absolute discounting 
        #  is the following if/else.
        #In Kneser-Ney this is nested inside of the 
        #  preceding if statement.
        if i>0:
            self._abs_count( ngram_stack, i-1 )
        else:
            #The <s> (sentence-begin) token is
            # NOT counted as a unigram event
            if not ngram_stack[-1]==self.sb:
                self.UN[ngram_stack[-1]] += 1.
                self.UD += 1.
        return  
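        # Example trace (hypothetical input): for
        #  ngram_stack=['<s>','a','b'] and i=1 this increments
        #  numerators[1]['<s> a b'] and denominators[1]['<s> a'], recurses
        #  to increment numerators[0]['a b'] and denominators[0]['a'],
        #  and finally tallies the unigram event UN['b'].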
        
    def print_ARPA( self ):
        """
          Print the interpolated Absolute discounting LM out in ARPA format,
           computing the interpolated probabilities and back-off
           weights for each N-gram on-demand.  The format:
           ----------------------------
             \data\
             ngram 1=NUM_1GRAMS
             ngram 2=NUM_2GRAMS
             ...
             ngram N=NUM_NGRAMS (max order)
            
             \1-grams:
             p(a_z)  a_z  bow(a_z)
             ...
            
             \2-grams:
             p(a_z)  a_z  bow(a_z)
             ...
            
             \N-grams:
             p(a_z)  a_z
             ...

             \end\
           ----------------------------
        """

        #Handle the header info
        print "\\data\\"
        print "ngram 1=%d" % (len(self.UN)+1)
        for o in xrange(0,self.order-1):
            print "ngram %d=%d" % (o+2,len(self.numerators[o]) )

        #Handle the Unigrams
        print "\n\\1-grams:"
        d    = self.discounts[0]
        #Abs discount
        lmda = self.nonZeros[0][self.sb] * d / self.denominators[0][self.sb]
        print "-99.00000\t%s\t%0.6f"   % ( self.sb, log(lmda, 10.) )

        for key in sorted(self.UN.iterkeys()):
            if key==self.se:
                print "%0.6f\t%s\t-99"   % ( log(self.UN[key]/self.UD, 10.), key )
                continue

            d    = self.discounts[0]
            #Abs discount
            lmda = self.nonZeros[0][key] * d / self.denominators[0][key]
            print "%0.6f\t%s\t%0.6f" % ( log(self.UN[key]/self.UD, 10.), key, log(lmda, 10.) )

        #Handle the middle-order N-grams
        for o in xrange(0,self.order-2):
            print "\n\\%d-grams:" % (o+2)
            for key in sorted(self.numerators[o].iterkeys()):
                if key.endswith(self.se):
                    #No back-off prob for N-grams ending in </s>
                    prob = self._compute_interpolated_prob( key )
                    print "%0.6f\t%s" % ( log(prob, 10.), key )
                    continue
                d = self.discounts[o+1]
                #Compute the back-off weight
                #Abs discount
                lmda = self.nonZeros[o+1][key] * d / self.denominators[o+1][key]
                #Compute the interpolated N-gram probability
                prob = self._compute_interpolated_prob( key )
                print "%0.6f\t%s\t%0.6f" % ( log(prob, 10.), key, log(lmda, 10.))

        #Handle the N-order N-grams
        print "\n\\%d-grams:" % (self.order)
        for key in sorted(self.numerators[self.order-2].iterkeys()):
            #Compute the interpolated N-gram probability
            prob = self._compute_interpolated_prob( key )
            print "%0.6f\t%s" % ( log(prob, 10.), key )

        print "\n\\end\\"
        return


    def _compute_interpolated_prob( self, ngram ):
        """
          Compute the interpolated probability for the input ngram.
          Cribbing the notation from the SRILM webpages,

             a_z    = An N-gram where a is the first word, z is the 
                       last word, and "_" represents 0 or more words in between.
             p(a_z) = The estimated conditional probability of the 
                       nth word z given the first n-1 words (a_) of an N-gram.
             a_     = The n-1 word prefix of the N-gram a_z.
             _z     = The n-1 word suffix of the N-gram a_z.

          Then we have, 
             f(a_z) = g(a_z) + bow(a_) p(_z)
             p(a_z) = (c(a_z) > 0) ? f(a_z) : bow(a_) p(_z)

          The ARPA format is generated by writing, for each N-gram
          of order < max_order:
             p(a_z)    a_z   bow(a_z)

          and for the maximum order:
             p(a_z)    a_z

          Special care must be taken for certain N-grams containing
           the <s> (sentence-begin) and </s> (sentence-end) tokens.  
          See the implementation for details on how to do this correctly.

          The formulation is based on the seminal Chen&Goodman '98 paper.

          SRILM notation-cribbing from:
             http://www.speech.sri.com/projects/srilm/manpages/ngram-discount.7.html
        """
        probability = 0.0
        ngram_stack = ngram.split(" ")
        probs       = [ 1e-99 for i in xrange(len(ngram_stack)) ]
        o           = len(ngram_stack)

        if not ngram_stack[-1]==self.sb:
            probs[0] = self.UN[ngram_stack[-1]] / self.UD

        for i in xrange(o-1):
            dID = " ".join(ngram_stack[o-(i+2):o-1])
            nID = " ".join(ngram_stack[o-(i+2):])
            if dID in self.denominators[i]:
                d = self.discounts[i]
                if nID in self.numerators[i]:
                    #We have an actual N-gram probability for this N-gram
                    #KN discount
                    probs[i+1] = (self.numerators[i][nID]-d)/self.denominators[i][dID]
                else:
                    #No actual N-gram prob, we will have to back-off
                    probs[i+1] = 0.0
                #This break-down takes the following form:
                #  probs[i+1]:  The interpolated N-gram probability, p(a_z)
                #  lmda:        The un-normalized 'back-off' weight, bow(a_)
                #  probs[i]:    The next lower-order, interpolated N-gram 
                #               probability corresponding to p(_z)
                #KN discount
                lmda        = self.nonZeros[i][dID] * d / self.denominators[i][dID]
                probs[i+1]  = probs[i+1] + lmda * probs[i]
                probability = probs[i+1]

        if probability == 0.0:
            #If we still have nothing, return the unigram probability
            probability = probs[0]

        return probability
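
A minimal usage sketch for the KNSmoother class above (assuming an
NGramStack implementation and a whitespace-tokenized train.corpus file,
neither of which is shown in this example):

    smoother = KNSmoother( order=3 )
    smoother.absolute_discounting( "train.corpus" )
    smoother.print_ARPA( )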
Example #12
class MLCounter():
    """
      Stand-alone python implementation of a simple Maximum Likelihood LM.

      This class simply counts NGrams in a training corpus and either:
        * Dumps the raw or log_10 counts in ARPA format
        * Computes an unsmoothed Maximum Likelihood LM
    """
    def __init__(self, order=3, sb="<s>", se="</s>", raw=False, ml=False):
        self.sb = sb
        self.se = se
        self.raw = raw
        self.ml = ml
        self.order = order
        self.ngrams = NGramStack(order=order)
        self.counts = [defaultdict(float) for i in xrange(order)]

    def maximum_likelihood(self, training_file):
        """
          Iterate through the training data using a FIFO
           'window' of max-length equal to the specified N-gram order.

          Each time a new word is pushed onto the N-gram stack call
           the _ml_count() subroutine to increment the N-gram counts.

          If pushing a word onto the stack makes len(stack)>max-order, 
           then the word at the bottom (stack[0]) is popped off.
        """

        for line in open(training_file, "r"):
            #Split the current line into words.
            words = re.split(r"\s+", line.strip())

            #Push a sentence-begin token onto the stack
            ngram = self.ngrams.push(self.sb)
            self._ml_count(ngram)

            for word in words:
                #Get the current 'window' of N-grams
                ngram = self.ngrams.push(word)

                #Now count all N-grams in the current window
                #These will be of span <= self.order
                self._ml_count(ngram)

            #Now push the sentence-end token onto the stack
            ngram = self.ngrams.push(self.se)
            self._ml_count(ngram)

            #Clear the stack for the next sentence
            self.ngrams.clear()

        return

    def _ml_count(self, ngram_stack):
        """
          Just count NGrams.  The only slightly confusing thing here
          is the sentence-begin (<s>).  It does NOT count as a
          unigram event: it is tallied below, but it is subtracted back
          out of the unigram total when the ML probabilities are
          computed in print_ARPA().  It IS however used as a history
          denominator.
        """

        #Iterate backwards through the stack
        for o in xrange(len(ngram_stack), 0, -1):
            start = len(ngram_stack) - o
            self.counts[o - 1][" ".join(ngram_stack[start:])] += 1.0

        return
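        # Example (hypothetical input): for ngram_stack=['<s>','a','b']
        #  this increments counts[2]['<s> a b'], counts[1]['a b'] and
        #  counts[0]['b'], i.e. every suffix of the current window that
        #  ends in the newest word.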

    def print_ARPA(self):
        """
          Print the raw counts or ML LM out in ARPA format,
           ARPA format:
           ----------------------------
             \data\
             ngram 1=NUM_1GRAMS
             ngram 2=NUM_2GRAMS
             ...
             ngram N=NUM_NGRAMS (max order)
            
             \1-grams:
             p(a_z)  a_z  bow(a_z)
             ...
            
             \2-grams:
             p(a_z)  a_z  bow(a_z)
             ...
            
             \N-grams:
             p(a_z)  a_z
             ...

             \end\
           ----------------------------
          NOTE: Neither the ML model nor the raw counts
                will ever have a 'backoff weight'.
        """

        #Handle the header info
        print "\\data\\"
        for o in xrange(0, self.order):
            print "ngram %d=%d" % (o + 1, len(self.counts[o]))

        #Handle the Unigrams
        print "\n\\1-grams:"
        for key in sorted(self.counts[0].iterkeys()):
            if key == self.sb:
                if self.raw:
                    print "0.00000\t%s" % (self.sb)
                else:
                    print "-99.00000\t%s" % (self.sb)
            else:
                if self.ml:
                    ml_prob = self.counts[0][key] / (
                        sum([self.counts[0][c]
                             for c in self.counts[0].keys()]) -
                        self.counts[0][self.sb])
                    if self.raw:
                        print "%0.6f\t%s" % (ml_prob, key)
                    else:
                        print "%0.6f\t%s" % (log(ml_prob, 10.), key)
                elif self.raw:
                    print "%0.6f\t%s" % (self.counts[0][key], key)
                else:
                    print "%0.6f\t%s" % (log(self.counts[0][key], 10.), key)

        #Handle the middle-order N-grams
        for o in xrange(1, self.order):
            print "\n\\%d-grams:" % (o + 1)
            for key in sorted(self.counts[o].iterkeys()):
                if self.ml:
                    hist = key[:key.rfind(" ")]
                    ml_prob = self.counts[o][key] / self.counts[o - 1][hist]
                    if self.raw:
                        print "%0.6f\t%s" % (ml_prob, key)
                    else:
                        print "%0.6f\t%s" % (log(ml_prob, 10.), key)
                elif self.raw:
                    print "%0.6f\t%s" % (self.counts[o][key], key)
                else:
                    print "%0.6f\t%s" % (log(self.counts[o][key], 10.), key)

        print "\n\\end\\"
        return
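
A minimal usage sketch for the MLCounter class above (assuming an
NGramStack implementation and a train.corpus file, neither of which is
shown in this example).  With ml=True it prints the unsmoothed ML model;
with raw=True it dumps the counts instead:

    counter = MLCounter( order=3, ml=True )
    counter.maximum_likelihood( "train.corpus" )
    counter.print_ARPA( )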
Example #13
class ModKNSmoother( ):
    """
      Stand-alone python implementation of Fixed Modified Kneser-Ney discounting.

      Intended for educational purposes, this should produce results identical
       to Google NGramLibrary tools with ngrammake --bins=3.  See the included 
       run-NGramLibrary.sh script to train a model for comparison.
    
      WARNING: This may be slow for very large corpora.

    """

    def __init__( self, order=3, sb="<s>", se="</s>" ):
        self.sb        = sb
        self.se        = se
        self.order     = order
        self.ngrams    = NGramStack(order=order)
        self.denominators = [ defaultdict(float) for i in xrange(order-1) ]
        self.numerators   = [ defaultdict(float) for i in xrange(order-1) ]
        #Modified Kneser-Ney requires that we track the individual N_i
        # in contrast to Kneser-Ney, which just requires the sum-total.
        self.nonZeros     = [ defaultdict(lambda: defaultdict(float)) for i in xrange(order-1) ]
        self.CoC          = [ [ 0.0 for j in xrange(4) ] for i in xrange(order) ]
        self.discounts    = [ [ 0.0 for j in xrange(3) ] for i in xrange(order-1) ]
        self.UD = 0.
        self.UN = defaultdict(float)

    def _compute_counts_of_counts( self ):
        """
          Compute counts-of-counts (CoC) for each N-gram order.
          Only CoC<=4 are relevant to the computation of
          either ModKNFix or KNFix.
        """

        for k in self.UN:
            if self.UN[k] <= 4:
                self.CoC[0][int(self.UN[k]-1)] += 1.

        for i,dic in enumerate(self.numerators):
            for k in dic:
                if dic[k]<=4:
                    self.CoC[i+1][int(dic[k]-1)] += 1.
        return

    def _compute_discounts( self ):
        """
          Compute the discount parameters. Note that unigram counts
          are not discounted in either FixKN or FixModKN.

          -----------------------------------
          Fixed Modified Kneser-Ney: FixModKN
          -----------------------------------
          This is the solution proposed by Chen&Goodman '98:

             Y    = N_1 / (N_1 + 2*N_2)
             D_1  = 1 - 2*Y * (N_2 / N_1)
             D_2  = 2 - 3*Y * (N_3 / N_2)
             D_3+ = 3 - 4*Y * (N_4 / N_3)

          where N_i again refers to the number of N-grams that appear
          exactly 'i' times in the training data.  The discount applied
          to an N-gram is selected according to its own count: if the
          current N-gram, 'a b c', was seen exactly two times in the
          training corpus, then discount D_2 would be applied; D_3+
          covers all counts of three or more.

        """

        for o in xrange(self.order-1):
            Y = self.CoC[o+1][0] / (self.CoC[o+1][0]+2*self.CoC[o+1][1])
            #Compute all the D_i based on the formula
            for i in xrange(3):
                if self.CoC[o+1][i]>0:
                    self.discounts[o][i] = (i+1) - (i+2)*Y * (self.CoC[o+1][i+1]/self.CoC[o+1][i])
                else:
                    self.discounts[o][i] = (i+1)

        return 
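        # Worked example (hypothetical bigram counts-of-counts
        #  N_1=100, N_2=40, N_3=20, N_4=10):
        #    Y    = 100/(100 + 2*40)       = 0.5556
        #    D_1  = 1 - 2*0.5556*(40/100)  = 0.5556
        #    D_2  = 2 - 3*0.5556*(20/40)   = 1.1667
        #    D_3+ = 3 - 4*0.5556*(10/20)   = 1.8889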

    def _get_discount( self, order, ngram ):
        """
          Compute the discount mass for this N-gram, based on 
           the precomputed D_i and individual N_i.
        """

        c  = [0.0, 0.0, 0.0]
                    
        for key in self.nonZeros[order][ngram]:
            if int(self.nonZeros[order][ngram][key])==1:
                c[0] += 1.
            elif int(self.nonZeros[order][ngram][key])==2:
                c[1] += 1.
            else:
                c[2] += 1.

        #Compute the discount mass by summing over the D_i*N_i 
        d = sum([ self.discounts[order][i]*c[i] for i in xrange(len(c)) ])
        return d
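        # Example (hypothetical counts): if the history 'a b' has five
        #  distinct successors, three seen once, one seen twice and one
        #  seen three or more times, then the discount mass is
        #  d = D_1*3 + D_2*1 + D_3*1.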

    def kneser_ney_from_counts( self, arpa_file ):
        """
          Train the KN-discount language model from an ARPA format 
          file containing raw count data.  This can be generated with,
            $ ./SimpleCount.py --train train.corpus -r > counts.arpa

        """

        m_ord = c_ord = 0

        for line in open(arpa_file, "r"):
            ngram, count = line.strip().split("\t")
            count = float(count)
            ngram = ngram.split(" ")
            if len(ngram)==2:
                self.UD += 1.0
                self.UN[" ".join(ngram[1:])] += 1.0
                #Nonzeros based on suffixes
                if ngram[0]==self.sb:
                    self.nonZeros[len(ngram)-2][" ".join(ngram[:-1])][ngram[-1]] += count
                    self.numerators[len(ngram)-2][" ".join(ngram)] += count
                    self.denominators[len(ngram)-2][" ".join(ngram[:-1])] += count

            if len(ngram)>2 and len(ngram)<self.order:
                self.numerators[len(ngram)-3][" ".join(ngram[1:])] += 1.0
                self.denominators[len(ngram)-3][" ".join(ngram[1:-1])] += 1.0
                self.nonZeros[len(ngram)-3][" ".join(ngram[1:-1])][ngram[-1]] += 1.0
                if ngram[0]==self.sb:
                    self.numerators[len(ngram)-2][" ".join(ngram)] += count
                    self.denominators[len(ngram)-2][" ".join(ngram[:-1])] += count
                    self.nonZeros[len(ngram)-2][" ".join(ngram[:-1])][ngram[-1]] += count

            if len(ngram)==self.order:
                self.numerators[len(ngram)-3][" ".join(ngram[1:])] += 1.0
                self.numerators[len(ngram)-2][" ".join(ngram)] = count
                self.denominators[len(ngram)-3][" ".join(ngram[1:-1])] += 1.0
                self.denominators[len(ngram)-2][" ".join(ngram[:-1])] += count
                self.nonZeros[len(ngram)-3][" ".join(ngram[1:-1])][ngram[-1]] += 1.0
                self.nonZeros[len(ngram)-2][" ".join(ngram[:-1])][ngram[-1]] += count

        self._compute_counts_of_counts ( )
        self._compute_discounts( )

        #self._print_raw_counts( )
        return
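        # Parsing note: each input line is assumed to be a single
        #  tab-separated "<ngram>\t<count>" pair, e.g. (hypothetical)
        #  "a b\t12.0"; any ARPA header or section lines in the counts
        #  file would need to be stripped before this parse.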

    def kneser_ney_discounting( self, training_file ):
        """
          Iterate through the training data using a FIFO
           'window' of max-length equal to the specified N-gram order.

          Each time a new word is pushed onto the N-gram stack call
           the _kn_recurse() subroutine to increment the N-gram 
           contexts in the current window / on the stack. 

          If pushing a word onto the stack makes len(stack)>max-order, 
           then the word at the bottom (stack[0]) is popped off.
        """

        for line in open(training_file,"r"):
            #Split the current line into words.
            words = re.split(r"\s+",line.strip())

            #Push a sentence-begin token onto the stack
            self.ngrams.push(self.sb)

            for word in words:
                #Get the current 'window' of N-grams
                ngram = self.ngrams.push(word)

                #Now count all N-grams in the current window
                #These will be of span <= self.order
                self._kn_recurse( ngram, len(ngram)-2 )

            #Now push the sentence-end token onto the stack
            ngram = self.ngrams.push(self.se)
            self._kn_recurse( ngram, len(ngram)-2 )

            #Clear the stack for the next sentence
            self.ngrams.clear()
        self._compute_counts_of_counts ( )
        self._compute_discounts( )

        return


    def print_raw_counts( self ):
        """
          Convenience function for sanity checking the history counts.
        """
        #print "NUMERATORS:"
        #for key in sorted(self.UN.iterkeys()):
        #    print " ", key, self.UN[key]
        #for o in xrange(len(self.numerators)):
        #    print "ORD",o
        #    for key in sorted(self.numerators[o].iterkeys()):
        #        print " ", key, self.numerators[o][key]
        #print "DENOMINATORS:"
        #print self.UD
        #for o in xrange(len(self.denominators)):
        #    print "DORD", o
        #    for key in sorted(self.denominators[o].iterkeys()):
        #        print " ", key, self.denominators[o][key]
        print "NONZEROS:"
        for o in xrange(len(self.nonZeros)):
            print "ZORD", o
            for denom in sorted(self.nonZeros[o].iterkeys()):
                print " Den:", denom
                for key in sorted(self.nonZeros[o][denom].iterkeys()):
                    print "   ", key, self.nonZeros[o][denom][key]


    def _kn_recurse( self, ngram_stack, i ):
        """
         Kneser-Ney discount calculation recursion.
        """

        if i==-1 and ngram_stack[0]==self.sb:
            return

        o     = len(ngram_stack)
        numer = " ".join(ngram_stack[o-(i+2):])
        denom = " ".join(ngram_stack[o-(i+2):o-1])
        self.numerators[  i][numer] += 1.
        self.denominators[i][denom] += 1.
 
        #For Modified Kneser-Ney we need to track 
        # individual nonZeros based on their suffixes
        self.nonZeros[i][denom][ngram_stack[-1]] += 1.
        if self.numerators[i][numer]==1.:
            if i>0:
                self._kn_recurse( ngram_stack, i-1 )
            else:
                #The <s> (sentence-begin) token is
                # NOT counted as a unigram event
                if not ngram_stack[-1]==self.sb:
                    self.UN[ngram_stack[-1]] += 1.
                    self.UD += 1.
        return  
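        # Note: unlike the _abs_count() routine in the Absolute-
        #  discounting example above, the recursion to the next lower
        #  order only fires the first time a numerator N-gram is seen;
        #  this type-counting is what distinguishes Kneser-Ney from
        #  Absolute discounting.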
        
    def print_ARPA( self ):
        """
          Print the interpolated Kneser-Ney LM out in ARPA format,
           computing the interpolated probabilities and back-off
           weights for each N-gram on-demand.  The format:
           ----------------------------
             \data\
             ngram 1=NUM_1GRAMS
             ngram 2=NUM_2GRAMS
             ...
             ngram N=NUM_NGRAMS (max order)
            
             \1-grams:
             p(a_z)  a_z  bow(a_z)
             ...
            
             \2-grams:
             p(a_z)  a_z  bow(a_z)
             ...
            
             \N-grams:
             p(a_z)  a_z
             ...

             \end\
           ----------------------------
        """

        #Handle the header info
        print "\\data\\"
        print "ngram 1=%d" % (len(self.UN)+1)
        for o in xrange(0,self.order-1):
            print "ngram %d=%d" % (o+2,len(self.numerators[o]) )

        #Handle the Unigrams
        print "\n\\1-grams:"
        d    = self._get_discount( 0, self.sb )
        #ModKN discount
        lmda = d / self.denominators[0][self.sb]
        print "-99.00000\t%s\t%0.7f"   % ( self.sb, log(lmda, 10.) )

        for key in sorted(self.UN.iterkeys()):
            if key==self.se:
                print "%0.7f\t%s\t-99"   % ( log(self.UN[key]/self.UD, 10.), key )
                continue

            d    = self._get_discount( 0, key )
            #ModKN discount
            lmda = d / self.denominators[0][key]
            print "%0.7f\t%s\t%0.7f" % ( log(self.UN[key]/self.UD, 10.), key, log(lmda, 10.) )

        #Handle the middle-order N-grams
        for o in xrange(0,self.order-2):
            print "\n\\%d-grams:" % (o+2)
            for key in sorted(self.numerators[o].iterkeys()):
                if key.endswith(self.se):
                    #No back-off prob for N-grams ending in </s>
                    prob = self._compute_interpolated_prob( key )
                    print "%0.7f\t%s" % ( log(prob, 10.), key )
                    continue
                d = self._get_discount( o+1, key )
                #Compute the back-off weight
                #ModKN discount
                lmda  = d / self.denominators[o+1][key]
                #Compute the interpolated N-gram probability
                prob = self._compute_interpolated_prob( key )
                print "%0.7f\t%s\t%0.7f" % ( log(prob, 10.), key, log(lmda, 10.))

        #Handle the N-order N-grams
        print "\n\\%d-grams:" % (self.order)
        for key in sorted(self.numerators[self.order-2].iterkeys()):
            #Compute the interpolated N-gram probability
            prob = self._compute_interpolated_prob( key )
            print "%0.7f\t%s" % ( log(prob, 10.), key )

        print "\n\\end\\"
        return


    def _compute_interpolated_prob( self, ngram ):
        """
          Compute the interpolated probability for the input ngram.
          Cribbing the notation from the SRILM webpages,

             a_z    = An N-gram where a is the first word, z is the 
                       last word, and "_" represents 0 or more words in between.
             p(a_z) = The estimated conditional probability of the 
                       nth word z given the first n-1 words (a_) of an N-gram.
             a_     = The n-1 word prefix of the N-gram a_z.
             _z     = The n-1 word suffix of the N-gram a_z.

          Then we have, 
             f(a_z) = g(a_z) + bow(a_) p(_z)
             p(a_z) = (c(a_z) > 0) ? f(a_z) : bow(a_) p(_z)

          The ARPA format is generated by writing, for each N-gram
          of order < max_order:
             p(a_z)    a_z   bow(a_z)

          and for the maximum order:
             p(a_z)    a_z

          Special care must be taken for certain N-grams containing
           the <s> (sentence-begin) and </s> (sentence-end) tokens.  
          See the implementation for details on how to do this correctly.

          The formulation is based on the seminal Chen&Goodman '98 paper.

          SRILM notation-cribbing from:
             http://www.speech.sri.com/projects/srilm/manpages/ngram-discount.7.html
        """
        probability = 0.0
        ngram_stack = ngram.split(" ")
        probs       = [ 1e-99 for i in xrange(len(ngram_stack)) ]
        o           = len(ngram_stack)

        if not ngram_stack[-1]==self.sb:
            probs[0] = self.UN[ngram_stack[-1]] / self.UD

        for i in xrange(o-1):
            dID = " ".join(ngram_stack[o-(i+2):o-1])
            nID = " ".join(ngram_stack[o-(i+2):])
            if dID in self.denominators[i]:
                count = int(self.numerators[i][nID])
                d_i   = min(count, 3)
                probs[i+1] = (count-self.discounts[i][d_i-1])/self.denominators[i][dID]

                #This break-down takes the following form:
                #  probs[i+1]:  The interpolated N-gram probability, p(a_z)
                #  d:           The discount mass for a_: \Sum_i D_i*N_i
                #  lmda:        The un-normalized 'back-off' weight, bow(a_)
                #  probs[i]:    The next lower-order, interpolated N-gram 
                #               probability corresponding to p(_z)
                #ModKN discount
                d = self._get_discount( i, dID )
                lmda        = d / self.denominators[i][dID]
                probs[i+1]  = probs[i+1] + lmda * probs[i]
                probability = probs[i+1]

        if probability == 0.0:
            #If we still have nothing, return the unigram probability
            probability = probs[0]

        return probability
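
A minimal usage sketch for the ModKNSmoother class above (assuming an
NGramStack implementation and a train.corpus file, neither of which is
shown in this example):

    smoother = ModKNSmoother( order=3 )
    smoother.kneser_ney_discounting( "train.corpus" )
    smoother.print_ARPA( )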