Example #1
 def __init__(self, order=3, sb="<s>", se="</s>", raw=False, ml=False):
     self.sb = sb
     self.se = se
     self.raw = raw
     self.ml = ml
     self.order = order
     self.ngrams = NGramStack(order=order)
     self.counts = [defaultdict(float) for i in xrange(order)]
Example #2
 def __init__(self, order=3, sb="<s>", se="</s>"):
     self.sb = sb
     self.se = se
     self.order = order
     self.ngrams = NGramStack(order=order)
     self.denominators = [defaultdict(float) for i in xrange(order - 1)]
     self.numerators = [defaultdict(float) for i in xrange(order - 1)]
     self.nonZeros = [defaultdict(float) for i in xrange(order - 1)]
     self.CoC = [[0.0 for j in xrange(4)] for i in xrange(order)]
     self.discounts = [0.0 for i in xrange(order - 1)]
     self.UD = 0.
     self.UN = defaultdict(float)
Example #3
 def __init__(self, order=3, sb="<s>", se="</s>"):
     self.sb = sb
     self.se = se
     self.order = order
     self.ngrams = NGramStack(order=order)
     self.denominators = [defaultdict(float) for i in xrange(order - 1)]
     self.numerators = [defaultdict(float) for i in xrange(order - 1)]
     #Modified Kneser-Ney requires that we track the individual N_i
     # in contrast to Kneser-Ney, which just requires the sum-total.
     self.nonZeros = [
         defaultdict(lambda: defaultdict(float)) for i in xrange(order - 1)
     ]
     self.CoC = [[0.0 for j in xrange(4)] for i in xrange(order)]
     self.discounts = [[0.0 for j in xrange(3)] for i in xrange(order - 1)]
     self.UD = 0.
     self.UN = defaultdict(float)
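
The nested defaultdict above is the one structural change Modified
Kneser-Ney needs: instead of a single running total per history, it
keeps a per-suffix tally so the N_1, N_2 and N_3+ bins can be
recovered later. A minimal sketch of the idea (toy data, standard
library only):

    from collections import defaultdict

    nonzeros = defaultdict(lambda: defaultdict(float))
    for history, word in [("a b", "c"), ("a b", "c"), ("a b", "d")]:
        nonzeros[history][word] += 1.0

    #Per-suffix counts survive: {'c': 2.0, 'd': 1.0}
    print(dict(nonzeros["a b"]))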
Example #4
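#Assumed imports from the original module (not shown in this excerpt):
# import re
# from math import log
# from collections import defaultdict
#plus the project-local NGramStack helper class.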
class KNSmoother( ):
    """
      Stand-alone python implementation of interpolated Fixed Kneser-Ney discounting.

      Intended for educational purposes, this should produce results identical
       to mitlm's 'estimate-ngram' utility,
         mitlm:
          $ estimate-ngram -o 3 -t train.corpus -s FixKN
         SimpleKN.py:
          $ SimpleKN.py -t train.corpus
    
      WARNING: This program has not been optimized in any way and will almost 
       surely be extremely slow for anything larger than a small toy corpus.

    """

    def __init__( self, order=3, sb="<s>", se="</s>" ):
        self.sb        = sb
        self.se        = se
        self.order     = order
        self.ngrams    = NGramStack(order=order)
        self.denominators = [ defaultdict(float) for i in xrange(order-1) ]
        self.numerators   = [ defaultdict(float) for i in xrange(order-1) ]
        self.nonZeros     = [ defaultdict(float) for i in xrange(order-1) ]
        self.CoC          = [ [ 0.0 for j in xrange(4) ] for i in xrange(order) ]
        self.discounts    = [ 0.0 for i in xrange(order-1) ]
        self.UD = 0.
        self.UN = defaultdict(float)

    def _compute_counts_of_counts( self ):
        """
          Compute counts-of-counts (CoC) for each N-gram order.
          Only CoC<=4 are relevant to the computation of
          either ModKNFix or KNFix.
        """

        for k in self.UN:
            if self.UN[k] <= 4:
                self.CoC[0][int(self.UN[k]-1)] += 1.

        for i,dic in enumerate(self.numerators):
            for k in dic:
                if dic[k]<=4:
                    self.CoC[i+1][int(dic[k]-1)] += 1.
        return

    def _compute_discounts( self ):
        """
          Compute the discount parameters. Note that unigram counts
          are not discounted in either FixKN or FixModKN.

          ---------------------------------
          Fixed Kneser-Ney smoothing: FixKN
          ---------------------------------
          This is based on the solution described in Kneser-Ney '95, 
          and reformulated in Chen&Goodman '98.  
          
             D = N_1 / ( N_1 + 2(N_2) )

          where N_1 refers to the # of N-grams that appear exactly
          once, and N_2 refers to the number of N-grams that appear
          exactly twice.  This is computed for each order.

          NOTE: The discount formula for FixKN is identical to the
                one used for Absolute discounting.
        """

        #Uniform discount for each N-gram order
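        #e.g. N_1=200, N_2=70 gives D = 200/(200+140) ~= 0.588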
        for o in xrange(self.order-1):
            self.discounts[o] = self.CoC[o+1][0] / (self.CoC[o+1][0]+2*self.CoC[o+1][1])

        return 

    def _get_discount( self, order, ngram ):
        """
          Retrieve the pre-computed discount for this N-gram.
        """

        return self.discounts[order]

    def kneser_ney_from_counts( self, arpa_file ):
        """
          Train the KN-discount language model from an ARPA format 
          file containing raw count data.  This can be generated with,
            $ ./SimpleCount.py --train train.corpus -r > counts.arpa

        """

        m_ord = c_ord = 0

        for line in open(arpa_file, "r"):
            ngram, count = line.strip().split("\t")
            count = float(count)

            ngram = ngram.split(" ")
            if len(ngram)==2:
                self.UD += 1.0
                self.UN[" ".join(ngram[1:])] += 1.0
                self.nonZeros[len(ngram)-2][" ".join(ngram[:-1])] += 1.0
                if ngram[0]==self.sb:
                    self.numerators[len(ngram)-2][" ".join(ngram)] += count
                    self.denominators[len(ngram)-2][" ".join(ngram[:-1])] += count

            if len(ngram)>2 and len(ngram)<self.order:
                self.numerators[len(ngram)-3][" ".join(ngram[1:])] += 1.0
                self.denominators[len(ngram)-3][" ".join(ngram[1:-1])] += 1.0
                self.nonZeros[len(ngram)-2][" ".join(ngram[:-1])] += 1.0
                if ngram[0]==self.sb:
                    self.numerators[len(ngram)-2][" ".join(ngram)] += count
                    self.denominators[len(ngram)-2][" ".join(ngram[:-1])] += count

            if len(ngram)==self.order:
                self.numerators[len(ngram)-3][" ".join(ngram[1:])] += 1.0
                self.numerators[len(ngram)-2][" ".join(ngram)] = count
                self.denominators[len(ngram)-3][" ".join(ngram[1:-1])] += 1.0
                self.denominators[len(ngram)-2][" ".join(ngram[:-1])] += count
                self.nonZeros[len(ngram)-2][" ".join(ngram[:-1])] += 1.0

        self._compute_counts_of_counts ( )
        self._compute_discounts( )

        #self._print_raw_counts( )
        return

    def kneser_ney_discounting( self, training_file ):
        """
          Iterate through the training data using a FIFO stack or 
           'window' of max-length equal to the specified N-gram order.

          Each time a new word is pushed onto the N-gram stack call
           the _kn_recurse() subroutine to increment the N-gram 
           contexts in the current window / on the stack. 

          If pushing a word onto the stack makes len(stack)>max-order, 
           then the word at the bottom (stack[0]) is popped off.
        """
        for line in open(training_file,"r"):
            #Split the current line into words.
            words = re.split(r"\s+",line.strip())

            #Push a sentence-begin token onto the stack
            self.ngrams.push(self.sb)

            for word in words:
                #Get the current 'window' of N-grams
                ngram = self.ngrams.push(word)

                #Now count all N-grams in the current window
                #These will be of span <= self.order
                self._kn_recurse( ngram, len(ngram)-2 )

            #Now push the sentence-end token onto the stack
            ngram = self.ngrams.push(self.se)
            self._kn_recurse( ngram, len(ngram)-2 )

            #Clear the stack for the next sentence
            self.ngrams.clear()
        self._compute_counts_of_counts ( )
        self._compute_discounts( )

        #self._print_raw_counts( )
        return

    def _print_raw_counts( self ):
        """
          Convenience function for sanity checking the history counts.
        """
        print "NUMERATORS:"
        for key in sorted(self.UN.iterkeys()):
            print " ", key, self.UN[key]
        for o in xrange(len(self.numerators)):
            print "ORD",o
            for key in sorted(self.numerators[o].iterkeys()):
                print " ", key, self.numerators[o][key]
        print "DENOMINATORS:"
        print self.UD
        for o in xrange(len(self.denominators)):
            print "DORD", o
            for key in sorted(self.denominators[o].iterkeys()):
                print " ", key, self.denominators[o][key]
        print "NONZEROS:"
        for o in xrange(len(self.nonZeros)):
            print "ZORD", o
            for key in sorted(self.nonZeros[o].iterkeys()):
                print " ", key, self.nonZeros[o][key]

    def _kn_recurse( self, ngram_stack, i ):
        """
         Kneser-Ney discount calculation recursion.
        """
        if i==-1 and ngram_stack[0]==self.sb:
            return
        o     = len(ngram_stack)
        numer = " ".join(ngram_stack[o-(i+2):])
        denom = " ".join(ngram_stack[o-(i+2):o-1])
        self.numerators[  i][numer] += 1.
        self.denominators[i][denom] += 1.
        if self.numerators[i][numer]==1.:
            self.nonZeros[i][denom]  += 1.
            if i>0:
                self._kn_recurse( ngram_stack, i-1 )
            else:
                #The <s> (sentence-begin) token is
                # NOT counted as a unigram event
                if not ngram_stack[-1]==self.sb:
                    self.UN[ngram_stack[-1]] += 1.
                    self.UD += 1.
        return  
        
    def print_ARPA( self ):
        """
          Print the interpolated Kneser-Ney LM out in ARPA format,
           computing the interpolated probabilities and back-off
           weights for each N-gram on-demand.  The format:
           ----------------------------
             \data\
             ngram 1=NUM_1GRAMS
             ngram 2=NUM_2GRAMS
             ...
             ngram N=NUM_NGRAMS (max order)
            
             \1-grams:
             p(a_z)  a_z  bow(a_z)
             ...
            
             \2-grams:
             p(a_z)  a_z  bow(a_z)
             ...
            
             \N-grams:
             p(a_z)  a_z
             ...

             \end\
           ----------------------------

        """

        #Handle the header info
        print "\\data\\"
        print "ngram 1=%d" % (len(self.UN)+1)
        for o in xrange(0,self.order-1):
            print "ngram %d=%d" % (o+2,len(self.numerators[o]) )

        #Handle the Unigrams
        print "\n\\1-grams:"
        d    = self.discounts[0]
        #KN discount
        lmda = self.nonZeros[0][self.sb] * d / self.denominators[0][self.sb]
        print "-99.00000\t%s\t%0.6f"   % ( self.sb, log(lmda, 10.) )

        for key in sorted(self.UN.iterkeys()):
            if key==self.se:
                print "%0.6f\t%s\t-99"   % ( log(self.UN[key]/self.UD, 10.), key )
                continue

            d    = self.discounts[0]
            #KN discount
            lmda = self.nonZeros[0][key] * d / self.denominators[0][key]
            print "%0.6f\t%s\t%0.6f" % ( log(self.UN[key]/self.UD, 10.), key, log(lmda, 10.) )

        #Handle the middle-order N-grams
        for o in xrange(0,self.order-2):
            print "\n\\%d-grams:" % (o+2)
            for key in sorted(self.numerators[o].iterkeys()):
                if key.endswith(self.se):
                    #No back-off prob for N-grams ending in </s>
                    prob = self._compute_interpolated_prob( key )
                    print "%0.6f\t%s" % ( log(prob, 10.), key )
                    continue
                d = self.discounts[o+1]
                #Compute the back-off weight
                #KN discount
                lmda = self.nonZeros[o+1][key] * d / self.denominators[o+1][key]
                #Compute the interpolated N-gram probability
                prob = self._compute_interpolated_prob( key )
                print "%0.6f\t%s\t%0.6f" % ( log(prob, 10.), key, log(lmda, 10.))

        #Handle the N-order N-grams
        print "\n\\%d-grams:" % (self.order)
        for key in sorted(self.numerators[self.order-2].iterkeys()):
            #Compute the interpolated N-gram probability
            prob = self._compute_interpolated_prob( key )
            print "%0.6f\t%s" % ( log(prob, 10.), key )

        print "\n\\end\\"
        return


    def _compute_interpolated_prob( self, ngram ):
        """
          Compute the interpolated probability for the input ngram.
          Cribbing the notation from the SRILM webpages,

             a_z    = An N-gram where a is the first word, z is the 
                       last word, and "_" represents 0 or more words in between.
             p(a_z) = The estimated conditional probability of the 
                       nth word z given the first n-1 words (a_) of an N-gram.
             a_     = The n-1 word prefix of the N-gram a_z.
             _z     = The n-1 word suffix of the N-gram a_z.

          Then we have, 
             f(a_z) = g(a_z) + bow(a_) p(_z)
             p(a_z) = (c(a_z) > 0) ? f(a_z) : bow(a_) p(_z)

          The ARPA format is generated by writing, for each N-gram
          with 1 < order < max_order:
             p(a_z)    a_z   bow(a_z)

          and for the maximum order:
             p(a_z)    a_z

          Special care must be taken for certain N-grams containing
           the <s> (sentence-begin) and </s> (sentence-end) tokens.  
          See the implementation for details on how to do this correctly.

          The formulation is based on the seminal Chen&Goodman '98 paper.

          SRILM notation-cribbing from:
             http://www.speech.sri.com/projects/srilm/manpages/ngram-discount.7.html
        """
        probability = 0.0
        ngram_stack = ngram.split(" ")
        probs       = [ 1e-99 for i in xrange(len(ngram_stack)) ]
        o           = len(ngram_stack)

        if not ngram_stack[-1]==self.sb:
            probs[0] = self.UN[ngram_stack[-1]] / self.UD

        for i in xrange(o-1):
            dID = " ".join(ngram_stack[o-(i+2):o-1])
            nID = " ".join(ngram_stack[o-(i+2):])
            if dID in self.denominators[i]:
                d = self.discounts[i]
                if nID in self.numerators[i]:
                    #We have an actual N-gram probability for this N-gram
                    #KN discount
                    probs[i+1] = (self.numerators[i][nID]-d)/self.denominators[i][dID]
                else:
                    #No actual N-gram prob, we will have to back-off
                    probs[i+1] = 1e-99
                #This break-down takes the following form:
                #  probs[i+1]:  The interpolated N-gram probability, p(a_z)
                #  lmda:        The un-normalized 'back-off' weight, bow(a_)
                #  probs[i]:    The next lower-order, interpolated N-gram 
                #               probability corresponding to p(_z)
                #KN discount
                lmda = self.nonZeros[i][dID] * d / self.denominators[i][dID]
                probs[i+1]  = probs[i+1] + lmda * probs[i]
                probability = probs[i+1]

        if probability == 0.0:
            #If we still have nothing, return the unigram probability
            probability = probs[0]

        return probability
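
The fixed discount in _compute_discounts is easy to verify by hand.
A minimal sketch, using invented counts-of-counts (the CoC values
below are illustrative, not taken from any corpus):

    #CoC[o][k] = number of (o+1)-grams seen exactly k+1 times
    CoC = [[120.0, 40.0, 15.0, 8.0],   #unigrams (never discounted)
           [200.0, 70.0, 30.0, 12.0],  #bigrams
           [350.0, 90.0, 40.0, 20.0]]  #trigrams

    order = 3
    discounts = [0.0 for i in range(order - 1)]
    for o in range(order - 1):
        #D = N_1 / (N_1 + 2*N_2), one uniform discount per order
        discounts[o] = CoC[o + 1][0] / (CoC[o + 1][0] + 2 * CoC[o + 1][1])

    print(discounts)  #[0.5882..., 0.6603...]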
Example #5
class KNSmoother():
    """
      Stand-alone python implementation of interpolated Fixed Kneser-Ney discounting.

      Intended for educational purposes, this should produce results identical
       to mitlm's 'estimate-ngram' utility,
         mitlm:
          $ estimate-ngram -o 3 -t train.corpus -s FixKN
         SimpleKN.py:
          $ SimpleKN.py -t train.corpus
    
      WARNING: This program has not been optimized in any way and will almost 
       surely be extremely slow for anything larger than a small toy corpus.

    """
    def __init__(self, order=3, sb="<s>", se="</s>"):
        self.sb = sb
        self.se = se
        self.order = order
        self.ngrams = NGramStack(order=order)
        self.denominators = [defaultdict(float) for i in xrange(order - 1)]
        self.numerators = [defaultdict(float) for i in xrange(order - 1)]
        self.nonZeros = [defaultdict(float) for i in xrange(order - 1)]
        self.CoC = [[0.0 for j in xrange(4)] for i in xrange(order)]
        self.discounts = [0.0 for i in xrange(order - 1)]
        self.UD = 0.
        self.UN = defaultdict(float)

    def _compute_counts_of_counts(self):
        """
          Compute counts-of-counts (CoC) for each N-gram order.
          Only CoC<=4 are relevant to the computation of
          either ModKNFix or KNFix.
        """

        for k in self.UN:
            if self.UN[k] <= 4:
                self.CoC[0][int(self.UN[k] - 1)] += 1.

        for i, dic in enumerate(self.numerators):
            for k in dic:
                if dic[k] <= 4:
                    self.CoC[i + 1][int(dic[k] - 1)] += 1.
        return

    def _compute_discounts(self):
        """
          Compute the discount parameters. Note that unigram counts
          are not discounted in either FixKN or FixModKN.

          ---------------------------------
          Fixed Kneser-Ney smoothing: FixKN
          ---------------------------------
          This is based on the solution described in Kneser-Ney '95, 
          and reformulated in Chen&Goodman '98.  
          
             D = N_1 / ( N_1 + 2(N_2) )

          where N_1 refers to the # of N-grams that appear exactly
          once, and N_2 refers to the number of N-grams that appear
          exactly twice.  This is computed for each order.

          NOTE: The discount formula for FixKN is identical to the
                one used for Absolute discounting.
        """

        #Uniform discount for each N-gram order
        for o in xrange(self.order - 1):
            self.discounts[o] = self.CoC[o + 1][0] / (self.CoC[o + 1][0] +
                                                      2 * self.CoC[o + 1][1])

        return

    def _get_discount(self, order, ngram):
        """
          Retrieve the pre-computed discount for this N-gram.
        """

        return self.discounts[order]

    def kneser_ney_from_counts(self, arpa_file):
        """
          Train the KN-discount language model from an ARPA format 
          file containing raw count data.  This can be generated with,
            $ ./SimpleCount.py --train train.corpus -r > counts.arpa

        """

        m_ord = c_ord = 0

        for line in open(arpa_file, "r"):
            ngram, count = line.strip().split("\t")
            #In this version, possibly fractional counts
            # are simply rounded to the nearest integer.
            #The result is a "poor-man's" fractional Kneser-Ney
            # which actually works quite well in practice.
            count = round(float(count))
            if count == 0.0: continue

            ngram = ngram.split(" ")
            if len(ngram) == 2:
                self.UD += 1.0
                self.UN[" ".join(ngram[1:])] += 1.0
                self.nonZeros[len(ngram) - 2][" ".join(ngram[:-1])] += 1.0
                if ngram[0] == self.sb:
                    self.numerators[len(ngram) - 2][" ".join(ngram)] += count
                    self.denominators[len(ngram) - 2][" ".join(
                        ngram[:-1])] += count

            if len(ngram) > 2 and len(ngram) < self.order:
                self.numerators[len(ngram) - 3][" ".join(ngram[1:])] += 1.0
                self.denominators[len(ngram) - 3][" ".join(ngram[1:-1])] += 1.0
                self.nonZeros[len(ngram) - 2][" ".join(ngram[:-1])] += 1.0
                if ngram[0] == self.sb:
                    self.numerators[len(ngram) - 2][" ".join(ngram)] += count
                    self.denominators[len(ngram) - 2][" ".join(
                        ngram[:-1])] += count

            if len(ngram) == self.order:
                self.numerators[len(ngram) - 3][" ".join(ngram[1:])] += 1.0
                self.numerators[len(ngram) - 2][" ".join(ngram)] = count
                self.denominators[len(ngram) - 3][" ".join(ngram[1:-1])] += 1.0
                self.denominators[len(ngram) - 2][" ".join(
                    ngram[:-1])] += count
                self.nonZeros[len(ngram) - 2][" ".join(ngram[:-1])] += 1.0

        self._compute_counts_of_counts()
        self._compute_discounts()

        #self._print_raw_counts( )
        return

    def kneser_ney_discounting(self, training_file):
        """
          Iterate through the training data using a FIFO stack or 
           'window' of max-length equal to the specified N-gram order.

          Each time a new word is pushed onto the N-gram stack call
           the _kn_recurse() subroutine to increment the N-gram 
           contexts in the current window / on the stack. 

          If pushing a word onto the stack makes len(stack)>max-order, 
           then the word at the bottom (stack[0]) is popped off.
        """
        for line in open(training_file, "r"):
            #Split the current line into words.
            words = re.split(r"\s+", line.strip())

            #Push a sentence-begin token onto the stack
            self.ngrams.push(self.sb)

            for word in words:
                #Get the current 'window' of N-grams
                ngram = self.ngrams.push(word)

                #Now count all N-grams in the current window
                #These will be of span <= self.order
                self._kn_recurse(ngram, len(ngram) - 2)

            #Now push the sentence-end token onto the stack
            ngram = self.ngrams.push(self.se)
            self._kn_recurse(ngram, len(ngram) - 2)

            #Clear the stack for the next sentence
            self.ngrams.clear()
        self._compute_counts_of_counts()
        self._compute_discounts()

        #self._print_raw_counts()
        return

    def _print_raw_counts(self):
        """
          Convenience function for sanity checking the history counts.
        """
        print "NUMERATORS:"
        for key in sorted(self.UN.iterkeys()):
            print " ", key, self.UN[key]
        for o in xrange(len(self.numerators)):
            print "ORD", o
            for key in sorted(self.numerators[o].iterkeys()):
                print " ", key, self.numerators[o][key]
        print "DENOMINATORS:"
        print self.UD
        for o in xrange(len(self.denominators)):
            print "DORD", o
            for key in sorted(self.denominators[o].iterkeys()):
                print " ", key, self.denominators[o][key]
        print "NONZEROS:"
        for o in xrange(len(self.nonZeros)):
            print "ZORD", o
            for key in sorted(self.nonZeros[o].iterkeys()):
                print " ", key, self.nonZeros[o][key]

    def _kn_recurse(self, ngram_stack, i):
        """
         Kneser-Ney discount calculation recursion.
        """
        if i == -1 and ngram_stack[0] == self.sb:
            return
        o = len(ngram_stack)
        numer = " ".join(ngram_stack[o - (i + 2):])
        denom = " ".join(ngram_stack[o - (i + 2):o - 1])
        self.numerators[i][numer] += 1.
        self.denominators[i][denom] += 1.
        if self.numerators[i][numer] == 1.:
            self.nonZeros[i][denom] += 1.
            if i > 0:
                self._kn_recurse(ngram_stack, i - 1)
            else:
                #The <s> (sentence-begin) token is
                # NOT counted as a unigram event
                if not ngram_stack[-1] == self.sb:
                    self.UN[ngram_stack[-1]] += 1.
                    self.UD += 1.
        return

    def print_ARPA(self):
        """
          Print the interpolated Kneser-Ney LM out in ARPA format,
           computing the interpolated probabilities and back-off
           weights for each N-gram on-demand.  The format:
           ----------------------------
             \data\
             ngram 1=NUM_1GRAMS
             ngram 2=NUM_2GRAMS
             ...
             ngram N=NUM_NGRAMS (max order)
            
             \1-grams:
             p(a_z)  a_z  bow(a_z)
             ...
            
             \2-grams:
             p(a_z)  a_z  bow(a_z)
             ...
            
             \N-grams:
             p(a_z)  a_z
             ...

             \end\
           ----------------------------

           NOTE: The try:/except: blocks are not necessary for normal 
                 circumstances.  Neither raw text nor correct counts will
                 fire the exception blocks.  These are needed only when 
                 using "poor-man's" fractional Kneser-Ney, in which case
                 the rounded counts may not agree.
                 
        """

        #Handle the header info
        print "\\data\\"
        print "ngram 1=%d" % (len(self.UN) + 1)
        for o in xrange(0, self.order - 1):
            print "ngram %d=%d" % (o + 2, len(self.numerators[o]))

        #Handle the Unigrams
        print "\n\\1-grams:"
        d = self.discounts[0]
        #KN discount
        try:
            lmda = self.nonZeros[0][self.sb] * d / self.denominators[0][
                self.sb]
        except:
            lmda = 1e-99
        print "-99.00000\t%s\t%0.6f" % (self.sb, log(lmda, 10.))

        for key in sorted(self.UN.iterkeys()):
            if key == self.se:
                print "%0.6f\t%s\t-99" % (log(self.UN[key] / self.UD,
                                              10.), key)
                continue

            d = self.discounts[0]
            #KN discount
            try:
                lmda = self.nonZeros[0][key] * d / self.denominators[0][key]
            except:
                lmda = 1e-99
            print "%0.6f\t%s\t%0.6f" % (log(self.UN[key] / self.UD,
                                            10.), key, log(lmda, 10.))

        #Handle the middle-order N-grams
        for o in xrange(0, self.order - 2):
            print "\n\\%d-grams:" % (o + 2)
            for key in sorted(self.numerators[o].iterkeys()):
                if key.endswith(self.se):
                    #No back-off prob for N-grams ending in </s>
                    prob = self._compute_interpolated_prob(key)
                    try:
                        print "%0.6f\t%s" % (log(prob, 10.), key)
                    except:
                        print "%0.6f\t%s" % (log(1e-99, 10.), key)
                    continue
                d = self.discounts[o + 1]
                #Compute the back-off weight
                #KN discount
                try:
                    lmda = self.nonZeros[o + 1][key] * d / self.denominators[
                        o + 1][key]
                except:
                    lmda = 1e-99
                #Compute the interpolated N-gram probability
                prob = self._compute_interpolated_prob(key)
                if lmda == 0.: lmda = 1e-99
                prob = max(prob, 1e-99)
                try:
                    print "%0.6f\t%s\t%0.6f" % (log(prob,
                                                    10.), key, log(lmda, 10.))
                except:
                    raise ValueError, "ERROR %0.6f\t%0.6f" % (prob, lmda)

        #Handle the N-order N-grams
        print "\n\\%d-grams:" % (self.order)
        for key in sorted(self.numerators[self.order - 2].iterkeys()):
            #Compute the interpolated N-gram probability
            prob = self._compute_interpolated_prob(key)
            try:
                print "%0.6f\t%s" % (log(prob, 10.), key)
            except:
                print "%0.6f\t%s" % (log(1e-99, 10.), key)

        print "\n\\end\\"
        return

    def _compute_interpolated_prob(self, ngram):
        """
          Compute the interpolated probability for the input ngram.
          Cribbing the notation from the SRILM webpages,

             a_z    = An N-gram where a is the first word, z is the 
                       last word, and "_" represents 0 or more words in between.
             p(a_z) = The estimated conditional probability of the 
                       nth word z given the first n-1 words (a_) of an N-gram.
             a_     = The n-1 word prefix of the N-gram a_z.
             _z     = The n-1 word suffix of the N-gram a_z.

          Then we have, 
             f(a_z) = g(a_z) + bow(a_) p(_z)
             p(a_z) = (c(a_z) > 0) ? f(a_z) : bow(a_) p(_z)

          The ARPA format is generated by writing, for each N-gram
          with 1 < order < max_order:
             p(a_z)    a_z   bow(a_z)

          and for the maximum order:
             p(a_z)    a_z

          Special care must be taken for certain N-grams containing
           the <s> (sentence-begin) and </s> (sentence-end) tokens.  
          See the implementation for details on how to do this correctly.

          The formulation is based on the seminal Chen&Goodman '98 paper.

          SRILM notation-cribbing from:
             http://www.speech.sri.com/projects/srilm/manpages/ngram-discount.7.html
        """
        probability = 0.0
        ngram_stack = ngram.split(" ")
        probs = [1e-99 for i in xrange(len(ngram_stack))]
        o = len(ngram_stack)

        if not ngram_stack[-1] == self.sb:
            probs[0] = self.UN[ngram_stack[-1]] / self.UD

        for i in xrange(o - 1):
            dID = " ".join(ngram_stack[o - (i + 2):o - 1])
            nID = " ".join(ngram_stack[o - (i + 2):])
            if dID in self.denominators[i]:
                d = self.discounts[i]
                if nID in self.numerators[i]:
                    #We have an actual N-gram probability for this N-gram
                    #KN discount
                    try:
                        probs[i + 1] = (self.numerators[i][nID] -
                                        d) / self.denominators[i][dID]
                    except:
                        probs[i + 1] = 1e-99
                else:
                    #No actual N-gram prob, we will have to back-off
                    probs[i + 1] = 1e-99
                #This break-down takes the following form:
                #  probs[i+1]:  The interpolated N-gram probability, p(a_z)
                #  lmda:        The un-normalized 'back-off' weight, bow(a_)
                #  probs[i]:    The next lower-order, interpolated N-gram
                #               probability corresponding to p(_z)
                #KN discount
                try:
                    lmda = self.nonZeros[i][dID] * d / self.denominators[i][dID]
                except:
                    lmda = 1e-99
                probs[i + 1] = probs[i + 1] + lmda * probs[i]
                probability = probs[i + 1]

        if probability == 0.0:
            #If we still have nothing, return the unigram probability
            probability = probs[0]

        return probability
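
A hypothetical driver for the class above.  NGramStack and the imports
(re, log, defaultdict) come from the original module and are assumed to
be on hand; 'train.corpus' is a placeholder path:

    smoother = KNSmoother(order=3)
    #Count N-grams sentence by sentence, then estimate the discounts
    smoother.kneser_ney_discounting("train.corpus")
    #Emit the interpolated model in ARPA format on stdout
    smoother.print_ARPA()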
Example #6
class ModKNSmoother():
    """
      Stand-alone python implementation of Fixed Modified Kneser-Ney discounting.

      Intended for educational purposes, this should produce results identical
       to Google NGramLibrary tools with ngrammake --bins=3.  See the included 
       run-NGramLibrary.sh script to train a model for comparison.
    
      WARNING: This may be slow for very large corpora.

    """
    def __init__(self, order=3, sb="<s>", se="</s>"):
        self.sb = sb
        self.se = se
        self.order = order
        self.ngrams = NGramStack(order=order)
        self.denominators = [defaultdict(float) for i in xrange(order - 1)]
        self.numerators = [defaultdict(float) for i in xrange(order - 1)]
        #Modified Kneser-Ney requires that we track the individual N_i
        # in contrast to Kneser-Ney, which just requires the sum-total.
        self.nonZeros = [
            defaultdict(lambda: defaultdict(float)) for i in xrange(order - 1)
        ]
        self.CoC = [[0.0 for j in xrange(4)] for i in xrange(order)]
        self.discounts = [[0.0 for j in xrange(3)] for i in xrange(order - 1)]
        self.UD = 0.
        self.UN = defaultdict(float)

    def _compute_counts_of_counts(self):
        """
          Compute counts-of-counts (CoC) for each N-gram order.
          Only CoC<=4 are relevant to the computation of
          either ModKNFix or KNFix.
        """

        for k in self.UN:
            if self.UN[k] <= 4:
                self.CoC[0][int(self.UN[k] - 1)] += 1.

        for i, dic in enumerate(self.numerators):
            for k in dic:
                if dic[k] <= 4:
                    self.CoC[i + 1][int(dic[k] - 1)] += 1.
        return

    def _compute_discounts(self):
        """
          Compute the discount parameters. Note that unigram counts
          are not discounted in either FixKN or FixModKN.

          ---------------------------------
          Fixed Modified Kneser-Ney: FixModKN
          ---------------------------------
          This is the solution proposed by Chen&Goodman '98

             Y    = N_1 / (N_1 + 2*N_2)
             D_1  = 1 - 2*Y * (N_2 / N_1)
             D_2  = 2 - 3*Y * (N_3 / N_2)
             D_3+ = 3 - 4*Y * (N_4 / N_3)

          where N_i again refers to the number of N-grams that appear
          exactly 'i' times in the training data.  Which discount D_i
          applies is selected by the count of the current N-gram: if
          the N-gram 'a b c' was seen exactly two times in the
          training corpus, then discount D_2 would be applied.

        """

        for o in xrange(self.order - 1):
            Y = self.CoC[o + 1][0] / (self.CoC[o + 1][0] +
                                      2 * self.CoC[o + 1][1])
            #Compute all the D_i based on the formula
            for i in xrange(3):
                if self.CoC[o + 1][i] > 0:
                    self.discounts[o][i] = (i + 1) - (i + 2) * Y * (
                        self.CoC[o + 1][i + 1] / self.CoC[o + 1][i])
                else:
                    self.discounts[o][i] = (i + 1)

        return

    def _get_discount(self, order, ngram):
        """
          Compute the discount mass for this N-gram, based on 
           the precomputed D_i and individual N_i.
        """

        c = [0.0, 0.0, 0.0]
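        #c[k] bins this history's continuations by how many times each
        # was seen: exactly once, exactly twice, or three or more times.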

        for key in self.nonZeros[order][ngram]:
            if int(self.nonZeros[order][ngram][key]) == 1:
                c[0] += 1.
            elif int(self.nonZeros[order][ngram][key]) == 2:
                c[1] += 1.
            else:
                c[2] += 1.

        #Compute the discount mass by summing over the D_i*N_i
        d = sum([self.discounts[order][i] * c[i] for i in xrange(len(c))])
        return d

    def kneser_ney_from_counts(self, arpa_file):
        """
          Train the KN-discount language model from an ARPA format 
          file containing raw count data.  This can be generated with,
            $ ./SimpleCount.py --train train.corpus -r > counts.arpa

        """

        m_ord = c_ord = 0

        for line in open(arpa_file, "r"):
            ngram, count = line.strip().split("\t")
            count = float(count)
            ngram = ngram.split(" ")
            if len(ngram) == 2:
                self.UD += 1.0
                self.UN[" ".join(ngram[1:])] += 1.0
                #Nonzeros based on suffixes
                if ngram[0] == self.sb:
                    self.nonZeros[len(ngram) - 2][" ".join(
                        ngram[:-1])][ngram[-1]] += count
                    self.numerators[len(ngram) - 2][" ".join(ngram)] += count
                    self.denominators[len(ngram) - 2][" ".join(
                        ngram[:-1])] += count

            if len(ngram) > 2 and len(ngram) < self.order:
                self.numerators[len(ngram) - 3][" ".join(ngram[1:])] += 1.0
                self.denominators[len(ngram) - 3][" ".join(ngram[1:-1])] += 1.0
                self.nonZeros[len(ngram) - 3][" ".join(
                    ngram[1:-1])][ngram[-1]] += 1.0
                if ngram[0] == self.sb:
                    self.numerators[len(ngram) - 2][" ".join(ngram)] += count
                    self.denominators[len(ngram) - 2][" ".join(
                        ngram[:-1])] += count
                    self.nonZeros[len(ngram) - 2][" ".join(
                        ngram[:-1])][ngram[-1]] += count

            if len(ngram) == self.order:
                self.numerators[len(ngram) - 3][" ".join(ngram[1:])] += 1.0
                self.numerators[len(ngram) - 2][" ".join(ngram)] = count
                self.denominators[len(ngram) - 3][" ".join(ngram[1:-1])] += 1.0
                self.denominators[len(ngram) - 2][" ".join(
                    ngram[:-1])] += count
                self.nonZeros[len(ngram) - 3][" ".join(
                    ngram[1:-1])][ngram[-1]] += 1.0
                self.nonZeros[len(ngram) - 2][" ".join(
                    ngram[:-1])][ngram[-1]] += count

        self._compute_counts_of_counts()
        self._compute_discounts()

        #self._print_raw_counts( )
        return

    def kneser_ney_discounting(self, training_file):
        """
          Iterate through the training data using a FIFO stack or 
           'window' of max-length equal to the specified N-gram order.

          Each time a new word is pushed onto the N-gram stack call
           the _kn_recurse() subroutine to increment the N-gram 
           contexts in the current window / on the stack. 

          If pushing a word onto the stack makes len(stack)>max-order, 
           then the word at the bottom (stack[0]) is popped off.
        """

        for line in open(training_file, "r"):
            #Split the current line into words.
            words = re.split(r"\s+", line.strip())

            #Push a sentence-begin token onto the stack
            self.ngrams.push(self.sb)

            for word in words:
                #Get the current 'window' of N-grams
                ngram = self.ngrams.push(word)

                #Now count all N-grams in the current window
                #These will be of span <= self.order
                self._kn_recurse(ngram, len(ngram) - 2)

            #Now push the sentence-end token onto the stack
            ngram = self.ngrams.push(self.se)
            self._kn_recurse(ngram, len(ngram) - 2)

            #Clear the stack for the next sentence
            self.ngrams.clear()
        self._compute_counts_of_counts()
        self._compute_discounts()

        return

    def print_raw_counts(self):
        """
          Convenience function for sanity checking the history counts.
        """
        #print "NUMERATORS:"
        #for key in sorted(self.UN.iterkeys()):
        #    print " ", key, self.UN[key]
        #for o in xrange(len(self.numerators)):
        #    print "ORD",o
        #    for key in sorted(self.numerators[o].iterkeys()):
        #        print " ", key, self.numerators[o][key]
        #print "DENOMINATORS:"
        #print self.UD
        #for o in xrange(len(self.denominators)):
        #    print "DORD", o
        #    for key in sorted(self.denominators[o].iterkeys()):
        #        print " ", key, self.denominators[o][key]
        print "NONZEROS:"
        for o in xrange(len(self.nonZeros)):
            print "ZORD", o
            for denom in sorted(self.nonZeros[o].iterkeys()):
                print " Den:", denom
                for key in sorted(self.nonZeros[o][denom].iterkeys()):
                    print "   ", key, self.nonZeros[o][denom][key]

    def _kn_recurse(self, ngram_stack, i):
        """
         Kneser-Ney discount calculation recursion.
        """

        if i == -1 and ngram_stack[0] == self.sb:
            return

        o = len(ngram_stack)
        numer = " ".join(ngram_stack[o - (i + 2):])
        denom = " ".join(ngram_stack[o - (i + 2):o - 1])
        self.numerators[i][numer] += 1.
        self.denominators[i][denom] += 1.

        #For Modified Kneser-Ney we need to track
        # individual nonZeros based on their suffixes
        self.nonZeros[i][denom][ngram_stack[-1]] += 1.
        if self.numerators[i][numer] == 1.:
            if i > 0:
                self._kn_recurse(ngram_stack, i - 1)
            else:
                #The <s> (sentence-begin) token is
                # NOT counted as a unigram event
                if not ngram_stack[-1] == self.sb:
                    self.UN[ngram_stack[-1]] += 1.
                    self.UD += 1.
        return

    def print_ARPA(self):
        """
          Print the interpolated Kneser-Ney LM out in ARPA format,
           computing the interpolated probabilities and back-off
           weights for each N-gram on-demand.  The format:
           ----------------------------
             \data\
             ngram 1=NUM_1GRAMS
             ngram 2=NUM_2GRAMS
             ...
             ngram N=NUM_NGRAMS (max order)
            
             \1-grams:
             p(a_z)  a_z  bow(a_z)
             ...
            
             \2-grams:
             p(a_z)  a_z  bow(a_z)
             ...
            
             \N-grams:
             p(a_z)  a_z
             ...

             \end\
           ----------------------------
        """

        #Handle the header info
        print "\\data\\"
        print "ngram 1=%d" % (len(self.UN) + 1)
        for o in xrange(0, self.order - 1):
            print "ngram %d=%d" % (o + 2, len(self.numerators[o]))

        #Handle the Unigrams
        print "\n\\1-grams:"
        d = self._get_discount(0, self.sb)
        #ModKN discount
        lmda = d / self.denominators[0][self.sb]
        print "-99.00000\t%s\t%0.7f" % (self.sb, log(lmda, 10.))

        for key in sorted(self.UN.iterkeys()):
            if key == self.se:
                print "%0.7f\t%s\t-99" % (log(self.UN[key] / self.UD,
                                              10.), key)
                continue

            d = self._get_discount(0, key)
            #ModKN discount
            lmda = d / self.denominators[0][key]
            print "%0.7f\t%s\t%0.7f" % (log(self.UN[key] / self.UD,
                                            10.), key, log(lmda, 10.))

        #Handle the middle-order N-grams
        for o in xrange(0, self.order - 2):
            print "\n\\%d-grams:" % (o + 2)
            for key in sorted(self.numerators[o].iterkeys()):
                if key.endswith(self.se):
                    #No back-off prob for N-grams ending in </s>
                    prob = self._compute_interpolated_prob(key)
                    print "%0.7f\t%s" % (log(prob, 10.), key)
                    continue
                d = self._get_discount(o + 1, key)
                #Compute the back-off weight
                #ModKN discount
                lmda = d / self.denominators[o + 1][key]
                #Compute the interpolated N-gram probability
                prob = self._compute_interpolated_prob(key)
                print "%0.7f\t%s\t%0.7f" % (log(prob, 10.), key, log(
                    lmda, 10.))

        #Handle the N-order N-grams
        print "\n\\%d-grams:" % (self.order)
        for key in sorted(self.numerators[self.order - 2].iterkeys()):
            #Compute the interpolated N-gram probability
            prob = self._compute_interpolated_prob(key)
            print "%0.7f\t%s" % (log(prob, 10.), key)

        print "\n\\end\\"
        return

    def _compute_interpolated_prob(self, ngram):
        """
          Compute the interpolated probability for the input ngram.
          Cribbing the notation from the SRILM webpages,

             a_z    = An N-gram where a is the first word, z is the 
                       last word, and "_" represents 0 or more words in between.
             p(a_z) = The estimated conditional probability of the 
                       nth word z given the first n-1 words (a_) of an N-gram.
             a_     = The n-1 word prefix of the N-gram a_z.
             _z     = The n-1 word suffix of the N-gram a_z.

          Then we have, 
             f(a_z) = g(a_z) + bow(a_) p(_z)
             p(a_z) = (c(a_z) > 0) ? f(a_z) : bow(a_) p(_z)

          The ARPA format is generated by writing, for each N-gram
          with 1 < order < max_order:
             p(a_z)    a_z   bow(a_z)

          and for the maximum order:
             p(a_z)    a_z

          Special care must be taken for certain N-grams containing
           the <s> (sentence-begin) and </s> (sentence-end) tokens.  
          See the implementation for details on how to do this correctly.

          The formulation is based on the seminal Chen&Goodman '98 paper.

          SRILM notation-cribbing from:
             http://www.speech.sri.com/projects/srilm/manpages/ngram-discount.7.html
        """
        probability = 0.0
        ngram_stack = ngram.split(" ")
        probs = [1e-99 for i in xrange(len(ngram_stack))]
        o = len(ngram_stack)

        if not ngram_stack[-1] == self.sb:
            probs[0] = self.UN[ngram_stack[-1]] / self.UD

        for i in xrange(o - 1):
            dID = " ".join(ngram_stack[o - (i + 2):o - 1])
            nID = " ".join(ngram_stack[o - (i + 2):])
            if dID in self.denominators[i]:
                count = int(self.numerators[i][nID])
                d_i = min(count, 3)
                probs[i + 1] = (count - self.discounts[i][d_i - 1]
                                ) / self.denominators[i][dID]

                #This break-down takes the following form:
                #  probs[i+1]:  The interpolated N-gram probability, p(a_z)
                #  d:           The discount mass for a_: \Sum_i D_i*N_i
                #  lmda:        The un-normalized 'back-off' weight, bow(a_)
                #  probs[i]:    The next lower-order, interpolated N-gram
                #               probability corresponding to p(_z)
                #ModKN discount
                d = self._get_discount(i, dID)
                lmda = d / self.denominators[i][dID]
                probs[i + 1] = probs[i + 1] + lmda * probs[i]
                probability = probs[i + 1]

        if probability == 0.0:
            #If we still have nothing, return the unigram probability
            probability = probs[0]

        return probability
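
A small sketch of the per-history discount mass computed by
_get_discount above, again with invented counts:

    #Continuation counts for one toy history 'a b': suffix -> count
    nonzeros_ab = {"c": 1.0, "d": 2.0, "e": 5.0, "f": 1.0}

    #Illustrative discounts D_1, D_2, D_3+ for this order
    D = [0.588, 1.244, 2.059]

    #Bin the continuations into N_1, N_2 and N_3+ ...
    c = [0.0, 0.0, 0.0]
    for count in nonzeros_ab.values():
        c[min(int(count), 3) - 1] += 1.0

    #... then sum the mass: d = D_1*N_1 + D_2*N_2 + D_3*N_3
    d = sum(D[i] * c[i] for i in range(3))
    print(d)  #2*0.588 + 1*1.244 + 1*2.059 = 4.479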
Example #7
class MLCounter( ):
    """
      Stand-alone python implementation of a simple Maximum Likelihood LM.

      This class simply counts NGrams in a training corpus and either,
        * Dumps the raw, log_10 counts into ARPA format
        * Computes an unsmoothed Maximum Likelihood LM 
    """

    def __init__( self, order=3, sb="<s>", se="</s>", raw=False, ml=False ):
        self.sb        = sb
        self.se        = se
        self.raw       = raw
        self.ml        = ml
        self.order     = order
        self.ngrams    = NGramStack(order=order)
        self.counts    = [ defaultdict(float) for i in xrange(order) ]

    def maximum_likelihood( self, training_file ):
        """
          Iterate through the training data using a FIFO stack or 
           'window' of max-length equal to the specified N-gram order.

          Each time a new word is pushed onto the N-gram stack call
           the _ml_count() subroutine to increment the N-gram counts.

          If pushing a word onto the stack makes len(stack)>max-order, 
           then the word at the bottom (stack[0]) is popped off.
        """

        for line in open(training_file,"r"):
            #Split the current line into words.
            words = re.split(r"\s+",line.strip())

            #Push a sentence-begin token onto the stack
            ngram = self.ngrams.push(self.sb)
            self._ml_count( ngram )

            for word in words:
                #Get the current 'window' of N-grams
                ngram = self.ngrams.push(word)

                #Now count all N-grams in the current window
                #These will be of span <= self.order
                self._ml_count( ngram )

            #Now push the sentence-end token onto the stack
            ngram = self.ngrams.push(self.se)
            self._ml_count( ngram )

            #Clear the stack for the next sentence
            self.ngrams.clear()
        
        return

    def _ml_count( self, ngram_stack ):
        """
          Just count NGrams.  The only slightly confusing thing here
          is the sentence-begin (<s>).  It does NOT count as a
          unigram event and thus does not contribute to the unigram tally.
          It IS however used as a history denominator.
        """

        #Iterate backwards through the stack
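        # so that every suffix of the current window, each ending in
        # the newest word, is counted exactly once per order.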
        for o in xrange(len(ngram_stack),0,-1):
            start = len(ngram_stack)-o
            self.counts[o-1][" ".join(ngram_stack[start:])] += 1.0

        return  
        
    def print_ARPA( self ):
        """
          Print the raw counts or ML LM out in ARPA format,
           ARPA format:
           ----------------------------
             \data\
             ngram 1=NUM_1GRAMS
             ngram 2=NUM_2GRAMS
             ...
             ngram N=NUM_NGRAMS (max order)
            
             \1-grams:
             p(a_z)  a_z  bow(a_z)
             ...
            
             \2-grams:
             p(a_z)  a_z  bow(a_z)
             ...
            
             \N-grams:
             p(a_z)  a_z
             ...

             \end\
           ----------------------------
          NOTE: Neither the ML model nor the raw counts
                will ever have a 'backoff weight'.
        """

        #Handle the header info
        print "\\data\\"
        for o in xrange(0,self.order):
            print "ngram %d=%d" % (o+1,len(self.counts[o]) )

        #Handle the Unigrams
        print "\n\\1-grams:"
        for key in sorted(self.counts[0].iterkeys()):
            if key==self.sb:
                if self.raw:
                    print "0.00000\t%s"   % ( self.sb )
                else:
                    print "-99.00000\t%s" % ( self.sb )
            else:
                if self.ml:
                    denom   = sum(self.counts[0].values()) - self.counts[0][self.sb]
                    ml_prob = self.counts[0][key] / denom
                    if self.raw:
                        print "%0.6f\t%s" % ( ml_prob, key )
                    else:
                        print "%0.6f\t%s" % ( log(ml_prob,10.), key )
                elif self.raw:
                    print "%0.6f\t%s" % (self.counts[0][key], key)
                else:
                    print "%0.6f\t%s" % (log(self.counts[0][key],10.), key)

        #Handle the middle-order N-grams
        for o in xrange(1,self.order):
            print "\n\\%d-grams:" % (o+1)
            for key in sorted(self.counts[o].iterkeys()):
                if self.ml:
                    hist = key[:key.rfind(" ")]
                    ml_prob = self.counts[o][key] / self.counts[o-1][hist]
                    if self.raw:
                        print "%0.6f\t%s" % ( ml_prob, key )
                    else:
                        print "%0.6f\t%s" % ( log(ml_prob,10.), key )
                elif self.raw:
                    print "%0.6f\t%s" % (self.counts[o][key], key)
                else:
                    print "%0.6f\t%s" % (log(self.counts[o][key],10.), key)

        print "\n\\end\\"
        return
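
A hypothetical invocation of MLCounter, under the same assumptions as
the KNSmoother driver above (NGramStack available, 'train.corpus' a
placeholder):

    counter = MLCounter(order=3, ml=True)  #unsmoothed ML probabilities
    counter.maximum_likelihood("train.corpus")
    counter.print_ARPA()                   #ARPA model on stdout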
Example #8
class AbsSmoother( ):
    """
      Stand-alone python implementation of interpolated Absolute discounting.

      Intended for educational purposes, this should produce results identical
       to mitlm's 'estimate-ngram' utility,
         mitlm:
          $ estimate-ngram -o 3 -t train.corpus -s FixKN
         SimpleKN.py:
          $ SimpleKN.py -t train.corpus
    
      WARNING: This program has not been optimized in any way and will almost 
       surely be extremely slow for anything larger than a small toy corpus.

    """

    def __init__( self, order=3, sb="<s>", se="</s>" ):
        self.sb        = sb
        self.se        = se
        self.order     = order
        self.ngrams    = NGramStack(order=order)
        self.denominators = [ defaultdict(float) for i in xrange(order-1) ]
        self.numerators   = [ defaultdict(float) for i in xrange(order-1) ]
        self.nonZeros     = [ defaultdict(float) for i in xrange(order-1) ]
        self.CoC          = [ [ 0.0 for j in xrange(4) ] for i in xrange(order) ]
        self.discounts    = [ 0.0 for i in xrange(order-1) ]
        self.UD = 0.
        self.UN = defaultdict(float)

    def _compute_counts_of_counts( self ):
        """
          Compute counts-of-counts (CoC) for each N-gram order.
          Only CoC<=4 are relevant to the computation of
          ModKNFix or KNFix or Absolute discounting.
        """

        for k in self.UN:
            if self.UN[k] <= 4:
                self.CoC[0][int(self.UN[k]-1)] += 1.

        for i,dic in enumerate(self.numerators):
            for k in dic:
                if dic[k]<=4:
                    self.CoC[i+1][int(dic[k]-1)] += 1.
        return

    def _compute_discounts( self ):
        """
          Compute the discount parameters. Note that unigram counts
          are not discounted in Absolute, FixKN or FixModKN.

          ---------------------------------
          Fixed Kneser-Ney smoothing: FixKN
          ---------------------------------
          This is based on the solution described in Kneser-Ney '95, 
          and reformulated in Chen&Goodman '98.  
          
             D = N_1 / ( N_1 + 2(N_2) )

          where N_1 refers to the # of N-grams that appear exactly
          once, and N_2 refers to the number of N-grams that appear
          exactly twice.  This is computed for each order.

          NOTE: The discount formula for FixKN is identical to the
                one used for Absolute discounting.
        """

        #Uniform discount for each N-gram order
        for o in xrange(self.order-1):
            self.discounts[o] = self.CoC[o+1][0] / (self.CoC[o+1][0]+2*self.CoC[o+1][1])

        return 

    def _get_discount( self, order, ngram ):
        """
          Retrieve the pre-computed discount for this N-gram.
        """

        return self.discounts[order]

    def absolute_discounting( self, training_file ):
        """
          Iterate through the training data using a FIFO stack or 
           'window' of max-length equal to the specified N-gram order.

          Each time a new word is pushed onto the N-gram stack call
           the _abs_count() subroutine to increment the N-gram 
           contexts in the current window / on the stack. 

          If pushing a word onto the stack makes len(stack)>max-order, 
           then the word at the bottom (stack[0]) is popped off.
        """
        for line in open(training_file,"r"):
            #Split the current line into words.
            words = re.split(r"\s+",line.strip())

            #Push a sentence-begin token onto the stack
            self.ngrams.push(self.sb)

            for word in words:
                #Get the current 'window' of N-grams
                ngram = self.ngrams.push(word)

                #Now count all N-grams in the current window
                #These will be of span <= self.order
                self._abs_count( ngram, len(ngram)-2 )

            #Now push the sentence-end token onto the stack
            ngram = self.ngrams.push(self.se)
            self._abs_count( ngram, len(ngram)-2 )

            #Clear the stack for the next sentence
            self.ngrams.clear()
        self._compute_counts_of_counts ( )
        self._compute_discounts( )
        return
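    # NGramStack is not defined in this example; a minimal sketch of the
    #  assumed behavior (a bounded FIFO window over the most recent words):
    #
    #    class NGramStack():
    #        def __init__(self, order=3):
    #            self.order = order
    #            self.stack = []
    #        def push(self, word):
    #            #Append the new word; drop the oldest word if the
    #            # window now exceeds the max N-gram order
    #            self.stack.append(word)
    #            if len(self.stack) > self.order:
    #                self.stack.pop(0)
    #            return self.stack[:]
    #        def clear(self):
    #            self.stack = []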

    def _abs_count( self, ngram_stack, i ):
        """
         Absolute discount calculation recursion.
        """
        if i==-1 and ngram_stack[0]==self.sb:
            return
        o     = len(ngram_stack)
        numer = " ".join(ngram_stack[o-(i+2):])
        denom = " ".join(ngram_stack[o-(i+2):o-1])
        self.numerators[  i][numer] += 1.
        self.denominators[i][denom] += 1.
        if self.numerators[i][numer]==1.:
            self.nonZeros[i][denom]  += 1.
        #The ONLY difference in terms of implementation
        #  between Kneser-Ney and Absolute discounting 
        #  is the following if/else.
        #In Kneser-Ney this is nested inside of the 
        #  preceding if statement.
        if i>0:
            self._abs_count( ngram_stack, i-1 )
        else:
            #The <s> (sentence-begin) token is
            # NOT counted as a unigram event
            if not ngram_stack[-1]==self.sb:
                self.UN[ngram_stack[-1]] += 1.
                self.UD += 1.
        return  
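        # Example trace (hypothetical input): for
        #  ngram_stack=['<s>','a','b'] and i=1 this increments
        #  numerators[1]['<s> a b'] and denominators[1]['<s> a'], recurses
        #  to increment numerators[0]['a b'] and denominators[0]['a'],
        #  and finally tallies the unigram event UN['b'].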
        
    def print_ARPA( self ):
        """
          Print the interpolated Absolute discounting LM out in ARPA format,
           computing the interpolated probabilities and back-off
           weights for each N-gram on-demand.  The format:
           ----------------------------
             \data\
             ngram 1=NUM_1GRAMS
             ngram 2=NUM_2GRAMS
             ...
             ngram N=NUM_NGRAMS (max order)
            
             \1-grams:
             p(a_z)  a_z  bow(a_z)
             ...
            
             \2-grams:
             p(a_z)  a_z  bow(a_z)
             ...
            
             \N-grams:
             p(a_z)  a_z
             ...

             \end\
           ----------------------------
        """

        #Handle the header info
        print "\\data\\"
        print "ngram 1=%d" % (len(self.UN)+1)
        for o in xrange(0,self.order-1):
            print "ngram %d=%d" % (o+2,len(self.numerators[o]) )

        #Handle the Unigrams
        print "\n\\1-grams:"
        d    = self.discounts[0]
        #Abs discount
        lmda = self.nonZeros[0][self.sb] * d / self.denominators[0][self.sb]
        print "-99.00000\t%s\t%0.6f"   % ( self.sb, log(lmda, 10.) )

        for key in sorted(self.UN.iterkeys()):
            if key==self.se:
                print "%0.6f\t%s\t-99"   % ( log(self.UN[key]/self.UD, 10.), key )
                continue

            d    = self.discounts[0]
            #Abs discount
            lmda = self.nonZeros[0][key] * d / self.denominators[0][key]
            print "%0.6f\t%s\t%0.6f" % ( log(self.UN[key]/self.UD, 10.), key, log(lmda, 10.) )

        #Handle the middle-order N-grams
        for o in xrange(0,self.order-2):
            print "\n\\%d-grams:" % (o+2)
            for key in sorted(self.numerators[o].iterkeys()):
                if key.endswith(self.se):
                    #No back-off prob for N-grams ending in </s>
                    prob = self._compute_interpolated_prob( key )
                    print "%0.6f\t%s" % ( log(prob, 10.), key )
                    continue
                d = self.discounts[o+1]
                #Compute the back-off weight
                #Abs discount
                lmda = self.nonZeros[o+1][key] * d / self.denominators[o+1][key]
                #Compute the interpolated N-gram probability
                prob = self._compute_interpolated_prob( key )
                print "%0.6f\t%s\t%0.6f" % ( log(prob, 10.), key, log(lmda, 10.))

        #Handle the N-order N-grams
        print "\n\\%d-grams:" % (self.order)
        for key in sorted(self.numerators[self.order-2].iterkeys()):
            #Compute the interpolated N-gram probability
            prob = self._compute_interpolated_prob( key )
            print "%0.6f\t%s" % ( log(prob, 10.), key )

        print "\n\\end\\"
        return


    def _compute_interpolated_prob( self, ngram ):
        """
          Compute the interpolated probability for the input ngram.
          Cribbing the notation from the SRILM webpages,

             a_z    = An N-gram where a is the first word, z is the 
                       last word, and "_" represents 0 or more words in between.
             p(a_z) = The estimated conditional probability of the 
                       nth word z given the first n-1 words (a_) of an N-gram.
             a_     = The n-1 word prefix of the N-gram a_z.
             _z     = The n-1 word suffix of the N-gram a_z.

          Then we have, 
             f(a_z) = g(a_z) + bow(a_) p(_z)
             p(a_z) = (c(a_z) > 0) ? f(a_z) : bow(a_) p(_z)

          The ARPA format is generated by writing, for each N-gram
          of order < max_order:
             p(a_z)    a_z   bow(a_z)

          and for the maximum order:
             p(a_z)    a_z

          Special care must be taken for certain N-grams containing
           the <s> (sentence-begin) and </s> (sentence-end) tokens.  
          See the implementation for details on how to do this correctly.

          The formulation is based on the seminal Chen&Goodman '98 paper.

          SRILM notation-cribbing from:
             http://www.speech.sri.com/projects/srilm/manpages/ngram-discount.7.html
        """
        probability = 0.0
        ngram_stack = ngram.split(" ")
        probs       = [ 1e-99 for i in xrange(len(ngram_stack)) ]
        o           = len(ngram_stack)

        if not ngram_stack[-1]==self.sb:
            probs[0] = self.UN[ngram_stack[-1]] / self.UD

        for i in xrange(o-1):
            dID = " ".join(ngram_stack[o-(i+2):o-1])
            nID = " ".join(ngram_stack[o-(i+2):])
            if dID in self.denominators[i]:
                d = self.discounts[i]
                if nID in self.numerators[i]:
                    #We have an actual N-gram probability for this N-gram
                    #KN discount
                    probs[i+1] = (self.numerators[i][nID]-d)/self.denominators[i][dID]
                else:
                    #No actual N-gram prob, we will have to back-off
                    probs[i+1] = 0.0
                #This break-down takes the following form:
                #  probs[i+1]:  The interpolated N-gram probability, p(a_z)
                #  lmda:        The un-normalized 'back-off' weight, bow(a_)
                #  probs[i]:    The next lower-order, interpolated N-gram 
                #               probability corresponding to p(_z)
                #KN discount
                lmda        = self.nonZeros[i][dID] * d / self.denominators[i][dID]
                probs[i+1]  = probs[i+1] + lmda * probs[i]
                probability = probs[i+1]

        if probability == 0.0:
            #If we still have nothing, return the unigram probability
            probability = probs[0]

        return probability
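
A minimal usage sketch for the KNSmoother class above (assuming an
NGramStack implementation and a whitespace-tokenized train.corpus file,
neither of which is shown in this example):

    smoother = KNSmoother( order=3 )
    smoother.absolute_discounting( "train.corpus" )
    smoother.print_ARPA( )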
Example #12
class MLCounter():
    """
      Stand-alone python implementation of a simple Maximum Likelihood LM.

      This class simply counts NGrams in a training corpus and either:
        * Dumps the raw or log_10 counts in ARPA format
        * Computes an unsmoothed Maximum Likelihood LM
    """
    def __init__(self, order=3, sb="<s>", se="</s>", raw=False, ml=False):
        self.sb = sb
        self.se = se
        self.raw = raw
        self.ml = ml
        self.order = order
        self.ngrams = NGramStack(order=order)
        self.counts = [defaultdict(float) for i in xrange(order)]

    def maximum_likelihood(self, training_file):
        """
          Iterate through the training data using a FIFO
           'window' of max-length equal to the specified N-gram order.

          Each time a new word is pushed onto the N-gram stack call
           the _ml_count() subroutine to increment the N-gram counts.

          If pushing a word onto the stack makes len(stack)>max-order, 
           then the word at the bottom (stack[0]) is popped off.
        """

        for line in open(training_file, "r"):
            #Split the current line into words.
            words = re.split(r"\s+", line.strip())

            #Push a sentence-begin token onto the stack
            ngram = self.ngrams.push(self.sb)
            self._ml_count(ngram)

            for word in words:
                #Get the current 'window' of N-grams
                ngram = self.ngrams.push(word)

                #Now count all N-grams in the current window
                #These will be of span <= self.order
                self._ml_count(ngram)

            #Now push the sentence-end token onto the stack
            ngram = self.ngrams.push(self.se)
            self._ml_count(ngram)

            #Clear the stack for the next sentence
            self.ngrams.clear()

        return

    def _ml_count(self, ngram_stack):
        """
          Just count NGrams.  The only slightly confusing thing here
          is the sentence-begin (<s>).  It does NOT count as a
          unigram event: it is tallied below, but it is subtracted back
          out of the unigram total when the ML probabilities are
          computed in print_ARPA().  It IS however used as a history
          denominator.
        """

        #Iterate backwards through the stack
        for o in xrange(len(ngram_stack), 0, -1):
            start = len(ngram_stack) - o
            self.counts[o - 1][" ".join(ngram_stack[start:])] += 1.0

        return
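        # Example (hypothetical input): for ngram_stack=['<s>','a','b']
        #  this increments counts[2]['<s> a b'], counts[1]['a b'] and
        #  counts[0]['b'], i.e. every suffix of the current window that
        #  ends in the newest word.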

    def print_ARPA(self):
        """
          Print the raw counts or ML LM out in ARPA format,
           ARPA format:
           ----------------------------
             \data\
             ngram 1=NUM_1GRAMS
             ngram 2=NUM_2GRAMS
             ...
             ngram N=NUM_NGRAMS (max order)
            
             \1-grams:
             p(a_z)  a_z  bow(a_z)
             ...
            
             \2-grams:
             p(a_z)  a_z  bow(a_z)
             ...
            
             \N-grams:
             p(a_z)  a_z
             ...

             \end\
           ----------------------------
          NOTE: Neither the ML model nor the raw counts
                will ever have a 'backoff weight'.
        """

        #Handle the header info
        print "\\data\\"
        for o in xrange(0, self.order):
            print "ngram %d=%d" % (o + 1, len(self.counts[o]))

        #Handle the Unigrams
        print "\n\\1-grams:"
        for key in sorted(self.counts[0].iterkeys()):
            if key == self.sb:
                if self.raw:
                    print "0.00000\t%s" % (self.sb)
                else:
                    print "-99.00000\t%s" % (self.sb)
            else:
                if self.ml:
                    ml_prob = self.counts[0][key] / (
                        sum([self.counts[0][c]
                             for c in self.counts[0].keys()]) -
                        self.counts[0][self.sb])
                    if self.raw:
                        print "%0.6f\t%s" % (ml_prob, key)
                    else:
                        print "%0.6f\t%s" % (log(ml_prob, 10.), key)
                elif self.raw:
                    print "%0.6f\t%s" % (self.counts[0][key], key)
                else:
                    print "%0.6f\t%s" % (log(self.counts[0][key], 10.), key)

        #Handle the middle-order N-grams
        for o in xrange(1, self.order):
            print "\n\\%d-grams:" % (o + 1)
            for key in sorted(self.counts[o].iterkeys()):
                if self.ml:
                    hist = key[:key.rfind(" ")]
                    ml_prob = self.counts[o][key] / self.counts[o - 1][hist]
                    if self.raw:
                        print "%0.6f\t%s" % (ml_prob, key)
                    else:
                        print "%0.6f\t%s" % (log(ml_prob, 10.), key)
                elif self.raw:
                    print "%0.6f\t%s" % (self.counts[o][key], key)
                else:
                    print "%0.6f\t%s" % (log(self.counts[o][key], 10.), key)

        print "\n\\end\\"
        return
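
A minimal usage sketch for the MLCounter class above (assuming an
NGramStack implementation and a train.corpus file, neither of which is
shown in this example).  With ml=True it prints the unsmoothed ML model;
with raw=True it dumps the counts instead:

    counter = MLCounter( order=3, ml=True )
    counter.maximum_likelihood( "train.corpus" )
    counter.print_ARPA( )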
Example #13
class ModKNSmoother( ):
    """
      Stand-alone python implementation of Fixed Modified Kneser-Ney discounting.

      Intended for educational purposes, this should produce results identical
       to Google NGramLibrary tools with ngrammake --bins=3.  See the included 
       run-NGramLibrary.sh script to train a model for comparison.
    
      WARNING: This may be slow for very large corpora.

    """

    def __init__( self, order=3, sb="<s>", se="</s>" ):
        self.sb        = sb
        self.se        = se
        self.order     = order
        self.ngrams    = NGramStack(order=order)
        self.denominators = [ defaultdict(float) for i in xrange(order-1) ]
        self.numerators   = [ defaultdict(float) for i in xrange(order-1) ]
        #Modified Kneser-Ney requires that we track the individual N_i
        # in contrast to Kneser-Ney, which just requires the sum-total.
        self.nonZeros     = [ defaultdict(lambda: defaultdict(float)) for i in xrange(order-1) ]
        self.CoC          = [ [ 0.0 for j in xrange(4) ] for i in xrange(order) ]
        self.discounts    = [ [ 0.0 for j in xrange(3) ] for i in xrange(order-1) ]
        self.UD = 0.
        self.UN = defaultdict(float)

    def _compute_counts_of_counts( self ):
        """
          Compute counts-of-counts (CoC) for each N-gram order.
          Only CoC<=4 are relevant to the computation of
          either ModKNFix or KNFix.
        """

        for k in self.UN:
            if self.UN[k] <= 4:
                self.CoC[0][int(self.UN[k]-1)] += 1.

        for i,dic in enumerate(self.numerators):
            for k in dic:
                if dic[k]<=4:
                    self.CoC[i+1][int(dic[k]-1)] += 1.
        return

    def _compute_discounts( self ):
        """
          Compute the discount parameters. Note that unigram counts
          are not discounted in either FixKN or FixModKN.

          -----------------------------------
          Fixed Modified Kneser-Ney: FixModKN
          -----------------------------------
          This is the solution proposed by Chen&Goodman '98:

             Y    = N_1 / (N_1 + 2*N_2)
             D_1  = 1 - 2*Y * (N_2 / N_1)
             D_2  = 2 - 3*Y * (N_3 / N_2)
             D_3+ = 3 - 4*Y * (N_4 / N_3)

          where N_i again refers to the number of N-grams that appear
          exactly 'i' times in the training data.  The discount applied
          to an N-gram is selected according to its own count: if the
          current N-gram, 'a b c', was seen exactly two times in the
          training corpus, then discount D_2 would be applied; D_3+
          covers all counts of three or more.

        """

        for o in xrange(self.order-1):
            Y = self.CoC[o+1][0] / (self.CoC[o+1][0]+2*self.CoC[o+1][1])
            #Compute all the D_i based on the formula
            for i in xrange(3):
                if self.CoC[o+1][i]>0:
                    self.discounts[o][i] = (i+1) - (i+2)*Y * (self.CoC[o+1][i+1]/self.CoC[o+1][i])
                else:
                    self.discounts[o][i] = (i+1)

        return 
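        # Worked example (hypothetical bigram counts-of-counts
        #  N_1=100, N_2=40, N_3=20, N_4=10):
        #    Y    = 100/(100 + 2*40)       = 0.5556
        #    D_1  = 1 - 2*0.5556*(40/100)  = 0.5556
        #    D_2  = 2 - 3*0.5556*(20/40)   = 1.1667
        #    D_3+ = 3 - 4*0.5556*(10/20)   = 1.8889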

    def _get_discount( self, order, ngram ):
        """
          Compute the discount mass for this N-gram, based on 
           the precomputed D_i and individual N_i.
        """

        c  = [0.0, 0.0, 0.0]
                    
        for key in self.nonZeros[order][ngram]:
            if int(self.nonZeros[order][ngram][key])==1:
                c[0] += 1.
            elif int(self.nonZeros[order][ngram][key])==2:
                c[1] += 1.
            else:
                c[2] += 1.

        #Compute the discount mass by summing over the D_i*N_i 
        d = sum([ self.discounts[order][i]*c[i] for i in xrange(len(c)) ])
        return d
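        # Example (hypothetical counts): if the history 'a b' has five
        #  distinct successors, three seen once, one seen twice and one
        #  seen three or more times, then the discount mass is
        #  d = D_1*3 + D_2*1 + D_3*1.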

    def kneser_ney_from_counts( self, arpa_file ):
        """
          Train the KN-discount language model from an ARPA format 
          file containing raw count data.  This can be generated with,
            $ ./SimpleCount.py --train train.corpus -r > counts.arpa

        """

        m_ord = c_ord = 0

        for line in open(arpa_file, "r"):
            ngram, count = line.strip().split("\t")
            count = float(count)
            ngram = ngram.split(" ")
            if len(ngram)==2:
                self.UD += 1.0
                self.UN[" ".join(ngram[1:])] += 1.0
                #Nonzeros based on suffixes
                if ngram[0]==self.sb:
                    self.nonZeros[len(ngram)-2][" ".join(ngram[:-1])][ngram[-1]] += count
                    self.numerators[len(ngram)-2][" ".join(ngram)] += count
                    self.denominators[len(ngram)-2][" ".join(ngram[:-1])] += count

            if len(ngram)>2 and len(ngram)<self.order:
                self.numerators[len(ngram)-3][" ".join(ngram[1:])] += 1.0
                self.denominators[len(ngram)-3][" ".join(ngram[1:-1])] += 1.0
                self.nonZeros[len(ngram)-3][" ".join(ngram[1:-1])][ngram[-1]] += 1.0
                if ngram[0]==self.sb:
                    self.numerators[len(ngram)-2][" ".join(ngram)] += count
                    self.denominators[len(ngram)-2][" ".join(ngram[:-1])] += count
                    self.nonZeros[len(ngram)-2][" ".join(ngram[:-1])][ngram[-1]] += count

            if len(ngram)==self.order:
                self.numerators[len(ngram)-3][" ".join(ngram[1:])] += 1.0
                self.numerators[len(ngram)-2][" ".join(ngram)] = count
                self.denominators[len(ngram)-3][" ".join(ngram[1:-1])] += 1.0
                self.denominators[len(ngram)-2][" ".join(ngram[:-1])] += count
                self.nonZeros[len(ngram)-3][" ".join(ngram[1:-1])][ngram[-1]] += 1.0
                self.nonZeros[len(ngram)-2][" ".join(ngram[:-1])][ngram[-1]] += count

        self._compute_counts_of_counts ( )
        self._compute_discounts( )

        #self._print_raw_counts( )
        return
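        # Parsing note: each input line is assumed to be a single
        #  tab-separated "<ngram>\t<count>" pair, e.g. (hypothetical)
        #  "a b\t12.0"; any ARPA header or section lines in the counts
        #  file would need to be stripped before this parse.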

    def kneser_ney_discounting( self, training_file ):
        """
          Iterate through the training data using a FIFO
           'window' of max-length equal to the specified N-gram order.

          Each time a new word is pushed onto the N-gram stack call
           the _kn_recurse() subroutine to increment the N-gram 
           contexts in the current window / on the stack. 

          If pushing a word onto the stack makes len(stack)>max-order, 
           then the word at the bottom (stack[0]) is popped off.
        """

        for line in open(training_file,"r"):
            #Split the current line into words.
            words = re.split(r"\s+",line.strip())

            #Push a sentence-begin token onto the stack
            self.ngrams.push(self.sb)

            for word in words:
                #Get the current 'window' of N-grams
                ngram = self.ngrams.push(word)

                #Now count all N-grams in the current window
                #These will be of span <= self.order
                self._kn_recurse( ngram, len(ngram)-2 )

            #Now push the sentence-end token onto the stack
            ngram = self.ngrams.push(self.se)
            self._kn_recurse( ngram, len(ngram)-2 )

            #Clear the stack for the next sentence
            self.ngrams.clear()
        self._compute_counts_of_counts ( )
        self._compute_discounts( )

        return


    def print_raw_counts( self ):
        """
          Convenience function for sanity checking the history counts.
        """
        #print "NUMERATORS:"
        #for key in sorted(self.UN.iterkeys()):
        #    print " ", key, self.UN[key]
        #for o in xrange(len(self.numerators)):
        #    print "ORD",o
        #    for key in sorted(self.numerators[o].iterkeys()):
        #        print " ", key, self.numerators[o][key]
        #print "DENOMINATORS:"
        #print self.UD
        #for o in xrange(len(self.denominators)):
        #    print "DORD", o
        #    for key in sorted(self.denominators[o].iterkeys()):
        #        print " ", key, self.denominators[o][key]
        print "NONZEROS:"
        for o in xrange(len(self.nonZeros)):
            print "ZORD", o
            for denom in sorted(self.nonZeros[o].iterkeys()):
                print " Den:", denom
                for key in sorted(self.nonZeros[o][denom].iterkeys()):
                    print "   ", key, self.nonZeros[o][denom][key]


    def _kn_recurse( self, ngram_stack, i ):
        """
         Kneser-Ney discount calculation recursion.
        """

        if i==-1 and ngram_stack[0]==self.sb:
            return

        o     = len(ngram_stack)
        numer = " ".join(ngram_stack[o-(i+2):])
        denom = " ".join(ngram_stack[o-(i+2):o-1])
        self.numerators[  i][numer] += 1.
        self.denominators[i][denom] += 1.
 
        #For Modified Kneser-Ney we need to track 
        # individual nonZeros based on their suffixes
        self.nonZeros[i][denom][ngram_stack[-1]] += 1.
        if self.numerators[i][numer]==1.:
            if i>0:
                self._kn_recurse( ngram_stack, i-1 )
            else:
                #The <s> (sentence-begin) token is
                # NOT counted as a unigram event
                if not ngram_stack[-1]==self.sb:
                    self.UN[ngram_stack[-1]] += 1.
                    self.UD += 1.
        return  
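        # Note: unlike the _abs_count() routine in the Absolute-
        #  discounting example above, the recursion to the next lower
        #  order only fires the first time a numerator N-gram is seen;
        #  this type-counting is what distinguishes Kneser-Ney from
        #  Absolute discounting.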
        
    def print_ARPA( self ):
        """
          Print the interpolated Kneser-Ney LM out in ARPA format,
           computing the interpolated probabilities and back-off
           weights for each N-gram on-demand.  The format:
           ----------------------------
             \data\
             ngram 1=NUM_1GRAMS
             ngram 2=NUM_2GRAMS
             ...
             ngram N=NUM_NGRAMS (max order)
            
             \1-grams:
             p(a_z)  a_z  bow(a_z)
             ...
            
             \2-grams:
             p(a_z)  a_z  bow(a_z)
             ...
            
             \N-grams:
             p(a_z)  a_z
             ...

             \end\
           ----------------------------
        """

        #Handle the header info
        print "\\data\\"
        print "ngram 1=%d" % (len(self.UN)+1)
        for o in xrange(0,self.order-1):
            print "ngram %d=%d" % (o+2,len(self.numerators[o]) )

        #Handle the Unigrams
        print "\n\\1-grams:"
        d    = self._get_discount( 0, self.sb )
        #ModKN discount
        lmda = d / self.denominators[0][self.sb]
        print "-99.00000\t%s\t%0.7f"   % ( self.sb, log(lmda, 10.) )

        for key in sorted(self.UN.iterkeys()):
            if key==self.se:
                print "%0.7f\t%s\t-99"   % ( log(self.UN[key]/self.UD, 10.), key )
                continue

            d    = self._get_discount( 0, key )
            #ModKN discount
            lmda = d / self.denominators[0][key]
            print "%0.7f\t%s\t%0.7f" % ( log(self.UN[key]/self.UD, 10.), key, log(lmda, 10.) )

        #Handle the middle-order N-grams
        for o in xrange(0,self.order-2):
            print "\n\\%d-grams:" % (o+2)
            for key in sorted(self.numerators[o].iterkeys()):
                if key.endswith(self.se):
                    #No back-off prob for N-grams ending in </s>
                    prob = self._compute_interpolated_prob( key )
                    print "%0.7f\t%s" % ( log(prob, 10.), key )
                    continue
                d = self._get_discount( o+1, key )
                #Compute the back-off weight
                #ModKN discount
                lmda  = d / self.denominators[o+1][key]
                #Compute the interpolated N-gram probability
                prob = self._compute_interpolated_prob( key )
                print "%0.7f\t%s\t%0.7f" % ( log(prob, 10.), key, log(lmda, 10.))

        #Handle the N-order N-grams
        print "\n\\%d-grams:" % (self.order)
        for key in sorted(self.numerators[self.order-2].iterkeys()):
            #Compute the interpolated N-gram probability
            prob = self._compute_interpolated_prob( key )
            print "%0.7f\t%s" % ( log(prob, 10.), key )

        print "\n\\end\\"
        return


    def _compute_interpolated_prob( self, ngram ):
        """
          Compute the interpolated probability for the input ngram.
          Cribbing the notation from the SRILM webpages,

             a_z    = An N-gram where a is the first word, z is the 
                       last word, and "_" represents 0 or more words in between.
             p(a_z) = The estimated conditional probability of the 
                       nth word z given the first n-1 words (a_) of an N-gram.
             a_     = The n-1 word prefix of the N-gram a_z.
             _z     = The n-1 word suffix of the N-gram a_z.

          Then we have, 
             f(a_z) = g(a_z) + bow(a_) p(_z)
             p(a_z) = (c(a_z) > 0) ? f(a_z) : bow(a_) p(_z)

          The ARPA format is generated by writing, for each N-gram
          of order < max_order:
             p(a_z)    a_z   bow(a_z)

          and for the maximum order:
             p(a_z)    a_z

          Special care must be taken for certain N-grams containing
           the <s> (sentence-begin) and </s> (sentence-end) tokens.  
          See the implementation for details on how to do this correctly.

          The formulation is based on the seminal Chen&Goodman '98 paper.

          SRILM notation-cribbing from:
             http://www.speech.sri.com/projects/srilm/manpages/ngram-discount.7.html
        """
        probability = 0.0
        ngram_stack = ngram.split(" ")
        probs       = [ 1e-99 for i in xrange(len(ngram_stack)) ]
        o           = len(ngram_stack)

        if not ngram_stack[-1]==self.sb:
            probs[0] = self.UN[ngram_stack[-1]] / self.UD

        for i in xrange(o-1):
            dID = " ".join(ngram_stack[o-(i+2):o-1])
            nID = " ".join(ngram_stack[o-(i+2):])
            if dID in self.denominators[i]:
                count = int(self.numerators[i][nID])
                d_i   = min(count, 3)
                probs[i+1] = (count-self.discounts[i][d_i-1])/self.denominators[i][dID]

                #This break-down takes the following form:
                #  probs[i+1]:  The interpolated N-gram probability, p(a_z)
                #  d:           The discount mass for a_: \Sum_i D_i*N_i
                #  lmda:        The un-normalized 'back-off' weight, bow(a_)
                #  probs[i]:    The next lower-order, interpolated N-gram 
                #               probability corresponding to p(_z)
                #ModKN discount
                d = self._get_discount( i, dID )
                lmda        = d / self.denominators[i][dID]
                probs[i+1]  = probs[i+1] + lmda * probs[i]
                probability = probs[i+1]

        if probability == 0.0:
            #If we still have nothing, return the unigram probability
            probability = probs[0]

        return probability
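
A minimal usage sketch for the ModKNSmoother class above (assuming an
NGramStack implementation and a train.corpus file, neither of which is
shown in this example):

    smoother = ModKNSmoother( order=3 )
    smoother.kneser_ney_discounting( "train.corpus" )
    smoother.print_ARPA( )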