def score(self, morpheme, eow=0):
        """Score the set of target MORPHEMES with the n-gram language model given the current history of MORPHEMES.
		
		Args:
		words (list): Set of morphemes to score
		Returns:
		dict. Language model scores for the words in ``words``
		"""
        prefix = "%s " % ' '.join(self.history)
        order = len(self.history) + 1

        if eow == 1:
            # Score for the end of word symbol:
            # logP(second-last-morf last-morf morf </s>) = logP(second-last-morf last-morf morf) + logP(last-morf morf </s>)
            prefix_eos = "%s " % ' '.join(self.history[1:]) if len(
                self.history) == self.history_len else "%s " % ' '.join(
                    self.history)
            order_eos = order if len(
                self.history) == self.history_len else order + 1

            prob = (getNgramProb(self.lm, prefix + str(morpheme), order) +
                    getNgramProb(self.lm, prefix_eos + str(morpheme) + " </s>",
                                 order_eos))

        else:
            prob = getNgramProb(self.lm, prefix + str(morpheme), order)

        return prob
# Beispiel #2
# 0
    def score(self, morpheme, eow=0):
        """Score the target MORPHEME with the n-gram language model given the
        current history of MORPHEMES.

        Args:
        morpheme (string): morpheme to score
        eow (int): whether morpheme is followed by end of word symbol (1) or not (0)

        Returns:
        logprob (float). Language model score for the next morpheme
        """
        hist = self.history
        prefix = "%s " % ' '.join(hist)
        order = len(hist) + 1
        # logP of the morpheme itself in the current context.
        logprob = getNgramProb(self.lm, prefix + str(morpheme), order)

        if eow == 1:
            # Add logP(... morpheme </s>): with a full history buffer the
            # oldest entry is dropped, otherwise the order grows by one.
            # logP(second-last-morf last-morf morf </s>) =
            #     logP(second-last-morf last-morf morf) + logP(last-morf morf </s>)
            at_capacity = len(hist) == self.history_len
            eos_prefix = "%s " % ' '.join(hist[1:] if at_capacity else hist)
            eos_order = order if at_capacity else order + 1
            logprob += getNgramProb(self.lm,
                                    eos_prefix + str(morpheme) + " </s>",
                                    eos_order)
        return logprob
    def score(self, char):
        """Return the LM log-probability of CHAR given the current history.

        The end-of-sentence id is mapped to the "</s>" token before the
        language model is queried.
        """
        token = "</s>" if char == self.EOS_ID else str(char)
        context = "%s " % ' '.join(self.history)
        return getNgramProb(self.lm, context + token, len(self.history) + 1)
# Beispiel #4
# 0
    def score(self, char):
        """Retrieve the log-probability of CHAR from the language model,
        conditioned on the current history.
        """
        order = len(self.history) + 1
        prefix = "%s " % ' '.join(self.history)
        # The EOS id is rendered as the LM's "</s>" token.
        if char == self.EOS_ID:
            query = prefix + "</s>"
        else:
            query = prefix + str(char)
        return getNgramProb(self.lm, query, order)
# Beispiel #5
# 0
 def predict_next(self, words):
     """Score the set of target words with the n-gram language
     model given the current history.

     Args:
         words (list): Set of words to score
     Returns:
         dict. Language model scores for the words in ``words``
     """
     order = len(self.history) + 1
     prefix = "%s " % ' '.join(self.history)
     scores = {}
     for w in words:
         # EOS is rendered as the LM's "</s>" token.
         token = "</s>" if w == utils.EOS_ID else str(w)
         scores[w] = getNgramProb(self.lm, prefix + token, order)
     return scores
# Beispiel #6
# 0
    def predict_next(self, morphemes, eow=0):
        """Score the set of target MORPHEMES with the n-gram language model
        given the current history of MORPHEMES.

        Args:
        morphemes (list): Set of morphemes to score; must be non-empty
        eow (int): whether the morphemes are followed by the end-of-word
            symbol (1) or not (0)

        Returns:
        dict. Language model scores for the morphemes in ``morphemes``
        """
        prefix = "%s " % ' '.join(self.history)
        order = len(self.history) + 1
        # SRILM returns log10 scores; optionally rescale them to natural log.
        scaling_factor = math.log(10) if self.convert_to_ln else 1.0

        if eow != 1:
            logging.debug(u"prefix: {} w: {} score {}".format(
                prefix, str(morphemes[0]),
                getNgramProb(self.lm, prefix + str(morphemes[0]), order)))
            # Score for the segmentation boundary symbol:
            return {
                m: getNgramProb(self.lm, prefix + str(m), order) *
                scaling_factor
                for m in morphemes
            }

        # Score for the end of word symbol:
        # logP(second-last-morf last-morf morf </s>) =
        #     logP(second-last-morf last-morf morf) + logP(last-morf morf </s>)
        # With a full history buffer the oldest entry is dropped from the
        # </s> context; otherwise the </s> n-gram order grows by one.
        at_capacity = len(self.history) == self.history_len
        if at_capacity:
            prefix_eos = "%s " % ' '.join(self.history[1:])
            order_eos = order
        else:
            prefix_eos = "%s " % ' '.join(self.history)
            order_eos = order + 1

        logging.debug(u"prefix: {} w: {} order: {} score {}".format(
            prefix + str(morphemes[0]), str(morphemes[0]), order,
            getNgramProb(self.lm, prefix + str(morphemes[0]), order)))
        logging.debug(u"prefix_eos: {} w: {} order: {} score {}".format(
            prefix_eos + str(morphemes[0]) + " </s>", " </s>", order_eos,
            getNgramProb(self.lm, prefix_eos + str(morphemes[0]) + " </s>",
                         order_eos)))

        return {
            m: (getNgramProb(self.lm, prefix + str(m), order) +
                getNgramProb(self.lm, prefix_eos + str(m) + " </s>",
                             order_eos)) * scaling_factor
            for m in morphemes
        }
    def _ngram_log_score(self, hist_list, cur_id):
        """Return the natural-log n-gram probability of CUR_ID after HIST_LIST.

        hist_list: list of word ids in the history; may be empty.
        cur_id: id of the current symbol.

        Results are memoized in ``self._lm_cache_ngram``, keyed by the index
        representation of the full n-gram.
        """
        cache = self._lm_cache_ngram
        cache_key = self._generate_index_representation(hist_list + [cur_id])

        if cache_key not in cache:
            ngram, ngram_len = self._generate_ngram(hist_list, cur_id)
            # SRILM yields log10; divide to convert to natural log.
            cache[cache_key] = srilm.getNgramProb(
                self._lm_model, ngram.encode('utf-8'),
                ngram_len) / LOG_E_BASE10

        return cache[cache_key]
# Beispiel #8
# 0
 def predict_next(self, words):
     """Score the set of target words with the n-gram language
     model given the current history.

     Args:
         words (list): Set of words to score
     Returns:
         dict. Language model scores for the words in ``words``
     """
     history = self.history
     context = "%s " % ' '.join(history)
     order = len(history) + 1
     # EOS is rendered as the LM's "</s>" token; everything else verbatim.
     return {
         w: getNgramProb(
             self.lm,
             context + ("</s>" if w == utils.EOS_ID else str(w)),
             order)
         for w in words
     }
# Beispiel #9
# 0
    def _ngram_log_score(self, hist_list, cur_id):
        """Return the natural-log n-gram probability of CUR_ID after HIST_LIST.

        hist_list: list of word ids in the history; may be empty.
        cur_id: id of the current symbol.

        Scores are cached in ``self._lm_cache_ngram`` so repeated queries for
        the same n-gram hit the LM only once.
        """
        key = self._generate_index_representation(hist_list + [cur_id])
        cached = self._lm_cache_ngram
        if key in cached:
            return cached[key]

        ngram, ngram_len = self._generate_ngram(hist_list, cur_id)
        # SRILM yields log10; divide to convert to natural log.
        score = srilm.getNgramProb(self._lm_model, ngram.encode('utf-8'),
                                   ngram_len) / LOG_E_BASE10
        cached[key] = score
        return score
# Beispiel #10
# 0
 def predict_next(self, words):
     """Score the set of target words with the n-gram language
     model given the current history.

     Args:
         words (list): Set of words to score
     Returns:
         dict. Language model scores for the words in ``words``
     """
     prefix = "%s " % ' '.join(self.history)
     logging.debug(u'LM over chars prefix: {}'.format(prefix))

     order = len(self.history) + 1
     # SRILM scores are log10; optionally rescale them to natural log.
     scale = math.log(10) if self.convert_to_ln else 1.0
     ret = {}
     for w in words:
         token = "</s>" if w == utils.EOS_ID else str(w)
         ret[w] = getNgramProb(self.lm, prefix + token, order) * scale
     logging.debug(u'LM over chars distribution: {}'.format(ret))
     return ret
# Beispiel #11
# 0
 def get_unk_probability(self, posterior):
     """Use the probability for '<unk>' in the language model."""
     query = "%s <unk>" % ' '.join(self.history)
     return getNgramProb(self.lm, query, len(self.history) + 1)
# Beispiel #12
# 0
 def get_unk_probability(self, posterior):
     """Use the probability for '<unk>' in the language model."""
     order = len(self.history) + 1
     unk_query = "%s <unk>" % ' '.join(self.history)
     return getNgramProb(self.lm, unk_query, order)