def score(self, morpheme, eow=0):
    """Score the target MORPHEME with the n-gram language model
    given the current history of MORPHEMES.

    Args:
        morpheme (string): Morpheme to score
        eow (int): Whether the morpheme is followed by the end of
            word symbol (1) or not (0)

    Returns:
        logprob (float). Language model score for the next morpheme
    """
    prefix = "%s " % ' '.join(self.history)
    order = len(self.history) + 1
    if eow == 1:
        # Score for the end of word symbol:
        # logP(second-last-morf last-morf morf </s>) =
        #     logP(second-last-morf last-morf morf)
        #     + logP(last-morf morf </s>)
        prefix_eos = "%s " % ' '.join(self.history[1:]) if len(
            self.history) == self.history_len else "%s " % ' '.join(
                self.history)
        order_eos = order if len(
            self.history) == self.history_len else order + 1
        prob = (getNgramProb(self.lm, prefix + str(morpheme), order) +
                getNgramProb(self.lm,
                             prefix_eos + str(morpheme) + " </s>",
                             order_eos))
    else:
        prob = getNgramProb(self.lm, prefix + str(morpheme), order)
    return prob
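# A minimal sketch (not part of the predictor) of the end-of-word
# decomposition used in score() above. `get_ngram_logprob` stands in
# for the swig-srilm getNgramProb binding with the same call shape,
# and the history is assumed to be full (len == history_len):
#   logP(m3 </s> | m1 m2) = logP(m3 | m1 m2) + logP(</s> | m2 m3)

def eow_score_sketch(get_ngram_logprob, lm, history, morpheme):
    order = len(history) + 1
    # one query of order n for the morpheme given the full history ...
    morf_score = get_ngram_logprob(
        lm, ' '.join(history) + ' ' + morpheme, order)
    # ... plus one query for </s> with the history shifted by one
    eos_score = get_ngram_logprob(
        lm, ' '.join(history[1:] + [morpheme]) + ' </s>', order)
    return morf_score + eos_score

# Toy check with a dict-backed "LM": -0.5 + -0.2 == -0.7
toy_lm = {("m1 m2 m3", 3): -0.5, ("m2 m3 </s>", 3): -0.2}
assert abs(eow_score_sketch(lambda lm, s, n: lm[(s, n)],
                            toy_lm, ["m1", "m2"], "m3")
           - (-0.7)) < 1e-12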
def score(self, char):
    """Retrieve the log probability of the next character from the
    language model, given the current history of characters.
    """
    prefix = "%s " % ' '.join(self.history)
    order = len(self.history) + 1
    ret = getNgramProb(
        self.lm,
        prefix + ("</s>" if char == self.EOS_ID else str(char)),
        order)
    return ret
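# Hedged usage sketch: scoring a whole character sequence by calling
# the char-level score() above and advancing the history by hand.
# The `order` and `EOS_ID` attributes on the predictor are
# assumptions; a real predictor would typically update the history
# elsewhere (e.g. in a consume() method).

def sequence_logprob_sketch(predictor, chars):
    total = 0.0
    for c in chars:
        total += predictor.score(c)
        predictor.history.append(str(c))
        # an n-gram model only conditions on the last n-1 symbols
        predictor.history = predictor.history[-(predictor.order - 1):]
    # finally score the end-of-sequence symbol
    return total + predictor.score(predictor.EOS_ID)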
def predict_next(self, words):
    """Score the set of target words with the n-gram language model
    given the current history.

    Args:
        words (list): Set of words to score

    Returns:
        dict. Language model scores for the words in ``words``
    """
    prefix = "%s " % ' '.join(self.history)
    order = len(self.history) + 1
    ret = {w: getNgramProb(
               self.lm,
               prefix + ("</s>" if w == utils.EOS_ID else str(w)),
               order)
           for w in words}
    return ret
def predict_next(self, morphemes, eow=0):
    """Score the set of target MORPHEMES with the n-gram language
    model given the current history of MORPHEMES.

    Args:
        morphemes (list): Set of morphemes to score
        eow (int): Whether the morphemes are followed by the end of
            word symbol (1) or not (0)

    Returns:
        dict. Language model scores for the morphemes in ``morphemes``
    """
    prefix = "%s " % ' '.join(self.history)
    order = len(self.history) + 1
    scaling_factor = math.log(10) if self.convert_to_ln else 1.0
    if eow == 1:
        # Score for the end of word symbol:
        # logP(second-last-morf last-morf morf </s>) =
        #     logP(second-last-morf last-morf morf)
        #     + logP(last-morf morf </s>)
        prefix_eos = "%s " % ' '.join(self.history[1:]) if len(
            self.history) == self.history_len else "%s " % ' '.join(
                self.history)
        order_eos = order if len(
            self.history) == self.history_len else order + 1
        logging.debug(u"prefix: {} w: {} order: {} score {}".format(
            prefix + str(morphemes[0]), str(morphemes[0]), order,
            getNgramProb(self.lm, prefix + str(morphemes[0]), order)))
        logging.debug(u"prefix_eos: {} w: {} order: {} score {}".format(
            prefix_eos + str(morphemes[0]) + " </s>", " </s>", order_eos,
            getNgramProb(self.lm,
                         prefix_eos + str(morphemes[0]) + " </s>",
                         order_eos)))
        prob = {w: (getNgramProb(self.lm, prefix + str(w), order) +
                    getNgramProb(self.lm,
                                 prefix_eos + str(w) + " </s>",
                                 order_eos)) * scaling_factor
                for w in morphemes}
    else:
        logging.debug(u"prefix: {} w: {} score {}".format(
            prefix, str(morphemes[0]),
            getNgramProb(self.lm, prefix + str(morphemes[0]), order)))
        # Score the morphemes without the end of word symbol
        prob = {w: getNgramProb(self.lm, prefix + str(w), order) *
                   scaling_factor
                for w in morphemes}
    return prob
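# SRILM reports log10 probabilities, and ln(x) = log10(x) * ln(10),
# so multiplying by math.log(10) converts the scores to natural log
# when convert_to_ln is set. A self-contained check of that identity:
import math

log10_prob = -2.0                       # P = 0.01 as a log10 score
assert abs(log10_prob * math.log(10) - math.log(0.01)) < 1e-12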
def _ngram_log_score(self, hist_list, cur_id):
    """Generate the n-gram log probability score.

    hist_list: list of word ids in the history; may be empty if
        there is no history
    cur_id: id of the current symbol
    """
    ngram_id = self._generate_index_representation(hist_list + [cur_id])
    if ngram_id not in self._lm_cache_ngram:
        ngram, len_ngram = self._generate_ngram(hist_list, cur_id)
        # Cache the score; dividing by LOG_E_BASE10 converts SRILM's
        # log10 score to natural log: log10(x) / log10(e) = ln(x)
        self._lm_cache_ngram[ngram_id] = srilm.getNgramProb(
            self._lm_model, ngram.encode('utf-8'),
            len_ngram) / LOG_E_BASE10
    return self._lm_cache_ngram[ngram_id]
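# The division by LOG_E_BASE10 above and the math.log(10) scaling in
# predict_next() are the same base conversion, assuming LOG_E_BASE10
# is defined as log10(e): 1 / log10(e) == ln(10). A quick check under
# that assumption:
import math

LOG_E_BASE10 = math.log10(math.e)      # assumed constant definition
log10_prob = -2.0
assert abs(log10_prob / LOG_E_BASE10 -
           log10_prob * math.log(10)) < 1e-12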
def predict_next(self, words):
    """Score the set of target words with the n-gram language model
    given the current history.

    Args:
        words (list): Set of words to score

    Returns:
        dict. Language model scores for the words in ``words``
    """
    prefix = "%s " % ' '.join(self.history)
    logging.debug(u'LM over chars prefix: {}'.format(prefix))
    order = len(self.history) + 1
    scaling_factor = math.log(10) if self.convert_to_ln else 1.0
    ret = {w: getNgramProb(
               self.lm,
               prefix + ("</s>" if w == utils.EOS_ID else str(w)),
               order) * scaling_factor
           for w in words}
    logging.debug(u'LM over chars distribution: {}'.format(ret))
    return ret
def get_unk_probability(self, posterior):
    """Use the probability for '<unk>' in the language model."""
    order = len(self.history) + 1
    return getNgramProb(self.lm, "%s <unk>" % ' '.join(self.history),
                        order)
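# Hedged sketch of how a decoder might combine predict_next() and
# get_unk_probability(): tokens missing from the returned posterior
# fall back to the <unk> score. `predictor` and `token` are
# illustrative names, not part of the source above.

def token_score_sketch(predictor, posterior, token):
    # use the LM score if the token was scored, else the <unk> score
    return posterior.get(token,
                         predictor.get_unk_probability(posterior))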