Example 1
    def next_word_by_tokens(self, tokens):
        candidate = set(self.high_freq)
        state = kenlm.State()
        state2 = kenlm.State()
        tids = [self.voc.get(x, 0) for x in tokens[-5:]]
        candidate |= {self.voc.get(x, 0) for x in tokens}

        # Fewer than 5 context tokens means we are near the sentence start.
        if len(tids) < 5:
            self.model.BeginSentenceWrite(state)
        else:
            self.model.NullContextWrite(state)
        # Advance the LM state through the context tokens.
        for tid in tids:
            self.model.BaseScore(state, str(tid), state2)
            state, state2 = state2, state
        # Rank every candidate by its conditional log-probability.
        ranking = []
        for tid in candidate:
            p = self.model.BaseScore(state, str(tid), state2)
            heapq.heappush(ranking, (p, tid))

        return ['<unk>' if max_id == 0 else self.id2word[max_id][1]
                for max_p, max_id in heapq.nlargest(10, ranking)]
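This example, like Examples 2, 11, and 30 below, reuses two State objects and swaps them after every BaseScore call instead of allocating a fresh State per token. A minimal standalone sketch of that pattern (the model path is hypothetical):

    import kenlm

    model = kenlm.Model('model.arpa')  # hypothetical path
    s1, s2 = kenlm.State(), kenlm.State()
    model.BeginSentenceWrite(s1)  # context starts as <s>
    total = 0.0
    for word in 'this is a test'.split():
        # BaseScore reads s1, writes the advanced context into s2,
        # and returns log10 P(word | context).
        total += model.BaseScore(s1, word, s2)
        s1, s2 = s2, s1  # swap instead of allocating a new State
    total += model.BaseScore(s1, '</s>', s2)
    print(total)  # log10 probability of the whole sentence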
Example 2
 def _update_lm_state(self):
     self.lm_state = kenlm.State()
     tmp_state = kenlm.State()
     self.lm.BeginSentenceWrite(self.lm_state)
     for w in self.history[-6:]:
         self.lm.BaseScore(self.lm_state, w, tmp_state)
         self.lm_state, tmp_state = tmp_state, self.lm_state
Example 3
def vocab_prob_given_ngram(lm, v_prev_ngram, trg_vocab, trg_vocab_i2w,
                           given=False, wid=True):
    if wid:
        v_prev_ngram = [trg_vocab_i2w[i] for i in v_prev_ngram if i != -1]

    logps, wids = [], []
    if given:
        # Advance a state through the context n-gram, then score every
        # vocabulary word from the resulting state.
        state_in = kenlm.State()
        lm.NullContextWrite(state_in)
        for w in v_prev_ngram:
            ngram_state = kenlm.State()
            lm.BaseScore(state_in, w, ngram_state)
            state_in = ngram_state

        for w, idx in trg_vocab.items():
            state_out = kenlm.State()
            log_prob = lm.BaseScore(state_in, w, state_out)
            logps.append(log_prob)
            wids.append(idx)
    else:
        # Score the concatenated string "context + word" from scratch.
        for w, idx in trg_vocab.items():
            new_gram = ' '.join(v_prev_ngram + [w])
            log_prob = lm.score(new_gram, bos=False, eos=False)
            logps.append(log_prob)
            wids.append(idx)

    return logps, wids
Example 4
    def _infer_instance_lm(self, instance: str, *args, **kwargs):
        candidates = []
        for incorrect in instance.split():
            if any([c not in self.dictionary.alphabet for c in incorrect]):
                candidates.append([(0, incorrect)])
            else:
                res = self.find_candidates(incorrect,
                                           k=self.candidates_count,
                                           prop_threshold=1e-6)
                if res:
                    candidates.append([(score, candidate)
                                       for candidate, score in res])
                else:
                    candidates.append([(0, incorrect)])
        candidates.append([(0, '</s>')])

        state = kenlm.State()
        self.lm.BeginSentenceWrite(state)
        beam = [(0, state, [])]
        for sublist in candidates:
            new_beam = []
            for beam_score, beam_state, beam_words in beam:
                for score, candidate in sublist:
                    state = kenlm.State()
                    c_score = self.lm.BaseScore(beam_state, candidate, state)
                    new_beam.append((beam_score + score + c_score, state,
                                     beam_words + [candidate]))
            # Sort by score only; kenlm.State objects do not define ordering,
            # so plain tuple comparison can raise TypeError on tied scores.
            new_beam.sort(key=lambda x: x[0], reverse=True)
            beam = new_beam[:self.beam_size]
        score, state, words = beam[0]
        return ' '.join(words[:-1])
Example 5
 def test_get_words_with_prefix(self):
     self.vocabtrie.add_word('abc')
     stateIn = kenlm.State()
     stateOut = kenlm.State()
     words_with_probs = self.vocabtrie.get_words_with_prefix(
         'a', self.language_model, stateIn, stateOut).pop(0)
     self.assertEqual(words_with_probs[0], 'abc',
                      'Returned item is not equal')
Example 6
def get_state(context_tokens, lm):
    # An unwritten kenlm.State() is not a valid context, so start from the
    # begin-of-sentence state before scoring anything.
    instate = kenlm.State()
    lm.BeginSentenceWrite(instate)
    if not context_tokens:
        return instate
    for w in context_tokens:
        outstate = kenlm.State()
        lm.BaseScore(instate, w, outstate)
        instate = outstate
    return instate
Example 7
    def get_context_state(self, context, model):
        state_in = kenlm.State()
        state_out = kenlm.State()
        model.NullContextWrite(state_in)  # initialize before scoring
        context = '<s> ' + context
        context_words = context.split()
        for w in context_words:
            model.BaseScore(state_in, w, state_out)  # advance the context state
            state_in = state_out
            state_out = kenlm.State()

        return state_in, state_out
Example 8
    def get_context_state(self, context, model, vocab_id):
        state_in = kenlm.State()
        state_out = kenlm.State()
        model.NullContextWrite(state_in)  # initialize before scoring
        context = '<s> ' + self.format_context(context, vocab_id)
        context_words = context.split()
        for w in context_words:
            model.BaseScore(state_in, w, state_out)  # advance the context state
            state_in = state_out
            state_out = kenlm.State()

        return state_in, state_out
Example 9
def compute_word_logprob(model, current_state, target_word):
    word_pred = 0.0
    if target_word == EOS:
        stateOut = kenlm.State()
        word_pred += model.BaseScore(current_state, str(target_word), stateOut) * LOG10
    else:
        # Character-level scoring: the word is scored one character at a time.
        stateIn = current_state
        for token in list(target_word):
            stateOut = kenlm.State()
            word_pred += model.BaseScore(stateIn, token, stateOut) * LOG10
            stateIn = stateOut
    return word_pred
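LOG10 is never defined in these snippets. Given that Example 23 below computes `scale = 1.0 / np.log10(np.e)` for the same purpose, it is presumably ln 10, which converts KenLM's base-10 log scores into natural logarithms. A sketch under that assumption:

    import math

    # Assumption: LOG10 = ln 10 ≈ 2.302585, since log10(x) * ln(10) == ln(x).
    LOG10 = math.log(10)
    assert abs(math.log10(0.5) * LOG10 - math.log(0.5)) < 1e-12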
Example 10
 def _cond_probs(self, history) -> mx.nd.NDArray:
     startstate = kenlm.State()
     self.lm.NullContextWrite(startstate)
     for word in history:
         endstate = kenlm.State()
         self.lm.BaseScore(startstate, word, endstate)
         startstate = endstate
     # base-10 log score
     # ONLY works on cpu
     return mx.nd.array([
         self.lm.BaseScore(startstate, word, kenlm.State())
         for word in self.vocab
     ])
Example 11
    def get_cond_log_prob(self, sequence):
        # Only the last `order` words can influence an n-gram model's score.
        sequence = sequence[-self.lm.order:]

        in_state = kenlm.State()
        self.lm.NullContextWrite(in_state)
        out_state = kenlm.State()

        lm_prob = 0.0
        for word in sequence:
            lm_prob = self.lm.BaseScore(in_state, word, out_state)
            in_state, out_state = out_state, in_state

        return lm_prob
Example 12
 def eval_logprobs_for_words(self, state, next_words):
     new_state = kenlm.State()
     logprobs = np.empty(len(next_words))
     for next_idx, word_idx in enumerate(next_words):
         logprobs[next_idx] = self.model.base_score_from_idx(state, word_idx, new_state)
     logprobs *= LOG10
     return logprobs
Example 13
 def score_seq_by_word(self, state, words):
     scores = []
     for word in words:
         new_state = kenlm.State()
         scores.append(LOG10 * self.model.base_score_from_idx(state, self.model.vocab_index(word), new_state))
         state = new_state
     return scores
Example 14
def generate_phrase(model, context_toks, length, prefix_logprobs=None, **kw):
    if context_toks[0] == '<s>':
        state, _ = model.get_state(context_toks[1:], bos=True)
    else:
        state, _ = model.get_state(context_toks, bos=False)
    phrase = context_toks[:]
    generated_logprobs = np.empty(length)
    for i in range(length):
        next_words, probs = next_word_probs(model,
                                            state,
                                            phrase[-1],
                                            prefix_logprobs=prefix_logprobs,
                                            **kw)
        if len(next_words) == 0:
            raise GenerationFailedException
        prefix_logprobs = None
        picked_subidx = np.random.choice(len(probs), p=probs)
        picked_idx = next_words[picked_subidx]
        new_state = kenlm.State()
        model.model.base_score_from_idx(state, picked_idx, new_state)
        state = new_state
        word = model.id2str[picked_idx]
        phrase.append(word)
        generated_logprobs[i] = np.log(probs[picked_subidx])
    return phrase[len(context_toks):], generated_logprobs
Example 15
 def next_word_logprobs_raw(self, state, prev_word, prefix_logprobs=None):
     bigrams = self.unfiltered_bigrams
     if prefix_logprobs is not None:
         next_words = []
         prior_logprobs = []
         for logprob, prefix in prefix_logprobs:
             for word, word_idx in self.vocab_trie.items(prefix):
                 next_words.append(word_idx)
                 prior_logprobs.append(logprob)
     else:
         next_words = bigrams.get(prev_word, [])
         if len(next_words) == 0:
             next_words = self.vocab
         next_words = [
             w for w in next_words if w != self.eos and w != self.eop
         ]
     if len(next_words) == 0:
         return [], np.zeros(0)
     new_state = kenlm.State()
     logprobs = np.empty(len(next_words))
     for next_idx, word in enumerate(next_words):
         logprob = self.model.BaseScore(state, word, new_state)
         if prefix_logprobs is not None:
             logprob += prior_logprobs[next_idx]
         logprobs[next_idx] = logprob
     logprobs *= LOG10
     return next_words, logprobs
Example 16
 def score_seq_by_word(self, state, words):
     scores = []
     for word in words:
         new_state = kenlm.State()
         scores.append(LOG10 * self.model.BaseScore(state, word, new_state))
         state = new_state
     return scores
Example 17
 def score_seq(self, state, words):
     score = 0.
     for word in words:
         new_state = kenlm.State()
         score += self.model.BaseScore(state, word, new_state)
         state = new_state
     return score * LOG10, state
Example 18
    def __init__(self, uid,
                 name,
                 order,
                 path,
                 bos,
                 eos):
        """
        A language model scorer (KenLM only).

        :param uid: unique id (int)
        :param name: prefix for features
        :param weights: weight vector (two features: logprob and oov count)
        :param order: n-gram order
        :param bos: a Terminal symbol representing the left boundary of the sentence.
        :param eos: a Terminal symbol representing the right boundary of the sentence.
        :param path: path to a kenlm model (ARPA or binary).
        :return:
        """
        super(StatelessLM, self).__init__(uid, name)
        self._order = order
        self._bos = bos
        self._eos = eos
        self._path = path
        self._model = klm.Model(path)
        self._features = (name, '{0}_OOV'.format(name))

        # get the initial state
        self._initial = klm.State()
        self._model.BeginSentenceWrite(self._initial)
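Examples 26, 28, and 29 below appear to be the scoring methods of this same class. A hypothetical sketch of how the pieces fit together (the `scorer` wiring and `terminals` list are assumptions, not shown in the source):

    # Hypothetical usage: accumulate [log_prob, oov_count] features over a
    # sequence, starting from the begin-of-sentence state built in __init__.
    state = scorer._initial
    features = np.zeros(2)
    for word in terminals:  # Terminal objects exposing a .surface attribute
        f, state = scorer.featurize(word, state)   # per-word score (Example 29)
        features += f
    features += scorer.featurize_final(state)      # end-of-sentence (Example 28)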
Example 19
 def __call__(self, token, state):
     """
     Args:
         token (th.Tensor): V, previous tokens
         state (list[list[State]] or None): LM states
     Return:
         score (Tensor): N x V, LM scores
         state (list[list[State]]), new states
     """
     device = token.device
     token = token.tolist()
     if state is None:
         init_state = kenlm.State()
         self.ngram_lm.BeginSentenceWrite(init_state)
         prev_state = [init_state for _ in range(len(token))]
     else:
         assert len(token) == len(state)
         prev_state = [s[token[i]] for i, s in enumerate(state)]
     scores, states = [], []
     for prev in prev_state:
         score, new_states = self._step(prev)
         scores.append(score)
         states.append(new_states)
     scores = th.stack(scores).to(device)
     return scores, states
Example 20
def get_conditional_logprobs(base_model,
                             context_toks,
                             seq,
                             prefix_logprobs=None):
    import kenlm
    context_toks = ['<s>'] + context_toks
    state, _ = base_model.get_state(context_toks, bos=True)
    possible_word_indices = []
    offset_of_chosen_word = []
    base_logprobs = []

    for i, word in enumerate(seq):
        word_idx = base_model.model.vocab_index(word)
        next_words, logprobs = base_model.next_word_logprobs_raw(
            state,
            context_toks[-1] if i == 0 else seq[i - 1],
            prefix_logprobs=prefix_logprobs)

        # At this point we're past the first word, so no more prefix logprobs.
        prefix_logprobs = None

        # Store results.
        #
        # Note that since what we're scoring was always generated by the main
        # model, there will never be an <unk>.
        possible_word_indices.append(np.asanyarray(next_words))
        offset_of_chosen_word.append(next_words.index(word_idx))
        base_logprobs.append(logprobs)

        # Advance the model state.
        new_state = kenlm.State()
        base_model.model.base_score_from_idx(state, word_idx, new_state)
        state = new_state

    return possible_word_indices, offset_of_chosen_word, base_logprobs
Example 21
 def score_seq(self, state, words):
     score = 0.
     for word in words:
         new_state = kenlm.State()
         score += self.model.base_score_from_idx(state, self.model.vocab_index(word), new_state)
         state = new_state
     return score * LOG10, state
Example 22
    def expand_token(self, prev: Hypothesis, token: str,
                     token_score: float) -> Hypothesis:

        if prev.lm_state is None:
            prev_state = kenlm.State()
            self.model.BeginSentenceWrite(prev_state)
        else:
            prev_state = prev.lm_state

        new_lm_state = kenlm.State()
        token_lm_score = self.model.BaseScore(prev_state, token, new_lm_state)

        hyp = copy.deepcopy(prev)
        hyp.expand_by_token(token, token_score, token_lm_score, new_lm_state)

        return hyp
Example 23
    def get_score(self, cand_parents, cand_syms, lang_model):
        """
        the saved lm model will be called here
        Args:
            cand_parents: last selected top candidates
            cand_syms: last selected top char index
            lang_model: the language model
        Return:
            scores: the lm scores
        """
        scale = 1.0 / np.log10(np.e)  # convert log10 to ln

        num_cands = len(cand_syms)
        scores = np.zeros((num_cands, self.num_syms))
        new_states = np.zeros((num_cands, self.num_syms), dtype=object)
        chars = [str(x) for x in range(self.num_syms)]
        chars[self.sos] = "<s>"
        chars[self.eos] = "</s>"
        chars[0] = "<space>"

        for i in range(num_cands):
            parent = cand_parents[i]
            kenlm_state_list = self.cand_kenlm_states[parent]
            kenlm_state = kenlm_state_list[cand_syms[i]]
            for sym in range(self.num_syms):
                char = chars[sym]
                out_state = kenlm.State()
                score = scale * lang_model.BaseScore(kenlm_state, char, out_state)
                scores[i, sym] = score
                new_states[i, sym] = out_state
        self.cand_kenlm_states = new_states
        return scores
Example 24
 def reset(self):
     """
     Call this function to reset the lm to predict on a new sequence
     """
     kenlm_state = kenlm.State()
     self.lang_model.BeginSentenceWrite(kenlm_state)
     self.cand_kenlm_states = np.array([[kenlm_state] * self.num_syms])
Example 25
def main():
    vt = VocabTrie()

    state_in = kenlm.State()
    state_out = kenlm.State()
    model = kenlm.LanguageModel('resources/lm_word_medium.kenlm')

    vt.add_word('hel')
    vt.add_word('help')
    vt.add_word('hi')
    vt.add_word('hello')
    vt.add_word('hellboy')
    vt.add_word('helen')

    print(vt.contains_word('hell'))

    print(vt.get_words_with_prefix('he', model, state_in, state_out))
Example 26
 def featurize_yield(self, projection):
     """
     :param words: sequence of Terminal objects
     :return: weight
     """
     qa = klm.State()
     qb = klm.State()
     self._model.BeginSentenceWrite(qa)
     log_prob = 0.0
     oov = 0.0
     for word in projection:
         r = self._model.BaseFullScore(qa, word.surface, qb)
         log_prob += r.log_prob
         oov += int(r.oov)
         qa, qb = qb, qa
     log_prob += self._model.BaseScore(qa, self._eos.surface, qb)
     return np.array([log_prob, oov])
Example 27
def batch_advance(lm, inner_states, w, out_states):
    probs = []

    for state in inner_states:
        out_states.append(kenlm.State())
        probs.append(lm.BaseScore(state, w, out_states[-1]))

    return probs
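A hypothetical usage of batch_advance, assuming `lm` is a loaded kenlm.Model; the function appends one freshly written State to out_states per input state:

    # Hypothetical usage: advance three beam states by the word 'the'.
    states = [kenlm.State() for _ in range(3)]
    for s in states:
        lm.BeginSentenceWrite(s)
    out_states = []
    probs = batch_advance(lm, states, 'the', out_states)
    # probs[i] is log10 P('the' | states[i]); out_states[i] is the new state.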
Example 28
 def featurize_final(self, context):
     """
     :param context: a state
     :return:
     """
     out_state = klm.State()
     score = self._model.BaseFullScore(context, self._eos.surface,
                                       out_state)
     return np.array([score.log_prob, float(score.oov)])
Example 29
 def featurize(self, word, context):
     """
     :param word: a Terminal
     :param context: a state
     :returns: weight, state
     """
     out_state = klm.State()
     score = self._model.BaseFullScore(context, word.surface, out_state)
     return np.array([score.log_prob, float(score.oov)]), out_state
Example 30
    def get_trans_prob_use_kenlm(self, *args):
        '''
        Get the transition probability. Scores are log-probabilities, so the
        closer a score is to 0, the higher the transition probability.
        :param args: candidate words, e.g. 中国/钟国/忠国
        :return: e.g. -9.256282567977905
        '''
        word_list = args
        state = kenlm.State()
        state1 = kenlm.State()
        self.model.NullContextWrite(state)

        acc = 0.0
        for index, word in enumerate(word_list):
            # Ping-pong between the two states: even steps read `state`,
            # odd steps read `state1`.
            if index % 2 == 0:
                acc += self.model.BaseScore(state, word, state1)
            else:
                acc += self.model.BaseScore(state1, word, state)
        return acc
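Finally, note the distinction that runs through all of these examples: BeginSentenceWrite conditions the state on <s>, while NullContextWrite starts with no context at all. A minimal sketch of the difference (model path hypothetical):

    import kenlm

    model = kenlm.Model('model.arpa')  # hypothetical path
    bos, null, out = kenlm.State(), kenlm.State(), kenlm.State()
    model.BeginSentenceWrite(bos)   # state conditions on <s>
    model.NullContextWrite(null)    # state carries no context
    print(model.BaseScore(bos, 'the', out))   # log10 P(the | <s>)
    print(model.BaseScore(null, 'the', out))  # log10 P(the)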