コード例 #1
0
def gen_pinyin2hanzi(pinyin, num=5):
    """Decode a pinyin sequence into candidate hanzi strings.

    Args:
        pinyin: Observation sequence handed to ``viterbi``.
        num: Number of paths requested from ``viterbi`` (``path_num``).

    Returns:
        A dict mapping each candidate, with its path joined into one string,
        to that candidate's score (``viterbi`` is called with ``log=True``).
        If two candidates join to the same string, the later one wins.
    """
    candidates = viterbi(hmm_params=DefaultHmmParams(),
                         observations=pinyin,
                         path_num=num,
                         log=True)
    return {''.join(candidate.path): candidate.score
            for candidate in candidates}
コード例 #2
0
def top_k_transform(importance_score, list_of_texts, porpotion, new_word_dictionary, black_list_word):
    """Replace the highest-scoring Chinese tokens of a text with look-alike words.

    The text is tokenized, the ``k`` positions with the largest importance
    scores are selected, and each selected token that ``is_Chinese`` accepts
    is converted to pinyin, decoded back to candidate hanzi with ``viterbi``
    and replaced by the word chosen by ``calculate_similarity``.

    Args:
        importance_score: Sequence of scores, indexed like the token list
            produced by ``tokenize(list_of_texts).split(' ')``.
        list_of_texts: The text to transform (passed to ``tokenize``).
        porpotion: Fraction of tokens to consider; ``k`` is
            ``int(len(tokens) * porpotion) + 1``.
        new_word_dictionary: Dict counting how often each replacement word
            was produced. Mutated in place.
        black_list_word: List of replacement words that reached the usage
            threshold. Mutated in place.

    Returns:
        The transformed text (tokens joined without separator), or the
        original ``list_of_texts`` if no replacement succeeded.
    """
    import logging

    # Pinyin syllables that are randomly swapped for a similar-sounding
    # spelling before decoding. Option order is kept as-is so a seeded RNG
    # picks the same alternatives as before.
    confusable_pinyin = {
        'ni': ('li', 'ni'),
        'ta': ('ta', 'te'),
        'cao': ('ca', 'cao'),
        'ma': ('me', 'ma'),
        'si': ('shi', 'si'),
    }
    # A replacement word is moved to the blacklist once it has been produced
    # this many times.
    blacklist_threshold = 20

    hmmparams = DefaultHmmParams()  # HMM pinyin -> hanzi model

    target_text = tokenize(list_of_texts).split(' ')
    k = int(len(target_text) * porpotion) + 1
    # Rank positions instead of values: looking each value up again with
    # list.index() yields the first position for every tied score, so tied
    # tokens would be processed repeatedly while others are never reached.
    top_k_score_index = heapq.nlargest(
        k,
        range(len(importance_score)),
        key=lambda position: importance_score[position],
    )

    for index in top_k_score_index:
        if not is_Chinese(target_text[index]):
            continue

        # Snapshot of the text as it is before this token is modified.
        gedit_text = copy.deepcopy(list_of_texts)

        pinyin_of_target_text = lazy_pinyin(target_text[index])
        if (len(pinyin_of_target_text) == 1
                and pinyin_of_target_text[0] in confusable_pinyin):
            options = confusable_pinyin[pinyin_of_target_text[0]]
            pinyin_of_target_text = [random.choice(options)]

        try:
            # Pinyin back to candidate hanzi sequences.
            hanzi_of_target_test = viterbi(hmm_params=hmmparams,
                                           observations=pinyin_of_target_text,
                                           path_num=10)

            # Greedily pick the candidate judged most similar to the
            # original word.
            # NOTE(review): `i` is not defined in this function. Unless a
            # module-level `i` exists, this call raises NameError, which the
            # handler below swallows, so no replacement is ever applied.
            # Confirm the intended third argument.
            m_destination_word = calculate_similarity(
                target_text, index, list_of_texts[i], hanzi_of_target_test,
                gedit_text, black_list_word)

            target_text[index] = m_destination_word
            list_of_texts = ''.join(target_text)

            # Record the new word; once it has been used
            # `blacklist_threshold` times, move it to the blacklist.
            usage_count = new_word_dictionary.get(m_destination_word, 0) + 1
            if usage_count < blacklist_threshold:
                new_word_dictionary[m_destination_word] = usage_count
            else:
                new_word_dictionary.pop(m_destination_word, None)
                black_list_word.append(m_destination_word)
        except Exception:
            # Best effort: a token that cannot be transformed is left as-is.
            # Log at debug level so the failure is discoverable.
            logging.getLogger(__name__).debug(
                'top_k_transform: skipped token at index %s', index,
                exc_info=True)
    return list_of_texts
コード例 #3
0
ファイル: httpHandler.py プロジェクト: luoaijun/datashub-api
 def get(self):
     """Handle a GET request: decode the pinyin in ``word`` into candidates.

     Reads the ``word`` argument (space-separated syllables, with an extra
     ``end`` token appended) and the ``num`` argument (number of paths),
     runs ``viterbi`` with log scoring, prints each candidate and writes a
     JSON array of ``{"score": ..., "path": ...}`` objects to the response.
     """
     word = self.get_argument('word') + " end"
     # NOTE(review): get_argument presumably returns the raw query string
     # (Tornado-style handler); convert so viterbi gets a numeric path count.
     num = int(self.get_argument('num'))
     candidates = []
     result = viterbi(hmm_params=hmmparams,
                      observations=tuple(word.split(" ")),
                      path_num=num,
                      log=True)
     for item in result:
         # Use a dict per candidate: the previous set literal could not be
         # encoded by json.dumps.
         candidates.append({'score': item.score, 'path': item.path})
         print(item.score, item.path)
     self.write(json.dumps(candidates))
コード例 #4
0
def transfer_pinyin_to_hanzi_by_hmm(sets):
    """
    Convert pinyin to hanzi using the HMM decoder.

    :param sets: observation sequence passed to ``viterbi``
    :return: ``path`` of the last candidate returned (one path is
             requested), or ``''`` when no candidate comes back
    :raises Exception: wraps any error raised while decoding
    """
    best_path = ''
    try:
        decoded = viterbi(hmm_params=hmmparams, observations=sets, path_num=1, log=True)
        for candidate in decoded:
            best_path = candidate.path
    except Exception as error:
        raise Exception('error:', error)
    return best_path
コード例 #5
0
ファイル: pytest.py プロジェクト: mosincos/machinelearning
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from pypinyin import pinyin, lazy_pinyin, Style
from Pinyin2Hanzi import DefaultHmmParams
from Pinyin2Hanzi import viterbi

# Round-trip check: hanzi -> pinyin -> hanzi should reproduce the input text.
txt = u'锄禾日当午'

py = lazy_pinyin(txt)

hmmparams = DefaultHmmParams()
result = viterbi(hmm_params=hmmparams, observations=py, path_num = 1)
# Start from an empty string so that an empty result is reported as a
# mismatch below instead of raising NameError.
txt_rtn = u''
for item in result:
  txt_rtn = u''.join(item.path)

# Single-argument print(...) behaves the same on Python 2 and Python 3.
if txt == txt_rtn:
  print(u'OK')
else:
  print(u'Error: %s -> %s -> %s' % (txt, py, txt_rtn))
コード例 #6
0
    def emission(self, state, observation):
        ''' Emission lookup: state (hanzi) -> observation (pinyin).

        Returns the entry of ``self.emission_probability`` for ``state``
        and then ``observation``.
        '''
        per_state = self.emission_probability[state]
        return per_state[observation]

    def transition(self, from_state, to_state):
        ''' Transition lookup: state -> state.

        Returns the entry of ``self.transition_probability`` for
        ``from_state`` and then ``to_state``.
        '''
        outgoing = self.transition_probability[from_state]
        return outgoing[to_state]

    def get_states(self, observation):
        ''' Return the candidate states for an observation.

        This implementation ignores ``observation`` and always returns
        ``self.states``.
        '''
        candidate_states = self.states
        return candidate_states


# Decode the same observation sequence twice, requesting 10 and then 2
# paths, and print a separator line after each batch of results.
for path_num in (10, 2):
    result = viterbi(hmm_params=HmmParams(),
                     observations=('normal', 'cold', 'dizzy'),
                     path_num=path_num,
                     log=False)
    for item in result:
        print(item.score, item.path)

    print(20 * '--')
コード例 #7
0
# coding: utf-8
from __future__ import (print_function, unicode_literals)

import sys
sys.path.append('..')

from Pinyin2Hanzi import DefaultHmmParams
from Pinyin2Hanzi import viterbi

hmmparams = DefaultHmmParams()

# Decode ('ni', 'hao', 'a') with log scoring, first asking for 5 paths and
# then for 2; each batch of candidates is followed by a separator line.
for path_num in (5, 2):
    result = viterbi(hmm_params=hmmparams,
                     observations=('ni', 'hao', 'a'),
                     path_num=path_num,
                     log=True)
    for item in result:
        print(item.score, '/'.join(item.path))

    print(20 * '--')

result = viterbi(hmm_params=hmmparams,
                 observations=('chuang', 'qian', 'ming', 'yve', 'guang'),
コード例 #8
0
        return self.start_probability[state]

    def emission(self, state, observation):
        ''' Emission lookup: state (hanzi) -> observation (pinyin).

        Indexes ``self.emission_probability`` by ``state`` and then by
        ``observation`` and returns that entry.
        '''
        observation_table = self.emission_probability[state]
        return observation_table[observation]

    def transition(self, from_state, to_state):
        ''' Transition lookup: state -> state.

        Indexes ``self.transition_probability`` by ``from_state`` and then
        by ``to_state`` and returns that entry.
        '''
        successor_table = self.transition_probability[from_state]
        return successor_table[to_state]

    def get_states(self, observation):
        ''' Return the states considered for an observation.

        ``observation`` is not consulted here; ``self.states`` is returned
        unchanged.
        '''
        all_states = self.states
        return all_states


# Decode the same observation sequence with 10, 2 and 1 requested paths,
# printing a separator line between consecutive batches (none after the last).
for batch_number, path_num in enumerate((10, 2, 1)):
    if batch_number:
        print(20*'--')

    result = viterbi(hmm_params=HmmParams(),
                     observations=('normal', 'cold', 'dizzy'),
                     path_num=path_num,
                     log=False)
    for item in result:
        print(item.score, item.path)
コード例 #9
0
ファイル: Local_HMM.py プロジェクト: hjfzzm/Input-Method
def interactive(client, addr):
    """Serve pinyin-to-hanzi candidate requests for one connected client.

    Loads a word list and a vocabulary mapping from ``Files/``, then loops:
    receives a message, takes its second tab-separated field as pinyin
    input, builds a numbered candidate list and sends it back prefixed with
    ``send\\t``. The loop ends, and ``client`` is closed, on the first
    exception raised outside the inner decoding section.

    Args:
        client: Connection object providing ``recv``, ``send`` and ``close``
            (presumably a socket; confirm against the caller).
        addr: Peer address, used only in the disconnect log message.
    """
    # re_string is rebound below; pyt is only read (pyt.scan).
    global re_string, pyt
    # Word list: one stripped line per entry.
    seq = []
    with open("Files/simple_words.txt", "r", encoding='UTF-8') as read:
        for line in read:
            line = line.strip()
            seq.append(line)
    # Vocabulary: each line is split on ":"; first field -> second field.
    voca = {}
    with open("Files/vocabulary.txt", "r", encoding='UTF-8') as read:
        for line in read:
            line = line.strip().split(":")
            voca[line[0]] = line[1]

    while True:
        try:
            text = client.recv(4096).decode('UTF-8')
            # Keep the second tab-separated field. A message without a tab
            # raises IndexError here, which the outer handler treats as a
            # disconnect.
            text = text.split('\t')[1]
            re_string = ""
            if text == "" or text == "send":
                # Nothing to decode: reply with an empty candidate list.
                text = 'send\t' + ""
                client.send(text.encode('UTF-8'))
                print("Text is NULL but send")
            else:
                # Drops the last four characters whenever "send" occurs
                # anywhere in the text (presumably a trailing "send" marker;
                # verify against the client protocol).
                if "send" in text:
                    text = text[0:len(text) - 4]
                words = pyt.scan(str(text))
                # Collect the usable syllables as the observation tuple.
                re_tup = tuple()
                for item in words:
                    if "invalid" not in item:
                        # tmp += item + "\'"
                        # Items that is_pinyin rejects are passed through
                        # simplify_pinyin first.
                        if not is_pinyin(item):
                            item = simplify_pinyin(item)
                        tmp = (item, )
                        re_tup += tmp
                print("Recv: ", end="")
                print(re_tup)
                re_string = ""
                try:
                    re_string = ""
                    # Up to three candidates for the whole syllable sequence.
                    result = viterbi(hmm_params=hmm,
                                     observations=re_tup,
                                     path_num=3,
                                     log=True)
                    # n is the running candidate number shown to the client.
                    n = 0
                    for item in result:
                        n += 1
                        string_t = str(n) + "."
                        re_string += string_t
                        for word in item.path:
                            re_string += word
                        re_string += "\n"
                    # tmp becomes the first candidate's path joined into one
                    # string (empty if there were no candidates).
                    tmp = ""
                    for item in result:
                        for word in item.path:
                            tmp += word
                        break
                    # Append word-list entries that start with tmp and are
                    # longer than it; stop once the numbering reaches 6.
                    for i in range(0, len(seq)):
                        if tmp in seq[i][0:len(tmp)] and len(
                                seq[i]) > len(tmp):
                            n += 1
                            string_t = str(n) + "."
                            re_string += string_t + seq[i] + "\n"
                            if n == 6:
                                break
                    # Append the vocabulary entry for the raw input, if any.
                    if text in voca.keys():
                        n += 1
                        string_t = str(n) + "."
                        re_string += string_t + voca[text] + "\n"
                    # Candidates for the first syllable alone; an empty
                    # re_tup raises IndexError here and ends up in the
                    # "Recv Error" handler.
                    dan_tup = (re_tup[0], )
                    result = viterbi(hmm_params=hmm,
                                     observations=dan_tup,
                                     path_num=100,
                                     log=True)
                    for item in result:
                        n += 1
                        # Numbering wraps: a count of 9 is reset to 1.
                        if n == 9:
                            n = 1
                        string_t = str(n) + "."
                        re_string += string_t
                        for word in item.path:
                            re_string += word
                        re_string += "\n"
                    re_string = "send\t" + re_string
                    client.send(re_string.encode('UTF-8'))
                    print(re_string)
                except:
                    # Any failure while building or sending the reply is only
                    # logged; no reply is sent and the loop continues.
                    print("Recv Error")
        except:
            # Any other failure (receive, decode, missing tab, empty-reply
            # send) ends the session for this client.
            print(str(addr) + 'is out')
            client.close()
            break
コード例 #10
0
ファイル: TestHMM.py プロジェクト: luoaijun/datashub-api
from Pinyin2Hanzi import DefaultHmmParams
from Pinyin2Hanzi import viterbi

hmmparams = DefaultHmmParams()

# Run the same 2-candidate decode twice: first without the `log` argument,
# then with log=True (log scoring).
#
# Output recorded for the run without `log`:
#   1.3155294593897203e-08 ['你', '知', '不', '知', '道']
#   3.6677865125992192e-09 ['你', '只', '不', '知', '道']
#
# Output recorded for the run with log=True:
#   -18.14644152864202 ['你', '知', '不', '知', '道']
#   -19.423677486918002 ['你', '只', '不', '知', '道']
for extra_kwargs in ({}, {'log': True}):
    result = viterbi(hmm_params=hmmparams,
                     observations=('ni', 'zhi', 'bu', 'zhi', 'dao'),
                     path_num=2,
                     **extra_kwargs)
    for item in result:
        print(item.score, item.path)

## 2个候选,使用对数打分
# result = viterbi(hmm_params=hmmparams, observations=('ni', 'zhii', 'bu', 'zhi', 'dao'), path_num = 2, log = True)
# for item in result: