Ejemplo n.º 1
0
 def py2hz(self, pinyin):
     """Return one page of conversion candidates for a single pinyin syllable.

     Enough candidates are requested from ``dag`` to fill every page up to
     and including ``self.page``; only the slice belonging to the current
     page (``self.limit`` entries per page) is kept.

     :param pinyin: one pinyin syllable; it is wrapped in a 1-tuple for ``dag``.
     :return: list holding the first path segment of each candidate on the page.
     """
     page_end = self.page * self.limit
     page_start = page_end - self.limit
     candidates = dag(self.dagparams, (pinyin, ), path_num=page_end)
     return [
         candidate.path[0] for candidate in candidates[page_start:page_end]
     ]
def genarate_word_error(sents):
    """Build (sentence, corrupted sentence) pairs by swapping one word.

    For every sentence, one multi-character word from the jieba segmentation
    is picked at random, converted to pinyin with ``lazy_pinyin`` and replaced
    by a different candidate that ``dag`` returns for the same pinyin.

    Args:
        sents: iterable of sentence strings.

    Returns:
        A list of ``(sent, err_sent)`` tuples. Sentences that could not be
        altered (no multi-character word, ``dag`` raised ``KeyError``, or no
        differing candidate) are left out.
    """
    ans = []
    dagparams = DefaultDagParams()
    for sent in sents:
        # Only multi-character words are eligible. Filtering up front, instead
        # of resampling until one turns up, keeps the same uniform choice but
        # cannot loop forever on a sentence that has no such word (and does
        # not raise on an empty segmentation).
        eligible = [word for word in jieba.cut(sent) if len(word) > 1]
        if not eligible:
            continue
        select_word = random.choice(eligible)

        pinyin_list = lazy_pinyin(select_word)  # pinyin of the chosen word
        try:
            result2 = dag(dagparams, pinyin_list, path_num=5, log=True)
        except KeyError:
            continue

        error_word = select_word
        if len(result2) > 1:
            # Choose among candidates that actually differ from the original
            # word; if none differ the word is kept and the sentence is
            # dropped by the final comparison below.
            alternatives = [''.join(item.path) for item in result2]
            alternatives = [alt for alt in alternatives if alt != select_word]
            if alternatives:
                error_word = random.choice(alternatives)

        # Replace the first occurrence of the chosen word in the sentence.
        word_index = sent.find(select_word)
        tail = sent[word_index + len(select_word):]
        err_sent = sent[:word_index] + error_word + tail
        if err_sent != sent:
            ans.append((sent, err_sent))
    return ans
Ejemplo n.º 3
0
def pinyin_2_hanzi(pinyinList):
    """Return the first path segment of the first candidate from ``dag``.

    Up to 10 candidates are requested with ``log=True``; only the leading
    one is used. An empty result raises ``IndexError``.
    """
    from Pinyin2Hanzi import DefaultDagParams, dag

    params = DefaultDagParams()
    candidates = dag(params, pinyinList, path_num=10, log=True)
    # Keep only the first candidate.
    first_candidate = candidates[0]
    return first_candidate.path[0]
Ejemplo n.º 4
0
def change_word_b(word, path_num=6):
    """Swap ``word`` for a randomly chosen candidate with the same pinyin.

    Args:
        word: text to convert to pinyin and back.
        path_num: number of candidates requested from ``dag``.

    Returns:
        The first path segment of a randomly selected candidate, or ``word``
        unchanged when ``dag`` produces no candidates (e.g. for punctuation
        such as ',').
    """
    pinyin_list = lazy_pinyin(word)
    result = dag(dagParams, pinyin_list, path_num, log=True)
    # An empty result used to be detected through a bare ``except`` around
    # randint(0, -1); test for it explicitly so unrelated errors are not hidden.
    if not result:
        return word
    chosen = result[randint(0, len(result) - 1)]
    return chosen.path[0]
Ejemplo n.º 5
0
def pinyin_2_hanzi(pinyinList):
    """Print every conversion candidate for ``pinyinList`` with its score.

    Up to 10 candidates are requested from ``dag`` with ``log=True``; each is
    printed as its score followed by the joined path. Nothing is returned.
    """
    from Pinyin2Hanzi import DefaultDagParams, dag

    params = DefaultDagParams()
    # path_num=10 is the number of candidates requested.
    for candidate in dag(params, pinyinList, path_num=10, log=True):
        print(candidate.score, ''.join(candidate.path))
Ejemplo n.º 6
0
    def pinyin_to_chinese(self, data):
        '''Print the conversion candidates for the given pinyin data.

        Up to 10 candidates are requested from ``dag`` with ``log=True``;
        each is printed as ``<score>:`` followed by its path.

        :param data: pinyin data passed straight to ``dag``
        :return: None
        '''
        params = DefaultDagParams()
        for candidate in dag(params, data, path_num=10, log=True):
            label = str(candidate.score) + ":"
            print(label, candidate.path)
Ejemplo n.º 7
0
def pinyin_2_hanzi(pinyin_str):
    """Convert a whitespace-separated pinyin string via ``dag``.

    Returns the joined path of the single requested candidate, or ``None``
    (after logging the input) when ``dag`` yields nothing.
    """
    syllables = pinyin_str.split()
    params = DefaultDagParams()
    # Only one candidate is requested.
    candidates = dag(params, syllables, path_num=1, log=True)
    if not candidates:
        logger.info("转化有误:" + pinyin_str)
        return None
    return ''.join(candidates[0].path)
Ejemplo n.º 8
0
def pinyin_to_hanzi(pinyin, Topk=5):
    '''Print the top candidates for a pinyin sequence.

    The pinyin-to-text mapping is ambiguous, so there is no one-to-one
    answer; ``Topk`` candidates are requested from ``dag`` (with
    ``log=True``) and each is printed as "<score> <text>".

    :param pinyin: pinyin sequence passed to ``dag``
    :param Topk: number of candidates to request
    :return: None
    '''
    translator = DefaultDagParams()
    result = dag(translator, pinyin, path_num=Topk, log=True)
    for item in result:
        # Path segments may be byte strings (Python 2) or text (Python 3);
        # only byte strings need decoding.
        text = ''.join(
            seg.decode('utf-8') if isinstance(seg, bytes) else seg
            for seg in item.path)
        # A single pre-formatted argument prints identically under both the
        # Python 2 print statement and the Python 3 print function.
        print('%s %s' % (item.score, text))
Ejemplo n.º 9
0
def pinyin_2_hanzi(word):
    """Return same-pinyin alternatives for ``word``.

    When ``Pinyin2Hanzi.is_chinese(word)`` is false the string "Null" is
    returned. Otherwise the word is converted to pinyin and the first path
    segment of each of up to 3 ``dag`` candidates is returned as a list.
    """
    if not Pinyin2Hanzi.is_chinese(word):
        return "Null"
    word_pinyin = lazy_pinyin(word)
    params = DefaultDagParams()
    candidates = dag(params, word_pinyin, path_num=3, log=True)
    return [candidate.path[0] for candidate in candidates]
Ejemplo n.º 10
0
def pinyin_2_hanzi(sentences):
    """Print the pinyin of ``sentences`` and the candidates ``dag`` finds for it.

    The pinyin list is printed first, then one line per candidate showing
    its score and path. Nothing is returned.
    """
    from Pinyin2Hanzi import DefaultDagParams, dag

    params = DefaultDagParams()
    syllables = lazy_pinyin(sentences)
    print(syllables)
    # path_num is the number of candidates requested (3 here).
    for candidate in dag(params, syllables, path_num=3):
        print(candidate.score, candidate.path)
Ejemplo n.º 11
0
 def get(self):
     """Handle a GET request: convert the ``word`` pinyin into candidates.

     Reads two request arguments: ``word`` (pinyin syllables separated by
     single spaces) and ``num`` (how many candidates to request). Writes a
     JSON list with one ``{"score": ..., "path": ...}`` object per candidate.
     """
     # An extra "end" token is appended after the caller's syllables.
     word = self.get_argument('word') + " end"
     # NOTE(review): request arguments presumably arrive as text, so the
     # count is converted before being handed to dag -- confirm framework.
     num = int(self.get_argument('num'))
     syllables = tuple(word.split(" "))
     print(syllables)
     candidates = []
     for item in dag(dagparams, syllables, path_num=num):
         # A dict (not a set literal) keeps score and path paired and is
         # JSON-serialisable; a set holding the path list raises TypeError.
         candidates.append({'score': item.score, 'path': item.path})
         print(item.score, item.path)
     self.write(json.dumps(candidates))
Ejemplo n.º 12
0
 def pinyin_2_hanzi(self, pinyinList):
     """Return the path of the highest-scoring candidate for ``pinyinList``.

     Up to 10 candidates are requested from ``dag`` with ``log=True``. If
     anything goes wrong while picking the best one (including an empty
     result), the error and a retry hint are printed and ``None`` is
     returned.
     """
     params = DefaultDagParams()
     # path_num=10 is the number of candidates requested.
     candidates = dag(params, pinyinList, path_num=10, log=True)
     try:
         scored = [[entry.score, entry.path] for entry in candidates]
         scored.sort(key=itemgetter(0), reverse=True)
         return scored[0][1]
     except Exception as e:
         print(e)
         print("输入异常,请重新输入拼音")
Ejemplo n.º 13
0
def pinyin2hanzi(pinyin_list):
    '''Collect the single-segment conversions for each pinyin entry.

    :param pinyin_list: iterable of pinyin sequences, each passed to ``dag``
    :return: list of candidate paths; only paths with at most one segment
        are kept, so each inner list has length <= 1
    '''
    params = DefaultDagParams()
    entities = []
    for line in pinyin_list:
        for candidate in dag(params, line, path_num=5, log=True):
            segments = candidate.path  # conversion result
            if len(segments) <= 1:
                entities.append(segments)
    return entities
Ejemplo n.º 14
0
 def _py2hz_dag(self, pinyin_list):
     """Run ``dag`` with a candidate budget that shrinks as the input grows.

     Lengths 1, 2 and 3 request 1000, 20 and 10 candidates; any other
     length up to 5 requests 5, up to 7 requests 3, and longer inputs
     request a single candidate. The final ``True`` is passed positionally
     as the fourth ``dag`` argument.
     """
     size = len(pinyin_list)
     exact_budget = {1: 1000, 2: 20, 3: 10}
     if size in exact_budget:
         num = exact_budget[size]
     elif size <= 5:
         num = 5
     elif size <= 7:
         num = 3
     else:
         num = 1
     return dag(self._dagparams, pinyin_list, num, True)
Ejemplo n.º 15
0
def pinyin_to_hanzi(pinyin, Topk=5, Log=True):
    '''Return the ``dag`` candidates for a pinyin sequence.

    The pinyin-to-text mapping is ambiguous, so there is no one-to-one
    answer; ``Topk`` candidates are requested instead. The input is
    printed before conversion.

    :param pinyin: pinyin sequence passed to ``dag``
    :param Topk: number of candidates to request
    :param Log: forwarded to ``dag`` as ``log``
    :return: whatever ``dag`` returns for these arguments
    '''
    print(pinyin)
    params = DefaultDagParams()
    return dag(params, pinyin, path_num=Topk, log=Log)
def transfer_pinyin_to_hanzi_by_dag(sets):
    """
    Convert pinyin to text using DAG mode.

    A single candidate is requested from ``dag``; the path of the last
    item in the result is returned, or ``''`` when the result is empty.

    :param sets: pinyin sequence passed to ``dag``
    :return: the selected candidate path, or an empty string
    :raises Exception: wraps any error raised during conversion
    """
    try:
        converted = ''
        for candidate in dag(dagparams, sets, path_num=1, log=True):
            converted = candidate.path
    except Exception as error:
        raise Exception('error:', error)
    return converted
Ejemplo n.º 17
0
def pinyin_2_hanzi(pinyin_str):
    '''Convert a whitespace-separated pinyin string via ``dag``.

    Example:
    zhao qing shi ding hu qu fang di chan xie hui --- 肇庆市鼎湖区房地产协会

    :param pinyin_str: pinyin syllables separated by whitespace
    :return: the joined path of the single requested candidate, or ``''``
        when ``dag`` yields nothing (the input is logged in that case)
    '''
    pinyin_list = pinyin_str.split()
    dagParams = DefaultDagParams()
    # Only one candidate is requested.
    result = dag(dagParams, pinyin_list, path_num=1, log=True)
    if not result:
        # Log before returning; the log call used to sit after the return
        # statement and could never run.
        logger.info("转化有误:" + pinyin_str)
        return ''
    return ''.join(result[0].path)
Ejemplo n.º 18
0
def get_chengyu(word):
    """Pick a random ``find_chengyu`` result for same-pinyin variants of ``word``.

    ``word`` is converted to pinyin, up to 3 candidates are requested from
    ``dag``, and ``find_chengyu`` is called on the first path segment of
    each candidate. The results are flattened into one pool and a random
    entry is returned.

    Returns:
        A random entry from the pool, or the string "Null" when ``word`` is
        not Chinese (per ``Pinyin2Hanzi.is_chinese``) or the pool is empty.
    """
    if not Pinyin2Hanzi.is_chinese(word):
        return "Null"

    word_pinyin = lazy_pinyin(word)
    dagParams = DefaultDagParams()
    result = dag(dagParams, word_pinyin, path_num=3, log=True)
    word_list = [item.path[0] for item in result]

    # NOTE(review): find_chengyu is assumed to return an iterable of entries
    # per word, since its results were chained together -- confirm.
    pool = list(chain.from_iterable(
        find_chengyu(avg_word) for avg_word in word_list))
    if not pool:
        # randint(0, -1) used to raise ValueError here; fall back to the
        # sentinel the function already uses for "no answer".
        return "Null"
    return random.choice(pool)
Ejemplo n.º 19
0
def pinyin_2_hanzi(pinyinList):
    """Return the joined text of every candidate ``dag`` finds for ``pinyinList``.

    A very large candidate count (500000) is requested, with ``log=True``.
    """
    from Pinyin2Hanzi import DefaultDagParams, dag

    params = DefaultDagParams()
    # path_num is the number of candidates requested.
    candidates = dag(params, pinyinList, path_num=500000, log=True)
    converted = []
    for candidate in candidates:
        converted.append(''.join(candidate.path))
    return converted
Ejemplo n.º 20
0
def get_word_by_pinyin(s0):
    """Return the joined text of each candidate sharing the pinyin of ``s0``.

    ``get_pinyin(s0)`` supplies the pinyin and ``PATH_NUM`` the number of
    candidates requested from ``dag``.
    """
    candidates = dag(__param, get_pinyin(s0), path_num=PATH_NUM)
    return [''.join(candidate.path) for candidate in candidates]
Ejemplo n.º 21
0
# coding: utf-8
from __future__ import (print_function, unicode_literals)

import sys
sys.path.append('..')

from Pinyin2Hanzi import DefaultDagParams
from Pinyin2Hanzi import dag

dagparams = DefaultDagParams()

# Convert each sample with dag's default settings, print every candidate as
# "<score> <segments joined by '/'>", then a separator line of asterisks.
for sample in (
        ['wo'],
        ['ni', 'hao'],
        ['ni', 'bu', 'zhi', 'dao', 'de', 'shi'],
):
    result = dag(dagparams, sample)
    for item in result:
        print(item.score, '/'.join(item.path))

    print(20 * '*')

result = dag(dagparams, ['ni', 'bu', 'zhi', 'dao', 'de', 'shi'],
             path_num=2,
Ejemplo n.º 22
0
# coding: utf-8
from __future__ import (print_function, unicode_literals)

import sys
sys.path.append('..')

from Pinyin2Hanzi import DefaultDagParams
from Pinyin2Hanzi import dag

dagparams = DefaultDagParams()

# Print the single candidate dag returns for a full phrase and for several
# of its sub-sequences.
for sample in (
        [u'ti', u'chu', u'le', u'jie', u'jve', u'fang', u'an'],
        [u'ti', u'chu', u'le'],
        ['jie', 'jve', 'fang', 'an'],
        ['jie', 'jve'],
        ['fang', 'an'],
):
    print(dag(dagparams, sample, path_num=1))
Ejemplo n.º 23
0
# coding: utf-8
from __future__ import (print_function, unicode_literals)

import sys
sys.path.append('..')

from Pinyin2Hanzi import DefaultDagParams
from Pinyin2Hanzi import dag

dagparams = DefaultDagParams()

# Convert each sample with dag's default settings, print every candidate as
# "<score> <segments joined by '/'>", then a separator line of asterisks.
for sample in (
        ['wo'],
        ['ni', 'hao'],
        ['ni', 'bu', 'zhi', 'dao', 'de', 'shi'],
):
    result = dag(dagparams, sample)
    for item in result:
        print(item.score, '/'.join(item.path))

    print(20*'*')

# Final query: two candidates with log=True; the result is only stored.
result = dag(dagparams, ['ni', 'bu', 'zhi', 'dao', 'de', 'shi'], path_num=2, log=True)
Ejemplo n.º 24
0
# coding: utf-8
from __future__ import (print_function, unicode_literals)

import sys
sys.path.append('..')

from Pinyin2Hanzi import DefaultDagParams
from Pinyin2Hanzi import dag

dagparams = DefaultDagParams()

# Print the single candidate dag returns for a full phrase and for several
# of its sub-sequences.
for sample in (
        [u'ti', u'chu', u'le', u'jie', u'jve', u'fang', u'an'],
        [u'ti', u'chu', u'le'],
        ['jie', 'jve', 'fang', 'an'],
        ['jie', 'jve'],
        ['fang', 'an'],
):
    print( dag(dagparams, sample, path_num=1) )



Ejemplo n.º 25
0
# Best-effort switch of the interpreter's default encoding to UTF-8.
# Where ``reload`` is not a builtin (NameError) or ``sys`` has no
# ``setdefaultencoding`` (AttributeError) the step is skipped; any other
# failure is no longer silently swallowed.
try:
    reload(sys)
    sys.setdefaultencoding('utf-8')
except (NameError, AttributeError):
    pass

if __name__ == '__main__':
    # Input: one record per line, "<hanzi>\t<pinyin syllables joined by '#'>".
    # Output: the same line with "\t<text>:<score>" appended per candidate.
    data_dir = './original_corpus.txt'
    dst_fpath = './pinyin2hanzi_dag.txt'

    dagparams = DefaultDagParams()

    with open(dst_fpath, 'w') as f_write, open(data_dir, 'r') as f_read:
        for raw_line in f_read:
            line = raw_line.strip('\n')
            fields = line.split('\t')
            hanzi = fields[0]
            pinyin = fields[1]
            try:
                # Request 2 candidates per record.
                for candidate in dag(dagparams, pinyin.split('#'), path_num=2):
                    line += '\t' + ''.join(candidate.path) + ':' + str(candidate.score)
            except Exception as e:
                print(e)
            finally:
                # The line is written even when conversion failed part-way.
                f_write.write(line + '\n')
Ejemplo n.º 26
0
from Pinyin2Hanzi import DefaultDagParams
from Pinyin2Hanzi import dag

dagparams = DefaultDagParams()

## 3 candidates for the single syllable 'pin'
# The trailing comma matters: ('pin') is just the string 'pin', whereas
# ('pin',) is the one-element tuple of syllables that is intended here.
result = dag(dagparams, ('pin',), path_num=3)
for item in result:
    print(item.score, item.path)
# NOTE(review): the sample output below appears to belong to the
# 'ni bu zhi dao de shi' query (see the commented-out call further down),
# not to the 'pin' query above.
''' 输出
0.08117536840088911 ['你不知道', '的是']
0.04149191639287887 ['你不知道', '的诗']
'''

## 2 candidates, using log scores
# result = dag(dagparams, ('ni', 'bu', 'zhi', 'dao', 'de', 'shi'), path_num=2, log=True)
# for item in result:
#     print(item.score, item.path)
''' 输出
-2.5111434226494866 ['你不知道', '的是']
-3.1822566564324477 ['你不知道', '的诗']
'''

## 1 candidate
# print( dag(dagparams, ['ti', 'chu', 'le', 'bu', 'cuo', 'de', 'jie', 'jve', 'fang', 'an'], path_num=1) )
'''输出
[< score=0.0017174549839096384, path=['提出了', '不错', '的', '解决方案'] >]
'''
# Start from the unmodified input; raw_word is defined outside this excerpt.
change_word = raw_word

# NOTE(review): alpha and beta are not referenced in the visible part of the
# loop below -- presumably weights used further down; confirm.
alpha = 1
beta = 1

for i in range(len(raw_word)):
    ch = raw_word[i]
    chacha = model.predict(ch, k=2)
    if (chacha[0][0] == '__label__辱骂'):
        P1 = chacha[1][0]
    else:
        P1 = chacha[1][1]
    pinyin = lazy_pinyin(ch)
    max_point = 0
    change_ch = ch
    result = dag(dagParams, pinyin, path_num=5, log=True)
    for j in range(5):
        if len(result) > j:
            ss = ""
            for c in result[j].path:
                ss = c
            chacha = model.predict(ss, k=2)
            if (chacha[0][0] == '__label__辱骂'):
                P2 = chacha[1][0]
            else:
                P2 = chacha[1][1]
            if ss == ch: continue
            four_dis = distance([ch], [ss])
            all_dis = 3.0 / 14 * (
                1 - four_dis['normalized_levenshtein'][0]) + 1.0 / 7 * (
                    1 - four_dis['jaccard_word'][0]) + 3.0 / 14 * (