Example #1
0
  def prefixs_for_term(self, term):
    """
    Get prefixs for TERM.

    For every leading slice of the lower-cased TERM, and of every token
    returned by self.normalize() for it, collect the slice itself, its
    pinyin first letters and its toneless pinyin.

    Returns the collected strings as a de-duplicated list (order is
    whatever list(set(...)) yields).
    """
    def expand(text):
      # All leading slices of TEXT plus their two pinyin renderings.
      # The comprehension variable is deliberately not named like the loop
      # index: on Python 2 a list comprehension leaks its variable.
      found = []
      for end in xrange(1, len(text) + 1):
        word = text[:end]
        found.append(word)
        for style in (pypinyin.FIRST_LETTER, pypinyin.NORMAL):
          found.append(''.join([py[0] for py in pinyin(word, style=style)]).lower())
      return found

    # Normalization
    term = term.lower()

    # Prefixs for the whole term, then for each normalized token
    prefixs = expand(term)
    for token in self.normalize(term):
      prefixs.extend(expand(token))

    return list(set(prefixs))
 def get_pinyin(self):
     """
     Build the pinyin combinations for self.names[0] and self.names[1].

     Each name is converted with heteronym=True (toneless style), its
     readings are merged by self.combination with '' as separator, and the
     two merged results are finally merged with ' ' as separator.
     """
     readings = [pinyin(self.names[idx], style=pypinyin.NORMAL, heteronym=True)
                 for idx in (0, 1)]
     merged = [self.combination(self, reading, '') for reading in readings]
     return self.combination(self, merged, ' ')
Example #3
0
def test_zh_and_en():
    """Mixed Chinese and English input."""
    hans = '中心'
    chinese_part = [['zh\u014dng'], ['x\u012bn']]
    # The Latin tail is accepted either as one chunk or split per letter.
    try:
        assert pinyin(hans + 'abc') == chinese_part + [['abc']]
    except AssertionError:
        assert pinyin(hans + 'abc') == chinese_part + [['a'], ['b'], ['c']]
Example #4
0
def test_errors_callable():
    """``errors`` accepts a plain function as well as a callable object."""
    def foobar(chars):
        return 'a' * len(chars)

    class Foobar(object):
        def __call__(self, chars):
            return 'a' * len(chars)

    repeat = 5
    text = 'あ' * repeat
    expected = [['a' * repeat]]
    for handler in (foobar, Foobar()):
        assert pinyin(text, errors=handler) == expected
Example #5
0
def test_others():
    """Miscellaneous inputs, checked with the default style."""
    cases = (
        # empty string
        ('', []),
        # a single Chinese character
        ('營', [['y\xedng']]),
        # a three-character word
        ('中国人', [['zh\u014dng'], ['gu\xf3'], ['r\xe9n']]),
        # Japanese kana
        ('の', [['\u306e']]),
        # a Chinese character without a reading / not yet assigned
        ('\u9fff', [['\u9fff']]),
    )
    for text, expected in cases:
        assert pinyin(text) == expected
Example #6
0
def get_homophones_by_char(input_char):
    """
    Find the characters that are read like the given one.

    :param input_char: text whose first toneless reading is the target
    :return: list of characters in U+4E00..U+9FA5 whose first toneless
             reading equals that target
    """
    # The target reading does not change while scanning, so look it up once
    # instead of once per candidate code point.
    target = pinyin(input_char, style=pypinyin.NORMAL)[0][0]
    # The basic CJK Unified Ideographs range is 0x4E00-0x9FA5, i.e. the
    # commonly cited 20902 characters.
    return [
        chr(code)
        for code in range(0x4e00, 0x9fa6)
        if pinyin([chr(code)], style=pypinyin.NORMAL)[0][0] == target
    ]
Example #7
0
def test_custom_style_with_decorator():
    """A style registered through the decorator is applied to every reading."""
    style_value = 'test_custom_style_with_decorator'

    @register(style_value)
    def func(pinyin, **kwargs):
        return pinyin + str(len(pinyin))

    hans = '北京'
    # Expected value: the default-style result with func applied to each item.
    expected_pinyin_s = [[func(py) for py in readings]
                         for readings in pinyin(hans)]

    assert pinyin(hans, style=style_value) == expected_pinyin_s
Example #8
0
def lang_zh(text):
    """
    Segment text line by line and attach a pinyin string to every word.

    Returns one list per line; each entry is [word, reading] where reading
    joins the first pinyin of every syllable (style=0) with "'".
    """
    return [
        [[word, "'".join(syl[0] for syl in pypinyin.pinyin(word, style=0))]
         for word in jieba.cut(line)]
        for line in text.split('\n')
    ]
    def _du(self, _request, _rdata):
        """
        Copy the DeviceUser record named by _request["user_uuid"] into _rdata.

        Sets API_ERR.NO_PARA when "user_uuid" is missing and API_ERR.NO_OBJECT
        when no record is found; _rdata is left untouched in both cases.
        "user_password" is removed from the record unless
        _request["return_password"] is truthy. Two derived keys are added:
        "pinyinname0" (joined lazy_pinyin of the full name) and "pinyinname1"
        (joined pinyin initials). Both are "" when the record has no
        "user_fullname".
        """
        if "user_uuid" not in _request:
            self.setErrorCode(API_ERR.NO_PARA)
            logging.error("Error for no para: %s.", (str(_request)))
            return

        _o = redis_hash_to_dict(self.application.redis, DeviceUser, _request["user_uuid"])

        logging.info(_o)

        if _o is None:
            self.setErrorCode(API_ERR.NO_OBJECT)
            logging.error("Error for no user uuid: %s." % (_request["user_uuid"]))
            return

        # not return the password default
        return_password = False
        if "return_password" in _request:
            return_password = _request["return_password"]
        if not return_password:
            del _o["user_password"]

        _fn = _o.get("user_fullname")
        if _fn is not None and not isinstance(_fn, unicode):
            _fn = _fn.decode("utf-8")

        _rdata.update(_o)
        if _fn is None:
            # A record without a full name used to be handed to
            # lazy_pinyin()/pinyin() as None, which expect text.
            _rdata["pinyinname0"] = ""
            _rdata["pinyinname1"] = ""
        else:
            _rdata["pinyinname0"] = "".join(lazy_pinyin(_fn))
            _rdata["pinyinname1"] = "".join(itertools.chain.from_iterable(pinyin(_fn, style=pypinyin.INITIALS)))

        return
    def _du(self):
        """
        Answer a request whose JSON body names a DeviceUser by "user_uuid".

        Sets API_ERR.NO_PARA when "user_uuid" is missing/empty and
        API_ERR.NO_OBJECT when no record is found. Otherwise the record is
        merged into self.getReturnData(), without "user_password" unless the
        request's "return_password" is truthy, together with "pinyinname0"
        (joined lazy_pinyin of the full name), "pinyinname1" (joined pinyin
        initials) and "team" (the AppInfo record of the configured app_uuid).
        Both pinyin keys are "" when the record has no "user_fullname".
        """
        _request = json.loads(self.request.body)

        _user_uuid = _request.get("user_uuid")
        if not _user_uuid:
            self.setErrorCode(API_ERR.NO_PARA)
            return

        _o = redis_hash_to_dict(self.application.redis, DeviceUser, _user_uuid)
        if not _o:
            self.setErrorCode(API_ERR.NO_OBJECT)
            return

        # not return the password default
        if not _request.get("return_password", False):
            del _o["user_password"]

        _fn = _o.get("user_fullname")
        if _fn is not None and not isinstance(_fn, unicode):
            _fn = _fn.decode("utf-8")

        _rdata = self.getReturnData()
        _rdata.update(_o)
        if _fn is None:
            # A record without a full name used to be handed to
            # lazy_pinyin()/pinyin() as None, which expect text.
            _rdata["pinyinname0"] = ""
            _rdata["pinyinname1"] = ""
        else:
            _rdata["pinyinname0"] = "".join(lazy_pinyin(_fn))
            _rdata["pinyinname1"] = "".join(itertools.chain.from_iterable(pinyin(_fn, style=pypinyin.INITIALS)))

        _app_uuid = _get_config().get("team").get("app_uuid")
        _team = redis_hash_to_dict(self.application.redis, AppInfo, _app_uuid)
        _rdata.update({"team": _team})
        return
Example #11
0
 def get_following_users(self, user):
     """
     Yield-based lookup of the users that `user` follows.

     Reads the "following" list of the follower document, resolves every
     non-empty entry with self.find_user(entry, True) and returns (through
     gen.Return) the results sorted by the TONE2 pinyin of "real_name";
     entries without a real name sort as the empty string.
     """
     def pinyin_of_real_name(info):
         real_name = ("real_name" in info and info["real_name"]) or ""
         return pinyin(to_unicode(real_name), style=TONE2)

     doc = yield self._db.followers.find_one({"user": user}, {"_id":0, "following":1})
     found = []
     if doc and "following" in doc:
         found = yield [self.find_user(entry, True) for entry in doc["following"] if entry]
     raise gen.Return(sorted(found, key=pinyin_of_real_name))
Example #12
0
def _get_pinyin_all(existing_combinations, characters):
    """
    Get all combinations of pinyin of some chinese characters as list, in a 
    recurrence way, since format of result from pinyin is [['a'], ['b']]
    So a combination of two level loop is needed to get all the pinyin. 
    :param existing_combinations:  Existing combinations, for already calculated characters. 
    :param characters: Characters to get combination of pinyin 
    :return:  A flat list of all combinations of pinyin for 多音字
    """
    first_character, other_characters = characters[0:1], characters[1:]
    if len(first_character) > 0:
        py = pinyin(first_character, style=pypinyin.FIRST_LETTER, heteronym=True)
        new_existing = []
        for p in py:
            for a in p:
                if len(existing_combinations) > 0:
                    for e in existing_combinations:
                        ne = e[:]
                        ne.append(a)
                        new_existing.append(ne)
                else:
                    ne = existing_combinations[:]
                    ne.append(a)
                    new_existing.append(ne)
        return _get_pinyin_all(new_existing, other_characters)
    return existing_combinations
Example #13
0
def addPinyin(sometext):
    """
    Print and return the pinyin of sometext, one reading per syllable.

    Only the first reading of each syllable is used (heteronym=True merely
    makes pinyin() list the alternatives). Readings are separated by single
    spaces and trailing whitespace is stripped.
    """
    mylist = pinyin(sometext, heteronym=True)
    # join() replaces the quadratic `+=` loop and the local that shadowed the
    # builtin `str`; rstrip() keeps the original trailing-whitespace handling.
    text = u' '.join(pp[0] for pp in mylist).rstrip()
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print(text)
    return text
Example #14
0
def test_errors():
    """Table-driven check of the ``errors`` argument: (text, kwargs, expected)."""
    cases = (
        ('啊', {'style': TONE2}, [['a']]),
        ('啊a', {'style': TONE2}, [['a'], ['a']]),
        # non-Chinese characters without pinyin
        ('⺁', {'style': TONE2}, [['\u2e81']]),
        ('⺁', {'style': TONE2, 'errors': 'ignore'}, []),
        ('⺁', {'style': TONE2, 'errors': 'replace'}, [['2e81']]),
        ('⺁⺁', {'style': TONE2, 'errors': 'replace'}, [['2e812e81']]),
        ('⺁⺁', {'style': TONE2, 'errors': lambda x: ['a' for _ in x]},
         [['a'], ['a']]),
        ('⺁⺁', {'style': TONE2, 'errors': lambda x: [['a', 'b'], ['b', 'c']]},
         [['a'], ['b']]),
        ('⺁⺁', {'style': TONE2, 'heteronym': True,
                'errors': lambda x: [['a', 'b'], ['b', 'c']]},
         [['a', 'b'], ['b', 'c']]),
        # Chinese characters without pinyin
        ('鿅', {'style': TONE2}, [['\u9fc5']]),
        ('鿅', {'style': TONE2, 'errors': 'ignore'}, []),
        ('鿅', {'style': TONE2, 'errors': '233'}, []),
        ('鿅', {'style': TONE2, 'errors': 'replace'}, [['9fc5']]),
        ('鿅', {'style': TONE2, 'errors': lambda x: ['a']}, [['a']]),
        ('鿅', {'style': TONE2, 'errors': lambda x: None}, []),
        ('鿅鿅', {'style': TONE2, 'errors': lambda x: ['a' for _ in x]},
         [['a'], ['a']]),
        ('鿅鿅', {'style': TONE2, 'errors': lambda x: [['a', 'b']]},
         [['a'], ['a']]),
        ('鿅鿅', {'style': TONE2, 'heteronym': True,
                'errors': lambda x: [['a', 'b']]},
         [['a', 'b'], ['a', 'b']]),
    )
    for text, options, expected in cases:
        assert pinyin(text, **options) == expected
Example #15
0
def add_term(term, weight):
    """
    Insert term into the node tree and add weight to its final node.

    The term is split by term2words(); the longest already-stored prefix is
    reused and the remaining words are appended as new nodes. For a word that
    is a single character with code point 19904..40895, every toneless
    reading additionally gets pinyin nodes: the full reading, its first
    letter and, for readings starting with ch/sh/zh, that two-letter initial.
    """
    words, types = term2words(term)
    if len(words) == 0:
        # nothing usable in the term (e.g. '......')
        return

    # longest prefix of words that already exists
    level, node_id = max_prefix_match(words, types)

    # When the whole sequence exists only the weight is updated, which may
    # leave a parent heavier than the sum of its children.
    if level != len(words):
        for word in words[level:]:
            # regular node
            parent = node_id
            node_id = new_node(word, parent)
            if not (len(word) == 1 and 19904 <= ord(word) <= 40895):
                continue
            # pinyin nodes for every reading of the character
            readings = pypinyin.pinyin(word, style=pypinyin.NORMAL, heteronym=True)
            for py in readings[0]:
                push_pinyin_node(parent, node_id, py)
                push_pinyin_node(parent, node_id, py[0])
                if py[0] in ('c', 's', 'z') and py[1] == 'h':
                    push_pinyin_node(parent, node_id, py[:2])

    add_weight(node_id, weight)
Example #16
0
def addPinyin(sometext):
    """
    Return the pinyin of sometext passed through strB2Q().

    Only the first reading of each syllable is used. Readings are separated
    by single spaces and trailing whitespace is stripped before the result
    is handed to strB2Q().
    """
    mylist = pinyin(sometext, heteronym=True)
    # join() replaces the quadratic `+=` loop and the local that shadowed the
    # builtin `str`; rstrip() keeps the original trailing-whitespace handling.
    joined = u' '.join(pp[0] for pp in mylist)
    return strB2Q(joined.rstrip())
Example #17
0
 def get_university_by_province(self, province, need_pinyin=True):
     """
     List the universities stored for province.

     Returns [] for an unknown province. With need_pinyin the result is a
     list of {"university", "pinyin"} dicts in stored order; otherwise the
     plain names sorted by their TONE2 pinyin.
     """
     if province not in self._university_of_province:
         return []
     if not need_pinyin:
         return sorted(self._university_of_province[province],
                       key=lambda name: pinyin(to_unicode(name), style=TONE2))
     return [{"university": name, "pinyin": self.to_pinyin(name)}
             for name in self._university_of_province[province]]
Example #18
0
def get_pinyin(text):
    """
    Return the TONE3 pinyin of text as one space-separated string.

    Only the first reading of each syllable is used. An empty conversion
    yields ''.

    The previous loop tested `strs != None` on a value initialised to '',
    which is always true: its else branch never ran and every result began
    with a stray space. Joining the readings gives the separator-only form
    that branch was meant to produce.
    """
    return ' '.join(reading[0] for reading in pinyin(text, style=pypinyin.TONE3))
Example #19
0
def test_pinyin():
    """Default call, mixed input, each STYLE_* constant and heteronym mode."""
    hans = u'中心'
    assert pinyin(hans) == [[u'zh\u014dng'], [u'x\u012bn']]
    assert pinyin(hans + 'abc') == [[u'zh\u014dng'], [u'x\u012bn'], ['abc']]
    by_style = (
        (pypinyin.STYLE_NORMAL, [[u'zhong'], [u'xin']]),
        (pypinyin.STYLE_TONE, [[u'zh\u014dng'], [u'x\u012bn']]),
        (pypinyin.STYLE_TONE2, [[u'zho1ng'], [u'xi1n']]),
        (pypinyin.STYLE_INITIALS, [['zh'], ['x']]),
        (pypinyin.STYLE_FIRST_LETTER, [[u'z'], [u'x']]),
    )
    for style, expected in by_style:
        assert pinyin(hans, style) == expected
    with_heteronyms = [[u'zh\u014dng', u'zh\xf2ng'], [u'x\u012bn']]
    assert pinyin(hans, heteronym=True) == with_heteronyms
Example #20
0
def getFirstCase(str):
    """
    Classify the first character of str and return an index letter for it.

    Chinese: upper-cased first letter of its first pinyin reading.
    Alphabetic: the character upper-cased. Digit: whatever
    retutn_alphabet() maps it to. Anything else: the string "WARNING".
    """
    head = str[0]
    if is_chinese(head):
        first_reading = pinyin(head)[0][0]
        return first_reading[0].upper()
    if is_alphabet(head):
        return head.upper()
    if is_number(head):
        return retutn_alphabet(head)
    return "WARNING"
Example #21
0
def name2pinyin(name):
    """
    Convert a UTF-8 encoded name to the joined first items of its pinyin.

    The name is decoded, converted with pinyin(..., 4) (style 4 is presumably
    pypinyin's first-letter style; confirm against the installed version),
    and the first entry of every syllable is concatenated and passed through
    safestr().
    """
    decoded = name.decode('utf-8')
    joined = "".join([entry[0] for entry in pinyin(decoded, 4)])
    return safestr(joined)
Example #22
0
def test_seg_jieba():
    """pinyin() on jieba-segmented input and on mixed Chinese/Latin phrases."""
    import jieba
    hans = '音乐'
    hans_seg = list(jieba.cut(hans))
    assert pinyin(hans_seg, style=TONE2) == [['yi1n'], ['yue4']]
    # fixed phrases mixing Chinese and Latin letters
    mixed_cases = (
        ('黄山B股', [['hua2ng'], ['sha1n'], ['B'], ['gu3']]),
        ('A股', [['A'], ['gu3']]),
        ('阿Q', [['a1'], ['Q']]),
        ('B超', [['B'], ['cha1o']]),
        ('AB超C', [['A'], ['B'], ['cha1o'], ['C']]),
        ('AB阿C', [['AB'], ['a1'], ['C']]),
        ('维生素C', [['we2i'], ['she1ng'], ['su4'], ['C']]),
    )
    for text, expected in mixed_cases:
        assert pinyin(text, style=TONE2) == expected
Example #23
0
def main():
    """
    Print and return the pinyin of the first command-line argument.

    Usage:
        /anaconda3/bin/python han2pinyin.py '我们爱世界'
        python3 han2pinyin.py '与会'   # ok
    """
    hans = sys.argv[1]
    result = pinyin(hans)
    print(result)
    return result
Example #24
0
def test_zh_and_en():
    """Mixed Chinese and English input, with and without jieba installed."""
    hans = '中心'
    # The Latin tail stays one chunk with jieba, else it is split per letter.
    if has_module('jieba'):
        latin_tail = [['abc']]
    else:
        latin_tail = [['a'], ['b'], ['c']]
    assert pinyin(hans + 'abc') == [['zh\u014dng'], ['x\u012bn']] + latin_tail

    # fixed phrases mixing Chinese and Latin letters
    fixed_phrases = (
        ('黄山B股', [['hua2ng'], ['sha1n'], ['B'], ['gu3']]),
        ('A股', [['A'], ['gu3']]),
        ('阿Q', [['a1'], ['Q']]),
        ('B超', [['B'], ['cha1o']]),
        ('AB超C', [['A'], ['B'], ['cha1o'], ['C']]),
    )
    for text, expected in fixed_phrases:
        assert pinyin(text, style=TONE2) == expected

    if has_module('jieba'):
        expected_ab = [['AB'], ['a1'], ['C']]
    else:
        expected_ab = [['A'], ['B'], ['a1'], ['C']]
    assert pinyin('AB阿C', style=TONE2) == expected_ab
    assert pinyin('维生素C', style=TONE2) == [['we2i'], ['she1ng'], ['su4'], ['C']]
Example #25
0
def rename(filepath):
    """
    Prefix every matching file in filepath with a fixed name and a counter.

    A file matches when it is a regular file and '.jpg' occurs in its name at
    an index greater than 0. Matches are sorted with key=py (py is not
    defined in this block; presumably a module-level key function, TODO
    confirm) and renamed to '<fixed name>_<n>_<old name>', n counting from 1.
    """
    files = []
    for file in os.listdir(filepath):
        full_path = os.path.join(filepath, file)
        if os.path.isfile(full_path) == True and file.find('.jpg') > 0:
            print(file)
            files.append({'file': file,
                          'by': pypinyin.pinyin(file, style=pypinyin.NORMAL)})
    for i, f in enumerate(sorted(files, key=py), 1):
        print(f['by'])
        source = os.path.join(filepath, f['file'])
        target = os.path.join(filepath, '%s_%d_%s' % (u'赵妮', i, f['file']))
        os.rename(source, target)
Example #26
0
def get_homophones_by_pinyin(input_pinyin):
    """
    Find the characters whose reading equals the given pinyin.

    :param input_pinyin: reading in TONE2 notation, e.g. zho1ng
    :return: list of characters in U+4E00..U+9FA5 whose first TONE2 reading
             equals input_pinyin
    """
    # The basic CJK Unified Ideographs range is 0x4E00-0x9FA5, i.e. the
    # commonly cited 20902 characters.
    candidates = (chr(code) for code in range(0x4e00, 0x9fa6))
    return [
        char for char in candidates
        if pinyin([char], style=pypinyin.TONE2)[0][0] == input_pinyin
    ]
Example #27
0
def ranking_function(output_prob_tree, cx, cy):
    """
    Score the candidate cy against cx as the product of three sub-scores.

    The sub-scores are a tone-class score, a part-of-speech score and an
    output-probability score looked up in output_prob_tree[x][y] for the
    characters paired position by position. Each sub-score gets a small
    constant added before the product is taken.

    NOTE(review): the code indexes the results of map() and zip() and uses
    reduce() as a builtin, so it looks Python 2 only; confirm before porting.
    """
    # Tone class (level vs. oblique): -1 when the first digit found in the
    # TONE2 reading is <= 2, otherwise 1.
    # NOTE(review): re.search() returns None for a reading without a digit,
    # which would make .group(0) fail; confirm such readings cannot occur.
    x_py = pypinyin.pinyin(cx, style=pypinyin.TONE2)
    y_py = pypinyin.pinyin(cy, style=pypinyin.TONE2)
    x_pz = map(lambda i: -1 if int(re.search('\d', i[0]).group(0)) <= 2 else 1, x_py)
    y_pz = map(lambda i: -1 if int(re.search('\d', i[0]).group(0)) <= 2 else 1, y_py)
    # Share of positions whose two classes cancel out (sum to zero)
    pingze_score = sum(map(lambda i, j: i + j == 0, x_pz, y_pz)) / float(len(cx)) + 0.001

    def sigmoid(x):
        return 1 / (1 + math.e ** (-x))

    def pos_eq(x_pos, y_pos):
        # Equal, or one contained in the other
        return x_pos == y_pos or x_pos in y_pos or y_pos in x_pos

    import operator
    smooth_value = 0.001
    freq_amp = 10 ** math.sqrt(len(cx))

    # Part of speech, taken per character from the first pseg.lcut() result
    # (presumably the first letter of the tag; TODO confirm)
    cx_pos = map(lambda x: zip(*pseg.lcut(x)[0])[0][1], cx)
    cy_pos = map(lambda y: zip(*pseg.lcut(y)[0])[0][1], cy)
    # Each matching position contributes 1/len(cx)
    pos_score = reduce(operator.add, map(lambda x, y: float(1)/len(cx) if pos_eq(x, y) else 0, cx_pos, cy_pos))
    pos_score += smooth_value

    # Output probability: product of the amplified pair probabilities,
    # squashed by the sigmoid
    out_score = reduce(operator.mul, map(lambda x, y: output_prob_tree[x][y] * freq_amp, cx, cy))
    out_score = sigmoid(out_score)
    out_score += smooth_value

    # Combine
    score = pingze_score * out_score * pos_score
    # score = pingze_score * pos_score

    # print 'ranking', cy
    # print 'pingze', pingze_score
    # print 'pos', pos_score
    # print 'freq', out_score

    return score
Example #28
0
 def get_college_by_university(self, university, need_pinyin=True):
     """
     List the colleges stored for university, sorted by TONE2 pinyin.

     Returns [] when the utf8() form of university is not in
     self._universities. With need_pinyin the result is a list of
     {"college", "pinyin"} dicts, otherwise the stored names themselves.
     """
     university = utf8(university)
     if university not in self._universities:
         return []
     if not need_pinyin:
         return sorted(list(self._college_of_university[university]),
                       key=lambda name: pinyin(to_unicode(name), style=TONE2))
     colleges = []
     for raw_name in self._college_of_university[university]:
         name = to_unicode(raw_name)
         colleges.append({"college": name, "pinyin":  self.to_pinyin(name)})
     return sorted(colleges, key=lambda entry: pinyin(entry["college"], style=TONE2))
Example #29
0
def test_pinyin_initials():
    """A word whose syllables have both an initial and a final."""
    hans = '中心'
    # default style, with tone marks
    assert pinyin(hans) == [['zh\u014dng'], ['x\u012bn']]

    # styles passed positionally
    positional_styles = (
        # plain style, no tones
        (NORMAL, [['zhong'], ['xin']]),
        # tone mark on the first letter of the final
        (TONE, [['zh\u014dng'], ['x\u012bn']]),
        # tone written as a digit [0-4]
        (TONE2, [['zho1ng'], ['xi1n']]),
        # initials only
        (INITIALS, [['zh'], ['x']]),
        # first letter only
        (FIRST_LETTER, [['z'], ['x']]),
    )
    for style, expected in positional_styles:
        assert pinyin(hans, style) == expected

    # heteronym mode
    assert pinyin(hans, heteronym=True) == [['zh\u014dng', 'zh\xf2ng'],
                                            ['x\u012bn']]

    # styles passed by keyword
    keyword_styles = (
        # finals only, no tones
        (FINALS, [['ong'], ['in']]),
        # finals with the tone mark on their first letter
        (FINALS_TONE, [['\u014dng'], ['\u012bn']]),
        # finals with the tone written as a digit [0-4]
        (FINALS_TONE2, [['o1ng'], ['i1n']]),
    )
    for style, expected in keyword_styles:
        assert pinyin(hans, style=style) == expected
def get_word_pinyin_py(word):
    ''' 将输入的中文处理成拼音和拼音首字母 '''

    word_pinyin = ''
    word_py     = ''

    try:
        word_pinyin_list = lazy_pinyin(word, errors='ignore')
        for w in word_pinyin_list: word_pinyin += str(w)
        #print 'word_pinyin: ', word_pinyin

        word_py_list_out = pinyin(word, style=pypinyin.FIRST_LETTER)
        for i in word_py_list_out: word_py += str(i[0])
        #print 'word_py: ', word_py
    except Exception, ex:
        logger.exception(ex)
                    prov = 20
                    cityname = '广州'
                meta = {
                    "brand": rows["brandid"],
                    "series": rows["familyid"],
                    "model": rows["salesdescid"],
                    "registerDate": registerDate,
                    "city": city,
                    "prov": prov,
                    "mile": mile
                }
                s = f"brand={meta['brand']}&city={meta['city']}&mileAge={mile}&model={meta['model']}&prov={meta['prov']}&registerDate={registerDate}&series={meta['series']}njB6TTeQvTnGN4To"
                md = get_md5_value(s)

                # cityname = cityname_dic[city]
                a = pypinyin.pinyin(cityname, style=pypinyin.FIRST_LETTER)
                c_pinyin = ''.join([str(a[i][0]) for i in range(len(a))])
                # print(c_pinyin)
                url = start_url.format(meta["prov"], meta["city"], rows["brandid"], rows["familyid"],
                                       rows["salesdescid"], registerDate, mile, partnerId[0], md, c_pinyin)
                url_list.append(url)
                data = {"url": url}
                data_list.append(data)
                print(url)
            else:
                city_dic = dict(zip(city_list, prov_list))
                cityname_dic = dict(zip(city_list, cityname_list))
                for city, prov in city_dic.items():
                    meta = {
                        "brand": rows["brandid"],
                        "series": rows["familyid"],
Example #32
0
import pypinyin as py
# Demo: convert one word with pinyin() and with lazy_pinyin() and print both.
word = '朝阳'

A = py.pinyin(word, heteronym=True) # with tone marks; heteronym mode enabled
B = py.lazy_pinyin(word)	# without tone marks

print(A)
print(B)
Example #33
0
# !/usr/bin/env python
# -*- coding: UTF-8 -*-

"""
为什么没有 y, w, yu 几个声母?

声母风格(INITIALS)下,“雨”、“我”、“圆”等汉字返回空字符串,因为根据 《汉语拼音方案》 , y,w,ü (yu) 都不是声母,在某些特定韵母无声母时,才加上 y 或 w,而 ü 也有其特定规则。 —— @hotoo

如果你觉得这个给你带来了麻烦,那么也请小心一些无声母的汉字(如“啊”、“饿”、“按”、“昂”等)。 这时候你也许需要的是首字母风格(FIRST_LETTER)。 —— @hotoo
"""

from pypinyin import pinyin, lazy_pinyin, Style

# Demo: print the result of several pinyin()/lazy_pinyin() calls in turn.
pinyinlist = pinyin("西藏", style=Style.NORMAL)
print(pinyinlist)
pinyinlist = pinyin("西藏")
print(pinyinlist)
pinyinlist = pinyin('中心', heteronym=True)  # enable heteronym mode
print(pinyinlist)
pinyinlist = pinyin('银行', heteronym=True)  # enable heteronym mode, has no effect here!!
print(pinyinlist)
pinyinlist = pinyin('武汉', style=Style.FIRST_LETTER)  # choose the pinyin style
print(pinyinlist)
pinyinlist = pinyin("差错")
print(pinyinlist)
pinyinlist = lazy_pinyin('差错')  # heteronyms are not considered
print(pinyinlist)
pinyinlist = lazy_pinyin('你好☆☆', errors='ignore')   # characters without pinyin are handled according to the errors argument
print(pinyinlist)
pinyinlist = lazy_pinyin('你好☆☆')    # no special handling, returned unchanged
print(pinyinlist)
Example #34
0
# Build parallel lists from a tab-separated file of (input, target) pairs.
# NOTE(review): data_path, input_texts and input_reverse are used below but
# not defined in this part of the script; presumably set up earlier.
target_texts = []
target_processed = []
pairs = []
with open(data_path, 'r', encoding='utf-8') as f:
    for line in f.readlines():
        # Unpacking requires exactly one tab per line
        input_text, target_text = line.split('\t')

        # Drop parenthesised parts and any "subsp. <word>" suffix
        input_text = re.sub(r'\([^)]*\)', '', input_text.strip())
        input_text = re.sub(r'subsp\. (\w)+', '', input_text)

        target_text = html.unescape(target_text)
        target_text = re.sub(r'\([^)]*\)', '', target_text.strip())

        # Keep targets longer than one character that contain no ASCII letters
        if len(target_text) > 1 and not re.findall(r'[A-Za-z]', target_text):
            # All readings returned by pinyin(), flattened and space-joined
            target_text_py = ' '.join(
                [item for sublist in pinyin(target_text) for item in sublist])
            target_text_processed = ' '.join(jieba.cut(target_text, HMM=False))
            # Skip (input, pinyin) pairs that were already recorded
            if [input_text, target_text_py] not in pairs:
                pairs.append([input_text, target_text_py])
                input_texts.append(input_text)
                target_texts.append(target_text)
                # Also store the input with its space-separated words reversed
                if ' ' in input_text:
                    names = input_text.split(' ')
                    input_reverse.append(' '.join(names[::-1]).strip())
                else:
                    input_reverse.append(input_text)
                target_processed.append(target_text_processed)

# Write the collected inputs, one per line
with open('input.txt', 'w') as i:
    i.write('\n'.join(input_texts))
Example #35
0
def handle(c):
    """Return the readings (TONE3, heteronym mode, strict=False) of c's first entry."""
    readings = pinyin(c, heteronym=True, style=TONE3, strict=False)
    first_entry = readings[0]
    return first_entry
Example #36
0
        print Assio

        authorFirst = author[0]

        pinyin_list = authorFirst.split(' ')
        pinyinAll = ""
        for ph in pinyin_list:
            pinyinAll += ph
        V = viterbi(pinyin_list)

        author_prob = {}

        for phrase, prob in V.iteritems():
            pinyinFound = ""
            namePinyin = pinyin(phrase, style=NORMAL)
            for n in namePinyin:
                for t in n:
                    pinyinFound += t.encode("utf8")
            if cmp(pinyinFound[0:len(pinyinAll)], pinyinAll) != 0:
                continue
            author_prob[phrase] = prob

        nameSet = {}
        author_prob = sorted(author_prob.items(),
                             key=lambda item: item[1],
                             reverse=True)
        for phrase, prob in author_prob:
            result = wordDis.comparePerson(phrase.encode("utf8"), keyWord)
            if result > (0.8 / len(keyWord)):
                print "找到了标签相同的名字: "
Example #37
0
def init_data():
    """
    Prepare the .npy data files used for training.

    Reads ../train_data/toutiao_cat_data.txt, extracts runs of CJK
    characters, converts every run to pinyin and stores several tables under
    ../data/ with np.save: per-character counts, a character-to-index map,
    a character bigram count matrix, a pinyin-to-characters map and a
    per-character pinyin frequency table.
    """
    # toutiao_data
    # NOTE(review): the handle is only closed at the very end of the function,
    # so it stays open during all processing and is not closed on an exception.
    phrase = open('../train_data/toutiao_cat_data.txt', 'r',encoding="utf-8")
    ans = []
    for line in phrase.readlines():
        ls = line.split("_!_")
        # Clean the data: only the fields from index 3 onwards are scanned
        ls = ls[3:]
        for item in ls:
            string = ""
            i = 0
            while(i<len(item)):
                # Is this single character inside U+4E00..U+9FA5?
                res = re.match(r'[\u4E00-\u9FA5]', item[i])
                if(res == None):
                    # Any other character ends the current run
                    if(string != ""):
                        ans.append(string)
                    string = ""
                else:
                    string += item[i]
                i+=1
            if(string != ""):
                ans.append(string)
    print(ans[0:100])
    
    # Pinyin for every extracted run (first reading per character, space separated)
    # NOTE(review): runs skipped below (None result or empty string) are not
    # removed from ans, while later loops pair ans[i] with ans_pinyin[i].
    ans_pinyin = []
    for item in ans:
        pinyin_ans = pinyin(u'{0}'.format(item), style=pypinyin.NORMAL)
        if(pinyin_ans == None):
            continue
        string = ""
        # The inner loop reuses the name `item` of the outer loop
        for item in pinyin_ans:
            string += item[0]+" "
        string = string[0:-1]
        if(string != ""):
            ans_pinyin.append(string)
    # ['bao li ji tuan', 'ma wei du', 'zhong guo ke xue ji zhu guan',...
    print(ans_pinyin[1:100]) 
    
    # Unigram statistics: [character] [count]
    # NOTE(review): a character's first occurrence stores 0 and only later
    # occurrences increment, so each count is occurrences - 1; confirm intent.
    hanzi_ls = []
    hanzi_count_ls = []
    for item in ans:
        for chr in item:
            if chr not in hanzi_ls:
                hanzi_ls.append(chr)
                hanzi_count_ls.append(0)
            else: 
                hanzi_count_ls[hanzi_ls.index(chr)] += 1
    np.save("../data/my_hanzi_num", hanzi_count_ls, allow_pickle=True, fix_imports=True)
    total_hanzi_num = len(hanzi_ls)
    print(total_hanzi_num)
    
    # dict mapping character -> integer code (its position in hanzi_ls)
    hanzi_dict = {}
    encode_num = 0
    for item in hanzi_ls:
        hanzi_dict[item] = encode_num
        encode_num += 1
    np.save("../data/my_hanzi_dict", hanzi_dict, allow_pickle=True, fix_imports=True)
    
    # Bigram training:
    # number of times code2 directly follows code1 inside a run
    hanzi_matrix = np.zeros([total_hanzi_num, total_hanzi_num])
    for item in ans:
        for i in range(1,len(item)):
            chr1 = item[i-1]
            chr2 = item[i]
            code1 = hanzi_dict[chr1]
            code2 = hanzi_dict[chr2]
            hanzi_matrix[code1][code2] += 1
    np.save("../data/my_moving_array", hanzi_matrix, allow_pickle=True, fix_imports=True)
    
    # Pinyin -> characters lookup table
    py2hanzi = {}
    for i in range(len(ans)):
        pinyin_ls = ans_pinyin[i].split() # pinyin sequence
        for pinyin_item in pinyin_ls: # initialise every key
            py2hanzi[pinyin_item] = ""
    for i in range(len(ans)):
        str = ans[i]# run of characters
        pinyin_ls = ans_pinyin[i].split() # pinyin sequence
        for j in range(len(str)):
            chr = str[j]
            pinyin_item = pinyin_ls[j]
            if( chr not in py2hanzi[pinyin_item]):
                py2hanzi[pinyin_item] += chr
    np.save("../data/py2hanzi.npy", py2hanzi, allow_pickle=True, fix_imports=True)  
    print(py2hanzi.keys())  
    
    # Character -> pinyin frequencies, eg:{'了':{'le':5, 'liao':10},'屈':{'qu':5}}
    hanzi2pin_dict = {} # {'了':{'le':5, 'liao':10},'屈':{'qu':5}}
    # NOTE(review): this binds the same list object as hanzi_ls, so the
    # `+=` below extends hanzi_ls in place as well.
    hanzi_str = hanzi_ls  # characters that already have pinyin
    py_data_ls = [] # pinyin seen so far
    # Build the two-level dict structure
    for k,v in py2hanzi.items():
        py_data_ls.append(k)
        if "ü" in k:
            print(k)
        hanzi_str += v
        for chr in v:
            hanzi2pin_dict[chr] = {} # one pinyin frequency dict per character
    for k,v in py2hanzi.items():
        for chr in v:
            hanzi2pin_dict[chr][k] = 0 # start every pinyin frequency at 0

    # phrase = open('../train_data/emission_train.txt', 'r')
    for i in range(len(ans)):
        str = ans[i]# run of characters
        pinyin_ls = ans_pinyin[i].split() # pinyin sequence
        # NOTE(review): labelled "remove the tone", but this drops the last
        # character of every entry, and ans_pinyin was built above with
        # style=pypinyin.NORMAL (presumably toneless); confirm this does not
        # cut off a real letter.
        for i in range(len(pinyin_ls)): # remove the tone
            pinyin_ls[i] = pinyin_ls[i][0:-1]
        if(len(str) != len(pinyin_ls)): # pinyin and character counts differ
            continue
        for i in range(len(str)):
            if(str[i] not in hanzi_str): # character unknown so far: add it to the dict
                hanzi2pin_dict[ str[i] ] = {}
                hanzi2pin_dict[ str[i] ][ pinyin_ls[i] ] = 0
            py_data_ls = hanzi2pin_dict[ str[i] ].keys() # all pinyin recorded for this character 
            if(pinyin_ls[i] not in py_data_ls):
                hanzi2pin_dict[ str[i] ][ pinyin_ls[i] ] = 0
            hanzi2pin_dict[str[i]][pinyin_ls[i]] += 1
    phrase.close()
    np.save("../data/my_emission_dic", hanzi2pin_dict, allow_pickle=True, fix_imports=True)
Example #38
0
                    conn.commit()
                except Exception:
                    conn.rollback()
                    print('已存在', j['title'])
        conn.close()

    def run(self):
        """Fetch every radio id from get_title(), load it and store the result."""
        for radio_id in self.get_title():
            payload = self.get_radio(radio_id)
            self.saveFmysql(data=payload, radio_id=radio_id)


if __name__ == '__main__':
    # Demo: print the TONE-style pinyin of a tongue twister, each syllable
    # preceded by a space. (Earlier commented-out experiments were dropped.)
    from pypinyin import pinyin, TONE

    pinyinlist = pinyin("四是四十是十", style=TONE)
    text = ''.join(" " + syllable[0] for syllable in pinyinlist)
    print(text)
Example #39
0
def __to_pinyin__(str=''):
    """Convert str to TONE3 pinyin (neutral tone written as 5) and slug it without separator."""
    return slug(
        pinyin(str, style=Style.TONE3, neutral_tone_with_five=True),
        separator='',
    )
Example #40
0
def test_mmseg_for_pinyin(input, default_ret, mmseg_ret):
    """pinyin() gives mmseg_ret for the raw input and for its mmseg segmentation."""
    direct = pinyin(input)
    assert direct == mmseg_ret
    segmented = pinyin(mmseg.seg.cut(input))
    assert segmented == mmseg_ret
Example #41
0
import torch.nn as nn
import torch
import torch.nn.functional as F
import heapq
import numpy
from pypinyin import pinyin, Style

# Demo: load a masked-LM checkpoint, feed it the pinyin of one sentence and
# print the highest-scoring output tokens for every position.
config = BertConfig()
config.vocab_size = 41460  # sentence vocabulary
model = AutoModelForMaskedLM.from_config(config)
# Replace the input embedding table (1839 entries, 768 dims, index 0 = padding)
model.bert.embeddings.word_embeddings = nn.Embedding(1839, 768, padding_idx=0)
state_dict = torch.load('./results/checkpoint-00000/pytorch_model.bin',
                        map_location="cpu")
model.load_state_dict(state_dict)
model.eval()
# Flatten pinyin()'s nested result into one list of TONE3 syllables
pinyin_list = [
    i
    for tmp in pinyin('手机没电了', style=Style.TONE3, neutral_tone_with_five=True)
    for i in tmp
]
con_tokenizer = BertTokenizer.from_pretrained('y2d1')
lab_tokenizer = BertTokenizer.from_pretrained('z2d')
# Token ids of the syllables with a leading dimension of size 1 added
con = torch.tensor(
    con_tokenizer.convert_tokens_to_ids(pinyin_list)).unsqueeze(0)
# NOTE(review): the name says top 5 but k=10 entries are taken
out_top5 = torch.topk(F.softmax(model(con)[0].squeeze(0), dim=-1), k=10)
values = out_top5[0].detach().numpy().tolist()
indices = out_top5[1].detach().numpy().tolist()
# Print the candidate tokens and their scores, one entry of indices at a time
for i, item in enumerate(indices):
    print(lab_tokenizer.convert_ids_to_tokens(item))
    print(values[i])
Example #42
0
# -*- encoding: UTF-8 -*-
from pypinyin import pinyin, lazy_pinyin
import MyEsTools as es


class Main:
    """Empty placeholder class; the constructor does nothing."""
    def __init__(self):
        pass


if __name__ == '__main__':
    # Build one bulk action for MyEsTools (the bulk call itself is commented
    # out), then print a few pinyin conversions of UTF-8 decoded strings.
    tool = es.MyEsTools('10.116.27.131', 'test', 'cn')
    ACTIONS = []
    source = {"data": "test"}
    action = {
        "_index": tool.index,
        "_type": tool.type,
        "_source": source,
        "_id": "车质网_12345"
    }
    ACTIONS.append(action)
    #     tool.bulk_data(ACTIONS)
    #     print pinyin(unicode("车质网","UTF-8"))
    # errors='ignore' is passed in the first two calls but not in the last
    print pinyin("车质网".decode("utf-8"), errors='ignore')
    print ''.join(lazy_pinyin("太平洋汽车网".decode("utf-8"), errors='ignore'))
    print ''.join(lazy_pinyin("车质网%".decode("utf-8")))
Example #43
0
def pypinyin_g2p(text) -> List[str]:
    """Return the first TONE3 reading of every entry pinyin() yields for text."""
    from pypinyin import pinyin
    from pypinyin import Style

    return [entry[0] for entry in pinyin(text, style=Style.TONE3)]
def SPY(ch_str_1, ch_str_2):
    """Compare the toneless pinyin of the two strings with ch_similarity_sub()."""
    return ch_similarity_sub(
        pinyin(ch_str_1, style=Style.NORMAL),
        pinyin(ch_str_2, style=Style.NORMAL),
    )
Example #45
0
def pro(text):
    """Return the first BOPOMOFO reading of the first entry of text."""
    return pinyin(text, style=Style.BOPOMOFO)[0][0]
Example #46
0
def char2bpmf(char):
    """Return the first BOPOMOFO reading of the first entry of char."""
    from pypinyin import pinyin, Style
    readings = pinyin(char, style=Style.BOPOMOFO)
    first_entry = readings[0]
    return first_entry[0]
Example #47
0
    def extract(self, char_seq):
        """Return the first reading of every entry pinyin() yields with self.params."""
        return [entry[0] for entry in pinyin(char_seq, **self.params)]
Example #48
0
def pinyin(text):
    """Return the TONE3 pinyin of text as one unseparated string (first readings only)."""
    syllables = pypinyin.pinyin(text, style=pypinyin.Style.TONE3)
    first_readings = [syllable[0] for syllable in syllables]
    return ''.join(first_readings)
Example #49
0
def p(input):
    """Return the TONE3-style pinyin of ``input`` as a single string.

    Every reading is followed by one space, so a non-empty result ends with a
    trailing space; an empty conversion gives ``""``.
    """
    readings = pinyin(input, style=Style.TONE3)
    return "".join(candidates[0] + " " for candidates in readings)
Example #50
0
def get_pinyin_first_litter(hanzi):
    """Return the FIRST_LETTER-style pinyin of ``hanzi`` joined into one string.

    The first candidate of each unit is taken and the pieces are concatenated
    with no separator.
    """
    first_letters = pypinyin.pinyin(hanzi, style=pypinyin.FIRST_LETTER)
    return ''.join(candidates[0] for candidates in first_letters)
Example #51
0
 def renew_pinyin(self, request, queryset):
     """Recompute and save the ``pinyin`` attribute of every item in ``queryset``.

     For each item, ``Sname`` is converted to NORMAL-style pinyin and the first
     character of each unit's first reading is concatenated into ``pinyin``.
     Returns whatever ``self.message_user`` returns for the confirmation
     message.
     """
     for student in queryset:
         readings = pypinyin.pinyin(student.Sname, style=pypinyin.NORMAL)
         initials = [reading[0][0] for reading in readings]
         student.pinyin = ''.join(initials)
         student.save()
     return self.message_user(request=request, message='修改学生拼音成功!')
def STM(ch_str_1, ch_str_2):
    """Compare two strings through ``get_shengdiao`` of their TONE3 pinyin.

    Each argument is converted with ``pinyin(..., style=Style.TONE3)`` and
    passed through ``get_shengdiao``; the two results go to
    ``ch_similarity_sub``, whose return value is returned unchanged.
    """
    first, second = [
        get_shengdiao(pinyin(text, style=Style.TONE3))
        for text in (ch_str_1, ch_str_2)
    ]
    return ch_similarity_sub(first, second)
Example #53
0
def getStrFirstAplha(str):
    """Return the upper-cased FIRST_LETTER pinyin of the first unit of ``str``.

    Indexing ``[0][0]`` raises ``IndexError`` when the conversion is empty.
    """
    first_letters = pinyin(str, style=Style.FIRST_LETTER)
    leading = first_letters[0][0]
    return leading.upper()
Example #54
0
        default=None,
        help="path for the English transcription text file",
    )
    args = parser.parse_args()

    # clean every line in transcription file first
    transcription_dict = {}
    with codecs.open(args.transcription_path, "r", "utf-8") as fid:
        for line in fid.readlines():
            segments = line.split(" ")
            lang_char = args.transcription_path.split("/")[-1][0]
            id = args.spk + "_" + lang_char + segments[0]  # ex. TMF1_M10001
            content = segments[1].replace("\n", "")

            # Some special rules to match CSMSC pinyin
            text = pinyin(content, style=Style.TONE3)
            text = [c[0] for c in text]
            clean_content = []
            for c in text:
                c_init = get_initials(c, strict=True)
                c_final = get_finals(c, strict=True)
                for c in [c_init, c_final]:
                    if len(c) == 0:
                        continue
                    c = c.replace("ü", "v")
                    c = c.replace("ui", "uei")
                    c = c.replace("un", "uen")
                    c = c.replace("iu", "iou")

                    # Special rule: "e5n" -> "en5"
                    if "5" in c:
def SSM(ch_str_1, ch_str_2):
    """Compare two strings through their INITIALS-style pinyin.

    Each argument is converted with ``pinyin(..., style=Style.INITIALS)`` and
    the two conversions are handed to ``ch_similarity_sub``; its result is
    returned unchanged.
    """
    first, second = [
        pinyin(text, style=Style.INITIALS) for text in (ch_str_1, ch_str_2)
    ]
    return ch_similarity_sub(first, second)
Example #56
0
def all_company():
    """
    Store the basic information of all listed companies.

    Fetches a table from ``gsd.get_all_company()`` and inserts one row per
    entry into ``stock_basics`` through the module-level ``conn``, issuing a
    commit after each successful insert. Progress (or "Error") is printed
    for every row.

    :return: None
    """

    # TODO: create the table via an SQL script or from code
    cursor = conn.cursor()
    df = gsd.get_all_company()

    # Pull every column out into a plain list so rows can be read by position.
    stockCode = list(df.index)  # stock code
    stockName = list(df['name'])  # stock name
    stockIndustry = list(df['industry'])  # industry
    stockArea = list(df['area'])  # region
    stockPe = list(df['pe'])  # price-to-earnings ratio
    stockOutstanding = list(df['outstanding'])  # outstanding shares (100 million)
    stockTotals = list(df['totals'])  # total shares (100 million)
    stockTotalAssets = list(df['totalAssets'])  # total assets (10 thousand)
    stockLiquidAssets = list(df['liquidAssets'])  # current assets
    stockFixedAssets = list(df['fixedAssets'])  # fixed assets
    stockReserved = list(df['reserved'])  # reserves
    stockReservedPerShare = list(df['reservedPerShare'])  # reserves per share
    stockEsp = list(df['esp'])  # earnings per share
    stockBvps = list(df['bvps'])  # net assets per share
    stockPb = list(df['pb'])  # price-to-book ratio
    stockTimeToMarket = list(df['timeToMarket'])  # listing date
    stockUndp = list(df['undp'])  # undistributed profit
    stockPerundp = list(df['perundp'])  # undistributed profit per share
    stockRev = list(df['rev'])  # revenue, year-on-year (%)
    stockProfit = list(df['profit'])  # profit, year-on-year (%)
    stockGpr = list(df['gpr'])  # gross profit margin (%)
    stockNpr = list(df['npr'])  # net profit margin (%)
    stockHolders = list(df['holders'])  # number of shareholders

    dfLen = len(df)

    # print(time.strptime(stockTimeToMarket[1], "%Y%m%d"))

    for i in range(0, dfLen):
        # Convert the i-th row: text fields to str, numeric fields to floats
        # rounded to 4 decimal places. These conversions run outside the
        # try block below, so a bad value here aborts the whole function.
        stockCodeDB = str(stockCode[i])
        stockNameDB = str(stockName[i])
        stockIndustryDB = str(stockIndustry[i])
        stockAreaDB = str(stockArea[i])
        stockPeDB = round(float(stockPe[i]), 4)
        stockOutstandingDB = round(float(stockOutstanding[i]), 4)
        stockTotalsDB = round(float(stockTotals[i]), 4)
        stockTotalAssetsDB = round(float(stockTotalAssets[i]), 4)
        stockLiquidAssetsDB = round(float(stockLiquidAssets[i]), 4)
        stockFixedAssetsDB = round(float(stockFixedAssets[i]), 4)
        stockReservedDB = round(float(stockReserved[i]), 4)
        stockReservedPerShareDB = round(float(stockReservedPerShare[i]), 4)
        stockEspDB = round(float(stockEsp[i]), 4)
        stockBvpsDB = round(float(stockBvps[i]), 4)
        stockPbDB = round(float(stockPb[i]), 4)
        # Re-slice the listing date into 'yyyy-mm-dd' form for to_date()
        # (presumably the source value is a yyyymmdd number/string -- confirm).
        timeToMarketDB = str(stockTimeToMarket[i])[0:4] + '-' + str(
            stockTimeToMarket[i])[4:6] + '-' + str(stockTimeToMarket[i])[6:8]
        stockUndpDB = round(float(stockUndp[i]), 4)
        stockPerundpDB = round(float(stockPerundp[i]), 4)
        stockRevDB = round(float(stockRev[i]), 4)
        stockProfitDB = round(float(stockProfit[i]), 4)
        stockGprDB = round(float(stockGpr[i]), 4)
        stockNprDB = round(float(stockNpr[i]), 4)
        stockHoldersDB = round(float(stockHolders[i]), 4)

        # Table name = upper-cased first-letter pinyin of the stock name
        # followed by the stock code. The nested pinyin list is turned into
        # its str() form and the list punctuation, quotes, spaces and '*' are
        # stripped out of that text.
        a = str(pinyin(stockNameDB, style=pypinyin.FIRST_LETTER))
        stockTableNameDB = "".join(a).replace('[', '').replace(']', '').replace("'", '').replace(',', ''). \
                               replace(' ', '').replace('*', '').upper() + stockCodeDB
        # print(stockTableNameDB)

        # print(stockTimeToMarket[i])
        # print(timeToMarketDB)
        #
        # NOTE(review): the statement is assembled with % string formatting,
        # so a quote character inside any text field would break (or alter)
        # the SQL. Consider the driver's parameter binding once its
        # paramstyle is confirmed.
        # NOTE(review): any failure of the insert/commit is reduced to a bare
        # "Error" print with no row index or exception detail.
        try:
            cursor.execute(
                "insert into stock_basics(code, name, industry, area, pe, outstanding, "
                "totals, totalAssets, liquidAssets, fixedAssets, reserved, "
                "reservedPerShare, esp, bvps, pb, timeToMarket, undp, "
                "perundp, rev, profit, gpr, npr, holders, tablename)"
                "values('%s', '%s', '%s', '%s', '%f', '%f', "
                "'%f', '%f', '%f', '%f', '%f', "
                "'%f', '%f', '%f', '%f', to_date('%s', 'yyyy-MM-dd'), '%f', "
                "'%f', '%f', '%f', '%f', '%f', '%f', '%s')" %
                (stockCodeDB, stockNameDB, stockIndustryDB, stockAreaDB,
                 stockPeDB, stockOutstandingDB, stockTotalsDB,
                 stockTotalAssetsDB, stockLiquidAssetsDB, stockFixedAssetsDB,
                 stockReservedDB, stockReservedPerShareDB, stockEspDB,
                 stockBvpsDB, stockPbDB, timeToMarketDB, stockUndpDB,
                 stockPerundpDB, stockRevDB, stockProfitDB, stockGprDB,
                 stockNprDB, stockHoldersDB, stockTableNameDB))
            # Commit per row so earlier rows survive a later failure.
            cursor.execute("commit")
            print("已存入  ", i)
        except Exception:
            print("Error")
Example #57
0
def get_foot(line):
    """Return the rhyme foot of a verse line.

    The line is converted with ``pinyin(..., style=9, errors='ignore')``; the
    first reading of the last unit is taken and its final character removed.
    """
    # style=9 is presumably pypinyin's FINALS_TONE3 (finals with a trailing
    # tone digit), which would make the dropped character the tone number
    # -- confirm against the pypinyin Style table.
    readings = pinyin(line, style=9, errors='ignore')
    last_reading = readings[-1][0]
    return last_reading[:-1]
def SYM(ch_str_1, ch_str_2):
    """Compare two strings through their FINALS-style pinyin.

    Each argument is converted with ``pinyin(..., style=Style.FINALS)`` and
    the two conversions are handed to ``ch_similarity_sub``; its result is
    returned unchanged.
    """
    first, second = [
        pinyin(text, style=Style.FINALS) for text in (ch_str_1, ch_str_2)
    ]
    return ch_similarity_sub(first, second)
Example #59
0
def test_mmseg_and_jieba_for_pinyin(input, jieba_ret, mmseg_ret):
    """Check that ``pinyin`` gives ``mmseg_ret`` for the raw input and for the
    same input pre-segmented by ``mmseg.seg.cut``.

    ``jieba_ret`` is accepted but not used by this test.
    """
    direct = pinyin(input)
    assert direct == mmseg_ret
    segmented = mmseg.seg.cut(input)
    assert pinyin(segmented) == mmseg_ret
 def text_to_vocab_func(txt):
     """Return the first default-style pinyin reading of each unit in ``txt``."""
     return [candidates[0] for candidates in pypinyin.pinyin(txt)]