Ejemplo n.º 1
0
def extract_zero_consonant(filename):
    f = open(filename, 'r')
    lines = f.readlines()
    f.close()
    lines = filter_punct(lines)
    er_text = "儿".decode('utf-8')
    for line in lines:
        line = line.decode('utf-8').strip()
        index = line.find(er_text)
        if index != -1:
            #print "ecape er.... ", line
            continue
        han_pinyin = pinyin.get(line, ' ')
        #if is_zero_consonant(han_pinyin):
        #    print line
        pinyins = han_pinyin.split(" ")
        count = 0
        flag = 1
        #print han_pinyin
        for py_str in pinyins:
            if is_zero_consonant(py_str):
                count = count + 1
            #if flag == 1:
            #    flag = 0
            #    if is_modal(py_str):
            #        count = count + 1
            #if py_str == ',':
            #    #print "biao dian........"
            #    flag = 1

        if count > 1 and len(line) < 40:
            #print line, count
            print line
Ejemplo n.º 2
0
def change2pinyin(from_file, to_file, punt_dict):
    print from_file, to_file
    ffrom = open(from_file, 'r')
    words = ffrom.readline()
    print words
    #words = words.decode("utf-8")
    #这里转化为utf8之所以可转可不转是因为在获取拼音的代码内部有做转化utf8
    #jieba内部也有对字符串进行转换的代码,所以可以直接输入utf8. 但是实际上在处理的时候都是unicode
    han_pinyin = pinyin.get(words, ' ')
    res = []
    for char in han_pinyin:
        #res.join(punt_dict.get(char, char))
        #ret = punt_dict.get(char, char).encode('utf-8')
        ret = punt_dict.get(char, char)
        #res.join(ret)
        res = res + [ret]
        print char, ret
    res = ''.join(res)
    print han_pinyin, "-->", res
    #temp = han_pinyin.decode("utf8")
    #res = re.sub("[\.\!\/_,$%^*(+\"\']+|[+——!,。?、~@#¥%……&*()]+".decode("utf8"), ",".decode("utf8"),temp)
    #han_pinyin = pinyin.filter_zh_punc(han_pinyin)
    #print res
    fto = open(to_file, 'w')
    #fto.write(han_pinyin)
    fto.write(res)
    ffrom.close()
    fto.close()
Ejemplo n.º 3
0
def change2pinyin(from_file, to_file, punt_dict):
    print from_file, to_file
    ffrom = open(from_file, 'r')
    words = ffrom.readline()
    print words
    #words = words.decode("utf-8")
    #这里转化为utf8之所以可转可不转是因为在获取拼音的代码内部有做转化utf8
    #jieba内部也有对字符串进行转换的代码,所以可以直接输入utf8. 但是实际上在处理的时候都是unicode
    han_pinyin = pinyin.get(words, ' ')
    res = []
    for char in han_pinyin:
        #res.join(punt_dict.get(char, char))
        #ret = punt_dict.get(char, char).encode('utf-8')
        ret = punt_dict.get(char, char)
        #res.join(ret)
        res = res + [ret]
        print char, ret
    res = ''.join(res)
    print han_pinyin, "-->", res
    #temp = han_pinyin.decode("utf8")
    #res = re.sub("[\.\!\/_,$%^*(+\"\']+|[+——!,。?、~@#¥%……&*()]+".decode("utf8"), ",".decode("utf8"),temp)
    #han_pinyin = pinyin.filter_zh_punc(han_pinyin)
    #print res
    fto = open(to_file, 'w')
    #fto.write(han_pinyin)
    fto.write(res)
    ffrom.close()
    fto.close()
Ejemplo n.º 4
0
def extract_zero_consonant(filename):
    f = open(filename, 'r')
    lines = f.readlines()
    f.close()
    lines = filter_punct(lines)
    er_text = "儿".decode('utf-8')
    for line in lines:
        line = line.decode('utf-8').strip()
        index = line.find(er_text)
        if index != -1:
            #print "ecape er.... ", line
            continue
        han_pinyin = pinyin.get(line, ' ')
        #if is_zero_consonant(han_pinyin):
        #    print line
        pinyins = han_pinyin.split(" ")
        count = 0
        flag = 1
        #print han_pinyin
        for py_str in pinyins:
            if is_zero_consonant(py_str):
                count = count + 1
            #if flag == 1:
            #    flag = 0
            #    if is_modal(py_str):
            #        count = count + 1
            #if py_str == ',':
            #    #print "biao dian........"
            #    flag = 1

        if count > 1 and len(line) < 40:
            #print line, count
            print line
Ejemplo n.º 5
0
def convert_lines_pinyin(lines):
    """Run ``pinyin.get`` on every non-empty line of *lines*.

    Each item is decoded from UTF-8 and stripped.  Items that are empty after
    stripping only produce an "empty line" message on stdout; the others are
    passed to ``pinyin.get(line, ' ')``.

    NOTE(review): in the code visible here ``pinyin_lines`` is never appended
    to and nothing is returned, so the call evaluates to ``None``.  The body
    looks truncated -- confirm against the full source before relying on it.

    :param lines: iterable of UTF-8 encoded byte strings.
    """
    pinyin_lines = []
    for line in lines:
        # Work on stripped unicode text from here on.
        line = line.decode('utf-8').strip()
        if len(line) == 0:
            print "empty line"
        else:
            # ' ' is presumably the delimiter placed between syllables --
            # verify against the pinyin module.
            han_pinyin = pinyin.get(line, ' ')
Ejemplo n.º 6
0
def convert_lines_pinyin(lines):
    """Run ``pinyin.get`` on every non-empty line of *lines*.

    Each item is decoded from UTF-8 and stripped.  Items that are empty after
    stripping only produce an "empty line" message on stdout; the others are
    passed to ``pinyin.get(line, ' ')``.

    NOTE(review): in the code visible here ``pinyin_lines`` is never appended
    to and nothing is returned, so the call evaluates to ``None``.  The body
    looks truncated -- confirm against the full source before relying on it.

    :param lines: iterable of UTF-8 encoded byte strings.
    """
    pinyin_lines = []
    for line in lines:
        # Work on stripped unicode text from here on.
        line = line.decode('utf-8').strip()
        if len(line) == 0:
            print "empty line"
        else:
            # ' ' is presumably the delimiter placed between syllables --
            # verify against the pinyin module.
            han_pinyin = pinyin.get(line, ' ')
Ejemplo n.º 7
0
def cslt_zh_txt_gen():
    from_file = "/home/sooda/speech/phoneSet/cslt/cslt_dict.txt"
    ffrom = open(from_file, 'r')
    lines = ffrom.readlines()
    for line in lines:
        segs = line.decode("utf-8").strip().split(" ")
        word = segs[0]
        han_pinyin = pinyin.get(word, ' ')
        #print word, han_pinyin, segs[1], segs[2]
        print han_pinyin, segs[1], segs[2]
Ejemplo n.º 8
0
def cslt_zh_txt_gen():
    from_file = "/home/sooda/speech/phoneSet/cslt/cslt_dict.txt"
    ffrom = open(from_file, 'r')
    lines = ffrom.readlines()
    for line in lines:
        segs = line.decode("utf-8").strip().split(" ")
        word = segs[0]
        han_pinyin = pinyin.get(word, ' ')
        #print word, han_pinyin, segs[1], segs[2]
        print han_pinyin, segs[1], segs[2]
Ejemplo n.º 9
0
def transcript_pinyin(from_file, to_file):
    print from_file, to_file
    ffrom = open(from_file, 'r')
    fto = open(to_file, 'w')
    lines = ffrom.readlines()
    linenum = 0
    for words in lines:
        print words
        han_pinyin = pinyin.get(words, ' ')

        fto.write(str(linenum)+'\n')
        fto.write(words)
        fto.write(han_pinyin)
        linenum = linenum + 1
        #res = []
        #for w in words:
        #  wpy = pinyin.get(w, ' ')
        #  temp = [w] + [wpy]
        #  res = res + temp
        #fto.write(res)
    ffrom.close()
    fto.close()
Ejemplo n.º 10
0
def transcript_pinyin(from_file, to_file):
    print from_file, to_file
    ffrom = open(from_file, 'r')
    fto = open(to_file, 'w')
    lines = ffrom.readlines()
    linenum = 0
    for words in lines:
        print words
        han_pinyin = pinyin.get(words, ' ')

        fto.write(str(linenum) + '\n')
        fto.write(words)
        fto.write(han_pinyin)
        linenum = linenum + 1
        #res = []
        #for w in words:
        #  wpy = pinyin.get(w, ' ')
        #  temp = [w] + [wpy]
        #  res = res + temp
        #fto.write(res)
    ffrom.close()
    fto.close()
Ejemplo n.º 11
0
def convert_word_pinyin(line):
    """Return the result of ``pinyin.get(line, ' ')`` for *line*.

    :param line: text handed unchanged to ``pinyin.get``.
    :return: whatever ``pinyin.get`` produces for *line* with ``' '`` as its
        second argument.
    """
    return pinyin.get(line, ' ')
Ejemplo n.º 12
0
def convert_word_pinyin(line):
    """Return the result of ``pinyin.get(line, ' ')`` for *line*.

    :param line: text handed unchanged to ``pinyin.get``.
    :return: whatever ``pinyin.get`` produces for *line* with ``' '`` as its
        second argument.
    """
    return pinyin.get(line, ' ')