def extract_zero_consonant(filename): f = open(filename, 'r') lines = f.readlines() f.close() lines = filter_punct(lines) er_text = "儿".decode('utf-8') for line in lines: line = line.decode('utf-8').strip() index = line.find(er_text) if index != -1: #print "ecape er.... ", line continue han_pinyin = pinyin.get(line, ' ') #if is_zero_consonant(han_pinyin): # print line pinyins = han_pinyin.split(" ") count = 0 flag = 1 #print han_pinyin for py_str in pinyins: if is_zero_consonant(py_str): count = count + 1 #if flag == 1: # flag = 0 # if is_modal(py_str): # count = count + 1 #if py_str == ',': # #print "biao dian........" # flag = 1 if count > 1 and len(line) < 40: #print line, count print line
def change2pinyin(from_file, to_file, punt_dict): print from_file, to_file ffrom = open(from_file, 'r') words = ffrom.readline() print words #words = words.decode("utf-8") #这里转化为utf8之所以可转可不转是因为在获取拼音的代码内部有做转化utf8 #jieba内部也有对字符串进行转换的代码,所以可以直接输入utf8. 但是实际上在处理的时候都是unicode han_pinyin = pinyin.get(words, ' ') res = [] for char in han_pinyin: #res.join(punt_dict.get(char, char)) #ret = punt_dict.get(char, char).encode('utf-8') ret = punt_dict.get(char, char) #res.join(ret) res = res + [ret] print char, ret res = ''.join(res) print han_pinyin, "-->", res #temp = han_pinyin.decode("utf8") #res = re.sub("[\.\!\/_,$%^*(+\"\']+|[+——!,。?、~@#¥%……&*()]+".decode("utf8"), ",".decode("utf8"),temp) #han_pinyin = pinyin.filter_zh_punc(han_pinyin) #print res fto = open(to_file, 'w') #fto.write(han_pinyin) fto.write(res) ffrom.close() fto.close()
def convert_lines_pinyin(lines): pinyin_lines = [] for line in lines: line = line.decode('utf-8').strip() if len(line) == 0: print "empty line" else: han_pinyin = pinyin.get(line, ' ')
def cslt_zh_txt_gen(): from_file = "/home/sooda/speech/phoneSet/cslt/cslt_dict.txt" ffrom = open(from_file, 'r') lines = ffrom.readlines() for line in lines: segs = line.decode("utf-8").strip().split(" ") word = segs[0] han_pinyin = pinyin.get(word, ' ') #print word, han_pinyin, segs[1], segs[2] print han_pinyin, segs[1], segs[2]
def transcript_pinyin(from_file, to_file): print from_file, to_file ffrom = open(from_file, 'r') fto = open(to_file, 'w') lines = ffrom.readlines() linenum = 0 for words in lines: print words han_pinyin = pinyin.get(words, ' ') fto.write(str(linenum)+'\n') fto.write(words) fto.write(han_pinyin) linenum = linenum + 1 #res = [] #for w in words: # wpy = pinyin.get(w, ' ') # temp = [w] + [wpy] # res = res + temp #fto.write(res) ffrom.close() fto.close()
def transcript_pinyin(from_file, to_file): print from_file, to_file ffrom = open(from_file, 'r') fto = open(to_file, 'w') lines = ffrom.readlines() linenum = 0 for words in lines: print words han_pinyin = pinyin.get(words, ' ') fto.write(str(linenum) + '\n') fto.write(words) fto.write(han_pinyin) linenum = linenum + 1 #res = [] #for w in words: # wpy = pinyin.get(w, ' ') # temp = [w] + [wpy] # res = res + temp #fto.write(res) ffrom.close() fto.close()
def convert_word_pinyin(line):
    """Return ``pinyin.get(line, ' ')`` for the given text.

    Args:
        line: text handed to ``pinyin.get`` unchanged.

    Returns:
        Whatever ``pinyin.get`` returns for *line* with ``' '`` as its
        second argument (presumably the syllable delimiter -- confirm
        against the pinyin module in use).
    """
    return pinyin.get(line, ' ')