def getSpeechSynthesis_AlgorithmDir():
    """Locate the speech-synthesis ``algorithm`` directory and return its path.

    ``sys.argv[0]``, this module's ``__file__`` and ``sys.path[0]`` are
    searched, in that order, for the ``/algorithm/tools/`` marker via
    ``findDirByString``. When the result does not exist on disk a hard-coded
    development checkout is used instead. The final value is that path cut
    just after ``/algorithm/``.

    Returns:
        The path returned by ``findDirByString(path, '/algorithm/')``.

    Raises:
        SystemExit: with status 0 (unchanged from the original) when the
            resolved directory does not exist.
    """
    # The original also ran a '/trunk/algorithm/tools/' search first, but its
    # result was overwritten unconditionally by the search below, so only the
    # effective search is kept. The `len(path) == 0` check that followed the
    # hard-coded fallback could never be true and is dropped as well.
    marker = '/algorithm/tools/'
    path = ''
    for candidate in (sys.argv[0], __file__, sys.path[0]):
        path = findDirByString(candidate, marker)
        if len(path) != 0:
            break

    if not os.path.exists(path):
        path = '/Users/daiqiang/speech_synthesis_svr_proj/trunk/algorithm/'

    algorithm_dir = findDirByString(path, '/algorithm/')
    if not os.path.exists(algorithm_dir):
        print(pyUsage.get_cur_info(), 'dir not exists! (算法目录不存在!)')
        sys.exit(0)
    return algorithm_dir
def append_eating(eating_id, day, pics, name, kind):
    """Create an ``EatingModel`` from the arguments, save it and pack the result.

    Args:
        eating_id, day, pics, name, kind: Field values forwarded unchanged to
            the ``EatingModel`` constructor.

    Returns:
        Whatever ``pack_json_resp`` returns for the ``save()`` result, the
        message ``'append_eating err'`` and the code ``-1``.
    """
    # Logged first so that locals() holds only the five arguments.
    print (pyUsage.get_cur_info(), locals())
    record = EatingModel(
        eating_id=eating_id,
        day=day,
        pics=pics,
        name=name,
        kind=kind,
    )
    # NOTE(review): a stock Django Model.save() returns None; confirm that
    # EatingModel.save returns a status that pack_json_resp can interpret.
    saved = record.save()
    print (pyUsage.get_cur_info(), 'ret= ', saved)
    return pack_json_resp(saved, 'append_eating err', -1)
def objs2dict_list(objs, fields):
    """Convert objects into a list of dicts restricted to the given attributes.

    Args:
        objs: Iterable of objects to read attributes from.
        fields: Iterable of attribute names; each becomes a key in every dict.

    Returns:
        A list with one dict per object. ``datetime.datetime`` values are
        replaced by their ``'%Y-%m-%d'`` string; other values are stored as-is.
    """
    rows = []
    for obj in objs:
        print (pyUsage.get_cur_info(), fields)
        row = {}
        for name in fields:
            value = getattr(obj, name)
            if isinstance(value, datetime.datetime):
                value = value.strftime('%Y-%m-%d')
            row[name] = value
            # Logged after conversion, so datetimes are reported as str.
            print (pyUsage.get_cur_info(), type(value))
        rows.append(row)
    return rows
def addEnglish2Number(root):
    """Append the ``ElementEntry`` holding the English+number rules to *root*.

    Args:
        root: Parent element handed to ``ET.SubElement``; modified in place.

    Returns:
        None.
    """
    entry = ET.SubElement(root, 'ElementEntry')
    entry.set('Comment', '默认顺序是:先具体,后正则;先短后长')
    # Kept as a direct call in this function body: element [1] of
    # get_cur_info() is presumably the calling function's name (TODO confirm),
    # which becomes the TextType once every 'add' is removed.
    cur_info = pyUsage.get_cur_info()
    entry.set('TextType', cur_info[1].replace('add', ''))

    # (Comment, regex, ((ReadType, voice text), ...)); IDs follow tuple order.
    # Regexes are raw strings so that '\d' is not an invalid escape sequence.
    # Both nodes carry the same pattern, as in the original.
    rules = (
        ('JX06323', r'[a-zA-Z]+\d+',
         (('Bit', '{English(Bit)}{Number(Bit)}'),)),
        ('动车,快车,城际列车D632', r'[a-zA-Z]+\d+',
         (('Bit', '{English(Bit)}{Number(Bit)}'),)),
    )
    for node_id, (comment, pattern, voices) in enumerate(rules, start=1):
        node = ET.SubElement(entry, 'Node')
        node.set('Comment', comment)
        node.set('ContentType', 'Regex')
        node.set('ID', '%d' % node_id)
        node.text = pattern
        for read_type, voice_text in voices:
            voice = ET.SubElement(node, 'Voice')
            voice.set('ReadType', read_type)
            voice.text = voice_text
def addNumber2Kanji(root):
    """Append the ``ElementEntry`` holding the number+Kanji rules to *root*.

    Args:
        root: Parent element handed to ``ET.SubElement``; modified in place.

    Returns:
        None.
    """
    entry = ET.SubElement(root, 'ElementEntry')
    entry.set('Comment', '默认顺序是:先具体,后正则;先短后长')
    # Kept as a direct call in this function body: element [1] of
    # get_cur_info() is presumably the calling function's name (TODO confirm),
    # which becomes the TextType once every 'add' is removed.
    cur_info = pyUsage.get_cur_info()
    entry.set('TextType', cur_info[1].replace('add', ''))

    # (Comment, regex, voices); a voice of None is a bare <Voice/> element
    # with no ReadType and no text, which the original also emitted.
    # Regexes are raw strings so that '\d' is not an invalid escape sequence.
    rules = (
        ('数量', r'\d+年',
         (('NumberBit+Kanji', '{Number(Count)}年'),
          ('Year', '{Number(Bit)}年'),
          None)),
        ('数量', r'\d+.*',
         (('Count+Kanji', '{Number(Count)}{Kanji}'),)),
    )
    for node_id, (comment, pattern, voices) in enumerate(rules, start=1):
        node = ET.SubElement(entry, 'Node')
        node.set('Comment', comment)
        node.set('ContentType', 'Regex')
        node.set('ID', '%d' % node_id)
        node.text = pattern
        for voice_spec in voices:
            voice = ET.SubElement(node, 'Voice')
            if voice_spec is not None:
                read_type, voice_text = voice_spec
                voice.set('ReadType', read_type)
                voice.text = voice_text
def addNumber2Punctuation(root):
    """Append the ``ElementEntry`` holding the number+punctuation rules to *root*.

    Args:
        root: Parent element handed to ``ET.SubElement``; modified in place.

    Returns:
        None.
    """
    entry = ET.SubElement(root, 'ElementEntry')
    entry.set('Comment', '默认顺序是:先具体,后正则;先短后长')
    # Kept as a direct call in this function body: element [1] of
    # get_cur_info() is presumably the calling function's name (TODO confirm),
    # which becomes the TextType once every 'add' is removed.
    cur_info = pyUsage.get_cur_info()
    entry.set('TextType', cur_info[1].replace('add', ''))

    # (Comment, regex, voices); a voice of None is a bare <Voice/> element
    # with no ReadType and no text, which the original also emitted.
    # Regexes are raw strings so that '\d' is not an invalid escape sequence.
    # NOTE(review): 'ChinesePrecent' looks like a misspelling of "Percent" but
    # is emitted verbatim because consumers may match on it.
    rules = (
        ('百分数', r'\d+%',
         (('ChinesePrecent', '百分之{Number(Count)}'),
          None)),
        ('其他', r'\d+.*',
         (('NumberBit', '{Number(BitIgnorePunctuation)}'),)),
    )
    for node_id, (comment, pattern, voices) in enumerate(rules, start=1):
        node = ET.SubElement(entry, 'Node')
        node.set('Comment', comment)
        node.set('ContentType', 'Regex')
        node.set('ID', '%d' % node_id)
        node.text = pattern
        for voice_spec in voices:
            voice = ET.SubElement(node, 'Voice')
            if voice_spec is not None:
                read_type, voice_text = voice_spec
                voice.set('ReadType', read_type)
                voice.text = voice_text
def addPunctuation(root):
    """Append the ``ElementEntry`` holding the punctuation rules to *root*.

    Args:
        root: Parent element handed to ``ET.SubElement``; modified in place.

    Returns:
        None.
    """
    entry = ET.SubElement(root, 'ElementEntry')
    entry.set('Comment', '默认顺序是:先具体,后正则;先短后长')
    # Kept as a direct call in this function body: element [1] of
    # get_cur_info() is presumably the calling function's name (TODO confirm),
    # which becomes the TextType once every 'add' is removed.
    cur_info = pyUsage.get_cur_info()
    entry.set('TextType', cur_info[1].replace('add', ''))

    # (Comment, literal text, ((ReadType, voice text), ...)); IDs follow order.
    # NOTE(review): 'Puctuation' looks like a misspelling of "Punctuation" but
    # is emitted verbatim because consumers may match on it.
    rules = (
        ('笑脸符号', ':)', (('Kanji', '笑脸'),)),
        ('<', '<', (('Kanji', '小于'), ('Puctuation', ''))),
    )
    for node_id, (comment, literal, voices) in enumerate(rules, start=1):
        node = ET.SubElement(entry, 'Node')
        node.set('Comment', comment)
        node.set('ContentType', 'StringValue')
        node.set('ID', '%d' % node_id)
        node.text = literal
        for read_type, voice_text in voices:
            voice = ET.SubElement(node, 'Voice')
            voice.set('ReadType', read_type)
            voice.text = voice_text
def checkMultiKey(word_info_list):
    """Merge pronunciation lists of repeated words and report the repeats.

    Args:
        word_info_list: Iterable of ``(word, pro_list)`` pairs.

    Returns:
        ``(mul_word_list, word_info_dict)``: the words seen more than once
        (one entry per repeat) and a dict mapping each word to its merged
        pronunciation list. The list stored for a word is the first
        ``pro_list`` object seen for it, extended in place with any new
        pronunciations from later occurrences.
    """
    mul_word_list = []
    word_info_dict = {}
    for word, pro_list in word_info_list:
        if word not in word_info_dict:
            word_info_dict[word] = pro_list
            continue

        merged = word_info_dict[word]
        # Snapshot before merging: the original kept a reference to the very
        # list it then appended to, so its "old" printout showed merged data.
        old_list = list(merged)
        mul_word_list.append(word)

        # Add only pronunciations that are not already recorded.
        for pro in pro_list:
            if pro not in merged:
                merged.append(pro)

        print(pyUsage.get_cur_info(), word, 'bad duplicate:')
        # Previously stored pronunciations.
        for e in old_list:
            print('====', e)
        print('')
        # Pronunciations from the repeated entry.
        for e in pro_list:
            print('----', e)
        print('')
        # Result after merging.
        for e in merged:
            print('>>>>', e)
        print('\n')
    return mul_word_list, word_info_dict
def extract_word_charactor(element):
    """Split a ``word/tag`` token at its last ``/``.

    Args:
        element: String expected to contain at least one ``/``.

    Returns:
        ``[text_before_last_slash, text_after_last_slash]``.

    Raises:
        SystemExit: with status 0 (unchanged from the original) when
            *element* contains no ``/``.
    """
    pos = element.rfind('/')
    if pos == -1:
        print(pyUsage.get_cur_info(), 'no split for chara error! element=', element)
        # The original had a fallback chain after this exit (tagging symbols
        # with 'w', rejecting long tokens); it was unreachable and is removed.
        sys.exit(0)
    return [element[:pos], element[pos + 1:]]
def append_sns(sns_id, url, title, types, duration):
    """Create an ``SnsModel`` from the arguments, save it and pack the result.

    Args:
        sns_id, url, title, types, duration: Field values forwarded unchanged
            to the ``SnsModel`` constructor.

    Returns:
        Whatever ``pack_json_resp`` returns for the ``save()`` result, the
        message ``'append_sns err'`` and the code ``-1``.
    """
    record = SnsModel(
        sns_id=sns_id,
        url=url,
        title=title,
        types=types,
        duration=duration,
    )
    # NOTE(review): a stock Django Model.save() returns None; confirm that
    # SnsModel.save returns a status that pack_json_resp can interpret.
    saved = record.save()
    print (pyUsage.get_cur_info(), 'ret= ', saved)
    return pack_json_resp(saved, 'append_sns err', -1)
def readPinyinXmlDict(path, ignore_duplicate = False):
    """Read a pinyin XML dictionary and index its entries by word.

    Args:
        path: File path passed to ``readPinyinXml``.
        ignore_duplicate: When true, a repeated word silently replaces the
            earlier entry; when false, a repeated word aborts the program.

    Returns:
        ``(lang, name, res_dict)`` where ``res_dict`` maps element ``[0]`` of
        each entry to element ``[1]``.

    Raises:
        SystemExit: with status 0 on a duplicate word when *ignore_duplicate*
            is false.
    """
    lang, name, entries = readPinyinXml(path)
    res_dict = {}
    for entry in entries:
        word = entry[0]
        if not ignore_duplicate and word in res_dict:
            print('error! duplicate key: ', entry)
            sys.exit(0)
        res_dict[word] = entry[1]
    print(pyUsage.get_cur_info(), 'len(res_dict)=', len(res_dict), path)
    return lang, name, res_dict
def getGBKText(word):
    """Return the upper-case hex of each character's two-byte gb18030 encoding.

    Args:
        word: Text to convert; surrounding whitespace is stripped first.

    Returns:
        The concatenated hex digits of every character, in order.

    Raises:
        SystemExit: with status 0 when a character does not encode to exactly
            two bytes in gb18030.
    """
    hex_parts = []
    for ch in word.strip():
        encoded = ch.encode('gb18030')
        if len(encoded) != 2:
            print(pyUsage.get_cur_info(), 'critical error', '不是GBK编码')
            sys.exit(0)
        hex_parts.append('%X%X' % (encoded[0], encoded[1]))
    return ''.join(hex_parts)
def append_audio(audio_id, url, title, type, duration):
    """Create an ``AudioModel`` from the arguments, save it and pack the result.

    Args:
        audio_id, url, title, type, duration: Field values forwarded unchanged
            to the ``AudioModel`` constructor. ``type`` shadows the builtin
            but is part of the public signature, so it is left as-is.

    Returns:
        Whatever ``pack_json_resp`` returns for the ``save()`` result, the
        message ``'append_audio err'`` and the code ``-1``.
    """
    record = AudioModel(
        audio_id=audio_id,
        url=url,
        title=title,
        type=type,
        duration=duration,
    )
    # NOTE(review): a stock Django Model.save() returns None; confirm that
    # AudioModel.save returns a status that pack_json_resp can interpret.
    saved = record.save()
    print (pyUsage.get_cur_info(), 'ret= ', saved)
    return pack_json_resp(saved, 'append_audio err', -1)
def getPinyinCntDicts(path):
    """Build per-word pinyin tables from a pinyin XML dictionary.

    Args:
        path: Path of the dictionary file read by ``pyXml.readPinyinXmlDict``.

    Returns:
        ``(word_py_dict, word_py_cnt_dict, pinyin_cnt_distribution_dict)``:
        word -> ``getPinyin`` result, word -> length of that result, and the
        dict updated through ``pyString.insert_or_add_dict`` with each length
        and the value 1 (presumably a length histogram — confirm against
        pyString).

    Raises:
        SystemExit: with status 0 when *path* does not exist.
    """
    if not os.path.exists(path):
        print(pyUsage.get_cur_info(), '路径不存在,path= ', path)
        sys.exit(0)

    _lang, _name, word_info_dict = pyXml.readPinyinXmlDict(path)
    word_py_dict = {}
    word_py_cnt_dict = {}
    pinyin_cnt_distribution_dict = {}
    for word, info in word_info_dict.items():
        # The original called getPinyin() three times per word with the same
        # argument; one call is enough (assumes getPinyin has no side effects).
        pinyin = getPinyin(info)
        pinyin_cnt = len(pinyin)
        word_py_dict[word] = pinyin
        word_py_cnt_dict[word] = pinyin_cnt
        pyString.insert_or_add_dict(pinyin_cnt_distribution_dict, pinyin_cnt, 1)

    # How many pinyin each word has.
    print(pyUsage.get_cur_info(),
          'pinyin_cnt_distribution_dict= ', pinyin_cnt_distribution_dict,
          'total_word_cnt= ', len(word_info_dict))
    return word_py_dict, word_py_cnt_dict, pinyin_cnt_distribution_dict
def add_eating(request):
    """Create an eating record on POST, otherwise render the entry form.

    POST: reads ``eating_id``, ``day``, ``pics``, ``name`` and ``kind`` from
    ``request.POST``. An empty ``day`` or a ``day`` that already has a record
    is rejected through ``pack_json_resp``; otherwise the record is stored
    with ``append_eating`` (``eating_id`` defaults to ``day`` when empty).

    Any other method: renders ``up_pic7.html`` with a pre-filled
    ``EatingForm`` and every existing record.

    Args:
        request: The incoming request; ``method`` and ``POST`` are read.

    Returns:
        The ``pack_json_resp`` / ``append_eating`` result for POST requests,
        or the ``render_to_response`` result otherwise.
    """
    if request.method == 'POST':
        post = request.POST
        print (pyUsage.get_cur_info(), 'POST= ', post)
        # 'pics' was fetched twice in the original; once is enough.
        pics = post.get('pics', '')
        print (pyUsage.get_cur_info(), 'url= ', pics)
        eating_id = post.get('eating_id', '')
        day = post.get('day', '')
        name = post.get('name', '')
        kind = post.get('kind', '')

        if len(day) == 0:
            return pack_json_resp(False, 'add_eating error! day is empty', -1)
        if len(EatingModel.objects.filter(day=day)) > 0:
            return pack_json_resp(False, 'add_eating error! day already exists!', -1)
        if len(eating_id) == 0:
            eating_id = day
        return append_eating(eating_id, day, pics, name, kind)

    # Non-POST: the original built a default EatingForm() and an empty list
    # up front that were always replaced here, so they are created only once.
    form = EatingForm(
        initial={
            'eating_id': '2',
            'day': '20160515',
            'pics': '',
            'name': '西红柿鸡蛋',
            'kind': "3",
        }
    )
    eating_list = objs2dict_list(EatingModel.objects.all(), g_eating_fields)
    return render_to_response('up_pic7.html', {'form': form, 'eating_list': eating_list})
def findDirByString(path, s):
    """Return the absolute form of *path* cut just after the first *s*.

    Args:
        path: Path to search; stripped and made absolute first. If it is an
            existing directory a trailing separator is ensured.
        s: Marker substring to look for.

    Returns:
        The prefix of the absolute path up to and including the first
        occurrence of *s*, or ``''`` when *s* does not occur.
    """
    full_path = os.path.abspath(path.strip())
    # Make directories end with a separator so markers ending in one match.
    if os.path.isdir(full_path) and full_path[-1] != os.sep:
        full_path = full_path + os.sep

    start = full_path.find(s)
    if start == -1:
        return ''
    print(pyUsage.get_cur_info(), 'total path= ', full_path)
    end = start + len(s)
    return full_path[:end]
def addPunctuation2English2Number(root):
    """Append the ``ElementEntry`` for punctuation+English+number text to *root*.

    Args:
        root: Parent element handed to ``ET.SubElement``; modified in place.

    Returns:
        None.
    """
    entry = ET.SubElement(root, 'ElementEntry')
    entry.set('Comment', '默认顺序是:先具体,后正则;先短后长')
    # Kept as a direct call in this function body: element [1] of
    # get_cur_info() is presumably the calling function's name (TODO confirm),
    # which becomes the TextType once every 'add' is removed.
    cur_info = pyUsage.get_cur_info()
    entry.set('TextType', cur_info[1].replace('add', ''))

    # (Comment, regex, ((ReadType, voice text), ...)); IDs follow tuple order.
    # The regex is a raw string so that '\d' is not an invalid escape sequence.
    rules = (
        (' 1938/nx', r' \d+', (('Time', '{Number(Count)}'),)),
    )
    for node_id, (comment, pattern, voices) in enumerate(rules, start=1):
        node = ET.SubElement(entry, 'Node')
        node.set('Comment', comment)
        node.set('ContentType', 'Regex')
        node.set('ID', '%d' % node_id)
        node.text = pattern
        for read_type, voice_text in voices:
            voice = ET.SubElement(node, 'Voice')
            voice.set('ReadType', read_type)
            voice.text = voice_text
def addNumber2English(root):
    """Append the ``ElementEntry`` holding the number+English rules to *root*.

    Args:
        root: Parent element handed to ``ET.SubElement``; modified in place.

    Returns:
        None.
    """
    entry = ET.SubElement(root, 'ElementEntry')
    entry.set('Comment', '默认顺序是:先具体,后正则;先短后长')
    # Kept as a direct call in this function body: element [1] of
    # get_cur_info() is presumably the calling function's name (TODO confirm),
    # which becomes the TextType once every 'add' is removed.
    cur_info = pyUsage.get_cur_info()
    entry.set('TextType', cur_info[1].replace('add', ''))

    # (Comment, regex, ((ReadType, voice text), ...)); IDs follow tuple order.
    # The regex is a raw string so that '\d' is not an invalid escape sequence.
    rules = (
        ('3a', r'\d[a-zA-Z]', (('NumberBit', '{Number(Bit)}{English(Bit)}'),)),
    )
    for node_id, (comment, pattern, voices) in enumerate(rules, start=1):
        node = ET.SubElement(entry, 'Node')
        node.set('Comment', comment)
        node.set('ContentType', 'Regex')
        node.set('ID', '%d' % node_id)
        node.text = pattern
        for read_type, voice_text in voices:
            voice = ET.SubElement(node, 'Voice')
            voice.set('ReadType', read_type)
            voice.text = voice_text
def addComplicatedText(root):
    """Append the ``ElementEntry`` holding the complicated-text rules to *root*.

    Args:
        root: Parent element handed to ``ET.SubElement``; modified in place.

    Returns:
        None.
    """
    entry = ET.SubElement(root, 'ElementEntry')
    entry.set('Comment', '默认顺序是:先具体,后正则;先短后长')
    # Kept as a direct call in this function body: element [1] of
    # get_cur_info() is presumably the calling function's name (TODO confirm),
    # which becomes the TextType once every 'add' is removed.
    cur_info = pyUsage.get_cur_info()
    entry.set('TextType', cur_info[1].replace('add', ''))

    # (Comment, regex, ((ReadType, voice text), ...)); IDs follow tuple order.
    rules = (
        ('电子邮件', '.*@.*', (('EMail', '{ComplicatedText(Email)}'),)),
    )
    for node_id, (comment, pattern, voices) in enumerate(rules, start=1):
        node = ET.SubElement(entry, 'Node')
        node.set('Comment', comment)
        node.set('ContentType', 'Regex')
        node.set('ID', '%d' % node_id)
        node.text = pattern
        for read_type, voice_text in voices:
            voice = ET.SubElement(node, 'Voice')
            voice.set('ReadType', read_type)
            voice.text = voice_text
def addKanji(root):
    """Append the ``ElementEntry`` holding the Kanji rule to *root*.

    Args:
        root: Parent element handed to ``ET.SubElement``; modified in place.

    Returns:
        None.
    """
    entry = ET.SubElement(root, 'ElementEntry')
    entry.set('Comment', '默认顺序是:先具体,后正则;先短后长')
    # Kept as a direct call in this function body: element [1] of
    # get_cur_info() is presumably the calling function's name (TODO confirm),
    # which becomes the TextType once every 'add' is removed.
    cur_info = pyUsage.get_cur_info()
    entry.set('TextType', cur_info[1].replace('add', ''))

    # (Comment, regex, ((ReadType, voice text), ...)); IDs follow tuple order.
    rules = (
        ('mg', '[^0-9a-zA-Z].*', (('Kanji', '{Kanji}'),)),
    )
    for node_id, (comment, pattern, voices) in enumerate(rules, start=1):
        node = ET.SubElement(entry, 'Node')
        node.set('Comment', comment)
        node.set('ContentType', 'Regex')
        node.set('ID', '%d' % node_id)
        node.text = pattern
        for read_type, voice_text in voices:
            voice = ET.SubElement(node, 'Voice')
            voice.set('ReadType', read_type)
            voice.text = voice_text
def addPunctuation2Number2Punctuation2Number(root):
    """Append the ``ElementEntry`` for text such as ``-9:30`` to *root*.

    Args:
        root: Parent element handed to ``ET.SubElement``; modified in place.

    Returns:
        None.
    """
    entry = ET.SubElement(root, 'ElementEntry')
    entry.set('Comment', '默认顺序是:先具体,后正则;先短后长')
    # Kept as a direct call in this function body: element [1] of
    # get_cur_info() is presumably the calling function's name (TODO confirm),
    # which becomes the TextType once every 'add' is removed.
    cur_info = pyUsage.get_cur_info()
    entry.set('TextType', cur_info[1].replace('add', ''))

    # (Comment, regex, ((ReadType, voice text), ...)); IDs follow tuple order.
    # The regex is a raw string so that '\d' is not an invalid escape sequence.
    rules = (
        ('"9:25 分 -9:30 分 之间', r'-\d+:\d+', (('Time', '至{Time}'),)),
    )
    for node_id, (comment, pattern, voices) in enumerate(rules, start=1):
        node = ET.SubElement(entry, 'Node')
        node.set('Comment', comment)
        node.set('ContentType', 'Regex')
        node.set('ID', '%d' % node_id)
        node.text = pattern
        for read_type, voice_text in voices:
            voice = ET.SubElement(node, 'Voice')
            voice.set('ReadType', read_type)
            voice.text = voice_text
def addPunctuation2Number2Punctuation(root):
    """Append the ``ElementEntry`` for parenthesised numbers to *root*.

    Args:
        root: Parent element handed to ``ET.SubElement``; modified in place.

    Returns:
        None.
    """
    entry = ET.SubElement(root, 'ElementEntry')
    entry.set('Comment', '默认顺序是:先具体,后正则;先短后长')
    # Kept as a direct call in this function body: element [1] of
    # get_cur_info() is presumably the calling function's name (TODO confirm),
    # which becomes the TextType once every 'add' is removed.
    cur_info = pyUsage.get_cur_info()
    entry.set('TextType', cur_info[1].replace('add', ''))

    # (Comment, regex, ((ReadType, voice text), ...)); IDs follow tuple order.
    # The regex is a raw string so that '\(' and '\d' are not invalid escapes.
    rules = (
        ('(000936)股票证券代码', r'\(\d+\)', (('NumberBit', '{Number(Count)}'),)),
    )
    for node_id, (comment, pattern, voices) in enumerate(rules, start=1):
        node = ET.SubElement(entry, 'Node')
        node.set('Comment', comment)
        node.set('ContentType', 'Regex')
        node.set('ID', '%d' % node_id)
        node.text = pattern
        for read_type, voice_text in voices:
            voice = ET.SubElement(node, 'Voice')
            voice.set('ReadType', read_type)
            voice.text = voice_text
def addNumber2Punctuation2Number(root):
    """Append the ``ElementEntry`` for ``number<sep>number`` text to *root*.

    Three regex nodes are added, for ``:``-separated, ``-``-separated and
    any-single-character-separated number pairs.

    Args:
        root: Parent element handed to ``ET.SubElement``; modified in place.

    Returns:
        None.
    """
    entry = ET.SubElement(root, 'ElementEntry')
    entry.set('Comment', '默认顺序是:先具体,后正则;先短后长')
    # Kept as a direct call in this function body: element [1] of
    # get_cur_info() is presumably the calling function's name (TODO confirm),
    # which becomes the TextType once every 'add' is removed.
    cur_info = pyUsage.get_cur_info()
    entry.set('TextType', cur_info[1].replace('add', ''))

    # (Comment, regex, ((ReadType, voice text), ...)); IDs follow tuple order.
    # Regexes are raw strings so that '\d' is not an invalid escape sequence.
    rules = (
        ('时间,比分', r'\d+:\d+',
         (('Time', '{Number(Count)}点{Number(Count)}分'),
          ('Score2Score', '{Number(Count)}比{Number(Count)}'))),
        ('时间,比分', r'\d+-\d+',
         (('Time', '{Number(Count)}点{Number(Count)}分'),
          ('Count', '{Number(Count)}比{Number(Count)}'))),
        # The '.' is unescaped in the original pattern, so it matches any
        # character; kept verbatim.
        ('时间,比分', r'\d+.\d+',
         (('DecimalChinese', '{Number(Count)}点{Number(Count)}'),)),
    )
    for node_id, (comment, pattern, voices) in enumerate(rules, start=1):
        node = ET.SubElement(entry, 'Node')
        node.set('Comment', comment)
        node.set('ContentType', 'Regex')
        node.set('ID', '%d' % node_id)
        node.text = pattern
        for read_type, voice_text in voices:
            voice = ET.SubElement(node, 'Voice')
            voice.set('ReadType', read_type)
            voice.text = voice_text
def generateCharacteristicInfo(w_c_list, sen_id, index, sub_index, chara_cnt = 5):
    """Build the feature row for the entry at *index* of a tagged sentence.

    The row starts with six fixed fields (sentence id, index, first and
    second element of the entry, length of the first element, sub_index),
    followed by a (text, pos) pair for every offset yielded by
    ``genCharaDiffList(chara_cnt)``.

    Args:
        w_c_list: Sequence of two-element entries; per the inline comments
            below, element 0 is the word and element 1 its part of speech.
        sen_id: Sentence id, formatted with ``%d``.
        index: Position in *w_c_list* of the entry being described.
        sub_index: Formatted with ``%d`` and stored as the sixth field.
        chara_cnt: Passed to ``genCharaDiffList``; presumably the context
            window size — confirm against that helper.

    Returns:
        The list of fields described above.
    """
    length = len(w_c_list)
    w,c = w_c_list[index]
    relate_info = ['%d'%sen_id,]  # sentence ID
    relate_info.append('%s'%index)
    relate_info.append(w)
    relate_info.append(c)
    relate_info.append('%d'%len(w))
    relate_info.append('%d'%sub_index)
    #for diff in [-1,1,-2,2,-3,3,-4,4,-5,5]:
    # Extra rule: once a comma is met on one side, later neighbours on that
    # side are no longer taken (they keep the default text/pos).
    positive_valid_flag = True
    negtive_valid_flag = True
    for diff in genCharaDiffList(chara_cnt):
        # Defaults are used when the neighbour is out of range or masked.
        text = getDefaultText(diff)
        pos = getDefaultPos(diff)
        if (index+diff) >= 0 and (index+diff) < length:
            word = w_c_list[index+diff][0]  # word
            chara = w_c_list[index+diff][1]  # part of speech
            # Mask on comma: the flag is cleared before the assignments
            # below, so the comma itself also keeps the default values.
            if word == ',':
                if diff < 0:
                    negtive_valid_flag = False
                if diff > 0:
                    positive_valid_flag = False
            if diff > 0 and positive_valid_flag:
                text = word
                pos = chara
            if diff < 0 and negtive_valid_flag:
                text = word
                pos = chara
        relate_info.append(text)
        relate_info.append(pos)
    # Character-level information was appended here (currently disabled).
    print(pyUsage.get_cur_info(True), 'relate_info= ', relate_info)
    #t_list = generateKanjiInfo(w_c_list, sen_id, index, sub_index, chara_cnt = 5)
    #print(pyUsage.get_cur_info(True), 'relate_info= ', relate_info)
    #relate_info.extend(t_list)
    return relate_info
def get_eating(request):
    """Return every eating record as a JSON response.

    Args:
        request: The incoming request; not read by this function.

    Returns:
        A ``JsonResponse`` with ``status``, ``status_code``, ``cnt`` and
        ``eating_list`` keys.
    """
    eating_list = objs2dict_list(EatingModel.objects.all(), g_eating_fields)

    for entry in eating_list:
        print (pyUsage.get_cur_info(), 'd= ', entry)
        # Comment attachment keyed on this id is currently disabled; the
        # lookup is kept so a record without 'eating_id' still fails here.
        _eating_id = entry['eating_id']

    payload = {
        'status': 'success',
        'status_code': 0,
        'cnt': len(eating_list),
        'eating_list': eating_list,
    }
    return JsonResponse(payload)
def reset_all_audio(request):
    """Delete all audio records and reload them from ``all_audio.txt``.

    The file is read from the directory of this module. Each line is
    stripped; lines without ``http://`` or with fewer than four tab-separated
    columns are skipped. The columns are url, title, type and duration, and
    the record id is ``get_md5`` of the url.

    Args:
        request: The incoming request; not read by this function.

    Returns:
        A ``JsonResponse`` with ``status``, ``status_code``, ``cnt`` and
        ``audio_list`` keys describing the records present afterwards.
    """
    for existing in AudioModel.objects.all():
        existing.delete()

    module_dir = os.path.split(os.path.realpath(__file__))[0]
    source_file = module_dir + '/all_audio.txt'
    for raw_line in pyIO.read_file_content(source_file, 'utf-8'):
        line = raw_line.strip()
        if line.find('http://') == -1:
            continue
        columns = line.split('\t')
        if len(columns) < 4:
            continue
        url, title, audio_type, duration = columns[0], columns[1], columns[2], columns[3]
        ret = append_audio(get_md5(url), url, title, audio_type, duration)
        print (pyUsage.get_cur_info(), 'add ret= ', ret)

    audio_list = objs2dict_list(AudioModel.objects.all(), g_audio_fields)
    payload = {
        'status': 'reset audio',
        'status_code': 0,
        'cnt': len(audio_list),
        'audio_list': audio_list,
    }
    return JsonResponse(payload)
def addNumber2Punctuation2Number2Punctuation2Number(root):
    """Append the ``ElementEntry`` for three-part number text to *root*.

    Three regex nodes are added: comma-grouped numbers, ``h:m:s`` times and
    ``yyyy-m-d`` dates.

    Args:
        root: Parent element handed to ``ET.SubElement``; modified in place.

    Returns:
        None.
    """
    entry = ET.SubElement(root, 'ElementEntry')
    entry.set('Comment', '默认顺序是:先具体,后正则;先短后长')
    # Kept as a direct call in this function body: element [1] of
    # get_cur_info() is presumably the calling function's name (TODO confirm),
    # which becomes the TextType once every 'add' is removed.
    cur_info = pyUsage.get_cur_info()
    entry.set('TextType', cur_info[1].replace('add', ''))

    # (Comment, regex, ((ReadType, voice text), ...)); IDs follow tuple order.
    # Regexes are raw strings so that '\d' is not an invalid escape sequence.
    rules = (
        ('2,377,155', r'\d+,\d+,\d+',
         (('CountIgnorePunctuation', '{Number(CountIgnorePunctuation)}'),)),
        ('9:26:01', r'\d+:\d+:\d+',
         (('Time', '{Number(Count)}点{Number(Count)}分{Number(Count)}秒'),)),
        ('2007-10-31', r'\d{4,4}-\d{1,2}-\d{1,2}',
         (('Date', '{Number(Count)}年{Number(Count)}月{Number(Count)}日'),)),
    )
    for node_id, (comment, pattern, voices) in enumerate(rules, start=1):
        node = ET.SubElement(entry, 'Node')
        node.set('Comment', comment)
        node.set('ContentType', 'Regex')
        node.set('ID', '%d' % node_id)
        node.text = pattern
        for read_type, voice_text in voices:
            voice = ET.SubElement(node, 'Voice')
            voice.set('ReadType', read_type)
            voice.text = voice_text
def addCombinationText(root):
    """Append two ``ElementEntry`` elements describing combined text to *root*.

    The first entry is "date + time", the second "symbol + date + time". Each
    entry holds one ``Node`` of ContentType ``Part`` whose ``Part`` children
    list the pieces in order. IDs restart at 1 inside each entry: the node is
    1 and its parts are numbered from 2.

    Args:
        root: Parent element handed to ``ET.SubElement``; modified in place.

    Returns:
        None.
    """
    text_type_ref = 'Number2Punctuation2Number2Punctuation2Number'
    date_part = ('日期', 'TextType', text_type_ref)
    time_part = ('时间', 'TextType', text_type_ref)
    # (node Comment, ((part Comment, part ContentType, part text), ...))
    combinations = (
        ('日期+时间', (date_part, time_part)),
        ('符号+日期+时间', (('符号', 'Regex', ':'), date_part, time_part)),
    )

    for node_comment, parts in combinations:
        entry = ET.SubElement(root, 'ElementEntry')
        entry.set('Comment', '默认顺序是:先具体,后正则;先短后长')
        # Kept as a direct call in this function body: element [1] of
        # get_cur_info() is presumably the calling function's name (TODO
        # confirm), which becomes the TextType once every 'add' is removed.
        cur_info = pyUsage.get_cur_info()
        entry.set('TextType', cur_info[1].replace('add', ''))

        node = ET.SubElement(entry, 'Node')
        node.set('Comment', node_comment)
        node.set('ContentType', 'Part')
        node.set('ID', '%d' % 1)

        for part_id, (part_comment, content_type, part_text) in enumerate(parts, start=2):
            part = ET.SubElement(node, 'Part')
            part.set('Comment', part_comment)
            part.set('ContentType', content_type)
            part.set('ID', '%d' % part_id)
            part.text = part_text
import os
import os  # NOTE(review): duplicate of the import above; redundant but harmless

# First BASE_DIR: the directory containing this file. It is only used by the
# debug print below and is reassigned further down.
BASE_DIR = os.path.dirname(__file__)

import sys

# Machine-specific directories appended to the module search path before the
# py* helper modules are imported — presumably those modules live in one of
# these locations (confirm per deployment).
sys.path.extend([
    '/data/util/CoreFunction/',
    '/data/util',
    '/data/',
    '/home/daiqiang/gif_search_data/util/CoreFunction',
    '/home/daiqiang/gif_search_data/tags_index_for_gif',
    '/Users/xinmei365/gif_search_data/util/CoreFunction',
    '/Users/xinmei365/gif_search_data/tags_index_for_gif',
])

import pyUsage
print (pyUsage.get_cur_info(), 'BASE_DIR= ', BASE_DIR)
import pyIO
import pyString

# Build paths inside the project like this: os.path.join(BASE_DIR, ...)
# This overrides the BASE_DIR set above with the parent of this file's directory.
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

# Quick-start development settings - unsuitable for production
# See https://docs.djangoproject.com/en/1.8/howto/deployment/checklist/

# SECURITY WARNING: keep the secret key used in production secret!
# NOTE(review): the key is hard-coded in source here.
SECRET_KEY = 'h)44u3bdct=gm-k684$o(0j4i1e5@d=h=qmd$p(u5l)k^-ai*d'

# SECURITY WARNING: don't run with debug turned on in production!
DEBUG = True
def addNumber(root):
    """Append the ``ElementEntry`` holding the plain-number rules to *root*.

    Seven nodes are added in order: the literals ``1`` and ``2`` (ContentType
    ``Number``, each with a ``Model`` attribute), then regex nodes for 1-4
    digit numbers, 1-12, 6-8 digit numbers (landline), 11-digit numbers
    starting with 1 (mobile) and any other digit run.

    Args:
        root: Parent element handed to ``ET.SubElement``; modified in place.

    Returns:
        None.
    """
    entry = ET.SubElement(root, 'ElementEntry')
    entry.set('Comment', '默认顺序是:先具体,后正则;先短后长')
    # Kept as a direct call in this function body: element [1] of
    # get_cur_info() is presumably the calling function's name (TODO confirm),
    # which becomes the TextType once every 'add' is removed.
    cur_info = pyUsage.get_cur_info()
    entry.set('TextType', cur_info[1].replace('add', ''))

    count_voice = ('Count', '{Number(Count)}')
    bit_voice = ('NumberBit', '{Number(Bit)}')
    phone_voices = (count_voice, ('Telephone', '{Number(Telephone)}'), bit_voice)

    # (Comment, ContentType, Model or None, node text, voices); IDs follow
    # tuple order. Regexes are raw strings so that '\d' is not an invalid
    # escape sequence.
    rules = (
        ('一,幺,One,First', 'Number', 'regexNormalize_Number_1.crf_model', '1',
         (('NumberBit', '{Number(Count)}'), ('Kanji', '幺'))),
        ('二,两,Two', 'Number', 'regexNormalize_Number_2.crf_model', '2',
         (('二', '{Number(Count)}'), ('两', '两'))),
        ('年份,数量', 'Regex', None, '[1-9][0-9]{0,3}',
         (('Year', '{Number(Bit)}'), count_voice, bit_voice)),
        ('月份,年份,数量,值为1~12', 'Regex', None, '([1-9]|1[0-2])',
         (('Month', '{Number(Count)}'), ('Year', '{Number(Count)}'),
          count_voice, bit_voice)),
        ('座机号码', 'Regex', None, r'[1-9]\d{5,7}', phone_voices),
        ('手机号码', 'Regex', None, r'[1]\d{10}', phone_voices),
        ('其他', 'Regex', None, r'\d+', (bit_voice,)),
    )
    for node_id, (comment, content_type, model, text, voices) in enumerate(rules, start=1):
        node = ET.SubElement(entry, 'Node')
        node.set('Comment', comment)
        node.set('ContentType', content_type)
        # Model is set before ID so the attribute order matches the original.
        if model is not None:
            node.set('Model', model)
        node.set('ID', '%d' % node_id)
        node.text = text
        for read_type, voice_text in voices:
            voice = ET.SubElement(node, 'Voice')
            voice.set('ReadType', read_type)
            voice.text = voice_text
def getSpeechSynthesis_DataDictDir():
    """Return the dictionary data directory relative to the algorithm directory.

    Returns:
        The result of ``getSpeechSynthesis_AlgorithmDir()`` followed by
        ``'../../document/data/dict/'`` (plain concatenation, not normalised).
    """
    algorithm_dir = getSpeechSynthesis_AlgorithmDir()
    dict_dir = ''.join((algorithm_dir, '../../document/data/dict/'))
    print(pyUsage.get_cur_info(), 'dict_dir= ', dict_dir)
    return dict_dir
def readPinyinXml(path):
    """Parse a pinyin dictionary XML file into ``(lang, name, word_info_list)``.

    The first child of the root is read as the header (``DictionaryLanguage``
    and ``DictionaryName``); every later child is an entry with a ``Word``
    and pronunciation sub-elements.

    Args:
        path: File path handed to ``pyIO.read_file_content``.

    Returns:
        ``(lang, name, word_info_list)`` where each list item is
        ``(word_text, pro_list)`` and ``pro_list`` is the ``singleItem``
        result for the entry's pronunciation dicts (keys ``ProID``,
        ``PartOfSpeech``, ``PinYin``, ``BianDiao``).

    Raises:
        SystemExit: with status 0 when a pronunciation has an empty third
            field (stored as ``PinYin``).
    """
    print(pyUsage.get_cur_info(), 'path= ', path)
    c_list = pyIO.read_file_content(path)

    # Normalise a single-quoted GBK encoding declaration to encoding="GBK".
    # The original indexed an empty match list (IndexError) when the file had
    # no GBK declaration at all; such files are now parsed unchanged. Its
    # no-op `flag.lower()` and unused XMLParser instance are dropped.
    if not any(line.find('encoding="GBK"') != -1 for line in c_list):
        for quoted in ('\'gbk\'', '\'GBK\''):
            hits = [i for i, line in enumerate(c_list)
                    if line.find('encoding=' + quoted) != -1]
            if hits:
                c_list[hits[0]] = c_list[hits[0]].replace(quoted, '"GBK"')
                break

    root = ET.fromstring('\n'.join(c_list))

    # Header: only the first child of the root is consulted.
    lang = ''
    name = ''
    for header in root[:1]:
        lang = header.find('DictionaryLanguage').text
        name = header.find('DictionaryName').text

    # Entries: every child after the header.
    word_info_list = []
    for entry in root[1:]:
        word = entry.find('Word')
        pro_list = []
        # NOTE(review): 'TYPE_PURE_NUMBER', '0', '1' and '2' differ from the
        # 'Pronunciation'/'ProID'/'PartOfSpeech'/'PinYin' names used by the
        # other readPinyinXml definition, and '0'/'1'/'2' are not valid XML
        # element names. Kept verbatim; confirm the intended schema.
        for rank in entry.iter('TYPE_PURE_NUMBER'):
            pro_id = rank.find('0').text
            part_of_speech = rank.find('1').text
            pinyin = rank.find('2').text
            bian_diao = rank.find('BianDiao').text
            # Empty elements have text None; store them as ''.
            tmp_dict = {
                'ProID': pro_id or '',
                'PartOfSpeech': part_of_speech or '',
                'PinYin': pinyin or '',
                'BianDiao': bian_diao or '',
            }
            pro_list.append(tmp_dict)
            if not pinyin:
                print(word.text, tmp_dict)
                sys.exit(0)
        pro_list = singleItem(pro_list)
        word_info_list.append((word.text, pro_list))
    return lang, name, word_info_list
def saveWordPronunceList2WordPinyinXml(file_name, lang, name, word_info_dict):
    """Write a word -> pronunciations dict as a GBK pinyin dictionary XML file.

    Entries are sorted with ``xml_sort_list`` as key. ``ProID`` is renumbered
    from 1 inside each entry; ``PartOfSpeech``, ``PinYin`` and ``BianDiao``
    are taken from each pronunciation dict. The serialised text is then
    adjusted with a fixed sequence of string replacements before being
    written.

    Args:
        file_name: Output path; when it contains ``multi_han.xml`` the header
            and entry tags get no leading space.
        lang: Text of the ``DictionaryLanguage`` header element.
        name: Text of the ``DictionaryName`` header element.
        word_info_dict: Maps each word to a sequence of pronunciation dicts.

    Returns:
        None.
    """
    entries = [(word, word_info_dict[word]) for word in word_info_dict]
    print(pyUsage.get_cur_info(), 'len(tmp_list)= ', len(entries))
    entries.sort(key=xml_sort_list)

    # Build the XML tree.
    dictionary = ET.Element('Dictionary')
    header = ET.SubElement(dictionary, 'DictionaryHeader')
    ET.SubElement(header, 'DictionaryLanguage').text = lang
    ET.SubElement(header, 'DictionaryName').text = name

    for word, pronunciations in entries:
        entry = ET.SubElement(dictionary, 'DictionaryEntry')
        ET.SubElement(entry, 'Word').text = word
        for pro_id, pro_dict in enumerate(pronunciations, start=1):
            pronunciation = ET.SubElement(entry, 'Pronunciation')
            # ProID is renumbered rather than copied from pro_dict.
            ET.SubElement(pronunciation, 'ProID').text = '%d' % pro_id
            ET.SubElement(pronunciation, 'PartOfSpeech').text = pro_dict['PartOfSpeech']
            ET.SubElement(pronunciation, 'PinYin').text = pro_dict['PinYin']
            ET.SubElement(pronunciation, 'BianDiao').text = pro_dict['BianDiao']

    text = ET.tostring(dictionary, encoding="gbk", method="xml").decode('gbk')

    # Leading space added before header/entry tags, except for multi_han.xml
    # (the original comment says the XML had been rewritten with bash).
    lapse_space = '' if file_name.find('multi_han.xml') != -1 else ' '

    # Applied strictly in this order; later rules see earlier results.
    replacements = (
        ('version=\'1.0\'', 'version="1.0"'),
        ('encoding=\'gbk\'', 'encoding="GBK"'),
        ('<DictionaryHeader>', '\n<DictionaryHeader>'),
        ('</DictionaryHeader>', '</DictionaryHeader>\n'),
        ('</DictionaryEntry>', '</DictionaryEntry>\n'),
        # Expand self-closing empty tags back into open/close pairs.
        ('<PartOfSpeech />', '<PartOfSpeech></PartOfSpeech>'),
        ('<BianDiao />', '<BianDiao></BianDiao>'),
        ('<DictionaryHeader>', lapse_space + '<DictionaryHeader>'),
        ('<DictionaryEntry>', lapse_space + '<DictionaryEntry>'),
    )
    for old, new in replacements:
        text = text.replace(old, new)

    pyIO.clear_to_file(file_name)
    pyIO.add_to_file(file_name, text)
def readPinyinXml(path):
    """Parse a pinyin dictionary XML file into ``(lang, name, word_info_list)``.

    The first child of the root is read as the header (``DictionaryLanguage``
    and ``DictionaryName``); every later child is an entry with a ``Word``
    and any number of ``Pronunciation`` descendants.

    Args:
        path: File path handed to ``pyIO.read_file_content``.

    Returns:
        ``(lang, name, word_info_list)`` where each list item is
        ``(word_text, pro_list)`` and ``pro_list`` is the ``singleItem``
        result for the entry's pronunciation dicts (keys ``ProID``,
        ``PartOfSpeech``, ``PinYin``, ``BianDiao``).

    Raises:
        SystemExit: with status 0 when a pronunciation has an empty
            ``PinYin``.
    """
    print(pyUsage.get_cur_info(), 'path= ', path)
    c_list = pyIO.read_file_content(path)

    # Normalise a single-quoted GBK encoding declaration to encoding="GBK".
    # The original indexed an empty match list (IndexError) when the file had
    # no GBK declaration at all; such files are now parsed unchanged. Its
    # no-op `flag.lower()` and unused XMLParser instance are dropped.
    if not any(line.find('encoding="GBK"') != -1 for line in c_list):
        for quoted in ('\'gbk\'', '\'GBK\''):
            hits = [i for i, line in enumerate(c_list)
                    if line.find('encoding=' + quoted) != -1]
            if hits:
                c_list[hits[0]] = c_list[hits[0]].replace(quoted, '"GBK"')
                break

    root = ET.fromstring('\n'.join(c_list))

    # Header: only the first child of the root is consulted.
    lang = ''
    name = ''
    for header in root[:1]:
        lang = header.find('DictionaryLanguage').text
        name = header.find('DictionaryName').text

    # Entries: every child after the header.
    word_info_list = []
    for entry in root[1:]:
        word = entry.find('Word')
        pro_list = []
        for rank in entry.iter('Pronunciation'):
            pro_id = rank.find('ProID').text
            part_of_speech = rank.find('PartOfSpeech').text
            pinyin = rank.find('PinYin').text
            bian_diao = rank.find('BianDiao').text
            # Empty elements have text None; store them as ''.
            tmp_dict = {
                'ProID': pro_id or '',
                'PartOfSpeech': part_of_speech or '',
                'PinYin': pinyin or '',
                'BianDiao': bian_diao or '',
            }
            pro_list.append(tmp_dict)
            # A pronunciation without pinyin is treated as fatal.
            if not pinyin:
                print(word.text, tmp_dict)
                sys.exit(0)
        pro_list = singleItem(pro_list)
        word_info_list.append((word.text, pro_list))
    return lang, name, word_info_list
def getSpeechSynthesis_ToolsSegDir():
    """Return the segmentation tools directory under the algorithm directory.

    Returns:
        The result of ``getSpeechSynthesis_AlgorithmDir()`` followed by
        ``'/tools/seg/'`` (plain concatenation, not normalised).
    """
    algorithm_dir = getSpeechSynthesis_AlgorithmDir()
    seg_dir = ''.join((algorithm_dir, '/tools/seg/'))
    print(pyUsage.get_cur_info(), 'seg_dir= ', seg_dir)
    return seg_dir