Esempio n. 1
0
def getSpeechSynthesis_AlgorithmDir():
    """Locate the project's ``/algorithm/`` directory and return its path.

    ``sys.argv[0]``, ``__file__`` and ``sys.path[0]`` are searched, in that
    order, for the ``/algorithm/tools/`` marker. When no candidate yields an
    existing path, a hard-coded developer checkout is used instead. The
    chosen path is then cut at ``/algorithm/``.

    Returns:
        The path prefix ending in ``/algorithm/`` as computed by
        ``findDirByString``.

    Exits the process (status 0) after printing a diagnostic when the
    resulting directory does not exist.
    """
    # A preceding search for '/trunk/algorithm/tools/' used to run here, but
    # its result was unconditionally overwritten by this one, so it is gone.
    marker = '/algorithm/tools/'
    path = ''
    for candidate in (sys.argv[0], __file__, sys.path[0]):
        path = findDirByString(candidate, marker)
        if path:
            break

    # An empty path never exists, so this also covers "no candidate matched".
    if not os.path.exists(path):
        path = '/Users/daiqiang/speech_synthesis_svr_proj/trunk/algorithm/'

    algorithm_dir = findDirByString(path, '/algorithm/')
    if not os.path.exists(algorithm_dir):
        print(pyUsage.get_cur_info(), 'dir not exists! (算法目录不存在!)')
        sys.exit(0)
    return algorithm_dir
Esempio n. 2
0
def append_eating(eating_id, day, pics, name, kind):
    """Save a new ``EatingModel`` row and return the packed JSON response.

    The arguments are passed to ``EatingModel`` under the same keyword names.
    The result of ``save()`` is handed to ``pack_json_resp`` together with
    the error text ``'append_eating err'`` and the code ``-1``.
    """
    # Must stay the first statement so locals() only holds the parameters.
    print (pyUsage.get_cur_info(), locals())
    record = EatingModel(eating_id=eating_id, day=day, pics=pics,
                         name=name, kind=kind)
    saved = record.save()
    print (pyUsage.get_cur_info(), 'ret= ', saved)
    return pack_json_resp(saved, 'append_eating err', -1)
Esempio n. 3
0
def objs2dict_list(objs, fields):
    """Convert objects into a list of ``{field: value}`` dicts.

    Args:
        objs: iterable of objects to read attributes from.
        fields: attribute names to copy from every object.

    Returns:
        One dict per object. ``datetime.datetime`` values are replaced by
        their ``'%Y-%m-%d'`` string; all other values are kept unchanged
        (their type is printed for debugging).
    """
    rows = []
    for item in objs:
        print (pyUsage.get_cur_info(), fields)
        row = {}
        for field in fields:
            value = getattr(item, field)
            if isinstance(value, datetime.datetime):
                row[field] = value.strftime('%Y-%m-%d')
            else:
                row[field] = value
                print (pyUsage.get_cur_info(), type(value) )
        rows.append(row)
    return rows
Esempio n. 4
0
def addEnglish2Number(root):
    """Append an ``ElementEntry`` with two regex ``Node`` rules to *root*.

    Both nodes match one or more ASCII letters followed by digits and carry
    a single ``Voice`` child with ``ReadType`` ``Bit``. Node IDs count up
    from 1.

    Args:
        root: parent element handed to ``ET.SubElement``.
    """
    entry = ET.SubElement(root, 'ElementEntry')
    entry.set('Comment', '默认顺序是:先具体,后正则;先短后长')
    # NOTE(review): element [1] of get_cur_info() is presumably the calling
    # function's name, so keep this call directly in this function -- confirm.
    cur_info = pyUsage.get_cur_info()
    entry.set('TextType', cur_info[1].replace('add', ''))

    # (Comment, regex, [(ReadType, voice text), ...]).
    # Raw strings: '\d' in a normal literal is an invalid escape sequence.
    node_specs = [
        ('JX06323', r'[a-zA-Z]+\d+',
         [('Bit', '{English(Bit)}{Number(Bit)}')]),
        ('动车,快车,城际列车D632', r'[a-zA-Z]+\d+',
         [('Bit', '{English(Bit)}{Number(Bit)}')]),
    ]
    for node_id, (comment, pattern, voices) in enumerate(node_specs, start=1):
        node = ET.SubElement(entry, 'Node')
        node.set('Comment', comment)
        node.set('ContentType', 'Regex')
        node.set('ID', '%d' % node_id)
        node.text = pattern
        for read_type, voice_text in voices:
            voice = ET.SubElement(node, 'Voice')
            voice.set('ReadType', read_type)
            voice.text = voice_text
Esempio n. 5
0
def addNumber2Kanji(root):
    """Append an ``ElementEntry`` with two regex ``Node`` rules to *root*.

    The first node matches digits followed by ``年`` and offers two read
    types plus one attribute-less ``Voice``; the second node matches digits
    followed by anything. Node IDs count up from 1.

    Args:
        root: parent element handed to ``ET.SubElement``.
    """
    entry = ET.SubElement(root, 'ElementEntry')
    entry.set('Comment', '默认顺序是:先具体,后正则;先短后长')
    # NOTE(review): element [1] of get_cur_info() is presumably the calling
    # function's name, so keep this call directly in this function -- confirm.
    cur_info = pyUsage.get_cur_info()
    entry.set('TextType', cur_info[1].replace('add', ''))

    # (Comment, regex, voices); a voice is (ReadType, text) or None for an
    # empty <Voice> element.
    # Raw strings: '\d' in a normal literal is an invalid escape sequence.
    # NOTE(review): the empty trailing Voice of the first node is emitted
    # exactly as before; it may be a copy/paste leftover -- confirm.
    node_specs = [
        ('数量', r'\d+年',
         [('NumberBit+Kanji', '{Number(Count)}年'),
          ('Year', '{Number(Bit)}年'),
          None]),
        ('数量', r'\d+.*',
         [('Count+Kanji', '{Number(Count)}{Kanji}')]),
    ]
    for node_id, (comment, pattern, voices) in enumerate(node_specs, start=1):
        node = ET.SubElement(entry, 'Node')
        node.set('Comment', comment)
        node.set('ContentType', 'Regex')
        node.set('ID', '%d' % node_id)
        node.text = pattern
        for voice_spec in voices:
            voice = ET.SubElement(node, 'Voice')
            if voice_spec is not None:
                voice.set('ReadType', voice_spec[0])
                voice.text = voice_spec[1]
Esempio n. 6
0
def addNumber2Punctuation(root):
    """Append an ``ElementEntry`` with two regex ``Node`` rules to *root*.

    The first node matches digits followed by ``%`` (plus one attribute-less
    ``Voice``); the second node matches digits followed by anything. Node
    IDs count up from 1.

    Args:
        root: parent element handed to ``ET.SubElement``.
    """
    entry = ET.SubElement(root, 'ElementEntry')
    entry.set('Comment', '默认顺序是:先具体,后正则;先短后长')
    # NOTE(review): element [1] of get_cur_info() is presumably the calling
    # function's name, so keep this call directly in this function -- confirm.
    cur_info = pyUsage.get_cur_info()
    entry.set('TextType', cur_info[1].replace('add', ''))

    # (Comment, regex, voices); a voice is (ReadType, text) or None for an
    # empty <Voice> element.
    # Raw strings: '\d' in a normal literal is an invalid escape sequence.
    # NOTE(review): the empty trailing Voice of the first node is emitted
    # exactly as before; it may be a copy/paste leftover -- confirm.
    node_specs = [
        ('百分数', r'\d+%',
         [('ChinesePrecent', '百分之{Number(Count)}'),
          None]),
        ('其他', r'\d+.*',
         [('NumberBit', '{Number(BitIgnorePunctuation)}')]),
    ]
    for node_id, (comment, pattern, voices) in enumerate(node_specs, start=1):
        node = ET.SubElement(entry, 'Node')
        node.set('Comment', comment)
        node.set('ContentType', 'Regex')
        node.set('ID', '%d' % node_id)
        node.text = pattern
        for voice_spec in voices:
            voice = ET.SubElement(node, 'Voice')
            if voice_spec is not None:
                voice.set('ReadType', voice_spec[0])
                voice.text = voice_spec[1]
Esempio n. 7
0
def addPunctuation(root):
    """Append an ``ElementEntry`` with two literal-string ``Node`` rules.

    The nodes cover the smiley ``:)`` and the ``<`` sign; each gets its
    ``Voice`` children in the listed order. Node IDs count up from 1.

    Args:
        root: parent element handed to ``ET.SubElement``.
    """
    entry = ET.SubElement(root, 'ElementEntry')
    entry.set('Comment', '默认顺序是:先具体,后正则;先短后长')
    # NOTE(review): element [1] of get_cur_info() is presumably the calling
    # function's name, so keep this call directly in this function -- confirm.
    cur_info = pyUsage.get_cur_info()
    entry.set('TextType', cur_info[1].replace('add', ''))

    # (Comment, literal text, [(ReadType, voice text), ...])
    node_specs = [
        ('笑脸符号', ':)', [('Kanji', '笑脸')]),
        ('<', '<', [('Kanji', '小于'), ('Puctuation', '')]),
    ]
    for node_id, (comment, literal, voices) in enumerate(node_specs, start=1):
        node = ET.SubElement(entry, 'Node')
        node.set('Comment', comment)
        node.set('ContentType', 'StringValue')
        node.set('ID', '%d' % node_id)
        node.text = literal
        for read_type, voice_text in voices:
            voice = ET.SubElement(node, 'Voice')
            voice.set('ReadType', read_type)
            voice.text = voice_text
Esempio n. 8
0
def checkMultiKey(word_info_list):
    """Merge the pronunciation lists of words that occur more than once.

    Args:
        word_info_list: iterable of ``(word, pro_list)`` pairs.

    Returns:
        ``(mul_word_list, word_info_dict)``. ``mul_word_list`` gets one entry
        for every repeated occurrence of a word. ``word_info_dict`` maps each
        word to the pronunciations of its first occurrence, followed by any
        pronunciations from later occurrences that were not present yet.

    For every repeat, the previous, the new and the merged pronunciations
    are printed for inspection.
    """
    mul_word_list = []
    word_info_dict = {}
    for word, pro_list in word_info_list:
        if word not in word_info_dict:
            # Copy so that later merges never mutate the caller's list.
            word_info_dict[word] = list(pro_list)
            continue

        merged = word_info_dict[word]
        # Snapshot before merging; a plain reference would alias the merged
        # list and the "old" printout below would show the new content.
        old_list = list(merged)
        mul_word_list.append(word)

        # Append only pronunciations that are not known yet.
        for pro in pro_list:
            if pro not in merged:
                merged.append(pro)

        print(pyUsage.get_cur_info(), word, 'bad duplicate:')
        # Previous pronunciations
        for e in old_list:
            print('====', e)
        print('')
        # Pronunciations of the repeated occurrence
        for e in pro_list:
            print('----', e)
        print('')
        # Merged result
        for e in merged:
            print('>>>>', e)
        print('\n')
    return mul_word_list, word_info_dict
Esempio n. 9
0
def extract_word_charactor(element):
    """Split a ``word/tag`` token at its last ``/``.

    Args:
        element: string expected to contain at least one ``/``.

    Returns:
        ``[word, tag]`` where ``word`` is everything before the last ``/``
        and ``tag`` everything after it.

    Exits the process (status 0) after printing a diagnostic when *element*
    contains no ``/``.
    """
    pos = element.rfind('/')
    if pos == -1:
        # The fallback code that used to follow this exit could never run
        # and has been removed.
        print(pyUsage.get_cur_info(), 'no split for chara error! element=', element)
        sys.exit(0)

    return [element[:pos], element[pos + 1:]]
Esempio n. 10
0
def append_sns(sns_id, url, title, types, duration):
    """Save a new ``SnsModel`` row and return the packed JSON response.

    The arguments are passed to ``SnsModel`` under the same keyword names.
    The result of ``save()`` is handed to ``pack_json_resp`` together with
    the error text ``'append_sns err'`` and the code ``-1``.
    """
    record = SnsModel(sns_id=sns_id, url=url, title=title,
                      types=types, duration=duration)
    saved = record.save()
    print (pyUsage.get_cur_info(), 'ret= ', saved)
    return pack_json_resp(saved, 'append_sns err', -1)
Esempio n. 11
0
def readPinyinXmlDict(path, ignore_duplicate = False):
    """Read a pinyin XML file and index its entries by word.

    Args:
        path: file passed on to ``readPinyinXml``.
        ignore_duplicate: when true, a repeated word silently replaces the
            earlier entry; otherwise the first repeat prints an error and
            exits the process (status 0).

    Returns:
        ``(lang, name, res_dict)`` where ``res_dict`` maps element 0 of each
        entry to its element 1.
    """
    lang, name, word_info_list = readPinyinXml(path)

    res_dict = {}
    for entry in word_info_list:
        key = entry[0]
        if not ignore_duplicate and key in res_dict:
            print('error! duplicate key: ', entry)
            sys.exit(0)
        res_dict[key] = entry[1]

    print(pyUsage.get_cur_info(), 'len(res_dict)=', len(res_dict), path)
    return lang, name, res_dict
Esempio n. 12
0
def getGBKText(word):
    """Return the upper-case hex form of *word*'s two-byte gb18030 codes.

    Surrounding whitespace is stripped first. Every remaining character must
    encode to exactly two bytes in gb18030; otherwise a diagnostic is
    printed and the process exits (status 0).
    """
    hex_parts = []
    for ch in word.strip():
        encoded = ch.encode('gb18030')
        if len(encoded) != 2:
            print(pyUsage.get_cur_info(), 'critical error', '不是GBK编码')
            sys.exit(0)
        first, second = encoded[0], encoded[1]
        hex_parts.append('%X%X' % (first, second))
    return ''.join(hex_parts)
Esempio n. 13
0
def append_audio(audio_id, url, title, type, duration):
    """Save a new ``AudioModel`` row and return the packed JSON response.

    The arguments are passed to ``AudioModel`` under the same keyword names.
    The result of ``save()`` is handed to ``pack_json_resp`` together with
    the error text ``'append_audio err'`` and the code ``-1``.
    """
    # ``type`` shadows the builtin but is part of the public signature.
    record = AudioModel(audio_id=audio_id, url=url, title=title,
                        type=type, duration=duration)
    saved = record.save()
    print (pyUsage.get_cur_info(), 'ret= ', saved)
    return pack_json_resp(saved, 'append_audio err', -1)
Esempio n. 14
0
def getPinyinCntDicts(path):
    """Build per-word pinyin lookups from a pinyin XML dictionary.

    Args:
        path: dictionary file; the process exits (status 0) with a
            diagnostic when it does not exist.

    Returns:
        ``(word_py_dict, word_py_cnt_dict, pinyin_cnt_distribution_dict)``:
        the ``getPinyin`` result per word, its length per word, and a dict
        filled through ``pyString.insert_or_add_dict`` with one call per
        word, keyed by that length.
    """
    if not os.path.exists(path):
        print(pyUsage.get_cur_info(), '路径不存在,path= ', path)
        sys.exit(0)

    _lang, _name, word_info_dict = pyXml.readPinyinXmlDict(path)

    word_py_dict = {}
    word_py_cnt_dict = {}
    pinyin_cnt_distribution_dict = {}

    for word, pro_info in word_info_dict.items():
        # Compute the pinyin once per word instead of three times.
        pinyin = getPinyin(pro_info)
        pinyin_cnt = len(pinyin)

        word_py_dict[word] = pinyin
        word_py_cnt_dict[word] = pinyin_cnt
        pyString.insert_or_add_dict(pinyin_cnt_distribution_dict, pinyin_cnt, 1)

    # How many pinyin each word has
    print(pyUsage.get_cur_info(), 'pinyin_cnt_distribution_dict= ', pinyin_cnt_distribution_dict, 'total_word_cnt= ', len(word_info_dict))
    return word_py_dict, word_py_cnt_dict, pinyin_cnt_distribution_dict
Esempio n. 15
0
def add_eating(request):
    """Django view: create an eating record (POST) or show the entry form.

    POST: reads ``eating_id``, ``day``, ``pics``, ``name`` and ``kind`` from
    ``request.POST``. An empty ``day`` or a ``day`` that already has a record
    yields an error response from ``pack_json_resp``; otherwise the record is
    stored through ``append_eating`` (``eating_id`` defaults to ``day``).

    Any other method: renders ``up_pic7.html`` with a pre-filled form and
    the list of all stored records.
    """
    # Constructed unconditionally, exactly as before, although every path
    # below either returns early or replaces it.
    form = EatingForm()

    if request.method == 'POST':
        post = request.POST
        print (pyUsage.get_cur_info(), 'POST= ', post)
        print (pyUsage.get_cur_info(), 'url= ', post.get('pics', ''))

        eating_id = post.get('eating_id', '')
        day = post.get('day', '')
        pics = post.get('pics', '')
        name = post.get('name', '')
        kind = post.get('kind', '')

        if not day:
            return pack_json_resp(False, 'add_eating error! day is empty', -1)

        same_day = EatingModel.objects.filter(day=day)
        if len(same_day) > 0:
            return pack_json_resp(False, 'add_eating error! day already exists!', -1)

        # Fall back to the day when no explicit id was posted.
        return append_eating(eating_id or day, day, pics, name, kind)

    defaults = {
        'eating_id': '2',
        'day': '20160515',
        'pics': '',
        'name': '西红柿鸡蛋',
        'kind': "3",
    }
    form = EatingForm(initial=defaults)

    eating_list = objs2dict_list(EatingModel.objects.all(), g_eating_fields)
    context = {'form': form, 'eating_list': eating_list}
    return render_to_response('up_pic7.html', context)
Esempio n. 16
0
def findDirByString(path, s):
    """Return the absolute form of *path* cut right after the marker *s*.

    Args:
        path: file or directory path; stripped and made absolute first. An
            existing directory gets a trailing ``os.sep`` so that markers
            ending in a separator can match it.
        s: marker substring to look for.

    Returns:
        The absolute path up to and including the first occurrence of *s*,
        or ``''`` when *s* does not occur.
    """
    full_path = os.path.abspath(path.strip())
    if os.path.isdir(full_path) and not full_path.endswith(os.sep):
        full_path += os.sep

    idx = full_path.find(s)
    if idx < 0:
        return ''

    print(pyUsage.get_cur_info(), 'total path= ', full_path)
    return full_path[:idx + len(s)]
Esempio n. 17
0
def addPunctuation2English2Number(root):
    """Append an ``ElementEntry`` holding one regex ``Node`` to *root*.

    The node matches ``&nbsp`` followed by digits and has a single ``Voice``
    child with ``ReadType`` ``Time``.

    Args:
        root: parent element handed to ``ET.SubElement``.
    """
    entry = ET.SubElement(root, 'ElementEntry')
    entry.set('Comment', '默认顺序是:先具体,后正则;先短后长')
    # NOTE(review): element [1] of get_cur_info() is presumably the calling
    # function's name, so keep this call directly in this function -- confirm.
    cur_info = pyUsage.get_cur_info()
    entry.set('TextType', cur_info[1].replace('add', ''))

    node = ET.SubElement(entry, 'Node')
    node.set('Comment', '&nbsp1938/nx')
    node.set('ContentType', 'Regex')
    node.set('ID', '%d' % 1)
    # Raw string: '\d' in a normal literal is an invalid escape sequence.
    node.text = r'&nbsp\d+'

    voice = ET.SubElement(node, 'Voice')
    voice.set('ReadType', 'Time')
    voice.text = '{Number(Count)}'
Esempio n. 18
0
def addNumber2English(root):
    """Append an ``ElementEntry`` holding one regex ``Node`` to *root*.

    The node matches one digit followed by one ASCII letter and has a single
    ``Voice`` child with ``ReadType`` ``NumberBit``.

    Args:
        root: parent element handed to ``ET.SubElement``.
    """
    entry = ET.SubElement(root, 'ElementEntry')
    entry.set('Comment', '默认顺序是:先具体,后正则;先短后长')
    # NOTE(review): element [1] of get_cur_info() is presumably the calling
    # function's name, so keep this call directly in this function -- confirm.
    cur_info = pyUsage.get_cur_info()
    entry.set('TextType', cur_info[1].replace('add', ''))

    node = ET.SubElement(entry, 'Node')
    node.set('Comment', '3a')
    node.set('ContentType', 'Regex')
    node.set('ID', '%d' % 1)
    # Raw string: '\d' in a normal literal is an invalid escape sequence.
    node.text = r'\d[a-zA-Z]'

    voice = ET.SubElement(node, 'Voice')
    voice.set('ReadType', 'NumberBit')
    voice.text = '{Number(Bit)}{English(Bit)}'
Esempio n. 19
0
def addComplicatedText(root):
    """Append an ``ElementEntry`` holding one regex ``Node`` to *root*.

    The node matches any text containing ``@`` and has a single ``Voice``
    child with ``ReadType`` ``EMail``.

    Args:
        root: parent element handed to ``ET.SubElement``.
    """
    entry = ET.SubElement(root, 'ElementEntry')
    entry.set('Comment', '默认顺序是:先具体,后正则;先短后长')
    # NOTE(review): element [1] of get_cur_info() is presumably the calling
    # function's name, so keep this call directly in this function -- confirm.
    cur_info = pyUsage.get_cur_info()
    entry.set('TextType', cur_info[1].replace('add', ''))

    node_id = 1
    node = ET.SubElement(entry, 'Node')
    node.set('Comment', '电子邮件')
    node.set('ContentType', 'Regex')
    node.set('ID', '%d' % node_id)
    node.text = '.*@.*'

    voice = ET.SubElement(node, 'Voice')
    voice.set('ReadType', 'EMail')
    voice.text = '{ComplicatedText(Email)}'
Esempio n. 20
0
def addKanji(root):
    """Append an ``ElementEntry`` holding one regex ``Node`` to *root*.

    The node matches text whose first character is not an ASCII digit or
    letter and has a single ``Voice`` child with ``ReadType`` ``Kanji``.

    Args:
        root: parent element handed to ``ET.SubElement``.
    """
    entry = ET.SubElement(root, 'ElementEntry')
    entry.set('Comment', '默认顺序是:先具体,后正则;先短后长')
    # NOTE(review): element [1] of get_cur_info() is presumably the calling
    # function's name, so keep this call directly in this function -- confirm.
    cur_info = pyUsage.get_cur_info()
    entry.set('TextType', cur_info[1].replace('add', ''))

    node_id = 1
    node = ET.SubElement(entry, 'Node')
    node.set('Comment', 'mg')
    node.set('ContentType', 'Regex')
    node.set('ID', '%d' % node_id)
    node.text = '[^0-9a-zA-Z].*'

    voice = ET.SubElement(node, 'Voice')
    voice.set('ReadType', 'Kanji')
    voice.text = '{Kanji}'
Esempio n. 21
0
def addPunctuation2Number2Punctuation2Number(root):
    """Append an ``ElementEntry`` holding one regex ``Node`` to *root*.

    The node matches ``-`` followed by ``digits:digits`` and has a single
    ``Voice`` child with ``ReadType`` ``Time``.

    Args:
        root: parent element handed to ``ET.SubElement``.
    """
    entry = ET.SubElement(root, 'ElementEntry')
    entry.set('Comment', '默认顺序是:先具体,后正则;先短后长')
    # NOTE(review): element [1] of get_cur_info() is presumably the calling
    # function's name, so keep this call directly in this function -- confirm.
    cur_info = pyUsage.get_cur_info()
    entry.set('TextType', cur_info[1].replace('add', ''))

    node = ET.SubElement(entry, 'Node')
    node.set('Comment', '"9:25 分 -9:30 分 之间')
    node.set('ContentType', 'Regex')
    node.set('ID', '%d' % 1)
    # Raw string: '\d' in a normal literal is an invalid escape sequence.
    node.text = r'-\d+:\d+'

    voice = ET.SubElement(node, 'Voice')
    voice.set('ReadType', 'Time')
    voice.text = '至{Time}'
Esempio n. 22
0
def addPunctuation2Number2Punctuation(root):
    """Append an ``ElementEntry`` holding one regex ``Node`` to *root*.

    The node matches digits enclosed in parentheses and has a single
    ``Voice`` child with ``ReadType`` ``NumberBit``.

    Args:
        root: parent element handed to ``ET.SubElement``.
    """
    entry = ET.SubElement(root, 'ElementEntry')
    entry.set('Comment', '默认顺序是:先具体,后正则;先短后长')
    # NOTE(review): element [1] of get_cur_info() is presumably the calling
    # function's name, so keep this call directly in this function -- confirm.
    cur_info = pyUsage.get_cur_info()
    entry.set('TextType', cur_info[1].replace('add', ''))

    node = ET.SubElement(entry, 'Node')
    node.set('Comment', '(000936)股票证券代码')
    node.set('ContentType', 'Regex')
    node.set('ID', '%d' % 1)
    # Raw string: '\(' and '\d' are invalid escapes in a normal literal.
    node.text = r'\(\d+\)'

    voice = ET.SubElement(node, 'Voice')
    voice.set('ReadType', 'NumberBit')
    voice.text = '{Number(Count)}'
Esempio n. 23
0
def addNumber2Punctuation2Number(root):
    """Append an ``ElementEntry`` with three regex ``Node`` rules to *root*.

    The nodes match ``digits:digits``, ``digits-digits`` and digits
    separated by any single character, each with the ``Voice`` children
    listed below. Node IDs count up from 1.

    Args:
        root: parent element handed to ``ET.SubElement``.
    """
    entry = ET.SubElement(root, 'ElementEntry')
    entry.set('Comment', '默认顺序是:先具体,后正则;先短后长')
    # NOTE(review): element [1] of get_cur_info() is presumably the calling
    # function's name, so keep this call directly in this function -- confirm.
    cur_info = pyUsage.get_cur_info()
    entry.set('TextType', cur_info[1].replace('add', ''))

    # (Comment, regex, [(ReadType, voice text), ...]).
    # Raw strings: '\d' in a normal literal is an invalid escape sequence.
    # NOTE(review): the '.' in the last pattern is unescaped and therefore
    # matches any character, not only a decimal point -- confirm intent.
    node_specs = [
        ('时间,比分', r'\d+:\d+',
         [('Time', '{Number(Count)}点{Number(Count)}分'),
          ('Score2Score', '{Number(Count)}比{Number(Count)}')]),
        ('时间,比分', r'\d+-\d+',
         [('Time', '{Number(Count)}点{Number(Count)}分'),
          ('Count', '{Number(Count)}比{Number(Count)}')]),
        ('时间,比分', r'\d+.\d+',
         [('DecimalChinese', '{Number(Count)}点{Number(Count)}')]),
    ]
    for node_id, (comment, pattern, voices) in enumerate(node_specs, start=1):
        node = ET.SubElement(entry, 'Node')
        node.set('Comment', comment)
        node.set('ContentType', 'Regex')
        node.set('ID', '%d' % node_id)
        node.text = pattern
        for read_type, voice_text in voices:
            voice = ET.SubElement(node, 'Voice')
            voice.set('ReadType', read_type)
            voice.text = voice_text
Esempio n. 24
0
def generateCharacteristicInfo(w_c_list, sen_id, index, sub_index, chara_cnt = 5):
    """Build the feature row for the word at *index* of a tagged sentence.

    Args:
        w_c_list: sequence of ``(word, tag)`` pairs.
        sen_id: sentence id, formatted with ``%d``.
        index: position of the current word in *w_c_list*.
        sub_index: formatted with ``%d`` and stored after the word length.
        chara_cnt: passed to ``genCharaDiffList`` to obtain the offsets.

    Returns:
        A list starting with sentence id, index, word, tag, word length and
        sub index, followed by a ``(text, pos)`` pair per offset. The pair
        holds the neighbour's word and tag when the neighbour exists and no
        ``','`` has been seen on that side so far (in offset order);
        otherwise it holds ``getDefaultText`` / ``getDefaultPos``.
    """
    length = len(w_c_list)
    w, c = w_c_list[index]
    relate_info = [
        '%d' % sen_id,       # sentence id
        '%s' % index,
        w,
        c,
        '%d' % len(w),
        '%d' % sub_index,
    ]

    # Once a comma is met on one side, that side stops contributing words.
    right_open = True
    left_open = True

    for offset in genCharaDiffList(chara_cnt):
        text = getDefaultText(offset)
        pos = getDefaultPos(offset)
        neighbour = index + offset
        if 0 <= neighbour < length:
            word = w_c_list[neighbour][0]    # word
            chara = w_c_list[neighbour][1]   # part of speech

            # The comma itself already closes its side.
            if word == ',':
                if offset < 0:
                    left_open = False
                elif offset > 0:
                    right_open = False

            if (offset > 0 and right_open) or (offset < 0 and left_open):
                text, pos = word, chara

        relate_info.append(text)
        relate_info.append(pos)

    print(pyUsage.get_cur_info(True), 'relate_info= ', relate_info)
    return relate_info
Esempio n. 25
0
def get_eating(request):
    """Django view: return every eating record as a JSON response.

    The payload holds ``status``, ``status_code``, the record count and the
    records themselves as dicts built by ``objs2dict_list``. Attaching
    comments to the records is currently disabled.
    """
    eating_list = objs2dict_list(EatingModel.objects.all(), g_eating_fields)

    for d in eating_list:
        print (pyUsage.get_cur_info(), 'd= ', d)
        # Only read for now; kept so a missing key still fails here.
        vid = d['eating_id']

    payload = {
        'status': 'success',
        'status_code': 0,
        'cnt': len(eating_list),
        'eating_list': eating_list,
    }
    return JsonResponse(payload)
Esempio n. 26
0
def reset_all_audio(request):
    """Django view: wipe all audio rows and reload them from all_audio.txt.

    Every ``AudioModel`` row is deleted. Then ``all_audio.txt`` next to this
    file is read; each line that contains ``http://`` and has at least four
    tab-separated columns (url, title, type, duration) is stored through
    ``append_audio`` with the md5 of the url as id. The response lists the
    rows present afterwards.
    """
    for stale in AudioModel.objects.all():
        stale.delete()

    listing = os.path.dirname(os.path.realpath(__file__)) + '/all_audio.txt'
    for raw_line in pyIO.read_file_content(listing, 'utf-8'):
        line = raw_line.strip()
        if 'http://' not in line:
            continue

        columns = line.split('\t')
        if len(columns) < 4:
            continue

        url, title, kind, duration = columns[:4]
        ret = append_audio(get_md5(url), url, title, kind, duration)
        print (pyUsage.get_cur_info(), 'add ret= ', ret)

    audio_list = objs2dict_list(AudioModel.objects.all(), g_audio_fields)
    payload = {
        'status': 'reset audio',
        'status_code': 0,
        'cnt': len(audio_list),
        'audio_list': audio_list,
    }
    return JsonResponse(payload)
Esempio n. 27
0
def addNumber2Punctuation2Number2Punctuation2Number(root):
    """Append an ``ElementEntry`` with three regex ``Node`` rules to *root*.

    The nodes match comma-grouped numbers, ``h:m:s`` times and
    ``yyyy-m-d`` dates, each with one ``Voice`` child. Node IDs count up
    from 1.

    Args:
        root: parent element handed to ``ET.SubElement``.
    """
    entry = ET.SubElement(root, 'ElementEntry')
    entry.set('Comment', '默认顺序是:先具体,后正则;先短后长')
    # NOTE(review): element [1] of get_cur_info() is presumably the calling
    # function's name, so keep this call directly in this function -- confirm.
    cur_info = pyUsage.get_cur_info()
    entry.set('TextType', cur_info[1].replace('add', ''))

    # (Comment, regex, ReadType, voice text).
    # Raw strings: '\d' in a normal literal is an invalid escape sequence.
    node_specs = [
        ('2,377,155', r'\d+,\d+,\d+',
         'CountIgnorePunctuation', '{Number(CountIgnorePunctuation)}'),
        ('9:26:01', r'\d+:\d+:\d+',
         'Time', '{Number(Count)}点{Number(Count)}分{Number(Count)}秒'),
        ('2007-10-31', r'\d{4,4}-\d{1,2}-\d{1,2}',
         'Date', '{Number(Count)}年{Number(Count)}月{Number(Count)}日'),
    ]
    for node_id, spec in enumerate(node_specs, start=1):
        comment, pattern, read_type, voice_text = spec
        node = ET.SubElement(entry, 'Node')
        node.set('Comment', comment)
        node.set('ContentType', 'Regex')
        node.set('ID', '%d' % node_id)
        node.text = pattern

        voice = ET.SubElement(node, 'Voice')
        voice.set('ReadType', read_type)
        voice.text = voice_text
Esempio n. 28
0
def addCombinationText(root):
    """Append two ``ElementEntry`` rules made of ``Part`` sequences to *root*.

    The first entry combines a date part and a time part; the second one
    puts a ``:`` regex part in front of the same two parts. Inside each
    entry the ``Node`` gets ID 1 and its parts are numbered from 2.

    Args:
        root: parent element handed to ``ET.SubElement``.
    """
    number_chain = 'Number2Punctuation2Number2Punctuation2Number'
    # (Comment, ContentType, text) of the reusable parts
    date_part = ('日期', 'TextType', number_chain)
    time_part = ('时间', 'TextType', number_chain)

    # (Node comment, parts in order)
    entry_specs = [
        ('日期+时间', [date_part, time_part]),
        ('符号+日期+时间', [('符号', 'Regex', ':'), date_part, time_part]),
    ]

    for node_comment, parts in entry_specs:
        entry = ET.SubElement(root, 'ElementEntry')
        entry.set('Comment', '默认顺序是:先具体,后正则;先短后长')
        # NOTE(review): element [1] of get_cur_info() is presumably the
        # calling function's name, so keep this call directly in this
        # function -- confirm.
        cur_info = pyUsage.get_cur_info()
        entry.set('TextType', cur_info[1].replace('add', ''))

        node = ET.SubElement(entry, 'Node')
        node.set('Comment', node_comment)
        node.set('ContentType', 'Part')
        node.set('ID', '%d' % 1)

        for part_id, (comment, content_type, text) in enumerate(parts, start=2):
            part = ET.SubElement(node, 'Part')
            part.set('Comment', comment)
            part.set('ContentType', content_type)
            part.set('ID', '%d' % part_id)
            part.text = text
Esempio n. 29
0
# Module-level settings: extends sys.path with hard-coded directories, imports
# the project helper modules and defines BASE_DIR, SECRET_KEY and DEBUG.
import os
import os
# First BASE_DIR value: the directory part of this file's path. It is only
# used by the print below and is overwritten further down.
BASE_DIR = os.path.dirname(__file__)

import sys 
# Machine-specific search paths; presumably where pyUsage, pyIO and pyString
# live -- TODO confirm. Must run before the imports below.
sys.path.extend([
    '/data/util/CoreFunction/',
    '/data/util',
    '/data/',
    '/home/daiqiang/gif_search_data/util/CoreFunction',
    '/home/daiqiang/gif_search_data/tags_index_for_gif',
    '/Users/xinmei365/gif_search_data/util/CoreFunction',
    '/Users/xinmei365/gif_search_data/tags_index_for_gif',
     ])  
import pyUsage
# Runs at import time and prints the first BASE_DIR value.
print (pyUsage.get_cur_info(), 'BASE_DIR= ', BASE_DIR)
import pyIO
import pyString

# Build paths inside the project like this: os.path.join(BASE_DIR, ...)
# Final BASE_DIR value: two directory levels above this file's absolute path.
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))


# Quick-start development settings - unsuitable for production
# See https://docs.djangoproject.com/en/1.8/howto/deployment/checklist/

# SECURITY WARNING: keep the secret key used in production secret!
# NOTE(review): the key is hard-coded in source here.
SECRET_KEY = 'h)44u3bdct=gm-k684$o(0j4i1e5@d=h=qmd$p(u5l)k^-ai*d'

# SECURITY WARNING: don't run with debug turned on in production!
DEBUG = True
Esempio n. 30
0
def addNumber(root):
    """Append the ``ElementEntry`` with the plain-number ``Node`` rules.

    Seven nodes are created in order: the literals ``1`` and ``2`` (each
    with a ``Model`` attribute), followed by regex rules for 1-4 digit
    numbers, 1-12, 6-8 digit numbers, 11 digit numbers starting with 1, and
    any digit run. Node IDs count up from 1.

    Args:
        root: parent element handed to ``ET.SubElement``.
    """
    entry = ET.SubElement(root, 'ElementEntry')
    entry.set('Comment', '默认顺序是:先具体,后正则;先短后长')
    # NOTE(review): element [1] of get_cur_info() is presumably the calling
    # function's name, so keep this call directly in this function -- confirm.
    cur_info = pyUsage.get_cur_info()
    entry.set('TextType', cur_info[1].replace('add', ''))

    # Voice set shared by the landline and mobile number rules.
    phone_voices = [
        ('Count', '{Number(Count)}'),
        ('Telephone', '{Number(Telephone)}'),
        ('NumberBit', '{Number(Bit)}'),
    ]

    # (Comment, ContentType, Model or None, text, [(ReadType, voice text)]).
    # Raw strings: '\d' in a normal literal is an invalid escape sequence.
    node_specs = [
        ('一,幺,One,First', 'Number', 'regexNormalize_Number_1.crf_model', '1',
         [('NumberBit', '{Number(Count)}'),
          ('Kanji', '幺')]),
        ('二,两,Two', 'Number', 'regexNormalize_Number_2.crf_model', '2',
         [('二', '{Number(Count)}'),
          ('两', '两')]),
        ('年份,数量', 'Regex', None, '[1-9][0-9]{0,3}',
         [('Year', '{Number(Bit)}'),
          ('Count', '{Number(Count)}'),
          ('NumberBit', '{Number(Bit)}')]),
        ('月份,年份,数量,值为1~12', 'Regex', None, '([1-9]|1[0-2])',
         [('Month', '{Number(Count)}'),
          ('Year', '{Number(Count)}'),
          ('Count', '{Number(Count)}'),
          ('NumberBit', '{Number(Bit)}')]),
        ('座机号码', 'Regex', None, r'[1-9]\d{5,7}', phone_voices),
        ('手机号码', 'Regex', None, r'[1]\d{10}', phone_voices),
        ('其他', 'Regex', None, r'\d+',
         [('NumberBit', '{Number(Bit)}')]),
    ]

    for node_id, spec in enumerate(node_specs, start=1):
        comment, content_type, model, text, voices = spec
        node = ET.SubElement(entry, 'Node')
        node.set('Comment', comment)
        node.set('ContentType', content_type)
        # Keep the attribute order Comment, ContentType, Model, ID.
        if model is not None:
            node.set('Model', model)
        node.set('ID', '%d' % node_id)
        node.text = text
        for read_type, voice_text in voices:
            voice = ET.SubElement(node, 'Voice')
            voice.set('ReadType', read_type)
            voice.text = voice_text
Esempio n. 31
0
def getSpeechSynthesis_DataDictDir():
    """Return the dictionary data directory, relative to the algorithm dir.

    The result is the value of ``getSpeechSynthesis_AlgorithmDir()`` with
    ``'../../document/data/dict/'`` appended; it is not normalised.
    """
    algorithm_dir = getSpeechSynthesis_AlgorithmDir()
    dict_dir = algorithm_dir + '../../document/data/dict/'
    print(pyUsage.get_cur_info(), 'dict_dir= ', dict_dir)
    return dict_dir
Esempio n. 32
0
def readPinyinXml(path):
    """Parse a pinyin dictionary XML file.

    Args:
        path: file read through ``pyIO.read_file_content``.

    Returns:
        ``(lang, name, word_info_list)``. ``lang`` and ``name`` come from the
        first child of the root element; ``word_info_list`` holds one
        ``(word, pro_list)`` tuple per remaining child, where every
        pronunciation is a dict with the keys ``ProID``, ``PartOfSpeech``,
        ``PinYin`` and ``BianDiao``.

    Exits the process (status 0) when a pronunciation has an empty PinYin.
    """
    print(pyUsage.get_cur_info(), 'path= ', path)

    c_list = pyIO.read_file_content(path)

    # Rewrite a single-quoted encoding declaration to encoding="GBK". A file
    # without any such declaration is parsed as it is (this used to raise
    # IndexError).
    if not any('encoding="GBK"' in line for line in c_list):
        for quoted in ("'gbk'", "'GBK'"):
            hits = [i for i, line in enumerate(c_list)
                    if ('encoding=' + quoted) in line]
            if hits:
                c_list[hits[0]] = c_list[hits[0]].replace(quoted, '"GBK"')
                break

    root = ET.fromstring('\n'.join(c_list))

    # File header: only the first child of the root is inspected.
    lang = ''
    name = ''
    for header in root[:1]:
        lang = header.find('DictionaryLanguage').text
        name = header.find('DictionaryName').text

    # Dictionary entries
    word_info_list = []
    for entry in root[1:]:
        word = entry.find('Word')

        pro_list = []
        # NOTE(review): 'TYPE_PURE_NUMBER', '0', '1' and '2' do not look like
        # real tag names of this format ('0' is not even a valid XML name);
        # they are kept unchanged -- confirm against the dictionary files.
        for rank in entry.iter('TYPE_PURE_NUMBER'):
            tmp_dict = {
                'ProID':        rank.find('0').text or '',
                'PartOfSpeech': rank.find('1').text or '',
                'PinYin':       rank.find('2').text or '',
                'BianDiao':     rank.find('BianDiao').text or '',
            }
            pro_list.append(tmp_dict)
            # A pronunciation without pinyin is treated as fatal.
            if not tmp_dict['PinYin']:
                print(word.text, tmp_dict)
                sys.exit(0)

        pro_list = singleItem(pro_list)
        word_info_list.append((word.text, pro_list))

    return lang, name, word_info_list
Esempio n. 33
0
def saveWordPronunceList2WordPinyinXml(file_name, lang, name, word_info_dict):
    """Write a word -> pronunciations mapping as a GBK pinyin XML file.

    Args:
        file_name: target file; cleared and rewritten through ``pyIO``.
        lang: text of the ``DictionaryLanguage`` header element.
        name: text of the ``DictionaryName`` header element.
        word_info_dict: maps each word to a sequence of dicts with the keys
            ``PartOfSpeech``, ``PinYin`` and ``BianDiao``.

    Entries are ordered with ``xml_sort_list`` as sort key and every
    pronunciation gets a fresh ``ProID`` counting from 1. The serialised
    text is then patched (quotes, line breaks, explicit empty tags, leading
    space) before it is written.
    """
    # dict -> list of (word, pronunciations)
    entries = [(word, word_info_dict[word]) for word in word_info_dict]
    print(pyUsage.get_cur_info(), 'len(tmp_list)= ', len(entries))
    entries.sort(key=xml_sort_list)

    # Build the XML tree
    dictionary = ET.Element('Dictionary')
    header = ET.SubElement(dictionary, 'DictionaryHeader')
    ET.SubElement(header, 'DictionaryLanguage').text = lang
    ET.SubElement(header, 'DictionaryName').text = name

    for word, pronunciations in entries:
        entry = ET.SubElement(dictionary, 'DictionaryEntry')
        ET.SubElement(entry, 'Word').text = word

        for pro_id, pro_dict in enumerate(pronunciations, start=1):
            pron = ET.SubElement(entry, 'Pronunciation')
            # ProID is renumbered; the stored value is ignored.
            ET.SubElement(pron, 'ProID').text = '%d' % pro_id
            for tag in ('PartOfSpeech', 'PinYin', 'BianDiao'):
                ET.SubElement(pron, tag).text = pro_dict[tag]

    text = ET.tostring(dictionary, encoding="gbk", method="xml").decode('gbk')

    # multi_han.xml gets no leading space in front of header/entry tags.
    lapse_space = '' if 'multi_han.xml' in file_name else ' '

    # Applied strictly in this order; later rules act on earlier results.
    replacements = (
        ("version='1.0'", 'version="1.0"'),
        ("encoding='gbk'", 'encoding="GBK"'),
        ('<DictionaryHeader>', '\n<DictionaryHeader>'),
        ('</DictionaryHeader>', '</DictionaryHeader>\n'),
        ('</DictionaryEntry>', '</DictionaryEntry>\n'),
        # Spell out empty elements instead of the self-closing form.
        ('<PartOfSpeech />', '<PartOfSpeech></PartOfSpeech>'),
        ('<BianDiao />', '<BianDiao></BianDiao>'),
        ('<DictionaryHeader>', lapse_space + '<DictionaryHeader>'),
        ('<DictionaryEntry>', lapse_space + '<DictionaryEntry>'),
    )
    for old, new in replacements:
        text = text.replace(old, new)

    pyIO.clear_to_file(file_name)
    pyIO.add_to_file(file_name, text)
Esempio n. 34
0
def readPinyinXml(path):
    """Read a pronunciation-dictionary XML file and return its contents.

    The file content is loaded through ``pyIO.read_file_content``; a
    single-quoted GBK encoding declaration (``encoding='gbk'`` or
    ``encoding='GBK'``) is rewritten to ``encoding="GBK"``; the joined text
    is then parsed with ElementTree.

    Args:
        path: Location of the XML file, handed to ``pyIO.read_file_content``.

    Returns:
        A ``(lang, name, word_info_list)`` tuple.

        * ``lang`` / ``name`` are the texts of the ``DictionaryLanguage`` and
          ``DictionaryName`` elements under the root's first child. Both stay
          ``''`` when the root has no children.
        * ``word_info_list`` holds one ``(word_text, pro_list)`` pair for
          every remaining child of the root. ``pro_list`` is whatever
          ``singleItem`` returns for the list of pronunciation dicts (keys
          ``ProID``, ``PartOfSpeech``, ``PinYin``, ``BianDiao``); missing or
          empty sub-elements are stored as ``''``.

    Raises:
        SystemExit: with status 0, after printing the offending entry, when a
            pronunciation has an empty or missing ``PinYin``.
    """
    print(pyUsage.get_cur_info(), 'path= ', path)

    # NOTE(review): assumes read_file_content returns a list of text lines
    # without trailing newlines -- confirm against pyIO.
    c_list = pyIO.read_file_content(path)

    # Normalise the encoding declaration to encoding="GBK".  A file that
    # carries no GBK declaration at all is left untouched (the previous code
    # raised IndexError in that case).
    if not any('encoding="GBK"' in line for line in c_list):
        for quoted in ("'gbk'", "'GBK'"):
            s_flag = 'encoding=' + quoted
            hit = next(
                (i for i, line in enumerate(c_list) if s_flag in line),
                None,
            )
            if hit is not None:
                c_list[hit] = c_list[hit].replace(quoted, '"GBK"')
                break

    # Parse the document.
    root = ET.fromstring('\n'.join(c_list))

    # File header: only the first child of the root is inspected.
    lang = ''
    name = ''
    for header in root[:1]:
        lang = header.find('DictionaryLanguage').text
        name = header.find('DictionaryName').text

    # Dictionary entries: every child after the header.
    fields = ('ProID', 'PartOfSpeech', 'PinYin', 'BianDiao')
    word_info_list = []
    for entry in root[1:]:
        word = entry.find('Word')

        pro_list = []
        for rank in entry.iter('Pronunciation'):
            # findtext yields None for a missing element and '' for an empty
            # one; both are normalised to ''.
            tmp_dict = {tag: rank.findtext(tag) or '' for tag in fields}
            pro_list.append(tmp_dict)
            if not tmp_dict['PinYin']:
                # A pronunciation without pinyin aborts the run; the exit
                # status 0 is kept from the original implementation.
                print(word.text, tmp_dict)
                sys.exit(0)

        pro_list = singleItem(pro_list)
        word_info_list.append((word.text, pro_list))

    return lang, name, word_info_list
Esempio n. 35
0
def getSpeechSynthesis_ToolsSegDir():
    """Return the ``tools/seg/`` directory located under the algorithm dir.

    The algorithm directory comes from ``getSpeechSynthesis_AlgorithmDir``;
    the resulting path is also printed for tracing.
    """
    algorithm_root = getSpeechSynthesis_AlgorithmDir()
    seg_suffix = '/tools/seg/'
    seg_path = algorithm_root + seg_suffix
    print(pyUsage.get_cur_info(), 'seg_dir= ', seg_path)
    return seg_path