Example #1
from io import BytesIO
from typing import Optional, Union
import traceback

import cn2an
import httpx
from pydub import AudioSegment

# `mb_config`, `split_text`, `recoder`, and `logger` come from the
# surrounding project.
async def get_voice(text: str) -> Optional[Union[str, BytesIO]]:
    url = (
        f"http://{mb_config.mockingbird_ip}:{mb_config.mockingbird_port}/api/synthesize"
    )
    files = {"file": recoder}
    try:
        # Convert Arabic numerals to Chinese, then split into sentences.
        texts = split_text(cn2an.transform(text, "an2cn"))
        sound = AudioSegment.silent(10)
        for t in texts:
            if len(t) > 50:
                return "连续文字过长,请用标点符号分割"  # text run too long; split it with punctuation
            data = {"text": t, "vocoder": "HifiGAN"}
            async with httpx.AsyncClient() as client:
                resp = await client.post(url, data=data, files=files, timeout=20)
                result = resp.content
            # Append each synthesized clip followed by 200 ms of silence.
            sound += AudioSegment.from_file(BytesIO(result)) + AudioSegment.silent(200)
        output = BytesIO()
        sound.export(output, format="wav")
        return output
    except Exception:
        logger.warning(traceback.format_exc())
        return None
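All of these examples hinge on cn2an.transform(text, method), which rewrites the numerals inside a string in either direction. A minimal round trip, following the usage shown in the cn2an documentation:

import cn2an

# "an2cn": Arabic numerals -> Chinese numerals (as used above before synthesis).
print(cn2an.transform("小王捡了100块钱", "an2cn"))  # 小王捡了一百块钱
# "cn2an": Chinese numerals -> Arabic numerals (the reverse direction).
print(cn2an.transform("小王捡了一百块钱", "cn2an"))  # 小王捡了100块钱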
Example #2
import cn2an

# `char_table` (allowed characters) and `SEPARATOR` (sentence-ending
# punctuation) are constants defined elsewhere in the project.
def cut_sentences(inp_line, ignore_number=False):
    pro_str = ""
    sentences = []
    valid_str = True
    for char in inp_line:
        if char in char_table:
            pro_str += char
        elif char.encode("utf-8").isdigit():  # matches ASCII digits only
            if not ignore_number:
                pro_str += char
            else:
                valid_str = False
        elif char in SEPARATOR:
            if valid_str:
                sentences.append(pro_str)
            pro_str = ""
            valid_str = True
        # skip English characters and other punctuation like " ", "/"
        else:
            continue
    ret_sentences = []
    for sentence in sentences:
        if len(sentence) < 2:  # sentence too short, ignore
            continue
        try:
            # Direction passed positionally; cn2an's keyword is `method`.
            ret_sentences.append(cn2an.transform(sentence, "an2cn"))
        except ValueError:  # number too long; ignore the sentence
            continue
    return ret_sentences
Example #3
import cn2an

# Scrapy item-pipeline hook: writes each chapter to its own file.
def process_item(self, item, spider):
    path = item["xs_name_path"]
    zj = item["zhangjie"]  # chapter heading, e.g. "第十二章 ..."
    try:
        zj = zj.split(" ")
        # Convert the Chinese chapter number to Arabic numerals.
        zj[0] = cn2an.transform(zj[0], "cn2an")
        zj = zj[0] + " " + zj[1]
    except Exception:
        print("章节数字转换错误")  # chapter-number conversion failed
        return item

    t_url = item["zhengwen"]
    print("正在下载>>>", path, "章节名", zj)  # downloading >>> <path>, chapter <zj>
    path = path + "/" + zj
    with open(path, 'w') as f:  # the with-block closes the file; no f.close() needed
        f.write(zj + "\n" + t_url)
    return item
Example #4
import json

import cn2an
import numpy as np
import torch

def chat(self, question):
    self.init_embedding()
    # Convert Arabic numerals in the question to Chinese numerals.
    question = cn2an.transform(question, "an2cn")
    embed = self.evaluate(question)
    embed_tensor = torch.tensor(embed)

    similarity = torch.nn.functional.cosine_similarity(
        embed_tensor, self.all_embed_tensor, dim=1, eps=1e-8)
    ret = []
    if torch.max(similarity) > self._threadhold:  # (sic) similarity threshold
        # Take the top-10 most similar stored questions, then narrow to the
        # answer count configured for the best match.
        k = 10
        questions = np.array(self.qa_set.questions)[torch.topk(similarity, k)[1]]
        k = self.qa_set.get_k(questions[0])
        questions = questions[:k]
        if k == 1:
            ret.append(self.qa_set.get_answer(questions[0]))
        else:
            for q in questions:
                question = '问题: {}'.format(q)  # "Question: ..."
                ans = self.qa_set.get_answer(q)
                ret.append({question: ans})
    else:
        ret.append(self.qa_set.choose_default_ans())
    return json.dumps(ret)
Example #5
import cn2an

# `internal_regx_ability` maps ability names to lists of compiled regexes;
# it is defined elsewhere in the project.
def builtin_regx(msg):
    # Parse built-in abilities via regex; convert Chinese numerals to
    # Arabic first so numeric patterns can match.
    text = cn2an.transform(msg.text, "cn2an")
    for k, vs in internal_regx_ability.items():
        for v in vs:
            regx_values = v.findall(text)
            if regx_values:
                msg.add_entities(k, regx_values)
    return
    # The unreachable `yield` is deliberate: it makes this function a
    # generator (as the caller expects) that yields nothing.
    yield None
Example #6
import cn2an

# `df_all` is a pandas DataFrame loaded elsewhere; 總樓層數 is the
# "total floors" column, with values like "十二層" or 0.
def data(self):
    output = []
    res = list(df_all['總樓層數'])
    for i in res:
        if i == 0:
            output.append(0)
        else:
            # Strip the trailing 層 and convert, e.g. "十二" -> 12
            # (transform defaults to the "cn2an" direction).
            output.append(cn2an.transform(i.strip('層')))
    return output
Example #7
import re
import unicodedata

import cn2an

# `filter_emoji`, `trantab`, `pattern_flag`, `pattern_alpha`, and
# `is_all_chinese1` are helpers and tables defined elsewhere in the project.
def data_clean(str_in):
    text_a = str_in.strip().replace(" ", "").replace("alink", "").replace("°C", "度")
    text_b = filter_emoji(text_a, restr='')
    # NFKC normalization also folds full-width (Chinese) punctuation to half-width.
    text_1 = unicodedata.normalize('NFKC', text_b.lower().replace(" ", ""))
    text_2 = text_1.translate(trantab)  # manual fixes for characters NFKC misses
    text_3 = re.sub(u"\\(.*?\\)|\\{.*?}|\\[.*?]", "", text_2)  # drop content inside (), {}, []
    text_4 = cn2an.transform(text_3, "an2cn")  # Arabic numerals -> Chinese
    text_5 = pattern_flag.sub(u'', text_4)     # strip punctuation
    if not is_all_chinese1(text_5):
        text_6 = pattern_alpha.sub(u'', text_5)
        if not is_all_chinese1(text_6):
            return ""
    # text_4/text_5 serve only as an all-Chinese validity filter; the
    # cleaned (but not numeral-converted) text_3 is what gets returned.
    return text_3
Example #8
import cn2an
import pinyin

def normalize_sentence_cn(self, sentence):
    """
    Convert digits to Chinese characters, delete punctuation,
    then convert the Chinese characters to pinyin.
    """
    # Convert digits to Chinese characters.
    sentence = cn2an.transform(sentence, "an2cn")
    # Delete punctuation (self.punctuation_table is a str.translate table).
    sentence = sentence.translate(self.punctuation_table)
    # Convert Chinese characters to pinyin.
    sentence = pinyin.get(sentence, format="strip", delimiter=" ")
    # Strip whitespace at both ends.
    sentence = sentence.strip()

    return sentence
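A rough standalone sketch of the same pipeline outside the class, assuming only the two libraries above (the punctuation table is skipped here):

import cn2an
import pinyin

text = cn2an.transform("我要3杯咖啡", "an2cn")  # digits -> Chinese numerals
# tone-stripped, space-delimited pinyin, roughly "wo yao san bei ka fei"
print(pinyin.get(text, format="strip", delimiter=" "))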
Example #9
import re

import cn2an

# `add_chinese_one` and `get_laws_number` are helpers defined elsewhere
# in the project.
def extract_laws_spa(law):
    # Insert the implicit 一 between 百 and 十 (e.g. 百十 -> 百一十).
    law = add_chinese_one(law)
    # Convert Chinese numerals to Arabic so the regexes below can match.
    law = cn2an.transform(law, "cn2an")

    # Article (條), paragraph (項), and subparagraph (款) patterns.
    regex_article = r"第\d*條"
    regex_paragraph = r"第\d*項"
    regex_subparagraph = r"第\d*款"

    # If nothing is found, fall back to returning the whole string as the act.
    act = law
    article = ""
    paragraph = ""
    subparagraph = ""
    # Look for the article; only if found, look for the paragraph, and then
    # the subparagraph.
    article_position = re.search(regex_article, law)
    if article_position is not None:
        # The act name is everything before the article reference.
        act = law[:article_position.start()]
        article_text = article_position.group()
        article = get_laws_number(article_text)
        paragraph_position = re.search(regex_paragraph, law)
        if paragraph_position is not None:
            paragraph_text = paragraph_position.group()
            paragraph = get_laws_number(paragraph_text)
            subparagraph_position = re.search(regex_subparagraph, law)
            if subparagraph_position is not None:
                subparagraph_text = subparagraph_position.group()
                subparagraph = get_laws_number(subparagraph_text)

    return act, article, paragraph, subparagraph
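A hypothetical call, assuming `add_chinese_one` and `get_laws_number` behave as their names suggest (actual output depends on those helpers):

act, article, paragraph, subparagraph = extract_laws_spa("刑法第三百三十九條第一項")
# expected roughly: act == "刑法", article == "339", paragraph == "1",
# subparagraph == ""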
Example #10
import re

import cn2an

# `parse_lexicon`, `read_file_to_lines`, `write_lines_to_file`, `dict_seg`,
# `converter` (an OpenCC-style converter), and `args` come from the
# surrounding script.
lexicon = parse_lexicon(args.lexicon_path)

lines = read_file_to_lines(args.input_path)
texts = [" ".join(line.split()[2:]) for line in lines]

cnt = 0  # number of sentences that could not be segmented
cutted_sents = []
valid_line_nums = []
for idx, text in enumerate(texts):
    sents = re.split(r"\s+", text)
    words = []
    segmentable = True
    for sent in sents:
        # Keep only CJK ideographs, Latin letters, and digits.
        sent = re.sub("[^\u4e00-\u9fa5A-Za-z0-9]", "", sent)
        sent = cn2an.transform(sent, "an2cn")
        sent = converter.convert(sent)
        maybe_words = dict_seg(sent, lexicon)
        if not maybe_words:
            segmentable = False
            cnt += 1
            break
        words += maybe_words
    if segmentable:
        valid_line_nums.append(idx)
    cutted_sents.append(" ".join(words))

if args.filtered_output_path is not None:
    write_lines_to_file(args.filtered_output_path,
                        [lines[idx] for idx in valid_line_nums])
if args.output_path is not None:
    # Presumed counterpart of the branch above: write the segmented sentences.
    write_lines_to_file(args.output_path, cutted_sents)
Example #11
import unicodedata
import cn2an

# `converter` is an OpenCC-style converter defined elsewhere.
def normalize(line: str):
    line = unicodedata.normalize("NFKC", line)
    line = cn2an.transform(line, "an2cn")
    line = converter.convert(line)
    return line
Example #12
import re
import urllib.parse
import xml.etree.ElementTree as ET

import cn2an
from requests import Session

def checkAdd(Address):
    '''
    Use the e-map of the NLSC map service (內政部國土測繪中心「國土測繪圖資服務雲」)
    to look up the full administrative address (down to village/neighborhood)
    and the longitude/latitude of an arbitrary address.

    Parameters
    ----------
    Address : str
        The address to look up.

    Returns
    -------
    Address : str
        The full administrative address.
    lnglat : list
        The longitude and latitude.
    '''
    origin = Address
    # Convert the 段 (section) numbers in the address to Chinese numerals.
    duan = re.findall(r'\d+段', Address)
    for s in duan:
        newS = cn2an.transform(s, "an2cn")
        Address = Address.replace(s, newS)

    # Split off the floor (樓) and room (室) parts.
    try:
        floor = re.findall(r'\d+樓', Address)[0]
        Address = Address.replace(floor, '')
    except IndexError:
        floor = ''
    try:
        room = re.findall(r'\d+室', Address)[0]
        Address = Address.replace(room, '')
    except IndexError:
        room = ''

    addrQ = urllib.parse.quote(Address)

    # Query the NLSC map service for the coordinates and the canonical
    # place name.
    s = Session()

    url = 'https://api.nlsc.gov.tw/MapSearch/QuerySearch'

    header = {
        "Accept": "application/xml, text/xml, */*; q=0.01",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7",
        "Connection": "keep-alive",
        "Content-Length": "216",  # requests recomputes Content-Length; this value is ignored
        "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
        "Host": "api.nlsc.gov.tw",
        "Origin": "https://maps.nlsc.gov.tw",
        "Referer": "https://maps.nlsc.gov.tw/T09/mapshow.action",
        "Sec-Fetch-Dest": "empty",
        "Sec-Fetch-Mode": "cors",
        "Sec-Fetch-Site": "same-site",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.113 Safari/537.36",
    }

    data = {
        "word": addrQ,
        "feedback": "XML",
    }

    r = s.post(url=url, headers=header, data=data, verify=False)
    if r.status_code == 200:
        print('>>查詢成功')  # query succeeded

    tree = ET.fromstring(r.text)  # parse the XML response
    root = tree[0]

    realAddress = f"{root.find('CONTENT').text}{floor}{room}"
    # Convert full-width digits in the result back to ASCII digits.
    big = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
    small = list(range(10))
    numdict = dict(zip(big, small))
    for n in big:
        realAddress = realAddress.replace(n, f'{numdict[n]}')
    print(f'>>原始地址為:{origin}')       # original address
    print(f'>>完整地址為:{realAddress}')  # full administrative address

    lnglat = f"{root.find('LOCATION').text}".split(',')
    print(f'>>經緯度為:{lnglat}')         # longitude/latitude

    return realAddress, lnglat
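A minimal usage sketch; the address below is made up purely for illustration, and the call needs live access to api.nlsc.gov.tw:

full_addr, lnglat = checkAdd("臺北市中正區重慶南路1段122號4樓")
print(full_addr, lnglat)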
Example #13
import os

import cn2an

files = os.listdir(".")  # assumes the files to rename sit next to this script
# Handles names like 小王捡了一百块钱.mp4 (Chinese numerals -> Arabic).
# Library page: https://pypi.org/project/cn2an/
for file in files:
    newname = cn2an.transform(file, "cn2an")
    if newname != file:  # skip names with nothing to convert (incl. this script)
        os.rename(file, newname)