# `mb_config`, `split_text`, `recoder` (the speaker reference audio), and
# `logger` are defined elsewhere in the original module.
from io import BytesIO
from typing import Optional, Union
import traceback

import cn2an
import httpx
from pydub import AudioSegment


async def get_voice(text: str) -> Optional[Union[str, BytesIO]]:
    url = (
        f"http://{mb_config.mockingbird_ip}:{mb_config.mockingbird_port}/api/synthesize"
    )
    files = {"file": recoder}
    try:
        # Spell out Arabic numerals in Chinese before synthesis.
        texts = split_text(cn2an.transform(text, "an2cn"))
        sound = AudioSegment.silent(10)
        for t in texts:
            if len(t) > 50:
                # "Continuous text too long; please split it with punctuation."
                return "连续文字过长,请用标点符号分割"
            data = {"text": t, "vocoder": "HifiGAN"}
            async with httpx.AsyncClient() as client:
                resp = await client.post(url, data=data, files=files, timeout=20)
            result = resp.content
            sound += AudioSegment.from_file(BytesIO(result)) + AudioSegment.silent(200)
        output = BytesIO()
        sound.export(output, format="wav")
        return output
    except Exception:
        logger.warning(traceback.format_exc())
        return None
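# Standalone sketch of the an2cn pass above (only `cn2an` is assumed): digits
# are spelled out in Chinese so the synthesizer reads them naturally.
import cn2an

print(cn2an.transform("小王的生日是2001年3月4日", "an2cn"))
# -> 小王的生日是二零零一年三月四日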
# `char_table` (allowed CJK characters) and `SEPARATOR` (sentence-ending
# punctuation) are defined elsewhere in the original module.
def cut_sentences(inp_line, ignore_number=False):
    pro_str = ""
    sentences = []
    valid_str = True
    for char in inp_line:
        if char in char_table:
            pro_str += char
        elif char.encode("utf-8").isdigit():  # matches ASCII digits only
            if not ignore_number:
                pro_str += char
            else:
                valid_str = False
        elif char in SEPARATOR:
            if valid_str:
                sentences.append(pro_str)
            pro_str = ""
            valid_str = True
        else:
            # skip English characters and other punctuation like " ", "/"
            continue
    ret_sentences = []
    for sentence in sentences:
        if len(sentence) < 2:
            # sentence too short, ignore
            continue
        try:
            # cn2an's keyword is `method`, so pass the mode positionally
            ret_sentences.append(cn2an.transform(sentence, "an2cn"))
        except ValueError:
            # number too long; ignore the sentence
            continue
    return ret_sentences
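# Hypothetical usage sketch: `char_table` and `SEPARATOR` below are made-up
# stand-ins for the real module-level definitions.
import cn2an

char_table = set("今天度晴")
SEPARATOR = set(",。!?")
print(cut_sentences("今天25度,晴。"))
# -> ['今天二十五度'] ("晴" is dropped as too short; digits become Chinese)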
def process_item(self, item, spider):
    path = item["xs_name_path"]
    zj = item["zhangjie"]  # chapter title, e.g. "第十二章 ..."
    try:
        zj = zj.split(" ")
        zj[0] = cn2an.transform(zj[0], "cn2an")  # Chinese chapter number -> Arabic
        zj = zj[0] + " " + zj[1]
    except Exception:
        print("Chapter number conversion failed")
        return item
    t_url = item["zhengwen"]
    print("Downloading >>>", path, "chapter", zj)
    path = path + "/" + zj
    with open(path, "w") as f:  # file object; closed automatically by `with`
        f.write(zj + "\n" + t_url)
    return item
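# Minimal sketch of the chapter-number rewrite above (the chapter title is
# made up; only `cn2an` is assumed):
import cn2an

zj = "第十二章 风波".split(" ")
zj[0] = cn2an.transform(zj[0], "cn2an")
print(zj[0] + " " + zj[1])  # -> 第12章 风波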
# `torch`, `numpy as np`, and `json` are imported at module level in the
# original source.
def chat(self, question):
    self.init_embedding()
    question = cn2an.transform(question, "an2cn")
    embed = self.evaluate(question)
    embed_tensor = torch.tensor(embed)
    similarity = torch.nn.functional.cosine_similarity(
        embed_tensor, self.all_embed_tensor, dim=1, eps=1e-8)
    ret = []
    if torch.max(similarity) > self._threadhold:  # sic: threshold attribute
        k = 10
        questions = np.array(self.qa_set.questions)[torch.topk(similarity, k)[1]]
        k = self.qa_set.get_k(questions[0])
        questions = questions[:k]
        if k == 1:
            ret.append(self.qa_set.get_answer(questions[0]))
        else:
            for q in questions:
                question = '问题: {}'.format(q)  # "Question: ..."
                ans = self.qa_set.get_answer(q)
                ret.append({question: ans})
    else:
        ret.append(self.qa_set.choose_default_ans())
    return json.dumps(ret)
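# Standalone sketch of the retrieval step above (nothing here comes from the
# original class): cosine similarity against a bank of embeddings, then top-k.
import torch

query = torch.randn(1, 8)
bank = torch.randn(5, 8)
sim = torch.nn.functional.cosine_similarity(query, bank, dim=1, eps=1e-8)
topk = torch.topk(sim, k=3)
print(topk.indices.tolist())  # indices of the 3 most similar rows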
# `internal_regx_ability` is defined elsewhere in the original module.
def builtin_regx(msg):
    # Parse the built-in ability regexes.
    text = cn2an.transform(msg.text, "cn2an")
    for k, vs in internal_regx_ability.items():
        for v in vs:
            regx_values = v.findall(text)
            if regx_values:
                msg.add_entities(k, regx_values)
    return
    yield None  # unreachable; turns this handler into a generator
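# Sketch of why the text is normalized with "cn2an" first: Chinese numerals
# become digits, so one numeric regex matches both forms. The pattern below
# is a hypothetical stand-in for an entry of `internal_regx_ability`.
import re
import cn2an

text = cn2an.transform("帮我定三个闹钟", "cn2an")
print(text)                      # -> 帮我定3个闹钟
print(re.findall(r"\d+", text))  # -> ['3']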
def data(self):
    # `df_all` is a pandas DataFrame loaded elsewhere; `總樓層數` (total floor
    # count) holds either 0 or strings like "十二層".
    output = []
    res = list(df_all['總樓層數'])
    for i in res:
        if i != 0:
            i = i.strip('層')
            output += [cn2an.transform(i)]  # default method is "cn2an"
        else:
            output += [0]
    return output
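# Minimal runnable sketch of the floor-count conversion above (only `cn2an`
# is assumed):
import cn2an

print(cn2an.transform("十二層".strip("層")))  # -> "12"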
# `filter_emoji`, `trantab`, `pattern_flag`, `pattern_alpha`, and
# `is_all_chinese1` are defined elsewhere in the original module.
def data_clean(str_in):
    text_a = str_in.strip().replace(" ", "").replace("alink", "").replace("°C", "度")
    text_b = filter_emoji(text_a, restr='')
    text_1 = unicodedata.normalize('NFKC', text_b.lower().replace(" ", ""))
    # Convert Chinese punctuation to English punctuation.
    text_2 = text_1.translate(trantab)
    # Manually handle what slipped through: drop content inside (), {}, [].
    text_3 = re.sub(r"\(.*?\)|\{.*?}|\[.*?]", "", text_2)
    # Arabic numerals to Chinese numerals.
    text_4 = cn2an.transform(text_3, "an2cn")
    # Remove punctuation.
    text_5 = pattern_flag.sub('', text_4)
    if not is_all_chinese1(text_5):
        text_6 = pattern_alpha.sub('', text_5)
        if not is_all_chinese1(text_6):
            return ""
    # text_4/text_5/text_6 are used only for validation; the cleaned text
    # before numeral conversion is what gets returned.
    return text_3
def normalize_sentence_cn(self, sentence):
    """
    Convert digits to Chinese characters.
    Convert Chinese characters to pinyin.
    """
    # Convert digits to Chinese characters.
    sentence = cn2an.transform(sentence, "an2cn")
    # Delete punctuation (self.punctuation_table is a str.translate table).
    sentence = sentence.translate(self.punctuation_table)
    # Convert Chinese characters to pinyin.
    sentence = pinyin.get(sentence, format="strip", delimiter=" ")
    # Remove whitespace at both ends of the string.
    sentence = sentence.strip()
    return sentence
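# Standalone sketch of the two conversion steps above (assumes the `cn2an`
# and `pinyin` packages; the class-specific punctuation table is skipped):
import cn2an
import pinyin

s = cn2an.transform("今天3点开会", "an2cn")           # -> 今天三点开会
print(pinyin.get(s, format="strip", delimiter=" "))  # -> jin tian san dian kai hui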
# `add_chinese_one` and `get_laws_number` are helpers defined elsewhere in
# the original module.
def extract_laws_spa(law):
    # Insert a "一" between 百 and 十 where needed so cn2an parses correctly.
    law = add_chinese_one(law)
    # Extract the article (條), paragraph (項), and subparagraph (款);
    # rewrite the numerals as Arabic digits first.
    law = cn2an.transform(law, "cn2an")
    regex_article = r"第\d*條"
    regex_paragraph = r"第\d*項"
    regex_subparagraph = r"第\d*款"
    # If nothing is found, fall back to returning `law` as the act name.
    act = law
    article = ""
    paragraph = ""
    subparagraph = ""
    # Find the article; only then look for the paragraph.
    article_position = re.search(regex_article, law)
    if article_position is not None:
        # The act name is everything before the article number.
        act = law[:article_position.start()]
        article_text = law[article_position.start():article_position.end()]
        # Pull out the number itself.
        article = get_laws_number(article_text)
        # Find the paragraph; only then look for the subparagraph.
        paragraph_position = re.search(regex_paragraph, law)
        if paragraph_position is not None:
            paragraph_text = law[paragraph_position.start():paragraph_position.end()]
            paragraph = get_laws_number(paragraph_text)
            subparagraph_position = re.search(regex_subparagraph, law)
            if subparagraph_position is not None:
                subparagraph_text = law[subparagraph_position.start():subparagraph_position.end()]
                subparagraph = get_laws_number(subparagraph_text)
    return act, article, paragraph, subparagraph
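# Runnable sketch of the numeral rewrite and the article regex (only `cn2an`
# and `re` assumed; the original's helper functions are not used here):
import re
import cn2an

law = cn2an.transform("刑法第三百二十條第一項", "cn2an")
print(law)  # expected: 刑法第320條第1項
print(re.search(r"第\d+條", law).group())  # expected: 第320條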
# Script fragment: `parse_lexicon`, `read_file_to_lines`, `dict_seg`,
# `write_lines_to_file`, `converter`, and `args` come from the surrounding
# script in the original source.
lexicon = parse_lexicon(args.lexicon_path)
lines = read_file_to_lines(args.input_path)
# Drop the first two whitespace-separated fields of each line.
texts = [" ".join(line.split()[2:]) for line in lines]

cnt = 0
cutted_sents = []
valid_line_nums = []
for idx, text in enumerate(texts):
    sents = re.split(r"\s+", text)
    words = []
    segmentable = True
    for sent in sents:
        # Keep only CJK characters, Latin letters, and digits.
        sent = re.sub(r"[^\u4e00-\u9fa5A-Za-z0-9]", "", sent)
        sent = cn2an.transform(sent, "an2cn")
        sent = converter.convert(sent)
        maybe_words = dict_seg(sent, lexicon)
        if not maybe_words:
            segmentable = False
            cnt += 1
            break
        words += maybe_words
    if segmentable:
        valid_line_nums.append(idx)
        cutted_sents.append(" ".join(words))

if args.filtered_output_path is not None:
    write_lines_to_file(args.filtered_output_path,
                        [lines[idx] for idx in valid_line_nums])
if args.output_path is not None:
    write_lines_to_file(args.output_path, cutted_sents)
# `converter` is defined elsewhere in the original module (an OpenCC-style
# traditional/simplified converter, by the look of the call).
def normalize(line: str):
    line = unicodedata.normalize("NFKC", line)
    line = cn2an.transform(line, "an2cn")
    line = converter.convert(line)
    return line
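# Runnable sketch with an explicit OpenCC converter (an assumption; the
# original's `converter` could be configured differently):
import unicodedata
import cn2an
from opencc import OpenCC

converter = OpenCC("t2s")  # traditional -> simplified
line = unicodedata.normalize("NFKC", "共有12層樓")
line = cn2an.transform(line, "an2cn")
print(converter.convert(line))  # -> 共有十二层楼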
# Requires `import re`, `import urllib.parse`, `from requests import Session`,
# `import xml.etree.ElementTree as ET`, and `import cn2an` at module level.
def checkAdd(Address):
    '''
    Query the e-map of the NLSC (National Land Surveying and Mapping Center,
    Ministry of the Interior) for the full administrative address (down to
    village/neighborhood) and the coordinates of any address.

    Parameters
    ----------
    Address : str
        The address to look up.

    Returns
    -------
    Address : str
        The full administrative address.
    lnglat : list
        The [longitude, latitude] pair.
    '''
    origin = f'{Address}'
    # Rewrite street-section (段) numbers in the address as Chinese numerals.
    duan = re.findall(r'\d+段', Address)
    for s in duan:
        newS = cn2an.transform(s, "an2cn")
        Address = Address.replace(s, newS)
    # print(f'>>section rewrite result: {Address}')

    # Split off the floor (樓) and room (室) parts.
    try:
        floor = re.findall(r'\d+樓', Address)[0]
        Address = Address.replace(floor, '')
    except IndexError:
        floor = ''
    try:
        room = re.findall(r'\d+室', Address)[0]
        Address = Address.replace(room, '')
    except IndexError:
        room = ''
    addrQ = urllib.parse.quote(Address)
    # print(f'>>rewrite result: {addrQ}')

    # Query the NLSC map service for the coordinates and canonical name.
    s = Session()
    url = 'https://api.nlsc.gov.tw/MapSearch/QuerySearch'
    header = {
        "Accept": "application/xml, text/xml, */*; q=0.01",
        "Accept-Encoding": "zip, deflate, br",
        "Accept-Language": "zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7",
        "Connection": "keep-alive",
        "Content-Length": "216",
        "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
        "Host": "api.nlsc.gov.tw",
        "Origin": "https://maps.nlsc.gov.tw",
        "Referer": "https://maps.nlsc.gov.tw/T09/mapshow.action",
        "Sec-Fetch-Dest": "empty",
        "Sec-Fetch-Mode": "cors",
        "Sec-Fetch-Site": "same-site",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.113 Safari/537.36",
    }
    data = {
        "word": addrQ,
        "feedback": "XML",
    }
    r = s.post(url=url, headers=header, data=data, verify=False)
    if r.status_code == 200:
        print('>>query succeeded')
        tree = ET.fromstring(str(r.text))  # parse the XML response
        root = tree[0]
        realAddress = f"{root.find('CONTENT').text}{floor}{room}"
        # Convert full-width digits in the result back to ASCII digits.
        big = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
        small = [x for x in range(10)]
        numdict = dict(zip(big, small))
        for n in big:
            realAddress = realAddress.replace(n, f'{numdict[n]}')
        print(f'>>original address: {origin}')
        print(f'>>full address: {realAddress}')
        lnglat = f"{root.find('LOCATION').text}".split(',')
        print(f'>>coordinates: {lnglat}')
        return realAddress, lnglat
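# Minimal runnable sketch of the section-number rewrite above (the address is
# made up; only `re` and `cn2an` are assumed):
import re
import cn2an

addr = "忠孝東路1段100號"
for s in re.findall(r"\d+段", addr):
    addr = addr.replace(s, cn2an.transform(s, "an2cn"))
print(addr)  # -> 忠孝東路一段100號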
import os
import cn2an

# Assumes your files live in the same directory as this script.
# Filenames like 小王捡了一百块钱.mp4 all work.
# Module homepage: https://pypi.org/project/cn2an/
files = os.listdir(".")
for file in files:
    newname = cn2an.transform(file, "cn2an")
    os.rename(file, newname)
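# What a single rename looks like (filename taken from the comment above):
import cn2an

print(cn2an.transform("小王捡了一百块钱.mp4", "cn2an"))
# -> 小王捡了100块钱.mp4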