def chs_to_cht(sentence):  # argument is a list of strings
    """Convert a list of Simplified Chinese strings to Traditional Chinese.

    :param sentence: list of Simplified Chinese strings
    :return: list of the same strings in Traditional Chinese
    """
    # Join with "," so the converter runs once over the whole batch.
    # NOTE(review): items that themselves contain "," would be split
    # incorrectly on the way back — confirm inputs never contain commas.
    joined = ",".join(sentence)
    converted = Converter('zh-hant').convert(joined)
    # The original called converted.encode('utf-8') and discarded the result
    # (str.encode returns a new bytes object, it does not mutate); removed
    # as dead code.
    return converted.split(",")
def cat_to_chs(sentence):  # argument is a list of strings
    """Convert a list of Traditional Chinese strings to Simplified Chinese.

    :param sentence: list of Traditional Chinese strings
    :return: list of the same strings in Simplified Chinese
    """
    # Join with "," so the converter runs once over the whole batch.
    # NOTE(review): items containing "," would be split incorrectly on the
    # way back — confirm inputs never contain commas.
    joined = ",".join(sentence)
    converted = Converter('zh-hans').convert(joined)
    # Dead `converted.encode('utf-8')` call removed — its result was
    # discarded (str.encode does not mutate the string).
    return converted.split(",")
def post(self):
    """Handle a travel-insurance quote request.

    Normalizes the request body to Traditional Chinese, calls the
    Cathay Life ``doTravelInsuranceCalculate`` API with hard-coded quote
    parameters, extracts the reply text, and returns it wrapped in the
    project's standard response envelope.
    """
    json_from_request = json.loads(
        Converter('zh-hant').convert(
            request.stream.read().decode('utf-8')))
    LOG.debug('In doTravelInsuranceCalculate, data received from TE: %s' %
              json.dumps(json_from_request, ensure_ascii=False, indent=4))
    # Hard-coded quote parameters; field meanings are defined by the
    # upstream API (DAY = trip length, *_FACE_AMT = coverage amounts).
    payload = {
        "DAY": 10,
        "HD_FACE_AMT": 1000000,
        "MR_FACE_AMT": 100000,
        "OHS_FACE_AMT": 100000
    }
    headers = {
        'Content-Type': 'application/x-www-form-urlencoded',
        'X-Requested-With': 'XMLHttpRequest'
    }
    url = constants.CATHAYLIFE_URL + 'SYL0_M030/doTravelInsuranceCalculate'
    LOG.debug('request cathaylife travel insurance caculate API: %s' % url)
    LOG.debug('payload: %s' %
              json.dumps(payload, ensure_ascii=False, indent=4))
    r = requests.post(url,
                      json=payload,
                      timeout=float(constants.REQUEST_TIMEOUT),
                      headers=headers)
    r_obj = r.json()
    LOG.debug('response: %s' %
              json.dumps(r_obj, ensure_ascii=False, indent=4))
    # SECURITY(review): eval() on a remote response executes arbitrary code
    # if the upstream service is compromised — consider ast.literal_eval or
    # a JSON parse once the payload format is confirmed.
    r_obj = eval(r_obj['msg_response']['update']['content'])
    # Strip HTML line breaks from the user-visible answer text.
    res = r_obj['text'].replace("<br>", "")
    update_kv_map = {"response": res}
    ret = encapsule_rtn_format(update_kv_map, None)
    return Response(json.dumps(ret), status=200)
def filter_chinese(sentence: str) -> str:
    """Basic preprocessing for Chinese text.

    Keeps only CJK ideographs (U+4E00..U+9FA5) and a small set of
    full-width punctuation, strips all whitespace, converts Traditional
    to Simplified Chinese, and lower-cases the result.

    :param sentence: input sentence or text
    :return: the cleaned text
    """
    kept_punct = (',', '。', '?', '!')
    chars = []
    for ch in sentence:
        if u'\u4e00' <= ch <= u'\u9fa5' or ch in kept_punct:
            chars.append(ch)
    cleaned = ''.join(chars)
    # Remove any whitespace that survived the character filter.
    cleaned = re.sub(r"\s+", "", cleaned)
    # Normalize Traditional Chinese to Simplified.
    cleaned = Converter('zh-hans').convert(cleaned)
    return cleaned.strip().lower()
def post(self):
    """Serve the next canned ASR answer for a task from ASR_test.xlsx.

    Tracks a per-task counter in the returned key/value map; each call
    returns the answer at the counter's row for the task's ID, and marks
    the final row with ``last_ans``.
    """
    json_from_request = json.loads(
        Converter('zh-hant').convert(
            request.stream.read().decode('utf-8')))
    LOG.debug('In Give_Fake_Data, data received from TE: %s' %
              json.dumps(json_from_request, ensure_ascii=False, indent=4))
    df = pd.read_excel("ASR_test.xlsx")
    task_info = json_from_request['task_info']
    ID = int(task_info['ID'])
    update_kv_map = {}
    # First call starts at 0; later calls resume from the stored counter.
    if 'counter' not in task_info:
        update_kv_map['counter'] = 0
    else:
        update_kv_map['counter'] = task_info['counter'] + 1
    # The identical lookup was duplicated in both branches; hoisted here.
    ans = df.at[df[df['ID'] == ID].index[update_kv_map['counter']], "answer"]
    if update_kv_map['counter'] == (len(df[df['ID'] == ID]) - 1):
        # NOTE(review): "ture" looks like a typo for "true", but downstream
        # consumers may match on this exact string — confirm before fixing.
        update_kv_map['last_ans'] = "ture"
    update_kv_map['ans'] = ans
    ret = encapsule_rtn_format(update_kv_map, None)
    return Response(json.dumps(ret), status=200)
def store_code_info(lang, code_info):
    """Upsert a country-division row and its localized names.

    Writes through the module-level DB_CURSOR; when it is None the SQL is
    only printed via myToolbox.print_sql (dry-run mode). Only the
    (lang == 'zh', division_id != 0) combination does any work; 'en' rows
    are deliberately skipped here.

    :param lang: source-language tag, 'en' or 'zh'
    :param code_info: dict with 'division_id', 'name' and 'adm_code' keys
    """
    global DB_CURSOR
    # Traditional-Chinese variant of the (Simplified) source name; stored
    # under the plain 'zh' locale below.
    name_zh = Converter('zh-hant').convert(code_info['name'])
    if lang == 'en' and code_info['division_id'] != 0:
        pass
    elif lang == 'zh' and code_info['division_id'] != 0:
        # Main division row, keyed on division_id; name refreshed on dup.
        sql = """INSERT INTO api_country_divisions (division_id, locale, name, adm_code) VALUES (%s, %s, %s, %s) ON DUPLICATE KEY UPDATE name=%s"""
        params = (code_info['division_id'], 'zh_CN', code_info['name'],
                  code_info['adm_code'], code_info['name'])
        if DB_CURSOR is None:
            myToolbox.print_sql(sql, params)
        else:
            DB_CURSOR.execute(sql, params)
        # Localized name, zh_CN locale: the Simplified source name.
        sql = """INSERT INTO api_country_division_localized_names (division_id, locale, localized_name) VALUES (%s, %s, %s) ON DUPLICATE KEY UPDATE localized_name=%s"""
        params = (code_info['division_id'], 'zh_CN', code_info['name'],
                  code_info['name'])
        if DB_CURSOR is None:
            myToolbox.print_sql(sql, params)
        else:
            DB_CURSOR.execute(sql, params)
        # Localized name, generic 'zh' locale: the Traditional conversion.
        sql = """INSERT INTO api_country_division_localized_names (division_id, locale, localized_name) VALUES (%s, %s, %s) ON DUPLICATE KEY UPDATE localized_name=%s"""
        params = (code_info['division_id'], 'zh', name_zh, name_zh)
        if DB_CURSOR is None:
            myToolbox.print_sql(sql, params)
        else:
            DB_CURSOR.execute(sql, params)
def post(self):
    """Look up the bot's current work entry and return its status.

    Reads app_id from the (Simplified-Chinese-normalized) request body,
    queries bot_work_list, and returns the formatted status string.
    NOTE(review): when the query does not return exactly one row this
    falls through and returns None — confirm that is intended.
    """
    json_from_request = json.loads(
        Converter('zh-hans').convert(
            request.stream.read().decode('utf-8')))
    app_id = json_from_request['app_id']
    log.info('app_id: ' + app_id)
    conn = None  # BUG FIX: bound before try so finally cannot NameError
    try:
        conn = pymysql.Connect(host=const.DB_HOST,
                               user=const.DB_ACCOUNT,
                               passwd=const.DB_PASSWORD,
                               charset='utf8')
        data = dao.Database(conn).query_bot_work_list(app_id)
        log.info(data)
        if len(data) == 1:
            result = {
                'bot_id': data[0][0],
                'work': data[0][1],
                'return_flag': data[0][2],
                'return_finish': data[0][3]
            }
            log.info(result)
            update_kv_map = {
                "bot_response":
                work_num_to_str(data[0][1]) + return_flag_to_str(data[0][2])
            }
            ret = encapsule_rtn_format(update_kv_map, None)
            return Response(json.dumps(ret), status=200)
    except Exception as e:
        log.info("query_bot_work_list occured some error: " +
                 utils.except_raise(e))
    finally:
        # Close only if the connection was actually opened; previously a
        # failure inside pymysql.Connect raised NameError here.
        if conn is not None:
            conn.close()
def handle_message(event):
    """Reply with the Traditional-Chinese form of a message, but only
    when the conversion actually changed something."""
    original = event.message.text
    traditional = Converter('zh-hant').convert(original)
    if traditional == original:
        return
    line_bot_api.reply_message(event.reply_token,
                               TextSendMessage(text=traditional))
def traditional2simplified(sentence):
    """Return *sentence* with Traditional Chinese characters rewritten
    as Simplified Chinese.

    :param sentence: sentence to convert
    :return: the converted sentence
    """
    return Converter('zh-hans').convert(sentence)
def Traditional2Simplified(sentence):
    '''Convert Traditional Chinese characters in *sentence* to Simplified.

    :param sentence: sentence to convert
    :return: the sentence rewritten in Simplified Chinese
    '''
    converter = Converter('zh-hans')
    return converter.convert(sentence)
def Simplified2Traditional(sentence):
    '''Convert Simplified Chinese characters in *sentence* to Traditional.

    :param sentence: sentence to convert
    :return: the sentence rewritten in Traditional Chinese
    '''
    converter = Converter('zh-hant')
    return converter.convert(sentence)
def convert_cht_name(data):
    """Rewrite every node's 'name' field as Simplified Chinese in place,
    printing a log line for each name that changed."""
    for node in data:
        if 'name' not in node:
            continue
        simplified = Converter('zh-hans').convert(node['name'])
        if node['name'] == simplified:
            continue
        print('id:\'{}\':convert name from \'{}\' to \'{}\''.format(
            node['id'], node['name'], simplified))
        node['name'] = simplified
def trd2smp(sentence):
    '''Traditional -> Simplified Chinese conversion.

    :param sentence: sentence to convert
    :return: the sentence in Simplified Chinese
    '''
    result = Converter('zh-hans').convert(sentence)
    return result
def smp2trd(sentence):
    '''Simplified -> Traditional Chinese conversion.

    :param sentence: sentence to convert
    :return: the sentence in Traditional Chinese
    '''
    result = Converter('zh-hant').convert(sentence)
    return result
def simplified2traditional(sentence):
    """Return *sentence* with Simplified Chinese characters rewritten
    as Traditional Chinese.

    :param sentence: sentence to convert
    :return: the converted sentence
    """
    return Converter('zh-hant').convert(sentence)
def post(self):
    """Dispatch a work command to the bot when it is idle.

    Checks bot_work_list for the app; if no work is in progress
    (work_stats == 0) inserts the requested work and acknowledges,
    otherwise replies that the bot is still busy.
    """
    json_from_request = json.loads(
        Converter('zh-hans').convert(
            request.stream.read().decode('utf-8')))
    user_execute_work = json_from_request['task_info']['bot_execute']
    log.info(json.dumps(json_from_request))
    app_id = json_from_request['app_id']
    log.info('app_id: ' + app_id)
    work_stats = 0
    update_kv_map = {}
    conn = None  # BUG FIX: bound before try so finally cannot NameError
    try:
        conn = pymysql.Connect(host=const.DB_HOST,
                               user=const.DB_ACCOUNT,
                               passwd=const.DB_PASSWORD,
                               charset='utf8')
        data = dao.Database(conn).query_bot_work_list(app_id)
        log.debug(data)
        if len(data) == 1:
            work_stats = data[0][1]
    except Exception as e:
        log.info("query_bot_work_list occured some error: " +
                 utils.except_raise(e))
    finally:
        if conn is not None:
            conn.close()
    if work_stats == 0:
        conn = None  # same guard for the second connection
        try:
            data = {}
            data['bot_id'] = json_from_request['app_id']
            data['work'] = user_execute_work
            conn = pymysql.Connect(host=const.DB_HOST,
                                   user=const.DB_ACCOUNT,
                                   passwd=const.DB_PASSWORD,
                                   charset='utf8')
            update_row = dao.Database(conn).insert_work_to_bot_work_list(
                data)
        except Exception as e:
            log.info("insert_work_to_bot_work_list occured some error: " +
                     utils.except_raise(e))
        finally:
            if conn is not None:
                conn.close()
        log.info(user_execute_work)
        update_kv_map = {
            "bot_response":
            work_num_to_str(int(user_execute_work)) + '派車,成功'
        }
    else:
        update_kv_map = {
            "bot_response":
            '正在執行' + work_num_to_str(work_stats) + '任務,請稍後在下命令'
        }
    ret = encapsule_rtn_format(update_kv_map, None)
    return Response(json.dumps(ret), status=200)
def preprocess_sentence(sentence):
    """Normalize a sentence: convert Traditional Chinese to Simplified,
    then apply the project's change_sentence() rewriting."""
    simplified = Converter('zh-hans').convert(sentence)
    return change_sentence(simplified)
def Traditional2Simplified(sentences):
    '''Convert every sentence in *sentences* from Traditional to
    Simplified Chinese, with a tqdm progress bar.

    :param sentences: iterable of sentences to convert
    :return: list of converted sentences
    '''
    return [Converter('zh-hans').convert(s) for s in tqdm(sentences)]
def item_get(iid):
    '''Call the Onebound API ``item_get`` endpoint and return item details.

    The JSON response is converted to Traditional Chinese before parsing.

    :param iid: numeric item id to query
    :return: the ``item`` dict, or None on any failure / exhausted retries
    '''
    url = "{url}?key={apiKey}&secret={apiSecret}&api_name=item_get&num_iid={iid}".format(
        url=cfg.api['url'],
        apiKey=cfg.api['key'],
        apiSecret=cfg.api['secret'],
        iid=iid)
    downloaded = False
    try_times = 0  # typo `try_tiems` fixed (local name only)
    try:
        # BUG FIX: the original condition was
        # `while not downloaded or try_tiems > cfg.api['max_try_times']`,
        # which keeps looping forever once the retry budget is exceeded.
        # Retry only while not downloaded AND attempts remain.
        while not downloaded and try_times < cfg.api['max_try_times']:
            try_times += 1
            try:
                r = requests.get(url, headers=cfg.headers, timeout=30).json()
                r_t = Converter('zh-hant').convert(
                    json.dumps(r, ensure_ascii=False))
                json_obj = json.loads(r_t)
            except Exception as e:
                print(bcolors.FAIL + '''
API 回傳了無效的資料格式,請聯繫 API 供應商取得協助。
錯誤信息:
{}
收到的資料:
{}
'''.format(e, json.dumps(r)) + bcolors.ENDC)
                return None
            # API error handling
            if 'item' not in json_obj:
                print(bcolors.FAIL +
                      '''API 服務發生錯誤,請聯繫 API 供應商或程式開發者。''' +
                      bcolors.ENDC)
                if 'error' in json_obj:
                    print('錯誤信息:')
                    print(json_obj['error'])
                    print(bcolors.ENDC)
                return None
            item = json_obj['item']
            # Only finish the loop when the downloaded data is non-blank.
            if item['title'] != '':
                downloaded = True
                return item
        return None  # retries exhausted without a usable item
    except Exception:
        # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
        # are no longer swallowed.
        return None
def speech_to_text(file_name):  # function for audio recognition
    """Recognize speech in an audio file as Traditional-Chinese text.

    Converts the input to .wav, runs Google speech recognition with the
    zh-tw model, then normalizes the transcript to Traditional Chinese.

    :param file_name: path to the source audio file
    :return: recognized text, or "無法辨識內容" when recognition fails
    """
    wav_file_name = convert_audio_extension(file_name)
    ''' File Type : .wav '''
    r = sr.Recognizer()
    harvard = sr.AudioFile(wav_file_name)
    try:
        with harvard as source:
            audio = r.record(source)
        recognize_text = r.recognize_google(audio, language='zh-tw')
        recognize_text = Converter('zh-hant').convert(recognize_text)
    except Exception:
        # BUG FIX: was a bare `except:`, which also swallowed SystemExit
        # and KeyboardInterrupt; narrowed to Exception.
        recognize_text = "無法辨識內容"
    return recognize_text
def train_wiki_word2vector():
    """Train a word2vec model on the Chinese-Wikipedia corpus.

    Pipeline: convert the extracted dump to Simplified Chinese, run the
    project's article processing, write one article per line, then train
    gensim Word2Vec and save the vectors in text format.
    """
    # Pass 1: Traditional -> Simplified conversion of the raw extracted dump.
    f_read = open(os.path.join(sys.path[0], 'extracted_data.txt'),
                  'r',
                  encoding='utf-8')
    f_write = open(os.path.join(sys.path[0], 'processed_extracted_data.txt'),
                   'w',
                   encoding='utf-8')
    for line in f_read:
        sentence = Converter('zh-hans').convert(line)
        f_write.write(sentence)
    f_read.close()
    f_write.close()
    path_main = sys.path[0] + '/Data Source'
    # NOTE(review): pass 1 writes 'processed_extracted_data.txt' but this
    # reads 'processed_extracted_wiki_data.txt' — as written, pass 1's
    # output is never consumed. Confirm which filename is intended.
    data = codecs.open('processed_extracted_wiki_data.txt',
                       'r',
                       encoding='utf-8')
    processed_data = process_raw_articles(path_main, data)
    i = 0
    space = ' '
    # Pass 2: one space-joined article per line, with periodic progress.
    current_file = codecs.open('clear_wiki_zh_data.txt', 'w',
                               encoding='utf-8')
    for text in processed_data:
        current_file.write(space.join(text) + "\n")
        i = i + 1
        if i % 100 == 0:
            print("Saved " + str(i) + " articles")
    current_file.close()
    # Pass 3: train and dump vectors in text (non-binary) format.
    sentences = word2vec.Text8Corpus("clear_wiki_zh_data.txt")
    model = Word2Vec(sentences,
                     size=200,
                     window=5,
                     min_count=10,
                     workers=multiprocessing.cpu_count(),
                     sample=0.001,
                     sorted_vocab=True)
    model.wv.save_word2vec_format('word2vector_wiki_zh_info.bin2',
                                  binary=False)
def client_translate(self):
    """Translate the current clipboard text and show it in the Text widget.

    Skips work when the clipboard has not changed since the last call.
    When auto mode is on, reschedules itself every 2 seconds via a
    threading.Timer.
    """
    global dic
    global root
    #T.delete(0,END)
    print("Start Translate\n")
    #pyperclip.copy("""今天天氣如何""")
    msg = pyperclip.paste()
    if msg == "":
        # Demo fallback text used when the clipboard is empty.
        msg = """今天天氣如何"""
    if self._lastTranslated == msg:
        if self._isAuto == True:
            # keep doing auto translation
            self._timer = threading.Timer(2.0, self.client_translate)
            self._timer.start()
        return
    T.delete(1.0, END)
    translatedMsg = dic.translateFinal(msg)
    #line = Converter('zh-hans').convert(line.decode('utf-8'))
    #line = line.encode('utf-8')
    # Convert the translation to Traditional Chinese for display.
    chtMsg = Converter('zh-hant').convert(translatedMsg)
    #print(translatedMsg)
    print(chtMsg)
    self._lastTranslated = msg
    #T.insert(END,translatedMsg)
    if self._isBilingual == True:
        # Bilingual mode: show the source text above the translation.
        T.insert(END, msg + "\n")
    T.insert(END, chtMsg)
    if self._isAuto == True:
        # keep doing auto translation
        self._timer = threading.Timer(2.0, self.client_translate)
        self._timer.start()
# -*- coding: utf-8 -*- __author__ = 'v-tedl' import os from langconv import Converter path = r'%s/redirectData/' % os.getcwd() KBItem2RedirectItem = dict() for file in os.listdir(path): print "process %s..." % file fp = open(path + file, 'r') for line in fp: KBItem, RedirectItem = line.strip('\n').split('\t') if KBItem == "KBItem": continue SimRedirectItem = Converter('zh-hans').convert(RedirectItem.decode('utf8')).encode('utf8') if KBItem in KBItem2RedirectItem.iterkeys(): KBItem2RedirectItem[KBItem].append(RedirectItem) if SimRedirectItem != RedirectItem: KBItem2RedirectItem[KBItem].append(SimRedirectItem) else: KBItem2RedirectItem[KBItem] = [RedirectItem] if SimRedirectItem != RedirectItem: KBItem2RedirectItem[KBItem].append(SimRedirectItem) fp.close() lines = [] for key in KBItem2RedirectItem.iterkeys(): simKey = Converter('zh-hans').convert(key.decode('utf8')).encode('utf8') if key == simKey: line = '\t'.join([key] + KBItem2RedirectItem[key]) else: line = '\t'.join([key, simKey] + KBItem2RedirectItem[key])
def Traditional2Simplified(sentence):
    """Convert Traditional Chinese characters in *sentence* to
    Simplified Chinese and return the result."""
    return Converter('zh-hans').convert(sentence)
    return (1, 1)  # NOTE(review): tail of a function whose start is above this chunk


if __name__ == '__main__':
    # Translate a card-price spreadsheet: look up local translations from
    # the cards DB, then normalize leftover Traditional Chinese.
    (conn, cursor) = cardsdb.init()
    filename = '大表'
    wb = openpyxl.load_workbook(filename + '.xlsx')
    sheet = wb[wb.sheetnames[0]]
    (base_row, base_col) = find_base_cell(sheet)
    # Pass 1: the 'Card name' header row is translated across four columns;
    # other rows translate only the name column and format the three
    # numeric columns.
    for r in range(base_row, sheet.max_row + 1):
        cell = sheet.cell(r, base_col).value
        if cell == 'Card name':
            for i in range(4):
                sheet.cell(r, base_col + i).value = translate_local(
                    cursor, sheet.cell(r, base_col + i).value)
        else:
            sheet.cell(r, base_col).value = translate_local(cursor, cell)
            for i in range(1, 4):
                sheet.cell(r, base_col + i).number_format = '0.0000'
    sheet.column_dimensions[get_column_letter(base_col)].width = 13
    sheet.column_dimensions[get_column_letter(base_col + 1)].width = 8
    sheet.column_dimensions[get_column_letter(base_col + 2)].width = 11
    sheet.column_dimensions[get_column_letter(base_col + 3)].width = 11
    # Pass 2: convert any remaining Traditional Chinese names to Simplified.
    for r in range(base_row, sheet.max_row + 1):
        cell = sheet.cell(r, base_col).value
        if cell != None:
            sheet.cell(r, base_col).value = Converter('zh-hans').convert(cell)
    wb.save(filename + '翻译版.xlsx')
    conn.close()
import jieba as jb
import logging
from stopWords import removeStopWordsJbg
import MySQLdb as mdb
from langconv import Converter

jb.enable_parallel()
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

weiboAppKey = '82966982'
tagsDict = {}
a = 1
b = 2
L = 2
# Shared Traditional -> Simplified converter instance.
cvt = Converter('zh-hans')


def setEleInDict(d, e, n):
    # Add n to d[e], creating the entry when missing.
    if e not in d:
        d[e] = n
    else:
        d[e] += n


def setTagInDict(d, i, t, n):
    # d[i] holds a [tag, count] pair; create it or accumulate the count.
    if i not in d:
        d[i] = [t, n]
    else:
        d[i][1] += n


def getWordsFromWeibo(text):
    # NOTE(review): this definition is truncated at the chunk boundary —
    # only the first statement is visible here.
    text = text.lower()
def tradition2simple(word):
    """Return *word* converted from Traditional to Simplified Chinese."""
    converted = Converter('zh-hans').convert(word)
    return converted
def simple2tradition(word):
    """Return *word* converted from Simplified to Traditional Chinese."""
    converted = Converter('zh-hant').convert(word)
    return converted
def convert_track(tagger, metadata, release, track):
    """Convert every string tag value in *metadata* to the local script.

    Non-string values are left untouched; string values are rewritten
    in place via the TO_LOCAL converter.
    """
    for key in metadata:
        # BUG FIX: the original tested isinstance('', (str, unicode)) — the
        # empty-string literal, which is always a str — so the guard never
        # skipped anything. Test the actual tag value instead.
        if not isinstance(metadata[key], (str, unicode)):
            continue
        metadata[key] = Converter(TO_LOCAL).convert(metadata[key])
def store_lang_info(lang, lang_info):
    """Normalize ISO-639 codes in *lang_info* and upsert language rows.

    Writes through the module-level DB_CURSOR; when it is None the SQL is
    only printed via myToolbox.print_sql (dry-run mode). Only entries with
    lang == 'zh' and a non-empty iso_639_1 code are stored.

    :param lang: source-language tag of the scraped page
    :param lang_info: dict with iso_639_* codes and localized name fields
    """
    global DB_CURSOR
    # Split "bibliographic/terminological" 639-2 pairs like "chi/zho";
    # otherwise the T code mirrors the B code.
    if lang_info['iso_639_2'] != '':
        iso_639_2 = lang_info['iso_639_2'].split('/')
        if len(iso_639_2) == 2:
            lang_info['iso_639_2'] = iso_639_2[0]
            lang_info['iso_639_2_t'] = iso_639_2[1]
        else:
            lang_info['iso_639_2_t'] = lang_info['iso_639_2']
    else:
        lang_info['iso_639_2_t'] = ''
    # Any code that is not exactly three lowercase letters becomes NULL.
    if not re.match(r'^[a-z]{3}$', lang_info['iso_639_2']):
        lang_info['iso_639_2'] = None
    if not re.match(r'^[a-z]{3}$', lang_info['iso_639_2_t']):
        lang_info['iso_639_2_t'] = None
    if len(lang_info['iso_639_3']) > 3:
        lang_info['iso_639_3'] = lang_info['iso_639_3'][0:3]
    if not re.match(r'^[a-z]{3}$', lang_info['iso_639_3']):
        lang_info['iso_639_3'] = None
    if lang == 'zh' and lang_info['iso_639_1'] != '':
        print(
            "INFO (%s) > Got and stored language: %s, %s, %s, %s (%s, %s, %s, %s)"
            % (myToolbox.get_time(), lang_info['iso_639_1'],
               lang_info['iso_639_2'], lang_info['iso_639_2_t'],
               lang_info['iso_639_3'], lang_info['self_name'],
               lang_info['en'], lang_info['zh_CN'], lang_info['zh_TW']))
        # Main language row keyed on the ISO codes.
        sql = """INSERT INTO api_languages (iso_639_1_code, iso_639_2_b_code, iso_639_2_t_code, iso_639_3_code, self_name) VALUES (%s, %s, %s, %s, %s) ON DUPLICATE KEY UPDATE self_name=%s"""
        params = (lang_info['iso_639_1'], lang_info['iso_639_2'],
                  lang_info['iso_639_2_t'], lang_info['iso_639_3'],
                  lang_info['self_name'], lang_info['self_name'])
        if DB_CURSOR is None:
            myToolbox.print_sql(sql, params)
        else:
            DB_CURSOR.execute(sql, params)
        # Localized-name upsert reused (with different params) for the
        # self-name, 'en', 'zh_CN', derived 'zh' and 'zh_TW' locales below.
        sql = """INSERT INTO api_language_localized_names (iso_639_1_code, locale, localized_name) VALUES (%s, %s, %s) ON DUPLICATE KEY UPDATE localized_name=%s"""
        params = (lang_info['iso_639_1'], lang_info['iso_639_1'],
                  lang_info['self_name'], lang_info['self_name'])
        if DB_CURSOR is None:
            myToolbox.print_sql(sql, params)
        else:
            DB_CURSOR.execute(sql, params)
        if lang_info['en'] != '':
            params = (lang_info['iso_639_1'], 'en', lang_info['en'],
                      lang_info['en'])
            if DB_CURSOR is None:
                myToolbox.print_sql(sql, params)
            else:
                DB_CURSOR.execute(sql, params)
        if lang_info['zh_CN'] != '':
            params = (lang_info['iso_639_1'], 'zh_CN', lang_info['zh_CN'],
                      lang_info['zh_CN'])
            if DB_CURSOR is None:
                myToolbox.print_sql(sql, params)
            else:
                DB_CURSOR.execute(sql, params)
            # Generic 'zh' locale stores the Traditional conversion of
            # the Simplified name.
            name_zh = Converter('zh-hant').convert(lang_info['zh_CN'])
            params = (lang_info['iso_639_1'], 'zh', name_zh, name_zh)
            if DB_CURSOR is None:
                myToolbox.print_sql(sql, params)
            else:
                DB_CURSOR.execute(sql, params)
        if lang_info['zh_TW'] != '':
            params = (lang_info['iso_639_1'], 'zh_TW', lang_info['zh_TW'],
                      lang_info['zh_TW'])
            if DB_CURSOR is None:
                myToolbox.print_sql(sql, params)
            else:
                DB_CURSOR.execute(sql, params)
    else:
        pass
class Database(list):
    """Python 2: a list of (text, attrib) entries parsed from WikiExtractor
    <doc> XML dumps, with gensim helpers.

    Each entry is (unicode text, attrib dict); attrib includes 'title'.
    All text is normalized to Simplified Chinese via the class converter.
    """

    # Splits a concatenated dump into one string per <doc> element.
    _re_split = re.compile("(?<=</doc>).*?(?=<doc)", flags=re.DOTALL)
    # Stray HTML line breaks are stripped before XML parsing.
    _re_ignore = re.compile("<br>")
    # NOTE(review): this pair is a no-op as written — it looks like the
    # replacement was originally the XML entity " &amp; " and was decoded
    # during extraction; confirm against the original file.
    _xml_escape = [(" & ", " & ")]
    # Shared Traditional -> Simplified converter.
    _converter = Converter("zh-hans")

    @staticmethod
    def cond_length(length):
        # Predicate factory: keep entries whose text is longer than *length*.
        return lambda x: len(x[0]) > length

    @staticmethod
    def cond_title(condition):
        # Predicate factory: apply *condition* to the entry's title.
        return lambda x: condition(x[1]["title"])

    def __init__(self, database=None, conditions=[]):
        # NOTE(review): mutable default [] is shared across calls; it is
        # only read here, but a None sentinel would be safer.
        if isinstance(database, list):
            super(Database, self).__init__(database)
        elif isinstance(database, str):
            self.load_data(database, conditions=conditions)

    def load_data(self, path, conditions=[]):
        # Count the files once up front so progress can be reported.
        walk = os.walk(path)
        total = reduce(int.__add__, map(lambda t: len(t[2]), walk))
        count = 0
        success = 0
        filtered = 0
        fail = 0
        print "loading data from %s" % path
        self.entities = set()
        for dirpath, dirnames, filenames in os.walk(path):
            for filename in filenames:
                fullname = os.path.join(dirpath, filename)
                xml_raw = open(fullname, "r").read()
                for xml in self._re_split.split(xml_raw):
                    xml = self._re_ignore.sub("", xml)
                    for old, new in self._xml_escape:
                        xml = xml.replace(old, new)
                    try:
                        e = et.fromstring(xml)
                        # Force py2 str -> unicode before conversion.
                        if not isinstance(e.text, unicode):
                            e.text = e.text.decode()
                        if not isinstance(e.attrib["title"], unicode):
                            e.attrib["title"] = e.attrib["title"].decode()
                        e.text = self._converter.convert(e.text)
                        for key in e.attrib:
                            e.attrib[key] = self._converter.convert(
                                e.attrib[key])
                        data = (e.text, e.attrib)
                        # for/else: append only when no condition rejects.
                        for condition in conditions:
                            if not condition(data):
                                filtered += 1
                                break
                        else:
                            success += 1
                            self.append(data)
                            self.entities.add(e.attrib["title"])
                    except et.ParseError:
                        fail += 1
                count += 1
                #if count >= 10 :
                #    break
                print "\rfiles: %d / %d" % (count, total),
                sys.stdout.flush()
        print "... %d loaded, %d filtered, %d fails" % (success, filtered,
                                                        fail)

    @property
    def sentences(self):
        # return tokenized sentences
        for text, attrib in self:
            yield list(jieba.cut(text))

    @property
    def dictionary(self):
        # return gensim Dictionary (built lazily, cached on the instance)
        if not hasattr(self, "_dictionary"):
            self._dictionary = Dictionary(self.sentences, prune_at=None)
        return self._dictionary

    @property
    def corpus(self):
        # return sparse vectors for gensim models (built lazily, cached)
        if not hasattr(self, "_corpus"):
            self._corpus = [
                self.dictionary.doc2bow(sentence)
                for sentence in self.sentences
            ]
        return self._corpus

    def save(self, filename):
        # Pickle the whole database to *filename*.
        pickle.dump(self, open(filename, "w"))

    @staticmethod
    def load(filename):
        # Inverse of save(): unpickle a Database from *filename*.
        return pickle.load(open(filename))