def convert_to_strings(wikipage):
    """Return a structured dictionary holding all information from a wikipage.

    The returned dict has keys 'title', 'summary' (lead section, Traditional
    Chinese), 'sections' (list of (sub_title, content) tuples or None) and
    'links'.  Any field that cannot be extracted is set to None.
    """
    from hanziconv import HanziConv
    import wikitextparser as wtp

    # Parse once instead of re-parsing wikipage.content for every field
    # (the original parsed it three separate times).
    try:
        parsed_sections = wtp.parse(wikipage.content).sections
    except Exception:
        parsed_sections = None

    summary = None
    if parsed_sections:
        try:
            summary = HanziConv.toTraditional(parsed_sections[0].pprint())
        except Exception:
            summary = None

    sections = None
    if parsed_sections:
        try:
            body = [HanziConv.toTraditional(sec.pprint())
                    for sec in parsed_sections[1:]]
            # sec.title comes wrapped in delimiters; [1:-1] strips them.
            sub_titles = [HanziConv.toTraditional(sec.title[1:-1])
                          for sec in parsed_sections[1:]]
            # Drop the heading line: keep everything after the first newline.
            section_content = [s[s.find('\n') + 1:] for s in body]
            sections = list(zip(sub_titles, section_content))
        except Exception:
            sections = None

    try:
        links = wikipage.links
    except Exception:
        links = None

    return {'title': wikipage.title,
            'summary': summary,
            'sections': sections,
            'links': links}
def convert_encoding_to_utf_8(filename):
    """Transcode *filename* to UTF-8-SIG and convert its name and content to
    Traditional Chinese.

    Side effects: increments the module-level ``total_cnt`` /
    ``success_cnt`` counters and, when anything changed, backs up and
    renames the file before rewriting it.
    """
    global total_cnt, success_cnt
    total_cnt += 1

    # Read raw bytes with a context manager -- the original codecs.open
    # handle was never closed.
    with open(filename, 'rb') as fh:
        raw = fh.read()
    source_encoding = chardet.detect(raw)['encoding']

    filename_trans = HanziConv.toTraditional(filename)
    name_changed = filename_trans != filename

    encoding_changed = source_encoding not in ('utf-8', 'UTF-8-SIG')
    # BUGFIX: always decode to str.  The original skipped decoding when the
    # source was already UTF-8 and then handed raw bytes to HanziConv,
    # which fails on Python 3.
    content = raw.decode(source_encoding or 'utf-8', 'ignore')

    content_trans = HanziConv.toTraditional(content)
    content_changed = content_trans != content

    if name_changed or encoding_changed or content_changed:
        backup(filename)
        os.rename(filename, filename_trans)
        with open(filename_trans, 'w', encoding='UTF-8-SIG') as file:
            file.write(content_trans)
        success_cnt += 1
def convert_dir(root_dir):
    """Walk *root_dir* and apply the conversions enabled in ``function_list``:
    [0] rename folders/files to Traditional Chinese, [1] transcode file
    content to UTF-8, [2] convert file content to Traditional Chinese.
    Only files whose name contains one of the ``suffix`` entries are touched.
    """
    global function_list
    if not os.path.exists(root_dir):
        print("[error] dir:", root_dir, "do not exit")
        return
    print("work in", root_dir)

    # Pass 1: convert folder names.
    if function_list[0] == 1:
        for root, dirs, files in os.walk(root_dir):
            root_trans = HanziConv.toTraditional(root)
            if root_trans != root:
                os.rename(root, root_trans)

    # Pass 2: per-file work (separate walk so renamed folders are re-listed).
    for root, dirs, files in os.walk(root_dir):
        for f in files:
            filename = os.path.join(root, f)
            # BUGFIX: filename_trans was unbound (NameError) further down
            # whenever the rename feature was disabled.
            filename_trans = filename
            if function_list[0] == 1:
                filename_trans = HanziConv.toTraditional(filename)
                if filename_trans != filename:
                    os.rename(filename, filename_trans)

            wanted = any(suf in filename_trans for suf in suffix)
            if (function_list[1] == 1 or function_list[2] == 1) and wanted:
                # Read file once, then back it up before modifying.
                with open(filename_trans, 'rb') as fh:
                    content = fh.read()
                backup(filename_trans)

                if function_list[1] == 1:
                    try:
                        content = convert_encoding_to_utf_8(filename_trans, content)
                    except Exception:
                        print("Fail Convert utf-8", filename)

                if function_list[2] == 1:
                    try:
                        toTraditional(filename_trans, content)
                    except Exception:
                        print("Fail Convert", filename)
def __init__(self, title, author, author_role, body, form='simplified'):
    """Store the poem fields, normalised to one character set.

    :param form: 'simplified' or 'traditional'; every text field is
        converted with HanziConv accordingly.
    :raises ValueError: for any other *form* value.
    """
    if form == 'simplified':
        convert = HanziConv.toSimplified
    elif form == 'traditional':
        convert = HanziConv.toTraditional
    else:
        # BUGFIX: corrected the typo "Unrecongnized" in the error message.
        raise ValueError(f'Unrecognized form: {form}')
    self.title = convert(title)
    self.author = convert(author)
    self.author_role = convert(author_role)
    self.body = convert(body)
def trad_and_simp(inputString):
    '''
    Expand a string of Chinese characters so it contains both the
    traditional and the simplified form of every character present.

    Whatever form is missing is added; the result is in no guaranteed
    order, just guaranteed to contain both character sets where possible.

    Parameters
    ----------
    inputString : String
        A string containing traditional and/or simplified Chinese
        characters.

    Returns
    -------
    String
        A string containing the traditional and simplified versions of
        every Chinese character found in the input string.
    '''
    chars = set(inputString)
    # set.union over a string adds its individual characters.
    for variant in (HanziConv.toSimplified(inputString),
                    HanziConv.toTraditional(inputString)):
        chars |= set(variant)
    return "".join(chars)
def run(
    app: str = typer.Option(default="Spotify", help="Application to track"),
    debug: bool = typer.Option(default=False, is_flag=True, help="To show debug messages or not"),
    traditional: bool = typer.Option(
        default=False,
        is_flag=True,
        help="Translate lyrics into Traditional Chinese if possible",
    ),
):  # pragma: no cover
    """Fetch lyrics for the track currently playing in *app* and print the
    line matching the playback position (optionally in Traditional Chinese).
    """
    # Toggle logging according to --debug (dispatch-dict kept from original).
    {True: logger.enable, False: logger.disable}[debug]("touchbar_lyric")
    if not debug:
        logger.disable("touchbar_lyric")
        logger.disable("__main__")

    media_info = get_info(app)
    if media_info is None:
        return

    songs = universal_search(media_info.name, media_info.artists)
    for song in songs:
        # FIX: the original called song.anchor() twice per song; call once.
        line: str = song.anchor(media_info.position)
        if line:
            if traditional:
                line = HanziConv.toTraditional(line)
            print(line)
            break
def send_reuqest(user_id, req_text, api_key):
    """POST *req_text* (converted to Simplified Chinese) to the Tuling bot
    API and return its JSON reply with ``text`` converted back to
    Traditional Chinese.  ``success`` is False on network failure or a
    non-200 response."""
    payload = {
        "key": api_key,
        "info": HanziConv.toSimplified(req_text),
        #"loc": ""
        "userid": user_id,
    }
    result = {
        "success": False,
    }
    try:
        resp = requests.post(TULING_123_URL, data=payload, timeout=TIMEOUT)
        if resp.status_code == 200:
            result["success"] = True
            result.update(resp.json())
            if 'text' in result:
                result['text'] = HanziConv.toTraditional(result['text'])
        else:
            print(resp.text)
    except requests.RequestException:
        pass
    return result
def translate(translate_file_path):
    """Rewrite the file at *translate_file_path* in Traditional Chinese.

    BUGFIX: the conversion now happens *before* the file is reopened for
    writing.  The original opened the file with mode "w" (which truncates)
    first, so a HanziConv failure destroyed the original content.
    """
    with open(file=translate_file_path, mode="r", encoding="utf-8") as file:
        content = file.read()
    if content:
        content = HanziConv.toTraditional(content)
        with open(file=translate_file_path, mode="w", encoding="utf-8") as file:
            file.write(content)
def get_Xueqiu_categories(self):
    """Scrape the Xueqiu US-market category links listed under the 明星股
    section and hand the {name: url} mapping to ``self.GICS_csvs``.
    Retries forever until one scrape succeeds.
    """
    from hanziconv import HanziConv
    from selenium import webdriver
    from webdriver_manager.chrome import ChromeDriverManager
    url = 'https://xueqiu.com/hq#exchange=US&industry=3_2&firstName=3&page=1'
    while 1:
        driver = None
        try:
            driver = webdriver.Chrome(ChromeDriverManager().install())
            driver.get(url)
            driver.implicitly_wait(10)
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            categories = {}
            for ele in soup.find_all('i', {'class' : 'list-style'}):
                if re.search("明星股", ele.parent.text):
                    for li in ele.parent.find_all('li'):
                        key = HanziConv.toTraditional(li.text).strip()
                        link = "https://xueqiu.com/hq{}".format(li.select('a')[0]['href'].strip())
                        categories[key] = link
            driver.quit()
            break
        except Exception:
            traceback.print_exc()
            # BUGFIX: the original called driver.quit() unconditionally and
            # raised NameError when webdriver.Chrome() itself failed.
            if driver is not None:
                driver.quit()
    self.GICS_csvs(categories)
def get_stock_info(self, stock_name, use_proxy=True):
    """Fetch the profile-detail text for *stock_name* from Xueqiu, converted
    to Traditional Chinese.  Retries up to ``self.RETRY`` times (optionally
    through ``self.get_proxy()``) and returns '' when all attempts fail.
    """
    from hanziconv import HanziConv
    headers = {
        'X-Requested-With': 'XMLHttpRequest',
        'Referer': 'http://xueqiu.com/p/ZH010389',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0',
        'Host': 'xueqiu.com',
        #'Connection':'keep-alive',
        #'Accept':'*/*',
        'cookie':'s=iabht2os.1dgjn9z; xq_a_token=02a16c8dd2d87980d1b3ddced673bd6a74288bde; xq_r_token=024b1e233fea42dd2e0a74832bde2c914ed30e79; __utma=1.2130135756.1433017807.1433017807.1433017807.1;'
        '__utmc=1; __utmz=1.1433017807.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); Hm_lvt_1db88642e346389874251b5a1eded6e3=1433017809; Hm_lpvt_1db88642e346389874251b5a1eded6e3=1433017809'
    }
    # Hoisted out of the retry loop: compile the extraction pattern once.
    reGetStockInfo = re.compile(r"profile-detail.*?\">(.*?)<", re.S | re.UNICODE)
    for _ in range(self.RETRY):
        try:
            proxies = {}
            if use_proxy:
                proxies = self.get_proxy()
                print("PROXY => {:}".format(proxies))
            res = requests.get("https://xueqiu.com/S/" + stock_name,
                               headers=headers, proxies=proxies,
                               timeout=self.REQUEST_TIMEOUT)
            for stockInfo in reGetStockInfo.findall(res.text):
                return HanziConv.toTraditional(stockInfo)
        except Exception:
            # Narrowed from a bare except; still logs and retries.
            traceback.print_exc()
            time.sleep(1)
    return ''
def cut(string, using_stopwords=True, simplified_convert=True, log=False):
    """Tokenise *string* with jieba's search-mode cutter.

    :param using_stopwords: drop tokens listed in stopwords.txt.
    :param simplified_convert: convert to Simplified before cutting and
        back to Traditional afterwards (jieba's dictionaries are Simplified).
    :param log: print the tokens removed by the stopword filter.
    :returns: list of tokens.
    """
    string = string.lower()
    if simplified_convert:
        string = HanziConv.toSimplified(string)
    # Replace every digit/punctuation listed in digit_mark.json with a space.
    with open(os.path.join(BASE_DIR, 'digit_mark.json'), encoding='utf-8') as data_file:
        digit_mark = json.load(data_file)
    for digit in digit_mark:
        string = string.replace(digit, ' ')

    tokens = list(jieba.cut_for_search(string))
    if simplified_convert:
        tokens = [HanziConv.toTraditional(tok) for tok in tokens]
        tokens = [tok for tok in tokens if tok.strip() != '']

    if using_stopwords:
        # Use a set for O(1) membership (the original scanned a list).
        with open(os.path.join(BASE_DIR, 'stopwords.txt'), encoding='utf-8') as data_file:
            stopwords = {line.replace('\n', '') for line in data_file.readlines()}
        if log:
            removed_tokens = [tok for tok in tokens if tok in stopwords]
            if removed_tokens:
                print('token removed : ' + ", ".join(removed_tokens))
        tokens = [tok for tok in tokens if tok not in stopwords]
    # (The original also did a redundant tokens = list(tokens) copy here.)
    return tokens
def print_in_line_reverse(row, msg, style, lang):
    '''
    Lay *msg* out as vertical right-to-left text with *row* rows.

    The character at print position (row i, column j) is msg[j * row + i];
    each physical line is built left-to-right and then reversed, and lines
    are joined with "<br>".  Positions past the end of msg are padded with
    the filler glyph '㍐'.

    :param lang: 'S' converts msg to Simplified, 'T' to Traditional,
        anything else leaves it unchanged.
    :returns: the assembled HTML string, or None for an empty message.
    '''
    if not msg:
        return None
    if lang == 'S':
        msg = HanziConv.toSimplified(msg)
    elif lang == 'T':
        msg = HanziConv.toTraditional(msg)
    msg = sub(msg)
    len_col = math.ceil(len(msg) / row)  # columns needed, rounded up
    big_line = ''
    for i in range(row):
        line = ''
        for j in range(len_col):
            idx = j * row + i
            # Bounds check replaces the original's try/except-IndexError
            # padding -- same output, no exception-driven control flow.
            char = msg[idx] if idx < len(msg) else '㍐'
            line += char + style
        line = line[::-1]
        big_line += line + '<br>'
    return big_line
def get_people_name(self):
    """Return the first person name found in the article's main content,
    converted to Traditional Chinese, or None when there is no content or
    no name-tagged term."""
    # FIX: call get_main_content() once (the original called it twice) and
    # compare with "is None" instead of "!= None".
    content = self.get_main_content()
    if content is None:
        return None
    term_list = segment.seg(HanziConv.toSimplified(content))
    for term in term_list:
        if str(term.nature) == NLP_Constant.people_name_pos:
            return HanziConv.toTraditional(str(term.word))
    return None
def simplified_to_traditional(self):
    """Convert wiki_text.txt from Simplified to Traditional Chinese,
    streaming line by line into traditional.txt.

    FIX: both files are now managed by a single ``with`` block; the original
    opened the output handle manually and leaked it if opening the input
    file raised.
    """
    logging.info("等待中..(簡 to 繁)")
    with open("traditional.txt", "w", encoding="utf-8") as traditional, \
         open("wiki_text.txt", "r", encoding="utf-8") as simplified:
        for s in simplified:
            traditional.write(HanziConv.toTraditional(s))
    print("成功簡體轉繁體!")
def toTraditional(filename, content):
    """Convert *content* to Traditional Chinese and, only if anything
    actually changed, overwrite *filename* with the result encoded as
    UTF-8 with BOM."""
    converted = HanziConv.toTraditional(content)
    if converted == content:
        return
    with open(filename, 'w', encoding='UTF-8-SIG') as out:
        out.write(converted)
def pre_process(text):
    """Convert *text* to Traditional Chinese, segment it with jieba, and
    return the tokens as a single space-separated string."""
    text = HanziConv.toTraditional(text)
    # load cantonese corpus (left disabled, as in the original)
    # jb.load_userdict('util/dict/canto_dict.txt')
    return " ".join(jb.cut(text))
def create_post():
    """Render the post-creation form; on a valid submit, persist a new Post
    (title and Chinese content converted to Traditional) and redirect home."""
    form = PostForm()
    if not form.validate_on_submit():
        # First visit or validation failure: just render the form again.
        return render_template('create_post.html', title='New Post',
                               form=form, legend='New Post')
    post = Post(author=current_user,
                title=HanziConv.toTraditional(form.title.data),
                chinese_content=HanziConv.toTraditional(form.chinese_content.data),
                content=form.content.data,
                tags=form.tags.data)
    db.session.add(post)
    db.session.commit()
    flash('Your post has been created!', 'success')
    return redirect(url_for('home'))
def process_text(self):
    """Stream wiki_text.txt line by line, writing each line converted to
    Traditional Chinese into traditional.txt."""
    logging.info("等待中..(簡 to 繁)")
    with open('./word2vec_data/traditional.txt', 'w', encoding='utf-8') as fw, \
         open('./word2vec_data/wiki_text.txt', 'r', encoding='utf-8') as f:
        for line in f:
            fw.write(HanziConv.toTraditional(line))
def inputTest():
    """Read one line from the user, get the chatterbot's reply, convert it
    to Traditional Chinese and print it (with debug type output)."""
    prompt = input("請說話:")  # prompt: user utterance (token)
    reply = HanziConv.toTraditional(jerry.get_response(prompt).text)
    print(type(prompt))
    print(type(reply))
    print(reply)
def Transform_ZhTw_Save(self, File_Name, Next_FileName):
    """Convert each line of *File_Name* to Traditional Chinese and write the
    result to *Next_FileName* as UTF-8 bytes, character by character.

    NOTE(review): the source file is opened in binary mode, so HanziConv
    receives bytes objects -- this reproduces the original behaviour but
    looks Python-2 era; confirm it works under the targeted interpreter.
    """
    converted_lines = []
    with open(File_Name, 'rb') as raw_file:
        for line in raw_file:
            converted_lines.append(HanziConv.toTraditional(line))
    with open(Next_FileName, 'wb') as out_file:
        for row in converted_lines:
            for ch in row:
                out_file.write(ch.encode('utf-8'))
def textrankJob(n):
    # For extractor *n* ('tfidf' or a textrank variant), match its keywords
    # against 7 test datasets and write per-article match reports to CSV.
    # Each dataset file alternates article-title lines and content lines
    # (tracked with the `flag` toggle below).
    # get keyword
    keyword = textrankGet(n)  # list of keyword groups (one list per setting)
    # read testdata line by line
    for i in range(1, 8):
        with open('./finalResult/' + n + 'dataset' + str(i) + '.csv', 'w', newline='', encoding='utf-8') as res:
            writer = csv.writer(res)
            with open('./testData/dataset' + str(i) + '.txt', 'r', newline='', encoding='utf-8') as txtfile:
                tr = txtfile.readlines()
                flag = True  # True => current line is an article title
                for t in tr:
                    if flag is True:
                        article = t
                    else:
                        # store keyword match on article content
                        keywordMatch = []
                        content = t
                        # start match keyword and content
                        for index in keyword:
                            temp = []
                            for k in index:
                                # tfidf keywords come back Simplified; convert
                                # before substring matching.
                                if n == 'tfidf':
                                    k = HanziConv.toTraditional(k)
                                if k in content:
                                    temp.append(k)
                            keywordMatch.append(temp)
                        # write match result to csv
                        writer.writerow([article.strip()])
                        writer.writerow([content.strip()])
                        if n == 'tfidf':
                            # tfidf has a single keyword group.
                            tempkeyword = []
                            string = "Result:"
                            tempkeyword.append(string)
                            for k in keywordMatch[0]:
                                tempkeyword.append(k)
                            writer.writerow(tempkeyword)
                            # NOTE(review): writerow("\n") emits a single
                            # field containing a newline -- presumably meant
                            # as a blank separator row; confirm.
                            writer.writerow("\n")
                        else:
                            # Three textrank damping settings, one row each.
                            exp_value = [0.4, 0.5, 0.6]
                            for j in range(3):
                                tempkeyword = []
                                tempkeyword.append(exp_value[j])
                                for k in keywordMatch[j]:
                                    tempkeyword.append(k)
                                writer.writerow(tempkeyword)
                                writer.writerow("\n")
                    # Alternate between title and content lines.
                    flag = not flag
    print("------------------------------------------")
def preprocess(self, line, cond=None):
    """Normalise one text line: convert to Traditional Chinese, optionally
    keep only Chinese tokens (cond == 'only_zh'), then collapse whitespace,
    strip and lowercase."""
    converted = HanziConv.toTraditional(line)
    # line = re.sub(r"\@[a-z0-9][a-z0-9]*", '', line)
    # line = re.sub(r"\#[a-z0-9][a-z0-9]*", '', line)
    # line = re.split(r"\([a-z][a-z]\)", line.lower())[0]
    if cond == 'only_zh':
        converted = ' '.join(w for w in jieba.cut(converted) if is_zh.search(w))
    return re.sub("\s+", ' ', converted).strip().lower()
def get_words(path):
    # Read *path* line by line, convert each line to Traditional Chinese,
    # and return the accumulated result as a list.
    #
    # NOTE(review): `words += word` extends the list with the INDIVIDUAL
    # CHARACTERS of each converted line (str is iterable), not with whole
    # words.  If one entry per line was intended, this should be
    # `words.append(word)` -- confirm against the callers before changing.
    words = []
    with codecs.open(path, 'r', 'utf8') as f:
        line = f.readline()
        while line:
            # strip() already removes the trailing newline; the extra
            # replace('\n', '') is redundant but harmless.
            word = line.strip().replace('\n', '')
            word = HanziConv.toTraditional(word)
            words += word  # char-by-char extension; see NOTE above
            line = f.readline()
    return words
def to_traditional_chinese(content):
    """Best-effort conversion of *content* to Traditional Chinese.

    Returns *content* unchanged (after logging a warning) when the optional
    ``hanziconv`` dependency is not installed.
    """
    converted = content
    try:
        from hanziconv import HanziConv
        converted = HanziConv.toTraditional(content)
    except ImportError:
        # FIX: logging.warn is a deprecated alias; use logging.warning.
        logging.warning(
            'You need to install python module "HanziConv" to convert to traditional Chinese.'
        )
    return converted
def checkUpdate(self):
    """Return True (and refresh the cached latest-chapter fields, including
    the Traditional-Chinese title) when the source reports a chapter title
    different from the cached one; otherwise return False."""
    _, current_title = self.getLatestChapter()
    if current_title == self.latest_chapter_title:
        return False
    self.latest_chapter_url, self.latest_chapter_title = self.getLatestChapter()
    self.latest_chapter_title_cht = HanziConv.toTraditional(self.latest_chapter_title)
    return True
def __init__(self, name, url) -> None:
    """Cache comic metadata (code is the second-to-last URL path segment)
    and look up the latest chapter once, storing its title both as-is and
    converted to Traditional Chinese."""
    self.name = name
    self.url = url
    self.code = url.rsplit("/")[-2]
    self.a_link = f"/comic/{self.code}/"
    self.chapter_count = 0
    latest = self.getLatestChapter()
    self.latest_chapter_url, self.latest_chapter_title = latest
    self.latest_chapter_title_cht = HanziConv.toTraditional(self.latest_chapter_title)
def chatBot_GET_Google(question):
    """Google for "<question> 維基百科" and return the last path segment of
    the first cited URL, converted to Traditional Chinese.  Prints an error
    message and returns None when the request fails."""
    url = 'https://www.google.com.tw/search?q=' + question + '+維基百科'
    response = requests.get(url)
    if response.status_code != 200:
        print('請求失敗')
        return None
    soup = BeautifulSoup(response.text, 'lxml')
    cited = soup.find('cite')
    keyword = cited.text.split('/')[-1]
    return HanziConv.toTraditional(keyword)
def concept_lookup(self):
    """Populate self.commonsense for a single conception: try the local
    Traditional-Chinese lookup first, then fall back to the English
    translation when nothing is found."""
    print('find only one conception,so get its commonsense at most 10')
    hits = Query.base_lookup(HanziConv.toTraditional(self.conceptions))
    if not hits:
        # Nothing found locally -- retry with the English translation.
        hits = Query.base_lookup(self.translator.zh_to_en(self.conceptions))
    self.commonsense = set(hits)
def subot_getGoogle(question):
    """Google for "<question> 維基百科" and return the last path segment of
    the first cited URL, converted to Traditional Chinese.  Prints an error
    message and returns None when the request fails."""
    response = requests.get(f'https://www.google.com.tw/search?q={question}+維基百科')
    if response.status_code != 200:
        print('解讀後轉換關鍵字失敗....')
        return None
    cite_tag = BeautifulSoup(response.text, 'lxml').find('cite')
    keyword = cite_tag.text.split('/')[-1]
    return HanziConv.toTraditional(keyword)
def articles_parser_insert_mysql(self):  #74218
    """Parse titles/contents of articles with id 198886..200000 and insert
    the parse results into articles_parser.

    Only records whose initial (Simplified) title parse returns "error" are
    re-parsed as Traditional Chinese and inserted -- this mirrors the
    original control flow.
    """
    self.cursor.execute(
        "SELECT id, title, content FROM articles where id >= 198886 and id <= 200000"
    )
    sql = "INSERT INTO articles_parser (id, title_parser_result, content_parser_result) VALUES (%s, %s, %s)"
    results = self.cursor.fetchall()
    # Punctuation stripped before parsing; hoisted so it is written once.
    strip_pattern = r"[\s+\.\【\】\‧\!\/_,$%^*(+\"\']+|[+——!,。?、~@#¥%……&*()]+"
    for record in results:
        index = record[0]
        title = record[1]
        content = record[2]
        print(index)
        print(title, end="\n\n")
        print(content)
        if title == "":
            # BUGFIX: the original fell through here, leaving
            # title_parser_result unbound (or stale from the previous
            # record) when it was used below.
            continue
        title_parser_result = parsing.Parser(re.sub(strip_pattern, "", title))
        if len(title_parser_result) == 0:
            continue
        if title_parser_result[0] != "error":
            # Behaviour preserved from the original: successful first
            # parses are skipped entirely.
            continue
        title = HanziConv.toTraditional(title)
        title_parser_result = parsing.Parser(re.sub(strip_pattern, "", title))
        # Split content into candidate sentences on Chinese punctuation.
        content = re.sub(r'\、|\,|\。|\?|\?|\;|\;|\:|\~|\:|\⋯|\!', '\n', content)
        content_parser_result = ""
        for line in content.split("\n"):
            line = re.sub(strip_pattern, "", line)
            if len(line) >= 4 and '★' not in line and '◆' not in line:
                print(line)
                parser_result = parsing.Parser(line)
            else:
                continue
            # BUGFIX: "len(parser_result) is not 1" compared object identity,
            # not value; use != for the intended length check.
            if line == "" or len(parser_result) != 1 or parser_result[0] == 'error':
                continue
            content_parser_result += parser_result[0]
            content_parser_result += "@"
            time.sleep(self.sleep)
        val = (index, title_parser_result[0], content_parser_result)
        print(title_parser_result[0], end="\n\n")
        print(content_parser_result)
        self.cursor.execute(sql, val)
        self.db.commit()
    self.db.close()
def generate():
    """Caption the currently-open image, show the Traditional-Chinese result
    in the text browser and speak it.  Shows an instruction message when no
    image has been opened yet."""
    # BUGFIX: "win.img_shown is 0" compared identity with an int literal
    # (a SyntaxWarning in modern Python); use == for value equality.
    if win.img_shown == 0:
        win.textBrowser.setText("請先開啟圖片")
        win.textBrowser.setFont(QtGui.QFont("Noto Sans Mono CJK TC", 17))
    else:
        win.textBrowser.setText('請稍等...')
        win.textBrowser.setFont(QtGui.QFont("Noto Sans Mono CJK TC", 17))
        predicted_cap = HanziConv.toTraditional(
            predict('./train_captions', "./ckpt-20", win.img_path))
        win.textBrowser.setText(predicted_cap)
        # win.textBrowser.setFont(win.def_font)
        speak(predicted_cap)
def hello():
    """Flask endpoint (Python 2 / urllib2 code): convert the submitted name
    to Traditional Chinese, query the remote naming service, and render the
    result page."""
    name = request.form['checking']
    temp_name = HanziConv.toTraditional(name)
    # name = HanziConv.toSimplified(name)
    name = urllib2.quote(name.encode('utf-8'))
    service_url = "http://csclab11.cs.nthu.edu.tw:5000/?q=%s" % name
    d = json.loads(urllib2.urlopen(service_url).read())
    kangxi = HanziConv.toTraditional(d["result"])
    kangxi = urllib2.quote(kangxi.encode('utf-8'))
    url_kang = "http://kxgen.mqstudiotw.com/?%s" % kangxi
    # Response object is fetched but never consumed (kept from original).
    kangxi_result = urllib2.urlopen(url_kang)
    return render_template('index.html', name=temp_name, result=d["result"])
def writeDBF(filePattern, fullFilePath, dicInput):
    # Dump dicInput as "<key>.<SH|SZ>,<value>" lines into a dated text file
    # (YYYYMMDD.SH.txt / YYYYMMDD.SZ.txt), converting values to Traditional
    # Chinese.  Python 2 code: uses dict.iteritems() and str/unicode mixing.
    #
    # NOTE(review): bFileExists, writeMax, dbfFileHandle/dbfFileIndex and
    # updateCount are declared/computed but never used here -- presumably
    # leftovers from an earlier DBF-writing implementation; confirm before
    # removing.
    global dbfFileHandle
    global dbfFileIndex
    global writeMax
    # dbfFileHandle = None
    # dbfFileIndex = None
    insertCount = 0;
    updateCount = 0;
    bFileExists = os.path.exists(fullFilePath)
    dtWriteDBFStart = datetime.datetime.now()
    # logger.debug("write DBF start")
    today = dtWriteDBFStart.strftime("%Y%m%d")
    fileName = today
    strToken = ""
    if filePattern == "0":
        strToken = "SH"
        fileName += ".SH.txt"
    elif filePattern == "1":
        strToken = "SZ"
        fileName += ".SZ.txt"
    with open(fileName, "w") as text_file:
        for key, value in dicInput.iteritems():
            insertCount += 1
            value = HanziConv.toTraditional(value)
            # Best-effort decode: converted value may already be unicode, in
            # which case decode() raises and is ignored (bare except kept
            # from original).
            try:
                value = value.decode("utf8")
            except:
                pass
            strWrite = (u"%s.%s,%s\n" % (key, strToken, value))
            text_file.write(strWrite.encode('utf8'))
    dtWriteDBFEnd = datetime.datetime.now()
    logger.debug("write count : " + str(insertCount) + "/" + str(updateCount))
    logger.debug("write DBF end (" + str(dtWriteDBFEnd - dtWriteDBFStart) + ")")
def get_json_from_page(page):
    # Build constraint token sets from the page's categories and summary
    # (converted to Traditional Chinese) and return the places extracted
    # for the page title under their union.
    from hanziconv import HanziConv
    stopwords = load_stop_words()
    # Presumably page.categories is a list of strings, so this joins whole
    # category names with 。 before tokenising -- confirm against the
    # wiki-page wrapper in use.
    cat_constrain_set = set(tokenize(HanziConv.toTraditional("。".join(page.categories)),stopwords))
    # NOTE(review): page.summary is typically a single string; joining it
    # inserts 。 between EVERY CHARACTER.  If whole-text tokenisation was
    # intended this should probably be just page.summary -- confirm.
    summary_constrain_set = set(tokenize(HanziConv.toTraditional("。".join(page.summary)),stopwords))
    return get_places(page.title,cat_constrain_set|summary_constrain_set)
def gen_response(keyword_list):
    """Print the canned reply for the first keyword in *keyword_list*
    (converted to Traditional Chinese first).  Raises KeyError for
    keywords without a canned reply."""
    replies = {"笑話":"你想要聽我說個笑話嗎", "無聊":"那聽個笑話好嗎"}
    key = HanziConv.toTraditional(keyword_list[0])
    print(replies[key])
# but it's used this way "操你", "草你", "日你", # f**k you "操他", "草他", "日他", # f**k his "操她", "草她", "日她", # f**k her # Discrimination (racial slurs) "小日本", # little Japanese "台湾狗", # Taiwanese dogs "共产中国", # communist Chinese "流氓国家", # rogue country "人渣", # human slag "我去", # this is verbal and bad "鬼子" # devil, usually a suffix ] BAD = [HanziConv.toSimplified(word) for word in bad_init] + \ [HanziConv.toTraditional(word) for word in bad_init] INFORMAL = [ # Hello "你好", # nǐ hǎo; The standard "hello" greeting. "您好", # nín hǎo; The same "hello" greeting as above "你怎么样", # nǐ zěnmeyàng?; "What's up?", "How are you doing?" # Good afternoon "午安", # wǔ'an; note: seldom used in the Mainland. "下午好", # xìawǔ hǎo! Seldom used in the Republic of China # Good evening / Good night "晚安", # wǎn'an; Literally "Peace at night", Good night. "晚上好", # wǎnshang hǎo; Good evening!
m = re.search(ur"^(\[.+?\])(.+?):", s) if m: s = m.group(2) + m.group(1) else: m = re.search(ur"^\[.+?\](.*)", s) if m: s = m.group(1) return s if __name__ == "__main__": import argparse parser = argparse.ArgumentParser() parser.add_argument("input", action="store", nargs = 1) parser.add_argument("output", action="store", nargs = 1) parser.add_argument("--encoding", action="store", default="utf_8_sig", nargs=1) parser.add_argument("--traditional", action="store_true", default=False) args = parser.parse_args() buf = codecs.open(args.input[0], "rb", args.encoding).read() if args.traditional: buf = HanziConv.toTraditional(buf) else: buf = HanziConv.toSimplified(buf) lines = buf.split("\n") lines.sort(key = sort_func) codecs.open(args.output[0], "wb", args.encoding).writelines(lines)
def get_sentences(page):
    """Convert page.content to Traditional Chinese and return its sentences:
    each physical line is split on the 。 full stop."""
    from hanziconv import HanziConv
    text = HanziConv.toTraditional(page.content)
    sentences = []
    for row in text.splitlines():
        sentences += row.split('。')
    return sentences
# Script tail (Python 2): walk the previously-loaded `lines`, converting the
# value part of each "cmd value" entry to Traditional Chinese.  Comment lines
# (starting with '#' or '%') and unsplittable lines are kept verbatim.
#
# NOTE(review): lines whose value actually changed only increment the
# counter `n` -- they are never appended to new_lines; only single-character
# unchanged values reach the rebuilding branch.  This looks like a dropped
# `new_lines.append(...)`; confirm the intended behaviour.
new_lines = []
n = 0
for line in lines:
    if line[0] in "#%":
        new_lines.append(line)
        continue
    try:
        cmd, value = line.strip(' ').decode('utf-8').split(u' ', 1)
    except ValueError as e:
        # '\t' keyboard-mapping section: these lines have no space-separated
        # value, keep them untouched.
        new_lines.append(line)
        continue
    newv = HanziConv.toTraditional(value)
    if newv != value:
        # print value ,
        # print ' -> ',
        # print newv
        n += 1
    elif len(value.strip()) > 1:
        print value.strip()
        pass
    else:
        newl = line.strip().split(' ')[0].decode('utf-8') + ' ' + newv
        new_lines.append(newl.encode('utf-8'))
print len(lines)
print n