def test_conversion_mode(self):
    c = OpenCC('zhs2zht.ini')
    c.set_conversion_mode(0)
    try:
        c.set_conversion_mode(3)
    except ValueError as e:
        self.assertEqual(str(e), 'ValueError: conversion mode must be in [0,1,2].')
def test_convert2():
    cc = OpenCC()
    text = '乾坤一擲'
    expect = '乾坤一掷'
    assert cc.convert(text) == expect
    text = '開放中文轉換'
    expect = '开放中文转换'
    assert cc.convert(text) == expect
def __init__(self, appname, session_id):
    self.session_id = session_id
    config = RimeConfig()
    if not rime.config_open(appname.encode("UTF-8"), config):
        return
    self.font_face = rimeGetString(config, 'style/font_face')
    self.candidate_format = rimeGetString(config, 'style/candidate_format')
    self.inline_preedit = rimeGetString(config, 'style/inline_preedit')
    menu_opencc_config = rimeGetString(config, 'style/menu_opencc')
    self.menu_opencc = OpenCC(menu_opencc_config) if menu_opencc_config else None
    value = c_int()
    if rime.config_get_int(config, b'style/font_point', value):
        self.font_point = value.value
    if rime.config_get_bool(config, b'style/horizontal', value):
        self.candidate_per_row = 10 if bool(value) else 1
    if rime.config_get_int(config, b'style/candidate_per_row', value):
        self.candidate_per_row = value.value
    if rime.config_get_bool(config, b'style/display_tray_icon', value):
        self.display_tray_icon = bool(value)
    if rime.config_get_bool(config, b'style/candidate_use_cursor', value):
        self.candidate_use_cursor = bool(value)
    if rime.config_get_bool(config, b'style/soft_cursor', value):
        self.soft_cursor = bool(value)
    self.options.clear()
    self.options_states.clear()
    self.uris.clear()
    self.menu = self.config_get_menu(config, b'menu')
    # print("menu", self.menu)
    rime.config_close(config)
def __init__(self, *,
             data_path: str,
             scheduler: AsyncIOScheduler,
             quart_app: Quart,
             bot_api: Api,
             verinfo: str = None):
    # initialize config
    is_packaged = "_MEIPASS" in dir(sys)
    if is_packaged:
        basepath = os.path.dirname(sys.argv[0])
    else:
        basepath = os.path.dirname(__file__)
    dirname = os.path.abspath(os.path.join(basepath, data_path))
    if not os.path.exists(dirname):
        os.makedirs(dirname)
    config_f_path = os.path.join(dirname, "yobot_config.json")
    if is_packaged:
        default_config_f_path = os.path.join(
            sys._MEIPASS, "packedfiles", "default_config.json")
    else:
        default_config_f_path = os.path.join(
            os.path.dirname(__file__), "packedfiles", "default_config.json")
    with open(default_config_f_path, "r", encoding="utf-8") as config_file:
        self.glo_setting = json.load(config_file)
    if not os.path.exists(config_f_path):
        with open(config_f_path, "w") as f:
            f.write("{}")
        print("设置已初始化,发送help获取帮助")
    boss_filepath = os.path.join(dirname, "boss3.json")
    if not os.path.exists(boss_filepath):
        if is_packaged:
            default_boss_filepath = os.path.join(
                sys._MEIPASS, "packedfiles", "default_boss.json")
        else:
            default_boss_filepath = os.path.join(
                os.path.dirname(__file__), "packedfiles", "default_boss.json")
        shutil.copyfile(default_boss_filepath, boss_filepath)
    pool_filepath = os.path.join(dirname, "pool3.json")
    if not os.path.exists(pool_filepath):
        if is_packaged:
            default_pool_filepath = os.path.join(
                sys._MEIPASS, "packedfiles", "default_pool.json")
        else:
            default_pool_filepath = os.path.join(
                os.path.dirname(__file__), "packedfiles", "default_pool.json")
        shutil.copyfile(default_pool_filepath, pool_filepath)
    for e in os.environ:
        if e.startswith("YOBOT_"):
            k = e[6:].lower()
            self.glo_setting[k] = os.environ[e]
    with open(config_f_path, "r", encoding="utf-8-sig") as config_file:
        cfg = json.load(config_file)
        for k in self.glo_setting.keys():
            if k in cfg:
                self.glo_setting[k] = cfg[k]
    if verinfo is None:
        verinfo = updater.get_version(self.Version, self.Version_id)
        print(verinfo['ver_name'])
    # initialize database
    ybdata.init(os.path.join(dirname, 'yobotdata.db'))
    # enable gzip
    if self.glo_setting["web_gzip"] > 0:
        gzipped_types = {'text/html', 'text/javascript',
                         'text/css', 'application/json'}

        @quart_app.after_request
        async def gzip_response(response):
            accept_encoding = request.headers.get('Accept-Encoding', '')
            if (response.status_code < 200
                    or response.status_code >= 300
                    or len(await response.get_data()) < 1024
                    or 'gzip' not in accept_encoding.lower()
                    or 'Content-Encoding' in response.headers):
                return response
            gzip_buffer = BytesIO()
            gzip_file = gzip.GzipFile(
                mode='wb',
                compresslevel=self.glo_setting["web_gzip"],
                fileobj=gzip_buffer)
            gzip_file.write(await response.get_data())
            gzip_file.close()
            gzipped_response = gzip_buffer.getvalue()
            response.set_data(gzipped_response)
            response.headers['Content-Encoding'] = 'gzip'
            response.headers['Content-Length'] = len(gzipped_response)
            return response

    # initialize web path
    if not self.glo_setting.get("public_address"):
        try:
            res = requests.get("http://api.ipify.org/")
            ipaddr = res.text
        except:
            with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
                s.connect(("8.8.8.8", 53))
                ipaddr = s.getsockname()[0]
        self.glo_setting["public_address"] = "http://{}:{}/".format(
            ipaddr,
            self.glo_setting["port"],
        )
    if not self.glo_setting["public_address"].endswith("/"):
        self.glo_setting["public_address"] += "/"
    if not self.glo_setting["public_basepath"].startswith("/"):
        self.glo_setting["public_basepath"] = \
            "/" + self.glo_setting["public_basepath"]
    if not self.glo_setting["public_basepath"].endswith("/"):
        self.glo_setting["public_basepath"] += "/"
    # initialize update time
    if self.glo_setting["update-time"] == "random":
        self.glo_setting["update-time"] = "{:02d}:{:02d}".format(
            random.randint(2, 4),
            random.randint(0, 59)
        )
    # initialize client salt
    if self.glo_setting["client_salt"] is None:
        self.glo_setting["client_salt"] = web_util.rand_string(16)
    # save initialization
    with open(config_f_path, "w", encoding="utf-8") as config_file:
        json.dump(self.glo_setting, config_file, indent=4)
    # initialize utils
    templating.Ver = self.Version[2:-1]
    # generate random secret_key
    if quart_app.secret_key is None:
        quart_app.secret_key = bytes(
            (random.randint(0, 255) for _ in range(16)))
    # add mimetype
    mimetypes.init()
    mimetypes.add_type('application/javascript', '.js')
    mimetypes.add_type('image/webp', '.webp')

    # add route for static files
    @quart_app.route(
        urljoin(self.glo_setting["public_basepath"],
                "assets/<path:filename>"),
        methods=["GET"])
    async def yobot_static(filename):
        accept_encoding = request.headers.get('Accept-Encoding', '')
        origin_file = os.path.join(os.path.dirname(__file__),
                                   "public", "static", filename)
        if ('gzip' not in accept_encoding.lower()
                or self.glo_setting['web_gzip'] == 0):
            return await send_file(origin_file)
        gzipped_file = os.path.abspath(os.path.join(
            os.path.dirname(__file__),
            "public", "static",
            filename + "." + self.Version[1:-1] + ".gz",
        ))
        if not os.path.exists(gzipped_file):
            if not os.path.exists(origin_file):
                return "404 not found", 404
            with open(origin_file, 'rb') as of, open(gzipped_file, 'wb') as gf:
                with gzip.GzipFile(
                    mode='wb',
                    compresslevel=self.glo_setting["web_gzip"],
                    fileobj=gf,
                ) as gzip_file:
                    gzip_file.write(of.read())
        response = await make_response(await send_file(gzipped_file))
        response.mimetype = (
            mimetypes.guess_type(os.path.basename(origin_file))[0]
            or "application/octet-stream"
        )
        response.headers['Content-Encoding'] = 'gzip'
        response.headers['Vary'] = 'Accept-Encoding'
        return response

    # add route for output files
    if not os.path.exists(os.path.join(dirname, "output")):
        os.mkdir(os.path.join(dirname, "output"))

    @quart_app.route(
        urljoin(self.glo_setting["public_basepath"],
                "output/<path:filename>"),
        methods=["GET"])
    async def yobot_output(filename):
        return await send_file(os.path.join(dirname, "output", filename))

    # openCC
    self.ccs2t = OpenCC(self.glo_setting.get("zht_out_style", "s2t"))
    self.cct2s = OpenCC("t2s")
    # filter
    self.black_list = set(self.glo_setting["black-list"])
    self.black_list_group = set(self.glo_setting["black-list-group"])
    self.white_list_group = set(self.glo_setting["white-list-group"])
    # update runtime variables
    self.glo_setting.update({
        "dirname": dirname,
        "verinfo": verinfo
    })
    kwargs = {
        "glo_setting": self.glo_setting,
        "bot_api": bot_api,
        "scheduler": scheduler,
        "app": quart_app,
    }
    # load plugins
    plug_all = [
        updater.Updater(**kwargs),
        switcher.Switcher(**kwargs),
        yobot_msg.Message(**kwargs),
        gacha.Gacha(**kwargs),
        jjc_consult.Consult(**kwargs),
        push_news.News(**kwargs),
        calender.Event(**kwargs),
        homepage.Index(**kwargs),
        marionette.Marionette(**kwargs),
        login.Login(**kwargs),
        settings.Setting(**kwargs),
        web_util.WebUtil(**kwargs),
        clan_battle.ClanBattle(**kwargs),
    ]
    self.plug_passive = [p for p in plug_all if p.Passive]
    self.plug_active = [p for p in plug_all if p.Active]
    for p in plug_all:
        if p.Request:
            p.register_routes(quart_app)
    # load new plugins
    self.plug_new = [
        miner.Miner(**kwargs),
        group_leave.GroupLeave(**kwargs),
        custom.Custom(**kwargs),
    ]
import re

import jieba
from opencc import OpenCC
import nltk

nltk.download('punkt')
from nltk.tokenize import word_tokenize

openCC = OpenCC('t2s')
openCC_final = OpenCC('s2t')


def word_tokenize_fishbracket(en_sent):
    # Tokenize, then re-join angle-bracket tokens like "<s>" that NLTK
    # splits into "<", "s", ">". Note: assumes well-formed "< token >"
    # triples; a stray bracket at the end of the list can raise IndexError.
    en_sent = word_tokenize(en_sent)
    en_sent_out = []
    for i, w in enumerate(en_sent):
        if w == '<' and en_sent[i + 2] == '>':
            continue
        elif w == '>' and en_sent[i - 2] == '<':
            continue
        elif en_sent[i - 1] == '<' and en_sent[i + 1] == '>':
            en_sent_out.append('<' + w + '>')
            continue
        en_sent_out.append(w)
    return en_sent_out


ch_vocab = open('ch_vocab', encoding='utf-8').read().splitlines()
ch_word2id = {}
for line in ch_vocab:
    line = line.split(' ')
    word = line[0]
    ID = line[1]
    ch_word2id[word] = ID
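# Illustrative check (not from the original file), assuming NLTK's punkt data
# has been downloaded: word_tokenize splits "<s>" into "<", "s", ">", and
# word_tokenize_fishbracket re-joins it into a single token.
# word_tokenize_fishbracket('hello <s> world')  ->  ['hello', '<s>', 'world']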
import logging  # required by logger below (import missing from the excerpt)
from html import escape  # required by bold() below (import missing from the excerpt)
from typing import Any, Callable, Optional, Union
from unicodedata import normalize

from cryptography.fernet import Fernet
from opencc import OpenCC
from PIL import Image
from pyrogram import Message, User
from pyrogram.errors import FloodWait

from .. import glovar

# Enable logging
logger = logging.getLogger(__name__)

# Init Opencc
converter = OpenCC(config="t2s.json")


def bold(text: Any) -> str:
    # Get a bold text
    result = ""
    try:
        result = str(text).strip()
        if not result:
            return ""
        result = f"<b>{escape(result)}</b>"
    except Exception as e:
        logger.warning(f"Bold error: {e}", exc_info=True)
    return result
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import csv
import os

import pandas as pd
import numpy as np
import modeling
import optimization
import tokenization
import tensorflow as tf
from opencc import OpenCC

cc = OpenCC("t2s")

flags = tf.flags

FLAGS = flags.FLAGS

## Required parameters
flags.DEFINE_string(
    "data_dir", None,
    "The input data dir. Should contain the .tsv files (or other data files) "
    "for the task.",
)

flags.DEFINE_string(
    "bert_config_file",
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
Created on Mon Dec 23 22:56:08 2019

@author: kuangchen
"""
import regex
from bs4 import BeautifulSoup
from opencc import OpenCC

from file_io import save_data_list

cc = OpenCC('t2s')


def convert(texts):
    return [cc.convert(text) for text in texts]


def transform_2013(html_string):
    soup = BeautifulSoup(html_string, 'lxml')
    error_texts = []
    correct_texts = []
    sentences = soup.find_all('p')
    mistakes = soup.find_all('mistake')
    for sentence in sentences:
        text = sentence.get_text().strip()
        error_text = text
        correct_text = error_text[:]
def convert_chinese(content):
    content = OpenCC('t2s').convert(content)
    return content
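# A minimal variant sketch (names are mine, not from the original): building
# the converter once at module level avoids reloading the OpenCC dictionaries
# on every call, which the one-liner above pays for each time.
_T2S = OpenCC('t2s')


def convert_chinese_cached(content):
    # Same result as convert_chinese, but reuses a single converter instance.
    return _T2S.convert(content)

# e.g. convert_chinese_cached('漢字轉換') == '汉字转换'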
def get_txt(self, txt_id, state, threadnum):
    titlem = ''
    intro = ''
    ids = str(txt_id)
    percent = 0
    self.state = state
    # Article URL
    req_url = ids
    # Fetch the article page with cookies
    res = requests.get(req_url, headers=self.headerss).content
    # Re-encode the page
    ress = etree.HTML(
        res.decode("GB18030", "ignore").encode("utf-8", "ignore").decode('utf-8'))
    # Blurb
    intro = ress.xpath(
        "//html/body/table/tr/td[1]/div[2]/div[@id='novelintro']//text()")
    # Tags
    info = ress.xpath("string(/html/body/table[1]/tr/td[1]/div[3])")
    infox = []
    for i in range(1, 7):
        infox.append(
            ress.xpath(
                "string(/html/body/table[1]/tr/td[3]/div[2]/ul/li[" + str(i) + "])"))
    # Cover image
    cover = ress.xpath("string(/html/body/table[1]/tr/td[1]/div[2]/img/@src)")
    if cover != '':
        pres = requests.get(cover)
        img = pres.content
    else:
        img = "0"
    # Title
    titlem = ress.xpath("//html/head/title/text()")
    if self.state == 's':
        titlem[0] = OpenCC('t2s').convert(titlem[0])
    elif self.state == 't':
        titlem[0] = OpenCC('s2t').convert(titlem[0])
    print("网址:" + ids + "\r\n小说信息:" + str(titlem[0]) + "\r\n")
    # All chapter URLs, titles, and summaries
    self.td = ress.xpath('//*[@id="oneboolt"]//tr')
    for i in self.td:
        u = i.xpath('./td[2]/span/div[1]/a/@href')
        x = i.xpath('./td[2]/span/div[1]/a[1]/@rel')
        if len(u) > 0:
            self.href_list += u
            v = i.xpath('./td[2]/span/div[1]/a/text()')[0].strip()
            v = re.sub('&amp;', '&', v)
            v = re.sub('&gt;', '>', v)
            v = re.sub('&lt;', '<', v)
            self.titleindex.append(v)
            v = i.xpath('./td[3]/text()')[0].strip()
            v = re.sub('&amp;', '&', v)
            v = re.sub('&gt;', '>', v)
            v = re.sub('&lt;', '<', v)
            self.Summary.append(v)
        elif len(x) > 0:
            self.href_list += x
            v = i.xpath('./td[2]/span/div[1]/a/text()')[0].strip()
            v = re.sub('&amp;', '&', v)
            v = re.sub('&gt;', '>', v)
            v = re.sub('&lt;', '<', v)
            self.titleindex.append(v)
            v = i.xpath('./td[3]/text()')[0].strip()
            v = re.sub('&amp;', '&', v)
            v = re.sub('&gt;', '>', v)
            v = re.sub('&lt;', '<', v)
            self.Summary.append(v)
    # Volume label names
    self.rollSign = ress.xpath(
        "//*[@id='oneboolt']//tr/td/b[@class='volumnfont']/text()")
    # Volume label positions
    self.rollSignPlace = ress.xpath(
        "//*[@id='oneboolt']//tr/td/b/ancestor-or-self::tr/following-sibling::tr[1]/td[2]/span/div[1]/a[1]/@href"
    )
    self.rollSignPlace += ress.xpath(
        "//*[@id='oneboolt']//tr/td/b/ancestor-or-self::tr/following-sibling::tr[1]/td[2]/span/div[1]/a[1]/@rel"
    )
    section_ct = len(self.href_list)
    print("可下载章节数:" + str(section_ct) + "\r\n")
    # fillNum: width of the zero-padded chapter number; e.g. a novel with
    # 1437 chapters gets 4-digit numbers: 0001, 0002, ...
    self.fillNum = len(str(len(self.td) - 4))
    # Sanitize the title: remove characters that are illegal in file names
    ti = str(titlem[0]).split('_')
    ti = ti[0]
    ti = re.sub('/', '_', ti)
    ti = re.sub(r'\\', '_', ti)
    ti = re.sub(r'\|', '_', ti)
    ti = re.sub(r'\*', '', ti)
    ti = re.sub('&amp;', '&', ti)
    xaut = ti.split('》')[1]
    xauthref = ress.xpath("//*[@id='oneboolt']//h2/a/@href")[0]
    xtitle = re.sub('《', '', ti.split('》')[0])
    # Delete this line if you do not want the novel ID appended to the file name
    ti = ti + '[' + ids.split('=')[1] + ']'
    ti = re.sub('\r', '', ti)
    v = ""
    # Create the novel directory and write the metadata files
    path = os.getcwd()
    if os.path.exists(ti):
        os.chdir(ti)
    else:
        os.mkdir(ti)
        os.chdir(ti)
    self.index = []
    # Save the cover image
    if img != "0":
        pic = open("p.jpg", 'wb')
        pic.write(img)
        pic.close()
    # Write the cover page
    f = open("C.xhtml", 'w', encoding='utf-8')
    f.write('''<?xml version="1.0" encoding="utf-8"?><!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops">
<head><title></title></head><body><img alt="p" src="p.jpg"/></body></html>''')
    f.close()
    # Write the info (TOC) page
    fo = open("TOC.xhtml", 'w', encoding='utf-8')
    fo.write('''<?xml version="1.0" encoding="utf-8"?><!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops">
<head><title></title></head><body>''')
    fo.write("<h2><center><a href='" + req_url + "'>" + xtitle +
             "</a></center></h2><p></p>")
    fo.write("<h3 class='sigil_not_in_toc'><center><a href='" + xauthref +
             "'>" + xaut + "</a></center></h3><p></p>")
    fo.write('''<blockquote class="userstuff">''')
    # self.index.append(titlem[0])
    # Build the TOC text
    for l in self.href_list:
        titleOrigin = l.split('=')
        i = self.href_list.index(l)
        title = str(titleOrigin[2]).zfill(self.fillNum) + " "
        title = title + self.titleindex[i].strip()
        title = title + " " + self.Summary[i].strip()
        if self.state == 's':
            title = OpenCC('t2s').convert(title)
        elif self.state == 't':
            title = OpenCC('s2t').convert(title)
        if self.href_list[i] in self.rollSignPlace:
            v = self.rollSign[self.rollSignPlace.index(l)]
            if self.state == 's':
                v = OpenCC('t2s').convert(
                    self.rollSign[self.rollSignPlace.index(l)])
            elif self.state == 't':
                v = OpenCC('s2t').convert(
                    self.rollSign[self.rollSignPlace.index(l)])
            v = re.sub('&', '&amp;', v).rstrip()  # escape & before < and >
            v = re.sub('>', '&gt;', v)
            v = re.sub('<', '&lt;', v)
            self.index.append(v)
        title = re.sub('&', '&amp;', title).rstrip()  # escape & before < and >
        title = re.sub('>', '&gt;', title)
        title = re.sub('<', '&lt;', title)
        self.index.append(title)
    for ix in infox:
        ix = ix.strip()
        ix = re.sub('\r\n', '', ix)
        ix = re.sub(' +', '', ix)
        ix = re.sub('&', '&amp;', ix)
        ix = re.sub('>', '&gt;', ix)
        ix = re.sub('<', '&lt;', ix)
        fo.write("<p>" + ix + "</p>")
    fo.write("</blockquote>")
    fo.write("<p><b>文案:</b></p>")
    for nx in intro:
        v = re.sub(' +', ' ', str(nx)).rstrip()
        v = re.sub('&', '&amp;', v).rstrip()
        v = re.sub('>', '&gt;', v)
        v = re.sub('<', '&lt;', v)
        if self.state == 's':
            v = OpenCC('t2s').convert(v)
        elif self.state == 't':
            v = OpenCC('s2t').convert(v)
        if v != "":
            fo.write("<p>" + v + "</p>")
    info = re.sub(' +', ' ', info).strip()
    info = re.sub('&', '&amp;', info)
    info = re.sub('>', '&gt;', info)
    info = re.sub('<', '&lt;', info)
    if self.state == 's':
        info = OpenCC('t2s').convert(info)
    elif self.state == 't':
        info = OpenCC('s2t').convert(info)
    info = re.sub('搜索关键字', '</p><p>搜索关键字', info)
    info = re.sub('一句话简介:', '</p><p>一句话简介:', info)
    fo.write("<p>" + info + "</p>")
    fo.write("</body></html>")
    fo.close()
    count = 0
    tlist = []
    # Download every chapter concurrently
    with concurrent.futures.ThreadPoolExecutor(
            max_workers=threadnum) as executor:
        tlist = {
            executor.submit(self.get_sin, i): i for i in self.href_list
        }
        for future in concurrent.futures.as_completed(tlist):
            if self.percent < section_ct:
                print('\r 下载进度:%d/%d' % (self.percent, section_ct),
                      end='', flush=True)
    '''
    for i in self.href_list:
        thread = Thread(target=self.get_sin, args=(i,))
        tlist.append(thread)
        thread.start()
    for t in tlist:
        t.join()
    print('\r 下载进度:%d/%d\r\n' % (self.percent, section_ct), end='', flush=True)
    '''
    print('\r 下载完成,总进度:%d/%d\r\n' % (self.percent, section_ct),
          end='', flush=True)
    # input("\r\n请按回车键打包epub:")
    # Package as epub
    os.chdir(path)
    epub_name = ti + ".epub"
    epub = zipfile.ZipFile(epub_name, 'w')
    EPUB3.epubfile.create_mimetype(epub)
    EPUB3.epubfile.create_container(epub)
    os.chdir(ti)
    ppp = os.getcwd()
    EPUB3.epubfile.create_content(epub, ppp, xtitle, xaut)
    EPUB3.epubfile.create_info(epub, ppp, self.index, self.rollSign,
                               xtitle + "-" + xaut)
    EPUB3.epubfile.create_stylesheet(epub)
    for html in os.listdir('.'):
        basename = os.path.basename(html)
        if basename.endswith('jpg'):
            epub.write(html, "OEBPS/" + basename,
                       compress_type=zipfile.ZIP_DEFLATED)
        if basename.endswith('html'):
            epub.write(html, "OEBPS/" + basename,
                       compress_type=zipfile.ZIP_DEFLATED)
    epub.close()
    os.chdir(path)
    shutil.rmtree(ppp)
    print("\r\nepub打包完成")
def get_sin(self, l):
    titleOrigin = l.split('=')
    i = self.href_list.index(l)
    cont = requests.get(l, headers=self.headerss).content
    dot = etree.HTML(
        cont.decode("GB18030", "ignore").encode("utf-8", "ignore").decode('utf-8'))
    # tex: chapter body
    tex = dot.xpath(
        "//html/body/table[@id='oneboolt']/tr[2]/td[1]/div[@class='noveltext']/text()"
    )
    # he: chapter heading
    he = dot.xpath(
        "//html/body/table[@id='oneboolt']/tr[2]/td[1]/div[@class='noveltext']/div[2]/h2/text()"
    )
    # tex1: author's notes
    tex1 = dot.xpath(
        "//html/body/table[@id='oneboolt']/tr[2]/td[1]/div[@class='noveltext']/div[@class='readsmall']/text()"
    )
    # sign: position of the author's notes
    sign = dot.xpath("//*[@id='oneboolt']/tr[2]/td[1]/div/div[4]/@class")
    # Zero-padded chapter number
    title = str(titleOrigin[2]).zfill(self.fillNum) + " "
    # Chapter name
    title = title + self.titleindex[i].strip() + " "
    # Chapter summary
    title = title + self.Summary[i].strip()
    if self.state == 's':
        title = OpenCC('t2s').convert(title)
    elif self.state == 't':
        title = OpenCC('s2t').convert(title)
    if self.href_list[i] in self.rollSignPlace:
        v = self.rollSign[self.rollSignPlace.index(l)]
        if self.state == 's':
            v = OpenCC('t2s').convert(
                self.rollSign[self.rollSignPlace.index(l)])
        elif self.state == 't':
            v = OpenCC('s2t').convert(
                self.rollSign[self.rollSignPlace.index(l)])
    if len(he) == 0:
        print("第" + titleOrigin[2] + "章未购买或加载失败")
    else:
        # Create the chapter file
        fo = open("z" + str(titleOrigin[2].zfill(self.fillNum)) + ".xhtml",
                  'w', encoding='utf-8')
        fo.write('''<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops">
<head><title></title></head><body>''')
        # Write the volume label
        if self.href_list[i] in self.rollSignPlace:
            fo.write("<h2>" + v.rstrip() + "</h2>")
            print("\r\n" + v + "\r\n")
            fo.write("<h3 id='v'>" + title + "</h3>")
        # Write the chapter title
        else:
            fo.write("<h3>" + title + "</h3>")
        # Author's notes placed before the text
        if str(sign) == "['readsmall']":
            fo.write('''<blockquote class="userstuff">''')
            for m in tex1:
                # Strip watermark text and redundant whitespace
                vv = re.sub('@无限好文,尽在晋江文学城', '', str(m))
                v = re.sub(' +', ' ', vv).rstrip()
                v = re.sub('&', '&amp;', v).rstrip()
                v = re.sub('>', '&gt;', v)
                v = re.sub('<', '&lt;', v)
                if self.state == 's':
                    v = OpenCC('t2s').convert(v)
                elif self.state == 't':
                    v = OpenCC('s2t').convert(v)
                if v != "":
                    # Write the text line by line
                    fo.write("<p>" + v + "</p>")
            fo.write("</blockquote>")
            if len(tex1) != 0:
                fo.write("<hr/>")
            for tn in tex:
                vv = re.sub('@无限好文,尽在晋江文学城', '', str(tn))
                v = re.sub(' +', ' ', vv).rstrip()
                v = re.sub('&', '&amp;', v).rstrip()
                v = re.sub('>', '&gt;', v)
                v = re.sub('<', '&lt;', v)
                if self.state == 's':
                    v = OpenCC('t2s').convert(v)
                elif self.state == 't':
                    v = OpenCC('s2t').convert(v)
                if v != "":
                    fo.write("<p>" + v + "</p>")
        else:
            # Author's notes placed after the text
            for tn in tex:
                vv = re.sub('@无限好文,尽在晋江文学城', '', str(tn))
                v = re.sub(' +', ' ', vv).rstrip()
                v = re.sub('&', '&amp;', v).rstrip()
                v = re.sub('>', '&gt;', v)
                v = re.sub('<', '&lt;', v)
                if self.state == 's':
                    v = OpenCC('t2s').convert(v)
                elif self.state == 't':
                    v = OpenCC('s2t').convert(v)
                if v != "":
                    fo.write("<p>" + v + "</p>")
            if len(tex1) != 0:
                fo.write("<hr/>")
            fo.write('''<blockquote class="userstuff">''')
            for m in tex1:
                vv = re.sub('@无限好文,尽在晋江文学城', '', str(m))
                v = re.sub(' +', ' ', vv).rstrip()
                v = re.sub('&', '&amp;', v).rstrip()
                v = re.sub('>', '&gt;', v)
                v = re.sub('<', '&lt;', v)
                if self.state == 's':
                    v = OpenCC('t2s').convert(v)
                elif self.state == 't':
                    v = OpenCC('s2t').convert(v)
                if v != "":
                    fo.write("<p>" + v + "</p>")
            fo.write("</blockquote>")
        fo.write("</body></html>")
        fo.close()
    self.percent += 1
class Yobot:
    def __init__(self, *args, **kwargs):
        # self.send_msg = send_msg
        dirname = os.getcwd()
        config_f_path = os.path.join(dirname, "yobot_config.json")
        if not os.path.exists(config_f_path):
            self.glo_setting = dict()
            return
        with open(config_f_path, "r", encoding="utf-8") as config_file:
            try:
                self.glo_setting = json.load(config_file)
            except:
                raise yobot_errors.File_error(config_f_path + " has been damaged")
        inner_info = {
            "dirname": dirname,
            "version": {
                "ver_name": "yobot[v3.1.1]",
                "ver_id": 3101,
                "checktime": 0,
                "latest": True,
                "check_url": [
                    "https://gitee.com/yobot/yobot/raw/master/docs/v3/ver.json",
                    "https://yuudi.github.io/yobot/v3/ver.json",
                    "http://api.yobot.xyz/v3/version/"
                ]
            }
        }
        self.glo_setting.update(inner_info)
        self.ccs2t = OpenCC(self.glo_setting.get("zht_out_style", "s2t"))
        self.cct2s = OpenCC("t2s")
        updater_plugin = updater.Updater(self.glo_setting)
        plug_all = [
            updater_plugin,
            switcher.Switcher(self.glo_setting),
            yobot_msg.Message(self.glo_setting),
            gacha.Gacha(self.glo_setting),
            char_consult.Char_consult(self.glo_setting),
            jjc_consult.Consult(self.glo_setting),
            boss_dmg.Boss_dmg(self.glo_setting),
            push_news.News(self.glo_setting),
            custom.Custom(self.glo_setting)
        ]
        self.plug_passive = [p for p in plug_all if p.Passive]
        self.plug_active = [p for p in plug_all if p.Active]

    def active_jobs(
            self) -> List[Tuple[Any, Callable[[], Iterable[Dict[str, Any]]]]]:
        jobs = [p.jobs() for p in self.plug_active]
        return reduce(lambda x, y: x + y, jobs)

    def proc(self, msg: dict, *args, **kwargs) -> str:
        '''
        receive a message and return a reply
        '''
        # prefix
        if self.glo_setting.get("preffix_on", False):
            preffix = self.glo_setting.get("preffix_string", "")
            if not msg["raw_message"].startswith(preffix):
                return None
            else:
                msg["raw_message"] = msg["raw_message"][len(preffix):]
        # black-list
        if msg["sender"]["user_id"] in self.glo_setting.get(
                "black-list", list()):
            return None
        # zht-zhs conversion
        if self.glo_setting.get("zht_in", False):
            msg["raw_message"] = self.cct2s.convert(msg["raw_message"])
        if msg["sender"].get("card", "") == "":
            msg["sender"]["card"] = msg["sender"]["nickname"]
        # run
        replys = []
        for pitem in self.plug_passive:
            func_num = pitem.match(msg["raw_message"])
            if func_num:
                res = pitem.execute(func_num, msg)
                replys.append(res["reply"])
                if res["block"]:
                    break
        reply_msg = "\n".join(replys)
        # zhs-zht conversion
        if self.glo_setting.get("zht_out", False):
            reply_msg = self.ccs2t.convert(reply_msg)
        return reply_msg

    def execute(self, cmd: str, *args, **kwargs):
        if cmd == "update":
            res = self.plug_passive[0].execute(0x30)
            return res["reply"]
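# Illustrative call (hypothetical payload in the CQHTTP/onebot shape that
# proc() reads: raw_message plus a sender dict):
# bot = Yobot()
# reply = bot.proc({
#     "raw_message": "help",
#     "sender": {"user_id": 12345, "nickname": "alice", "card": ""},
# })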
def clean_text(self, text, remove_url=True, email=True, weibo_at=True,
               stop_terms=("转发微博",), emoji=True, weibo_topic=False,
               deduplicate_space=True, norm_url=False, norm_html=False,
               to_url=False, remove_puncts=False, remove_tags=True, t2s=False):
    '''
    Perform assorted text-cleaning steps: Weibo-specific markup, URLs,
    email addresses, HTML code, and so on.

    :param text: input text
    :param remove_url: (on by default) remove URLs
    :param email: (on by default) remove email addresses
    :param weibo_at: (on by default) remove Weibo @-mention text
    :param stop_terms: remove specific phrases from the text; defaults to ("转发微博",)
    :param emoji: (on by default) remove text wrapped in [], usually emoticons
    :param weibo_topic: (off by default) remove text wrapped in ##, usually Weibo topics
    :param deduplicate_space: (on by default) collapse runs of whitespace into one space
    :param norm_url: (off by default) decode URL escapes back to plain characters (e.g. %20 to space)
    :param norm_html: (off by default) decode HTML entities back to plain characters (e.g. &nbsp; to space)
    :param to_url: (off by default) encode plain characters as URL escapes, for building requests (e.g. space to %20)
    :param remove_puncts: (off by default) remove all punctuation
    :param remove_tags: (on by default) remove all HTML tags
    :param t2s: (off by default) convert Traditional Chinese to Simplified
    :return: the cleaned text
    '''
    # mutually contradictory settings
    if norm_url and to_url:
        raise Exception("norm_url和to_url是矛盾的设置")
    if norm_html:
        text = html.unescape(text)
    if to_url:
        text = urllib.parse.quote(text)
    if remove_tags:
        text = w3lib.html.remove_tags(text)
    if remove_url:
        URL_REGEX = re.compile(
            r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))',
            re.IGNORECASE)
        text = re.sub(URL_REGEX, "", text)
    if norm_url:
        text = urllib.parse.unquote(text)
    if email:
        EMAIL_REGEX = re.compile(
            r"[-a-z0-9_.]+@(?:[-a-z0-9]+\.)+[a-z]{2,6}", re.IGNORECASE)
        text = re.sub(EMAIL_REGEX, "", text)
    if weibo_at:
        # remove @-mentions in the body and user names in replies/reposts
        text = re.sub(r"(回复)?(//)?\s*@\S*?\s*(:|:| |$)", " ", text)
    if emoji:
        text = re.sub(r"\[\S+\]", "", text)  # remove emoticons
    if weibo_topic:
        text = re.sub(r"#\S+#", "", text)  # remove topic text
    if deduplicate_space:
        text = re.sub(r"\s+", " ", text)  # collapse excess whitespace
    if t2s:
        cc = OpenCC('t2s')
        text = cc.convert(text)
    assert hasattr(stop_terms, "__iter__"), Exception("去除的词语必须是一个可迭代对象")
    if type(stop_terms) == str:
        text = text.replace(stop_terms, "")
    else:
        for x in stop_terms:
            text = text.replace(x, "")
    if remove_puncts:
        allpuncs = re.compile(
            r"[,\_《。》、?;:‘’\"“”【「】」·!@¥…()—\,\<\.\>\/\?\;\:\'\"\[\]\{\}\~\`\!\@\#\$\%\^\&\*\(\)\-\=\+]"
        )
        text = re.sub(allpuncs, "", text)
    return text.strip()
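# Illustrative call (hypothetical instance name `ht`; clean_text is a method
# of the class this snippet was taken from):
# ht.clean_text("回复@user:转发微博 http://example.com [doge]", t2s=True)
# -> the @-mention, the URL, the [doge] emoticon and the "转发微博" stop term
#    are all stripped, and Traditional characters are converted to Simplified.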
def __init__(self, *,
             data_path: str,
             scheduler: AsyncIOScheduler,
             quart_app: Quart,
             bot_api: Api,
             verinfo: str = None):
    # initialize config
    is_packaged = "_MEIPASS" in dir(sys)
    if is_packaged:
        basepath = os.path.dirname(sys.argv[0])
    else:
        basepath = os.path.dirname(__file__)
    dirname = os.path.abspath(os.path.join(basepath, data_path))
    if not os.path.exists(dirname):
        os.makedirs(dirname)
    config_f_path = os.path.join(dirname, "yobot_config.json")
    if is_packaged:
        default_config_f_path = os.path.join(sys._MEIPASS, "packedfiles",
                                             "default_config.json")
    else:
        default_config_f_path = os.path.join(os.path.dirname(__file__),
                                             "packedfiles",
                                             "default_config.json")
    with open(default_config_f_path, "r", encoding="utf-8") as config_file:
        self.glo_setting = json.load(config_file)
    if not os.path.exists(config_f_path):
        shutil.copyfile(default_config_f_path, config_f_path)
        print("设置已初始化,发送help获取帮助")
    boss_filepath = os.path.join(dirname, "boss3.json")
    if not os.path.exists(boss_filepath):
        if is_packaged:
            default_boss_filepath = os.path.join(sys._MEIPASS, "packedfiles",
                                                 "default_boss.json")
        else:
            default_boss_filepath = os.path.join(os.path.dirname(__file__),
                                                 "packedfiles",
                                                 "default_boss.json")
        shutil.copyfile(default_boss_filepath, boss_filepath)
    pool_filepath = os.path.join(dirname, "pool3.json")
    if not os.path.exists(pool_filepath):
        if is_packaged:
            default_pool_filepath = os.path.join(sys._MEIPASS, "packedfiles",
                                                 "default_pool.json")
        else:
            default_pool_filepath = os.path.join(os.path.dirname(__file__),
                                                 "packedfiles",
                                                 "default_pool.json")
        shutil.copyfile(default_pool_filepath, pool_filepath)
    with open(config_f_path, "r", encoding="utf-8-sig") as config_file:
        cfg = json.load(config_file)
        for k in self.glo_setting.keys():
            if k in cfg:
                self.glo_setting[k] = cfg[k]
    if verinfo is None:
        verinfo = updater.get_version(self.Version, self.Version_id)
        print(verinfo['ver_name'])
    # initialize database
    ybdata.init(os.path.join(dirname, 'yobotdata.db'))
    # initialize web path
    if not self.glo_setting.get("public_address"):
        try:
            res = requests.get("http://ip-api.com/json/?fields=8192")
            ipaddr = res.json()["query"]
        except:
            with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
                s.connect(("8.8.8.8", 53))
                ipaddr = s.getsockname()[0]
        self.glo_setting["public_address"] = "http://{}:{}/".format(
            ipaddr,
            self.glo_setting["port"],
        )
    if not self.glo_setting["public_address"].endswith("/"):
        self.glo_setting["public_address"] += "/"
    if not self.glo_setting["public_basepath"].startswith("/"):
        self.glo_setting["public_basepath"] = \
            "/" + self.glo_setting["public_basepath"]
    if not self.glo_setting["public_basepath"].endswith("/"):
        self.glo_setting["public_basepath"] += "/"
    # initialize update time
    if self.glo_setting["update-time"] == "random":
        self.glo_setting["update-time"] = "{:02d}:{:02d}".format(
            random.randint(2, 4), random.randint(0, 59))
    # initialize client salt
    if self.glo_setting["client_salt"] is None:
        self.glo_setting["client_salt"] = web_util.rand_string(16)
    # save initialization
    with open(config_f_path, "w", encoding="utf-8") as config_file:
        json.dump(self.glo_setting, config_file, indent=4)
    # initialize utils
    templating.Ver = self.Version[2:-1]
    # generate random secret_key
    if quart_app.secret_key is None:
        quart_app.secret_key = bytes(
            (random.randint(0, 255) for _ in range(16)))
    # add mimetype
    mimetypes.init()
    mimetypes.add_type('application/javascript', '.js')
    mimetypes.add_type('image/webp', '.webp')

    # add route for static files
    @quart_app.route(urljoin(self.glo_setting["public_basepath"],
                             "assets/<path:filename>"),
                     methods=["GET"])
    async def yobot_static(filename):
        return await send_file(
            os.path.join(os.path.dirname(__file__), "public", "static",
                         filename))

    # add route for output files
    if not os.path.exists(os.path.join(dirname, "output")):
        os.mkdir(os.path.join(dirname, "output"))

    @quart_app.route(urljoin(self.glo_setting["public_basepath"],
                             "output/<path:filename>"),
                     methods=["GET"])
    async def yobot_output(filename):
        return await send_file(os.path.join(dirname, "output", filename))

    # openCC
    self.ccs2t = OpenCC(self.glo_setting.get("zht_out_style", "s2t"))
    self.cct2s = OpenCC("t2s")
    # filter
    self.black_list = set(self.glo_setting["black-list"])
    self.black_list_group = set(self.glo_setting["black-list-group"])
    self.white_list_group = set(self.glo_setting["white-list-group"])
    # update runtime variables
    self.glo_setting.update({"dirname": dirname, "verinfo": verinfo})
    kwargs = {
        "glo_setting": self.glo_setting,
        "bot_api": bot_api,
        "scheduler": scheduler,
        "app": quart_app,
    }
    # load plugins
    plug_all = [
        updater.Updater(**kwargs),
        switcher.Switcher(**kwargs),
        yobot_msg.Message(**kwargs),
        gacha.Gacha(**kwargs),
        jjc_consult.Consult(**kwargs),
        boss_dmg.Boss_dmg(**kwargs),
        push_news.News(**kwargs),
        calender.Event(**kwargs),
        homepage.Index(**kwargs),
        marionette.Marionette(**kwargs),
        login.Login(**kwargs),
        settings.Setting(**kwargs),
        web_util.WebUtil(**kwargs),
        clan_battle.ClanBattle(**kwargs),
    ]
    self.plug_passive = [p for p in plug_all if p.Passive]
    self.plug_active = [p for p in plug_all if p.Active]
    for p in plug_all:
        if p.Request:
            p.register_routes(quart_app)
    # load new plugins
    self.plug_new = [
        miner.Miner(**kwargs),
        group_leave.GroupLeave(**kwargs),
        custom.Custom(**kwargs),
    ]
def loadtext(textdir, mixedversion=True, textrankrate=0.5):
    t0 = time.time()
    docs = []
    # Keep only files whose names start with an ASCII letter (filtering into
    # a new list avoids mutating the list while iterating over it).
    filelist = [f for f in os.listdir(textdir) if is_alphabet(f[0])]
    # print(filelist)
    tr4s = TextRank4Sentence()
    for file in sorted(filelist,
                       key=lambda file: float(file.split("_")[1][:-4]),
                       reverse=True):
        print(file)
        with open(savedir + file, "r", encoding="utf-8") as txtfile:
            eofp = open(tempdir + file, "w", encoding="utf-8")
            contents = txtfile.read()
            if mixedversion:
                tr4s.analyze(text=contents, lower=True, source='all_filters')
                docsum = tr4s.get_key_sentences(
                    num=int(len(tr4s.sentences) * textrankrate))
                # print(docsum)
                contents = [item.sentence for item in docsum]
            else:
                contents = contents.splitlines()
            # print(contents)
            paragraph, sentence = [], []
            for content in contents:
                line, i, l = "", 0, len(content)
                while i < l:
                    if content[i] == " " and i + 1 < l and not is_alphabet(content[i + 1]):
                        i += 1
                        continue
                    if content[i] == " ":
                        i += 1
                        continue
                    line += content[i]
                    if content[i] in ["。", "?", "!", "?", "!"]:
                        if i + 1 < l and content[i + 1] in ["」", "』", "”"]:
                            line += content[i + 1]
                            i += 1
                        if line[0] in ["(", "("] and sentence:
                            sentence[-1] += line
                            line = ""
                        elif i + 1 < l and content[i + 1] not in ["》", "〉", ")", ")"]:
                            sentence.append(line)
                            line = ""
                    i += 1
                if line:
                    sentence.append(line)
                if sentence:
                    for s in sentence:
                        eofp.write(s + "\n")
                    paragraph.append(sentence)
                    sentence = []
            eofp.close()
            docs.append(paragraph)
    print("txt loading completed", time.time() - t0)
    xlist = ["。", "?", "!", "~", "?", "!", " ", " ", "」", "』", "”",
             "(", "(", "》", "〉", ")", ")", ",", ":", "」", "、", "《",
             ";", "「", "%"]
    t0 = time.time()
    dic = {}
    with open(_PATH + "source/1998.csv", "r", encoding="utf-8") as dictxt:
        lines = dictxt.read().splitlines()
        for line in lines:
            line = line.split(",")
            dic[line[1]] = float(line[3])
    sentences = []
    indexsentences = []
    opcc = OpenCC('s2twp')
    for d, doc in enumerate(docs[:]):
        for p, para in enumerate(doc):
            for sid, sen in enumerate(para):
                # print(sen)
                sentence = []
                idf = count = w = score = 0
                hitset = set()
                words = jbps.cut(sen)
                for word, flag in words:
                    w += 1
                    word = opcc.convert(word)
                    try:
                        float(word)
                    except:
                        if not word in xlist:
                            # print(word, flag, end="|")
                            sentence.append(word)
                            if not word in dic or word in hitset:
                                continue
                            if flag in ["l", "n", "nr"] and dic[word] >= 30:
                                idf += 1 / dic[word]
                                count += 1
                # print("\n", sentence, "\n")
                if w >= 10 and count > 3:
                    score = idf / count
                if sentence and score > 0:
                    sentences.append(sentence)
                    indexsentences.append(
                        Sentence(sen, sentence, d, p, sid, score))
    print("segmentation completed", time.time() - t0)
    print("total sentences:", len(sentences))
    # print(sentences)
    # for s in indexsentences:
    #     print(s)
    return sentences, indexsentences
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import sys

from opencc import OpenCC

if __name__ == '__main__':
    if sys.version_info[0] < 3:
        print('Require Python3 to run')
        sys.exit(0)

    openCC = OpenCC()
    openCC.set_conversion('s2twp')
    # openCC = OpenCC('s2twp')
    words = '鼠标是一种很常見及常用的電腦输入设备,它可以对当前屏幕上的游标进行定位,并通过按键和滚轮装置对游标所经过位置的' \
            '屏幕元素进行操作。鼠标的鼻祖於1968年出现。美国科学家道格拉斯·恩格尔巴特(Douglas Englebart)在加利福尼亚制作了' \
            '第一只鼠标。'
    result = openCC.convert(words)
    print("{} \n\n==> \n\n{}".format(words, result))
for s in soup.find_all('div', {'class': 'sons'}):
    sub_soup = get_soup(s.find('a').attrs['href'])
    cont = sub_soup.find('div', {'class': 'main3'}).find('div', {'class': 'cont'})
    chapter = cont.find('h1').find('span').find('b').string
    paragraphs_list = []
    paragraphs = cont.find('div', {'class': 'contson'})
    if not paragraphs.find('p') is None:
        paragraphs = paragraphs.find('p')
    for p in paragraphs:
        p = p.string
        if not p is None and len(p.strip()) > 0:
            paragraphs_list.append(replace_symbol(p))
    print(len(paragraphs_list))
    data['content'].append({
        'chapter': chapter,
        'paragraphs': paragraphs_list
    })

cc = OpenCC('s2t')
with open('./jsons/zengguangxianwen.json', 'w', encoding='utf-8') as file_object:
    json.dump(json.loads(cc.convert(json.dumps(data, ensure_ascii=False))),
              file_object,
              sort_keys=False,
              indent=2,
              ensure_ascii=False)
def main(bgtext):
    parser = argparse.ArgumentParser()
    parser.add_argument('--device', default='0', type=str, required=False, help='生成設備')
    parser.add_argument('--length', default=700, type=int, required=False, help='生成长度')
    parser.add_argument('--batch_size', default=1, type=int, required=False, help='生成的batch size')
    parser.add_argument('--nsamples', default=1, type=int, required=False, help='生成几个样本')
    parser.add_argument('--temperature', default=1.0, type=float, required=False, help='生成温度')
    parser.add_argument('--topk', default=8, type=int, required=False, help='最高几选一')
    parser.add_argument('--topp', default=1, type=float, required=False, help='最高积累概率')
    parser.add_argument('--model_config', default='config/model_config_small.json', type=str, required=False, help='模型参数')
    parser.add_argument('--tokenizer_path', default='cache/vocab.txt', type=str, required=False, help='词表路径')
    parser.add_argument('--model_path', default='model/final_model', type=str, required=False, help='模型路径')
    parser.add_argument('--prefix', default='{}'.format(str(bgtext)), type=str, required=False, help='生成文章的开头')
    parser.add_argument('--no_wordpiece', action='store_true', help='不做word piece切词')
    parser.add_argument('--segment', action='store_true', help='中文以词为单位')
    parser.add_argument('--fast_pattern', action='store_true', help='采用更加快的方式生成文本')
    parser.add_argument('--save_samples', action='store_true', help='保存产生的样本')
    parser.add_argument('--save_samples_path', default='./sample/sample_save', type=str, required=False, help="保存样本的路径")
    parser.add_argument('--repetition_penalty', default=1.0, type=float, required=False)
    args = parser.parse_args()
    print('args:\n' + args.__repr__())

    if args.segment:
        from tokenizations import tokenization_bert_word_level as tokenization_bert
    else:
        from tokenizations import tokenization_bert

    os.environ["CUDA_VISIBLE_DEVICES"] = args.device  # select which GPUs the program uses
    length = args.length
    batch_size = args.batch_size
    nsamples = args.nsamples
    temperature = args.temperature
    topk = args.topk
    topp = args.topp
    repetition_penalty = args.repetition_penalty

    device = "cuda" if torch.cuda.is_available() else "cpu"

    tokenizer = tokenization_bert.BertTokenizer(vocab_file=args.tokenizer_path)
    model = GPT2LMHeadModel.from_pretrained(args.model_path)
    model.to(device)
    model.eval()

    n_ctx = model.config.n_ctx
    if length == -1:
        length = model.config.n_ctx
    args.save_samples = True
    if args.save_samples:
        if not os.path.exists(args.save_samples_path):
            os.makedirs(args.save_samples_path)
        samples_file = open(args.save_samples_path + '/samples.txt', 'w',
                            encoding='utf8')
    while True:
        raw_text = args.prefix
        context_tokens = tokenizer.convert_tokens_to_ids(
            tokenizer.tokenize(raw_text))
        generated = 0
        for _ in range(nsamples // batch_size):
            out = generate(n_ctx=n_ctx,
                           model=model,
                           context=context_tokens,
                           length=length,
                           is_fast_pattern=args.fast_pattern,
                           tokenizer=tokenizer,
                           temperature=temperature,
                           top_k=topk,
                           top_p=topp,
                           repitition_penalty=repetition_penalty,
                           device=device)
            # result_text = []
            for i in range(batch_size):
                generated += 1
                text = tokenizer.convert_ids_to_tokens(out)
                for i, item in enumerate(text[:-1]):  # ensure spaces around English words
                    if is_word(item) and is_word(text[i + 1]):
                        text[i] = item + ' '
                for i, item in enumerate(text):
                    if item == '[MASK]':
                        text[i] = ''
                    elif item == '[CLS]':
                        text[i] = '\n\n'
                    elif item == '[SEP]':
                        text[i] = '\n'
                info = "=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40 + "\n"
                # print(info)
                text = ''.join(text).replace('##', '').strip()
                cc = OpenCC('s2t')
                return cc.convert(text)
                # Everything below is unreachable after the early return above;
                # it is kept from the original sampling script.
                print(cc.convert(text))
                # print(text)
                # === get the result ===
                if args.save_samples:
                    samples_file.write(info)
                    samples_file.write(text)
                    samples_file.write('\n')
                    samples_file.write('=' * 90)
                    samples_file.write('\n' * 2)
                # if text[0] == bgtext[0][0]:
                #     result_text.append(text)
                # print("=" * 80)
        if generated == nsamples:
            # close file when finish writing.
            if args.save_samples:
                samples_file.close()
            # print(result_text)
            break
def setUp(self):
    self.openCC = OpenCC()
def text_cleaning(text_array, word_tokenization, bert_vocab_file,
                  do_lower_case, num_threads=1):
    """
    A function to clean text, including:
    - Normalize full-width characters
    - Convert all English letters to lower case
    - Translate Simplified Chinese characters to Traditional Chinese
    - Separate Chinese and English tokens from each other
    - Segment Chinese text
    - Remove punctuation
    - Perform the special Word Piece segmentation if specified by the
      argument "word_tokenization"

    INPUT:
    text_array: list/numpy array/pandas Series
    word_tokenization: method of word segmentation, either split by "space"
        or "word_piece" tokenization
    bert_vocab_file: string -- path to the .txt file of vocabularies for the
        modified Word Piece Tokenizer
    do_lower_case: boolean -- whether the modified Word Piece Tokenizer
        converts English letters to lower case or not
    num_threads: int -- number of CPU processors for performing the text cleaning

    OUTPUT:
    text_array: a list of string (cleaned text)
    """
    with mp.Pool(processes=num_threads) as pool:
        print(" Normalizing full-width characters...")
        text_array = pool.map(normalize_full_width, text_array)

        print(" Converting English letters to lower case...")
        text_array = pool.map(case_lower, text_array)

        print(" Translating Simplified Chinese to Traditional...")
        bool_contain_chi = pool.map(detect_chi, text_array)
        text_array = np.array(text_array)
        cc = OpenCC('s2t')
        text_array[bool_contain_chi] = list(
            map(lambda x: cc.convert(x), text_array[bool_contain_chi]))

        print(" Separating Chinese and English word tokens from each other...")
        text_array = pool.map(sep_chi_eng, text_array)

        print(" Segmenting Chinese vocabularies...")
        text_array = np.array(text_array)
        dtype_0 = str(text_array.dtype)
        dtype_len_0 = int(re.findall("[0-9]+", dtype_0)[0])
        if sum(bool_contain_chi):
            replacement = pool.map(segment_chi, text_array[bool_contain_chi])
            replacement = np.array(replacement)
            dtype_1 = str(replacement.dtype)
            dtype_len_1 = int(re.findall("[0-9]+", dtype_1)[0])
            # Cast text_array to the wider of the two string dtypes so that
            # characters in the segmented text are not trimmed.
            if dtype_len_0 > dtype_len_1:
                text_array = text_array.astype(dtype_0)
            else:
                text_array = text_array.astype(dtype_1)
            text_array[bool_contain_chi] = replacement

        print(" Removing punctuation...")
        text_array = pool.map(remove_punct, text_array)

        if word_tokenization == "word_piece":
            print(" Word Piece Segmentation...")
            tokenizer = ModifiedWordPieceTokenizer(
                bert_vocab_file=bert_vocab_file, do_lower_case=do_lower_case)
            text_array = pool.map(tokenizer.tokenize, text_array)
        elif word_tokenization == "space":
            return text_array

    return text_array
import os

from opencc import OpenCC
from tqdm import tqdm

cc = OpenCC('s2t')


def translate(src, dest):
    """
    goal: convert Simplified Chinese to Traditional
    input: source file path (src), target file path (dest)
    output: write the converted file to the target file path
    """
    source = open(src, 'r', encoding='utf-8')
    result = open(dest, 'w', encoding='utf-8')
    count = 0
    while True:
        line = source.readline()
        line = cc.convert(line)
        if not line:  # readline() keeps returning empty strings at EOF; break here
            break
        # print(line)  # debug
        count = count + 1
        result.write(line)
        # print('===已處理' + str(count) + '行===')  # debug
    source.close()
    result.close()
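# A minimal usage sketch with hypothetical file names; the main guard keeps
# the module importable without side effects:
if __name__ == '__main__':
    # input.txt is read line by line, converted with the module-level
    # OpenCC('s2t') instance, and written to output.txt.
    translate('input.txt', 'output.txt')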
def get_txt(self, txt_id, state, threadnum):
    titlem = ''
    intro = ''
    ids = str(txt_id)
    percent = 0
    self.state = state
    self.percent = 0
    self.index = []
    self.titleindex = []
    self.Summary = []
    self.fillNum = 0
    self.rollSign = []
    self.rollSignPlace = []
    self.href_list = []
    self.td = []
    self.failInfo = []
    # Article URL
    req_url = ids
    # Fetch the article page with cookies
    res = requests.get(req_url, headers=self.headerss).content
    # Re-encode the page
    ress = etree.HTML(
        res.decode("GB18030", "ignore").encode("utf-8", "ignore").decode('utf-8'))
    # Blurb
    intro = ress.xpath(
        "//html/body/table/tr/td[1]/div[2]/div[@id='novelintro']//text()")
    # Tags
    info = ress.xpath("string(/html/body/table[1]/tr/td[1]/div[3])")
    infox = []
    for i in range(1, 7):
        infox.append(
            ress.xpath(
                "string(/html/body/table[1]/tr/td[3]/div[2]/ul/li[" + str(i) + "])"))
    # Title and author
    xtitle = ress.xpath('string(//*[@itemprop="articleSection"])').strip()
    xaut = ress.xpath('string(//*[@itemprop="author"])').strip()
    ti = xtitle + '-' + xaut
    if self.state == 's':
        ti = OpenCC('t2s').convert(ti)
    elif self.state == 't':
        ti = OpenCC('s2t').convert(ti)
    print("网址:" + ids + "\r\n小说信息:" + str(ti) + "\r\n")
    # All chapter URLs, titles, and summaries
    self.td = ress.xpath('//*[@id="oneboolt"]//tr')
    loc = []
    for i in self.td:
        u = i.xpath('./td[2]/span/div[1]/a/@href')
        x = i.xpath('./td[2]/span/div[1]/a[1]/@rel')
        if len(u) > 0:
            self.href_list += u
            v = i.xpath('./td[2]/span/div[1]/a')
            v = etree.tostring(v[0], encoding="utf-8").decode().strip()
            v = re.sub(r'</?\w+[^>]*>', '', v)
            self.titleindex.append(v.strip())
            v = i.xpath('./td[3]')
            v = etree.tostring(v[0], encoding="utf-8").decode().strip()
            v = re.sub(r'</?\w+[^>]*>', '', v)
            v = re.sub(' ', '', v)
            self.Summary.append(v.strip())
        elif len(x) > 0:
            self.href_list += x
            v = i.xpath('./td[2]/span/div[1]/a')
            v = etree.tostring(v[0], encoding="utf-8").decode().strip()
            v = re.sub(r'</?\w+[^>]*>', '', v)
            self.titleindex.append(v.strip())
            v = i.xpath('./td[3]')
            v = etree.tostring(v[0], encoding="utf-8").decode().strip()
            v = re.sub(r'</?\w+[^>]*>', '', v)
            v = re.sub(' ', '', v)
            self.Summary.append(v.strip())
        elif i.xpath('./td[2]/span/div[1]/span') != []:
            loc.append(i.xpath('./td[1]/text()')[0].strip())
    # Volume label names
    self.rollSign = ress.xpath(
        "//*[@id='oneboolt']//tr/td/b[@class='volumnfont']")
    # Volume label positions
    self.rollSignPlace = ress.xpath(
        "//*[@id='oneboolt']//tr/td/b/ancestor-or-self::tr/following-sibling::tr[1]/td[2]/span/div[1]/a[1]/@href"
    )
    self.rollSignPlace += ress.xpath(
        "//*[@id='oneboolt']//tr/td/b/ancestor-or-self::tr/following-sibling::tr[1]/td[2]/span/div[1]/a[1]/@rel"
    )
    # Normalize the volume label format
    for rs in range(len(self.rollSign)):
        self.rollSign[rs] = etree.tostring(
            self.rollSign[rs], encoding="utf-8").decode().strip()
        self.rollSign[rs] = re.sub(r'</?\w+[^>]*>', '', self.rollSign[rs])
        self.rollSign[rs] = "§ " + self.rollSign[rs] + " §"
    section_ct = len(self.href_list)
    print("可下载章节数:" + str(section_ct) + "\r\n")
    if loc != []:
        i = ""
        for x in loc:
            i = i + x + " "
        print("被锁章节:" + i + "\r\n")
    # fillNum: width of the zero-padded chapter number; e.g. a novel with
    # 1437 chapters gets 4-digit numbers: 0001, 0002, ...
    self.fillNum = len(str(len(self.td) - 4))
    # Sanitize the title: remove characters that are illegal in file names
    ti = re.sub(r'[\/:*?"<>|]', '_', ti)
    ti = re.sub('&amp;', '&', ti)
    xauthref = ress.xpath("//*[@id='oneboolt']//h2/a/@href")[0]
    # Delete this line if you do not want the novel ID appended to the file name
    ti = ti + '.' + ids.split('=')[1]
    ti = re.sub('\r', '', ti)
    v = ""
    # Create working directories and write the metadata
    path = os.getcwd()
    self.path = path
    if not os.path.exists('Fonts'):
        os.mkdir('Fonts')
    if os.path.exists(ti + '_txt'):
        os.chdir(ti + '_txt')
    else:
        os.mkdir(ti + '_txt')
        os.chdir(ti + '_txt')
    ppp = os.getcwd()
    self.index = []
    # Write the info page
    TOC = xtitle + '\n'
    TOC += '作者:' + xaut + "\r\n"
    TOC += '源网址:' + req_url + '\r\n'
    # Build the TOC text
    for l in self.href_list:
        titleOrigin = l.split('=')
        i = self.href_list.index(l)
        # NOTE: `title` is only assigned when the three lines below are
        # uncommented; as shipped, the conversions reuse the previous value.
        # title = str(titleOrigin[2]).zfill(self.fillNum) + " "
        # title = title + self.titleindex[i].strip() + " "
        # title = title + self.Summary[i].strip()
        if self.state == 's':
            title = OpenCC('t2s').convert(title)
        elif self.state == 't':
            title = OpenCC('s2t').convert(title)
        if self.href_list[i] in self.rollSignPlace:
            v = self.rollSign[self.rollSignPlace.index(l)]
            if self.state == 's':
                v = OpenCC('t2s').convert(
                    self.rollSign[self.rollSignPlace.index(l)])
            elif self.state == 't':
                v = OpenCC('s2t').convert(
                    self.rollSign[self.rollSignPlace.index(l)])
            self.index.append(v)
        self.index.append(title)
    for ix in infox:
        ix = ix.strip()
        ix = re.sub('\r\n', '', ix)
        ix = re.sub(' +', '', ix)
        TOC += ix + "\r\n"
    TOC += "文案:\r\n"
    for nx in intro:
        v = re.sub(' +', ' ', str(nx)).strip()
        if self.state == 's':
            v = OpenCC('t2s').convert(v)
        elif self.state == 't':
            v = OpenCC('s2t').convert(v)
        if v != "":
            TOC += v + "\n"
    info = re.sub(' +', ' ', info).strip()
    if self.state == 's':
        info = OpenCC('t2s').convert(info)
    elif self.state == 't':
        info = OpenCC('s2t').convert(info)
    info = re.sub('搜索关键字', '\r\n搜索关键字', info)
    info = re.sub(' 一句话简介:', '一句话简介:', info)
    info = re.sub('\r\n \r\n 立意:', '\r\n立意:', info)
    TOC += info + "\n"
    fo = open("TOC.txt", 'w', encoding='utf-8')
    fo.write(TOC)
    fo.close()
    tlist = []
    # Download every chapter concurrently
    with concurrent.futures.ThreadPoolExecutor(
            max_workers=threadnum) as executor:
        tlist = {
            executor.submit(self.get_sin, i): i for i in self.href_list
        }
        for future in concurrent.futures.as_completed(tlist):
            if self.percent < section_ct:
                print('\r 下载进度:%d/%d' % (self.percent, section_ct),
                      end='', flush=True)
    print('\r 下载完成,总进度:%d/%d\r\n' % (self.percent, section_ct),
          end='', flush=True)
    '''
    for i in self.href_list:
        self.get_sin(i)
    '''
    if self.failInfo != []:
        self.failInfo.sort()
        vs = ""
        for ss in self.failInfo:
            vs = vs + ss + "|"
        print("\r\n未购买或加载失败章节:")
        print(vs[:-1] + "\r\n")
    # Merge the chapter files into one txt
    os.chdir(path)
    f = open(ti + ".txt", 'w', encoding='utf-8')
    filenames = os.listdir(ppp)
    i = 0
    for filename in filenames:
        filepath = ppp + '\\' + filename
        for line in open(filepath, encoding='utf-8', errors='ignore'):
            f.writelines(line)
    f.close()
    shutil.rmtree(ppp)
    print("\r\ntxt文件整合完成")
from opencc import OpenCC
import codecs
import os
import re
import string
import json
from scipy import spatial
import jieba
from typing import List
import math
from gensim.models.word2vec import PathLineSentences
from gensim.models.word2vec import Word2Vec
from gensim.models.ldamodel import LdaModel
from gensim.corpora.dictionary import Dictionary

OP = OpenCC('t2s')

# Patterns that mark sentence boundaries after Chinese sentence-final punctuation
sent_cut_pattern = [
    re.compile(r'([。?!?])([^"\'”])'),
    re.compile(r'(\.{6})([^"\'”])'),
    re.compile(r'([。?!?]["\'”])([^\'"”])'),
]
zh_pattern = re.compile(r'^[\u4e00-\u9fa5]+$')
puncs = string.punctuation + '.,;《》?!“”‘’@#¥%…&×()——+【】{};;●,。&~、|\s::'
punc_pattern = re.compile(r'[{}]+'.format(puncs))

stopwords = []
with open('./data/chinese_stopwords.txt', 'r', encoding='utf8') as f:
    for line in f:
        line = line.strip()
        if len(line) > 0:
            stopwords.append(line.strip())
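# Illustrative helper (not part of the original module) showing how the
# sent_cut_pattern regexes are typically applied: each pattern inserts a
# newline between a sentence-final punctuation group and the character that
# follows it, and the text is then split on those newlines.
def cut_sentences(text: str) -> List[str]:
    for pattern in sent_cut_pattern:
        text = pattern.sub(r'\1\n\2', text)
    return [s.strip() for s in text.split('\n') if s.strip()]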
def get_sin(self, l):
    titleOrigin = l.split('=')
    i = self.href_list.index(l)
    badgateway = True
    while badgateway:
        cont = requests.get(l, headers=self.headerss)
        dot = etree.HTML(
            cont.content.decode('gb18030', "ignore").encode("utf-8").decode('utf-8'))
        codetext = etree.tostring(dot, encoding="utf-8").decode()
        bdw = re.findall('<h1>502 Bad Gateway</h1>', codetext)
        if bdw == []:
            badgateway = False
    fontfamily = ''
    cvlist = []
    cvdic = []
    # Font-based anti-scraping
    fontsrc = re.findall(
        r'//static.jjwxc.net/tmp/fonts/.*?woff2.h=my.jjwxc.net', codetext)
    if fontsrc != []:
        fontsrc = "http:" + fontsrc[0]
        fontname = re.sub('http://static.jjwxc.net/tmp/fonts/', '', fontsrc)
        fontname = re.sub('.h=my.jjwxc.net', '', fontname)
        fontfamily = re.sub('.woff2', '', fontname)
        try:
            with open(self.path + "/Fonts/" + fontfamily + ".txt", "r",
                      encoding='utf-8') as f:
                cvlist = f.readlines()
                for y in range(len(cvlist)):
                    cvdic.append(cvlist[y].split('-'))
                cvdic = dict(cvdic)
        except:
            t = 1
        if not os.path.exists(self.path + "/Fonts/" + fontname):
            fontwb = requests.get(fontsrc).content
            fontf = open(self.path + "/Fonts/" + fontname, 'wb')
            fontf.write(fontwb)
            fontf.close()
        if cvlist != []:
            fontfamily += '_c'
        elif fontfamily not in self.fontlist:
            self.fontlist.append(fontfamily)
    # tex: chapter body
    tex = dot.xpath('//*[@id="oneboolt"]/tr[2]/td[1]/div/text()')
    # tex1: author's notes
    tex1 = dot.xpath("//div[@class='readsmall']/text()")
    # sign: position of the author's notes
    sign = dot.xpath("//*[@id='oneboolt']/tr[2]/td[1]/div/div[4]/@class")
    title = ''
    # Zero-padded chapter number
    if self.titleInfo[0] == '1':
        title = str(titleOrigin[2]).zfill(self.fillNum) + "#"
    # Chapter name
    if self.titleInfo[1] == '1':
        title = title + " " + self.titleindex[i].strip()
    # Chapter summary
    if self.titleInfo[2] == '1':
        title = title + " " + self.Summary[i].strip()
    if self.state == 's':
        title = OpenCC('t2s').convert(title)
    elif self.state == 't':
        title = OpenCC('s2t').convert(title)
    if self.href_list[i] in self.rollSignPlace:
        v = self.rollSign[self.rollSignPlace.index(l)]
        if self.state == 's':
            v = OpenCC('t2s').convert(
                self.rollSign[self.rollSignPlace.index(l)])
        elif self.state == 't':
            v = OpenCC('s2t').convert(
                self.rollSign[self.rollSignPlace.index(l)])
    # Create the chapter file
    fo = open("z" + str(titleOrigin[2].zfill(4)) + ".txt", 'w',
              encoding='utf-8')
    # Write the volume label
    if self.href_list[i] in self.rollSignPlace:
        fo.write("\r\n\r\n" + v.rstrip() + '\r\n')
        print("\r\n" + v + "\r\n")
        fo.write(title + '\r\n')
    # Write the chapter title
    else:
        fo.write("\r\n\r\n" + title + "\r\n")
    if len(tex) == 0:
        self.failInfo.append(titleOrigin[2].zfill(self.fillNum))
        fo.write('下载失败!')
    else:
        # Anti-scraping fix-up: the mapping-table TXT files must be
        # downloaded into the Fonts folder first.
        if cvdic != []:
            for y in range(len(tex)):
                for s, v in cvdic.items():
                    if not s == 'x"/;':
                        s = re.sub(r'&#x', r'\\u', s)
                        s = re.sub(
                            ';', '', s).encode('utf-8').decode('unicode_escape')
                        tex[y] = re.sub(s, v.strip(), tex[y])
            cvdic = cvlist = 0
        # Author's notes placed before the text
        if str(sign) == "['readsmall']":
            for m in tex1:
                # Strip watermark text and redundant whitespace
                vv = re.sub('@无限好文,尽在晋江文学城', '', str(m))
                v = re.sub(' +', ' ', vv).strip()
                v = re.sub(' ', '', v)
                if self.state == 's':
                    v = OpenCC('t2s').convert(v)
                elif self.state == 't':
                    v = OpenCC('s2t').convert(v)
                v = re.sub('作者有话要说:', '作者有话要说:\n', v)
                if v != "":
                    # Write the text line by line
                    fo.write(v + "\n")
            if len(tex1) != 0:
                fo.write("\n*\r\n")
            for tn in tex:
                vv = re.sub('@无限好文,尽在晋江文学城', '', str(tn))
                v = re.sub(' +', ' ', vv).strip()
                v = re.sub(' ', '', v)
                if self.state == 's':
                    v = OpenCC('t2s').convert(v)
                elif self.state == 't':
                    v = OpenCC('s2t').convert(v)
                if v != "":
                    fo.write(v + "\n")
        else:
            # Author's notes placed after the text
            for tn in tex:
                vv = re.sub('@无限好文,尽在晋江文学城', '', str(tn))
                v = re.sub(' +', ' ', vv).strip()
                v = re.sub(' ', '', v)
                if self.state == 's':
                    v = OpenCC('t2s').convert(v)
                elif self.state == 't':
                    v = OpenCC('s2t').convert(v)
                if v != "":
                    fo.write(v + "\n")
            if len(tex1) != 0:
                fo.write("\n*\r\n")
            for m in tex1:
                vv = re.sub('@无限好文,尽在晋江文学城', '', str(m))
                v = re.sub(' +', ' ', vv).strip()
                v = re.sub(' ', '', v)
                if self.state == 's':
                    v = OpenCC('t2s').convert(v)
                elif self.state == 't':
                    v = OpenCC('s2t').convert(v)
                v = re.sub('作者有话要说:', '作者有话要说:\n', v)
                if v != "":
                    fo.write(v + "\n")
    fo.close()
    self.percent += 1
names = os.listdir(path)
plainText = [
    x for x in names
    if x[-3:].lower() in ("lrc", "ass", "txt", "ssa", "srt")
]
for a in plainText:
    pt = f"{path}/{a}"
    with open(pt, encoding="utf-8") as f:
        data = cc.convert(f.read())
    with open(pt, mode="w+", encoding="utf-8") as f:
        f.write(data)
print("done.")


if __name__ == "__main__":
    cc = OpenCC("t2s")
    type = (
        "t2s",
        "t2hk",
        "t2tw",
        "tw2s",
        "tw2sp",
        "hk2s",
        "s2hk",
        "s2t",
        "s2tw",
        "s2twp",
    )
    items = "\n".join([f"{x[0]}. {x[1]}" for x in enumerate(type)])
    n = input(
        f'WARNING: JAPANESE CHARACTER maybe affected!\n{items}\nInput number to select a convert type.\nInput nothing and enter to use "t2s" by default:'
'''
@Author: King
@Date: 2019.03.13
@Purpose: process the wikizh corpus
@Link: https://dumps.wikimedia.org/zhwiki/20180801/
@Reference: https://kexue.fm/archives/4176
@opencc install command: pip install opencc-python-reimplemented
@opencc reference: https://github.com/yichen0831/opencc-python
'''
from gensim.corpora.wikicorpus import extract_pages, filter_wiki
import bz2file
import re
from os import path
# import opencc
from opencc import OpenCC

# convert from Traditional Chinese to Simplified Chinese
# (note: this rebinds the imported class name to a converter instance)
OpenCC = OpenCC('t2s')
from tqdm import tqdm
import codecs

data_dir = 'resource/'
wiki = extract_pages(
    bz2file.open(
        path.join(data_dir,
                  'zhwiki-20180801-pages-articles-multistream.xml.bz2')))


def wiki_replace(d):
    s = d[1]
    s = re.sub(r':*{\|[\s\S]*?\|}', '', s)
    s = re.sub(r'<gallery>[\s\S]*?</gallery>', '', s)
    s = re.sub(r'(.){{([^{}\n]*?\|[^{}\n]*?)}}', '\\1[[\\2]]', s)
import os
import sys
import pickle
import copy

import torch
from prepro_lib import jieba_cut
from opencc import OpenCC

cc = OpenCC('tw2s')
device = torch.device('cuda:0')


class System:
    def __init__(self, max_len=30, model_path=None, emb_matrix=None,
                 w2id_dict=None):
        if not os.path.exists(model_path):
            print("model path does not exist..")
            sys.exit(1)
        self.max_len = max_len
        self.emb_matrix = emb_matrix
        self.model_path = model_path
from gensim.corpora.wikicorpus import extract_pages, filter_wiki
import bz2file
import re
from tqdm import tqdm
import codecs

wiki = extract_pages(
    bz2file.open('zhwiki-20180301-pages-articles-multistream.xml.bz2'))

from opencc import OpenCC

# convert from Traditional Chinese (Hong Kong) to Simplified Chinese
openCC = OpenCC('hk2s')
# can also set the conversion by calling set_conversion
# openCC.set_conversion('s2tw')
to_convert = '开放中文转换'
converted = openCC.convert(to_convert)


def wiki_replace(d):
    s = d[1]
    s = re.sub(r':*{\|[\s\S]*?\|}', '', s)
    s = re.sub(r'<gallery>[\s\S]*?</gallery>', '', s)
    s = re.sub(r'(.){{([^{}\n]*?\|[^{}\n]*?)}}', '\\1[[\\2]]', s)
    s = filter_wiki(s)
    s = re.sub(r'\* *\n|\'{2,}', '', s)
    s = re.sub(r'\n+', '\n', s)
    s = re.sub(r'\n[:;]|\n +', '\n', s)
    s = re.sub(r'\n==', '\n\n==', s)
    s = u'【' + d[0] + u'】\n' + s
    return openCC.convert(s).strip()
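# A minimal driver sketch; the output path and the namespace filter below are
# assumptions of mine, not the original author's code:
if __name__ == '__main__':
    with codecs.open('wiki.zh.txt', 'w', encoding='utf-8') as out:
        for d in tqdm(wiki):
            # skip non-article namespaces such as "Wikipedia:" or "Template:"
            if not re.findall('^[a-zA-Z]+:', d[0]):
                out.write(wiki_replace(d) + '\n\n')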
from mxnet.gluon.data import SimpleDataset, DataLoader
from mxnet import nd
import numpy as np
from copy import deepcopy
from random import choice
from utils import load_pickle
import jieba
import re
from pinyin import PinYinSampler
from opencc import OpenCC
from structure import Structure

s2t = OpenCC('s2t')
# t2s = OpenCC('t2s')


class SighanCSC14Data(object):
    def __init__(self, tokenizer, transformer, config, mode,
                 vocab_tgt=None, useDecoder=False, args=None):
        self.tokenizer = tokenizer
        self.transformer = transformer
        self.mode = mode
        self.batch_size = config[mode]['batch_size']
        self.max_seq_len = config['int_max_length']
        self.vocab_tgt = vocab_tgt
        self.config = config
from PyQt5 import QtWidgets
from PyQt5 import QtCore
from PyQt5 import QtGui
from shensha import shensha
from opencc import OpenCC

cc = OpenCC('s2t')


class ShenShaDialog(QtWidgets.QDialog):
    def __init__(self, parent=None, shiPan=None):
        super().__init__(parent)
        self.setWindowTitle("神煞")
        self.setWindowFlag(QtCore.Qt.WindowMinMaxButtonsHint)
        self.resize(700, 500)
        guaTiLayout = QtWidgets.QVBoxLayout()
        # self.layout = helpLayout
        self.setLayout(guaTiLayout)
        self.guaTiTextBrowser = QtWidgets.QTextBrowser()
        guaTiLayout.addWidget(self.guaTiTextBrowser)
        guaTiFont = QtGui.QFont()
        guaTiFont.setPixelSize(18)
        self.guaTiTextBrowser.setFont(guaTiFont)
        if shiPan is None:
            self.guaTiTextBrowser.setHtml("查詢神煞,請先排盤")
            return
        shenShaJson = {"年": {}, "月": {}, "日": {}}
        shenShaModules = shensha
        shenShaFuns = []
        for attr in (a for a in dir(shenShaModules) if a.startswith('do_')):
class Yobot:
    Version = "[v3.6.11]"  # semver
    Version_id = 253  # "git rev-list --count HEAD"

    def __init__(self, *,
                 data_path: str,
                 scheduler: AsyncIOScheduler,
                 quart_app: Quart,
                 bot_api: Api,
                 verinfo: str = None):
        # initialize config
        is_packaged = "_MEIPASS" in dir(sys)
        if is_packaged:
            basepath = os.path.dirname(sys.argv[0])
        else:
            basepath = os.path.dirname(__file__)

        dirname = os.path.abspath(os.path.join(basepath, data_path))
        if not os.path.exists(dirname):
            os.makedirs(dirname)
        config_f_path = os.path.join(dirname, "yobot_config.json")
        if is_packaged:
            default_config_f_path = os.path.join(
                sys._MEIPASS, "packedfiles", "default_config.json")
        else:
            default_config_f_path = os.path.join(
                os.path.dirname(__file__), "packedfiles", "default_config.json")
        with open(default_config_f_path, "r", encoding="utf-8") as config_file:
            self.glo_setting = json.load(config_file)
        if not os.path.exists(config_f_path):
            with open(config_f_path, "w") as f:
                f.write("{}")
            print("设置已初始化,发送help获取帮助")
        boss_filepath = os.path.join(dirname, "boss3.json")
        if not os.path.exists(boss_filepath):
            if is_packaged:
                default_boss_filepath = os.path.join(
                    sys._MEIPASS, "packedfiles", "default_boss.json")
            else:
                default_boss_filepath = os.path.join(
                    os.path.dirname(__file__), "packedfiles", "default_boss.json")
            shutil.copyfile(default_boss_filepath, boss_filepath)
        pool_filepath = os.path.join(dirname, "pool3.json")
        if not os.path.exists(pool_filepath):
            if is_packaged:
                default_pool_filepath = os.path.join(
                    sys._MEIPASS, "packedfiles", "default_pool.json")
            else:
                default_pool_filepath = os.path.join(
                    os.path.dirname(__file__), "packedfiles", "default_pool.json")
            shutil.copyfile(default_pool_filepath, pool_filepath)
        for e in os.environ:
            if e.startswith("YOBOT_"):
                k = e[6:].lower()
                self.glo_setting[k] = os.environ[e]
        with open(config_f_path, "r", encoding="utf-8-sig") as config_file:
            cfg = json.load(config_file)
            for k in self.glo_setting.keys():
                if k in cfg:
                    self.glo_setting[k] = cfg[k]

        if verinfo is None:
            verinfo = updater.get_version(self.Version, self.Version_id)
            print(verinfo['ver_name'])

        # initialize database
        ybdata.init(os.path.join(dirname, 'yobotdata.db'))

        # enable gzip
        if self.glo_setting["web_gzip"] > 0:
            gzipped_types = {'text/html', 'text/javascript',
                             'text/css', 'application/json'}

            @quart_app.after_request
            async def gzip_response(response):
                accept_encoding = request.headers.get('Accept-Encoding', '')
                if (response.status_code < 200
                        or response.status_code >= 300
                        or len(await response.get_data()) < 1024
                        or 'gzip' not in accept_encoding.lower()
                        or 'Content-Encoding' in response.headers):
                    return response
                gzip_buffer = BytesIO()
                gzip_file = gzip.GzipFile(
                    mode='wb',
                    compresslevel=self.glo_setting["web_gzip"],
                    fileobj=gzip_buffer)
                gzip_file.write(await response.get_data())
                gzip_file.close()
                gzipped_response = gzip_buffer.getvalue()
                response.set_data(gzipped_response)
                response.headers['Content-Encoding'] = 'gzip'
                response.headers['Content-Length'] = len(gzipped_response)
                return response

        # initialize web path
        if not self.glo_setting.get("public_address"):
            try:
                res = requests.get("http://api.ipify.org/")
                ipaddr = res.text
            except:
                with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
                    s.connect(("8.8.8.8", 53))
                    ipaddr = s.getsockname()[0]
            self.glo_setting["public_address"] = "http://{}:{}/".format(
                ipaddr,
                self.glo_setting["port"],
            )
        if not self.glo_setting["public_address"].endswith("/"):
            self.glo_setting["public_address"] += "/"
        if not self.glo_setting["public_basepath"].startswith("/"):
            self.glo_setting["public_basepath"] = \
                "/" + self.glo_setting["public_basepath"]
        if not self.glo_setting["public_basepath"].endswith("/"):
            self.glo_setting["public_basepath"] += "/"

        # initialize update time
        if self.glo_setting["update-time"] == "random":
            self.glo_setting["update-time"] = "{:02d}:{:02d}".format(
                random.randint(2, 4),
                random.randint(0, 59)
            )

        # initialize client salt
        if self.glo_setting["client_salt"] is None:
            self.glo_setting["client_salt"] = web_util.rand_string(16)

        # save initialization
        with open(config_f_path, "w", encoding="utf-8") as config_file:
            json.dump(self.glo_setting, config_file, indent=4)

        # initialize utils
        templating.Ver = self.Version[2:-1]

        # generate random secret_key
        if quart_app.secret_key is None:
            quart_app.secret_key = bytes(
                (random.randint(0, 255) for _ in range(16)))

        # add mimetype
        mimetypes.init()
        mimetypes.add_type('application/javascript', '.js')
        mimetypes.add_type('image/webp', '.webp')

        # add route for static files
        @quart_app.route(
            urljoin(self.glo_setting["public_basepath"],
                    "assets/<path:filename>"),
            methods=["GET"])
        async def yobot_static(filename):
            accept_encoding = request.headers.get('Accept-Encoding', '')
            origin_file = os.path.join(
                os.path.dirname(__file__), "public", "static", filename)
            if ('gzip' not in accept_encoding.lower()
                    or self.glo_setting['web_gzip'] == 0):
                return await send_file(origin_file)
            gzipped_file = os.path.abspath(os.path.join(
                os.path.dirname(__file__),
                "public",
                "static",
                filename + "." + self.Version[1:-1] + ".gz",
            ))
            if not os.path.exists(gzipped_file):
                if not os.path.exists(origin_file):
                    return "404 not found", 404
                with open(origin_file, 'rb') as of, \
                        open(gzipped_file, 'wb') as gf:
                    with gzip.GzipFile(
                        mode='wb',
                        compresslevel=self.glo_setting["web_gzip"],
                        fileobj=gf,
                    ) as gzip_file:
                        gzip_file.write(of.read())
            response = await make_response(await send_file(gzipped_file))
            response.mimetype = (
                mimetypes.guess_type(os.path.basename(origin_file))[0]
                or "application/octet-stream"
            )
            response.headers['Content-Encoding'] = 'gzip'
            response.headers['Vary'] = 'Accept-Encoding'
            return response

        # add route for output files
        if not os.path.exists(os.path.join(dirname, "output")):
            os.mkdir(os.path.join(dirname, "output"))

        @quart_app.route(
            urljoin(self.glo_setting["public_basepath"],
                    "output/<path:filename>"),
            methods=["GET"])
        async def yobot_output(filename):
            return await send_file(os.path.join(dirname, "output", filename))

        # openCC
        self.ccs2t = OpenCC(self.glo_setting.get("zht_out_style", "s2t"))
        self.cct2s = OpenCC("t2s")

        # filter
        self.black_list = set(self.glo_setting["black-list"])
        self.black_list_group = set(self.glo_setting["black-list-group"])
        self.white_list_group = set(self.glo_setting["white-list-group"])

        # update runtime variables
        self.glo_setting.update({
            "dirname": dirname,
            "verinfo": verinfo
        })
        kwargs = {
            "glo_setting": self.glo_setting,
            "bot_api": bot_api,
            "scheduler": scheduler,
            "app": quart_app,
        }

        # load plugins
        plug_all = [
            updater.Updater(**kwargs),
            switcher.Switcher(**kwargs),
            yobot_msg.Message(**kwargs),
            gacha.Gacha(**kwargs),
            jjc_consult.Consult(**kwargs),
            push_news.News(**kwargs),
            calender.Event(**kwargs),
            homepage.Index(**kwargs),
            marionette.Marionette(**kwargs),
            login.Login(**kwargs),
            settings.Setting(**kwargs),
            web_util.WebUtil(**kwargs),
            clan_battle.ClanBattle(**kwargs),
        ]
        self.plug_passive = [p for p in plug_all if p.Passive]
        self.plug_active = [p for p in plug_all if p.Active]
        for p in plug_all:
            if p.Request:
                p.register_routes(quart_app)

        # load new plugins
        self.plug_new = [
            miner.Miner(**kwargs),
            group_leave.GroupLeave(**kwargs),
            custom.Custom(**kwargs),
        ]

    def active_jobs(self) -> List[Tuple[Any, Callable[[], Iterable[Dict[str, Any]]]]]:
        jobs = [p.jobs() for p in self.plug_active]
        return reduce(lambda x, y: x + y, jobs)

    async def proc_async(self, msg: dict, *args, **kwargs) -> str:
        '''
        receive a message and return a reply
        '''
        # prefix
        if self.glo_setting.get("preffix_on", False):
            preffix = self.glo_setting.get("preffix_string", "")
            if not msg["raw_message"].startswith(preffix):
                return None
            else:
                msg["raw_message"] = msg["raw_message"][len(preffix):]

        # black-list
        if msg["sender"]["user_id"] in self.black_list:
            return None
        if msg["message_type"] == "group":
            if self.glo_setting["white_list_mode"]:
                if msg["group_id"] not in self.white_list_group:
                    return None
            else:
                if msg["group_id"] in self.black_list_group:
                    return None

        # zht-zhs conversion
        if self.glo_setting.get("zht_in", False):
            msg["raw_message"] = self.cct2s.convert(msg["raw_message"])
        if msg["sender"].get("card", "") == "":
            msg["sender"]["card"] = msg["sender"].get("nickname", "无法获取昵称")

        # run new plugins
        reply_msg = None
        for plug in self.plug_new:
            ret = await plug.execute_async(msg)
            if ret is None:
                continue
            elif isinstance(ret, bool):
                if ret:
                    break
                else:
                    continue
            elif isinstance(ret, str):
                reply_msg = ret
                break
            else:
                raise ValueError(
                    'unsupported return type: {}'.format(type(ret)))
        if reply_msg:
            if self.glo_setting.get("zht_out", False):
                reply_msg = self.ccs2t.convert(reply_msg)
            return reply_msg

        # run passive plugins
        replys = []
        for pitem in self.plug_passive:
            if hasattr(pitem, 'match'):
                func_num = pitem.match(msg["raw_message"])
            else:
                func_num = True
            if func_num:
                if hasattr(pitem, "execute_async"):
                    res = await pitem.execute_async(func_num, msg)
                else:
                    res = pitem.execute(func_num, msg)
                if res is None:
                    continue
                if isinstance(res, str):
                    replys.append(res)
                    break
                if res["reply"]:
                    replys.append(res["reply"])
                if res["block"]:
                    break
        reply_msg = "\n".join(replys)

        # zhs-zht conversion
        if self.glo_setting.get("zht_out", False):
            reply_msg = self.ccs2t.convert(reply_msg)
        return reply_msg

    def execute(self, cmd: str, *args, **kwargs):
        if cmd == "update":
            res = self.plug_passive[0].execute(0x30)
            return res["reply"]
def test_unicode_zht2zhs(self):
    c = OpenCC('zht2zhs.ini')
    self.assertEqual(c.convert(u'開放中文轉換'), u'开放中文转换')
    c.close()
from pyrogram import Client, filters
from pyrogram.types import InlineKeyboardMarkup, InlineKeyboardButton
from opencc import OpenCC

from config import Config
import wget

Jebot = Client(
    "YT Downloader",
    api_id=Config.APP_ID,
    api_hash=Config.API_HASH,
    bot_token=Config.TG_BOT_TOKEN,
)

YTDL_REGEX = (r"^((?:https?:)?\/\/)"
              r"?((?:www|m)\.)"
              r"?((?:youtube\.com|youtu\.be|xvideos\.com|pornhub\.com"
              r"|xhamster\.com|xnxx\.com))"
              r"(\/)([-a-zA-Z0-9()@:%_\+.~#?&//=]*)([\w\-]+)(\S+)?$")

s2tw = OpenCC('s2tw.json').convert


@Jebot.on_message(filters.command("start"))
async def start(client, message):
    if message.chat.type == 'private':
        await Jebot.send_message(
            chat_id=message.chat.id,
            text="""<b>Hey There, I'm AnyDL Bot
I can download video or audio from Youtube.
Made by @ImJanindu 🇱🇰
Hit help button to find out more about how to use me</b>""",
            reply_markup=InlineKeyboardMarkup(
                [[
                    InlineKeyboardButton(
def test_convert_text(self):
    c = OpenCC('zhs2zht.ini')
    try:
        c.convert(3)
    except TypeError, e:
        self.assertEqual(e.message, 'TypeError: must be string or buffer.')
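The test above uses Python 2 syntax (`except TypeError, e`), matching the other .ini-based tests in this set. Under Python 3 the same check would typically be written with assertRaises; a sketch, assuming the binding raises TypeError the same way:

def test_convert_text(self):
    c = OpenCC('zhs2zht.ini')
    # convert() should reject non-string input
    with self.assertRaises(TypeError):
        c.convert(3)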
self.whole_sum = ""
self.transcript = []
self.footnote = []

abs_len = []
document_len = []
document_len_big_sent = []
document_len_small_sent = []
whole_p = 0
all_data = []
coverage = 0
big_punc = ["。", "?", "!", "?", "!"]
small_punc = [",", ";", ",", ";"]
cc = OpenCC('tw2sp')
list_dir_path = "www.ted.com"
list_dir = os.listdir(list_dir_path)
print(len(list_dir))
ted_map = {}
if not os.path.exists("ted_zh"):
    os.mkdir("ted_zh")
if not os.path.exists("ted_en"):
    os.mkdir("ted_en")
with tqdm(total=len(list_dir)) as pbar:
    for html_file in list_dir:
        pbar.update(1)
        htmlfile_reader = open(os.path.join(list_dir_path, html_file), 'r',
                               encoding='utf-8')
class OpenCCTest(unittest.TestCase):
    def setUp(self):
        self.openCC = OpenCC()

    def test_hk2s(self):
        self.openCC.set_conversion('hk2s')
        words = '香煙(英語:Cigarette),為煙草製品的一種。滑鼠是一種很常見及常用的電腦輸入設備。'
        self.assertEqual(self.openCC.convert(words),
                         '香烟(英语:Cigarette),为烟草制品的一种。滑鼠是一种很常见及常用的电脑输入设备。')

    def test_s2hk(self):
        self.openCC.set_conversion('s2hk')
        words = '香烟(英语:Cigarette),为烟草制品的一种。鼠标是一种很常见及常用的电脑输入设备。'
        self.assertEqual(self.openCC.convert(words),
                         '香煙(英語:Cigarette),為煙草製品的一種。鼠標是一種很常見及常用的電腦輸入設備。')

    def test_s2t(self):
        self.openCC.set_conversion('s2t')
        words = '香烟(英语:Cigarette),为烟草制品的一种。鼠标是一种很常见及常用的电脑输入设备。'
        self.assertEqual(self.openCC.convert(words),
                         '香菸(英語:Cigarette),爲菸草製品的一種。鼠標是一種很常見及常用的電腦輸入設備。')

    def test_s2tw(self):
        self.openCC.set_conversion('s2tw')
        words = '香烟(英语:Cigarette),为烟草制品的一种。鼠标是一种很常见及常用的电脑输入设备。'
        self.assertEqual(self.openCC.convert(words),
                         '香菸(英語:Cigarette),為菸草製品的一種。鼠標是一種很常見及常用的電腦輸入設備。')

    def test_s2twp(self):
        self.openCC.set_conversion('s2twp')
        words = '香烟(英语:Cigarette),为烟草制品的一种。内存是一种很常见及常用的电脑输入设备。'
        self.assertEqual(self.openCC.convert(words),
                         '香菸(英語:Cigarette),為菸草製品的一種。記憶體是一種很常見及常用的電腦輸入裝置。')

    def test_t2hk(self):
        self.openCC.set_conversion('t2hk')
        words = '香菸(英語:Cigarette),爲菸草製品的一種。滑鼠是一種很常見及常用的電腦輸入裝置。'
        self.assertEqual(self.openCC.convert(words),
                         '香煙(英語:Cigarette),為煙草製品的一種。滑鼠是一種很常見及常用的電腦輸入裝置。')

    def test_t2s(self):
        self.openCC.set_conversion('t2s')
        words = '香菸(英語:Cigarette),爲菸草製品的一種。滑鼠是一種很常見及常用的電腦輸入裝置。'
        self.assertEqual(self.openCC.convert(words),
                         '香烟(英语:Cigarette),为烟草制品的一种。滑鼠是一种很常见及常用的电脑输入装置。')

    def test_t2tw(self):
        self.openCC.set_conversion('t2tw')
        words = '香菸(英語:Cigarette),爲菸草製品的一種。鼠標是一種很常見及常用的電腦輸入設備。'
        self.assertEqual(self.openCC.convert(words),
                         '香菸(英語:Cigarette),為菸草製品的一種。鼠標是一種很常見及常用的電腦輸入設備。')

    def test_tw2s(self):
        self.openCC.set_conversion('tw2s')
        words = '香菸(英語:Cigarette),為菸草製品的一種。滑鼠是一種很常見及常用的電腦輸入裝置。'
        self.assertEqual(self.openCC.convert(words),
                         '香烟(英语:Cigarette),为烟草制品的一种。滑鼠是一种很常见及常用的电脑输入装置。')

    def test_tw2sp(self):
        self.openCC.set_conversion('tw2sp')
        words = '香菸(英語:Cigarette),為菸草製品的一種。記憶體是一種很常見及常用的電腦輸入裝置。'
        self.assertEqual(self.openCC.convert(words),
                         '香烟(英语:Cigarette),为烟草制品的一种。内存是一种很常见及常用的电脑输入设备。')
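One thing the expectations above make visible: the conversion pairs are not exact inverses, because variant mappings are lossy (s2t maps 烟 to 菸, while t2hk maps 菸 to 煙, a different glyph from the one we started with). A small sketch of that asymmetry, using the same API and the same words as the test class:

from opencc import OpenCC

cc = OpenCC()
cc.set_conversion('s2t')
traditional = cc.convert('香烟')   # '香菸', per test_s2t
cc.set_conversion('t2hk')
print(cc.convert(traditional))     # '香煙', per test_t2hk: not the original glyph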
def main(fin, fout):
    cc = OpenCC('t2s')
    for line in fin:
        line2 = cc.convert(line)
        fout.write(line2)
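A usage sketch for the stream converter above (the file names are hypothetical): because it iterates line by line, it handles files of any size without loading them into memory.

if __name__ == '__main__':
    # convert a Traditional Chinese text file to Simplified Chinese
    with open('input.zht.txt', encoding='utf-8') as fin, \
            open('output.zhs.txt', 'w', encoding='utf-8') as fout:
        main(fin, fout)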
# import tensorflow as tf
import numpy as np
import pandas as pd
import keras
import CKIPClient_python3 as parser
from time import sleep
import os.path
import pickle
from IPython import embed
from opencc import OpenCC

openCC = OpenCC('s2t')

PAD = 3
EOS = 1
UNK = 2
BOS = 0


def sql_to_csv():
    D = pd.read_sql('select * from "hotel-review"', 'sqlite:///HR_30.db')
    del D['id']
    D['review'] = [openCC.convert(sen) for sen in D['review']]
    D.to_csv('data/raw.csv', index=False)


def raw_to_ckip_parse():
    import re
    # if not os.path.isfile('data/raw.csv'):
    #     raise Exception('No raw csv')
    D = pd.read_csv('data/raw.csv')
def test_base_zhs2zht(self):
    c = OpenCC('zhs2zht.ini')
    self.assertEqual(c.convert('开放中文转换'), '開放中文轉換')
    c.close()
class RimeStyle:
    # NOTE: these class-level lists are shared across instances; __init__
    # clears options/options_states/uris in place rather than rebinding them.
    font_face = "MingLiu"
    candidate_format = "{0} {1}"
    inline_preedit = "false"
    menu_opencc = None
    font_point = 20
    candidate_per_row = 1
    inline_code = False
    display_tray_icon = False
    candidate_use_cursor = False
    soft_cursor = False
    menu = []
    options = []
    options_states = []
    schemas = []
    uris = []
    session_id = None

    def __init__(self, appname, session_id):
        self.session_id = session_id
        config = RimeConfig()
        if not rime.config_open(appname.encode("UTF-8"), config):
            return
        self.font_face = rimeGetString(config, 'style/font_face')
        self.candidate_format = rimeGetString(config, 'style/candidate_format')
        self.inline_preedit = rimeGetString(config, 'style/inline_preedit')
        menu_opencc_config = rimeGetString(config, 'style/menu_opencc')
        self.menu_opencc = (OpenCC(menu_opencc_config)
                            if menu_opencc_config else None)
        value = c_int()
        if rime.config_get_int(config, b'style/font_point', value):
            self.font_point = value.value
        if rime.config_get_bool(config, b'style/horizontal', value):
            self.candidate_per_row = 10 if bool(value) else 1
        if rime.config_get_int(config, b'style/candidate_per_row', value):
            self.candidate_per_row = value.value
        if rime.config_get_bool(config, b'style/display_tray_icon', value):
            self.display_tray_icon = bool(value)
        if rime.config_get_bool(config, b'style/candidate_use_cursor', value):
            self.candidate_use_cursor = bool(value)
        if rime.config_get_bool(config, b'style/soft_cursor', value):
            self.soft_cursor = bool(value)
        self.options.clear()
        self.options_states.clear()
        self.uris.clear()
        self.menu = self.config_get_menu(config, b'menu')
        # print("menu", self.menu)
        rime.config_close(config)

    def get_schema(self, commandId):
        if commandId >= ID_SCHEMA:
            return self.schemas[commandId - ID_SCHEMA]

    def get_option(self, commandId):
        if commandId >= ID_OPTION:
            return self.options[commandId - ID_OPTION]

    def get_uri(self, commandId):
        if commandId >= ID_URI:
            return self.uris[commandId - ID_URI]

    def get_schema_list(self):
        schema_list = RimeSchemaList()
        self.schemas = []
        submenu = []
        current_schema = bytes(CHAR_SIZE)
        rime.get_current_schema(self.session_id, current_schema, CHAR_SIZE)
        current_schema_id = current_schema.rstrip(b'\0')
        if rime.get_schema_list(schema_list):
            n = schema_list.size
            for i in range(n):
                schema_id = schema_list.list[i].schema_id
                name = schema_list.list[i].name.decode("UTF-8")
                if self.menu_opencc:
                    name = self.menu_opencc.convert(name)
                self.schemas.append(schema_id)
                d = {'text': name, 'id': ID_SCHEMA + i}
                if schema_id == current_schema_id:
                    d["checked"] = True
                submenu.append(d)
        rime.free_schema_list(schema_list)
        return submenu

    def config_get_menu(self, config, path):
        menu = []
        iterator = RimeConfigIterator()
        if not rime.config_begin_list(iterator, config, path):
            return
        while rime.config_next(iterator):
            d = {}
            name = rime.config_get_cstring(config, iterator.path + b'/name')
            command = rime.config_get_cstring(config,
                                              iterator.path + b'/command')
            uri = rime.config_get_cstring(config, iterator.path + b'/uri')
            text = rime.config_get_cstring(config, iterator.path + b'/text')
            if command:
                d["id"] = commands.get(command.decode("UTF-8"), 0)
                if ID_SCHEMA_LIST == d["id"]:
                    d["submenu"] = self.get_schema_list()
                elif ID_SYNC_DIR == d["id"]:
                    d["enabled"] = os.path.isdir(
                        rime.get_sync_dir().decode(ENC))
            elif uri:
                d["id"] = ID_URI + len(self.uris)
                self.uris.append(uri.decode("UTF-8"))
            elif name:
                states = [rime.config_get_cstring(
                              config, iterator.path + b'/states/@0').decode("UTF-8"),
                          rime.config_get_cstring(
                              config, iterator.path + b'/states/@1').decode("UTF-8")]
                d["id"] = ID_OPTION + len(self.options)
                state_id = rime.get_option(self.session_id, name)
                d["text"] = "%s → %s" % (states[state_id],
                                         states[1 - state_id])
                self.options_states.append(states)
                self.options.append(name)
            if text:
                d["text"] = text.decode("UTF-8")
                if self.menu_opencc:
                    d["text"] = self.menu_opencc.convert(d["text"])
            submenu = self.config_get_menu(config, iterator.path + b'/submenu')
            if submenu:
                d["submenu"] = submenu
            menu.append(d)
        rime.config_end(iterator)
        return menu