def test_conversion_mode(self):
    c = OpenCC('zhs2zht.ini')
    c.set_conversion_mode(0)
    try:
        c.set_conversion_mode(3)
    except ValueError as e:
        self.assertEqual(str(e), 'ValueError: conversion mode must be in [0,1,2].')
def test_convert2():
    cc = OpenCC()
    text = '乾坤一擲'
    expect = '乾坤一掷'
    assert cc.convert(text) == expect
    text = '開放中文轉換'
    expect = '开放中文转换'
    assert cc.convert(text) == expect
def __init__(self, appname, session_id):
    self.session_id = session_id
    config = RimeConfig()
    if not rime.config_open(appname.encode("UTF-8"), config):
        return
    self.font_face = rimeGetString(config, 'style/font_face')
    self.candidate_format = rimeGetString(config, 'style/candidate_format')
    self.inline_preedit = rimeGetString(config, 'style/inline_preedit')
    menu_opencc_config = rimeGetString(config, 'style/menu_opencc')
    self.menu_opencc = OpenCC(menu_opencc_config) if menu_opencc_config else None
    value = c_int()
    if rime.config_get_int(config, b'style/font_point', value):
        self.font_point = value.value
    if rime.config_get_bool(config, b'style/horizontal', value):
        self.candidate_per_row = 10 if bool(value) else 1
    if rime.config_get_int(config, b'style/candidate_per_row', value):
        self.candidate_per_row = value.value
    if rime.config_get_bool(config, b'style/display_tray_icon', value):
        self.display_tray_icon = bool(value)
    if rime.config_get_bool(config, b'style/candidate_use_cursor', value):
        self.candidate_use_cursor = bool(value)
    if rime.config_get_bool(config, b'style/soft_cursor', value):
        self.soft_cursor = bool(value)
    self.options.clear()
    self.options_states.clear()
    self.uris.clear()
    self.menu = self.config_get_menu(config, b'menu')
    # print("menu", self.menu)
    rime.config_close(config)
def __init__(self, *,
             data_path: str,
             scheduler: AsyncIOScheduler,
             quart_app: Quart,
             bot_api: Api,
             verinfo: str = None):
    # initialize config
    is_packaged = "_MEIPASS" in dir(sys)
    if is_packaged:
        basepath = os.path.dirname(sys.argv[0])
    else:
        basepath = os.path.dirname(__file__)
    dirname = os.path.abspath(os.path.join(basepath, data_path))
    if not os.path.exists(dirname):
        os.makedirs(dirname)
    config_f_path = os.path.join(dirname, "yobot_config.json")
    if is_packaged:
        default_config_f_path = os.path.join(
            sys._MEIPASS, "packedfiles", "default_config.json")
    else:
        default_config_f_path = os.path.join(
            os.path.dirname(__file__), "packedfiles", "default_config.json")
    with open(default_config_f_path, "r", encoding="utf-8") as config_file:
        self.glo_setting = json.load(config_file)
    if not os.path.exists(config_f_path):
        with open(config_f_path, "w") as f:
            f.write("{}")
        print("设置已初始化,发送help获取帮助")
    boss_filepath = os.path.join(dirname, "boss3.json")
    if not os.path.exists(boss_filepath):
        if is_packaged:
            default_boss_filepath = os.path.join(
                sys._MEIPASS, "packedfiles", "default_boss.json")
        else:
            default_boss_filepath = os.path.join(
                os.path.dirname(__file__), "packedfiles", "default_boss.json")
        shutil.copyfile(default_boss_filepath, boss_filepath)
    pool_filepath = os.path.join(dirname, "pool3.json")
    if not os.path.exists(pool_filepath):
        if is_packaged:
            default_pool_filepath = os.path.join(
                sys._MEIPASS, "packedfiles", "default_pool.json")
        else:
            default_pool_filepath = os.path.join(
                os.path.dirname(__file__), "packedfiles", "default_pool.json")
        shutil.copyfile(default_pool_filepath, pool_filepath)
    for e in os.environ:
        if e.startswith("YOBOT_"):
            k = e[6:].lower()
            self.glo_setting[k] = os.environ[e]
    with open(config_f_path, "r", encoding="utf-8-sig") as config_file:
        cfg = json.load(config_file)
        for k in self.glo_setting.keys():
            if k in cfg:
                self.glo_setting[k] = cfg[k]
    if verinfo is None:
        verinfo = updater.get_version(self.Version, self.Version_id)
        print(verinfo['ver_name'])
    # initialize database
    ybdata.init(os.path.join(dirname, 'yobotdata.db'))
    # enable gzip
    if self.glo_setting["web_gzip"] > 0:
        gzipped_types = {'text/html', 'text/javascript',
                         'text/css', 'application/json'}

        @quart_app.after_request
        async def gzip_response(response):
            accept_encoding = request.headers.get('Accept-Encoding', '')
            if (response.status_code < 200
                    or response.status_code >= 300
                    or len(await response.get_data()) < 1024
                    or 'gzip' not in accept_encoding.lower()
                    or 'Content-Encoding' in response.headers):
                return response
            gzip_buffer = BytesIO()
            gzip_file = gzip.GzipFile(
                mode='wb',
                compresslevel=self.glo_setting["web_gzip"],
                fileobj=gzip_buffer)
            gzip_file.write(await response.get_data())
            gzip_file.close()
            gzipped_response = gzip_buffer.getvalue()
            response.set_data(gzipped_response)
            response.headers['Content-Encoding'] = 'gzip'
            response.headers['Content-Length'] = len(gzipped_response)
            return response

    # initialize web path
    if not self.glo_setting.get("public_address"):
        try:
            res = requests.get("http://api.ipify.org/")
            ipaddr = res.text
        except:
            with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
                s.connect(("8.8.8.8", 53))
                ipaddr = s.getsockname()[0]
        self.glo_setting["public_address"] = "http://{}:{}/".format(
            ipaddr,
            self.glo_setting["port"],
        )
    if not self.glo_setting["public_address"].endswith("/"):
        self.glo_setting["public_address"] += "/"
    if not self.glo_setting["public_basepath"].startswith("/"):
        self.glo_setting["public_basepath"] = \
            "/" + self.glo_setting["public_basepath"]
    if not self.glo_setting["public_basepath"].endswith("/"):
        self.glo_setting["public_basepath"] += "/"
    # initialize update time
    if self.glo_setting["update-time"] == "random":
        self.glo_setting["update-time"] = "{:02d}:{:02d}".format(
            random.randint(2, 4),
            random.randint(0, 59)
        )
    # initialize client salt
    if self.glo_setting["client_salt"] is None:
        self.glo_setting["client_salt"] = web_util.rand_string(16)
    # save initialization
    with open(config_f_path, "w", encoding="utf-8") as config_file:
        json.dump(self.glo_setting, config_file, indent=4)
    # initialize utils
    templating.Ver = self.Version[2:-1]
    # generate random secret_key
    if quart_app.secret_key is None:
        quart_app.secret_key = bytes(
            (random.randint(0, 255) for _ in range(16)))
    # add mimetype
    mimetypes.init()
    mimetypes.add_type('application/javascript', '.js')
    mimetypes.add_type('image/webp', '.webp')

    # add route for static files
    @quart_app.route(
        urljoin(self.glo_setting["public_basepath"],
                "assets/<path:filename>"),
        methods=["GET"])
    async def yobot_static(filename):
        accept_encoding = request.headers.get('Accept-Encoding', '')
        origin_file = os.path.join(os.path.dirname(__file__),
                                   "public", "static", filename)
        if ('gzip' not in accept_encoding.lower()
                or self.glo_setting['web_gzip'] == 0):
            return await send_file(origin_file)
        gzipped_file = os.path.abspath(os.path.join(
            os.path.dirname(__file__),
            "public", "static",
            filename + "." + self.Version[1:-1] + ".gz",
        ))
        if not os.path.exists(gzipped_file):
            if not os.path.exists(origin_file):
                return "404 not found", 404
            with open(origin_file, 'rb') as of, open(gzipped_file, 'wb') as gf:
                with gzip.GzipFile(
                    mode='wb',
                    compresslevel=self.glo_setting["web_gzip"],
                    fileobj=gf,
                ) as gzip_file:
                    gzip_file.write(of.read())
        response = await make_response(await send_file(gzipped_file))
        response.mimetype = (
            mimetypes.guess_type(os.path.basename(origin_file))[0]
            or "application/octet-stream"
        )
        response.headers['Content-Encoding'] = 'gzip'
        response.headers['Vary'] = 'Accept-Encoding'
        return response

    # add route for output files
    if not os.path.exists(os.path.join(dirname, "output")):
        os.mkdir(os.path.join(dirname, "output"))

    @quart_app.route(
        urljoin(self.glo_setting["public_basepath"],
                "output/<path:filename>"),
        methods=["GET"])
    async def yobot_output(filename):
        return await send_file(os.path.join(dirname, "output", filename))

    # openCC
    self.ccs2t = OpenCC(self.glo_setting.get("zht_out_style", "s2t"))
    self.cct2s = OpenCC("t2s")
    # filter
    self.black_list = set(self.glo_setting["black-list"])
    self.black_list_group = set(self.glo_setting["black-list-group"])
    self.white_list_group = set(self.glo_setting["white-list-group"])
    # update runtime variables
    self.glo_setting.update({
        "dirname": dirname,
        "verinfo": verinfo
    })
    kwargs = {
        "glo_setting": self.glo_setting,
        "bot_api": bot_api,
        "scheduler": scheduler,
        "app": quart_app,
    }
    # load plugins
    plug_all = [
        updater.Updater(**kwargs),
        switcher.Switcher(**kwargs),
        yobot_msg.Message(**kwargs),
        gacha.Gacha(**kwargs),
        jjc_consult.Consult(**kwargs),
        push_news.News(**kwargs),
        calender.Event(**kwargs),
        homepage.Index(**kwargs),
        marionette.Marionette(**kwargs),
        login.Login(**kwargs),
        settings.Setting(**kwargs),
        web_util.WebUtil(**kwargs),
        clan_battle.ClanBattle(**kwargs),
    ]
    self.plug_passive = [p for p in plug_all if p.Passive]
    self.plug_active = [p for p in plug_all if p.Active]
    for p in plug_all:
        if p.Request:
            p.register_routes(quart_app)
    # load new plugins
    self.plug_new = [
        miner.Miner(**kwargs),
        group_leave.GroupLeave(**kwargs),
        custom.Custom(**kwargs),
    ]
import re

import jieba
from opencc import OpenCC
import nltk

nltk.download('punkt')
from nltk.tokenize import word_tokenize

openCC = OpenCC('t2s')
openCC_final = OpenCC('s2t')


def word_tokenize_fishbracket(en_sent):
    # Tokenize, then re-join angle-bracket tokens like "<s>" that NLTK
    # splits into "<", "s", ">". Note: assumes well-formed "< token >"
    # triples; a stray bracket at the end of the list can raise IndexError.
    en_sent = word_tokenize(en_sent)
    en_sent_out = []
    for i, w in enumerate(en_sent):
        if w == '<' and en_sent[i + 2] == '>':
            continue
        elif w == '>' and en_sent[i - 2] == '<':
            continue
        elif en_sent[i - 1] == '<' and en_sent[i + 1] == '>':
            en_sent_out.append('<' + w + '>')
            continue
        en_sent_out.append(w)
    return en_sent_out


ch_vocab = open('ch_vocab', encoding='utf-8').read().splitlines()
ch_word2id = {}
for line in ch_vocab:
    line = line.split(' ')
    word = line[0]
    ID = line[1]
    ch_word2id[word] = ID
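# Illustrative check (not from the original file), assuming NLTK's punkt data
# has been downloaded: word_tokenize splits "<s>" into "<", "s", ">", and
# word_tokenize_fishbracket re-joins it into a single token.
# word_tokenize_fishbracket('hello <s> world')  ->  ['hello', '<s>', 'world']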
import logging  # required by logger below (import missing from the excerpt)
from html import escape  # required by bold() below (import missing from the excerpt)
from typing import Any, Callable, Optional, Union
from unicodedata import normalize

from cryptography.fernet import Fernet
from opencc import OpenCC
from PIL import Image
from pyrogram import Message, User
from pyrogram.errors import FloodWait

from .. import glovar

# Enable logging
logger = logging.getLogger(__name__)

# Init Opencc
converter = OpenCC(config="t2s.json")


def bold(text: Any) -> str:
    # Get a bold text
    result = ""
    try:
        result = str(text).strip()
        if not result:
            return ""
        result = f"<b>{escape(result)}</b>"
    except Exception as e:
        logger.warning(f"Bold error: {e}", exc_info=True)
    return result
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import csv
import os

import pandas as pd
import numpy as np
import modeling
import optimization
import tokenization
import tensorflow as tf
from opencc import OpenCC

cc = OpenCC("t2s")

flags = tf.flags

FLAGS = flags.FLAGS

## Required parameters
flags.DEFINE_string(
    "data_dir", None,
    "The input data dir. Should contain the .tsv files (or other data files) "
    "for the task.",
)

flags.DEFINE_string(
    "bert_config_file",
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
Created on Mon Dec 23 22:56:08 2019

@author: kuangchen
"""
import regex
from bs4 import BeautifulSoup
from opencc import OpenCC

from file_io import save_data_list

cc = OpenCC('t2s')


def convert(texts):
    return [cc.convert(text) for text in texts]


def transform_2013(html_string):
    soup = BeautifulSoup(html_string, 'lxml')
    error_texts = []
    correct_texts = []
    sentences = soup.find_all('p')
    mistakes = soup.find_all('mistake')
    for sentence in sentences:
        text = sentence.get_text().strip()
        error_text = text
        correct_text = error_text[:]
def convert_chinese(content):
    content = OpenCC('t2s').convert(content)
    return content
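# A minimal variant sketch (names are mine, not from the original): building
# the converter once at module level avoids reloading the OpenCC dictionaries
# on every call, which the one-liner above pays for each time.
_T2S = OpenCC('t2s')


def convert_chinese_cached(content):
    # Same result as convert_chinese, but reuses a single converter instance.
    return _T2S.convert(content)

# e.g. convert_chinese_cached('漢字轉換') == '汉字转换'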
def get_txt(self, txt_id, state, threadnum):
    titlem = ''
    intro = ''
    ids = str(txt_id)
    percent = 0
    self.state = state
    # Article URL
    req_url = ids
    # Fetch the article page with cookies
    res = requests.get(req_url, headers=self.headerss).content
    # Re-encode the page
    ress = etree.HTML(
        res.decode("GB18030", "ignore").encode("utf-8", "ignore").decode('utf-8'))
    # Blurb
    intro = ress.xpath(
        "//html/body/table/tr/td[1]/div[2]/div[@id='novelintro']//text()")
    # Tags
    info = ress.xpath("string(/html/body/table[1]/tr/td[1]/div[3])")
    infox = []
    for i in range(1, 7):
        infox.append(
            ress.xpath(
                "string(/html/body/table[1]/tr/td[3]/div[2]/ul/li[" + str(i) + "])"))
    # Cover image
    cover = ress.xpath("string(/html/body/table[1]/tr/td[1]/div[2]/img/@src)")
    if cover != '':
        pres = requests.get(cover)
        img = pres.content
    else:
        img = "0"
    # Title
    titlem = ress.xpath("//html/head/title/text()")
    if self.state == 's':
        titlem[0] = OpenCC('t2s').convert(titlem[0])
    elif self.state == 't':
        titlem[0] = OpenCC('s2t').convert(titlem[0])
    print("网址:" + ids + "\r\n小说信息:" + str(titlem[0]) + "\r\n")
    # All chapter URLs, titles, and summaries
    self.td = ress.xpath('//*[@id="oneboolt"]//tr')
    for i in self.td:
        u = i.xpath('./td[2]/span/div[1]/a/@href')
        x = i.xpath('./td[2]/span/div[1]/a[1]/@rel')
        if len(u) > 0:
            self.href_list += u
            v = i.xpath('./td[2]/span/div[1]/a/text()')[0].strip()
            v = re.sub('&amp;', '&', v)
            v = re.sub('&gt;', '>', v)
            v = re.sub('&lt;', '<', v)
            self.titleindex.append(v)
            v = i.xpath('./td[3]/text()')[0].strip()
            v = re.sub('&amp;', '&', v)
            v = re.sub('&gt;', '>', v)
            v = re.sub('&lt;', '<', v)
            self.Summary.append(v)
        elif len(x) > 0:
            self.href_list += x
            v = i.xpath('./td[2]/span/div[1]/a/text()')[0].strip()
            v = re.sub('&amp;', '&', v)
            v = re.sub('&gt;', '>', v)
            v = re.sub('&lt;', '<', v)
            self.titleindex.append(v)
            v = i.xpath('./td[3]/text()')[0].strip()
            v = re.sub('&amp;', '&', v)
            v = re.sub('&gt;', '>', v)
            v = re.sub('&lt;', '<', v)
            self.Summary.append(v)
    # Volume label names
    self.rollSign = ress.xpath(
        "//*[@id='oneboolt']//tr/td/b[@class='volumnfont']/text()")
    # Volume label positions
    self.rollSignPlace = ress.xpath(
        "//*[@id='oneboolt']//tr/td/b/ancestor-or-self::tr/following-sibling::tr[1]/td[2]/span/div[1]/a[1]/@href"
    )
    self.rollSignPlace += ress.xpath(
        "//*[@id='oneboolt']//tr/td/b/ancestor-or-self::tr/following-sibling::tr[1]/td[2]/span/div[1]/a[1]/@rel"
    )
    section_ct = len(self.href_list)
    print("可下载章节数:" + str(section_ct) + "\r\n")
    # fillNum: width of the zero-padded chapter number; e.g. a novel with
    # 1437 chapters gets 4-digit numbers: 0001, 0002, ...
    self.fillNum = len(str(len(self.td) - 4))
    # Sanitize the title: remove characters that are illegal in file names
    ti = str(titlem[0]).split('_')
    ti = ti[0]
    ti = re.sub('/', '_', ti)
    ti = re.sub(r'\\', '_', ti)
    ti = re.sub(r'\|', '_', ti)
    ti = re.sub(r'\*', '', ti)
    ti = re.sub('&amp;', '&', ti)
    xaut = ti.split('》')[1]
    xauthref = ress.xpath("//*[@id='oneboolt']//h2/a/@href")[0]
    xtitle = re.sub('《', '', ti.split('》')[0])
    # Delete this line if you do not want the novel ID appended to the file name
    ti = ti + '[' + ids.split('=')[1] + ']'
    ti = re.sub('\r', '', ti)
    v = ""
    # Create the novel directory and write the metadata files
    path = os.getcwd()
    if os.path.exists(ti):
        os.chdir(ti)
    else:
        os.mkdir(ti)
        os.chdir(ti)
    self.index = []
    # Save the cover image
    if img != "0":
        pic = open("p.jpg", 'wb')
        pic.write(img)
        pic.close()
    # Write the cover page
    f = open("C.xhtml", 'w', encoding='utf-8')
    f.write('''<?xml version="1.0" encoding="utf-8"?><!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops">
<head><title></title></head><body><img alt="p" src="p.jpg"/></body></html>''')
    f.close()
    # Write the info (TOC) page
    fo = open("TOC.xhtml", 'w', encoding='utf-8')
    fo.write('''<?xml version="1.0" encoding="utf-8"?><!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops">
<head><title></title></head><body>''')
    fo.write("<h2><center><a href='" + req_url + "'>" + xtitle +
             "</a></center></h2><p></p>")
    fo.write("<h3 class='sigil_not_in_toc'><center><a href='" + xauthref +
             "'>" + xaut + "</a></center></h3><p></p>")
    fo.write('''<blockquote class="userstuff">''')
    # self.index.append(titlem[0])
    # Build the TOC text
    for l in self.href_list:
        titleOrigin = l.split('=')
        i = self.href_list.index(l)
        title = str(titleOrigin[2]).zfill(self.fillNum) + " "
        title = title + self.titleindex[i].strip()
        title = title + " " + self.Summary[i].strip()
        if self.state == 's':
            title = OpenCC('t2s').convert(title)
        elif self.state == 't':
            title = OpenCC('s2t').convert(title)
        if self.href_list[i] in self.rollSignPlace:
            v = self.rollSign[self.rollSignPlace.index(l)]
            if self.state == 's':
                v = OpenCC('t2s').convert(
                    self.rollSign[self.rollSignPlace.index(l)])
            elif self.state == 't':
                v = OpenCC('s2t').convert(
                    self.rollSign[self.rollSignPlace.index(l)])
            v = re.sub('&', '&amp;', v).rstrip()  # escape & before < and >
            v = re.sub('>', '&gt;', v)
            v = re.sub('<', '&lt;', v)
            self.index.append(v)
        title = re.sub('&', '&amp;', title).rstrip()  # escape & before < and >
        title = re.sub('>', '&gt;', title)
        title = re.sub('<', '&lt;', title)
        self.index.append(title)
    for ix in infox:
        ix = ix.strip()
        ix = re.sub('\r\n', '', ix)
        ix = re.sub(' +', '', ix)
        ix = re.sub('&', '&amp;', ix)
        ix = re.sub('>', '&gt;', ix)
        ix = re.sub('<', '&lt;', ix)
        fo.write("<p>" + ix + "</p>")
    fo.write("</blockquote>")
    fo.write("<p><b>文案:</b></p>")
    for nx in intro:
        v = re.sub(' +', ' ', str(nx)).rstrip()
        v = re.sub('&', '&amp;', v).rstrip()
        v = re.sub('>', '&gt;', v)
        v = re.sub('<', '&lt;', v)
        if self.state == 's':
            v = OpenCC('t2s').convert(v)
        elif self.state == 't':
            v = OpenCC('s2t').convert(v)
        if v != "":
            fo.write("<p>" + v + "</p>")
    info = re.sub(' +', ' ', info).strip()
    info = re.sub('&', '&amp;', info)
    info = re.sub('>', '&gt;', info)
    info = re.sub('<', '&lt;', info)
    if self.state == 's':
        info = OpenCC('t2s').convert(info)
    elif self.state == 't':
        info = OpenCC('s2t').convert(info)
    info = re.sub('搜索关键字', '</p><p>搜索关键字', info)
    info = re.sub('一句话简介:', '</p><p>一句话简介:', info)
    fo.write("<p>" + info + "</p>")
    fo.write("</body></html>")
    fo.close()
    count = 0
    tlist = []
    # Download every chapter concurrently
    with concurrent.futures.ThreadPoolExecutor(
            max_workers=threadnum) as executor:
        tlist = {
            executor.submit(self.get_sin, i): i for i in self.href_list
        }
        for future in concurrent.futures.as_completed(tlist):
            if self.percent < section_ct:
                print('\r 下载进度:%d/%d' % (self.percent, section_ct),
                      end='', flush=True)
    '''
    for i in self.href_list:
        thread = Thread(target=self.get_sin, args=(i,))
        tlist.append(thread)
        thread.start()
    for t in tlist:
        t.join()
    print('\r 下载进度:%d/%d\r\n' % (self.percent, section_ct), end='', flush=True)
    '''
    print('\r 下载完成,总进度:%d/%d\r\n' % (self.percent, section_ct),
          end='', flush=True)
    # input("\r\n请按回车键打包epub:")
    # Package as epub
    os.chdir(path)
    epub_name = ti + ".epub"
    epub = zipfile.ZipFile(epub_name, 'w')
    EPUB3.epubfile.create_mimetype(epub)
    EPUB3.epubfile.create_container(epub)
    os.chdir(ti)
    ppp = os.getcwd()
    EPUB3.epubfile.create_content(epub, ppp, xtitle, xaut)
    EPUB3.epubfile.create_info(epub, ppp, self.index, self.rollSign,
                               xtitle + "-" + xaut)
    EPUB3.epubfile.create_stylesheet(epub)
    for html in os.listdir('.'):
        basename = os.path.basename(html)
        if basename.endswith('jpg'):
            epub.write(html, "OEBPS/" + basename,
                       compress_type=zipfile.ZIP_DEFLATED)
        if basename.endswith('html'):
            epub.write(html, "OEBPS/" + basename,
                       compress_type=zipfile.ZIP_DEFLATED)
    epub.close()
    os.chdir(path)
    shutil.rmtree(ppp)
    print("\r\nepub打包完成")
def get_sin(self, l):
    titleOrigin = l.split('=')
    i = self.href_list.index(l)
    cont = requests.get(l, headers=self.headerss).content
    dot = etree.HTML(
        cont.decode("GB18030", "ignore").encode("utf-8", "ignore").decode('utf-8'))
    # tex: chapter body
    tex = dot.xpath(
        "//html/body/table[@id='oneboolt']/tr[2]/td[1]/div[@class='noveltext']/text()"
    )
    # he: chapter heading
    he = dot.xpath(
        "//html/body/table[@id='oneboolt']/tr[2]/td[1]/div[@class='noveltext']/div[2]/h2/text()"
    )
    # tex1: author's notes
    tex1 = dot.xpath(
        "//html/body/table[@id='oneboolt']/tr[2]/td[1]/div[@class='noveltext']/div[@class='readsmall']/text()"
    )
    # sign: position of the author's notes
    sign = dot.xpath("//*[@id='oneboolt']/tr[2]/td[1]/div/div[4]/@class")
    # Zero-padded chapter number
    title = str(titleOrigin[2]).zfill(self.fillNum) + " "
    # Chapter name
    title = title + self.titleindex[i].strip() + " "
    # Chapter summary
    title = title + self.Summary[i].strip()
    if self.state == 's':
        title = OpenCC('t2s').convert(title)
    elif self.state == 't':
        title = OpenCC('s2t').convert(title)
    if self.href_list[i] in self.rollSignPlace:
        v = self.rollSign[self.rollSignPlace.index(l)]
        if self.state == 's':
            v = OpenCC('t2s').convert(
                self.rollSign[self.rollSignPlace.index(l)])
        elif self.state == 't':
            v = OpenCC('s2t').convert(
                self.rollSign[self.rollSignPlace.index(l)])
    if len(he) == 0:
        print("第" + titleOrigin[2] + "章未购买或加载失败")
    else:
        # Create the chapter file
        fo = open("z" + str(titleOrigin[2].zfill(self.fillNum)) + ".xhtml",
                  'w', encoding='utf-8')
        fo.write('''<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops">
<head><title></title></head><body>''')
        # Write the volume label
        if self.href_list[i] in self.rollSignPlace:
            fo.write("<h2>" + v.rstrip() + "</h2>")
            print("\r\n" + v + "\r\n")
            fo.write("<h3 id='v'>" + title + "</h3>")
        # Write the chapter title
        else:
            fo.write("<h3>" + title + "</h3>")
        # Author's notes placed before the text
        if str(sign) == "['readsmall']":
            fo.write('''<blockquote class="userstuff">''')
            for m in tex1:
                # Strip watermark text and redundant whitespace
                vv = re.sub('@无限好文,尽在晋江文学城', '', str(m))
                v = re.sub(' +', ' ', vv).rstrip()
                v = re.sub('&', '&amp;', v).rstrip()
                v = re.sub('>', '&gt;', v)
                v = re.sub('<', '&lt;', v)
                if self.state == 's':
                    v = OpenCC('t2s').convert(v)
                elif self.state == 't':
                    v = OpenCC('s2t').convert(v)
                if v != "":
                    # Write the text line by line
                    fo.write("<p>" + v + "</p>")
            fo.write("</blockquote>")
            if len(tex1) != 0:
                fo.write("<hr/>")
            for tn in tex:
                vv = re.sub('@无限好文,尽在晋江文学城', '', str(tn))
                v = re.sub(' +', ' ', vv).rstrip()
                v = re.sub('&', '&amp;', v).rstrip()
                v = re.sub('>', '&gt;', v)
                v = re.sub('<', '&lt;', v)
                if self.state == 's':
                    v = OpenCC('t2s').convert(v)
                elif self.state == 't':
                    v = OpenCC('s2t').convert(v)
                if v != "":
                    fo.write("<p>" + v + "</p>")
        else:
            # Author's notes placed after the text
            for tn in tex:
                vv = re.sub('@无限好文,尽在晋江文学城', '', str(tn))
                v = re.sub(' +', ' ', vv).rstrip()
                v = re.sub('&', '&amp;', v).rstrip()
                v = re.sub('>', '&gt;', v)
                v = re.sub('<', '&lt;', v)
                if self.state == 's':
                    v = OpenCC('t2s').convert(v)
                elif self.state == 't':
                    v = OpenCC('s2t').convert(v)
                if v != "":
                    fo.write("<p>" + v + "</p>")
            if len(tex1) != 0:
                fo.write("<hr/>")
            fo.write('''<blockquote class="userstuff">''')
            for m in tex1:
                vv = re.sub('@无限好文,尽在晋江文学城', '', str(m))
                v = re.sub(' +', ' ', vv).rstrip()
                v = re.sub('&', '&amp;', v).rstrip()
                v = re.sub('>', '&gt;', v)
                v = re.sub('<', '&lt;', v)
                if self.state == 's':
                    v = OpenCC('t2s').convert(v)
                elif self.state == 't':
                    v = OpenCC('s2t').convert(v)
                if v != "":
                    fo.write("<p>" + v + "</p>")
            fo.write("</blockquote>")
        fo.write("</body></html>")
        fo.close()
    self.percent += 1
class Yobot:
    def __init__(self, *args, **kwargs):
        # self.send_msg = send_msg
        dirname = os.getcwd()
        config_f_path = os.path.join(dirname, "yobot_config.json")
        if not os.path.exists(config_f_path):
            self.glo_setting = dict()
            return
        with open(config_f_path, "r", encoding="utf-8") as config_file:
            try:
                self.glo_setting = json.load(config_file)
            except:
                raise yobot_errors.File_error(config_f_path + " has been damaged")
        inner_info = {
            "dirname": dirname,
            "version": {
                "ver_name": "yobot[v3.1.1]",
                "ver_id": 3101,
                "checktime": 0,
                "latest": True,
                "check_url": [
                    "https://gitee.com/yobot/yobot/raw/master/docs/v3/ver.json",
                    "https://yuudi.github.io/yobot/v3/ver.json",
                    "http://api.yobot.xyz/v3/version/"
                ]
            }
        }
        self.glo_setting.update(inner_info)
        self.ccs2t = OpenCC(self.glo_setting.get("zht_out_style", "s2t"))
        self.cct2s = OpenCC("t2s")
        updater_plugin = updater.Updater(self.glo_setting)
        plug_all = [
            updater_plugin,
            switcher.Switcher(self.glo_setting),
            yobot_msg.Message(self.glo_setting),
            gacha.Gacha(self.glo_setting),
            char_consult.Char_consult(self.glo_setting),
            jjc_consult.Consult(self.glo_setting),
            boss_dmg.Boss_dmg(self.glo_setting),
            push_news.News(self.glo_setting),
            custom.Custom(self.glo_setting)
        ]
        self.plug_passive = [p for p in plug_all if p.Passive]
        self.plug_active = [p for p in plug_all if p.Active]

    def active_jobs(
            self) -> List[Tuple[Any, Callable[[], Iterable[Dict[str, Any]]]]]:
        jobs = [p.jobs() for p in self.plug_active]
        return reduce(lambda x, y: x + y, jobs)

    def proc(self, msg: dict, *args, **kwargs) -> str:
        '''
        receive a message and return a reply
        '''
        # prefix
        if self.glo_setting.get("preffix_on", False):
            preffix = self.glo_setting.get("preffix_string", "")
            if not msg["raw_message"].startswith(preffix):
                return None
            else:
                msg["raw_message"] = msg["raw_message"][len(preffix):]
        # black-list
        if msg["sender"]["user_id"] in self.glo_setting.get(
                "black-list", list()):
            return None
        # zht-zhs conversion
        if self.glo_setting.get("zht_in", False):
            msg["raw_message"] = self.cct2s.convert(msg["raw_message"])
        if msg["sender"].get("card", "") == "":
            msg["sender"]["card"] = msg["sender"]["nickname"]
        # run
        replys = []
        for pitem in self.plug_passive:
            func_num = pitem.match(msg["raw_message"])
            if func_num:
                res = pitem.execute(func_num, msg)
                replys.append(res["reply"])
                if res["block"]:
                    break
        reply_msg = "\n".join(replys)
        # zhs-zht conversion
        if self.glo_setting.get("zht_out", False):
            reply_msg = self.ccs2t.convert(reply_msg)
        return reply_msg

    def execute(self, cmd: str, *args, **kwargs):
        if cmd == "update":
            res = self.plug_passive[0].execute(0x30)
            return res["reply"]
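# Illustrative call (hypothetical payload in the CQHTTP/onebot shape that
# proc() reads: raw_message plus a sender dict):
# bot = Yobot()
# reply = bot.proc({
#     "raw_message": "help",
#     "sender": {"user_id": 12345, "nickname": "alice", "card": ""},
# })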
def clean_text(self, text, remove_url=True, email=True, weibo_at=True,
               stop_terms=("转发微博",), emoji=True, weibo_topic=False,
               deduplicate_space=True, norm_url=False, norm_html=False,
               to_url=False, remove_puncts=False, remove_tags=True, t2s=False):
    '''
    Perform assorted text-cleaning steps: Weibo-specific markup, URLs,
    email addresses, HTML code, and so on.

    :param text: input text
    :param remove_url: (on by default) remove URLs
    :param email: (on by default) remove email addresses
    :param weibo_at: (on by default) remove Weibo @-mention text
    :param stop_terms: remove specific phrases from the text; defaults to ("转发微博",)
    :param emoji: (on by default) remove text wrapped in [], usually emoticons
    :param weibo_topic: (off by default) remove text wrapped in ##, usually Weibo topics
    :param deduplicate_space: (on by default) collapse runs of whitespace into one space
    :param norm_url: (off by default) decode URL escapes back to plain characters (e.g. %20 to space)
    :param norm_html: (off by default) decode HTML entities back to plain characters (e.g. &nbsp; to space)
    :param to_url: (off by default) encode plain characters as URL escapes, for building requests (e.g. space to %20)
    :param remove_puncts: (off by default) remove all punctuation
    :param remove_tags: (on by default) remove all HTML tags
    :param t2s: (off by default) convert Traditional Chinese to Simplified
    :return: the cleaned text
    '''
    # mutually contradictory settings
    if norm_url and to_url:
        raise Exception("norm_url和to_url是矛盾的设置")
    if norm_html:
        text = html.unescape(text)
    if to_url:
        text = urllib.parse.quote(text)
    if remove_tags:
        text = w3lib.html.remove_tags(text)
    if remove_url:
        URL_REGEX = re.compile(
            r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))',
            re.IGNORECASE)
        text = re.sub(URL_REGEX, "", text)
    if norm_url:
        text = urllib.parse.unquote(text)
    if email:
        EMAIL_REGEX = re.compile(
            r"[-a-z0-9_.]+@(?:[-a-z0-9]+\.)+[a-z]{2,6}", re.IGNORECASE)
        text = re.sub(EMAIL_REGEX, "", text)
    if weibo_at:
        # remove @-mentions in the body and user names in replies/reposts
        text = re.sub(r"(回复)?(//)?\s*@\S*?\s*(:|:| |$)", " ", text)
    if emoji:
        text = re.sub(r"\[\S+\]", "", text)  # remove emoticons
    if weibo_topic:
        text = re.sub(r"#\S+#", "", text)  # remove topic text
    if deduplicate_space:
        text = re.sub(r"\s+", " ", text)  # collapse excess whitespace
    if t2s:
        cc = OpenCC('t2s')
        text = cc.convert(text)
    assert hasattr(stop_terms, "__iter__"), Exception("去除的词语必须是一个可迭代对象")
    if type(stop_terms) == str:
        text = text.replace(stop_terms, "")
    else:
        for x in stop_terms:
            text = text.replace(x, "")
    if remove_puncts:
        allpuncs = re.compile(
            r"[,\_《。》、?;:‘’\"“”【「】」·!@¥…()—\,\<\.\>\/\?\;\:\'\"\[\]\{\}\~\`\!\@\#\$\%\^\&\*\(\)\-\=\+]"
        )
        text = re.sub(allpuncs, "", text)
    return text.strip()
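# Illustrative call (hypothetical instance name `ht`; clean_text is a method
# of the class this snippet was taken from):
# ht.clean_text("回复@user:转发微博 http://example.com [doge]", t2s=True)
# -> the @-mention, the URL, the [doge] emoticon and the "转发微博" stop term
#    are all stripped, and Traditional characters are converted to Simplified.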
def __init__(self, *,
             data_path: str,
             scheduler: AsyncIOScheduler,
             quart_app: Quart,
             bot_api: Api,
             verinfo: str = None):
    # initialize config
    is_packaged = "_MEIPASS" in dir(sys)
    if is_packaged:
        basepath = os.path.dirname(sys.argv[0])
    else:
        basepath = os.path.dirname(__file__)
    dirname = os.path.abspath(os.path.join(basepath, data_path))
    if not os.path.exists(dirname):
        os.makedirs(dirname)
    config_f_path = os.path.join(dirname, "yobot_config.json")
    if is_packaged:
        default_config_f_path = os.path.join(sys._MEIPASS, "packedfiles",
                                             "default_config.json")
    else:
        default_config_f_path = os.path.join(os.path.dirname(__file__),
                                             "packedfiles",
                                             "default_config.json")
    with open(default_config_f_path, "r", encoding="utf-8") as config_file:
        self.glo_setting = json.load(config_file)
    if not os.path.exists(config_f_path):
        shutil.copyfile(default_config_f_path, config_f_path)
        print("设置已初始化,发送help获取帮助")
    boss_filepath = os.path.join(dirname, "boss3.json")
    if not os.path.exists(boss_filepath):
        if is_packaged:
            default_boss_filepath = os.path.join(sys._MEIPASS, "packedfiles",
                                                 "default_boss.json")
        else:
            default_boss_filepath = os.path.join(os.path.dirname(__file__),
                                                 "packedfiles",
                                                 "default_boss.json")
        shutil.copyfile(default_boss_filepath, boss_filepath)
    pool_filepath = os.path.join(dirname, "pool3.json")
    if not os.path.exists(pool_filepath):
        if is_packaged:
            default_pool_filepath = os.path.join(sys._MEIPASS, "packedfiles",
                                                 "default_pool.json")
        else:
            default_pool_filepath = os.path.join(os.path.dirname(__file__),
                                                 "packedfiles",
                                                 "default_pool.json")
        shutil.copyfile(default_pool_filepath, pool_filepath)
    with open(config_f_path, "r", encoding="utf-8-sig") as config_file:
        cfg = json.load(config_file)
        for k in self.glo_setting.keys():
            if k in cfg:
                self.glo_setting[k] = cfg[k]
    if verinfo is None:
        verinfo = updater.get_version(self.Version, self.Version_id)
        print(verinfo['ver_name'])
    # initialize database
    ybdata.init(os.path.join(dirname, 'yobotdata.db'))
    # initialize web path
    if not self.glo_setting.get("public_address"):
        try:
            res = requests.get("http://ip-api.com/json/?fields=8192")
            ipaddr = res.json()["query"]
        except:
            with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
                s.connect(("8.8.8.8", 53))
                ipaddr = s.getsockname()[0]
        self.glo_setting["public_address"] = "http://{}:{}/".format(
            ipaddr,
            self.glo_setting["port"],
        )
    if not self.glo_setting["public_address"].endswith("/"):
        self.glo_setting["public_address"] += "/"
    if not self.glo_setting["public_basepath"].startswith("/"):
        self.glo_setting["public_basepath"] = \
            "/" + self.glo_setting["public_basepath"]
    if not self.glo_setting["public_basepath"].endswith("/"):
        self.glo_setting["public_basepath"] += "/"
    # initialize update time
    if self.glo_setting["update-time"] == "random":
        self.glo_setting["update-time"] = "{:02d}:{:02d}".format(
            random.randint(2, 4), random.randint(0, 59))
    # initialize client salt
    if self.glo_setting["client_salt"] is None:
        self.glo_setting["client_salt"] = web_util.rand_string(16)
    # save initialization
    with open(config_f_path, "w", encoding="utf-8") as config_file:
        json.dump(self.glo_setting, config_file, indent=4)
    # initialize utils
    templating.Ver = self.Version[2:-1]
    # generate random secret_key
    if quart_app.secret_key is None:
        quart_app.secret_key = bytes(
            (random.randint(0, 255) for _ in range(16)))
    # add mimetype
    mimetypes.init()
    mimetypes.add_type('application/javascript', '.js')
    mimetypes.add_type('image/webp', '.webp')

    # add route for static files
    @quart_app.route(urljoin(self.glo_setting["public_basepath"],
                             "assets/<path:filename>"),
                     methods=["GET"])
    async def yobot_static(filename):
        return await send_file(
            os.path.join(os.path.dirname(__file__), "public", "static",
                         filename))

    # add route for output files
    if not os.path.exists(os.path.join(dirname, "output")):
        os.mkdir(os.path.join(dirname, "output"))

    @quart_app.route(urljoin(self.glo_setting["public_basepath"],
                             "output/<path:filename>"),
                     methods=["GET"])
    async def yobot_output(filename):
        return await send_file(os.path.join(dirname, "output", filename))

    # openCC
    self.ccs2t = OpenCC(self.glo_setting.get("zht_out_style", "s2t"))
    self.cct2s = OpenCC("t2s")
    # filter
    self.black_list = set(self.glo_setting["black-list"])
    self.black_list_group = set(self.glo_setting["black-list-group"])
    self.white_list_group = set(self.glo_setting["white-list-group"])
    # update runtime variables
    self.glo_setting.update({"dirname": dirname, "verinfo": verinfo})
    kwargs = {
        "glo_setting": self.glo_setting,
        "bot_api": bot_api,
        "scheduler": scheduler,
        "app": quart_app,
    }
    # load plugins
    plug_all = [
        updater.Updater(**kwargs),
        switcher.Switcher(**kwargs),
        yobot_msg.Message(**kwargs),
        gacha.Gacha(**kwargs),
        jjc_consult.Consult(**kwargs),
        boss_dmg.Boss_dmg(**kwargs),
        push_news.News(**kwargs),
        calender.Event(**kwargs),
        homepage.Index(**kwargs),
        marionette.Marionette(**kwargs),
        login.Login(**kwargs),
        settings.Setting(**kwargs),
        web_util.WebUtil(**kwargs),
        clan_battle.ClanBattle(**kwargs),
    ]
    self.plug_passive = [p for p in plug_all if p.Passive]
    self.plug_active = [p for p in plug_all if p.Active]
    for p in plug_all:
        if p.Request:
            p.register_routes(quart_app)
    # load new plugins
    self.plug_new = [
        miner.Miner(**kwargs),
        group_leave.GroupLeave(**kwargs),
        custom.Custom(**kwargs),
    ]
def loadtext(textdir, mixedversion=True, textrankrate=0.5):
    t0 = time.time()
    docs = []
    # Keep only files whose names start with an ASCII letter (filtering into
    # a new list avoids mutating the list while iterating over it).
    filelist = [f for f in os.listdir(textdir) if is_alphabet(f[0])]
    # print(filelist)
    tr4s = TextRank4Sentence()
    for file in sorted(filelist,
                       key=lambda file: float(file.split("_")[1][:-4]),
                       reverse=True):
        print(file)
        with open(savedir + file, "r", encoding="utf-8") as txtfile:
            eofp = open(tempdir + file, "w", encoding="utf-8")
            contents = txtfile.read()
            if mixedversion:
                tr4s.analyze(text=contents, lower=True, source='all_filters')
                docsum = tr4s.get_key_sentences(
                    num=int(len(tr4s.sentences) * textrankrate))
                # print(docsum)
                contents = [item.sentence for item in docsum]
            else:
                contents = contents.splitlines()
            # print(contents)
            paragraph, sentence = [], []
            for content in contents:
                line, i, l = "", 0, len(content)
                while i < l:
                    if content[i] == " " and i + 1 < l and not is_alphabet(content[i + 1]):
                        i += 1
                        continue
                    if content[i] == " ":
                        i += 1
                        continue
                    line += content[i]
                    if content[i] in ["。", "?", "!", "?", "!"]:
                        if i + 1 < l and content[i + 1] in ["」", "』", "”"]:
                            line += content[i + 1]
                            i += 1
                        if line[0] in ["(", "("] and sentence:
                            sentence[-1] += line
                            line = ""
                        elif i + 1 < l and content[i + 1] not in ["》", "〉", ")", ")"]:
                            sentence.append(line)
                            line = ""
                    i += 1
                if line:
                    sentence.append(line)
                if sentence:
                    for s in sentence:
                        eofp.write(s + "\n")
                    paragraph.append(sentence)
                    sentence = []
            eofp.close()
            docs.append(paragraph)
    print("txt loading completed", time.time() - t0)
    xlist = ["。", "?", "!", "~", "?", "!", " ", " ", "」", "』", "”",
             "(", "(", "》", "〉", ")", ")", ",", ":", "」", "、", "《",
             ";", "「", "%"]
    t0 = time.time()
    dic = {}
    with open(_PATH + "source/1998.csv", "r", encoding="utf-8") as dictxt:
        lines = dictxt.read().splitlines()
        for line in lines:
            line = line.split(",")
            dic[line[1]] = float(line[3])
    sentences = []
    indexsentences = []
    opcc = OpenCC('s2twp')
    for d, doc in enumerate(docs[:]):
        for p, para in enumerate(doc):
            for sid, sen in enumerate(para):
                # print(sen)
                sentence = []
                idf = count = w = score = 0
                hitset = set()
                words = jbps.cut(sen)
                for word, flag in words:
                    w += 1
                    word = opcc.convert(word)
                    try:
                        float(word)
                    except:
                        if not word in xlist:
                            # print(word, flag, end="|")
                            sentence.append(word)
                            if not word in dic or word in hitset:
                                continue
                            if flag in ["l", "n", "nr"] and dic[word] >= 30:
                                idf += 1 / dic[word]
                                count += 1
                # print("\n", sentence, "\n")
                if w >= 10 and count > 3:
                    score = idf / count
                if sentence and score > 0:
                    sentences.append(sentence)
                    indexsentences.append(
                        Sentence(sen, sentence, d, p, sid, score))
    print("segmentation completed", time.time() - t0)
    print("total sentences:", len(sentences))
    # print(sentences)
    # for s in indexsentences:
    #     print(s)
    return sentences, indexsentences
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import sys

from opencc import OpenCC

if __name__ == '__main__':
    if sys.version_info[0] < 3:
        print('Require Python3 to run')
        sys.exit(0)

    openCC = OpenCC()
    openCC.set_conversion('s2twp')
    # openCC = OpenCC('s2twp')
    words = '鼠标是一种很常見及常用的電腦输入设备,它可以对当前屏幕上的游标进行定位,并通过按键和滚轮装置对游标所经过位置的' \
            '屏幕元素进行操作。鼠标的鼻祖於1968年出现。美国科学家道格拉斯·恩格尔巴特(Douglas Englebart)在加利福尼亚制作了' \
            '第一只鼠标。'
    result = openCC.convert(words)
    print("{} \n\n==> \n\n{}".format(words, result))
for s in soup.find_all('div', {'class': 'sons'}):
    sub_soup = get_soup(s.find('a').attrs['href'])
    cont = sub_soup.find('div', {'class': 'main3'}).find('div', {'class': 'cont'})
    chapter = cont.find('h1').find('span').find('b').string
    paragraphs_list = []
    paragraphs = cont.find('div', {'class': 'contson'})
    if not paragraphs.find('p') is None:
        paragraphs = paragraphs.find('p')
    for p in paragraphs:
        p = p.string
        if not p is None and len(p.strip()) > 0:
            paragraphs_list.append(replace_symbol(p))
    print(len(paragraphs_list))
    data['content'].append({
        'chapter': chapter,
        'paragraphs': paragraphs_list
    })

cc = OpenCC('s2t')
with open('./jsons/zengguangxianwen.json', 'w', encoding='utf-8') as file_object:
    json.dump(json.loads(cc.convert(json.dumps(data, ensure_ascii=False))),
              file_object,
              sort_keys=False,
              indent=2,
              ensure_ascii=False)
def main(bgtext):
    parser = argparse.ArgumentParser()
    parser.add_argument('--device', default='0', type=str, required=False, help='生成設備')
    parser.add_argument('--length', default=700, type=int, required=False, help='生成长度')
    parser.add_argument('--batch_size', default=1, type=int, required=False, help='生成的batch size')
    parser.add_argument('--nsamples', default=1, type=int, required=False, help='生成几个样本')
    parser.add_argument('--temperature', default=1.0, type=float, required=False, help='生成温度')
    parser.add_argument('--topk', default=8, type=int, required=False, help='最高几选一')
    parser.add_argument('--topp', default=1, type=float, required=False, help='最高积累概率')
    parser.add_argument('--model_config', default='config/model_config_small.json', type=str, required=False, help='模型参数')
    parser.add_argument('--tokenizer_path', default='cache/vocab.txt', type=str, required=False, help='词表路径')
    parser.add_argument('--model_path', default='model/final_model', type=str, required=False, help='模型路径')
    parser.add_argument('--prefix', default='{}'.format(str(bgtext)), type=str, required=False, help='生成文章的开头')
    parser.add_argument('--no_wordpiece', action='store_true', help='不做word piece切词')
    parser.add_argument('--segment', action='store_true', help='中文以词为单位')
    parser.add_argument('--fast_pattern', action='store_true', help='采用更加快的方式生成文本')
    parser.add_argument('--save_samples', action='store_true', help='保存产生的样本')
    parser.add_argument('--save_samples_path', default='./sample/sample_save', type=str, required=False, help="保存样本的路径")
    parser.add_argument('--repetition_penalty', default=1.0, type=float, required=False)
    args = parser.parse_args()
    print('args:\n' + args.__repr__())

    if args.segment:
        from tokenizations import tokenization_bert_word_level as tokenization_bert
    else:
        from tokenizations import tokenization_bert

    os.environ["CUDA_VISIBLE_DEVICES"] = args.device  # select which GPUs the program uses
    length = args.length
    batch_size = args.batch_size
    nsamples = args.nsamples
    temperature = args.temperature
    topk = args.topk
    topp = args.topp
    repetition_penalty = args.repetition_penalty

    device = "cuda" if torch.cuda.is_available() else "cpu"

    tokenizer = tokenization_bert.BertTokenizer(vocab_file=args.tokenizer_path)
    model = GPT2LMHeadModel.from_pretrained(args.model_path)
    model.to(device)
    model.eval()

    n_ctx = model.config.n_ctx
    if length == -1:
        length = model.config.n_ctx
    args.save_samples = True
    if args.save_samples:
        if not os.path.exists(args.save_samples_path):
            os.makedirs(args.save_samples_path)
        samples_file = open(args.save_samples_path + '/samples.txt', 'w',
                            encoding='utf8')
    while True:
        raw_text = args.prefix
        context_tokens = tokenizer.convert_tokens_to_ids(
            tokenizer.tokenize(raw_text))
        generated = 0
        for _ in range(nsamples // batch_size):
            out = generate(n_ctx=n_ctx,
                           model=model,
                           context=context_tokens,
                           length=length,
                           is_fast_pattern=args.fast_pattern,
                           tokenizer=tokenizer,
                           temperature=temperature,
                           top_k=topk,
                           top_p=topp,
                           repitition_penalty=repetition_penalty,
                           device=device)
            # result_text = []
            for i in range(batch_size):
                generated += 1
                text = tokenizer.convert_ids_to_tokens(out)
                for i, item in enumerate(text[:-1]):  # ensure spaces around English words
                    if is_word(item) and is_word(text[i + 1]):
                        text[i] = item + ' '
                for i, item in enumerate(text):
                    if item == '[MASK]':
                        text[i] = ''
                    elif item == '[CLS]':
                        text[i] = '\n\n'
                    elif item == '[SEP]':
                        text[i] = '\n'
                info = "=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40 + "\n"
                # print(info)
                text = ''.join(text).replace('##', '').strip()
                cc = OpenCC('s2t')
                return cc.convert(text)
                # Everything below is unreachable after the early return above;
                # it is kept from the original sampling script.
                print(cc.convert(text))
                # print(text)
                # === get the result ===
                if args.save_samples:
                    samples_file.write(info)
                    samples_file.write(text)
                    samples_file.write('\n')
                    samples_file.write('=' * 90)
                    samples_file.write('\n' * 2)
                # if text[0] == bgtext[0][0]:
                #     result_text.append(text)
                # print("=" * 80)
        if generated == nsamples:
            # close file when finish writing.
            if args.save_samples:
                samples_file.close()
            # print(result_text)
            break
def setUp(self):
    self.openCC = OpenCC()
def text_cleaning(text_array, word_tokenization, bert_vocab_file,
                  do_lower_case, num_threads=1):
    """
    A function to clean text, including:
    - Normalize full-width characters
    - Convert all English letters to lower case
    - Translate Simplified Chinese characters to Traditional Chinese
    - Separate Chinese and English tokens from each other
    - Segment Chinese text
    - Remove punctuation
    - Perform the special Word Piece segmentation if specified by the
      argument "word_tokenization"

    INPUT:
    text_array: list/numpy array/pandas Series
    word_tokenization: method of word segmentation, either split by "space"
        or "word_piece" tokenization
    bert_vocab_file: string -- path to the .txt file of vocabularies for the
        modified Word Piece Tokenizer
    do_lower_case: boolean -- whether the modified Word Piece Tokenizer
        converts English letters to lower case or not
    num_threads: int -- number of CPU processors for performing the text cleaning

    OUTPUT:
    text_array: a list of string (cleaned text)
    """
    with mp.Pool(processes=num_threads) as pool:
        print(" Normalizing full-width characters...")
        text_array = pool.map(normalize_full_width, text_array)

        print(" Converting English letters to lower case...")
        text_array = pool.map(case_lower, text_array)

        print(" Translating Simplified Chinese to Traditional...")
        bool_contain_chi = pool.map(detect_chi, text_array)
        text_array = np.array(text_array)
        cc = OpenCC('s2t')
        text_array[bool_contain_chi] = list(
            map(lambda x: cc.convert(x), text_array[bool_contain_chi]))

        print(" Separating Chinese and English word tokens from each other...")
        text_array = pool.map(sep_chi_eng, text_array)

        print(" Segmenting Chinese vocabularies...")
        text_array = np.array(text_array)
        dtype_0 = str(text_array.dtype)
        dtype_len_0 = int(re.findall("[0-9]+", dtype_0)[0])
        if sum(bool_contain_chi):
            replacement = pool.map(segment_chi, text_array[bool_contain_chi])
            replacement = np.array(replacement)
            dtype_1 = str(replacement.dtype)
            dtype_len_1 = int(re.findall("[0-9]+", dtype_1)[0])
            # Cast text_array to the wider of the two string dtypes so that
            # characters in the segmented text are not trimmed.
            if dtype_len_0 > dtype_len_1:
                text_array = text_array.astype(dtype_0)
            else:
                text_array = text_array.astype(dtype_1)
            text_array[bool_contain_chi] = replacement

        print(" Removing punctuation...")
        text_array = pool.map(remove_punct, text_array)

        if word_tokenization == "word_piece":
            print(" Word Piece Segmentation...")
            tokenizer = ModifiedWordPieceTokenizer(
                bert_vocab_file=bert_vocab_file, do_lower_case=do_lower_case)
            text_array = pool.map(tokenizer.tokenize, text_array)
        elif word_tokenization == "space":
            return text_array

    return text_array
import os

from opencc import OpenCC
from tqdm import tqdm

cc = OpenCC('s2t')


def translate(src, dest):
    """
    goal: convert Simplified Chinese to Traditional
    input: source file path (src), target file path (dest)
    output: write the converted file to the target file path
    """
    source = open(src, 'r', encoding='utf-8')
    result = open(dest, 'w', encoding='utf-8')
    count = 0
    while True:
        line = source.readline()
        line = cc.convert(line)
        if not line:  # readline() keeps returning empty strings at EOF; break here
            break
        # print(line)  # debug
        count = count + 1
        result.write(line)
        # print('===已處理' + str(count) + '行===')  # debug
    source.close()
    result.close()
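# A minimal usage sketch with hypothetical file names; the main guard keeps
# the module importable without side effects:
if __name__ == '__main__':
    # input.txt is read line by line, converted with the module-level
    # OpenCC('s2t') instance, and written to output.txt.
    translate('input.txt', 'output.txt')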
def get_txt(self, txt_id, state, threadnum):
    titlem = ''
    intro = ''
    ids = str(txt_id)
    percent = 0
    self.state = state
    self.percent = 0
    self.index = []
    self.titleindex = []
    self.Summary = []
    self.fillNum = 0
    self.rollSign = []
    self.rollSignPlace = []
    self.href_list = []
    self.td = []
    self.failInfo = []
    # Article URL
    req_url = ids
    # Fetch the article page with cookies
    res = requests.get(req_url, headers=self.headerss).content
    # Re-encode the page
    ress = etree.HTML(
        res.decode("GB18030", "ignore").encode("utf-8", "ignore").decode('utf-8'))
    # Blurb
    intro = ress.xpath(
        "//html/body/table/tr/td[1]/div[2]/div[@id='novelintro']//text()")
    # Tags
    info = ress.xpath("string(/html/body/table[1]/tr/td[1]/div[3])")
    infox = []
    for i in range(1, 7):
        infox.append(
            ress.xpath(
                "string(/html/body/table[1]/tr/td[3]/div[2]/ul/li[" + str(i) + "])"))
    # Title and author
    xtitle = ress.xpath('string(//*[@itemprop="articleSection"])').strip()
    xaut = ress.xpath('string(//*[@itemprop="author"])').strip()
    ti = xtitle + '-' + xaut
    if self.state == 's':
        ti = OpenCC('t2s').convert(ti)
    elif self.state == 't':
        ti = OpenCC('s2t').convert(ti)
    print("网址:" + ids + "\r\n小说信息:" + str(ti) + "\r\n")
    # All chapter URLs, titles, and summaries
    self.td = ress.xpath('//*[@id="oneboolt"]//tr')
    loc = []
    for i in self.td:
        u = i.xpath('./td[2]/span/div[1]/a/@href')
        x = i.xpath('./td[2]/span/div[1]/a[1]/@rel')
        if len(u) > 0:
            self.href_list += u
            v = i.xpath('./td[2]/span/div[1]/a')
            v = etree.tostring(v[0], encoding="utf-8").decode().strip()
            v = re.sub(r'</?\w+[^>]*>', '', v)
            self.titleindex.append(v.strip())
            v = i.xpath('./td[3]')
            v = etree.tostring(v[0], encoding="utf-8").decode().strip()
            v = re.sub(r'</?\w+[^>]*>', '', v)
            v = re.sub(' ', '', v)
            self.Summary.append(v.strip())
        elif len(x) > 0:
            self.href_list += x
            v = i.xpath('./td[2]/span/div[1]/a')
            v = etree.tostring(v[0], encoding="utf-8").decode().strip()
            v = re.sub(r'</?\w+[^>]*>', '', v)
            self.titleindex.append(v.strip())
            v = i.xpath('./td[3]')
            v = etree.tostring(v[0], encoding="utf-8").decode().strip()
            v = re.sub(r'</?\w+[^>]*>', '', v)
            v = re.sub(' ', '', v)
            self.Summary.append(v.strip())
        elif i.xpath('./td[2]/span/div[1]/span') != []:
            loc.append(i.xpath('./td[1]/text()')[0].strip())
    # Volume label names
    self.rollSign = ress.xpath(
        "//*[@id='oneboolt']//tr/td/b[@class='volumnfont']")
    # Volume label positions
    self.rollSignPlace = ress.xpath(
        "//*[@id='oneboolt']//tr/td/b/ancestor-or-self::tr/following-sibling::tr[1]/td[2]/span/div[1]/a[1]/@href"
    )
    self.rollSignPlace += ress.xpath(
        "//*[@id='oneboolt']//tr/td/b/ancestor-or-self::tr/following-sibling::tr[1]/td[2]/span/div[1]/a[1]/@rel"
    )
    # Normalize the volume label format
    for rs in range(len(self.rollSign)):
        self.rollSign[rs] = etree.tostring(
            self.rollSign[rs], encoding="utf-8").decode().strip()
        self.rollSign[rs] = re.sub(r'</?\w+[^>]*>', '', self.rollSign[rs])
        self.rollSign[rs] = "§ " + self.rollSign[rs] + " §"
    section_ct = len(self.href_list)
    print("可下载章节数:" + str(section_ct) + "\r\n")
    if loc != []:
        i = ""
        for x in loc:
            i = i + x + " "
        print("被锁章节:" + i + "\r\n")
    # fillNum: width of the zero-padded chapter number; e.g. a novel with
    # 1437 chapters gets 4-digit numbers: 0001, 0002, ...
    self.fillNum = len(str(len(self.td) - 4))
    # Sanitize the title: remove characters that are illegal in file names
    ti = re.sub(r'[\/:*?"<>|]', '_', ti)
    ti = re.sub('&amp;', '&', ti)
    xauthref = ress.xpath("//*[@id='oneboolt']//h2/a/@href")[0]
    # Delete this line if you do not want the novel ID appended to the file name
    ti = ti + '.' + ids.split('=')[1]
    ti = re.sub('\r', '', ti)
    v = ""
    # Create working directories and write the metadata
    path = os.getcwd()
    self.path = path
    if not os.path.exists('Fonts'):
        os.mkdir('Fonts')
    if os.path.exists(ti + '_txt'):
        os.chdir(ti + '_txt')
    else:
        os.mkdir(ti + '_txt')
        os.chdir(ti + '_txt')
    ppp = os.getcwd()
    self.index = []
    # Write the info page
    TOC = xtitle + '\n'
    TOC += '作者:' + xaut + "\r\n"
    TOC += '源网址:' + req_url + '\r\n'
    # Build the TOC text
    for l in self.href_list:
        titleOrigin = l.split('=')
        i = self.href_list.index(l)
        # NOTE: `title` is only assigned when the three lines below are
        # uncommented; as shipped, the conversions reuse the previous value.
        # title = str(titleOrigin[2]).zfill(self.fillNum) + " "
        # title = title + self.titleindex[i].strip() + " "
        # title = title + self.Summary[i].strip()
        if self.state == 's':
            title = OpenCC('t2s').convert(title)
        elif self.state == 't':
            title = OpenCC('s2t').convert(title)
        if self.href_list[i] in self.rollSignPlace:
            v = self.rollSign[self.rollSignPlace.index(l)]
            if self.state == 's':
                v = OpenCC('t2s').convert(
                    self.rollSign[self.rollSignPlace.index(l)])
            elif self.state == 't':
                v = OpenCC('s2t').convert(
                    self.rollSign[self.rollSignPlace.index(l)])
            self.index.append(v)
        self.index.append(title)
    for ix in infox:
        ix = ix.strip()
        ix = re.sub('\r\n', '', ix)
        ix = re.sub(' +', '', ix)
        TOC += ix + "\r\n"
    TOC += "文案:\r\n"
    for nx in intro:
        v = re.sub(' +', ' ', str(nx)).strip()
        if self.state == 's':
            v = OpenCC('t2s').convert(v)
        elif self.state == 't':
            v = OpenCC('s2t').convert(v)
        if v != "":
            TOC += v + "\n"
    info = re.sub(' +', ' ', info).strip()
    if self.state == 's':
        info = OpenCC('t2s').convert(info)
    elif self.state == 't':
        info = OpenCC('s2t').convert(info)
    info = re.sub('搜索关键字', '\r\n搜索关键字', info)
    info = re.sub(' 一句话简介:', '一句话简介:', info)
    info = re.sub('\r\n \r\n 立意:', '\r\n立意:', info)
    TOC += info + "\n"
    fo = open("TOC.txt", 'w', encoding='utf-8')
    fo.write(TOC)
    fo.close()
    tlist = []
    # Download every chapter concurrently
    with concurrent.futures.ThreadPoolExecutor(
            max_workers=threadnum) as executor:
        tlist = {
            executor.submit(self.get_sin, i): i for i in self.href_list
        }
        for future in concurrent.futures.as_completed(tlist):
            if self.percent < section_ct:
                print('\r 下载进度:%d/%d' % (self.percent, section_ct),
                      end='', flush=True)
    print('\r 下载完成,总进度:%d/%d\r\n' % (self.percent, section_ct),
          end='', flush=True)
    '''
    for i in self.href_list:
        self.get_sin(i)
    '''
    if self.failInfo != []:
        self.failInfo.sort()
        vs = ""
        for ss in self.failInfo:
            vs = vs + ss + "|"
        print("\r\n未购买或加载失败章节:")
        print(vs[:-1] + "\r\n")
    # Merge the chapter files into one txt
    os.chdir(path)
    f = open(ti + ".txt", 'w', encoding='utf-8')
    filenames = os.listdir(ppp)
    i = 0
    for filename in filenames:
        filepath = ppp + '\\' + filename
        for line in open(filepath, encoding='utf-8', errors='ignore'):
            f.writelines(line)
    f.close()
    shutil.rmtree(ppp)
    print("\r\ntxt文件整合完成")
from opencc import OpenCC
import codecs
import os
import re
import string
import json
from scipy import spatial
import jieba
from typing import List
import math
from gensim.models.word2vec import PathLineSentences
from gensim.models.word2vec import Word2Vec
from gensim.models.ldamodel import LdaModel
from gensim.corpora.dictionary import Dictionary

OP = OpenCC('t2s')

# Patterns that mark sentence boundaries after Chinese sentence-final punctuation
sent_cut_pattern = [
    re.compile(r'([。?!?])([^"\'”])'),
    re.compile(r'(\.{6})([^"\'”])'),
    re.compile(r'([。?!?]["\'”])([^\'"”])'),
]
zh_pattern = re.compile(r'^[\u4e00-\u9fa5]+$')
puncs = string.punctuation + '.,;《》?!“”‘’@#¥%…&×()——+【】{};;●,。&~、|\s::'
punc_pattern = re.compile(r'[{}]+'.format(puncs))

stopwords = []
with open('./data/chinese_stopwords.txt', 'r', encoding='utf8') as f:
    for line in f:
        line = line.strip()
        if len(line) > 0:
            stopwords.append(line.strip())
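# Illustrative helper (not part of the original module) showing how the
# sent_cut_pattern regexes are typically applied: each pattern inserts a
# newline between a sentence-final punctuation group and the character that
# follows it, and the text is then split on those newlines.
def cut_sentences(text: str) -> List[str]:
    for pattern in sent_cut_pattern:
        text = pattern.sub(r'\1\n\2', text)
    return [s.strip() for s in text.split('\n') if s.strip()]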
def get_sin(self, l):
    titleOrigin = l.split('=')
    i = self.href_list.index(l)
    badgateway = True
    while badgateway:
        cont = requests.get(l, headers=self.headerss)
        dot = etree.HTML(
            cont.content.decode('gb18030', "ignore").encode("utf-8").decode('utf-8'))
        codetext = etree.tostring(dot, encoding="utf-8").decode()
        bdw = re.findall('<h1>502 Bad Gateway</h1>', codetext)
        if bdw == []:
            badgateway = False
    fontfamily = ''
    cvlist = []
    cvdic = []
    # Font-based anti-scraping
    fontsrc = re.findall(
        r'//static.jjwxc.net/tmp/fonts/.*?woff2.h=my.jjwxc.net', codetext)
    if fontsrc != []:
        fontsrc = "http:" + fontsrc[0]
        fontname = re.sub('http://static.jjwxc.net/tmp/fonts/', '', fontsrc)
        fontname = re.sub('.h=my.jjwxc.net', '', fontname)
        fontfamily = re.sub('.woff2', '', fontname)
        try:
            with open(self.path + "/Fonts/" + fontfamily + ".txt", "r",
                      encoding='utf-8') as f:
                cvlist = f.readlines()
                for y in range(len(cvlist)):
                    cvdic.append(cvlist[y].split('-'))
                cvdic = dict(cvdic)
        except:
            t = 1
        if not os.path.exists(self.path + "/Fonts/" + fontname):
            fontwb = requests.get(fontsrc).content
            fontf = open(self.path + "/Fonts/" + fontname, 'wb')
            fontf.write(fontwb)
            fontf.close()
        if cvlist != []:
            fontfamily += '_c'
        elif fontfamily not in self.fontlist:
            self.fontlist.append(fontfamily)
    # tex: chapter body
    tex = dot.xpath('//*[@id="oneboolt"]/tr[2]/td[1]/div/text()')
    # tex1: author's notes
    tex1 = dot.xpath("//div[@class='readsmall']/text()")
    # sign: position of the author's notes
    sign = dot.xpath("//*[@id='oneboolt']/tr[2]/td[1]/div/div[4]/@class")
    title = ''
    # Zero-padded chapter number
    if self.titleInfo[0] == '1':
        title = str(titleOrigin[2]).zfill(self.fillNum) + "#"
    # Chapter name
    if self.titleInfo[1] == '1':
        title = title + " " + self.titleindex[i].strip()
    # Chapter summary
    if self.titleInfo[2] == '1':
        title = title + " " + self.Summary[i].strip()
    if self.state == 's':
        title = OpenCC('t2s').convert(title)
    elif self.state == 't':
        title = OpenCC('s2t').convert(title)
    if self.href_list[i] in self.rollSignPlace:
        v = self.rollSign[self.rollSignPlace.index(l)]
        if self.state == 's':
            v = OpenCC('t2s').convert(
                self.rollSign[self.rollSignPlace.index(l)])
        elif self.state == 't':
            v = OpenCC('s2t').convert(
                self.rollSign[self.rollSignPlace.index(l)])
    # Create the chapter file
    fo = open("z" + str(titleOrigin[2].zfill(4)) + ".txt", 'w',
              encoding='utf-8')
    # Write the volume label
    if self.href_list[i] in self.rollSignPlace:
        fo.write("\r\n\r\n" + v.rstrip() + '\r\n')
        print("\r\n" + v + "\r\n")
        fo.write(title + '\r\n')
    # Write the chapter title
    else:
        fo.write("\r\n\r\n" + title + "\r\n")
    if len(tex) == 0:
        self.failInfo.append(titleOrigin[2].zfill(self.fillNum))
        fo.write('下载失败!')
    else:
        # Anti-scraping fix-up: the mapping-table TXT files must be
        # downloaded into the Fonts folder first.
        if cvdic != []:
            for y in range(len(tex)):
                for s, v in cvdic.items():
                    if not s == 'x"/;':
                        s = re.sub(r'&#x', r'\\u', s)
                        s = re.sub(
                            ';', '', s).encode('utf-8').decode('unicode_escape')
                        tex[y] = re.sub(s, v.strip(), tex[y])
            cvdic = cvlist = 0
        # Author's notes placed before the text
        if str(sign) == "['readsmall']":
            for m in tex1:
                # Strip watermark text and redundant whitespace
                vv = re.sub('@无限好文,尽在晋江文学城', '', str(m))
                v = re.sub(' +', ' ', vv).strip()
                v = re.sub(' ', '', v)
                if self.state == 's':
                    v = OpenCC('t2s').convert(v)
                elif self.state == 't':
                    v = OpenCC('s2t').convert(v)
                v = re.sub('作者有话要说:', '作者有话要说:\n', v)
                if v != "":
                    # Write the text line by line
                    fo.write(v + "\n")
            if len(tex1) != 0:
                fo.write("\n*\r\n")
            for tn in tex:
                vv = re.sub('@无限好文,尽在晋江文学城', '', str(tn))
                v = re.sub(' +', ' ', vv).strip()
                v = re.sub(' ', '', v)
                if self.state == 's':
                    v = OpenCC('t2s').convert(v)
                elif self.state == 't':
                    v = OpenCC('s2t').convert(v)
                if v != "":
                    fo.write(v + "\n")
        else:
            # Author's notes placed after the text
            for tn in tex:
                vv = re.sub('@无限好文,尽在晋江文学城', '', str(tn))
                v = re.sub(' +', ' ', vv).strip()
                v = re.sub(' ', '', v)
                if self.state == 's':
                    v = OpenCC('t2s').convert(v)
                elif self.state == 't':
                    v = OpenCC('s2t').convert(v)
                if v != "":
                    fo.write(v + "\n")
            if len(tex1) != 0:
                fo.write("\n*\r\n")
            for m in tex1:
                vv = re.sub('@无限好文,尽在晋江文学城', '', str(m))
                v = re.sub(' +', ' ', vv).strip()
                v = re.sub(' ', '', v)
                if self.state == 's':
                    v = OpenCC('t2s').convert(v)
                elif self.state == 't':
                    v = OpenCC('s2t').convert(v)
                v = re.sub('作者有话要说:', '作者有话要说:\n', v)
                if v != "":
                    fo.write(v + "\n")
    fo.close()
    self.percent += 1
names = os.listdir(path)
plainText = [
    x for x in names
    if x[-3:].lower() in ("lrc", "ass", "txt", "ssa", "srt")
]
for a in plainText:
    pt = f"{path}/{a}"
    with open(pt, encoding="utf-8") as f:
        data = cc.convert(f.read())
    with open(pt, mode="w+", encoding="utf-8") as f:
        f.write(data)
print("done.")


if __name__ == "__main__":
    cc = OpenCC("t2s")
    type = (
        "t2s",
        "t2hk",
        "t2tw",
        "tw2s",
        "tw2sp",
        "hk2s",
        "s2hk",
        "s2t",
        "s2tw",
        "s2twp",
    )
    items = "\n".join([f"{x[0]}. {x[1]}" for x in enumerate(type)])
    n = input(
        f'WARNING: JAPANESE CHARACTER maybe affected!\n{items}\nInput number to select a convert type.\nInput nothing and enter to use "t2s" by default:'
'''
@Author: King
@Date: 2019.03.13
@Purpose: process the wikizh corpus
@Link: https://dumps.wikimedia.org/zhwiki/20180801/
@Reference: https://kexue.fm/archives/4176
@opencc install command: pip install opencc-python-reimplemented
@opencc reference: https://github.com/yichen0831/opencc-python
'''
from gensim.corpora.wikicorpus import extract_pages, filter_wiki
import bz2file
import re
from os import path
# import opencc
from opencc import OpenCC

# convert from Traditional Chinese to Simplified Chinese
# (note: this rebinds the imported class name to a converter instance)
OpenCC = OpenCC('t2s')
from tqdm import tqdm
import codecs

data_dir = 'resource/'
wiki = extract_pages(
    bz2file.open(
        path.join(data_dir,
                  'zhwiki-20180801-pages-articles-multistream.xml.bz2')))


def wiki_replace(d):
    s = d[1]
    s = re.sub(r':*{\|[\s\S]*?\|}', '', s)
    s = re.sub(r'<gallery>[\s\S]*?</gallery>', '', s)
    s = re.sub(r'(.){{([^{}\n]*?\|[^{}\n]*?)}}', '\\1[[\\2]]', s)
import os
import sys
import pickle
import copy

import torch
from prepro_lib import jieba_cut
from opencc import OpenCC

cc = OpenCC('tw2s')
device = torch.device('cuda:0')


class System:
    def __init__(self, max_len=30, model_path=None, emb_matrix=None,
                 w2id_dict=None):
        if not os.path.exists(model_path):
            print("model path does not exist..")
            sys.exit(1)
        self.max_len = max_len
        self.emb_matrix = emb_matrix
        self.model_path = model_path
from gensim.corpora.wikicorpus import extract_pages, filter_wiki
import bz2file
import re
from tqdm import tqdm
import codecs

wiki = extract_pages(
    bz2file.open('zhwiki-20180301-pages-articles-multistream.xml.bz2'))

from opencc import OpenCC

# convert from Traditional Chinese (Hong Kong) to Simplified Chinese
openCC = OpenCC('hk2s')
# can also set the conversion by calling set_conversion
# openCC.set_conversion('s2tw')
to_convert = '开放中文转换'
converted = openCC.convert(to_convert)


def wiki_replace(d):
    s = d[1]
    s = re.sub(r':*{\|[\s\S]*?\|}', '', s)
    s = re.sub(r'<gallery>[\s\S]*?</gallery>', '', s)
    s = re.sub(r'(.){{([^{}\n]*?\|[^{}\n]*?)}}', '\\1[[\\2]]', s)
    s = filter_wiki(s)
    s = re.sub(r'\* *\n|\'{2,}', '', s)
    s = re.sub(r'\n+', '\n', s)
    s = re.sub(r'\n[:;]|\n +', '\n', s)
    s = re.sub(r'\n==', '\n\n==', s)
    s = u'【' + d[0] + u'】\n' + s
    return openCC.convert(s).strip()
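# A minimal driver sketch; the output path and the namespace filter below are
# assumptions of mine, not the original author's code:
if __name__ == '__main__':
    with codecs.open('wiki.zh.txt', 'w', encoding='utf-8') as out:
        for d in tqdm(wiki):
            # skip non-article namespaces such as "Wikipedia:" or "Template:"
            if not re.findall('^[a-zA-Z]+:', d[0]):
                out.write(wiki_replace(d) + '\n\n')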
from mxnet.gluon.data import SimpleDataset, DataLoader
from mxnet import nd
import numpy as np
from copy import deepcopy
from random import choice
from utils import load_pickle
import jieba
import re
from pinyin import PinYinSampler
from opencc import OpenCC
from structure import Structure

s2t = OpenCC('s2t')
# t2s = OpenCC('t2s')


class SighanCSC14Data(object):
    def __init__(self, tokenizer, transformer, config, mode,
                 vocab_tgt=None, useDecoder=False, args=None):
        self.tokenizer = tokenizer
        self.transformer = transformer
        self.mode = mode
        self.batch_size = config[mode]['batch_size']
        self.max_seq_len = config['int_max_length']
        self.vocab_tgt = vocab_tgt
        self.config = config
from PyQt5 import QtWidgets
from PyQt5 import QtCore
from PyQt5 import QtGui
from shensha import shensha
from opencc import OpenCC

cc = OpenCC('s2t')


class ShenShaDialog(QtWidgets.QDialog):
    def __init__(self, parent=None, shiPan=None):
        super().__init__(parent)
        self.setWindowTitle("神煞")
        self.setWindowFlag(QtCore.Qt.WindowMinMaxButtonsHint)
        self.resize(700, 500)
        guaTiLayout = QtWidgets.QVBoxLayout()
        # self.layout = helpLayout
        self.setLayout(guaTiLayout)
        self.guaTiTextBrowser = QtWidgets.QTextBrowser()
        guaTiLayout.addWidget(self.guaTiTextBrowser)
        guaTiFont = QtGui.QFont()
        guaTiFont.setPixelSize(18)
        self.guaTiTextBrowser.setFont(guaTiFont)
        if shiPan is None:
            self.guaTiTextBrowser.setHtml("查詢神煞,請先排盤")
            return
        shenShaJson = {"年": {}, "月": {}, "日": {}}
        shenShaModules = shensha
        shenShaFuns = []
        for attr in (a for a in dir(shenShaModules) if a.startswith('do_')):
class Yobot:
    Version = "[v3.6.11]"  # semver
    Version_id = 253  # "git rev-list --count HEAD"

    def __init__(self, *,
                 data_path: str,
                 scheduler: AsyncIOScheduler,
                 quart_app: Quart,
                 bot_api: Api,
                 verinfo: str = None):
        # initialize config
        is_packaged = "_MEIPASS" in dir(sys)
        if is_packaged:
            basepath = os.path.dirname(sys.argv[0])
        else:
            basepath = os.path.dirname(__file__)

        dirname = os.path.abspath(os.path.join(basepath, data_path))
        if not os.path.exists(dirname):
            os.makedirs(dirname)
        config_f_path = os.path.join(dirname, "yobot_config.json")
        if is_packaged:
            default_config_f_path = os.path.join(
                sys._MEIPASS, "packedfiles", "default_config.json")
        else:
            default_config_f_path = os.path.join(
                os.path.dirname(__file__), "packedfiles", "default_config.json")
        with open(default_config_f_path, "r", encoding="utf-8") as config_file:
            self.glo_setting = json.load(config_file)
        if not os.path.exists(config_f_path):
            with open(config_f_path, "w") as f:
                f.write("{}")
            print("设置已初始化,发送help获取帮助")
        boss_filepath = os.path.join(dirname, "boss3.json")
        if not os.path.exists(boss_filepath):
            if is_packaged:
                default_boss_filepath = os.path.join(
                    sys._MEIPASS, "packedfiles", "default_boss.json")
            else:
                default_boss_filepath = os.path.join(
                    os.path.dirname(__file__), "packedfiles", "default_boss.json")
            shutil.copyfile(default_boss_filepath, boss_filepath)
        pool_filepath = os.path.join(dirname, "pool3.json")
        if not os.path.exists(pool_filepath):
            if is_packaged:
                default_pool_filepath = os.path.join(
                    sys._MEIPASS, "packedfiles", "default_pool.json")
            else:
                default_pool_filepath = os.path.join(
                    os.path.dirname(__file__), "packedfiles", "default_pool.json")
            shutil.copyfile(default_pool_filepath, pool_filepath)
        for e in os.environ:
            if e.startswith("YOBOT_"):
                k = e[6:].lower()
                self.glo_setting[k] = os.environ[e]
        with open(config_f_path, "r", encoding="utf-8-sig") as config_file:
            cfg = json.load(config_file)
            for k in self.glo_setting.keys():
                if k in cfg:
                    self.glo_setting[k] = cfg[k]

        if verinfo is None:
            verinfo = updater.get_version(self.Version, self.Version_id)
            print(verinfo['ver_name'])

        # initialize database
        ybdata.init(os.path.join(dirname, 'yobotdata.db'))

        # enable gzip
        if self.glo_setting["web_gzip"] > 0:
            gzipped_types = {'text/html', 'text/javascript',
                             'text/css', 'application/json'}

            @quart_app.after_request
            async def gzip_response(response):
                accept_encoding = request.headers.get('Accept-Encoding', '')
                if (response.status_code < 200
                        or response.status_code >= 300
                        or len(await response.get_data()) < 1024
                        or 'gzip' not in accept_encoding.lower()
                        or 'Content-Encoding' in response.headers):
                    return response
                gzip_buffer = BytesIO()
                gzip_file = gzip.GzipFile(
                    mode='wb',
                    compresslevel=self.glo_setting["web_gzip"],
                    fileobj=gzip_buffer)
                gzip_file.write(await response.get_data())
                gzip_file.close()
                gzipped_response = gzip_buffer.getvalue()
                response.set_data(gzipped_response)
                response.headers['Content-Encoding'] = 'gzip'
                response.headers['Content-Length'] = len(gzipped_response)
                return response

        # initialize web path
        if not self.glo_setting.get("public_address"):
            try:
                res = requests.get("http://api.ipify.org/")
                ipaddr = res.text
            except:
                with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
                    s.connect(("8.8.8.8", 53))
                    ipaddr = s.getsockname()[0]
            self.glo_setting["public_address"] = "http://{}:{}/".format(
                ipaddr,
                self.glo_setting["port"],
            )
        if not self.glo_setting["public_address"].endswith("/"):
            self.glo_setting["public_address"] += "/"
        if not self.glo_setting["public_basepath"].startswith("/"):
            self.glo_setting["public_basepath"] = \
                "/" + self.glo_setting["public_basepath"]
        if not self.glo_setting["public_basepath"].endswith("/"):
            self.glo_setting["public_basepath"] += "/"

        # initialize update time
        if self.glo_setting["update-time"] == "random":
            self.glo_setting["update-time"] = "{:02d}:{:02d}".format(
                random.randint(2, 4),
                random.randint(0, 59)
            )

        # initialize client salt
        if self.glo_setting["client_salt"] is None:
            self.glo_setting["client_salt"] = web_util.rand_string(16)

        # save initialization
        with open(config_f_path, "w", encoding="utf-8") as config_file:
            json.dump(self.glo_setting, config_file, indent=4)

        # initialize utils
        templating.Ver = self.Version[2:-1]

        # generate random secret_key
        if quart_app.secret_key is None:
            quart_app.secret_key = bytes(
                (random.randint(0, 255) for _ in range(16)))

        # add mimetype
        mimetypes.init()
        mimetypes.add_type('application/javascript', '.js')
        mimetypes.add_type('image/webp', '.webp')

        # add route for static files
        @quart_app.route(
            urljoin(self.glo_setting["public_basepath"],
                    "assets/<path:filename>"),
            methods=["GET"])
        async def yobot_static(filename):
            accept_encoding = request.headers.get('Accept-Encoding', '')
            origin_file = os.path.join(
                os.path.dirname(__file__), "public", "static", filename)
            if ('gzip' not in accept_encoding.lower()
                    or self.glo_setting['web_gzip'] == 0):
                return await send_file(origin_file)
            gzipped_file = os.path.abspath(os.path.join(
                os.path.dirname(__file__),
                "public",
                "static",
                filename + "." + self.Version[1:-1] + ".gz",
            ))
            if not os.path.exists(gzipped_file):
                if not os.path.exists(origin_file):
                    return "404 not found", 404
                with open(origin_file, 'rb') as of, \
                        open(gzipped_file, 'wb') as gf:
                    with gzip.GzipFile(
                        mode='wb',
                        compresslevel=self.glo_setting["web_gzip"],
                        fileobj=gf,
                    ) as gzip_file:
                        gzip_file.write(of.read())
            response = await make_response(await send_file(gzipped_file))
            response.mimetype = (
                mimetypes.guess_type(os.path.basename(origin_file))[0]
                or "application/octet-stream"
            )
            response.headers['Content-Encoding'] = 'gzip'
            response.headers['Vary'] = 'Accept-Encoding'
            return response

        # add route for output files
        if not os.path.exists(os.path.join(dirname, "output")):
            os.mkdir(os.path.join(dirname, "output"))

        @quart_app.route(
            urljoin(self.glo_setting["public_basepath"],
                    "output/<path:filename>"),
            methods=["GET"])
        async def yobot_output(filename):
            return await send_file(os.path.join(dirname, "output", filename))

        # openCC
        self.ccs2t = OpenCC(self.glo_setting.get("zht_out_style", "s2t"))
        self.cct2s = OpenCC("t2s")

        # filter
        self.black_list = set(self.glo_setting["black-list"])
        self.black_list_group = set(self.glo_setting["black-list-group"])
        self.white_list_group = set(self.glo_setting["white-list-group"])

        # update runtime variables
        self.glo_setting.update({
            "dirname": dirname,
            "verinfo": verinfo
        })
        kwargs = {
            "glo_setting": self.glo_setting,
            "bot_api": bot_api,
            "scheduler": scheduler,
            "app": quart_app,
        }

        # load plugins
        plug_all = [
            updater.Updater(**kwargs),
            switcher.Switcher(**kwargs),
            yobot_msg.Message(**kwargs),
            gacha.Gacha(**kwargs),
            jjc_consult.Consult(**kwargs),
            push_news.News(**kwargs),
            calender.Event(**kwargs),
            homepage.Index(**kwargs),
            marionette.Marionette(**kwargs),
            login.Login(**kwargs),
            settings.Setting(**kwargs),
            web_util.WebUtil(**kwargs),
            clan_battle.ClanBattle(**kwargs),
        ]
        self.plug_passive = [p for p in plug_all if p.Passive]
        self.plug_active = [p for p in plug_all if p.Active]
        for p in plug_all:
            if p.Request:
                p.register_routes(quart_app)

        # load new plugins
        self.plug_new = [
            miner.Miner(**kwargs),
            group_leave.GroupLeave(**kwargs),
            custom.Custom(**kwargs),
        ]

    def active_jobs(self) -> List[Tuple[Any, Callable[[], Iterable[Dict[str, Any]]]]]:
        jobs = [p.jobs() for p in self.plug_active]
        return reduce(lambda x, y: x + y, jobs)

    async def proc_async(self, msg: dict, *args, **kwargs) -> str:
        '''
        receive a message and return a reply
        '''
        # prefix
        if self.glo_setting.get("preffix_on", False):
            preffix = self.glo_setting.get("preffix_string", "")
            if not msg["raw_message"].startswith(preffix):
                return None
            else:
                msg["raw_message"] = msg["raw_message"][len(preffix):]

        # black-list
        if msg["sender"]["user_id"] in self.black_list:
            return None
        if msg["message_type"] == "group":
            if self.glo_setting["white_list_mode"]:
                if msg["group_id"] not in self.white_list_group:
                    return None
            else:
                if msg["group_id"] in self.black_list_group:
                    return None

        # zht-zhs conversion
        if self.glo_setting.get("zht_in", False):
            msg["raw_message"] = self.cct2s.convert(msg["raw_message"])
        if msg["sender"].get("card", "") == "":
            msg["sender"]["card"] = msg["sender"].get("nickname", "无法获取昵称")

        # run new plugins
        reply_msg = None
        for plug in self.plug_new:
            ret = await plug.execute_async(msg)
            if ret is None:
                continue
            elif isinstance(ret, bool):
                if ret:
                    break
                else:
                    continue
            elif isinstance(ret, str):
                reply_msg = ret
                break
            else:
                raise ValueError(
                    'unsupported return type: {}'.format(type(ret)))
        if reply_msg:
            if self.glo_setting.get("zht_out", False):
                reply_msg = self.ccs2t.convert(reply_msg)
            return reply_msg

        # run passive plugins
        replys = []
        for pitem in self.plug_passive:
            if hasattr(pitem, 'match'):
                func_num = pitem.match(msg["raw_message"])
            else:
                func_num = True
            if func_num:
                if hasattr(pitem, "execute_async"):
                    res = await pitem.execute_async(func_num, msg)
                else:
                    res = pitem.execute(func_num, msg)
                if res is None:
                    continue
                if isinstance(res, str):
                    replys.append(res)
                    break
                if res["reply"]:
                    replys.append(res["reply"])
                if res["block"]:
                    break
        reply_msg = "\n".join(replys)

        # zhs-zht conversion
        if self.glo_setting.get("zht_out", False):
            reply_msg = self.ccs2t.convert(reply_msg)
        return reply_msg

    def execute(self, cmd: str, *args, **kwargs):
        if cmd == "update":
            res = self.plug_passive[0].execute(0x30)
            return res["reply"]
def test_unicode_zht2zhs(self):
    c = OpenCC('zht2zhs.ini')
    self.assertEqual(c.convert(u'開放中文轉換'), u'开放中文转换')
    c.close()
from pyrogram import Client, filters
from pyrogram.types import InlineKeyboardMarkup, InlineKeyboardButton
from opencc import OpenCC

from config import Config
import wget

Jebot = Client(
    "YT Downloader",
    api_id=Config.APP_ID,
    api_hash=Config.API_HASH,
    bot_token=Config.TG_BOT_TOKEN,
)

YTDL_REGEX = (r"^((?:https?:)?\/\/)"
              r"?((?:www|m)\.)"
              r"?((?:youtube\.com|youtu\.be|xvideos\.com|pornhub\.com"
              r"|xhamster\.com|xnxx\.com))"
              r"(\/)([-a-zA-Z0-9()@:%_\+.~#?&//=]*)([\w\-]+)(\S+)?$")

s2tw = OpenCC('s2tw.json').convert


@Jebot.on_message(filters.command("start"))
async def start(client, message):
    if message.chat.type == 'private':
        await Jebot.send_message(
            chat_id=message.chat.id,
            text="""<b>Hey There, I'm AnyDL Bot
I can download video or audio from Youtube.
Made by @ImJanindu 🇱🇰
Hit help button to find out more about how to use me</b>""",
            reply_markup=InlineKeyboardMarkup(
                [[
                    InlineKeyboardButton(
def test_convert_text(self):
    c = OpenCC('zhs2zht.ini')
    try:
        c.convert(3)
    except TypeError, e:
        self.assertEqual(e.message, 'TypeError: must be string or buffer.')
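The test above uses Python 2 syntax (`except TypeError, e`), matching the other .ini-based tests in this set. Under Python 3 the same check would typically be written with assertRaises; a sketch, assuming the binding raises TypeError the same way:

def test_convert_text(self):
    c = OpenCC('zhs2zht.ini')
    # convert() should reject non-string input
    with self.assertRaises(TypeError):
        c.convert(3)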
self.whole_sum = ""
self.transcript = []
self.footnote = []

abs_len = []
document_len = []
document_len_big_sent = []
document_len_small_sent = []
whole_p = 0
all_data = []
coverage = 0
big_punc = ["。", "?", "!", "?", "!"]
small_punc = [",", ";", ",", ";"]
cc = OpenCC('tw2sp')
list_dir_path = "www.ted.com"
list_dir = os.listdir(list_dir_path)
print(len(list_dir))
ted_map = {}
if not os.path.exists("ted_zh"):
    os.mkdir("ted_zh")
if not os.path.exists("ted_en"):
    os.mkdir("ted_en")
with tqdm(total=len(list_dir)) as pbar:
    for html_file in list_dir:
        pbar.update(1)
        htmlfile_reader = open(os.path.join(list_dir_path, html_file), 'r',
                               encoding='utf-8')
class OpenCCTest(unittest.TestCase):
    def setUp(self):
        self.openCC = OpenCC()

    def test_hk2s(self):
        self.openCC.set_conversion('hk2s')
        words = '香煙(英語:Cigarette),為煙草製品的一種。滑鼠是一種很常見及常用的電腦輸入設備。'
        self.assertEqual(self.openCC.convert(words),
                         '香烟(英语:Cigarette),为烟草制品的一种。滑鼠是一种很常见及常用的电脑输入设备。')

    def test_s2hk(self):
        self.openCC.set_conversion('s2hk')
        words = '香烟(英语:Cigarette),为烟草制品的一种。鼠标是一种很常见及常用的电脑输入设备。'
        self.assertEqual(self.openCC.convert(words),
                         '香煙(英語:Cigarette),為煙草製品的一種。鼠標是一種很常見及常用的電腦輸入設備。')

    def test_s2t(self):
        self.openCC.set_conversion('s2t')
        words = '香烟(英语:Cigarette),为烟草制品的一种。鼠标是一种很常见及常用的电脑输入设备。'
        self.assertEqual(self.openCC.convert(words),
                         '香菸(英語:Cigarette),爲菸草製品的一種。鼠標是一種很常見及常用的電腦輸入設備。')

    def test_s2tw(self):
        self.openCC.set_conversion('s2tw')
        words = '香烟(英语:Cigarette),为烟草制品的一种。鼠标是一种很常见及常用的电脑输入设备。'
        self.assertEqual(self.openCC.convert(words),
                         '香菸(英語:Cigarette),為菸草製品的一種。鼠標是一種很常見及常用的電腦輸入設備。')

    def test_s2twp(self):
        self.openCC.set_conversion('s2twp')
        words = '香烟(英语:Cigarette),为烟草制品的一种。内存是一种很常见及常用的电脑输入设备。'
        self.assertEqual(self.openCC.convert(words),
                         '香菸(英語:Cigarette),為菸草製品的一種。記憶體是一種很常見及常用的電腦輸入裝置。')

    def test_t2hk(self):
        self.openCC.set_conversion('t2hk')
        words = '香菸(英語:Cigarette),爲菸草製品的一種。滑鼠是一種很常見及常用的電腦輸入裝置。'
        self.assertEqual(self.openCC.convert(words),
                         '香煙(英語:Cigarette),為煙草製品的一種。滑鼠是一種很常見及常用的電腦輸入裝置。')

    def test_t2s(self):
        self.openCC.set_conversion('t2s')
        words = '香菸(英語:Cigarette),爲菸草製品的一種。滑鼠是一種很常見及常用的電腦輸入裝置。'
        self.assertEqual(self.openCC.convert(words),
                         '香烟(英语:Cigarette),为烟草制品的一种。滑鼠是一种很常见及常用的电脑输入装置。')

    def test_t2tw(self):
        self.openCC.set_conversion('t2tw')
        words = '香菸(英語:Cigarette),爲菸草製品的一種。鼠標是一種很常見及常用的電腦輸入設備。'
        self.assertEqual(self.openCC.convert(words),
                         '香菸(英語:Cigarette),為菸草製品的一種。鼠標是一種很常見及常用的電腦輸入設備。')

    def test_tw2s(self):
        self.openCC.set_conversion('tw2s')
        words = '香菸(英語:Cigarette),為菸草製品的一種。滑鼠是一種很常見及常用的電腦輸入裝置。'
        self.assertEqual(self.openCC.convert(words),
                         '香烟(英语:Cigarette),为烟草制品的一种。滑鼠是一种很常见及常用的电脑输入装置。')

    def test_tw2sp(self):
        self.openCC.set_conversion('tw2sp')
        words = '香菸(英語:Cigarette),為菸草製品的一種。記憶體是一種很常見及常用的電腦輸入裝置。'
        self.assertEqual(self.openCC.convert(words),
                         '香烟(英语:Cigarette),为烟草制品的一种。内存是一种很常见及常用的电脑输入设备。')
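One thing the expectations above make visible: the conversion pairs are not exact inverses, because variant mappings are lossy (s2t maps 烟 to 菸, while t2hk maps 菸 to 煙, a different glyph from the one we started with). A small sketch of that asymmetry, using the same API and the same words as the test class:

from opencc import OpenCC

cc = OpenCC()
cc.set_conversion('s2t')
traditional = cc.convert('香烟')   # '香菸', per test_s2t
cc.set_conversion('t2hk')
print(cc.convert(traditional))     # '香煙', per test_t2hk: not the original glyph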
def main(fin, fout):
    cc = OpenCC('t2s')
    for line in fin:
        line2 = cc.convert(line)
        fout.write(line2)
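A usage sketch for the stream converter above (the file names are hypothetical): because it iterates line by line, it handles files of any size without loading them into memory.

if __name__ == '__main__':
    # convert a Traditional Chinese text file to Simplified Chinese
    with open('input.zht.txt', encoding='utf-8') as fin, \
            open('output.zhs.txt', 'w', encoding='utf-8') as fout:
        main(fin, fout)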
# import tensorflow as tf
import numpy as np
import pandas as pd
import keras
import CKIPClient_python3 as parser
from time import sleep
import os.path
import pickle
from IPython import embed
from opencc import OpenCC

openCC = OpenCC('s2t')

PAD = 3
EOS = 1
UNK = 2
BOS = 0


def sql_to_csv():
    D = pd.read_sql('select * from "hotel-review"', 'sqlite:///HR_30.db')
    del D['id']
    D['review'] = [openCC.convert(sen) for sen in D['review']]
    D.to_csv('data/raw.csv', index=False)


def raw_to_ckip_parse():
    import re
    # if not os.path.isfile('data/raw.csv'):
    #     raise Exception('No raw csv')
    D = pd.read_csv('data/raw.csv')
def test_base_zhs2zht(self):
    c = OpenCC('zhs2zht.ini')
    self.assertEqual(c.convert('开放中文转换'), '開放中文轉換')
    c.close()
class RimeStyle:
    # NOTE: these class-level lists are shared across instances; __init__
    # clears options/options_states/uris in place rather than rebinding them.
    font_face = "MingLiu"
    candidate_format = "{0} {1}"
    inline_preedit = "false"
    menu_opencc = None
    font_point = 20
    candidate_per_row = 1
    inline_code = False
    display_tray_icon = False
    candidate_use_cursor = False
    soft_cursor = False
    menu = []
    options = []
    options_states = []
    schemas = []
    uris = []
    session_id = None

    def __init__(self, appname, session_id):
        self.session_id = session_id
        config = RimeConfig()
        if not rime.config_open(appname.encode("UTF-8"), config):
            return
        self.font_face = rimeGetString(config, 'style/font_face')
        self.candidate_format = rimeGetString(config, 'style/candidate_format')
        self.inline_preedit = rimeGetString(config, 'style/inline_preedit')
        menu_opencc_config = rimeGetString(config, 'style/menu_opencc')
        self.menu_opencc = (OpenCC(menu_opencc_config)
                            if menu_opencc_config else None)
        value = c_int()
        if rime.config_get_int(config, b'style/font_point', value):
            self.font_point = value.value
        if rime.config_get_bool(config, b'style/horizontal', value):
            self.candidate_per_row = 10 if bool(value) else 1
        if rime.config_get_int(config, b'style/candidate_per_row', value):
            self.candidate_per_row = value.value
        if rime.config_get_bool(config, b'style/display_tray_icon', value):
            self.display_tray_icon = bool(value)
        if rime.config_get_bool(config, b'style/candidate_use_cursor', value):
            self.candidate_use_cursor = bool(value)
        if rime.config_get_bool(config, b'style/soft_cursor', value):
            self.soft_cursor = bool(value)
        self.options.clear()
        self.options_states.clear()
        self.uris.clear()
        self.menu = self.config_get_menu(config, b'menu')
        # print("menu", self.menu)
        rime.config_close(config)

    def get_schema(self, commandId):
        if commandId >= ID_SCHEMA:
            return self.schemas[commandId - ID_SCHEMA]

    def get_option(self, commandId):
        if commandId >= ID_OPTION:
            return self.options[commandId - ID_OPTION]

    def get_uri(self, commandId):
        if commandId >= ID_URI:
            return self.uris[commandId - ID_URI]

    def get_schema_list(self):
        schema_list = RimeSchemaList()
        self.schemas = []
        submenu = []
        current_schema = bytes(CHAR_SIZE)
        rime.get_current_schema(self.session_id, current_schema, CHAR_SIZE)
        current_schema_id = current_schema.rstrip(b'\0')
        if rime.get_schema_list(schema_list):
            n = schema_list.size
            for i in range(n):
                schema_id = schema_list.list[i].schema_id
                name = schema_list.list[i].name.decode("UTF-8")
                if self.menu_opencc:
                    name = self.menu_opencc.convert(name)
                self.schemas.append(schema_id)
                d = {'text': name, 'id': ID_SCHEMA + i}
                if schema_id == current_schema_id:
                    d["checked"] = True
                submenu.append(d)
        rime.free_schema_list(schema_list)
        return submenu

    def config_get_menu(self, config, path):
        menu = []
        iterator = RimeConfigIterator()
        if not rime.config_begin_list(iterator, config, path):
            return
        while rime.config_next(iterator):
            d = {}
            name = rime.config_get_cstring(config, iterator.path + b'/name')
            command = rime.config_get_cstring(config,
                                              iterator.path + b'/command')
            uri = rime.config_get_cstring(config, iterator.path + b'/uri')
            text = rime.config_get_cstring(config, iterator.path + b'/text')
            if command:
                d["id"] = commands.get(command.decode("UTF-8"), 0)
                if ID_SCHEMA_LIST == d["id"]:
                    d["submenu"] = self.get_schema_list()
                elif ID_SYNC_DIR == d["id"]:
                    d["enabled"] = os.path.isdir(
                        rime.get_sync_dir().decode(ENC))
            elif uri:
                d["id"] = ID_URI + len(self.uris)
                self.uris.append(uri.decode("UTF-8"))
            elif name:
                states = [rime.config_get_cstring(
                              config, iterator.path + b'/states/@0').decode("UTF-8"),
                          rime.config_get_cstring(
                              config, iterator.path + b'/states/@1').decode("UTF-8")]
                d["id"] = ID_OPTION + len(self.options)
                state_id = rime.get_option(self.session_id, name)
                d["text"] = "%s → %s" % (states[state_id],
                                         states[1 - state_id])
                self.options_states.append(states)
                self.options.append(name)
            if text:
                d["text"] = text.decode("UTF-8")
                if self.menu_opencc:
                    d["text"] = self.menu_opencc.convert(d["text"])
            submenu = self.config_get_menu(config, iterator.path + b'/submenu')
            if submenu:
                d["submenu"] = submenu
            menu.append(d)
        rime.config_end(iterator)
        return menu