Ejemplo n.º 1
0
def regular(sen):
    """Normalise a raw-corpus sentence by unifying punctuation variants.

    Converts Traditional to Simplified Chinese, strips special vocabulary
    tokens, collapses runs of repeated punctuation to a single canonical
    (fullwidth) mark, and removes characters outside the allowed set.

    :param sen: raw sentence string
    :return: normalised sentence string

    Fixes: the original converted to zh-cn twice in a row (the second
    call is a no-op) — the duplicate was removed; the character-class
    pattern near the end was not a raw string, leaving an invalid
    escape sequence for the word-character token.
    """
    # Traditional -> Simplified.
    sen = zhconv.convert(sen, 'zh-cn')
    # Strip special vocabulary tokens left over from model output.
    for token in ('<GO>', '<PAD>', '<EOS>', '<UNK>', '/'):
        sen = sen.replace(token, '')
    # Collapse ellipsis variants ('……', '....') to a single '···'.
    sen = re.sub(r'…{1,100}', '···', sen)
    sen = re.sub(r'\.{3,100}', '···', sen)
    sen = re.sub(r'···{2,100}', '···', sen)
    # Collapse runs of ASCII/fullwidth punctuation to one fullwidth mark.
    sen = re.sub(r',{1,100}', ',', sen)
    sen = re.sub(r',{1,100}', ',', sen)
    sen = re.sub(r'\.{1,100}', '。', sen)
    sen = re.sub(r'。{1,100}', '。', sen)
    sen = re.sub(r'\?{1,100}', '?', sen)
    sen = re.sub(r'?{1,100}', '?', sen)
    sen = re.sub(r'!{1,100}', '!', sen)
    sen = re.sub(r'!{1,100}', '!', sen)
    sen = re.sub(r'~{1,100}', '~', sen)
    sen = re.sub(r'~{1,100}', '~', sen)
    # Fullwidth digits -> ASCII digits.
    sen = re.sub(r'0', '0', sen)
    sen = re.sub(r'3', '3', sen)
    # Whitespace runs become a fullwidth comma; unify curly quotes.
    sen = re.sub(r'\s{1,100}', ',', sen)
    sen = re.sub(r'[“”]{1,100}', '"', sen)  # CJK curly quotes are hard to handle
    # Drop everything outside word chars / CJK ideographs / kept punctuation.
    sen = re.sub(r'[^\w\u4e00-\u9fff"。,?!~·]+', '', sen)
    sen = re.sub(r'[ˇˊˋˍεπのゞェーω]', '', sen)
    return sen
Ejemplo n.º 2
0
def html_to_tw(data_txt_html):
    """Convert every part of a parsed article bundle to Traditional Chinese.

    *data_txt_html* is a sequence of [title, body, category_list, tag_list];
    a new list in the same order is returned with all text run through
    convert(..., 'zh-tw').
    """
    title_content = convert(data_txt_html[0], "zh-tw")       # title
    Articles_contents = convert(data_txt_html[1], "zh-tw")   # article body

    # Categories and tags are converted element by element.
    category_lists = [convert(c, "zh-tw") for c in data_txt_html[2]]
    post_tags = [convert(t, "zh-tw") for t in data_txt_html[3]]

    return [title_content, Articles_contents, category_lists, post_tags]
Ejemplo n.º 3
0
def test_zhconv():
    """Smoke-test zhconv conversion in both directions."""
    from zhconv import convert

    samples = (
        ('我幹什麼不干你事。', 'zh-cn'),   # Traditional -> Simplified
        ('人体内存在很多微生物', 'zh-tw'),  # Simplified -> Traditional
    )
    for text, locale in samples:
        print(convert(text, locale))
Ejemplo n.º 4
0
def table_to_data(table):
    """Flatten an HTML <table> into rows of Simplified-Chinese strings.

    All <td> cells are collected row by row; the first row is then
    replaced by the <th> header cells of the first <tr>.
    """
    def _cell(cell):
        # Normalise a cell: strip whitespace, convert to Simplified Chinese.
        return convert(cell.text.strip(), 'zh-cn')

    data = []
    for row in table.find_all("tr"):
        data.append([_cell(c) for c in row.find_all("td")])
    data[0] = [_cell(c) for c in table.find("tr").find_all("th")]
    return data
Ejemplo n.º 5
0
def convert_file_to_hans(target_file):
    """Convert *target_file*'s text to Simplified Chinese (zh-hans) and
    write the result next to it with a "hans_" filename prefix.

    :param target_file: path of the UTF-8 text file to convert

    Bug fixes: the original called ``base_name.decode('utf-8')`` and
    ``.encode('utf-8')`` — Python 2 idioms that raise on Python 3, where
    os.path.split returns str. It also built the output path with
    ``dir_name + "/"``, which produces an absolute path when dir_name is
    empty; os.path.join is used instead.
    """
    with codecs.open(target_file, 'r', encoding='utf-8') as f:
        content = f.read()
    dir_name, base_name = os.path.split(target_file)
    # File names are already str on Python 3 — convert directly.
    hans_base_name = "hans_" + zhconv.convert(base_name, 'zh-hans')
    hans = zhconv.convert(content, 'zh-hans')
    out_path = os.path.join(dir_name, hans_base_name)
    with codecs.open(out_path, 'w', encoding='utf-8') as res:
        res.write(hans)
Ejemplo n.º 6
0
def get_cv(cv):
    """Look up all characters voiced by *cv* and return a zh-hans summary."""
    cv = convert(cv, 'zh-tw')  # the database stores Traditional names
    query = Info.select().where(Info.cv==cv)
    if len(query) == 0:
        msg = f"没有找到{cv}扮演的角色"
    else:
        # One name per line after the header.
        names = "".join("\n" + row.name for row in query)
        msg = f"{cv} 的扮演角色有:" + names
    return convert(msg, "zh-hans")
Ejemplo n.º 7
0
 def convert_to_zhtw(self, sentences):
     """Translate a {title: [(speaker, content), ...]} mapping to zh-tw.

     Returns a new dict; titles, speakers and contents are all converted.
     """
     print("Convert sentences to Traditional Chinese...")
     result = {}
     for title, pairs in sentences.items():
         converted = [[convert(spk, 'zh-tw'), convert(txt, 'zh-tw')]
                      for spk, txt in pairs]
         result[convert(title, 'zh-tw')] = converted
     return result
Ejemplo n.º 8
0
async def resou(ctx, *args):
    """Discord command: browse trending topics five titles per page.

    Pass 'f' as the first argument for Traditional Chinese output.
    Arrow reactions page forward/backward; bot.wait_for times out after
    30 s (raising asyncio.TimeoutError, which ends the command).
    """
    res_json = json.loads(
        requests.get('https://api.oioweb.cn/api/summary.php').text)
    title_list = [i['title'] for i in res_json]
    count = 5  # titles shown per page
    page_index = 0
    msg = '\n'.join(title_list[page_index * count:(page_index + 1) * count])
    if len(args) > 0 and args[0] == 'f':
        msg = convert(msg, 'zh-hant')  # Simplified -> Traditional on demand
    message = await ctx.send(msg)
    prev_ic = "⬅️"
    next_ic = "➡️"
    await message.add_reaction(prev_ic)
    await message.add_reaction(next_ic)

    valid_reactions = [prev_ic, next_ic]

    def check(reaction, user):
        # Only accept the command author clicking one of our two arrows.
        return user == ctx.author and str(reaction.emoji) in valid_reactions

    async def reset_reaction():
        # Clear and re-add the arrows so the next click is a fresh event.
        await message.clear_reactions()
        await message.add_reaction(prev_ic)
        await message.add_reaction(next_ic)

    reaction, user = await bot.wait_for('reaction_add',
                                        timeout=30.0,
                                        check=check)
    while reaction != None:
        # Page forward (wrapping to 0 past page 10) or backward (floored at 0).
        if str(reaction.emoji) == next_ic:
            if page_index >= 10:
                page_index = 0
            else:
                page_index += 1
        else:
            if page_index <= 0:
                page_index = 0
            else:
                page_index -= 1

        # Last page may be shorter than `count` titles.
        if (page_index + 1) * count < len(title_list):
            msg = '\n'.join(title_list[page_index * count:(page_index + 1) *
                                       count])
        else:
            msg = '\n'.join(title_list[page_index * count:])
        if len(args) > 0 and args[0] == 'f':
            msg = convert(msg, 'zh-hant')
        await message.edit(content=msg)
        await reset_reaction()
        reaction, user = await bot.wait_for('reaction_add',
                                            timeout=30.0,
                                            check=check)
Ejemplo n.º 9
0
 def parse_item(self, response):
     """Yield a BookItem (url, name, description) scraped from a detail page.

     All text is converted to Simplified Chinese; any scraping error is
     printed and the page is skipped.
     """
     try:
         name = response.xpath('//div/b/text()').extract()[0]
         desc = response.xpath('//*[@id="desc_text"]/text()').extract()[0]
         item = BookItem()
         item['book_url'] = response.url
         item['book_name'] = convert(name, 'zh-cn')
         item['book_desc'] = convert(desc.strip(), 'zh-cn')
         yield item
     except Exception as e:
         print(e)
         return
Ejemplo n.º 10
0
    def search(self):
        """Find keyword-matching wiki sections and dump their sentences.

        For each page in <data_dir>/sections.json, every section title is
        simplified and checked against self.keywords; on a match, the
        sentences whose character offsets fall inside that section
        (bounded by the next same-level heading) are pulled from the
        Mongo collection and written to self.output_path.

        Bug fix: the zhconv locale was misspelled 'zh-ch' in both calls;
        the Simplified-Chinese locale is 'zh-cn', so titles and sentences
        were never actually converted. Also the sections.json handle is
        now closed via a context manager instead of leaking.
        """
        self.system_logger.info('loading data...')
        with open('%s/sections.json' % self.data_dir) as f:
            data = json.load(f)
        self.system_logger.info('keywords: %s' % self.keywords)
        self.system_logger.info('search...')
        with open(self.output_path, 'w') as fw:
            for kbid in data:
                sections = data[kbid]
                for n, i in enumerate(sections):
                    sec = zhconv.convert(i[0], 'zh-cn')
                    for kw in self.keywords:
                        if kw in sec:
                            start = i[1][0]
                            if len(sections) - 1 == n:
                                end = sys.maxsize
                            else:
                                end = sections[n + 1][1][0]

                                # In case of:
                                # == 评价 ==     <- 2 '=' mark, target section
                                # === 正面 ===   <- 3 '=' mark, subsection
                                # ...
                                # ==== 争议 ==== <- 4 '=' mark, subsubsection
                                # ...
                                # === 负面 ===   <- 3 '=' mark, subsection
                                # ...
                                # == Foo ==     <- 2 '=' mark, new section
                                # ...
                                nums = sec.count('=')
                                for k in range(n + 1, len(sections)):
                                    if sections[k][0].count('=') == nums:
                                        end = sections[k][1][0]
                                        break

                            res = self.collection.find({
                                'source_title': kbid,
                                'start': {
                                    '$gt': start - 1
                                },
                                'end': {
                                    '$lt': end
                                }
                            })
                            if res.count() > 0:
                                fw.write('%s\n' % kbid)
                                for r in res:
                                    sent = ''.join([t[0] for t in r['tokens']])
                                    sent = zhconv.convert(sent, 'zh-cn')
                                    fw.write('%s\n' % sent)
                                fw.write('\n')
                            break
Ejemplo n.º 11
0
def findAndInput(path, matchedMusicPath, targetPathFile):  # relative paths
    """Match playlist entries against a "name=path" music index and write
    one .m3u file per playlist.

    :param path: directory containing playlist files
    :param matchedMusicPath: file of "song name=file path" lines
    :param targetPathFile: destination for the generated .m3u output

    Fix: both input files are now opened with `with` so their handles are
    closed even if matching raises (the originals were never closed).
    """
    matchedMusic_list = {}  # song-name index being matched against
    success_musicPath_list = []  # successfully matched file paths
    matchedKey = 1  # running index key

    with open(matchedMusicPath, encoding="UTF-8-sig") as matchedMusic:
        for matchedMusicLine in matchedMusic:
            matchedMusicLine = matchedMusicLine.replace("\n", "")
            # Split e.g. "Jam - 七月上=F:/缓存音乐/Music1/Jam - 七月上.mp3"
            targetAtr = re.split(r'[=]', matchedMusicLine)
            while "" in targetAtr:  # drop empty fragments
                targetAtr.remove("")
            # Strip &nbsp; from the song name only (not from the path).
            targetAtr[0] = targetAtr[0].replace(u'\xa0', ' ')
            # Normalise the name to Simplified Chinese, just in case.
            targetAtr[0] = zhconv.convert(targetAtr[0], 'zh-hans')
            matchedMusic_list[matchedKey] = targetAtr
            matchedKey += 1

    files = os.listdir(path)  # every playlist file

    musicCount = 0  # total number of playlist entries processed
    for f in files:
        print("文件名:", f)
        with open(path + '/' + f, 'r', encoding='UTF-8-sig') as file:
            for line in file:
                line = line.replace(u'\xa0', ' ').replace('\n', "")
                # Normalise the playlist side to Simplified Chinese too.
                targetMusicLine = zhconv.convert(line, 'zh-hans')
                # adaptation() returns the matched path, or a falsy value.
                success_path = adaptation(targetMusicLine, matchedMusic_list)
                if success_path:
                    success_musicPath_list.append(success_path)
                musicCount += 1
        outPutM3u(success_musicPath_list, targetPathFile, f)  # write .m3u
        success_musicPath_list = []  # reset for the next playlist
    print("匹配次数为:", musicCount)
Ejemplo n.º 12
0
def chinese():
    """Flask endpoint: convert request JSON 'text' to the locale in 'to'.

    'zh-CN' requests Simplified output; anything else yields Traditional.
    """
    payload = request.get_json()
    text = payload['text']
    target = payload['to']

    locale = 'zh-cn' if target == 'zh-CN' else 'zh-tw'
    result = convert(text, locale)

    print(result)
    return jsonify({'result': result})
Ejemplo n.º 13
0
def chineseconvert():
    """Flask endpoint: toggle text between Simplified and Traditional.

    If simplifying leaves the text unchanged it was already Simplified,
    so convert to Traditional instead; otherwise return the Simplified
    form.
    """
    payload = request.get_json()
    text = payload['text']

    simplified = convert(text, 'zh-cn')
    result = convert(text, 'zh-tw') if simplified == text else simplified

    print(result)
    return jsonify({'result': result})
Ejemplo n.º 14
0
def get_uniquei(id):
    """Build a zh-hans description card for a unit's unique equipment.

    :param id: unit id shared by the Uniquei / Skill / Props tables
    :return: formatted card text, or a fallback message when the unit
        has no unique equipment
    """
    query = Uniquei.get_or_none(Uniquei.id==id)
    if query:
        # Base skill and its unique-equipment enhanced variant.
        skill = Skill.get(Skill.id==id,Skill.type=='技能1')
        skill1 = Skill.get(Skill.id==id,Skill.type=='專武強化技能1')
        e_icon = resize_icon(query.num,types='equipment')
        prop = Props.select().where(Props.id==id)
        msg = ''
        msg += f'\n{query.name}\n'
        msg += f'{e_icon}\n'
        msg += f'{query.description}'
        msg += '\n======================\n'
        # Stat ranges: property name, base value through max value.
        for i in prop:
            msg += f'{i.property}:{i.base_value}-{i.max_value}\n'
        msg += '======================\n'
        msg += f'{skill.type}:{skill.name}\n'
        msg += f'{resize_icon(skill.num)}\n'
        msg += f'描述:\n{skill.description}\n'
        # `effect` is stored as a stringified list like "['a', 'b']";
        # strip the brackets/quotes and split on the separator.
        effect = skill.effect.strip( "[']" )
        msg += '效果:\n'
        for e in effect.split("', '"):
            msg += f'{e}'
        msg += '\n======================\n'
        msg += f'{skill1.type}:{skill1.name}\n'
        msg += f'{resize_icon(skill1.num)}\n'
        msg += f'描述:\n{skill1.description}\n'
        effect1 = skill1.effect.strip( "[']" )
        msg += '效果:\n'
        for e in effect1.split("', '"):
            msg += f'{e}'
        return convert(msg, 'zh-hans')
    else:
        return '\n该角色暂时没有专武。'
Ejemplo n.º 15
0
    async def preprocess(self, sent: str):
        """Normalise and tokenize *sent* for translation.

        cn2en: simplify the Chinese, short-circuit when the sentence is a
        known dictionary phrase or contains no Chinese characters at all,
        otherwise tokenize via the remote service and segment.
        en2cn: normalise punctuation, tokenize, lower-case and segment.

        :raises Exception: when self.type is neither 'cn2en' nor 'en2cn'.
        """
        if self.type == "cn2en":
            sent = convert(sent, "zh-cn")
            # Skip the translation plumbing for exact dictionary hits and
            # for text with no Chinese characters.
            if self.stops.sub("", sent) in self.cn2en_trans_dict or \
                not self.chinese_char_pattern.search(sent):
                return sent

            async with self.client.post(self.tokenize_url,
                                        json={
                                            'q': sent,
                                            "mode": self.tokenize_mode
                                        }) as rsp:
                rsp = await rsp.json()
                sent = " ".join(rsp['words'])
                # remove_ngram presumably collapses repeated 2-4-gram
                # echoes before subword segmentation — TODO confirm.
                sent = remove_ngram(sent, min_n_gram=2, max_n_gram=4)
                sent = self.tokenizer.segment(sent)
        elif self.type == "en2cn":
            sent = self.en_normalize_punctuation.normalize(sent)
            sent = self.en_tokenizer.tokenize(sent, return_str=True)
            # NOTE(review): E2V looks like an English variant/width
            # normaliser — confirm its contract before relying on it.
            tok = E2V(sent)
            tok = tok.lower()
            tok = remove_ngram(tok, min_n_gram=2, max_n_gram=4)
            sent = self.tokenizer.segment(tok)
        else:
            raise Exception("This type({}) is not support.".format(self.type))
        return sent
Ejemplo n.º 16
0
 def ts_trans2(self, param):
     """Convert param['first'] between Simplified and Traditional Chinese.

     option '0': auto — try zh-tw first, fall back to zh-cn if unchanged;
     option '1': force zh-tw; anything else forces zh-cn.
     Errors are logged and re-raised.
     """
     op = param.get("option")
     source = param.get("first")
     try:
         if op == '1':
             return convert(source, 'zh-tw')
         if op != '0':
             return convert(source, 'zh-cn')
         # Auto mode: if the zh-tw pass is a no-op the text was already
         # Traditional, so produce the Simplified form instead.
         result = convert(source, 'zh-tw')
         if source == result:
             result = convert(source, 'zh-cn')
         return result
     except Exception as e:
         logger.error(e)
         raise e
Ejemplo n.º 17
0
    def convertToZhtw(self, fileFromPath, fileToPath):
        """Convert a UTF-8 text file to Traditional Chinese (zh-tw).

        :param fileFromPath: source file path
        :param fileToPath: destination file path

        Bug fix: the original called f.read().decode("utf-8") — a
        Python 2 idiom; a Python 3 text-mode read returns str, which has
        no .decode(). Both files are now opened with an explicit UTF-8
        encoding instead.
        """
        with open(fileFromPath, 'r', encoding='utf-8') as f:
            content = f.read()
        content = zhconv.convert(content, 'zh-tw')
        with open(fileToPath, 'w', encoding='utf-8') as f1:
            f1.write(content)
Ejemplo n.º 18
0
def t2d(table):
    """Convert an HTML <table> into a 2-D list, expanding row/col spans.

    Each spanned cell's value is copied into every grid position it
    covers; -1 marks positions no cell reaches. Digit strings become
    ints; other text is converted to Simplified Chinese. The grid size
    comes from get_col_row_num.
    """
    rows = table.find_all("tr")
    col_num, row_num = get_col_row_num(rows)
    # -1 marks a grid position not yet filled by any cell.
    res = [[-1 for i in range(col_num)] for j in range(row_num)]
    i = 0
    # i-th row, j-th column
    for row in rows:
        j = 0
        cells = row.find_all(["th", "td"])
        for cell in cells:
            value = cell.text.strip()
            # Skip positions already claimed by a rowspan from above.
            while j < col_num and res[i][j] != -1:
                j += 1
            if col_num <= j:
                break
            # Clamp the declared spans so they never run off the grid.
            col_span, row_span = min(int(cell.attrs.get('colspan', 1)),
                                     col_num - j), min(
                                         int(cell.attrs.get('rowspan', 1)),
                                         row_num - i)  # handle overflow
            value = int(value) if value.isdigit() else convert(value, 'zh-cn')
            res[i][j] = value  # current cell
            for k in range(1, row_span):
                res[i + k][j] = value  # down
            for k in range(1, col_span):
                j += 1
                res[i][j] = value  # right
            j += 1
        i += 1
    return res
Ejemplo n.º 19
0
def get_skill(id):
    """Build a zh-hans skill summary card for the unit with this id.

    :param id: unit id shared by the Info / Skill tables
    :return: formatted card text (opener, loop, then each skill block)
    """
    loop = Info.get(Info.id==id)
    query = Skill.select().where(Skill.id==id)
    arr = []
    for i in query:
        # The unique-equipment enhanced skill is rendered elsewhere.
        if i.type in ("專武強化技能1",):
            continue
        skill = {}
        skill['name'] = i.name
        skill['type'] = i.type
        skill['description'] = i.description
        skill['num'] = i.num
        skill['effect'] = i.effect
        arr.append(skill)
    # Order skills with the project-defined comparator.
    newlist = sorted(arr, key=functools.cmp_to_key(custom_sorted)) 
    msg = ''
    start = get_icons(loop.start.split(','))
    msg += f'\n起手:\n{start}\n'
    loop = get_icons(loop.loop.split(','))
    msg += f'循环:\n{loop}\n'
    msg += '技能:'
    for s in newlist:
        msg += '\n======================\n'
        msg += f'{s["type"]}:{s["name"]}\n'
        msg += f'{resize_icon(s["num"])}\n'
        msg += f'描述:\n{s["description"]}\n'
        # `effect` is stored as a stringified list like "['a', 'b']";
        # strip the brackets/quotes and split on the separator.
        effect = s["effect"].strip( "[']" )
        msg += '效果:\n'
        for e in effect.split("', '"):
            msg += f'{e}'
    return convert(msg, 'zh-hans')
Ejemplo n.º 20
0
def char_word_tokenize(text):
    """Tokenizer: each Chinese character is its own token; runs of ASCII
    letters/digits form one token; everything else is dropped.

    The text is lower-cased, converted to Simplified Chinese and width-
    normalised before scanning.
    """
    # Lower-case, Traditional -> Simplified, fullwidth -> halfwidth.
    text = full_to_half(zhconv.convert(text.lower(), 'zh-cn'))
    tokens = []
    n = len(text)
    pos = 0
    while pos < n:
        ch = text[pos]
        if ch in all:
            # Chinese character: one token per character.
            tokens.append(ch)
            pos += 1
        elif ch.isdigit() or ch.islower():
            # Extend the run of digits/lowercase letters into one token.
            end = pos + 1
            while end < n and (text[end].isdigit() or text[end].islower()):
                end += 1
            tokens.append(text[pos:end])
            pos = end
        else:
            # Anything else (punctuation, whitespace) is skipped.
            pos += 1
    return tokens
def getPropertyJson(props, percount, output):
    """Fetch Wikidata-style property metadata in batches of *percount* ids
    and append one JSON object per property to *output*.

    :param props: list of property ids
    :param percount: number of ids per API request
    :param output: path of the JSON-lines output file

    Bug fixes: the original wrote the accumulated props_detail list once
    per entity inside the inner loop, duplicating every earlier record
    many times over; and the final print referenced an undefined name
    `total`. The batch is now written once, and `total` is a real counter.
    """
    total = 0
    for index in range(0, len(props), percount):
        props_detail = []
        ids = "|".join(props[index:index+percount])
        r = requests.get(url+ids)
        # Normalise the payload to Simplified Chinese before parsing.
        data = json.loads(zhconv.convert(r.text, "zh-cn"))
        for k, v in data['entities'].items():
            if "missing" in v or v.get("labels", -1)==-1:
                continue
            newdata = {}
            newdata['id'] = k
            attrs = ['labels', 'descriptions']
            langs = ['zh', 'en']
            for attr in attrs:
                if v.get(attr, -1)!=-1:
                    for lang in langs:
                        # Keys like 'zh-label', 'en-description'.
                        if v[attr].get(lang, -1)!=-1:
                            newdata[lang + '-' + attr[:-1]] = v[attr][lang]["value"]
            if v.get("aliases", -1)!=-1:
                for lang in langs:
                    if v["aliases"].get(lang, -1)!=-1:
                        newdata[lang + '-' + "aliase"] = [
                            vv["value"] for vv in v["aliases"][lang]
                        ]
            props_detail.append(newdata)
        total += len(props_detail)
        print("Crawled %d properties."%(len(props_detail)))
        # Append the finished batch exactly once.
        with open(output, "a+", encoding="utf8") as f:
            for pp in props_detail:
                json.dump(pp, f, ensure_ascii=False)
                f.write("\n")
    print("total crawled properties: %d"%(total))
Ejemplo n.º 22
0
def save_cut_word_rst(file_path):
    """Segment the 'content' column of <file_path>.csv and save the tokens.

    Each row is simplified (zh-cn) and cut with jieba; space-joined tokens
    are written one row per line to <file_path>_cut_word_rst.txt.

    Fix: the output file is now opened with an explicit utf-8 encoding so
    the result does not depend on the platform default encoding (which
    breaks for Chinese text on Windows).
    """
    data = pd.read_csv(file_path + '.csv', usecols=['content'])
    with open(file_path + '_cut_word_rst.txt', 'w', encoding='utf-8') as f_w:
        for content in data['content'].values:
            # Traditional -> Simplified before segmentation.
            content = zhconv.convert(content.strip(), 'zh-cn')
            # Keep only non-blank tokens.
            tokens = [t for t in jieba.cut(content) if len(t.strip()) > 0]
            f_w.write(' '.join(tokens) + '\n')
Ejemplo n.º 23
0
def one_process(file_list, mode, output_file):
    """Re-encode a list of JSON-lines files, adding mention info per line.

    :param file_list: input file paths to process
    :param output_file: output root; each input keeps its sub-path
    :param mode: "zh" (simplify Chinese before parsing) or "en"
    :return: None

    Bug fix: the original guard was `assert mode == "zh" or "en"`, which
    is always true because the non-empty string "en" is truthy; it now
    genuinely validates the mode.
    """
    assert mode in ("zh", "en"), "mode must be 'zh' or 'en'"
    for file_path in file_list:
        # Mirror the input's sub-path under the output root.
        output_f = output_file + file_path[file_path.index('/', 2):]
        with open(output_f, 'w+', encoding='utf-8') as fw:
            with open(file_path, 'r', encoding='utf-8') as f:
                count = 0
                for line_data in f:
                    if count % 3000 == 0:  # progress heartbeat
                        print(file_path, count)
                    count += 1
                    if mode == "zh":
                        # Simplify the raw line, then parse it as a dict.
                        line_data = json.loads(
                            zhconv.convert(line_data, "zh-cn").strip())
                    else:
                        line_data = json.loads(line_data.strip())
                    new_data = add_mention(line_data, mode)
                    fw.write(json.dumps(new_data, ensure_ascii=False) + '\n')
Ejemplo n.º 24
0
def print_poem(num=10):
    """Return up to *num* top-scoring poems (score >= 0.8) as one string.

    similar_output.txt holds 3-line records whose first two lines are the
    poem text, scored by pz_score. Output is Simplified Chinese.

    Fixes: the legacy 'rU' open mode was removed in Python 3.11 — plain
    'r' already gives universal newlines; the file handle is now closed
    via a context manager; the loop no longer shadows the `num` parameter;
    and the loop is bounded by the number of records to avoid IndexError
    when fewer than *num* poems exist.
    """
    with open('./similar_output.txt', 'r', encoding='UTF-8') as f:
        flist = f.readlines()
    rtn = ""
    scoredict = {}
    for i in range(int(len(flist) / 3)):
        plist = flist[3 * i].strip('\n') + flist[3 * i + 1]
        scoredict[i] = pz_score(plist, i)

    # Highest score first.
    sort_scoredict = sorted(scoredict.items(),
                            key=lambda item: item[1],
                            reverse=True)

    for m in range(min(num, len(sort_scoredict))):
        idx = sort_scoredict[m][0]    # poem record number
        score = sort_scoredict[m][1]  # its score
        if score < 0.8:
            break  # sorted descending: nothing below threshold follows
        # Emit the score line followed by the simplified poem text.
        t = 'score: ' + str(score) + '\n'
        poem = flist[3 * idx].strip('\n') + flist[3 * idx + 1]
        poem = zhconv.convert(poem, 'zh-cn')
        t += poem + '\n'
        rtn += t
    return rtn
Ejemplo n.º 25
0
async def shici(ctx, *args):
    """Discord command: post a random classical Chinese poem line.

    Pass 'f' as the first argument for Traditional Chinese output.
    """
    res_json = json.loads(requests.get('https://v1.jinrishici.com/all').text)
    msg = "{}\n——{} {}".format(res_json['content'], res_json['origin'],
                               res_json['author'])
    if args and args[0] == 'f':
        msg = convert(msg, 'zh-hant')  # Simplified -> Traditional on demand
    await ctx.send(msg)
Ejemplo n.º 26
0
def preprocess(sen):
    """Clean a review string and tokenize it with jieba.

    Lower-cases the text, strips spaces/newlines and <...> tag content,
    converts Traditional to Simplified Chinese, drops every character
    that is not a CJK ideograph / ASCII letter / digit, then segments
    with jieba.

    :param sen: raw string to clean
    :return: list of tokens

    Bug fixes: the original called sen.lower() without using the return
    value (str.lower is not in-place), and its character class contained
    stray '^' characters inside the brackets, which made literal '^'
    survive the cleanup instead of being removed.
    """
    import zhconv
    import jieba

    sen = sen.lower()
    sen = sen.replace(' ', '').replace('\n', '')
    # Strip the text between angle brackets, then the leftover '<>' pairs.
    # https://blog.csdn.net/z1102252970/article/details/70739804
    pattern = re.compile(r'(?<=<).+?(?=>)')
    str1 = pattern.sub('', sen)
    str1 = str1.replace('<>', '')
    # Keep only Chinese characters, ASCII letters and digits.
    cop = re.compile(r"[^\u4e00-\u9fa5a-zA-Z0-9]")
    str1 = zhconv.convert(str1, 'zh-cn')
    str1 = cop.sub('', str1)
    tokens = jieba.cut(str1)
    return list(tokens)
Ejemplo n.º 27
0
def convertepub(filename, output, locale):
    """Copy an EPUB, converting all (X)HTML/NCX text members with zhconv.

    Non-text members (images, css, fonts, ...) pass through unchanged.
    """
    with zipfile.ZipFile(filename, 'r') as zf, \
         zipfile.ZipFile(output, 'w') as zw:
        zfiles = collections.OrderedDict(
            (zi.filename, zi) for zi in zf.infolist())

        # container.xml names the OPF package document(s).
        with zf.open(zfiles['META-INF/container.xml'], 'r') as f:
            dom = xml.dom.minidom.parse(f)
            rootfiles = [
                node.getAttribute('full-path')
                for node in dom.getElementsByTagName('rootfile')
                if node.getAttribute('media-type')
                == 'application/oebps-package+xml'
            ]

        # Each OPF manifest lists the content documents to convert.
        htmls = set(rootfiles)
        text_types = ('application/xhtml+xml', 'application/x-dtbncx+xml')
        for rootfile in rootfiles:
            with zf.open(zfiles[rootfile], 'r') as f:
                dom = xml.dom.minidom.parse(f)
                manifest = dom.getElementsByTagName('manifest')[0]
                for item in manifest.getElementsByTagName('item'):
                    if item.getAttribute('media-type') in text_types:
                        htmls.add(item.getAttribute('href'))

        # Text members are converted; everything else is copied verbatim.
        for name, zi in zfiles.items():
            raw = zf.read(zi)
            if name in htmls:
                converted = zhconv.convert(raw.decode('utf-8'), locale)
                zw.writestr(zi, converted.encode('utf-8'))
            else:
                zw.writestr(zi, raw)
Ejemplo n.º 28
0
def render_text_with_token_id(token_id, font, use_traditional, idx2word):
    """Render the glyph for a vocabulary id as a padded bitmap mask.

    Multi-character entries cannot be drawn as a single glyph, so they
    yield an all-zero square of side font.size + 1 instead.
    """
    word = idx2word[token_id]
    if use_traditional:
        word = convert(word, 'zh-hant')
    if len(word) > 1:
        side = font.size + 1
        return np.zeros((side, side))
    return pad_mask(render_text(word, font), font.size)
Ejemplo n.º 29
0
 def search(self, searchItem):
     """Fetch and parse the wiki page for '战争列表_(' + searchItem through a
     local proxy, yielding a BeautifulSoup of the zh-hans page.

     On failure the generator returns '404' (kept as-is for backward
     compatibility with existing callers).

     Bug fixes: the proxies dict listed the "http" key twice — Python
     keeps only the last value, so the dead first entry was removed; the
     bare `except:` was narrowed to Exception so KeyboardInterrupt and
     SystemExit are no longer swallowed.
     """
     proxies = {
         "https": "https://127.0.0.1:1080",
         "http": "http://127.0.0.1:1080"
     }
     headers = {
         'user-agent':
         'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
     }
     searchItem = '战争列表_(' + searchItem
     google_url = str(self.url) + urllib.parse.quote(searchItem)
     try:
         # Route the request through the configured proxy.
         proxy_handler = urllib.request.ProxyHandler(proxies)
         opener = urllib.request.build_opener(
             proxy_handler, urllib.request.HTTPHandler)
         urllib.request.install_opener(opener)  # install as global opener
         req = urllib.request.Request(google_url, headers=headers)
         response = urllib.request.urlopen(req)
         content = response.read().decode("utf-8")
         content = convert(str(content), 'zh-hans')
         soupIter = BeautifulSoup(content, 'lxml')
         yield soupIter
     except Exception:
         return '404'
Ejemplo n.º 30
0
def convertepub(filename, output, locale):
    """Copy an EPUB, converting all (X)HTML/NCX text members with zhconv.

    :param filename: source .epub path
    :param output: destination .epub path
    :param locale: zhconv target locale, e.g. 'zh-cn' or 'zh-tw'
    """
    with zipfile.ZipFile(filename, 'r') as zf, \
         zipfile.ZipFile(output, 'w') as zw:
        zfiles = collections.OrderedDict()
        for zi in zf.infolist():
            zfiles[zi.filename] = zi
        # container.xml names the OPF package document(s).
        with zf.open(zfiles['META-INF/container.xml'], 'r') as f:
            dom = xml.dom.minidom.parse(f)
            rootfiles = [
                t.getAttribute('full-path')
                for t in dom.getElementsByTagName('rootfile') if
                t.getAttribute('media-type') == 'application/oebps-package+xml'
            ]
        htmls = set(rootfiles)
        # Each OPF manifest lists the content documents to convert.
        for rootfile in rootfiles:
            with zf.open(zfiles[rootfile], 'r') as f:
                dom = xml.dom.minidom.parse(f)
                manifest = dom.getElementsByTagName('manifest')[0]
                htmls.update(
                    t.getAttribute('href')
                    for t in manifest.getElementsByTagName('item')
                    if t.getAttribute('media-type') in (
                        'application/xhtml+xml', 'application/x-dtbncx+xml'))
        # Text members are converted; everything else is copied verbatim.
        # NOTE(review): manifest hrefs are relative to the OPF document, so
        # this match only works when content sits at the archive root —
        # confirm against the EPUBs this is used on.
        for name, zi in zfiles.items():
            if name in htmls:
                s = zhconv.convert(zf.read(zi).decode('utf-8'), locale)
                zw.writestr(zi, s.encode('utf-8'))
            else:
                zw.writestr(zi, zf.read(zi))
Ejemplo n.º 31
0
 def find_loc(self, text):
     """Extract (nation, province, city) from free text, or 0 when nothing
     geographic is found.

     Disambiguation: '吉安'/'吉林' are dropped when their province is
     absent (they are also common non-place strings); any city whose
     province is not mentioned is dropped; '阿里' is rejected without its
     province (it usually means Alibaba, not the Tibetan prefecture).

     Bug fix: the original popped items from `city` while iterating over
     it, which skips the element following every removal; the filter is
     now a list comprehension over the full list.
     """
     text = zhconv.convert(text, "zh-cn")
     text = text.lower()
     text = text.replace(" ", "")
     city = self.find_city(text)
     province = self.find_province(text)
     nation = self.find_nation(text)
     if len(city) > 1:
         if "吉安" in city:
             if self.city_province_dict["吉安"] not in province:
                 city.pop(city.index("吉安"))
         elif "吉林" in city and "吉林" in province:
             city.pop(city.index("吉林"))
         else:
             # Keep only cities whose province is also mentioned.
             city = [c for c in city
                     if self.city_province_dict[c] in province]
         if len(city) > 1:
             city = [city[0]]  # ambiguity left: keep the first hit
     if "阿里" in city and self.city_province_dict["阿里"] not in province:
         city = []
     if len(city) > 0:
         # A confirmed city pins down the province.
         province = [self.city_province_dict[city[0]]]
     elif len(province) > 1:
         province = [province[-1]]
     if (len(province) + len(city) + len(nation)) > 0:
         nation = "中国"
         return (nation, province, city)
     else:
         return 0
def html_to_txt(title):
    """Convert a downloaded HTML chapter into a Simplified-Chinese txt file."""
    html_file = os.path.join(PATH, "{}.html".format(title))
    save_file = os.path.join(PATH, "txt/{}.txt".format(title))
    with open(html_file) as f:
        soup = BeautifulSoup(f.read(), "lxml")
    with open(save_file, 'w', encoding='utf-8') as f:
        for block in soup.find_all('div', id='content'):
            for line in block.strings:
                # Traditional -> Simplified, one text node per line.
                print(convert(line.strip(), 'zh-cn'), file=f)
Ejemplo n.º 33
0
def convertfunc(s, locale, locale_only):
    """Pick a conversion callable for string *s* given a target *locale*.

    Returns `identity` when no conversion is needed (no locale given, or
    *s* already matches the locale's script), `empty` when locale_only is
    set and the script mismatches, otherwise a converter to *locale*.
    """
    if locale:
        # issimp(s, True): True if fully Simplified, False if fully
        # Traditional, None when the string is script-ambiguous.
        simp = zhconv.issimp(s, True)
        if (simp is None
            or simp and locale in Locales['zh-hans']
            or not simp and locale in Locales['zh-hant']):
            return identity
        elif locale_only:
            return empty
        else:
            # NOTE(review): the lambda ignores its argument and converts
            # the captured `s` — presumably callers only ever apply it to
            # `s` itself; confirm, otherwise this should convert `x`.
            return lambda x: zhconv.convert(s, locale)
    else:
        return identity
Ejemplo n.º 34
0
def fetch_post(url, output=os.path.join(OUTPUT, 'posts')):
    """Download one blog post, strip links/markup, save as zh-cn text.

    Posts shorter than ~500 characters are discarded; already-saved
    files are skipped so the crawl is resumable.
    """
    if not os.path.exists(output):
        os.makedirs(output)
    post_name = zhconv.convert(url_filename(url), 'zh-cn')
    post_file = os.path.join(output, '%s.txt' % post_name)
    if os.path.exists(post_file):
        print('Skip %s' % url)
        return
    print('Fetch %s' % url)
    soup = commons.soup(url)
    # Drop every anchor so navigation noise does not pollute the text.
    for anchor in soup.find_all('a'):
        anchor.decompose()
    content = soup.find(filter_post_content).get_text()
    if content and len(content) > 500:
        content = re.sub(r'[><&%]', '', content)
        content = zhconv.convert(content, 'zh-cn')
        with codecs.open(post_file, 'w', 'utf-8') as f:
            f.write(post_name)
            f.write('\n\n')
            f.write(content)