Example #1
0
    def __to_bibDict(self, P=1):
        """
        Private method: parse the text of one bib entry into the bib dictionary.

        The entry text is expected to be one item of a bib file that has
        already been split on "@". Every key currently in the bib dictionary
        is looked up as a ``key = {value}`` field (case-insensitive); a field
        that is not found is stored as "". The entry type, the cite label and
        a "lang" marker are then filled in.

        Args:
            P: 1 passes the entry text through ``cv.s2t`` (simplified to
               traditional Chinese) before parsing; any other value parses
               the text unchanged.

        Returns:
            None. The parsed values are written into the bib dictionary.
        """
        bibitemStr = self.bibitemStr
        if P == 1:
            bibitemStr = cv.s2t(bibitemStr)

        for key in self.__bibDict:
            # Anchor the field name on a word boundary and on the "=" sign so
            # that "title" does not pick up the value of "booktitle", and a
            # field name occurring inside another field's text is not taken
            # for a field of its own.
            found = re.search(
                r"\b" + re.escape(key) + r"\s*=\s*{(.*?)}", bibitemStr, re.I)
            self.__bibDict[key] = found.group(1) if found else ""

        # The text before the first comma has the form "<type>{<citelabel>".
        head = bibitemStr.split(",")[0].split("{")
        self.__bibDict["bibType"] = head[0].strip().lower()
        # An entry head without "{" has no label; store "" instead of
        # raising IndexError.
        self.__bibDict["citelabel"] = head[1].strip() if len(head) > 1 else ""

        # Mark the entry as Chinese when the author -- or, when there is no
        # author, the title -- contains a character in U+4E00..U+9FA5.
        zhmodel = re.compile(u'[\u4e00-\u9fa5]')
        probe = self.__bibDict["author"] or self.__bibDict["title"]
        self.__bibDict["lang"] = 'chinese' if zhmodel.search(probe) else ""
Example #2
0
def core_lans(str, lans):
    """Convert text between Simplified and Traditional Chinese on request.

    Args:
        str: Text to convert. The name shadows the builtin but is kept so
            that existing keyword callers keep working.
        lans: 'cn' converts with ``t2s``, 'tc' converts with ``s2t``,
            'auto' leaves the text unchanged.

    Returns:
        The converted text, or the input unchanged when ``lans`` is 'auto',
        when ``lans`` is not a supported value, or when the conversion fails
        (best effort: the failure is logged as a warning, never raised).
    """
    if lans == 'auto':
        return str
    # Explicit validation instead of ``assert``: asserts are stripped under
    # ``python -O`` and an AssertionError without a message logs nothing useful.
    if lans not in ('cn', 'tc'):
        logger.warning(
            "unsupported lans %r (expected 'auto', 'cn' or 'tc'); "
            "text left unchanged", lans)
        return str
    try:
        # Imported lazily so the 'auto' path does not require the package.
        import inlp.convert.chinese as cv
        return cv.t2s(str) if lans == 'cn' else cv.s2t(str)
    except Exception as e:
        logger.warning(e)
        return str
Example #3
0
def get_novel_name():
    """Fetch the page at ``req_url`` and return the novel's name.

    The name is the first element matching ``#info h1``, with the
    ``<h1>`` / ``</h1>`` tags removed and the text passed through ``cv.s2t``
    (simplified to traditional Chinese).

    Returns:
        The converted novel name.
    """
    # req_header is sent as the request's query parameters (params=).
    response = requests.get(req_url, params=req_header)
    soup = BeautifulSoup(response.text, "html.parser")

    # First <h1> under #info, serialised back to markup.
    heading_markup = str(soup.select('#info h1')[0])

    # Drop the bare opening and closing h1 tags, keeping what is between them.
    bare_name = re.sub(r'<(/*)h1>', "", heading_markup)

    return cv.s2t(bare_name)
Example #4
0
    def __init__(self, bibItemStr, style="mustAPA"):
        """
        Initialise a bib item from the text of one bib entry.

        The text is passed through ``cv.s2t`` and stored, the field
        dictionary is created with every known field set to "", and the
        entry is then parsed, indexed and -- for the "mustAPA" style --
        formatted.

        Args:
            bibItemStr: text of one bib entry.
            style: reference style. Only "mustAPA" triggers formatting;
                any other value currently does nothing further.
        """
        self.gDic = ""
        # Entry text, converted to traditional Chinese on construction.
        self.bibitemStr = cv.s2t(bibItemStr)
        # Formatted text of this bib item.
        self.__bibTex = ""
        # Sequence number of the reference.
        self.__bibIndex = -1

        # Known bib fields, all starting out empty. The order is preserved
        # because the dictionary is iterated when the entry is parsed.
        field_names = (
            "bibType",
            "citelabel",
            "author",
            "editor",
            "title",
            "journal",
            "volume",
            "number",
            "pages",
            "chapter",
            "institution",
            "year",
            "school",
            "university",
            "location",
            "publisher",
            "booktitle",
            "issn",
            "doi",
            "type",
            "organization",
            "url",
            "note",
            "description",
            "date",
        )
        self.__bibDict = dict.fromkeys(field_names, "")

        # Parse the entry text into the field dictionary.
        self.__to_bibDict()
        # Compute the index of this reference.
        self.__getbibIndex()
        # MUST APA is the default (and so far the only implemented) style.
        if style == "mustAPA":
            self.__to_bibStyle_mustAPA()
Example #5
0
def get_chapter_content(sub_chapter):
    """Fetch one chapter page and return its title and text.

    Args:
        sub_chapter: key into ``chapter_url``; the value found there is
            appended to ``req_url`` to form the page address.

    Returns:
        A ``(chapter_name, chapter_text)`` tuple, both passed through
        ``cv.s2t`` (simplified to traditional Chinese).
    """
    page_url = req_url + chapter_url[sub_chapter]
    # req_header is sent as the request's query parameters (params=).
    response = requests.get(page_url, params=req_header)
    soup = BeautifulSoup(response.text, "html.parser")

    # Chapter title: first matching <h1>, with the bare h1 tags removed.
    heading = soup.select('#wrapper .content_read .box_con .bookname h1')[0]
    chapter_name = re.sub(r'<(/*)h1>', "", str(heading))

    # Chapter text: serialise the content element, then clean it up in order.
    chapter_text = str(
        soup.select('#wrapper .content_read .box_con #content')[0])
    cleanup_steps = (
        (r'<br/>', '\n\n'),           # HTML line breaks become blank lines
        (r'<div id="content">', ''),  # opening tag of the content element
        (r'<(/*)div>', ''),           # any remaining bare <div> / </div>
    )
    for pattern, replacement in cleanup_steps:
        chapter_text = re.sub(pattern, replacement, chapter_text)

    return cv.s2t(chapter_name), cv.s2t(chapter_text)
Example #6
0
# -*- coding: utf-8 -*-
"""
__title__ = 'example'
__author__ = 'JieYuan'
__mtime__ = '2018/6/5'
"""
from inlp.convert import char, chinese
from inlp.explode import Strokes, Chars
from inlp.similarity import simhash, thesaurus

def _print_section(title):
    """Print a section title with a blank line before and after it."""
    print('\n%s\n' % title)


# Character-width conversion and simplified/traditional Chinese conversion.
_print_section('inlp.convert')
print(char.half2full("0123456789"))
print(char.full2half("0123456789"))
# Per the original notes, the long-form spellings of these two calls are
# chinese.simple2tradition(...) and chinese.tradition2simple(...).
print(chinese.s2t('忧郁的台湾乌龟'))
print(chinese.t2s('憂郁的臺灣烏龜'))

# Look-ups for a single character.
_print_section('inlp.explode')
print(Chars().get_chars('袁'))
print(Strokes().get_strokes('袁'))

# Similarity of two pre-tokenised sentences, using three different measures.
_print_section('inlp.similarity')
s1 = ['周杰伦', '是', '一个', '歌手']
s2 = ['刘若英', '是', '个', '演员']
print(simhash(s1, s2))
print(thesaurus.cilin(s1, s2))
print(thesaurus.hownet(s1, s2))
Example #7
0
 def post_proc(_str):
     """Beautify the text, then convert it according to ``lans``.

     ``lans`` is read from the surrounding scope: 'cn' converts with
     ``cv.t2s``; every other value converts with ``cv.s2t``.
     """
     cleaned = beautify_str(_str)
     if lans == 'cn':
         return cv.t2s(cleaned)
     return cv.s2t(cleaned)
Example #8
0
    # Path of the .tex file currently being rewritten, the collected .bib
    # paths, and the combined bib text built from them.
    filepath = ""
    bibFilePath = []
    bibFileStr = ""

    # Walk the current directory tree and convert the content of every tex
    # file to traditional Chinese, in place.
    for i, j, k in os.walk("."):
        for ii in k:
            temPath = os.path.join(i, ii)
            # A tex file was found.
            # NOTE(review): this is a substring test on the whole path, not a
            # suffix test, so it also matches names such as "a.tex.bak" or a
            # directory containing ".tex" -- confirm that this is intended.
            if ".tex" in temPath:
                filepath = temPath
                with open(filepath, "r+", encoding="utf-8") as f:
                    # Read the whole file.
                    temp = f.read()
                    # Convert the whole text to traditional Chinese.
                    temp = cv.s2t(temp)
                    # Rewind, clear the file and write the converted text back.
                    f.seek(0)
                    f.truncate()
                    f.write(temp)
            # A bib file was found (same substring test as above); remember
            # its path for later.
            elif ".bib" in temPath:
                bibFilePath.append(temPath)

    # Build the formatted bib text from all collected bib files.
    bibFileStr = iitool.BIBPYITEM.allbibFileToStr(bibFilePath)

    # Convert the formatted bib text to traditional Chinese.
    bibFileStr = cv.s2t(bibFileStr)
    # Pre-process the text: escape "&" as "\&".
    bibFileStr = bibFileStr.replace(r"&", r"\&")