def __to_bibDict(self, P=1):
    """Parse one bib entry (already split on "@") into the field dictionary.

    Reads ``self.bibitemStr`` and fills ``self.__bibDict`` in place: every
    key already present in the dictionary is looked up as ``key = {value}``
    (case-insensitive), then ``bibType``, ``citelabel`` and ``lang`` are
    derived from the entry head and from the author/title text.

    Args:
        P: ``1`` converts the entry text with ``cv.s2t`` before parsing
            (Simplified -> Traditional Chinese, per the original note);
            any other value parses the text unchanged.

    Returns:
        The populated field dictionary (the same object as
        ``self.__bibDict``).
    """
    text = self.bibitemStr
    # Simplified -> Traditional conversion, only when requested.
    if P == 1:
        text = cv.s2t(text)

    fields = self.__bibDict
    # Iterate over a snapshot of the keys because values are rewritten below.
    for key in list(fields):
        # Anchor the key as a whole word followed by "=" so that "title"
        # cannot capture the value of "booktitle", nor "date" that of
        # "urldate".
        # NOTE(review): the value stops at the first "}", so nested braces
        # are truncated, and "." does not cross line breaks.
        pattern = r"\b" + re.escape(key) + r"\s*=.*?\{(.*?)\}"
        found = re.search(pattern, text, re.I)
        fields[key] = found.group(1) if found else ""

    # The entry head is everything before the first comma: "<type>{<label>".
    head_parts = text.split(",")[0].split("{")
    fields["bibType"] = head_parts[0].strip().lower()
    # A head without "{" gets an empty label instead of raising IndexError.
    fields["citelabel"] = head_parts[1].strip() if len(head_parts) > 1 else ""

    # Tag the entry as Chinese when the author (or, when there is no author,
    # the title) contains a character in the U+4E00..U+9FA5 range.
    zh_chars = re.compile(u'[\u4e00-\u9fa5]')
    probe = fields["author"] if fields["author"] != "" else fields["title"]
    fields["lang"] = 'chinese' if zh_chars.search(probe) else ""
    return fields
def core_lans(str, lans):
    """Convert ``str`` between Chinese scripts according to ``lans``.

    Args:
        str: Text to convert. (The name shadows the builtin; it is kept so
            existing keyword callers continue to work.)
        lans: ``'cn'`` applies ``cv.t2s``, ``'tc'`` applies ``cv.s2t`` and
            ``'auto'`` leaves the text untouched.

    Returns:
        The converted text, or the original text unchanged when ``lans`` is
        ``'auto'``, is not a supported value, or the conversion fails.
    """
    # Explicit validation instead of ``assert``: asserts are stripped under
    # ``python -O`` and an AssertionError carries no message to log.
    if lans not in ('auto', 'cn', 'tc'):
        logger.warning(
            "core_lans: unsupported lans %r; text returned unchanged", lans)
        return str

    if lans == 'auto':
        return str

    try:
        # Imported lazily so the 'auto' path does not need the converter.
        import inlp.convert.chinese as cv
        return cv.t2s(str) if lans == 'cn' else cv.s2t(str)
    except Exception as e:
        # Deliberately best-effort: any import or conversion failure is
        # logged and the input is handed back unchanged.
        logger.warning(e)
        return str
def get_novel_name():
    """Fetch the page at ``req_url`` and return the novel's name.

    The name is taken from the first element matching ``#info h1`` and is
    passed through ``cv.s2t`` (Simplified -> Traditional Chinese, per the
    original note) before being returned.

    Returns:
        The converted name string.

    Raises:
        IndexError: if no element matches ``#info h1``.
    """
    # NOTE(review): ``req_header`` is sent as query parameters (``params=``),
    # not as HTTP headers, and the request has no timeout - confirm both
    # are intended.
    r = requests.get(req_url, params=req_header)
    soup = BeautifulSoup(r.text, "html.parser")

    # First <h1> inside #info, selected with a CSS selector.
    novel_name = soup.select('#info h1')[0]

    # Strip the bare <h1> and </h1> tags from the serialized element.
    pattern = re.compile(r'<(/*)h1>')
    novel_name = re.sub(pattern, "", str(novel_name))

    # Simplified -> Traditional Chinese.
    novel_name = cv.s2t(novel_name)
    return novel_name
def __init__(self, bibItemStr, style="mustAPA"):
    """Create a bib item from raw entry text and format it.

    Args:
        bibItemStr: Raw text of one bib entry; it is stored after being
            passed through ``cv.s2t``.
        style: Reference style to render. Only ``"mustAPA"`` triggers any
            formatting; every other value is currently a no-op.
    """
    self.gDic = ""
    # Entry text, converted with cv.s2t on the way in.
    self.bibitemStr = cv.s2t(bibItemStr)
    # Formatted text of this bib item (per the original note).
    self.__bibTex = ""
    # Sequence number of the reference (per the original note).
    self.__bibIndex = -1
    # Field dictionary: every known key starts out as an empty string.
    self.__bibDict = dict.fromkeys(
        (
            "bibType", "citelabel", "author", "editor", "title",
            "journal", "volume", "number", "pages", "chapter",
            "institution", "year", "school", "university", "location",
            "publisher", "booktitle", "issn", "doi", "type",
            "organization", "url", "note", "description", "date",
        ),
        "",
    )

    # Parse the entry text into the field dictionary.
    self.__to_bibDict()
    # Compute the index used for ordering by author name (per the
    # original note).
    self.__getbibIndex()

    # MUST APA is the default reference style; other styles are still to do.
    if style == "mustAPA":
        self.__to_bibStyle_mustAPA()
def get_chapter_content(sub_chapter):
    """Fetch one chapter page and return its title and body text.

    Args:
        sub_chapter: Index/key into ``chapter_url``; the value found there
            is appended to ``req_url`` to form the page address.

    Returns:
        A ``(chapter_name, chapter_text)`` tuple, both passed through
        ``cv.s2t`` (Simplified -> Traditional Chinese, per the original
        note).

    Raises:
        IndexError: if either CSS selector matches no element.
    """
    # NOTE(review): ``req_header`` is sent as query parameters (``params=``),
    # not as HTTP headers, and the request has no timeout - confirm both
    # are intended.
    r = requests.get(req_url + chapter_url[sub_chapter], params=req_header)
    soup = BeautifulSoup(r.text, "html.parser")

    # Chapter title: first matching <h1>, with its bare <h1>/</h1> tags
    # stripped from the serialized element.
    chapter_name = soup.select(
        '#wrapper .content_read .box_con .bookname h1')[0]
    pattern = re.compile(r'<(/*)h1>')
    chapter_name = re.sub(pattern, "", str(chapter_name))

    # Chapter body: first matching #content element, serialized to HTML.
    chapter_text = str(
        soup.select('#wrapper .content_read .box_con #content')[0])

    # Turn HTML line breaks into blank lines, then drop the wrapping div.
    chapter_text = re.sub(r'<br/>', '\n\n', chapter_text)
    chapter_text = re.sub(r'<div id="content">', '', chapter_text)
    chapter_text = re.sub(r'<(/*)div>', '', chapter_text)

    # Simplified -> Traditional Chinese.
    chapter_name = cv.s2t(chapter_name)
    chapter_text = cv.s2t(chapter_text)
    return chapter_name, chapter_text
# -*- coding: utf-8 -*- """ __title__ = 'example' __author__ = 'JieYuan' __mtime__ = '2018/6/5' """ from inlp.convert import char, chinese from inlp.explode import Strokes, Chars from inlp.similarity import simhash, thesaurus print('\n%s\n' % 'inlp.convert') print(char.half2full("0123456789")) print(char.full2half("0123456789")) print(chinese.s2t('忧郁的台湾乌龟')) # chinese.simple2tradition('忧郁的台湾乌龟') print(chinese.t2s('憂郁的臺灣烏龜')) # chinese.tradition2simple('憂郁的臺灣烏龜') print('\n%s\n' % 'inlp.explode') print(Chars().get_chars('袁')) print(Strokes().get_strokes('袁')) print('\n%s\n' % 'inlp.similarity') s1 = ['周杰伦', '是', '一个', '歌手'] s2 = ['刘若英', '是', '个', '演员'] print(simhash(s1, s2)) print(thesaurus.cilin(s1, s2)) print(thesaurus.hownet(s1, s2))
def post_proc(_str):
    """Beautify ``_str`` and convert its Chinese script.

    The text is first passed through ``beautify_str``. When the enclosing
    ``lans`` value is ``'cn'`` the result goes through ``cv.t2s``; for every
    other value it goes through ``cv.s2t``.
    """
    cleaned = beautify_str(_str)
    if lans == 'cn':
        return cv.t2s(cleaned)
    return cv.s2t(cleaned)
filepath = ""
bibFilePath = []
bibFileStr = ""

# Walk the current directory tree: rewrite every .tex file in place with its
# text converted by cv.s2t (Simplified -> Traditional Chinese, per the
# original note), and collect the paths of all .bib files.
for root, _dirs, files in os.walk("."):
    for name in files:
        temPath = os.path.join(root, name)
        # Match on the file suffix; a substring test would also pick up
        # names such as "a.tex.bak" or anything under a ".texlive" folder.
        if temPath.endswith(".tex"):
            filepath = temPath
            with open(filepath, "r+", encoding="utf-8") as f:
                # Read the whole file and convert it.
                temp = f.read()
                temp = cv.s2t(temp)
                # Rewind before truncating: after read() the position is at
                # end-of-file, so truncate() alone removes nothing and the
                # write would append a second copy of the text.
                f.seek(0)
                f.truncate()
                f.write(temp)
        elif temPath.endswith(".bib"):
            bibFilePath.append(temPath)

# Build the formatted bib text from all collected .bib files.
bibFileStr = iitool.BIBPYITEM.allbibFileToStr(bibFilePath)
# Convert the formatted bib text as well.
bibFileStr = cv.s2t(bibFileStr)
# Pre-process special characters: escape "&" for LaTeX.
# NOTE(review): an ampersand that is already escaped ("\&") gains a second
# backslash here - confirm the input never contains pre-escaped ampersands.
bibFileStr = bibFileStr.replace(r"&", r"\&")