def txt2htmlGF(path=None,
               m1=re.compile(r'^第\w{1,3}[编|篇]'),
               m2=re.compile(r'^第\w{1,3}章'),
               m3=re.compile(r'^第\w{1,3}节'),
               ind=True, regrex1=None, search=None, Startw=None):
    """Collect text files and hand them to ``txt2htmlv1`` for HTML rendering.

    Args:
        path: A list of file paths (used as given), a single file path, a
            directory path, or ``None`` for the current working directory.
        m1, m2, m3: Compiled patterns forwarded unchanged to ``txt2htmlv1``.
            The defaults match line prefixes of the form 第…编/篇, 第…章 and
            第…节.
        ind: Forwarded to ``txt2htmlv1`` as ``index``.
        regrex1, search, Startw: Forwarded to ``GFlist`` as ``regrex1``,
            ``research`` and ``startw`` when a directory is scanned.

    Returns:
        None.
    """
    if path is None:
        # Must be resolved before any os.path call: os.path.isfile(None)
        # raises TypeError, so the default argument used to crash here.
        path = os.getcwd()

    files = []
    if isinstance(path, list):
        files.extend(path)
    elif os.path.isfile(path):
        files.append(path)
    elif os.path.isdir(path):
        listing = GFlist(path, regrex1=regrex1, research=search,
                         startw=Startw)
        # Element [1] of each GFlist entry is used as the file path.
        files = [entry[1] for entry in listing]

    txt2htmlv1(files, m1=m1, m2=m2, m3=m3, index=ind)
    return
def absfile(path, func=abssplit, regrex1=None,
            Research=None, Startw=None, p1=re.compile('裁判要点'),
            p2=re.compile('相关法条'),
            rc=re.compile(r'裁判要点\W*(.*?\s*.*?)\W*相关法条')):
    """Run an extractor over a set of files and save each result in ``temp_dir``.

    Args:
        path: ``None`` (scan the current working directory), a file path, a
            directory path, or a list mixing files and directories.
        func: Extractor, dispatched on ``func.__name__``: ``'abstract'`` is
            called as ``func(file, rc=rc)``, ``'abssplit'`` as
            ``func(file, p1=p1, p2=p2)``. Any other name yields no output.
        regrex1, Research, Startw: Forwarded to ``GFlist`` as ``regrex1``,
            ``research`` and ``startw`` when a directory is scanned.
        p1, p2, rc: Compiled patterns forwarded to ``func`` as shown above.

    Returns:
        list: Paths of the files successfully written under ``temp_dir``
        (each keeps the base name of its source file).
    """
    def _scan(directory):
        # Element [1] of each GFlist entry is used as the file path.
        listing = GFlist(directory, regrex1=regrex1, research=Research,
                         startw=Startw)
        return [entry[1] for entry in listing]

    files = []
    if path is None:
        # Previously unreachable (it sat behind an isinstance(path, str) test).
        files.extend(_scan(os.getcwd()))
    elif isinstance(path, list):
        for item in path:
            if os.path.isfile(item):
                files.append(item)
            elif os.path.isdir(item):
                # Scan the individual directory, not the whole input list.
                files.extend(_scan(item))
    elif isinstance(path, str):
        if os.path.isfile(path):
            files.append(path)
        elif os.path.isdir(path):
            files.extend(_scan(path))

    tdir = 'temp_dir'
    os.makedirs(tdir, exist_ok=True)

    if func.__name__ == 'abstract':
        extract_kwargs = {'rc': rc}
    elif func.__name__ == 'abssplit':
        extract_kwargs = {'p1': p1, 'p2': p2}
    else:
        return []

    Tfile = []
    for f in files:
        nf = os.path.join(tdir, os.path.basename(f))
        text = func(f, **extract_kwargs)
        try:
            with open(nf, 'w', encoding='utf8') as gf:
                gf.write(text)
        except (OSError, TypeError, ValueError) as err:
            # Best effort, as before: a file that cannot be written (or an
            # extractor result that is not text) is skipped, but now reported.
            print('absfile: skipped %s (%s)' % (f, err))
        else:
            Tfile.append(nf)
    return Tfile
def absAPPhtml(path, outdir='', regrex1=re.compile(r'检例第(\d*)号'),
               rc=re.compile(r'(.*?案\s*(检例第\d*号))'),
               p1=re.compile('【要旨】'), p2=re.compile(r'\【\w*】'),
               yz=True):
    """Run ``absSPP`` on ``path`` and write the result to ``outputabsSPP.html``.

    ``absSPP`` is called with ``outdir`` as its ``tdir``. The files that
    ``GFlist`` then lists in ``outdir`` are passed to ``make_Mulu_content``,
    whose two return values are written after the header returned by
    ``_hh(outdir)``. ``outdir`` is deleted at the end.

    NOTE(review): this file defines ``absAPPhtml`` a second time further
    down; the later definition is the one that stays bound after import.

    Args:
        path: Forwarded to ``absSPP``.
        outdir: Working directory name; ``''`` means ``'itempdit'``.
        regrex1: Forwarded to ``GFlist``.
        rc, p1, p2, yz: Forwarded to ``absSPP``.

    Returns:
        None.
    """
    if outdir == '':
        outdir = 'itempdit'
    absSPP(path=path, tdir=outdir, rc=rc, p1=p1, p2=p2, yz=yz)

    listing = GFlist(outdir, regrex1=regrex1)
    Tfile = [entry[1] for entry in listing]
    htmlcode = _hh(outdir)
    tb, ctt = make_Mulu_content(Tfile)

    htmlName = 'outputabsSPP.html'
    parts = (htmlcode, tb, ctt, '</body></html>')
    try:
        with open(htmlName, 'w', encoding='utf8') as html:
            for part in parts:
                html.write(part)
    except UnicodeEncodeError:
        # Same fallback as before (rewrite the whole file as gbk), but the
        # utf8 handle is now closed instead of being leaked.
        with open(htmlName, 'w', encoding='gbk') as html:
            for part in parts:
                html.write(part)

    shutil.rmtree(outdir)
    return
def TopyhtmlGF(pf, regrex1=None, search=None, index=True, Startw=None):
    """Write ``<name>_content.html``, a checklist of links to the given files.

    NOTE(review): this file defines ``TopyhtmlGF`` a second time further
    down; the later definition is the one that stays bound after import.

    Args:
        pf: A directory path (scanned with ``GFlist``), a list of file paths
            (used as given), or ``None`` for the current working directory.
        regrex1, search, Startw: Forwarded to ``GFlist`` as ``regrex1``,
            ``research`` and ``startw``.
        index: Not used by this function.

    Returns:
        None.

    Raises:
        TypeError: If ``pf`` is not a string, a list or ``None``.
    """
    # The output name used to be derived with pf.replace() unconditionally,
    # which crashed for the list/None inputs handled below.
    if isinstance(pf, str):
        pfname = pf.replace('/', '')
    elif pf is None or isinstance(pf, list):
        pfname = 'selectdirs'
    else:
        raise TypeError('pf must be a directory path, a list of files or None')

    htmlf = pfname + '_content.html'
    p = getcsspath()
    ll = title + '\n' + title1 + ft % p + title2 + '\n'
    if os.path.exists(htmlf):
        os.remove(htmlf)

    files = []
    if isinstance(pf, list):
        files.extend(pf)
    else:
        # None means "current directory"; pass the real path to GFlist.
        directory = os.getcwd() if pf is None else pf
        if os.path.isdir(directory):
            listing = GFlist(directory, regrex1=regrex1, research=search,
                             startw=Startw)
            files = [entry[1] for entry in listing]

    # Header is written once; the handle is closed before write_file() adds
    # the entries through the file name.
    with open(htmlf, 'w', encoding='utf8') as f:
        f.write(ll)
        f.write('<div id="content"> \n')
        f.write('<h1 class="title">%s</h1>\n<ul class="org-ul">\n' % pfname)

    for ff in files:
        name = os.path.splitext(os.path.basename(ff))[0]
        fpath = pathname2url(ff)
        line = '<li><code>[ ]</code> <a href=%s>%s</a>\n</li>' % (fpath, name)
        try:
            write_file(htmlf, line)
        except Exception as e:
            # One failing entry must not abort the whole index.
            print(e)

    write_file(htmlf, r"</ul>" + '\n')
    write_file(htmlf, '</div>\n</body>\n</html>')
    return
def C2html_AllinOneGF(txtpath=None, regrex1=None, Research=None, index=True,
                      Startw=None, py=False):
    """Gather files and pass them to ``C2html`` in a single call.

    Args:
        txtpath: A list of file paths (used as given), a directory path, or
            ``None`` for the current working directory. Any other value
            results in an empty file list.
        regrex1, Research, Startw: Forwarded to ``GFlist`` as ``regrex1``,
            ``research`` and ``startw`` when a directory is scanned.
        index, py: Forwarded to ``C2html``.

    Returns:
        None.
    """
    if isinstance(txtpath, list):
        sources = list(txtpath)
    else:
        sources = []
        use_cwd = txtpath is None
        if use_cwd:
            txtpath = os.getcwd()
        # The working directory is scanned without an isdir() check.
        if use_cwd or os.path.isdir(txtpath):
            listing = GFlist(txtpath, regrex1=regrex1, research=Research,
                             startw=Startw)
            sources = [entry[1] for entry in listing]

    C2html(sources, index=index, py=py)
    return
def absAPPhtml(path, outdir='', regrex1=re.compile(r'检例第(\d*)号'),
               rc=re.compile(r'(.*?案\s*(检例第\d*号))'),
               p1=re.compile(r'【要\s*旨】'),
               p2=re.compile(r'【\w*】'),
               yz=True, func=C2html):
    """Run ``absSPP`` on ``path`` and render the resulting files with ``func``.

    Per the original note, this targets the guiding cases of the Supreme
    People's Procuratorate: each case is classified or its gist extracted.

    Args:
        path: Forwarded to ``absSPP``.
        outdir: Working directory passed to ``absSPP`` as ``tdir``; ``''``
            means ``'itempdit'``. It is deleted before returning.
        regrex1: Forwarded to ``GFlist`` when listing ``outdir``.
        rc, p1, p2, yz: Forwarded to ``absSPP``.
        func: Renderer, dispatched on ``func.__name__``: ``'C2html'`` is
            called as ``func(files)``, ``'txt2htmlall'`` as
            ``func(files, mformat='AIO')``. Other names render nothing.

    Returns:
        None.
    """
    if outdir == '':
        outdir = 'itempdit'
    absSPP(path=path, tdir=outdir, rc=rc, p1=p1, p2=p2, yz=yz)

    try:
        listing = GFlist(outdir, regrex1=regrex1)
        Tfile = [entry[1] for entry in listing]
        renderer = func.__name__
        if renderer == 'C2html':
            func(Tfile)
        elif renderer == "txt2htmlall":
            func(Tfile, mformat='AIO')
    finally:
        # Remove the working directory even when rendering fails, so a failed
        # run does not leave stale files for the next one to pick up.
        shutil.rmtree(outdir)
    return
def MainSpp(path, outdir='itempdit', regrex1=re.compile(r'检例第(\d*)号'),
            rc=re.compile(r'(.*?案\s*(检例第\d*号))'),
            p1=re.compile('【要旨】'), p2=re.compile(r'\【\w*】'),
            yz=True, OutFile='MainSpp', mtype='pad',
            pyin=False, Total='max', item1_bool=False, item2_bool=False,
            item0_bool=False):
    """Run ``absSPP`` on ``path`` and typeset the results with xelatex.

    Each file that ``GFlist`` lists in ``outdir`` is passed through
    ``Singal_input``; the returned values are ``\\input`` into one or more
    driver ``.tex`` files that start with ``latexs[mtype]`` and finish with
    the module-level ``end`` text. Every driver file is compiled with two
    xelatex passes and then handed to ``_removef``. ``outdir`` is deleted at
    the end.

    Args:
        path: Forwarded to ``absSPP``.
        outdir: Working directory passed to ``absSPP`` as ``tdir``; ``''``
            means ``'itempdit'``.
        regrex1: Forwarded to ``GFlist``.
        rc, p1, p2, yz: Forwarded to ``absSPP``.
        OutFile: Base name of the driver file(s).
        mtype: Key into ``latexs`` selecting the preamble.
        pyin, item0_bool, item1_bool, item2_bool: Forwarded to
            ``Singal_input``.
        Total: ``'max'`` for one document, or a positive int for the number
            of inputs per document (files are then named
            ``<OutFile>_NN.tex``).

    Returns:
        None. Calls ``sys.exit()`` when no input file is found.
    """
    import subprocess

    if outdir == '':
        outdir = 'itempdit'
    absSPP(path=path, tdir=outdir, rc=rc, p1=p1, p2=p2, yz=yz)

    txt_files = {}
    for entry in GFlist(outdir, regrex1=regrex1):
        txt_files[entry[0]] = Singal_input(entry[1], pyin,
                                           item1_bool=item1_bool,
                                           item2_bool=item2_bool,
                                           item0_bool=item0_bool)
    if not txt_files:
        print('No files 适合条件')
        # Do not leave the working directory behind on the early exit.
        shutil.rmtree(outdir, ignore_errors=True)
        sys.exit()
    ordered = sorted(txt_files.items(), key=lambda pair: pair[0])

    if Total == 'max':
        batches = [(OutFile + '.tex', ordered)]
    elif isinstance(Total, int) and Total > 0:
        # Built once; a non-positive size used to crash range() with step 0.
        batches = [
            (OutFile + '_%s.tex' % str(number).zfill(2),
             ordered[start:start + Total])
            for number, start in enumerate(range(0, len(ordered), Total), 1)
        ]
    else:
        print('Total is max out int, please input the right parameter.')
        batches = []

    for tex_name, batch in batches:
        with open(tex_name, 'w', encoding='utf8') as fl:
            fl.write(latexs[mtype] + '\n\n')
            for _, tex_input in batch:
                fl.write(r'\input{%s}' % tex_input)
                fl.write(r'\newpage')
            fl.write(end)
        # Two passes, as before: the first without PDF output, the second
        # producing the PDF. An argument list keeps names with spaces intact.
        for extra in (['-no-pdf'], []):
            command = ['xelatex'] + extra + ['-interaction=nonstopmode',
                                             tex_name]
            try:
                subprocess.run(command)
            except OSError as err:
                # e.g. xelatex not installed; keep going like os.system did.
                print(err)
        _removef(tex_name)

    shutil.rmtree(outdir)
    return
def MainsAbs(txtpath, func=abssplit, OutFile='Mainabs', mtype='pad',
             pyin=False, Total='max', regrex1=None, Research=None,
             Startw=None, rc=re.compile(r'\裁判要点\W*(.*?)\W*相关法条'),
             p1=re.compile('裁判要点'), p2=re.compile('相关法条'),
             item1_bool=False, item2_bool=False,
             item0_bool=False):
    """Extract text from each input file and typeset the results with xelatex.

    Every input file is run through ``func`` and the result is stored under
    ``temp_dir``. The files ``GFlist`` then lists there are passed through
    ``Singal_input`` and ``\\input`` into one or more driver ``.tex`` files
    (preamble ``latexs[mtype]``, closing text ``end``), each compiled with
    two xelatex passes and handed to ``_removef``. ``temp_dir`` is deleted
    at the end.

    Args:
        txtpath: A list of file paths (used as given), a directory path, or
            ``None`` for the current working directory.
        func: Extractor, dispatched on ``func.__name__``: ``'abstract'`` is
            called as ``func(file, rc=rc)``, ``'abssplit'`` as
            ``func(file, p1=p1, p2=p2)``. Other names extract nothing.
        OutFile: Base name of the driver file(s).
        mtype: Key into ``latexs`` selecting the preamble.
        pyin, item0_bool, item1_bool, item2_bool: Forwarded to
            ``Singal_input``.
        Total: ``'max'`` for one document, or a positive int for the number
            of inputs per document (files are then named
            ``<OutFile>_NN.tex``).
        regrex1, Research, Startw: Forwarded to ``GFlist`` as ``regrex1``,
            ``research`` and ``startw``.
        rc, p1, p2: Compiled patterns forwarded to ``func`` as shown above.

    Returns:
        None. Calls ``sys.exit()`` when no extracted file is found.
    """
    import subprocess

    files = []
    if isinstance(txtpath, list):
        files.extend(txtpath)
    else:
        use_cwd = txtpath is None
        if use_cwd:
            txtpath = os.getcwd()
        if use_cwd or os.path.isdir(txtpath):
            listing = GFlist(txtpath, regrex1=regrex1, research=Research,
                             startw=Startw)
            files = [entry[1] for entry in listing]

    tdir = 'temp_dir'
    os.makedirs(tdir, exist_ok=True)

    extractor = func.__name__
    if extractor == 'abstract':
        extract_kwargs = {'rc': rc}
    elif extractor == 'abssplit':
        extract_kwargs = {'p1': p1, 'p2': p2}
    else:
        extract_kwargs = None

    for f in files:
        print(f)
        if extract_kwargs is None:
            continue
        nf = os.path.join(tdir, os.path.basename(f))
        try:
            text = func(f, **extract_kwargs)
            with open(nf, 'w', encoding='utf8') as gf:
                gf.write(text)
        except Exception:
            # Best effort, as before: a file without the expected content is
            # reported and skipped.
            print('没有相应的内容for %s' % extractor)

    txt_files = {}
    for entry in GFlist(tdir, regrex1=regrex1):
        txt_files[entry[0]] = Singal_input(entry[1], pyin,
                                           item1_bool=item1_bool,
                                           item2_bool=item2_bool,
                                           item0_bool=item0_bool)
    if not txt_files:
        print('No files 适合条件')
        # Do not leave the working directory behind on the early exit.
        shutil.rmtree(tdir, ignore_errors=True)
        sys.exit()
    ordered = sorted(txt_files.items(), key=lambda pair: pair[0])

    if Total == 'max':
        batches = [(OutFile + '.tex', ordered)]
    elif isinstance(Total, int) and Total > 0:
        # Built once; a non-positive size used to crash range() with step 0.
        batches = [
            (OutFile + '_%s.tex' % str(number).zfill(2),
             ordered[start:start + Total])
            for number, start in enumerate(range(0, len(ordered), Total), 1)
        ]
    else:
        print('Total is max out int, please input the right parameter.')
        batches = []

    for tex_name, batch in batches:
        with open(tex_name, 'w', encoding='utf8') as fl:
            fl.write(latexs[mtype] + '\n\n')
            for _, tex_input in batch:
                fl.write(r'\input{%s}' % tex_input)
                fl.write(r'\newpage')
            fl.write(end)
        # Two passes, as before: the first without PDF output, the second
        # producing the PDF. An argument list keeps names with spaces intact.
        for extra in (['-no-pdf'], []):
            command = ['xelatex'] + extra + ['-interaction=nonstopmode',
                                             tex_name]
            try:
                subprocess.run(command)
            except OSError as err:
                # e.g. xelatex not installed; keep going like os.system did.
                print(err)
        _removef(tex_name)

    shutil.rmtree(tdir)
    return
def GenerateBookGF(path, regrex1=None,
                   search=None, startw=None,
                   exclude=None,
                   func=C2html,
                   item1_bool=False,
                   item2_bool=False,
                   item0_bool=False,
                   htmlfile='htmlfile/htmlbook_output',
                   pdffile='htmlbook_Main', mtype='article',
                   num=None, pyin=False, File_num='max',
                   m1=re.compile(r'^第\w{1,3}[编|篇]'),
                   m2=re.compile(r'^第\w{1,3}章'),
                   m3=re.compile(r'^第\w{1,3}节'),
                   m4=re.compile(r'^\w{1,3}、'),
                   index=True, res=True,
                   Spp=False,
                   Spplit=False,
                   rc=re.compile(r'(.*?案\s*(检例第\d*号))'),
                   p1=re.compile(r'【要\s*旨】'), p2=re.compile(r'【\w*】'),
                   yz=True):
    """Select, order and render a set of files as one HTML or PDF "book".

    Args:
        path: A file, a directory, a list of files/directories, or ``None``
            for the current working directory.
        regrex1: Compiled pattern applied to each file name; the first
            non-empty match is converted with ``int`` and used as the sort
            key. When ``None``, a cleaned-up file name is the key instead.
        search: str or list of str; keep only files whose name contains one
            of them.
        startw: Compiled pattern; keep only files whose name it matches from
            the start.
        exclude: str or list of str; drop files whose name contains any.
        func: Renderer, dispatched on ``func.__name__`` (``'MainSpp'``,
            ``'MainsAbs'``, ``'C2html'``, ``'txt2htmlv1'``, ``'PdfFile'``).
        htmlfile: Passed as ``output`` to the HTML renderers.
        pdffile, mtype, num, pyin, File_num, item0_bool, item1_bool,
            item2_bool: Passed to ``PdfFile`` (``File_num`` as ``Total``);
            ``mtype``/``pyin`` are also passed to ``MainSpp``/``MainsAbs``.
        m1, m2, m3: Heading patterns passed to the HTML renderers (described
            in the original notes as 1st/2nd/3rd level table-of-contents
            entries).
        index: Passed to the HTML renderers.
        res: Passed to ``sorted`` as ``reverse``.
        Spplit: When true and ``path`` is a directory, delete it afterwards.
        yz: Passed to ``MainSpp``.
        m4, Spp, rc, p1, p2: Accepted but not used by this function.

    Returns:
        None. Calls ``sys.exit()`` when ``path`` is unusable or when the
        ``search``/``startw`` filters leave nothing.
    """
    if func.__name__ in ['MainSpp']:
        func(path, yz=yz, mtype=mtype)
        return
    if func.__name__ in ['MainsAbs']:
        func(path, pyin=pyin, Startw=startw, mtype=mtype, regrex1=regrex1)
        return

    # NOTE(review): inside the character class ':-》' is a range that spans
    # most of the ASCII letters — probably unintended. The pattern is kept
    # unchanged because existing sort keys depend on it.
    cc = re.compile(r'([,、:-》.《—_;;〈〉<>【】()()])*\s*-')

    rs = []
    if isinstance(search, list):
        rs.extend(search)
    elif isinstance(search, str):
        rs.append(search)
    excl = []
    if isinstance(exclude, list):
        excl.extend(exclude)
    elif isinstance(exclude, str):
        excl.append(exclude)

    file_list = []
    path_list = []
    if path is None:
        # Checked first (isfile(None) raises TypeError) and actually used:
        # the working directory was computed before but never scanned.
        path_list.append(os.getcwd())
    elif isinstance(path, list):
        for f in path:
            if isfile(f):
                file_list.append(f)
            elif isdir(f):
                path_list.append(f)
    elif isfile(path):
        file_list.append(path)
    elif isdir(path):
        path_list.append(path)
    else:
        print('Please in list of dir/file,or dir,file')
        sys.exit()

    for entry in GFlist(path_list):
        file_list.append(entry[1])

    # Keep only the first file for each "CJK characters + digits" signature
    # of the base name.
    only_one = set()
    fls = []
    word = re.compile(r'[\u4e00-\u9fa5]+\d*')
    for ff in file_list:
        nwd = ''.join(word.findall(os.path.basename(ff)))
        if nwd not in only_one:
            only_one.add(nwd)
            fls.append(ff)
    if fls:
        file_list = fls

    temff = set()
    if exclude is not None:
        for ff in file_list:
            if any(ex in os.path.basename(ff) for ex in excl):
                temff.add(ff)
    File_tmp = [f for f in file_list if f not in temff]

    # Map sort key -> file. A later file with the same key replaces an
    # earlier one.
    Final_list = {}
    for f in File_tmp:
        ff = basename(f)
        if regrex1 is not None:
            if splitext(ff)[1].lower() in ['.txt', '.doc', '.docx']:
                i1 = [i for i in regrex1.findall(ff) if len(i) > 0]
                # Second attempt on the name after ut.ChNumToArab().
                i2 = [i for i in regrex1.findall(ut.ChNumToArab(ff))
                      if len(i) > 0]
                if i1:
                    Final_list[int(i1[0])] = f
                elif i2:
                    Final_list[int(i2[0])] = f
        else:
            Final_list[cc.sub('', ff).replace(' ', '')] = f

    if search is not None:
        Tem = {}
        for k, v in Final_list.items():
            for rsch in rs:
                if rsch in basename(v):
                    Tem[k] = v
        if Tem:
            Final_list = Tem
        else:
            print('没有关于 "%s" 的文件' % search)
            sys.exit()

    if startw is not None:
        dff = {}
        for k, v in Final_list.items():
            if startw.match(basename(v)) is not None:
                dff[k] = v
        if dff:
            Final_list = dff
        else:
            print('没有符合的文件')
            sys.exit()

    if not Final_list:
        # Previously fell through to an unbound Final_files (NameError).
        # Nothing is rendered and nothing is deleted.
        print('没有符合的文件')
        return

    Final = sorted(Final_list.items(), key=lambda item: item[0], reverse=res)
    Final_files = [item[1] for item in Final]

    if func.__name__ in ['C2html', 'txt2htmlv1']:
        func(Final_files, output=htmlfile, m1=m1, m2=m2, m3=m3, index=index)
    elif func.__name__ in ['PdfFile']:
        func(Final_files, OutFile=pdffile, mtype=mtype,
             num=num, pyin=pyin, Total=File_num,
             item0_bool=item0_bool,
             item1_bool=item1_bool, item2_bool=item2_bool)
    else:
        print('Please input right function:', 'C2html', 'C2htmlBase',
              'txt2htmlv1', 'txt2html_inonefile', 'PdfFile')

    # rmtree needs a single directory; a list or a plain file is left alone.
    if Spplit and isinstance(path, str) and isdir(path):
        shutil.rmtree(path)
    return
def TopyhtmlGF(pf, regrex1=None, search=None, index=True, Startw=None):
    """Write ``<name>_content.html``, a checklist of links grouped by folder.

    Args:
        pf: ``None`` (scan the current working directory), a file or
            directory path, or a list mixing files and directories.
        regrex1, search, Startw: Forwarded to ``GFlist`` as ``regrex1``,
            ``research`` and ``startw`` when a directory is scanned.
        index: Not used by this function.

    Returns:
        None.

    Raises:
        Exception: If ``pf`` has an unsupported type, or is a string naming
            a path that does not exist.
    """
    if isinstance(pf, str):
        pfname = pf.replace('/', '_')
    elif pf is None or isinstance(pf, list):
        pfname = 'selectdirs'
    else:
        raise Exception('请输入文件目录')

    htmlf = pfname + '_content.html'
    css_path = getcsspath()
    header = title + '\n' + title1 + ft % css_path + title2 + '\n'
    if os.path.exists(htmlf):
        os.remove(htmlf)

    def scan(directory):
        # Element [1] of each GFlist entry is used as the file path.
        listing = GFlist(directory, regrex1=regrex1, research=search,
                         startw=Startw)
        return [row[1] for row in listing]

    files = []
    if pf is None:
        pf = os.getcwd()
        files += scan(pf)
    elif isinstance(pf, list):
        for item in pf:
            if os.path.isdir(item):
                # The directory itself is recorded too; the isfile() test in
                # the output loop below skips it.
                files.append(item)
                print(item)
                files += scan(item)
            elif os.path.isfile(item):
                files.append(item)
    else:
        # Only a str can reach this point.
        if not os.path.exists(pf):
            raise Exception('文件不存在,请输入正确的文件或目录')
        if os.path.isdir(pf):
            files += scan(pf)
        elif os.path.isfile(pf):
            files.append(pf)

    with open(htmlf, 'w', encoding='utf8') as out:
        out.write(header)
        out.write('<div id="content"> \n')
        out.write('<h1 class="title">%s</h1>\n<ul class="org-ul">\n' % pfname)
        out.flush()

    seen_dirs = set()
    for candidate in files:
        if not os.path.isfile(candidate):
            continue
        parent = os.path.dirname(candidate)
        if parent not in seen_dirs:
            # First file of a folder: emit the folder heading once.
            seen_dirs.add(parent)
            folder_name = os.path.split(parent)[1]
            folder_url = pathname2url(os.path.abspath(parent))
            write_file(htmlf, '\n<li>- d  <a href=%s>%s</a> \n</li>\n' % (
                folder_url, folder_name))
        stem = os.path.splitext(os.path.basename(candidate))[0]
        file_url = pathname2url(os.path.abspath(candidate))
        entry = '<ul> - <code>[ ]</code> <a href=%s>%s</a>\n</ul>' % (
            file_url, stem)
        try:
            write_file(htmlf, entry)
        except Exception as e:
            print(e)

    write_file(htmlf, r"</ul>" + '\n')
    write_file(htmlf, '</div>\n</body>\n</html>')
    return
def absTFilehtml(txtpath, func=abssplit,
                 rc=re.compile(r'裁判要点\W*(.*?\s*.*?)\W*相关法条'),
                 p1=re.compile('裁判要点'), p2=re.compile('相关法条'),
                 regrex1=None, Research=None, index=True, Startw=None,
                 m1=re.compile(r'^第\w{1,3}[编|篇]'),
                 m2=re.compile(r'^第\w{1,3}章'),
                 m3=re.compile(r'^第\w{1,3}节')):
    """Extract text from each input file and write it to ``outputabs.html``.

    Every input file is run through ``func``; the results are stored under
    ``temp_dir`` and passed to ``make_Mulu_content``, whose two return values
    are written after the header returned by ``_hh(txtpath)``. ``temp_dir``
    is deleted at the end.

    NOTE(review): this file defines ``absTFilehtml`` a second time further
    down; the later definition is the one that stays bound after import.

    Args:
        txtpath: A list of file paths (used as given), a directory path, or
            ``None`` for the current working directory.
        func: Extractor, dispatched on ``func.__name__``: ``'abstract'`` is
            called as ``func(file, rc=rc)``, ``'abssplit'`` as
            ``func(file, p1=p1, p2=p2)``. Other names extract nothing.
        rc, p1, p2: Compiled patterns forwarded to ``func`` as shown above
            (``rc`` is described in the original notes as the main content
            to extract).
        regrex1, Research, Startw: Forwarded to ``GFlist`` as ``regrex1``,
            ``research`` and ``startw``.
        index, m1, m2, m3: Forwarded to ``make_Mulu_content``.

    Returns:
        None.
    """
    files = []
    if isinstance(txtpath, list):
        files.extend(txtpath)
    else:
        use_cwd = txtpath is None
        if use_cwd:
            txtpath = os.getcwd()
        if use_cwd or os.path.isdir(txtpath):
            listing = GFlist(txtpath, regrex1=regrex1, research=Research,
                             startw=Startw)
            files = [entry[1] for entry in listing]

    tdir = 'temp_dir'
    os.makedirs(tdir, exist_ok=True)
    htmlcode = _hh(txtpath)

    if func.__name__ == 'abstract':
        extract_kwargs = {'rc': rc}
    elif func.__name__ == 'abssplit':
        extract_kwargs = {'p1': p1, 'p2': p2}
    else:
        extract_kwargs = None

    Tfile = []
    if extract_kwargs is not None:
        for f in files:
            nf = os.path.join(tdir, os.path.basename(f))
            text = func(f, **extract_kwargs)
            try:
                with open(nf, 'w', encoding='utf8') as gf:
                    gf.write(text)
            except (OSError, TypeError, ValueError) as err:
                # Best effort, as before: skip the file, but say so.
                print('absTFilehtml: skipped %s (%s)' % (f, err))
            else:
                Tfile.append(nf)

    tb, ctt = make_Mulu_content(Tfile, m1=m1, m2=m2, m3=m3, index=index)

    htmlName = 'outputabs.html'
    parts = (htmlcode, tb, ctt, '</body></html>')
    try:
        with open(htmlName, 'w', encoding='utf8') as html:
            for part in parts:
                html.write(part)
    except UnicodeEncodeError:
        # Same fallback as before (rewrite the whole file as gbk), but the
        # utf8 handle is now closed instead of being leaked.
        with open(htmlName, 'w', encoding='gbk') as html:
            for part in parts:
                html.write(part)

    shutil.rmtree(tdir)
    return
def absTFilehtml(txtpath, func=abssplit,
                 rc=re.compile(r'裁判要点\W*(.*?\s*.*?)\W*相关法条'),
                 p1=re.compile('裁判要点'), p2=re.compile('相关法条'),
                 regrex1=None, Research=None, index=True, Startw=None,
                 m1=re.compile(r'^第\w{1,3}[编|篇]'),
                 m2=re.compile(r'^第\w{1,3}章'),
                 m3=re.compile(r'^第\w{1,3}节'),
                 thtmlfunc=C2html):
    """Extract text from each input file and render it with ``thtmlfunc``.

    Per the original note, this targets the Supreme People's Court guiding
    cases and extracts their judgment gist. Every input file is run through
    ``func``; the results are stored under ``temp_dir`` (which is left in
    place) and the list of written files is passed to ``thtmlfunc``.

    Args:
        txtpath: A list of file paths (used as given), a directory path, or
            ``None`` for the current working directory.
        func: Extractor, dispatched on ``func.__name__``: ``'abstract'`` is
            called as ``func(file, rc=rc)``, ``'abssplit'`` as
            ``func(file, p1=p1, p2=p2)``. Other names extract nothing.
        rc, p1, p2: Compiled patterns forwarded to ``func`` as shown above.
        regrex1, Research, Startw: Forwarded to ``GFlist`` as ``regrex1``,
            ``research`` and ``startw``.
        thtmlfunc: Renderer, dispatched on ``thtmlfunc.__name__``:
            ``'C2html'`` is called as ``thtmlfunc(files)``, ``'txt2htmlall'``
            as ``thtmlfunc(files, mformat='AIO')``. Other names render
            nothing.
        index, m1, m2, m3: Not used by this function.

    Returns:
        None.
    """
    files = []
    if isinstance(txtpath, list):
        files.extend(txtpath)
    else:
        use_cwd = txtpath is None
        if use_cwd:
            txtpath = os.getcwd()
        if use_cwd or os.path.isdir(txtpath):
            listing = GFlist(txtpath, regrex1=regrex1, research=Research,
                             startw=Startw)
            files = [entry[1] for entry in listing]

    tdir = 'temp_dir'
    os.makedirs(tdir, exist_ok=True)
    # NOTE(review): the result is not used here; the call is kept in case
    # _hh has side effects — confirm and drop if it does not.
    _hh(txtpath)

    if func.__name__ == 'abstract':
        extract_kwargs = {'rc': rc}
    elif func.__name__ == 'abssplit':
        extract_kwargs = {'p1': p1, 'p2': p2}
    else:
        extract_kwargs = None

    Tfile = []
    if extract_kwargs is not None:
        for f in files:
            nf = os.path.join(tdir, os.path.basename(f))
            text = func(f, **extract_kwargs)
            try:
                with open(nf, 'w', encoding='utf8') as gf:
                    gf.write(text)
            except (OSError, TypeError, ValueError) as err:
                # Best effort, as before: skip the file, but say so.
                print('absTFilehtml: skipped %s (%s)' % (f, err))
            else:
                Tfile.append(nf)

    # NOTE(review): this listing is not used either; kept for the same
    # reason as the _hh call above.
    GFlist(tdir, regrex1=regrex1, research=Research, startw=Startw)

    renderer = thtmlfunc.__name__
    if renderer == 'C2html':
        thtmlfunc(Tfile)
    elif renderer == "txt2htmlall":
        thtmlfunc(Tfile, mformat='AIO')
    return