def _lines_for_diff(path):
    """Load *path* as a list of lines suitable for ``difflib``.

    ``.txt`` files are read through the file-level ``readfile`` helper and
    ``.doc``/``.docx`` files through ``msdoc2text``.

    Raises:
        ValueError: if the extension is none of ``.txt``, ``.doc``, ``.docx``.
    """
    ext = os.path.splitext(path)[1].lower()
    if ext == '.txt':
        return readfile(path)
    if ext in ('.doc', '.docx'):
        text = msdoc2text(path)
        # Other callers in this file split msdoc2text()'s result on '\n', i.e.
        # it is one string.  HtmlDiff expects a sequence of lines; a bare
        # string would be diffed character by character.
        return text.splitlines() if isinstance(text, str) else text
    raise ValueError('unsupported file type for diff: %r' % (path,))


def diff_2files_html(textfile1, textfile2):
    """Write an HTML side-by-side diff of two documents to ``diff_file.html``.

    Args:
        textfile1: path of the "from" document (``.txt``, ``.doc``, ``.docx``).
        textfile2: path of the "to" document (same supported extensions).

    Returns:
        None.  When either argument is an empty string a usage message is
        printed and nothing is written.

    Raises:
        ValueError: if either file has an unsupported extension.
    """
    if textfile1 == "" or textfile2 == "":
        print("Usage:test.py filename1 filename2")
        return

    text1_lines = _lines_for_diff(textfile1)
    text2_lines = _lines_for_diff(textfile2)
    doc = difflib.HtmlDiff().make_file(text1_lines, text2_lines)

    # The page declares utf-8, so the file must really be written as utf-8
    # instead of the platform default encoding.
    with open('diff_file.html', 'w', encoding='utf-8') as f:
        f.write(r'<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />')
        f.write(doc)
    return
def File2all(fpath):
    """Read a document and split it into entries at lines matching ``cs``.

    ``.txt`` files are decoded as utf8, falling back to gb18030;
    ``.doc``/``.docx`` files are converted with ``msdoc2text``.  Lines are
    stripped and empty ones discarded.  A line matching the file-level
    pattern ``cs`` starts a new entry; following non-matching lines are
    appended to that entry (joined with ``'\\n'``).  Lines that precede the
    first matching line are dropped when at least one entry exists.

    Args:
        fpath: path of the file to read.

    Returns:
        A list of ``(entry_text, name)`` tuples when at least one line
        matched ``cs``; otherwise a single ``(text, name)`` tuple holding all
        non-empty lines joined with ``'\\n'``.  ``name`` is the file's base
        name without extension.

    Raises:
        ValueError: if the extension is none of ``.txt``, ``.doc``, ``.docx``.
    """
    ext = os.path.splitext(fpath)[1].lower()
    if ext == '.txt':
        try:
            with open(fpath, 'r', encoding='utf8') as fh:
                txt = fh.readlines()
        except UnicodeDecodeError:
            # Not valid utf8: retry with the legacy Chinese encoding.
            with open(fpath, 'r', encoding='gb18030') as fh:
                txt = fh.readlines()
    elif ext in ('.doc', '.docx'):
        txt = msdoc2text(fpath).split('\n')
    else:
        raise ValueError('unsupported file type: %r' % (fpath,))

    txtlist = [row.strip() for row in txt if row.strip()]

    emoi = []
    for row in txtlist:
        if cs.match(row):
            emoi.append(row)
        elif emoi:
            emoi[-1] = emoi[-1] + '\n' + row

    name = os.path.splitext(os.path.basename(fpath))[0]
    if emoi:
        return [(entry, name) for entry in emoi]
    return '\n'.join(txtlist), name
def File2read(fpath):
    """Read a document and return its non-empty lines as one string.

    ``.txt`` files are decoded as utf8, falling back to gb18030;
    ``.doc``/``.docx`` files are converted with ``msdoc2text``.

    Args:
        fpath: path of the file to read.

    Returns:
        ``(text, name)`` where ``text`` is every non-blank line, stripped and
        joined with ``'\\n'``, and ``name`` is the file's base name without
        extension.

    Raises:
        ValueError: if the extension is none of ``.txt``, ``.doc``, ``.docx``.
    """
    ext = os.path.splitext(fpath)[1].lower()
    if ext == '.txt':
        try:
            with open(fpath, 'r', encoding='utf8') as fh:
                txt = fh.readlines()
        except UnicodeDecodeError:
            # Not valid utf8: retry with the legacy Chinese encoding.
            with open(fpath, 'r', encoding='gb18030') as fh:
                txt = fh.readlines()
    elif ext in ('.doc', '.docx'):
        txt = msdoc2text(fpath).split('\n')
    else:
        raise ValueError('unsupported file type: %r' % (fpath,))

    txtlist = [row.strip() for row in txt if row.strip()]
    name = os.path.splitext(os.path.basename(fpath))[0]
    return '\n'.join(txtlist), name
def Singal_input(InFile, pyin=False,
                 item1_bool=False, item2_bool=False,
                 item0_bool=False):
    """Convert one ``.txt``/``.doc``/``.docx`` document into a LaTeX fragment.

    The output ``.tex`` file is written next to the input.  Its file name is
    derived from the input's base name through the file-level ``p`` object
    (``get_pinyin`` for names shorter than 12 characters, ``get_initials``
    otherwise).  If that file already exists a timestamp suffix is added.

    Args:
        InFile: path of the document to convert.
        pyin: wrap the body in a ``pinyinscope`` environment.
        item1_bool: hold a line consisting only of "第…条" and prepend it,
            followed by ``\\hspace{1em}``, to the next ordinary line.
        item2_bool: for a line starting with "第…条" followed by text, insert
            ``\\hspace{1em}`` after the article number and hold the line the
            same way.
        item0_bool: turn lines matching the file-level pattern ``cnum`` into
            ``\\subsubsection`` headings.

    Returns:
        The path of the ``.tex`` file that was written.

    Raises:
        ValueError: if the extension is none of ``.txt``, ``.doc``, ``.docx``.
    """
    path = os.path.abspath(InFile)
    dname = os.path.dirname(path)
    chapter_re = re.compile(r'第\w{1,3}[章编]')
    section_re = re.compile(r'第\w{1,3}[节]')
    junk_re = re.compile(r'[\W_#]')

    allname = os.path.splitext(os.path.basename(path))
    ext = allname[1].lower()
    if ext not in ('.txt', '.doc', '.docx'):
        raise ValueError('unsupported file type: %r' % (InFile,))

    name = junk_re.sub('', allname[0]).strip()
    if len(name) < 12:
        outFile = p.get_pinyin(name) + '.tex'
    else:
        outFile = p.get_initials(name) + '.tex'
    # Drop punctuation and separators from the generated file name.
    outFile = outFile.translate(str.maketrans('', '', '、 :-()《》—'))
    if sys.platform.startswith('win'):
        dname = dname.replace('\\', '/')
    outFile2 = dname + '/' + outFile

    if ext in ('.doc', '.docx'):
        text = msdoc2text(path)
        content = [row.strip() for row in text.split('\n') if row.strip()]
    else:
        try:
            with open(InFile, 'r', encoding='utf8') as f:
                content = f.readlines()
        except UnicodeDecodeError:
            # Not valid utf8: retry with the legacy Chinese encoding.
            with open(InFile, 'r', encoding='gbk') as f:
                content = f.readlines()

    headings = (
        '裁判要点', '基本案情', '裁判结果', '裁判理由', '相关法条',
        '【关键词】', '【诉讼过程】', '【基本案情】', '【抗辩理由】',
        '【案件结果】', '【要旨】', '【指导意义】', '【相关法律规定】',
    )
    cnum1 = re.compile(r'^第([一二三四五六七八九十百千万零]{1,5})条\s*\n*$')
    cnum2 = re.compile(r'^第([一二三四五六七八九十百千万零]{1,5})条\s*\w')

    cts = []
    # Article numbers waiting to be merged into the next ordinary line.
    # NOTE(review): only the most recent entry is popped, so consecutive
    # held lines with no ordinary line between them are dropped -- confirm
    # that this is intended.
    ctstmp = []
    for li in content:
        li = li.lstrip()
        stripped = li.strip()
        if stripped in headings:
            cts.append(r'\subsection{%s}' % stripped)
        elif chapter_re.match(li):
            cts.append(r'\subsection{%s}' % stripped)
        elif section_re.match(li):
            cts.append(r'\subsubsection{%s}' % stripped)
        elif item0_bool and cnum.match(li):
            # ``cnum`` is a file-level pattern; the flag is tested first so
            # it is only consulted when the option is enabled.
            cts.append(r'\subsubsection{%s}' % stripped)
        elif item1_bool and cnum1.match(li):
            ctstmp.append(stripped)
        elif item2_bool and cnum2.match(li):
            # Insert the gap directly after "第…条".  Slicing on the matched
            # head keeps this correct when no whitespace follows the number
            # and never touches later occurrences in the line.
            head = '第%s条' % cnum2.match(li).group(1)
            li = head + r'\hspace{1em}' + li[len(head):]
            ctstmp.append(li)
        else:
            li = stripped
            if li and ctstmp:
                li = ctstmp.pop() + r'\hspace{1em}' + li
            cts.append(li)

    cts = '\n\n'.join(cts).replace(' ', '')
    # Escape characters that are special to LaTeX.
    for char in '#&$|_%':
        cts = cts.replace(char, '\\' + char)

    if os.path.exists(outFile2):
        # Do not overwrite an earlier result: append a timestamp suffix.
        tmf = os.path.splitext(outFile2)
        time.sleep(0.02)
        outFile2 = tmf[0] + '_%s' % int(time.time() * 10000) + tmf[1]

    with open(outFile2, 'w', encoding='utf8') as fl:
        fl.write(section % name)
        if pyin:
            fl.write('\n\n' + r'\begin{pinyinscope}')
        fl.write('\n\n' + cts + '\n\n')
        if pyin:
            fl.write('\n\n' + r'\end{pinyinscope}')
    return outFile2
def make_Mulu_contentv1(files,
                        m1=re.compile(r'^第\w{1,3}[编|篇]'),
                        m2=re.compile(r'^第\w{1,3}章'),
                        m3=re.compile(r'^第\w{1,3}节'),
                        index=True):
    """Build an HTML table of contents and HTML body from text documents.

    Each file becomes an ``<h1>`` section.  Inside a file, lines matching
    ``m1``/``m2``/``m3`` become ``<h2>``/``<h3>``/``<h4>`` headings with a
    matching table-of-contents link; every other non-empty line becomes an
    HTML-escaped ``<p>`` paragraph.

    Args:
        files: list of plain-text (``.txt``) or Word (``.doc``/``.docx``)
            file paths.
        m1: compiled pattern for first-level headings.
            NOTE(review): the default class ``[编|篇]`` also matches a
            literal ``|``.
        m2: compiled pattern for second-level headings.
        m3: compiled pattern for third-level headings.
        index: when true, third-level headings are also listed in the table
            of contents.

    Returns:
        ``(toc_html, body_html)`` as two strings.

    Raises:
        ValueError: if a file has an unsupported extension.
    """
    def escape(raw):
        # '&' must go first so the entities produced below are not re-escaped.
        for char, entity in (('&', '&amp;'), ('<', '&lt;'), ('>', '&gt;'),
                             ('"', '&quot;'), ('®', '&reg;'),
                             ('©', '&copy;'), ('™', '&trade;')):
            raw = raw.replace(char, entity)
        return raw

    # Both outputs are accumulated in memory; no scratch files are created
    # in the working directory.
    toc = []
    body = []
    toc.append('''<div id="table-of-contents"> <h2>Table of Contents</h2> <div id="text-table-of-contents"> \n''')

    for i, txtName in enumerate(files):
        ext = os.path.splitext(txtName)[1].lower()
        if ext == '.txt':
            try:
                with open(txtName, 'r', encoding='utf8') as fh:
                    text = fh.readlines()
            except UnicodeDecodeError:
                # Not valid utf8: retry with the legacy Chinese encoding.
                with open(txtName, 'r', encoding='gb18030') as fh:
                    text = fh.readlines()
        elif ext in ('.doc', '.docx'):
            raw = msdoc2text(txtName)
            text = [row.strip() for row in raw.split('\n') if row.strip()]
        else:
            # Previously this fell through and silently reused the previous
            # file's text (or failed with a NameError on the first file).
            raise ValueError('unsupported file type: %r' % (txtName,))

        # Normalise to stripped lines separated by single blank lines.
        text = [row.strip() for row in text]
        text = re.sub(r'\n{1,}', r'\n\n', '\n'.join(text)).splitlines()

        ntitle = os.path.splitext(os.path.basename(txtName))[0]
        ntitle = cc.sub('', ntitle)
        if i > 0:
            # Close the previous file's top-level entry.
            toc.append('</li></ul>\n')
        toc.append('<ul><li><a href="#sec-%s-%s">%s</a>\n' % (i, txtName, ntitle))
        body.append('''<h1 id="sec-%s-%s">%s</h1> \n''' % (i, txtName, ntitle))

        # Per-file counters used to build heading ids.
        muI = 1
        muII = 1
        muIII = 1
        for line in text:
            line = line.strip()
            if m1.match(line) is not None:
                if muI > 1:
                    # Not the first level-1 heading: close the previous one.
                    toc.append('</li></ul>\n')
                    body.append('</div>\n')
                toc.append('<ul><li><a href="#sec-%s-%s">%s</a>\n' % (muI, txtName, line))
                toc.append('\n')
                body.append('''<div id="outline-container-%s" class="outline-%s"> <h2 id="sec-%s-%s">%s</h2>\n''' % (muI, muI + 1, muI, txtName, line))
                muI = muI + 1
            elif m2.match(line) is not None:
                if muII > 1:
                    toc.append('</li></ul>\n')
                toc.append('<ul><li><a href="#sec-%s-%s-%s">%s</a>\n' % (muI, muII, txtName, line))
                # The heading is closed with </h3> to match its opening tag.
                body.append('<div id="outline-container-%s-%s"><h3 id="sec-%s-%s-%s">%s</h3>\n' % (muI, muII, muI, muII, txtName, line))
                muII = muII + 1
            elif m3.match(line) is not None:
                if index:
                    toc.append('<ul><li><a href="#sec-%s-%s-%s-%s">%s</a></li></ul>\n' % (muI, muII, muIII, txtName, line))
                    toc.append('\n')
                # The heading is always written to the body; ``index`` only
                # controls the table-of-contents entry.  (Before, index=False
                # re-emitted the previous heading's markup instead.)
                body.append('<div id="outline-container-%s-%s-%s"><h4 id="sec-%s-%s-%s-%s">%s</h4>\n ' % (muI, muII, muIII, muI, muII, muIII, txtName, line))
                muIII = muIII + 1
            elif len(line) > 0:
                # NOTE(review): the original chain ended with a space-to-space
                # replace that had no effect; add a non-breaking-space
                # substitution here if that was the intent.
                line = escape(line).replace('\t', " ")
                body.append('<p>  %s</p>\n' % line)

    toc.append(r'</li></ul>')
    body.append('</div>')
    toc.append(r'</div></div>')
    return ''.join(toc), ''.join(body)
def make_Mulu_content(files,
                      m1=re.compile(r'^第\w{1,3}[编|篇]'),
                      m2=re.compile(r'^第\w{1,3}章'),
                      m3=re.compile(r'^第\w{1,3}节'),
                      index=True, py=False):
    """Build an HTML table of contents and HTML body from text documents.

    Each file becomes an ``<h1>`` section.  Inside a file, lines matching
    ``m1``/``m2``/``m3`` become ``<h2>``/``<h3>``/``<h4>`` headings, each
    opening a ``<div>`` that is closed when the next heading of the same or
    a higher level arrives.  Every other non-empty line becomes a ``<p>``
    paragraph, optionally annotated with pinyin.

    Args:
        files: list of plain-text (``.txt``) or Word (``.doc``/``.docx``)
            file paths.
        m1: compiled pattern for first-level headings.
            NOTE(review): the default class ``[编|篇]`` also matches a
            literal ``|``.
        m2: compiled pattern for second-level headings.
        m3: compiled pattern for third-level headings.
        index: when true, third-level headings are also listed in the table
            of contents.
        py: when true, paragraphs are segmented with ``SnowNLP`` and every
            segment is annotated via ``lazy_pinyin`` inside a ``<ruby>``
            element.

    Returns:
        ``(toc_html, body_html)`` as two strings.

    Raises:
        ValueError: if a file has an unsupported extension.
    """
    def escape(raw):
        # '&' must go first so the entities produced below are not re-escaped.
        for char, entity in (('&', '&amp;'), ('<', '&lt;'), ('>', '&gt;'),
                             ('"', '&quot;'), ('®', '&reg;'),
                             ('©', '&copy;'), ('™', '&trade;')):
            raw = raw.replace(char, entity)
        return raw

    # Both outputs are accumulated in memory; no scratch files are created
    # in the working directory.
    toc = []
    body = []
    toc.append('''<div id="table-of-contents"> <h2>Table of Contents</h2> <div id="text-table-of-contents"> \n''')

    for i, txtName in enumerate(files):
        ext = os.path.splitext(txtName)[1].lower()
        if ext == '.txt':
            try:
                with open(txtName, 'r', encoding='utf8') as fh:
                    text = fh.readlines()
            except UnicodeDecodeError:
                # Not valid utf8: retry with the legacy Chinese encoding.
                with open(txtName, 'r', encoding='gb18030') as fh:
                    text = fh.readlines()
        elif ext in ('.doc', '.docx'):
            raw = msdoc2text(txtName)
            text = [row.strip() for row in raw.split('\n') if row.strip()]
        else:
            # Previously this fell through and silently reused the previous
            # file's text (or failed with a NameError on the first file).
            raise ValueError('unsupported file type: %r' % (txtName,))

        # Normalise to stripped lines separated by single blank lines.
        text = [row.strip() for row in text]
        text = re.sub(r'\n{1,}', r'\n\n', '\n'.join(text)).splitlines()

        ntitle = os.path.splitext(os.path.basename(txtName))[0]
        ntitle = cc.sub('', ntitle)
        if i > 0:
            # Close the previous file's top-level entry.
            toc.append('</li></ul>\n')
        toc.append('<ul><li><a href="#sec-%s-%s">%s</a>\n' % (i, txtName, ntitle))
        body.append('''<h1 id="sec-%s-%s">%s</h1> \n''' % (i, txtName, ntitle))

        # Per-file counters used to build heading ids.
        muI = 1
        muII = 1
        muIII = 1
        # Text of the most recent heading; '' until the first one is seen.
        tem = ''
        for line in text:
            line = line.strip()

            # Close whatever the previous heading left open, depending on
            # the level of the previous heading and of the current line.
            # TOC entries are opened with '<ul><li>', so they are closed
            # with '</li></ul>'.
            if m2.match(tem):
                if m2.match(line):
                    toc.append('</li></ul>\n')
                    body.append('</div>\n')
                elif m1.match(line):
                    toc.append('</li></ul>\n</li></ul>\n')
                    body.append('</div>\n</div>')
            elif m1.match(tem):
                if m1.match(line):
                    toc.append('</li></ul>\n')
                    body.append('</div>\n')
            elif m3.match(tem):
                if m1.match(line):
                    toc.append('</li></ul>\n</li></ul>\n')
                    body.append('</div>\n</div></div>')
                elif m2.match(line):
                    toc.append('</li></ul>\n')
                    body.append('</div>\n</div>')
                elif m3.match(line):
                    body.append('</div>\n')

            if m1.match(line) is not None:
                toc.append('<ul><li><a href="#sec-%s-%s">%s</a>\n' % (muI, txtName, line))
                body.append('''<div id="outline-container-%s" class="outline-%s"> <h2 id="sec-%s-%s">%s</h2>\n''' % (muI, muI + 1, muI, txtName, line))
                tem = line
                muI += 1
            elif m2.match(line) is not None:
                toc.append('<ul><li><a href="#sec-%s-%s-%s">%s</a>\n' % (muI, muII, txtName, line))
                # The heading is closed with </h3> to match its opening tag.
                body.append('<div id="outline-container-%s-%s"><h3 id="sec-%s-%s-%s">%s</h3>\n' % (muI, muII, muI, muII, txtName, line))
                tem = line
                muII += 1
            elif m3.match(line) is not None:
                if index:
                    toc.append('<ul><li><a href="#sec-%s-%s-%s-%s">%s</a></li></ul>\n' % (muI, muII, muIII, txtName, line))
                    toc.append('\n')
                # The heading is always written to the body; ``index`` only
                # controls the table-of-contents entry.  (Before, index=False
                # re-emitted the previous heading's markup instead.)
                body.append('<div id="outline-container-%s-%s-%s"><h4 id="sec-%s-%s-%s-%s">%s</h4>\n ' % (muI, muII, muIII, muI, muII, muIII, txtName, line))
                muIII = muIII + 1
                tem = line
            elif len(line) > 0:
                # NOTE(review): the original chain ended with a space-to-space
                # replace that had no effect; add a non-breaking-space
                # substitution here if that was the intent.
                line = line.replace('\t', " ")
                if py:
                    # Segment the raw text and escape each piece afterwards,
                    # so entities are never split by the segmenter.
                    pieces = ['<ruby> ']
                    for word in SnowNLP(line).words:
                        piy = ' '.join(lazy_pinyin(word, 1))
                        pieces.append(escape(word) + '<rt>%s</rt>' % escape(piy))
                    pieces.append('</ruby>')
                    line = "<p>  %s</p>\n" % ''.join(pieces)
                else:
                    line = '<p>  %s</p>\n' % escape(line)
                body.append(line)

    toc.append(r'</li></ul>')
    body.append('</div>')
    toc.append(r'</div></div>')
    return ''.join(toc), ''.join(body)