Python msdoc2textの例、mswdoc.docx2txt.msdoc2text Pythonの例

コード例 #1

0

ファイルを表示

ファイル: mdiff.py プロジェクト: davischan3168/newpackage

def _diff_input_lines(path):
    """Load one diff input as a list of lines, chosen by file extension.

    ``.txt`` files are read with ``readfile`` and ``.doc``/``.docx`` files
    with ``msdoc2text`` (both are defined elsewhere in the project).

    Raises:
        ValueError: if the extension is not ``.txt``, ``.doc`` or ``.docx``.
    """
    ext = os.path.splitext(path)[1].lower()
    if ext == '.txt':
        lines = readfile(path)
    elif ext in ('.doc', '.docx'):
        lines = msdoc2text(path)
    else:
        raise ValueError('unsupported file type for diff: %r' % (path,))
    # HtmlDiff compares sequences of lines; a single string would be
    # compared character by character, so split it into lines first.
    # NOTE(review): other callers treat the msdoc2text result as one string
    # (they call .split('\n') on it); readfile's return type is not visible
    # here -- confirm.
    if isinstance(lines, str):
        lines = lines.splitlines()
    return lines


def diff_2files_html(textfile1,textfile2):
    """Write a side-by-side HTML diff of two files to ``diff_file.html``.

    Args:
        textfile1: path of the first file (``.txt``, ``.doc`` or ``.docx``).
        textfile2: path of the second file (same supported types).

    Returns:
        None. If either path is an empty string a usage message is printed
        and nothing is written.

    Raises:
        ValueError: if either file has an unsupported extension.
    """
    if textfile1 == "" or textfile2 == "":
        print("Usage:test.py filename1 filename2")
        return

    text1_lines = _diff_input_lines(textfile1)
    text2_lines = _diff_input_lines(textfile2)
    doc = difflib.HtmlDiff().make_file(text1_lines, text2_lines)

    # The page declares utf-8, so the file must actually be written as utf-8
    # rather than in the platform's default encoding.
    with open('diff_file.html', 'w', encoding='utf-8') as f:
        f.write(r'<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />')
        f.write(doc)
    return

コード例 #2

0

ファイルを表示

def File2all(fpath):
    """Read a text or Word file and split it into entries.

    The file is reduced to its non-blank, stripped lines. Every line that
    matches the module-level pattern ``cs`` starts a new entry; following
    non-matching lines are appended to the current entry, separated by
    newlines. Lines that come before the first match are dropped when at
    least one entry is found.

    Args:
        fpath: path to a ``.txt``, ``.doc`` or ``.docx`` file. Text files are
            decoded as UTF-8, falling back to GB18030.

    Returns:
        A list of ``(entry, name)`` tuples when any line matches ``cs``;
        otherwise a single ``(text, name)`` tuple holding all non-blank
        lines joined by newlines. ``name`` is the file's base name without
        its extension.

    Raises:
        ValueError: if the extension is not supported.
    """
    ext = os.path.splitext(fpath)[1].lower()
    if ext == '.txt':
        try:
            with open(fpath, 'r', encoding='utf8') as fh:
                txt = fh.readlines()
        except UnicodeDecodeError:
            # Not valid UTF-8: retry with the legacy Chinese encoding.
            with open(fpath, 'r', encoding='gb18030') as fh:
                txt = fh.readlines()
    elif ext in ('.doc', '.docx'):
        txt = msdoc2text(fpath).split('\n')
    else:
        raise ValueError('unsupported file type: %r' % (fpath,))

    txtlist = [s for s in (raw.strip() for raw in txt) if s]

    emoi = []
    for entry_line in txtlist:
        if cs.match(entry_line):
            emoi.append(entry_line)
        elif emoi:
            emoi[-1] = emoi[-1] + '\n' + entry_line

    name = os.path.splitext(os.path.basename(fpath))[0]

    if emoi:
        return [(entry, name) for entry in emoi]
    return '\n'.join(txtlist), name

コード例 #3

0

ファイルを表示

def File2read(fpath):
    """Read a text or Word file and return its non-blank lines as one string.

    Args:
        fpath: path to a ``.txt``, ``.doc`` or ``.docx`` file. Text files are
            decoded as UTF-8, falling back to GB18030; Word files are
            converted with ``msdoc2text`` (defined elsewhere).

    Returns:
        ``(text, name)`` where ``text`` is the stripped, non-blank lines
        joined by newlines and ``name`` is the file's base name without its
        extension.

    Raises:
        ValueError: if the extension is not supported.
    """
    ext = os.path.splitext(fpath)[1].lower()
    if ext == '.txt':
        try:
            with open(fpath, 'r', encoding='utf8') as fh:
                txt = fh.readlines()
        except UnicodeDecodeError:
            # Not valid UTF-8: retry with the legacy Chinese encoding.
            with open(fpath, 'r', encoding='gb18030') as fh:
                txt = fh.readlines()
    elif ext in ('.doc', '.docx'):
        txt = msdoc2text(fpath).split('\n')
    else:
        raise ValueError('unsupported file type: %r' % (fpath,))

    txtlist = [s for s in (raw.strip() for raw in txt) if s]
    name = os.path.splitext(os.path.basename(fpath))[0]
    return '\n'.join(txtlist), name

コード例 #4

0

ファイルを表示

def Singal_input(InFile,pyin=False,\
                 item1_bool=False,item2_bool=False,\
                 item0_bool=False):
    """Convert one .txt/.doc/.docx file into a LaTeX fragment file.

    The output file is written next to the input. Its name is derived from
    the input's base name via the module-level object ``p``: ``get_pinyin``
    for names shorter than 12 characters, ``get_initials`` otherwise. If
    that file already exists, a timestamp suffix is added instead of
    overwriting it.

    Args:
        InFile: path of the input file.
        pyin: when true, wrap the body in a ``pinyinscope`` environment.
        item1_bool: when true, a line holding only an article label is held
            back and prepended to the following text line.
        item2_bool: when true, an article label followed by text on the same
            line gets a 1em gap inserted after the label.
        item0_bool: when true, lines matching the module-level pattern
            ``cnum`` become subsubsection headings.

    Returns:
        The path of the ``.tex`` file that was written.

    Raises:
        ValueError: if the input extension is not supported.
    """
    path = os.path.abspath(InFile)
    dname = os.path.dirname(path)
    # Raw strings: the regex escapes below are not valid Python string escapes.
    ss = re.compile(r'第\w{1,3}[章编]')
    sss = re.compile(r'第\w{1,3}[节]')
    rpls = re.compile(r'[\W_#]')
    allname = os.path.splitext(os.path.basename(path))
    name = rpls.sub('', allname[0]).strip()
    if len(name) < 12:
        outFile = p.get_pinyin(name) + '.tex'
    else:
        outFile = p.get_initials(name) + '.tex'
    for unwanted in ('、', '&nbsp', ':', '-', '：', '（', '）', '《', '》', '—'):
        outFile = outFile.replace(unwanted, '')

    if sys.platform.startswith('win'):
        dname = '/'.join(dname.split('\\'))
    outFile2 = dname + '/' + outFile

    ext = allname[1].lower()
    if ext in ('.doc', '.docx'):
        text = msdoc2text(path)
        content = [i.strip() for i in text.split('\n') if len(i.strip()) > 0]
    elif ext == '.txt':
        try:
            with open(InFile, 'r', encoding='utf8') as f:
                content = f.readlines()
        except UnicodeDecodeError:
            # Not valid UTF-8: retry with the legacy Chinese encoding.
            with open(InFile, 'r', encoding='gbk') as f:
                content = f.readlines()
    else:
        raise ValueError('unsupported file type: %r' % (InFile,))

    fixed_headings = ('裁判要点','基本案情','裁判结果','裁判理由','相关法条','【关键词】','【诉讼过程】','【基本案情】','【抗辩理由】','【案件结果】','【要旨】','【指导意义】','【相关法律规定】')
    cnum1 = re.compile(r'^第([一二三四五六七八九十百千万零]{1,5})条\s*\n*$')
    cnum2 = re.compile(r'^第([一二三四五六七八九十百千万零]{1,5})条\s*\w')

    out_lines = []
    ctstmp = []  # article labels waiting to be merged into the next text line
    for li in content:
        li = li.lstrip()
        if li.strip() in fixed_headings:
            out_lines.append(r'\subsection{%s}' % li.strip())
        elif ss.match(li):
            out_lines.append(r'\subsection{%s}' % li.strip())
        elif sss.match(li):
            out_lines.append(r'\subsubsection{%s}' % li.strip())
        # NOTE(review): `cnum` is not defined in this function; it is assumed
        # to be a module-level pattern. The flag is tested first so the name
        # is only looked up when the feature is actually enabled.
        elif item0_bool and cnum.match(li):
            out_lines.append(r'\subsubsection{%s}' % li.strip())
        elif item1_bool and cnum1.match(li):
            ctstmp.append(li.strip())
        elif item2_bool and cnum2.match(li):
            # The match ends with the first character of the article text;
            # drop it, then drop any whitespace to isolate the label.
            label = cnum2.match(li).group()[:-1].rstrip()
            # Insert the gap after the leading label only. Slicing keeps the
            # label intact when no whitespace follows it and leaves later
            # occurrences of the same text in the line untouched.
            li = label + r'\hspace{1em}' + li[len(label):]
            ctstmp.append(li)
        else:
            li = li.strip()
            if len(li) > 0 and len(ctstmp) > 0:
                li = ctstmp.pop() + r'\hspace{1em}' + li.strip()
            out_lines.append(li)

    cts = '\n\n'.join(out_lines).replace('&nbsp', '')
    # Escape characters that are special to LaTeX.
    cts = cts.replace('#', r'\#').replace('&', r'\&').replace('$', r'\$').replace('|', r'\|').replace('_', r'\_')
    cts = cts.replace('%', r'\%')

    if os.path.exists(outFile2):
        # Do not overwrite an existing file: append a timestamp to the name.
        tmf = os.path.splitext(outFile2)
        time.sleep(0.02)
        outFile2 = tmf[0] + '_%s' % int(time.time() * 10000) + tmf[1]

    with open(outFile2, 'w', encoding='utf8') as fl:
        fl.write(section % name)
        if pyin:
            fl.write('\n\n' + r'\begin{pinyinscope}')
        fl.write('\n\n' + cts + '\n\n')
        if pyin:
            fl.write('\n\n' + r'\end{pinyinscope}')
    return outFile2

コード例 #5

0

ファイルを表示

ファイル: utilth.py プロジェクト: davischan3168/newpackage

def make_Mulu_contentv1(files,m1=re.compile(r'^第\w{1,3}[编|篇]'),m2=re.compile(r'^第\w{1,3}章'),m3=re.compile(r'^第\w{1,3}节'),index=True):
    """Build an HTML table of contents and an HTML body from a list of files.

    Each file contributes a top-level entry (its cleaned base name). Lines
    matching ``m1``, ``m2`` and ``m3`` become level-2, level-3 and level-4
    headings; every other non-empty line becomes a paragraph.

    Args:
        files: list of paths to ``.txt`` files (UTF-8, falling back to
            GB18030) or ``.doc``/``.docx`` files (converted with
            ``msdoc2text``, defined elsewhere).
        m1: compiled pattern for level-2 headings.
        m2: compiled pattern for level-3 headings.
        m3: compiled pattern for level-4 headings.
        index: when true, level-4 headings are also listed in the table of
            contents; they always appear in the body.

    Returns:
        ``(toc_html, body_html)`` as two strings.

    Raises:
        ValueError: if a file has an unsupported extension.
    """
    # Both documents are assembled in memory, so nothing in the working
    # directory is created, overwritten or deleted.
    toc = []
    body = []

    toc.append('<div id="table-of-contents">\n'
               '    <h2>Table of Contents</h2>\n'
               '    <div id="text-table-of-contents">\n'
               '    \n')

    for i, txtName in enumerate(files):
        ext = os.path.splitext(txtName)[1].lower()
        if ext == '.txt':
            try:
                with open(txtName, 'r', encoding='utf8') as fh:
                    text = fh.readlines()
            except UnicodeDecodeError:
                # Not valid UTF-8: retry with the legacy Chinese encoding.
                with open(txtName, 'r', encoding='gb18030') as fh:
                    text = fh.readlines()
        elif ext in ('.doc', '.docx'):
            text = [s.strip() for s in msdoc2text(txtName).split('\n') if len(s.strip()) > 0]
        else:
            raise ValueError('unsupported file type: %r' % (txtName,))

        # Normalise to stripped lines separated by exactly one blank line.
        joined = '\n'.join(x.strip() for x in text)
        text = re.sub(r'\n{1,}', r'\n\n', joined).splitlines()

        ntitle = os.path.splitext(os.path.basename(txtName))[0]
        ntitle = cc.sub('', ntitle)  # `cc` is a module-level pattern defined elsewhere
        if i > 0:
            toc.append('</li></ul>\n')
        toc.append('<ul><li><a href="#sec-%s-%s">%s</a>\n' % (i, txtName, ntitle))
        titles = '''<h1 id="sec-%s-%s">%s</h1> \n''' % (i, txtName, ntitle)
        body.append(titles)

        # Running heading counters per level; mu1o/mu2o remember the value
        # at which the previous list of that level was last closed.
        muI = 1
        mu1o = muI
        muII = 1
        mu2o = muII
        muIII = 1
        for line in text:
            line = line.strip()
            if m1.match(line) is not None:
                if muI > mu1o:
                    toc.append('</li></ul>\n')
                    mu1o = muI
                body.append('</div>\n')
                toc.append('<ul><li><a href="#sec-%s-%s">%s</a>\n' % (muI, txtName, line))
                toc.append('\n')
                titles = ('<div id="outline-container-%s" class="outline-%s">\n'
                          '                <h2 id="sec-%s-%s">%s</h2>\n') % (muI, muI + 1, muI, txtName, line)
                body.append(titles)
                muI = muI + 1
            elif m2.match(line) is not None:
                if muII > mu2o:
                    toc.append('</li></ul>\n')
                    mu2o = muII
                toc.append('<ul><li><a href="#sec-%s-%s-%s">%s</a>\n' % (muI, muII, txtName, line))
                # The heading is opened as <h3>, so it is closed as </h3>.
                titles = '<div id="outline-container-%s-%s"><h3 id="sec-%s-%s-%s">%s</h3>\n' % (muI, muII, muI, muII, txtName, line)
                body.append(titles)
                muII = muII + 1
            elif m3.match(line) is not None:
                # The body always gets this line's own heading; `index` only
                # controls whether it is also listed in the table of contents.
                if index:
                    toc.append('<ul><li><a  href="#sec-%s-%s-%s-%s">%s</a></li></ul>\n' % (muI, muII, muIII, txtName, line))
                    toc.append('\n')
                titles = '<div id="outline-container-%s-%s-%s"><h4 id="sec-%s-%s-%s-%s">%s</h4>\n ' % (muI, muII, muIII, muI, muII, muIII, txtName, line)
                body.append(titles)
                muIII = muIII + 1
            elif len(line) > 0:
                # NOTE(review): apart from the tab expansion, these
                # replacements map each string to itself as written here.
                # They look like HTML entity conversions whose entity text
                # was lost -- confirm the intended direction against the
                # upstream project before changing them.
                line=line\
                  .replace('&','&')\
                  .replace('<','<')\
                  .replace('® ','® ')\
                  .replace('"','"')\
                  .replace('©','©')\
                  .replace('™','™')\
                  .replace('<','<')\
                  .replace('\t',"    ").\
                  replace(' ',' ')
                body.append('<p>&emsp;&emsp;%s</p>\n' % line)

        toc.append(r'</li></ul>')

    body.append('</div>')
    toc.append(r'</div></div>')

    return ''.join(toc), ''.join(body)

コード例 #6

0

ファイルを表示

ファイル: utilth.py プロジェクト: davischan3168/newpackage

def make_Mulu_content(files,m1=re.compile(r'^第\w{1,3}[编|篇]'),m2=re.compile(r'^第\w{1,3}章'),m3=re.compile(r'^第\w{1,3}节'),index=True,py=False):
    """Build an HTML table of contents and an HTML body from a list of files.

    Each file contributes a top-level entry (its cleaned base name). Lines
    matching ``m1``, ``m2`` and ``m3`` become level-2, level-3 and level-4
    headings; every other non-empty line becomes a paragraph. Before each
    line is emitted, the containers opened by the previous heading are
    closed according to the level of the previous heading and of the
    current line.

    Args:
        files: list of paths to ``.txt`` files (UTF-8, falling back to
            GB18030) or ``.doc``/``.docx`` files (converted with
            ``msdoc2text``, defined elsewhere).
        m1: compiled pattern for level-2 headings.
        m2: compiled pattern for level-3 headings.
        m3: compiled pattern for level-4 headings.
        index: when true, level-4 headings are also listed in the table of
            contents; they always appear in the body.
        py: when true, paragraphs are segmented with ``SnowNLP`` and each
            segment is annotated with ``lazy_pinyin`` output inside
            ``<ruby>``/``<rt>`` markup.

    Returns:
        ``(toc_html, body_html)`` as two strings.

    Raises:
        ValueError: if a file has an unsupported extension.
    """
    # Both documents are assembled in memory, so nothing in the working
    # directory is created, overwritten or deleted.
    toc = []
    body = []

    toc.append('<div id="table-of-contents">\n'
               '    <h2>Table of Contents</h2>\n'
               '    <div id="text-table-of-contents">\n'
               '    \n')

    for i, txtName in enumerate(files):
        ext = os.path.splitext(txtName)[1].lower()
        if ext == '.txt':
            try:
                with open(txtName, 'r', encoding='utf8') as fh:
                    text = fh.readlines()
            except UnicodeDecodeError:
                # Not valid UTF-8: retry with the legacy Chinese encoding.
                with open(txtName, 'r', encoding='gb18030') as fh:
                    text = fh.readlines()
        elif ext in ('.doc', '.docx'):
            text = [s.strip() for s in msdoc2text(txtName).split('\n') if len(s.strip()) > 0]
        else:
            raise ValueError('unsupported file type: %r' % (txtName,))

        # Normalise to stripped lines separated by exactly one blank line.
        joined = '\n'.join(x.strip() for x in text)
        text = re.sub(r'\n{1,}', r'\n\n', joined).splitlines()

        ntitle = os.path.splitext(os.path.basename(txtName))[0]
        ntitle = cc.sub('', ntitle)  # `cc` is a module-level pattern defined elsewhere
        if i > 0:
            toc.append('</li></ul>\n')
        toc.append('<ul><li><a href="#sec-%s-%s">%s</a>\n' % (i, txtName, ntitle))
        titles = '''<h1 id="sec-%s-%s">%s</h1> \n''' % (i, txtName, ntitle)
        body.append(titles)

        # Running heading counters per level.
        muI = 1
        muII = 1
        muIII = 1
        # Text of the most recent heading; decides which containers to close.
        prev_heading = ''
        for line in text:
            line = line.strip()

            # Close what the previous heading opened, depending on the level
            # of the previous heading and of the current line.
            if m2.match(prev_heading):
                if m2.match(line):
                    toc.append('</ul></li>\n')
                    body.append('</div>\n')
                elif m1.match(line):
                    toc.append('</ul></li>\n</ul></li>\n')
                    body.append('</div>\n</div>')
            elif m1.match(prev_heading):
                if m1.match(line):
                    toc.append('</ul></li>\n')
                    body.append('</div>\n')
            elif m3.match(prev_heading):
                if m1.match(line):
                    toc.append('</ul></li>\n</ul></li>\n')
                    body.append('</div>\n</div></div>')
                elif m2.match(line):
                    toc.append('</ul></li>\n')
                    body.append('</div>\n</div>')
                elif m3.match(line):
                    body.append('</div>\n')

            if m1.match(line) is not None:
                toc.append('<ul><li><a href="#sec-%s-%s">%s</a>\n' % (muI, txtName, line))
                titles = ('<div id="outline-container-%s" class="outline-%s">\n'
                          '                <h2 id="sec-%s-%s">%s</h2>\n') % (muI, muI + 1, muI, txtName, line)
                body.append(titles)
                prev_heading = line
                muI += 1
            elif m2.match(line) is not None:
                toc.append('<ul><li><a href="#sec-%s-%s-%s">%s</a>\n' % (muI, muII, txtName, line))
                # The heading is opened as <h3>, so it is closed as </h3>.
                titles = '<div id="outline-container-%s-%s"><h3 id="sec-%s-%s-%s">%s</h3>\n' % (muI, muII, muI, muII, txtName, line)
                body.append(titles)
                prev_heading = line
                muII += 1
            elif m3.match(line) is not None:
                # The body always gets this line's own heading; `index` only
                # controls whether it is also listed in the table of contents.
                if index:
                    toc.append('<ul><li><a  href="#sec-%s-%s-%s-%s">%s</a></li></ul>\n' % (muI, muII, muIII, txtName, line))
                    toc.append('\n')
                titles = '<div id="outline-container-%s-%s-%s"><h4 id="sec-%s-%s-%s-%s">%s</h4>\n ' % (muI, muII, muIII, muI, muII, muIII, txtName, line)
                body.append(titles)
                muIII = muIII + 1
                prev_heading = line
            elif len(line) > 0:
                # NOTE(review): apart from the tab expansion, these
                # replacements map each string to itself as written here.
                # They look like HTML entity conversions whose entity text
                # was lost -- confirm the intended direction against the
                # upstream project before changing them.
                line=line\
                  .replace('&','&')\
                  .replace('<','<')\
                  .replace('® ','® ')\
                  .replace('"','"')\
                  .replace('©','©')\
                  .replace('™','™')\
                  .replace('<','<')\
                  .replace('\t',"    ").\
                  replace(' ',' ')
                if py:
                    # One <rt> annotation per segment; the <ruby> element is
                    # closed before the paragraph ends.
                    ruby_parts = ['<ruby> ']
                    for word in SnowNLP(line).words:
                        piy = ' '.join(lazy_pinyin(word, 1))
                        ruby_parts.append(word + '<rt>%s</rt>' % piy)
                    ruby_parts.append('</ruby>')
                    line = "<p>&emsp;&emsp;%s</p>\n" % ''.join(ruby_parts)
                else:
                    line = '<p>&emsp;&emsp;%s</p>\n' % line
                body.append(line)

        toc.append(r'</li></ul>')

    body.append('</div>')
    toc.append(r'</div></div>')

    return ''.join(toc), ''.join(body)