def convert(output_type, docx_path, output_path):
    if output_type == '--html':
        output = PyDocX.to_html(docx_path)
    elif output_type == '--markdown':
        output = PyDocX.to_markdown(docx_path)
    else:
        print('Only valid output formats are --html and --markdown')
        return 2
    with open(output_path, 'wb') as f:
        f.write(output.encode('utf-8'))
    return 0
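A minimal sketch of how convert() above might be driven from the command line; the wrapper below, its argument order, and the usage message are assumptions added for illustration, not part of the original example.

import sys

if __name__ == '__main__':
    # expected call: python convert.py --html|--markdown input.docx output.html
    if len(sys.argv) != 4:
        print('usage: convert.py --html|--markdown <input.docx> <output path>')
        sys.exit(1)
    # convert() returns 0 on success and 2 on an unknown output format
    sys.exit(convert(sys.argv[1], sys.argv[2], sys.argv[3]))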
def wordtohtml(request):
    media_root = os.path.join(settings.BASE_DIR, 'upload/')
    if request.method == "POST":
        file = request.FILES.get('file')
        if file is not None:
            t = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
            id = shortuuid.uuid()
            word = wordhtml(word=file, time=t, uuid=id)
            word.save()
            file_path = media_root + str(word.word)
            html = PyDocX.to_html(file_path)
            html_name = str(word.uuid) + ".html"
            txt_name = media_root + "word/" + html_name
            f = open(txt_name, 'w', encoding="utf-8")
            f.write(html)
            f.close()
            if settings.DEBUG:
                url = "http://127.0.0.1:8000/upload/word/" + html_name
            else:
                url = "https://www.manjiexiang.cn/upload/word/" + html_name
            return HttpResponseRedirect(url)
        else:
            me = Me.objects.all()
            return render(request, "wordtohtml.html", {"msg": me[0]})
    else:
        me = Me.objects.all()
        return render(request, "wordtohtml.html", {"msg": me[0]})
def get_img(file):
    html = PyDocX.to_html(file)
    soup = BeautifulSoup(html, 'lxml')
    images_in_docx = []
    images_in_docx.append("aaaaaaaaaaaaaa")
    # for img in soup.findAll('img'):
    #     reg = re.compile('data.*?/(.*?);', re.S)
    #     style_img = reg.findall(img['src'])[0]
    #     strg = img['src'].replace("data:image/wmf;base64,", "").replace("data:image/jpeg;base64,", "")
    #     byte = base64.urlsafe_b64decode(strg)
    #     t0 = int(round(time.time() * 1000))
    #     tmp_path = '/tmp/%d.%s' % (t0, str(style_img))
    #     with open(tmp_path, 'wb') as file:
    #         file.write(byte)
    #     if style_img == 'wmf':
    #         t1 = int(round(time.time() * 1000))
    #         png_path = '/tmp/%d.png' % t1
    #         os.system('convert %s %s' % (tmp_path, png_path))
    #         f = open(png_path, 'rb')
    #         url = put(f)
    #         f.close()
    #         img['src'] = url
    #         images_in_docx.append(img)
    #         os.remove(png_path)
    #         os.remove(tmp_path)
    #     else:
    #         f = open(tmp_path, 'rb')
    #         url = put(f)
    #         f.close()
    #         img['src'] = url
    #         images_in_docx.append(img)
    #         os.remove(tmp_path)
    return images_in_docx
def read_word(file):
    proxy = []
    doc = docx.Document(file)
    for para in doc.paragraphs:
        proxy.append(para._element.xml)  # collect the raw XML of each paragraph in the docx
    threads = []
    # q = Queue
    q = {}
    t1 = threading.Thread(target=getMathml, args=(proxy, 'mmls_in_para', q))
    threads.append(t1)
    html = PyDocX.to_html(file)
    t2 = threading.Thread(target=get_img, args=(html, 'images_in_para', 'images_in_table', q))
    threads.append(t2)
    t1.start()
    t2.start()
    t1.join()
    t2.join()
    # for t in threads:
    #     t.setDaemon(True)
    #     t.start()
    #     t.join()
    mmls_in_para = q.get('mmls_in_para')
    images_in_para, images_in_table = q.get('images_in_para'), q.get('images_in_table')
    table_html, table_para = get_table(doc, images_in_table)  # table data extracted from the document
    paragraphs = get_para_html(proxy, mmls_in_para, images_in_para, table_html, table_para)  # paragraph info, returned as HTML tags
    return paragraphs
def docxtract(docxfile, **kwargs):
    '''
    Get a docx file and extract chapters.
    Return an array for further processing by the user.
    '''
    header = [u'h1', u'h2', u'h3', u'h4']
    html = PyDocX.to_html(docxfile)
    html = cleanhtml(html, **kwargs)
    # Get the tree
    p = False
    tree = []
    for child in html.find_all(True):
        if child.name in header:
            if p:
                tree.append([ptext, p])
                p = False
            tree.append(["<b>%s</b>" % child.get_text(), unicode(child)])
        else:
            if child.get_text().strip():
                if p:
                    p = ''.join([p, unicode(child)])
                else:
                    p = unicode(child)
                    ptext = "%s..." % child.get_text()[:80]
    if p:
        tree.append([ptext, p])
    return tree
def docx2html(file_path):
    '''
    Convert a docx file to HTML.
    :param file_path:
    :return:
    '''
    return PyDocX.to_html(file_path)
def store_document():
    sys.setdefaultencoding('utf-8')
    # Pass in a path
    html = PyDocX.to_html(open('cmpe273-greensheet.docx', 'rb'))
    db.docCollection.insert({"HTML": html, "FileName": 'cmpe273-greensheet'})
    return "success"
def _main_(tournament):
    os.chdir('C:\School\Quiz Bowl\Quizbowl Packets\\' + tournament)
    files = os.listdir(os.curdir)
    q = open(tournament + ' answers.csv', 'w')
    writer = csv.DictWriter(q, fieldnames=['Full Answer', 'Underlined Answer', 'Tournament', 'Packet'], lineterminator='\n')
    writer.writeheader()
    for f in files:
        html = PyDocX.to_html(f).decode('Windows-1252').encode('utf-8')
        html = clean_file(html)
        answers = re.findall(r'ANSWER: (?P<id>.*?)(?:</li>|<br />|</p>)', html)
        und = []
        for a in answers:
            underlined = re.findall(r'<span class=\"pydocx-underline\">(?P<id>.*?)</span>', a)
            output = ""
            for u in underlined:
                output += ' ' + u.strip()
            und.append(output.strip())
        for i in range(0, len(answers)):
            answer = re.sub(r'<[^>]*>', '', answers[i]).strip()
            writer.writerow({'Full Answer': answer.encode('utf-8'), 'Underlined Answer': und[i].encode('utf-8'), 'Tournament': tournament, 'Packet': f})
    q.close()
    shutil.copy(tournament + ' answers.csv', 'C:\School\Quiz Bowl\Stats\Answers')
    os.remove(tournament + ' answers.csv')
def docx_to_html(docx_path):
    docx_html = PyDocX.to_html(docx_path)
    # strip the inline <style>...</style> block that PyDocX emits
    style_start = docx_html.find("<style>")
    style_end = docx_html.find("</style>") + len("</style>")
    docx_html = docx_html[:style_start] + docx_html[style_end:]
    docx_html = '<div>' + docx_html.replace("_", " ") + '</div>'
    return docx_html
def main():
    # process every .docx found in the configured directory
    filelist = os.listdir(config.filepath)
    for file in filelist:
        if file.endswith(".docx"):
            print("start to solve ", file)
            filepath = config.filepath + "/" + file
            name = os.path.basename(filepath)
            temp_photo_path = temppath + os.path.sep + name.replace(".docx", "") + os.path.sep
            if not os.path.exists(temp_photo_path):
                os.makedirs(temp_photo_path)
            # convert to HTML with pydocx
            html = PyDocX.to_html(filepath)
            bsoup = bs4.BeautifulSoup(html, "lxml")
            imglist = bsoup.find_all("img")
            for img in imglist:
                # save each image from the HTML, convert it, and point the tag at the saved copy
                img["src"], path = savephoto(img["src"], temp_photo_path)
            with open(temp_photo_path + name.replace(".docx", ".html"), "w", encoding="utf-8") as file:
                file.write(bsoup.prettify())
            # convert the HTML back to docx
            # get all the body-level tags
            dstpath = temp_photo_path + name
            newdoc = docx.Document(os.getcwd() + os.path.sep + "docx/templates/default.docx")
            bodycontent = bsoup.find("body")
            bodychild = bodycontent.contents
            for child in bodychild:
                handle_tag(child, newdoc, temp_photo_path)
            newdoc.save(dstpath)
def insert_document():
    msg = ''
    if request.method == 'POST':
        uploaded_file = request.files['file']
        uploaded_file.save(secure_filename(uploaded_file.filename))
        html = PyDocX.to_html(open(uploaded_file.filename, 'rb'))
        parse_store_content(html)
        return "File successfully parsed and uploaded"
def trans_to_html():
    for one_file in glob.glob('*.docx'):
        name = one_file.replace('.docx', '')
        html = PyDocX.to_html(one_file)
        f = open(name + '.html', 'w', encoding='utf-8')
        f.write(html)
        f.close()
def merge_docx(docx_list=None, out_htmlpath=None):
    """
    docx_list is a list of strings which contains the (absolute) path of
    DOC/DOCX files to be merged. MERGE_DOCX() will follow the index order
    of docx_list for appending.
    Returns the HTML file as string. If OUT_HTMLPATH is given, write the
    HTML file out as well.
    """
    if docx_list is None:
        return None
    cleaner = Cleaner()
    parser = HTMLParser(encoding='utf-8')
    html_list = []
    for path in docx_list:
        try:
            tmp_html = PyDocX.to_html(path)
            html_list.append(cleaner.clean_html(lxml.html.fromstring(tmp_html, parser=parser)))
        except:  # 'MalformedDocxException'
            try:
                # Pretend it is a html
                html_file = '{}.html'.format(path)
                with open(html_file, 'rb') as tmp:
                    tmp_html = tmp.read()
                tmp_html = tmp_html.decode('utf-8')
                html_list.append(cleaner.clean_html(lxml.html.fromstring(tmp_html, parser=parser)))
            except:
                # Cannot convert
                continue
    # print html_list
    if len(html_list) > 1:
        # Append elements at the end of the first body
        main_body = html_list[0].xpath('./body')[0]
        for tree in html_list[1:]:
            elem_list = tree.xpath('./body/*')
            for elem in elem_list:
                main_body.append(elem)
    elif len(html_list) == 1:
        main_body = html_list[0].xpath('./body')[0]
    else:
        try:
            main_body = html_list[0].xpath('./body')[0]
        except IndexError:
            # no body content. Most likely just an image/appendix
            return None
    # Convert ElementTree back to string.
    # In this way we will lose the 'style' info in html_list[0][0], which is usually in the header,
    # but not sure if it will cause any differences to the parser later on. Probably not.
    html_str = lxml.etree.tostring(main_body)
    if out_htmlpath is not None:
        with open(out_htmlpath, 'wb') as tmp:
            tmp.write(html_str.encode('utf-8'))
    return html_str
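A hedged usage sketch for merge_docx(); the paths are placeholders. Worth noting: lxml.etree.tostring returns bytes on Python 3, so the html_str.encode('utf-8') call in the out_htmlpath branch assumes Python 2 strings; under Python 3 you would write the bytes out directly.

# Hypothetical input paths; entries that fail to convert are silently skipped.
merged_html = merge_docx(['chapter1.docx', 'chapter2.docx'])
if merged_html is None:
    print('Nothing could be converted or merged.')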
def file_change(input, output='out.html'):
    html = PyDocX.to_html(input)
    # the title
    html = html.replace('pydocx-center', 'pydocx-center subtitle', -1)
    # the subtitle
    html = html.replace('pydocx-center subtitle', 'pydocx-center title', 2)
    with open(output, 'w', encoding='utf-8') as f:
        f.write(html)
def docx2html(docx_filepath, html_filename=None):
    with open(docx_filepath, 'rb') as docx_file:
        html = PyDocX.to_html(docx_file)
    xmltree = etree.fromstring(html)
    prettyxml = etree.tostring(xmltree, pretty_print=True)
    if html_filename is None:
        html_filename = os.path.splitext(docx_filepath)[0] + '.html'
    with open(html_filename, 'w+b') as html_file:
        html_file.write(prettyxml)
def docx2pdf(filepath, dirpath, filename):
    # the result is actually HTML, lol
    # wdFormatPDF = 17
    in_file = filepath
    out_path = dirpath + '/Webpages'
    create_dir(out_path)
    html = PyDocX.to_html(in_file)
    f = open(filename, 'w', encoding="utf-8")
    f.write(html)
    f.close()
def insert_document():
    msg = ''
    if request.method == 'POST':
        uploaded_file = request.files['file']
        uploaded_file.save(secure_filename(uploaded_file.filename))
        html = PyDocX.to_html(open(uploaded_file.filename, 'rb'))
        file_name = str(uploaded_file.filename)
        db[file_name].insert({"filename": file_name, "HTML": html})
        parse_store_content(file_name)
        return "File successfully uploaded"
def insert_document():
    if request.method == 'POST':
        f = request.files['file']
        f.save(secure_filename(f.filename))
        html = PyDocX.to_html(open(f.filename, 'rb'))
        db.greensheetdocs.insert({
            "filename": "CMPE273",
            "HTML": html
        })
        return "Successfully Uploaded"
def convert_docx_html(srcfile):
    """
    Convert a docx file to an html file.
    :param srcfile: path to the docx file
    :return:
    """
    html = PyDocX.to_html(srcfile)
    name = srcfile[:srcfile.rfind(".")]
    f = open(name.encode("gbk") + ".html", 'w')
    f.write(html.encode("utf-8"))
    f.close()
def docx2html(_path_docx):
    # html = PyDocX.to_html("test.docx")
    # html = PyDocX.to_html(r'E:\\3101A0CV-20170615-H3C RPS800-A 用户手册-6PW101\06-正文.docx')
    html = PyDocX.to_html(_path_docx)
    # f = open("test.html", 'w', encoding="utf-8")
    path_html = _path_docx.split("\\")[-1].split(".")[0] + ".html"
    f = open(path_html, 'w', encoding="utf-8")
    f.write(html)
    f.close()
    return path_html
def get_img(file): """ :param file:文件对象 :return:返回文档中的图片 """ html = PyDocX.to_html(file) print('html{}'.format(html)) soup = BeautifulSoup(html, 'lxml') images_in_para = [] images_in_table = [] for img in soup.find_all('img'): if img.find_parents('table') != []: img['src'] = "$$$$$$$$$$$$$$$$" images_in_table.append(str(img)) else: img['src'] = "aaaaaaaaaaaa" images_in_para.append(str(img)) # reg = re.compile('data.*?/(.*?);', re.S) # style_img = reg.findall(img['src'])[0] # strg = img['src'].replace("data:image/wmf;base64,", "").replace("data:image/jpeg;base64,", "") # byte = base64.urlsafe_b64decode(strg) # t0 = int(round(time.time() * 1000)) # tmp_path = '/tmp/%d.%s' % (t0, str(style_img)) # with open(tmp_path, 'wb') as file: # file.write(byte) # if style_img == 'wmf': # t1 = int(round(time.time() * 1000)) # png_path = '/tmp/%d.png' % t1 # os.system('convert %s %s' % (tmp_path, png_path)) # f = open(png_path, 'rb') # url = put(f) # f.close() # img['src'] = url # if img.find_parents('table') != []: # images_in_table.append(img) # else: # images_in_para.append(img) # os.remove(png_path) # os.remove(tmp_path) # else: # f = open(tmp_path, 'rb') # url = put(f) # f.close() # img['src'] = url # if img.find_parents('table') != []: # images_in_table.append(img) # else: # images_in_para.append(img) # os.remove(tmp_path) return images_in_para, images_in_table
def to_html():
    # transfer all docx files into html through this for loop.
    for one_file in glob.glob('*.docx'):
        # extract the name of each employee
        name = re.compile(r'(.*?)员工履历').findall(one_file)[0]
        # extract content of docx file
        html = PyDocX.to_html(one_file)
        # create a html file
        f = open(name + '.html', 'w', encoding='utf-8')
        # save the content into this html file.
        f.write(html)
        # save and close the html file.
        f.close()
def docx_ol(request, file_id):
    file_info = FileInfo.objects.get(id=file_id)
    init_path = file_info.file_path
    html_path = file_info.file_path.split('.')[0] + '.html'
    html_name = file_info.file_name.split('.')[0] + '.html'
    # if file_info.file_type == 'doc':
    #     doc2x(file_info.file_path)
    #     init_path = file_info.file_path.split('.')[0] + '.docx'
    # pythoncom.CoInitialize()
    html = PyDocX.to_html(init_path)
    f = open(html_path, 'w', encoding="utf-8")
    f.write(html)
    f.close()
    shutil.copy(html_path, 'D:\\python\\jzyy\\fileserver\\templates\\fileserver\\ol')
    return render(request, 'fileserver/ol/%s' % html_name)
def upload(request):
    if request.method == 'POST':
        # get the uploaded file object
        file = request.FILES.get("file", None)
        print(file.name)
        bs = base64.b64decode(file.name)
        filename = str(bs, 'ISO-8859-1')
        print(filename)
        extension = fileExtension(filename)
        print(extension)
        html = None
        if (extension == '.docx' or extension == '.doc'):
            html = PyDocX.to_html(file)
        if (extension == '.xls' or extension == '.xlsx'):
            xd = pd.ExcelFile(file)
            df = xd.parse()
            html = df.to_html(header=True, index=False)
        return HttpResponse(html)
def docx_to_html(filepath, overwrite=False):
    """
    Converts docx file to in-memory html string
    :param filepath: full path to the file to convert
    :return: unicode string
    """
    html_file = '{}.html'.format(filepath)
    if not os.path.exists(html_file) or overwrite:
        # res = pydocx.docx2html(filepath)
        res = PyDocX.to_html(filepath)
        with open(html_file, 'wb') as tmp:
            tmp.write(res.encode('utf-8'))
    else:
        with open(html_file, 'rb') as tmp:
            res = tmp.read().decode('utf-8')
    return res
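A short usage sketch of the caching behaviour above; 'report.docx' is a placeholder path. The first call converts and writes report.docx.html next to the source; later calls read that cached file unless overwrite=True forces a fresh conversion.

first = docx_to_html('report.docx')                    # converts and writes report.docx.html
second = docx_to_html('report.docx')                   # served from the cached .html file
forced = docx_to_html('report.docx', overwrite=True)   # re-runs the PyDocX conversion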
def index(request):
    if request.method == "GET":
        papername = request.GET.get('filename', None)
        paper = PaperGrade.objects.get(PaperName=papername)
        username = request.session.get('username', None)
        user = UserInfo.objects.get(Name=username)
        user.PaperChecking = papername
        user.save()
        if paper.file_type() == 'docx':
            html = PyDocX.to_html(paper)
            return render(request, html)  # not sure whether this is right
        else:
            return render(
                request, 'All/index.html',
                {'filename': r'/File/' + paper.PaperName})  # still doubting myself over this URL format
    else:
        return render(request, 'All/index.html')
    return render(request, 'All/index.html')
def _create_pdf(self, file):
    _, extension = os.path.splitext(file.file.name)
    aws_response = requests.get(self._get_aws_url(file.file, 5))
    content = ContentFile(aws_response.content)
    if extension in ['.xlsx', '.xls', '.xlb']:
        preview_file = f'preview;name;data:application/vnd.openxmlformats-officedocument.spreadsheetml.sheet;base64,{base64.b64encode(content.read()).decode("utf-8")}'
        return Response(
            {
                'preview': preview_file,
                'extension': extension,
                'title': file.name
            },
            status=status.HTTP_200_OK)
    elif extension == '.pdf':
        preview_file = f'preview;name;data:application/pdf;base64,{base64.b64encode(content.read()).decode("utf-8")}'
        return Response(
            {
                'preview': preview_file,
                'extension': extension,
                'title': file.name
            },
            status=status.HTTP_200_OK)
    elif extension == '.docx':
        html = PyDocX.to_html(content)
    elif extension == '.rtf':
        html = rtf.getHtml(content.read().decode('UTF-8'))
    else:
        raise ValidationError(
            "File is required, expected format: pdf, doc, docx, xls, xlsx, rtf"
        )
    response = HttpResponse()
    response['Content-Disposition'] = 'attachment; filename="report.pdf"'
    pisa.CreatePDF(html, dest=response)
    preview_file = f'preview;name;data:application/pdf;base64,{base64.b64encode(response.content).decode("utf-8")}'
    return Response(
        {
            'preview': preview_file,
            'extension': extension,
            'title': file.name
        },
        status=status.HTTP_200_OK)
def get_img(file):
    html = PyDocX.to_html(file)
    soup = BeautifulSoup(html, 'lxml')
    images_in_para = []
    images_in_table = []
    for img in soup.find_all('img'):
        reg = re.compile('data.*?/(.*?);', re.S)
        style_img = reg.findall(img['src'])[0]
        strg = img['src'].replace("data:image/wmf;base64,", "").replace("data:image/jpeg;base64,", "")
        byte = base64.urlsafe_b64decode(strg)
        t0 = int(round(time.time() * 1000))
        tmp_path = '/tmp/%d.%s' % (t0, str(style_img))
        with open(tmp_path, 'wb') as file:
            file.write(byte)
        if style_img == 'wmf':
            t1 = int(round(time.time() * 1000))
            png_path = '/tmp/%d.png' % t1
            os.system('convert %s %s' % (tmp_path, png_path))
            f = open(png_path, 'rb')
            url = put(f)
            f.close()
            img['src'] = url
            if img.find_parents('table') != []:
                images_in_table.append(img)
            else:
                images_in_para.append(img)
            os.remove(png_path)
            os.remove(tmp_path)
        else:
            f = open(tmp_path, 'rb')
            url = put(f)
            f.close()
            img['src'] = url
            if img.find_parents('table') != []:
                images_in_table.append(img)
            else:
                images_in_para.append(img)
            os.remove(tmp_path)
    return images_in_para, images_in_table
def get_img(file):
    html = PyDocX.to_html(file)
    soup = BeautifulSoup(html, 'lxml')
    images_in_para = []
    images_in_table = []
    for img in soup.find_all('img'):
        reg = re.compile('data.*?/(.*?);', re.S)
        style_img = reg.findall(img['src'])[0]
        strg = re.sub(re.compile("data:[/\w]*;base64,", re.S | re.I), '', img['src'])
        byte = base64.urlsafe_b64decode(strg)
        t0 = str(uuid.uuid4()) or int(round(time.time() * 1000))
        tmp_path = '/tmp/{}.{}'.format(t0, str(style_img))
        with open(tmp_path, 'wb') as file:
            file.write(byte)
        if style_img == 'wmf':
            t1 = str(uuid.uuid4()) or int(round(time.time() * 1000))
            png_path = '/tmp/{}.png'.format(t1)
            os.system('convert %s %s' % (tmp_path, png_path))
            with open(png_path, 'rb') as f:
                url = put(f)
            img['src'] = url
            if img.find_parents('table'):
                images_in_table.append(img)
            else:
                images_in_para.append(img)
            os.remove(png_path)
            os.remove(tmp_path)
        else:
            with open(tmp_path, 'rb') as f:
                url = put(f)
            img['src'] = url
            if img.find_parents('table'):
                images_in_table.append(img)
            else:
                images_in_para.append(img)
            os.remove(tmp_path)
    return images_in_para, images_in_table
def word2html(htmlfile, wordfile):
    html = PyDocX.to_html(wordfile)
    f = open(htmlfile, 'w', encoding="utf-8")
    f.write(html)
    f.close()
    print("data end")
def gui_parse():
    global __file__                          # to fix stupid
    __file__ = os.path.abspath(__file__)     # __file__ handling
    _file_ = os.path.basename(__file__)      # in python 2
    global debug
    root = Tk()
    root.withdraw()
    sys.stderr = codecs.getwriter('utf8')(sys.stderr)
    parser = argparse.ArgumentParser()
    parser.add_argument('filename', nargs='?')
    parser.add_argument('--debug', '-d', action='store_true')
    args = parser.parse_args()
    if args.debug:
        debug = True
    if args.filename is None:
        args.filename = tkFileDialog.askopenfilename(
            filetypes=[
                ('Word 2007+', '*.docx'),
                ('Plain text', '*.txt'),
            ])
    os.chdir(os.path.dirname(os.path.abspath(args.filename)))
    if os.path.splitext(args.filename)[1] == '.txt':
        with codecs.open(args.filename, 'r', 'utf8') as input_file:
            input_text = input_file.read()
        input_text = input_text.replace('\r', '')
        final_structure = chgk_parse(input_text)
    elif os.path.splitext(args.filename)[1] == '.docx':
        from pydocx import PyDocX
        from bs4 import BeautifulSoup
        from parse import parse
        import base64
        import html2text
        input_docx = PyDocX.to_html(args.filename)
        bsoup = BeautifulSoup(input_docx)
        if args.debug:
            with codecs.open('debug.pydocx', 'w', 'utf8') as dbg:
                dbg.write(input_docx)

        def generate_imgname(ext):
            imgcounter = 1
            while os.path.isfile('{:03}.{}'.format(imgcounter, ext)):
                imgcounter += 1
            return '{:03}.{}'.format(imgcounter, ext)

        for tag in bsoup.find_all('style'):
            tag.extract()
        for tag in bsoup.find_all('p'):
            if tag.string:
                tag.string = tag.string + SEP
        for tag in bsoup.find_all('b'):
            tag.unwrap()
        for tag in bsoup.find_all('strong'):
            tag.unwrap()
        for tag in bsoup.find_all('i'):
            tag.string = '_' + tag.string + '_'
            tag.unwrap()
        for tag in bsoup.find_all('em'):
            tag.string = '_' + tag.string + '_'
            tag.unwrap()
        for tag in bsoup.find_all('li'):
            if tag.string:
                tag.string = '- ' + tag.string
        for tag in bsoup.find_all('img'):
            imgparse = parse('data:image/{ext};base64,{b64}', tag['src'])
            imgname = generate_imgname(imgparse['ext'])
            tag.insert_before('(img {})'.format(imgname))
            if not args.debug:
                with open(imgname, 'wb') as f:
                    f.write(base64.b64decode(imgparse['b64']))
            tag.extract()
        for tag in bsoup.find_all('a'):
            if rew(tag.string) == '':
                tag.extract()
            else:
                tag.string = tag['href']
                tag.unwrap()
        h = html2text.HTML2Text()
        h.body_width = 0
        txt = (h.handle(bsoup.prettify())
               .replace('\\-', '')
               .replace('\\.', '.')
               .replace('( ', '(')
               .replace('[ ', '[')
               .replace(' )', ')')
               .replace(' ]', ']')
               .replace(' :', ':')
               )
        if args.debug:
            with codecs.open('debug.debug', 'w', 'utf8') as dbg:
                dbg.write(txt)
        final_structure = chgk_parse(txt)
    else:
        sys.stderr.write('Error: unsupported file format.' + SEP)
        sys.exit()
    os.chdir(os.path.dirname(os.path.abspath(__file__)))
    with codecs.open(make_filename(args.filename), 'w', 'utf8') as output_file:
        output_file.write(compose_4s(final_structure))
    print('Please review the resulting file {}:'.format(make_filename(args.filename)))
    subprocess.call(shlex.split('{} "{}"'.format(
        TEXTEDITOR, make_filename(args.filename)).encode('cp1251', errors='replace')))
from pydocx import PyDocX html = PyDocX.to_html("111.docx") f = open("test.html", 'w', encoding="utf-8") f.write(html) f.close()
def convert(self):
    self._raw = PyDocX.to_html(self.path)
    bs = BeautifulSoup(self._raw, 'html.parser')
    self.data = bs.body
def papertest():
    html = PyDocX.to_html('./paperPDF/' + 'bitcoin.docx')
    print(html)
    return html
import os, sys
from pydocx import PyDocX

html = PyDocX.to_html("2.docx")
print(html)
from pydocx import PyDocX """ 旧包:docx2html,此包要求python版本较低 新包路径如下: https://github.com/CenterForOpenScience/pydocx """ docx_file = r'D:\十二刻度-个人信息及隐私政策.docx' html_fle = r'D:\result.html' html = PyDocX.to_html(docx_file) with open(html_fle, encoding='UTF-8', mode='w') as f: f.write(html)