def test_lists_with_styles():
    # Each nesting level of the list is expected to carry its own
    # data-list-type value.
    fixtures_dir = path.join(
        path.abspath(path.dirname(__file__)), '..', 'fixtures')
    docx_path = path.join(fixtures_dir, 'lists_with_styles.docx')
    expected_html = ''' <html> <ol data-list-type="decimal"> <li>AAA</li> <li>BBB <ol data-list-type="lower-roman"> <li>CCC</li> <li>DDD <ol data-list-type="upper-alpha"> <li>EEE <ol data-list-type="lower-alpha"> <li>FFF</li> </ol> </li> </ol> </li> </ol> </li> </ol> </html> '''
    assert_html_equal(convert(docx_path), expected_html)
def docx_converter():
    """Interactively convert every .docx file under a folder to HTML.

    Prompts for a source folder and an output folder, walks the source tree
    and writes one converted file per .docx document into the output folder.
    An invalid path prints a message and restarts the prompts. The loop has
    no exit condition, so the prompts repeat after each conversion run.
    """
    while True:
        docx_path = raw_input('Please enter path to folder containing .docx files:')
        if not os.path.exists(docx_path):
            print('Please enter a valid path.')
            continue

        write_path = raw_input('Please enter path to write html files to (conversion begins automatically): ')
        if not os.path.exists(write_path):
            print('Please enter a valid path.')
            continue

        for subdir, _dirs, files in os.walk(docx_path):
            for name in files:
                stem, ext = os.path.splitext(name)
                if ext.lower() != '.docx':
                    continue
                # os.walk descends into sub-folders, so the file lives in
                # `subdir`, not necessarily directly in `docx_path`.
                file_path = os.path.join(subdir, name)
                html = convert(file_path, image_handler=handle_image)
                print('Converting: ' + name)
                # Give new html file same name as .docx original (the
                # extension is dropped and none is added).
                html_file_path = os.path.join(write_path, stem)
                with open(html_file_path, 'w') as html_file:
                    html_file.write(html)
def test_tables_in_lists():
    # A table placed inside a list item should be rendered inside that <li>.
    here = path.abspath(path.dirname(__file__))
    docx_path = path.join(here, '..', 'fixtures', 'tables_in_lists.docx')
    rendered = convert(docx_path)
    expected = ''' <html> <ol data-list-type="decimal"> <li>AAA</li> <li>BBB<br /> <table> <tr> <td>CCC</td> <td>DDD</td> </tr> <tr> <td>EEE</td> <td>FFF</td> </tr> </table> </li> <li>GGG</li> </ol> </html> '''
    assert_html_equal(rendered, expected)
def test_bigger_font_size_to_header():
    # Demonstrates in which places a larger font size turns a p tag into an
    # h tag. Skipped entirely when font size detection is switched off.
    if not DETECT_FONT_SIZE:
        raise SkipTest('Font size detection is disabled.')

    here = path.abspath(path.dirname(__file__))
    docx_path = path.join(
        here, '..', 'fixtures', 'bigger_font_size_to_header.docx')
    expected = ''' <html> <p>Paragraphs:</p> <h2>Header</h2> <p>paragraph 1</p> <p>Lists:</p> <ol data-list-type="decimal"> <li>bigger</li> <li>smaller</li> </ol> <p>Tables:</p> <table> <tr> <td>bigger</td> <td>smaller</td> </tr> </table> </html> '''
    assert_html_equal(convert(docx_path), expected)
def test_shift_enter():
    # Only convert() is exercised here (no clean_html) so that the first
    # break tag is still present in the output.
    fixtures_dir = path.join(
        path.abspath(path.dirname(__file__)), '..', 'fixtures')
    rendered = convert(path.join(fixtures_dir, 'shift_enter.docx'))
    expected = ''' <html> <p>AAA<br />BBB</p> <p>CCC</p> <ol data-list-type="decimal"> <li>DDD<br />EEE</li> <li>FFF</li> </ol> <table> <tr> <td>GGG<br />HHH</td> <td>III<br />JJJ</td> </tr> <tr> <td>KKK</td> <td>LLL</td> </tr> </table> </html> '''
    assert_html_equal(rendered, expected)
def test_convert_p_to_h():
    # Demonstrates when bold/italics make it correct to turn a p tag into an
    # h tag.
    here = path.abspath(path.dirname(__file__))
    docx_path = path.join(here, '..', 'fixtures', 'convert_p_to_h.docx')
    expected = ''' <html> <h2>AAA</h2> <h2>BBB</h2> <p>CCC</p> <ol data-list-type="decimal"> <li><strong>DDD</strong></li> <li><em>EEE</em></li> <li>FFF</li> </ol> <table> <tr> <td><strong>GGG</strong></td> <td><em>HHH</em></td> </tr> <tr> <td>III</td> <td>JJJ</td> </tr> </table> </html> '''
    assert_html_equal(convert(docx_path), expected)
def test_fall_back():
    # For a 'test.doc' path, convert() is expected to hand back whatever the
    # supplied fall_back callable returns.
    def always_success(*args, **kwargs):
        return 'success'

    result = convert(
        'test.doc',
        fall_back=always_success,
        converter=_converter,
    )
    assert result == 'success'
def docxView(request):
    """Render a stored .docx or .xlsx file as HTML.

    Reads the ``f`` query parameter as the primary key of a ``PM_Files`` row.
    Files of type ``'docx'`` go through docx2html (embedded images are copied
    into ``MEDIA_ROOT``); files of type ``'xlsx'`` go through ``excelToHtml``.

    Returns:
        ``HttpResponse(html)``, where ``html`` stays ``None`` when the
        parameter is missing or malformed, the row does not exist, or the
        file type is neither of the two handled ones.
    """
    # TODO: rewrite this whole view properly.
    from shutil import copyfile
    from docx2html import convert

    def handle_image(image_id, relationship_dict):
        image_path = relationship_dict[image_id]
        # Copy the extracted image into MEDIA_ROOT so it can be served.
        _, filename = os.path.split(image_path)
        destination_path = os.path.join(MEDIA_ROOT, filename)
        copyfile(image_path, destination_path)
        # Return the `src` attribute to be used in the img tag
        return '/protected%s%s' % (MEDIA_URL, filename)

    fp = request.GET.get('f', None)
    html = None
    if fp:
        # Only the lookup is guarded: a non-numeric id (ValueError) is
        # treated like an unknown id instead of raising out of the view.
        try:
            pm_file = PM_Files.objects.get(pk=int(fp))
        except (ValueError, PM_Files.DoesNotExist):
            pm_file = None

        if pm_file is not None:
            if pm_file.type == 'docx':
                html = convert(
                    str(pm_file.file.path), image_handler=handle_image)
            elif pm_file.type == 'xlsx':
                html = excelToHtml(str(pm_file.file.path))

    return HttpResponse(html)
def test_nested_tables():
    # A table nested in a cell of another table.
    fixtures_dir = path.join(
        path.abspath(path.dirname(__file__)), '..', 'fixtures')
    rendered = convert(path.join(fixtures_dir, 'nested_tables.docx'))
    # Find out why br tag is there.
    expected = ''' <html> <table> <tr> <td>AAA</td> <td>BBB</td> </tr> <tr> <td>CCC</td> <td> <table> <tr> <td>DDD</td> <td>EEE</td> </tr> <tr> <td>FFF</td> <td>GGG</td> </tr> </table> <br /> </td> </tr> </table> </html> '''
    assert_html_equal(rendered, expected)
def test_list_to_header():
    here = path.abspath(path.dirname(__file__))
    docx_path = path.join(here, '..', 'fixtures', 'list_to_header.docx')
    rendered = convert(docx_path)
    # Note: in the word document the `GGG` list item is upper roman, which
    # shows that only top level upper romans get converted.
    expected = ''' <html> <h2>AAA</h2> <ol data-list-type="decimal"> <li>BBB</li> </ol> <h2>CCC</h2> <ol data-list-type="decimal"> <li>DDD</li> </ol> <h2>EEE</h2> <ol data-list-type="decimal"> <li>FFF <ol data-list-type="upper-roman"> <li>GGG</li> </ol> </li> </ol> </html> '''
    assert_html_equal(rendered, expected)
def test_extract_html():
    # Simple document: one paragraph, one ordered list and one table.
    fixtures_dir = path.join(
        path.abspath(path.dirname(__file__)), '..', 'fixtures')
    docx_path = path.join(fixtures_dir, 'simple.docx')
    expected = ''' <html> <p> Simple text </p> <ol data-list-type="decimal"> <li>one</li> <li>two</li> <li>three</li> </ol> <table> <tr> <td>Cell1</td> <td>Cell2</td> </tr> <tr> <td>Cell3</td> <td>cell4</td> </tr> </table> </html> '''
    assert_html_equal(convert(docx_path), expected)
def test_nested_table_rowspan():
    # A rowspan inside a nested table, with a colspan in the outer table.
    here = path.abspath(path.dirname(__file__))
    docx_path = path.join(
        here, '..', 'fixtures', 'nested_table_rowspan.docx')
    rendered = convert(docx_path)
    expected = ''' <html> <table> <tr> <td colspan="2">AAA</td> </tr> <tr> <td>BBB</td> <td> <table> <tr> <td rowspan="2">CCC</td> <td>DDD</td> </tr> <tr> <td>EEE</td> </tr> </table> <br /> </td> </tr> </table> </html> '''
    assert_html_equal(rendered, expected)
def test_has_title():
    # The has_title fixture is expected to yield a single paragraph.
    fixtures_dir = path.join(
        path.abspath(path.dirname(__file__)), '..', 'fixtures')
    rendered = convert(path.join(fixtures_dir, 'has_title.docx'))
    expected = '''<html><p>Text</p></html>'''
    assert_html_equal(rendered, expected)
def test_unicode():
    # Converting the greek alphabet fixture only has to produce a result;
    # the content itself is not checked.
    here = path.abspath(path.dirname(__file__))
    docx_path = path.join(here, '..', 'fixtures', 'greek_alphabet.docx')
    assert convert(docx_path) is not None
def convert_doc_to_html(input, outdir):
    """Convert a .docx file to an HTML string with entities unescaped.

    Args:
        input: Path of the .docx file to convert.
        outdir: Output directory; only echoed in the progress message, no
            file is written by this function.

    Returns:
        The HTML produced by docx2html after HTML entities have been
        unescaped. Previously the unescaped value was computed and then
        discarded, so the function returned nothing.
    """
    print(input, '-->', outdir)
    # An earlier approach shelled out to LibreOffice
    # (`soffice --headless --convert-to html:XHTML Writer File:UTF8`);
    # docx2html is used instead.
    html = convert(input)
    # docx2html escapes Chinese text as character references, so the
    # generated string has to be unescaped again before use.
    return HTMLParser.HTMLParser().unescape(html)
def test_special_chars():
    # Special characters in body text and in a hyperlink target.
    fixtures_dir = path.join(
        path.abspath(path.dirname(__file__)), '..', 'fixtures')
    rendered = convert(path.join(fixtures_dir, 'special_chars.docx'))
    expected = ''' <html><p>& < > <a href="https://www.google.com/?test=1&more=2">link</a></p></html>'''  # noqa
    assert_html_equal(rendered, expected)
def test_inline_tags():
    # Bold, italics, underline and a hyperlink inside one sentence.
    here = path.abspath(path.dirname(__file__))
    docx_path = path.join(here, '..', 'fixtures', 'inline_tags.docx')
    expected = ''' <html><p>This sentence has some <strong>bold</strong>, some <em>italics</em> and some <strong>underline</strong>, as well as a <a href="http://www.google.com/">hyperlink</a>.</p></html>'''  # noqa
    assert_html_equal(convert(docx_path), expected)
def readdoc(sfp, dfp):
    """Convert a .docx file to HTML and write it to a UTF-8 text file.

    Args:
        sfp: Path of the source .docx file.
        dfp: Path of the destination file that receives the HTML.
    """
    print(sfp)
    print(dfp)
    docn = convert(sfp)
    html_parser = HTMLParser.HTMLParser()
    # The method is `unescape`; the previous spelling `enescape` does not
    # exist on HTMLParser and raised AttributeError.
    htmltemp = html_parser.unescape(docn)
    print('读取docx文件成功')
    with open(dfp, 'w', encoding='utf-8') as f:
        f.write(htmltemp)
    print('写入docx文件成功')
    # TODO: the HTML generated from the Word file still needs to be refined
    # according to the customer's requirements.
def test_html_files(patch_zip_handler, patch_read):
    # For .html/.htm inputs convert() must return the patched read result
    # and must never reach the zip file handler.
    def fail_if_called(*args, **kwargs):
        raise AssertionError('Should not have called get_zip_file_handler')

    def fake_read(*args, **kwargs):
        return 'test'

    patch_zip_handler.side_effect = fail_if_called
    patch_read.side_effect = fake_read

    # Try with an html file first, then again with an htm file.
    for html_path in ('test.html', 'test.htm'):
        result = convert(html_path)
        assert result == 'test'
def test_table_col_row_span():
    # Two tables mixing colspan and rowspan cells.
    fixtures_dir = path.join(
        path.abspath(path.dirname(__file__)), '..', 'fixtures')
    rendered = convert(path.join(fixtures_dir, 'table_col_row_span.docx'))
    expected = ''' <html> <table> <tr> <td colspan="2">AAA</td> </tr> <tr> <td rowspan="2">BBB</td> <td>CCC</td> </tr> <tr> <td>DDD</td> </tr> <tr> <td>EEE</td> <td rowspan="2">FFF</td> </tr> <tr> <td>GGG</td> </tr> </table> <table> <tr> <td>1</td> <td>2</td> <td>3</td> <td>4</td> </tr> <tr> <td>5</td> <td colspan="2" rowspan="2">6</td> <td>7</td> </tr> <tr> <td>8</td> <td>9</td> </tr> <tr> <td>10</td> <td>11</td> <td>12</td> <td>13</td> </tr> </table> </html> '''
    assert_html_equal(rendered, expected)
def test_track_changes_on():
    # Fixture saved with track changes on; a single paragraph is expected.
    here = path.abspath(path.dirname(__file__))
    docx_path = path.join(here, '..', 'fixtures', 'track_changes_on.docx')
    expected = ''' <html><p>This was some content.</p></html> '''
    assert_html_equal(convert(docx_path), expected)
def save_editor(self, request, pk=None, *args, **kwargs):
    """Save edited document content posted from the editor.

    Decrypts the stored file, converts it with ``convert`` into
    ``self.org_data``, optionally records an activity log entry, writes
    ``request.data['data']`` to an .html file in the owner's folder, runs
    ``unoconv`` on it, and re-encrypts the result.

    NOTE(review): the indentation of this method was not recoverable from the
    reviewed source; only the activity-log statements are shown inside the
    ``if`` below. Confirm against the original file.
    NOTE(review): ``pk`` defaults to None, but ``'where id=' + pk`` requires a
    string; presumably the router always supplies it. Verify.
    """
    file_res = File.objects.get(id=pk)
    # NOTE(review): the `where` clause is built by string concatenation with
    # `pk`; if `crud.get` executes it as SQL this is injectable. Confirm.
    # NOTE(review): '123' is a hard-coded passphrase, presumably the
    # encryption key expected by decrypt_file/encrypt_file.
    decrypt_file(
        os.path.join(settings.MEDIA_ROOT + str(
            crud.get(self.table, "*", 'where id=' + pk)[0].get('owner_id')) + '/' +
            base64.b16encode(file_res.modified_file_name)),
        os.path.join(settings.MEDIA_ROOT + file_res.modified_file_name), '123')
    # Convert the decrypted copy, then delete that copy again.
    self.org_data = convert(settings.MEDIA_ROOT + FileSerializer(File.objects.get(
        id=pk)).data.get('modified_file_name'))
    os.remove(
        os.path.join(settings.MEDIA_ROOT + file_res.modified_file_name))
    # NOTE(review): compares the converted HTML with the whole request
    # payload, while the content written below is request.data['data'].
    if self.org_data != request.data:
        # Activity log
        request_data = {}
        param = {'field': 'file_id', 'file_id': pk, 'label': 'version'}
        track_fields = {
            c.can_read: c.read,
            c.can_write: c.write,
            c.can_delete: c.delete
        }
        request_data.update({'user_id': request.user.id})
        log_view = LogView()
        log_view.generate_log(request_data, param, "", track_fields)
    # Write the posted content to <owner folder>/<name before first dot>.html.
    f = open(
        settings.MEDIA_ROOT + str(
            crud.get(self.table, "*", 'where id=' + pk)[0].get('owner_id')) + '/' +
        file_res.modified_file_name.split('.')[0] + '.html', 'w')
    f.write(request.data['data'].encode())
    f.close()
    # Changes the process-wide working directory; the os.getcwd() calls
    # further down rely on it.
    os.chdir(settings.MEDIA_ROOT + str(
        crud.get(self.table, "*", 'where id=' + pk)[0].get('owner_id')))
    # NOTE(review): the shell command is assembled from file names; names
    # containing shell metacharacters would be interpreted by the shell.
    os.system('unoconv --format=' + file_res.name.split('.')[-1] + ' ' +
              settings.MEDIA_ROOT + str(
        crud.get(self.table, "*", 'where id=' + pk)[0].get('owner_id')) + '/' +
        file_res.modified_file_name.split('.')[0] + '.html')
    # Fixed pause before cleaning up, presumably to let unoconv finish writing.
    time.sleep(3)
    os.remove(settings.MEDIA_ROOT + str(
        crud.get(self.table, "*", 'where id=' + pk)[0].get('owner_id')) + '/' +
        file_res.modified_file_name.split('.')[0] + '.html')
    # Encrypt the file in the working directory under its base16-encoded
    # name, then remove the plain file.
    encrypt_file(
        os.getcwd() + '/' + file_res.modified_file_name,
        os.getcwd() + '/' + base64.b16encode(file_res.modified_file_name),
        '123')
    os.remove(os.getcwd() + '/' + file_res.modified_file_name)
    return Response({"hai": 'hai'})
def main():
    """Command-line entry point: parse a .docx (or a .pdf converted to .docx).

    Usage: ``filename.py <file.pdf|file.docx> <Method#>``.

    With a single file argument the file is passed to ``DocxToPdf`` and the
    program then exits with the usage message (unchanged behaviour). With no
    arguments only the usage message is shown. Otherwise the source is
    converted to HTML and handed to ``Method1`` when the method argument is
    ``1`` and to ``Method2`` for any other value.
    """
    usage = (
        "Usage: filename.py mypresentation.pdf / mypresentation.docx Method#"
    )
    if len(sys.argv) < 3:
        # Guard the argv[1] access so a bare invocation prints the usage
        # text instead of raising IndexError.
        if len(sys.argv) == 2:
            DocxToPdf(sys.argv[1])
        sys.exit(usage)

    src = sys.argv[1]
    # argv values are strings; comparing against the integer 1 was always
    # False, so Method1 could never be selected.
    use_method1 = sys.argv[2].strip() == '1'

    if "docx" in src:
        docx_path = src
    else:
        basedir = os.path.dirname(os.path.realpath(__file__))
        pdfdir = os.path.normpath(basedir + '/pdf')
        docxdir = os.path.normpath(basedir + '/docx')
        lowriter = '/usr/bin/soffice'
        abspath_pdf = os.path.normpath(os.path.join(pdfdir, src))
        # Argument list instead of a shell string, so file names containing
        # quotes or spaces are passed through unchanged.
        subprocess.call([
            lowriter,
            '--infilter=writer_pdf_import',
            '--convert-to', 'docx',
            abspath_pdf,
            '--outdir', docxdir,
        ])
        # Kept from the original: pause before reading the converted file.
        time.sleep(5)
        docx_path = docxdir + '/' + src.split(".pdf")[0] + '.docx'

    html = convert(docx_path)
    Var_E = DataComputer(html)
    if use_method1:
        Method1(html, Var_E)
    else:
        Method2(html, Var_E)
def test_upper_alpha_all_bold():
    # Every item of the fixture is expected to come out as an h2.
    fixtures_dir = path.join(
        path.abspath(path.dirname(__file__)), '..', 'fixtures')
    rendered = convert(path.join(fixtures_dir, 'upper_alpha_all_bold.docx'))
    expected = ''' <html> <h2>AAA</h2> <h2>BBB</h2> <h2>CCC</h2> </html> '''
    assert_html_equal(rendered, expected)
def test_headers_with_full_line_styles():
    # When a natural header is bold/italic over its whole line, that
    # bold/italics is stripped; partial styling (third header) is kept.
    here = path.abspath(path.dirname(__file__))
    docx_path = path.join(
        here, '..', 'fixtures', 'headers_with_full_line_styles.docx')
    expected = ''' <html> <h2>AAA</h2> <h2>BBB</h2> <h2><strong>C</strong><em>C</em>C</h2> </html> '''
    assert_html_equal(convert(docx_path), expected)
def test_simple_list():
    # One ordered list followed by one unordered list.
    fixtures_dir = path.join(
        path.abspath(path.dirname(__file__)), '..', 'fixtures')
    rendered = convert(path.join(fixtures_dir, 'simple_lists.docx'))
    expected = ''' <html> <ol data-list-type="decimal"> <li>One</li> </ol> <ul> <li>two</li> </ul> </html> '''
    assert_html_equal(rendered, expected)
def test_split_headers():
    filename = 'split_header.docx'
    here = path.abspath(path.dirname(__file__))
    source_path = path.join(here, '..', 'fixtures', filename)

    def image_handler(*args, **kwargs):
        return 'test'

    # preserve_images has to be true or the image gets removed. build_import
    # normally takes care of that; here it must be set to True by hand.
    tmp_docx_path, _ = _copy_file_to_tmp_dir(source_path, filename)

    rendered = convert(tmp_docx_path, image_handler=image_handler)
    expected = ''' <html><h2>AAA</h2><p>BBB</p><h2>CCC</h2></html> '''
    assert_html_equal(rendered, expected)
def test_has_image():
    filename = 'has_image.docx'
    here = path.abspath(path.dirname(__file__))
    source_path = path.join(here, '..', 'fixtures', filename)
    # preserve_images has to be true or the image gets removed. build_import
    # normally takes care of that; here it must be set to True by hand.
    tmp_docx_path, dp = _copy_file_to_tmp_dir(source_path, filename)
    rendered = convert(tmp_docx_path)
    # The img src is expected to point below the second value returned by
    # _copy_file_to_tmp_dir.
    expected = ''' <html> <p>AAA<img src="%s/word/media/image1.gif" height="55" width="260" /></p> </html> ''' % dp
    assert_html_equal(rendered, expected)
def test_nested_list():
    # Ordered and unordered lists nested several levels deep.
    fixtures_dir = path.join(
        path.abspath(path.dirname(__file__)), '..', 'fixtures')
    rendered = convert(path.join(fixtures_dir, 'nested_lists.docx'))
    expected = ''' <html> <ol data-list-type="decimal"> <li>one</li> <li>two</li> <li>three <ol data-list-type="decimal"> <li>AAA</li> <li>BBB</li> <li>CCC <ol data-list-type="decimal"> <li>alpha</li> </ol> </li> </ol> </li> <li>four</li> </ol> <ol data-list-type="decimal"> <li>xxx <ol data-list-type="decimal"> <li>yyy</li> </ol> </li> </ol> <ul> <li>www <ul> <li>zzz</li> </ul> </li> </ul> </html> '''
    assert_html_equal(rendered, expected)
def test_has_image_using_image_handler():
    filename = 'has_image.docx'
    here = path.abspath(path.dirname(__file__))
    source_path = path.join(here, '..', 'fixtures', filename)

    def image_handler(*args, **kwargs):
        return 'test'

    # preserve_images has to be true or the image gets removed. build_import
    # normally takes care of that; here it must be set to True by hand.
    tmp_docx_path, _ = _copy_file_to_tmp_dir(source_path, filename)

    rendered = convert(tmp_docx_path, image_handler=image_handler)
    # The handler's return value is expected to be used as the img src.
    expected = ''' <html><p>AAA<img src="test" height="55" width="260" /></p></html> '''
    assert_html_equal(rendered, expected)
def test_fake_headings_by_length():
    # Turning p tags into h tags is subject to a length limit: a paragraph
    # that would otherwise become an h tag is left alone when it has more
    # than seven words.
    here = path.abspath(path.dirname(__file__))
    docx_path = path.join(
        here, '..', 'fixtures', 'fake_headings_by_length.docx')
    expected = ''' <html> <h2>Heading.</h2> <h2>Still a heading.</h2> <p> <strong>This is not a heading because it is too many words.</strong> </p> </html> '''
    assert_html_equal(convert(docx_path), expected)
def course_syllabus(request, slug):
    """Handle a syllabus upload for a course and store its HTML rendering.

    On a valid POST each form of the formset is saved when its ``syllabus``
    value contains '.docx'; otherwise a JSON ``{"response": false}`` is
    returned immediately. The stored file is then fetched from S3 into
    ``BASE_DIR``, converted with ``convert`` and saved on ``obj.html``.

    NOTE(review): the AWS key, secret and bucket name are redacted
    placeholders in the reviewed source (``= #key`` etc.) and are not valid
    Python as written.
    NOTE(review): the original indentation was not recoverable; the nesting
    below is a reconstruction. No return is visible for non-POST requests or
    for an invalid formset. Confirm against the original file.
    """
    course = Course.objects.get(slug=slug)
    syllabus = Syllabus.objects.filter(course=course,user=request.user)
    # Formset limited to one syllabus form (extra=1, max_num=1).
    SyllabusFormset = modelformset_factory(Syllabus, form=SyllabusForm, extra=1, max_num=1)
    formset_syl = SyllabusFormset(request.POST or None, request.FILES or None, queryset=syllabus)
    # Two directory levels above this file.
    BASE_DIR = os.path.dirname(os.path.dirname(__file__))
    print tempfile.gettempdir()
    if request.method == 'POST':
        if formset_syl.is_valid():
            for form in formset_syl:
                form1 = form.save(commit=False)
                form1.user = request.user
                form1.path = request.get_full_path()
                form1.course = course
                # Substring check on the stored file name, not a strict
                # extension check.
                if '.docx' in str(form1.syllabus):
                    form1.save()
                else:
                    data = {}
                    data['response'] = False
                    new_data = json.dumps(data)
                    return HttpResponse(new_data, content_type='application/json')
            AWS_KEY = #key
            AWS_SECRET = #secret
            aws_connection = S3Connection(AWS_KEY, AWS_SECRET)
            obj = Syllabus.objects.get(course=course,user=request.user)
            bucket_name = #bucket
            key = aws_connection.get_bucket(bucket_name).get_key('media/' + str(obj.syllabus))
            # Download the uploaded file next to BASE_DIR under its key name,
            # then convert that local copy.
            res = key.get_contents_to_filename(BASE_DIR +'/'+key.name)
            html = convert(BASE_DIR +'/'+key.name)
            obj.html = html
            obj.save()
            response_data = {}
            response_data['response'] = True
            new_data = json.dumps(response_data)
            return HttpResponse(new_data, content_type='application/json')
def test_headers():
    # Word heading levels are shifted down by one (H1 -> h2) and everything
    # from H5 onwards ends up as h6.
    fixtures_dir = path.join(
        path.abspath(path.dirname(__file__)), '..', 'fixtures')
    rendered = convert(path.join(fixtures_dir, 'headers.docx'))
    expected = ''' <html> <h2>This is an H1</h2> <h3>This is an H2</h3> <h4>This is an H3</h4> <h5>This is an H4</h5> <h6>This is an H5</h6> <h6>This is an H6</h6> <h6>This is an H7</h6> <h6>This is an H8</h6> <h6>This is an H9</h6> <h6>This is an H10</h6> </html> '''
    assert_html_equal(rendered, expected)
def conv_core(docx_filename_path, docx_filename, op_mode):
    """Convert one .docx file into a templated page at ``html/<name>.html``.

    Args:
        docx_filename_path: Path of the .docx file to convert.
        docx_filename: Base name used for the output page, the navigation
            link and the copied image file names.
        op_mode: When truthy, wait for the user to press Enter after a
            successful conversion.

    If the input file does not exist an error is printed and
    ``convert_engine()`` is called instead.
    NOTE(review): the original indentation was not recoverable;
    ``convert_engine()`` is assumed to belong to the error branch only.
    Confirm against the original file.
    """
    separator = '******************************************************'

    if not os.path.isfile(docx_filename_path):
        print(separator)
        print('ERROR:')
        print('-File not found in path %s' % docx_filename_path)
        print('-Also make sure that the file extension is ".docx"')
        convert_engine()
        return

    def handle_image(image_id, relationship_dict):
        image_path = relationship_dict[image_id]
        # Copy the image into the screenshots folder, prefixing the name
        # with the document name so images of different documents differ.
        _, filename = os.path.split(image_path)
        filename = filename.replace('image', '%s_' % docx_filename)
        destination_path = os.path.join('html/images/screenshots/', filename)
        copyfile(image_path, destination_path)
        # Return the `src` attribute to be used in the img tag
        return 'images/screenshots/%s' % filename

    html = convert(docx_filename_path, image_handler=handle_image)

    # Applied strictly in this order; a third element is the `count`
    # argument of str.replace (only the first <p>…</p> becomes the <h3>).
    # NOTE(review): the (' ', ' ') and ('“', '“') pairs look like no-ops as
    # written; presumably the first element was originally a non-breaking
    # space / an HTML entity. Confirm against the original file.
    replacements = (
        ('<html>', ''),
        ('</html>', ''),
        ('<p>Group type</p>', ''),
        ('<p>', '<h3>', 1),
        ('</p>', '</h3>', 1),
        (' ', ' '),
        ('<ol', '<ul'),
        ('</ol>', '</ul>'),
        ('data-list-type="decimal"', 'class="number-list"'),
        ('“', '“'),
        ('<table>', '<table border="0" cellpadding="0">'),
        ('<img', '<img class="img-responsive"'),
        ('height="157" width="624" />', '/>'),
    )
    for args in replacements:
        html = html.replace(*args)

    active_link = (
        '<li class="active-topic"><span><a href="%(1)s.html">%(2)s</a></span></li>'
        % {"1": docx_filename, "2": docx_filename}
    )
    with open('template.html', 'r') as template_file:
        data = template_file.read()
    data = data.replace('<!--conv_active_link-->', active_link)
    data = data.replace('<!--conv_content-->', html)

    # `with` guarantees the output file is closed even if the write fails.
    with open("html/%s.html" % docx_filename, "wb") as out_file:
        out_file.write(data)

    # Remove the docx/word directory (hard-coded location) after conversion.
    shutil.rmtree('docx/word')

    print(separator)
    progressbar('Converting:', 'Complete')
    print(separator)
    print('TASK COMPLETED SUCCESSFULLY :)')
    print('-Output path :html/%s.html' % docx_filename)
    if op_mode:
        raw_input()
from docx2html import convert

# Convert the document and save the resulting HTML to testfile.txt.
html = convert('./Geit (utkast).docx')
# 'rw+' is not a valid mode string (Python 3 raises ValueError); 'w' creates
# or truncates the file, and `with` closes it even if the write fails.
with open('testfile.txt', 'w') as myfile:
    myfile.write(html)