def convert(options, args):
    """ Run the converter self-test when ``options.test_mode`` is set.

    For each bundled sample document (test1.html .. test3.html) the
    package resource is written to a fresh temporary ``<tmp>.html`` file
    and every converter reported by ``registry.availableConverters()``
    is run against it.  The result of each run, or ``FAILED (<error>)``
    for a converter that raised, is printed to stdout.

    ``args`` is not used by the code in this block.  Nothing is done when
    ``options.test_mode`` is false.
    """

    if options.test_mode:

        import pkg_resources

        # NOTE: print(...) with a single argument behaves the same as a
        # Python 2 print statement and as the Python 3 function.
        print('Entering testmode')
        for fn in ('test1.html', 'test2.html', 'test3.html'):
            tmpf = newTempfile()
            print(fn)
            print('-' * len(fn))

            # write through a context manager so that the sample is
            # flushed and closed before an external converter reads it
            data = pkg_resources.resource_string('zopyx.convert2.tests.data', fn)
            with open(tmpf + '.html', 'wb') as fp:
                fp.write(data)

            for name in registry.availableConverters():
                cls = registry.converter_registry[name]
                print('%s: %s.html -> %s.%s' % (name, tmpf, tmpf, cls.output_format))
                C = Converter(tmpf + '.html', verbose=True)
                try:
                    result = C(name, output_filename=tmpf + '.' + cls.output_format)
                    print(result)
                except Exception as e:
                    # deliberately broad: one failing converter must not
                    # abort the remaining test runs
                    print('FAILED (%s)' % e)
            print('')
def html2pdf(html_filename, output_filename=None, **options):
    """ Convert a HTML file to PDF using PrinceXML.

    html_filename   -- path of the HTML input file
    output_filename -- path of the PDF to write; a temporary '.pdf' name
                       is requested from newTempfile() when omitted
    options         -- extra keyword arguments, passed to ``prince`` as
                       ``--key="value"`` (or as a bare ``--key`` switch
                       when the value is None)

    Returns a dict with the keys 'output_filename', 'status' and
    'output'.  Raises RuntimeError if PrinceXML is not available,
    NotImplementedError on Windows and ConversionError if the command
    exits with a non-zero status.
    """

    # fail fast: verify the preconditions before a temporary file name
    # is requested or any option is processed
    if not prince_available:
        raise RuntimeError("The external PrinceXML converter isn't available")

    if sys.platform == 'win32':
        raise NotImplementedError('No support for PrinceXML on Windows available')

    if not output_filename:
        output_filename = newTempfile(suffix='.pdf')

    cmd_options = list()
    for k, v in options.items():
        if v is None:
            cmd_options.append('--%s ' % k)
        else:
            cmd_options.append('--%s="%s" ' % (k, v))

    cmd = '%s "prince" "%s" %s -o "%s"' % \
          (execution_shell, html_filename, ' '.join(cmd_options), output_filename)

    status, output = runcmd(cmd)
    if status != 0:
        raise ConversionError('Error executing: %s' % cmd, output)

    return dict(output_filename=output_filename,
                status=status,
                output=output)
def html2calibre(html_filename, output_filename=None, cmdopts='', **calibre_options):
    """ Convert a HTML file using calibre's ``ebook-convert``.

    html_filename   -- path of the HTML input; if it does not end with
                       '.html' a copy named '<html_filename>.html' is
                       created and converted instead
    output_filename -- path of the output file; a temporary '.epub' name
                       is requested from newTempfile() when omitted
    cmdopts         -- string appended verbatim to the command line
    calibre_options -- keyword arguments passed as ``--key="value"`` (or
                       as a bare ``--key`` switch when the value is None)

    Returns a dict with the keys 'output_filename', 'status' and
    'output'.  Raises RuntimeError if calibre is not available,
    NotImplementedError on Windows and ConversionError if the command
    exits with a non-zero status.
    """

    # fail fast: check the preconditions first so that no copy of the
    # input is left behind when the conversion cannot run at all
    if not calibre_available:
        raise RuntimeError("The external calibre converter isn't available")

    if sys.platform == 'win32':
        raise NotImplementedError('No support for using Calibre on Windows available')

    # NOTE(review): presumably ebook-convert derives the input format
    # from the file extension - confirm.  The copy is not removed here.
    if not html_filename.endswith('.html'):
        shutil.copy(html_filename, html_filename + '.html')
        html_filename += '.html'

    if not output_filename:
        output_filename = newTempfile(suffix='.epub')

    options = list()
    for k, v in calibre_options.items():
        if v is None:
            options.append('--%s ' % k)
        else:
            options.append('--%s="%s" ' % (k, v))

    options = ' '.join(options)
    options = options + ' ' + cmdopts
    cmd = '"ebook-convert" "%s" "%s" %s' % (html_filename, output_filename, options)

    status, output = runcmd(cmd)
    if status != 0:
        raise ConversionError('Error executing: %s' % cmd, output)

    return dict(output_filename=output_filename,
                status=status,
                output=output)
def html2pdf(html_filename, output_filename=None, **options):
    """ Convert a HTML file to PDF using PrinceXML.

    Every keyword in 'options' is turned into a ``prince`` command line
    option: ``--key="value"``, or just ``--key`` when the value is None.
    When 'output_filename' is omitted a temporary '.pdf' name is
    requested from newTempfile().

    Returns a dict with the keys 'output_filename', 'status' and
    'output'.

    Raises RuntimeError when PrinceXML is not available,
    NotImplementedError on Windows, and ConversionError when the
    command exits with a non-zero status.
    """

    # preconditions are checked up front, before a temporary file name
    # is requested
    if not prince_available:
        raise RuntimeError("The external PrinceXML converter isn't available")
    if sys.platform == 'win32':
        raise NotImplementedError(
            'No support for PrinceXML on Windows available')

    if not output_filename:
        output_filename = newTempfile(suffix='.pdf')

    cmd_options = list()
    for k, v in options.items():
        if v is None:
            cmd_options.append('--%s ' % k)
        else:
            cmd_options.append('--%s="%s" ' % (k, v))

    cmd = '%s "prince" "%s" %s -o "%s"' % \
          (execution_shell, html_filename,
           ' '.join(cmd_options), output_filename)

    status, output = runcmd(cmd)
    if status != 0:
        raise ConversionError('Error executing: %s' % cmd, output)

    return dict(output_filename=output_filename,
                status=status,
                output=output)
def html2pdf(html_filename, output_filename=None, **options):
    """ Convert a HTML file to PDF using the ``pisa`` command line tool.

    html_filename   -- path of the HTML input file
    output_filename -- path of the PDF to write; a temporary '.pdf' name
                       is requested from newTempfile() when omitted
    options         -- accepted for interface compatibility; not used

    Returns a dict with the keys 'output_filename', 'status' and
    'output'.  'status' is the exit status reported by runcmd(); no
    exception is raised for a non-zero status, so callers should check
    it.
    """

    if not output_filename:
        output_filename = newTempfile(suffix='.pdf')

    cmd = 'pisa --encoding utf-8 -d "%s" "%s"' % (html_filename, output_filename)
    status, output = runcmd(cmd)

    # report the real exit status instead of a hard-coded 0 so that a
    # failed pisa run is no longer presented as a success
    return dict(output_filename=output_filename,
                status=status,
                output=output)
def html2pdf(html_filename, output_filename=None, **options):
    """ Convert a HTML file to PDF using pisa's ``pisaDocument``.

    html_filename   -- path of the HTML input file
    output_filename -- path of the PDF to write; a temporary '.pdf' name
                       is requested from newTempfile() when omitted
    options         -- accepted for interface compatibility; not used

    Returns a dict with the keys 'output_filename', 'status' (always 0)
    and 'output' (always an empty string).
    """

    if not output_filename:
        output_filename = newTempfile(suffix='.pdf')

    # context managers close both files even if pisaDocument() raises;
    # the input is opened first, exactly as before, so a missing input
    # file fails before the output file is created
    with open(html_filename) as fin:
        with open(output_filename, 'wb') as fout:
            pisaDocument(fin, fout, encoding='utf-8', debug=True)

    return dict(output_filename=output_filename,
                status=0,
                output='')
def convert(self, fo_filename, output_filename=None):
    """ Render the FO file 'fo_filename' as PDF by running ``fop``.

    The PDF is written to 'output_filename', or to a temporary '.pdf'
    name obtained from newTempfile() when none is given.  The name of
    the PDF file is returned; a RuntimeError carrying the command and
    its output is raised when the command exits with a non-zero status.
    """

    pdf_filename = output_filename or newTempfile(suffix='.pdf')

    command = 'fop -fo "%s" -pdf "%s"' % (fo_filename, pdf_filename)
    exit_status, console_output = runcmd(command)

    if exit_status != 0:
        failure = (command, console_output)
        raise RuntimeError('Error executing: %s\n\n%s' % failure)

    log.info("\n")
    return pdf_filename
def tidyhtml(filename, encoding='utf-8', strip_base=False):
    """ Clean up the HTML file 'filename' and store the result in a new
    temporary file.

    The markup is parsed with BeautifulSoup, image references that only
    resolve relative to the HTML file are rewritten, the opening <html>
    tag is replaced by one carrying the XHTML namespace, an XML preamble
    is prepended and named entities are replaced by numeric ones.

    filename   -- path of the HTML input file
    encoding   -- accepted for interface compatibility; not used here
    strip_base -- if true, <base> tags are removed

    Returns the name of the temporary file holding the result.
    """

    with open(filename, 'rb') as fp:
        html = fp.read()

    # use BeautifulSoup for performing HTML checks
    # and conversion to XHTML
    soup = BeautifulSoup(html)

    # check if all image files exist
    html_dirname = os.path.dirname(filename)
    for img in soup.findAll('img'):
        # an <img> without a src attribute has nothing to check
        src = img.get('src')
        if not src or os.path.exists(src):
            continue

        # try to find the image relative to the location of
        # the HTML file
        possible_img = os.path.join(html_dirname, src)
        if os.path.exists(possible_img):
            img['src'] = possible_img
        else:
            LOG.warn('No image file found: %s' % src)

    html = soup.renderContents()

    # replace the opening HTML tag, with or without attributes; as
    # before, everything in front of it is dropped.  A document without
    # an <html> tag is left untouched instead of losing its first
    # characters.
    html_tag = re.search(r'<html\b[^>]*>', html)
    if html_tag is not None:
        html = '<html xmlns="http://www.w3.org/1999/xhtml">' + \
               html[html_tag.end():]

    # add the XML preamble
    html = '<?xml version="1.0" ?>\n' + html

    # replace all HTML entities with numeric entities
    def handler(mo):
        """ Callback to convert entities """
        name = mo.group(1)
        if name.startswith('#'):
            # already numeric
            return mo.group(0)
        codepoint = name2codepoint.get(name)
        if codepoint:
            return '&#%d;' % codepoint
        # unknown named entities are dropped
        return ''

    # only match real entity references; a lazy '&.*?;' would also
    # swallow ordinary text between a bare '&' and a later ';'
    entity_reg = re.compile(r'&(#?\w+);')
    html = entity_reg.sub(handler, html)

    # remove BASE tags (the word boundary keeps <basefont> intact)
    if strip_base:
        base_reg = re.compile(r'<base\b[^>]*>', re.I)
        html = base_reg.sub('', html)

    filename = newTempfile()
    with open(filename, 'wb') as fp:
        fp.write(str(html))
    return filename
def tidyhtml(filename, encoding="utf-8", strip_base=False):
    """ Clean up the HTML file 'filename' and store the result in a new
    temporary file.

    Steps: parse the markup with BeautifulSoup, rewrite image references
    that only resolve relative to the HTML file, replace the opening
    <html> tag by one carrying the XHTML namespace, prepend an XML
    preamble and replace named entities by numeric ones.

    filename   -- path of the HTML input file
    encoding   -- accepted for interface compatibility; not used here
    strip_base -- if true, <base> tags are removed

    Returns the name of the temporary file holding the result.
    """

    with open(filename, "rb") as fp:
        html = fp.read()

    # use BeautifulSoup for performing HTML checks
    # and conversion to XHTML
    soup = BeautifulSoup(html)

    # check if all image files exist
    html_dirname = os.path.dirname(filename)
    for img in soup.findAll("img"):
        # skip <img> tags that carry no src attribute at all
        src = img.get("src")
        if not src or os.path.exists(src):
            continue

        # try to find the image relative to the location of
        # the HTML file
        possible_img = os.path.join(html_dirname, src)
        if os.path.exists(possible_img):
            img["src"] = possible_img
        else:
            LOG.warn("No image file found: %s" % src)

    html = soup.renderContents()

    # replace the opening HTML tag including any attributes; content in
    # front of the tag is dropped as before.  Without an <html> tag the
    # markup is left as it is (a fixed offset would cut into it).
    html_tag = re.search(r"<html\b[^>]*>", html)
    if html_tag is not None:
        html = '<html xmlns="http://www.w3.org/1999/xhtml">' + html[html_tag.end():]

    # add the XML preamble
    html = '<?xml version="1.0" ?>\n' + html

    # replace all HTML entities with numeric entities
    def handler(mo):
        """ Callback to convert entities """
        name = mo.group(1)
        if name.startswith("#"):
            # numeric entities are kept verbatim
            return mo.group(0)
        codepoint = name2codepoint.get(name)
        if codepoint:
            return "&#%d;" % codepoint
        # unknown named entities are dropped
        return ""

    # restrict the match to well-formed entity references so that text
    # between a stray '&' and a later ';' is never deleted
    entity_reg = re.compile(r"&(#?\w+);")
    html = entity_reg.sub(handler, html)

    # remove BASE tags (the word boundary keeps <basefont> intact)
    if strip_base:
        base_reg = re.compile(r"<base\b[^>]*>", re.I)
        html = base_reg.sub("", html)

    filename = newTempfile()
    with open(filename, "wb") as fp:
        fp.write(str(html))
    return filename
def html2pdf(html_filename, output_filename=None, **options):
    """ Convert a HTML file to PDF using PDFreactor.

    html_filename   -- path of the HTML input file
    output_filename -- path of the PDF to write; a temporary '.pdf' name
                       is requested from newTempfile() when omitted
    options         -- accepted for interface compatibility; not used

    Returns a dict with the keys 'output_filename', 'status' and
    'output'.  Raises RuntimeError if the 'pdfreactor' command is not
    available and ConversionError if it exits with a non-zero status.
    """

    # check availability first so that no temporary file name is
    # requested for a conversion that cannot run
    if not pdfreactor_available:
        raise RuntimeError("The external 'pdfreactor' converter isn't available")

    if not output_filename:
        output_filename = newTempfile(suffix='.pdf')

    cmd = '%s "pdfreactor" "%s" "%s"' % \
          (execution_shell, html_filename, output_filename)

    status, output = runcmd(cmd)
    if status != 0:
        raise ConversionError('Error executing: %s' % cmd, output)

    return dict(output_filename=output_filename,
                status=status,
                output=output)
def fo2pdf(fo_filename, output_filename=None):
    """ Render the FO file 'fo_filename' as PDF through XINC.

    Without an explicit 'output_filename' a temporary '.pdf' name is
    taken from newTempfile().  The result is a dict with the keys
    'output_filename', 'status' and 'output'.  RuntimeError is raised
    when XINC is not available, ConversionError when the command exits
    with a non-zero status.
    """

    output_filename = output_filename or newTempfile(suffix=".pdf")

    if not xinc_available:
        raise RuntimeError("The external XINC converter isn't available")

    # pick the launcher matching the platform
    if sys.platform == "win32":
        template = '%s\\bin\\windows\\xinc.exe -fo "%s" -pdf "%s"'
    else:
        template = '"%s/bin/unix/xinc" -fo "%s" -pdf "%s"'
    cmd = template % (xinc_home, fo_filename, output_filename)

    status, output = runcmd(cmd)
    if status != 0:
        raise ConversionError("Error executing: %s" % cmd, output)

    result = dict()
    result["output_filename"] = output_filename
    result["status"] = status
    result["output"] = output
    return result
def fo2pdf(fo_filename, output_filename=None):
    """ Produce a PDF from a FO file with the external XINC converter.

    The PDF goes to 'output_filename' or, if that is not given, to a
    temporary '.pdf' name from newTempfile().  Returns a dict holding
    'output_filename', 'status' and 'output'.  Raises RuntimeError if
    XINC is unavailable and ConversionError on a non-zero exit status.
    """

    if not output_filename:
        output_filename = newTempfile(suffix='.pdf')

    if not xinc_available:
        raise RuntimeError("The external XINC converter isn't available")

    arguments = (xinc_home, fo_filename, output_filename)
    if sys.platform != 'win32':
        cmd = '"%s/bin/unix/xinc" -fo "%s" -pdf "%s"' % arguments
    else:
        cmd = '%s\\bin\\windows\\xinc.exe -fo "%s" -pdf "%s"' % arguments

    status, output = runcmd(cmd)
    if status != 0:
        raise ConversionError('Error executing: %s' % cmd, output)

    return {
        'output_filename': output_filename,
        'status': status,
        'output': output,
    }
def fo2pdf(fo_filename, output_filename=None):
    """ Render the FO file 'fo_filename' as PDF through Apache FOP.

    On Windows ``fop.bat`` from 'fop_home' is run directly, elsewhere
    the ``fop`` script is run through 'execution_shell'.  Without an
    explicit 'output_filename' a temporary '.pdf' name is taken from
    newTempfile().

    Returns a dict with the keys 'output_filename', 'status' and
    'output'.  Raises RuntimeError when FOP is not available and
    ConversionError when the command exits with a non-zero status.
    """

    output_filename = output_filename or newTempfile(suffix='.pdf')

    if not fop_available:
        raise RuntimeError("The external FOP converter isn't available")

    on_windows = sys.platform == 'win32'
    if on_windows:
        parts = (fop_home, fo_filename, output_filename)
        cmd = '%s\\fop.bat -fo "%s" -pdf "%s"' % parts
    else:
        parts = (execution_shell, fop_home, fo_filename, output_filename)
        cmd = '%s "%s/fop" -fo "%s" -pdf "%s"' % parts

    status, output = runcmd(cmd)
    if status != 0:
        raise ConversionError('Error executing: %s' % cmd, output)

    return {
        'output_filename': output_filename,
        'status': status,
        'output': output,
    }
def fo2xfc(fo_filename, format="rtf", output_filename=None):
    """ Convert a FO file to some format supported through XFC-4.0.

    fo_filename     -- path of the FO input file
    format          -- one of 'rtf', 'docx', 'wml' or 'odt'; selects the
                       ``fo2<format>`` launcher inside 'xfc_dir'
    output_filename -- path of the file to write; a temporary name with
                       the suffix '.<format>' is requested from
                       newTempfile() when omitted

    Returns a dict with the keys 'output_filename', 'status' and
    'output'.  Raises ValueError for an unsupported format and
    ConversionError if the command exits with a non-zero status.
    """

    if format not in ("rtf", "docx", "wml", "odt"):
        raise ValueError("Unsupported format: %s" % format)

    if not output_filename:
        output_filename = newTempfile(suffix=".%s" % format)

    # the Windows distribution ships batch files, the others shell
    # scripts without an extension
    if sys.platform == "win32":
        cmd = '"%s\\fo2%s.bat" "%s" "%s"' % (xfc_dir, format, fo_filename, output_filename)
    else:
        cmd = '"%s/fo2%s" "%s" "%s"' % (xfc_dir, format, fo_filename, output_filename)

    status, output = runcmd(cmd)
    if status != 0:
        raise ConversionError("Error executing: %s" % cmd, output)

    return dict(output_filename=output_filename, status=status, output=output)
def html2calibre(html_filename, output_filename=None, cmdopts='', **calibre_options):
    """ Convert a HTML file using calibre's ``ebook-convert``.

    If 'html_filename' does not end with '.html', a copy named
    '<html_filename>.html' is created and converted instead.  Without
    an explicit 'output_filename' a temporary '.epub' name is requested
    from newTempfile().  Every keyword in 'calibre_options' becomes
    ``--key="value"`` (or a bare ``--key`` when the value is None);
    'cmdopts' is appended verbatim to the command line.

    Returns a dict with the keys 'output_filename', 'status' and
    'output'.

    Raises RuntimeError when calibre is not available,
    NotImplementedError on Windows, and ConversionError when the
    command exits with a non-zero status.
    """

    # preconditions first: the input must not be copied when the
    # conversion cannot run anyway
    if not calibre_available:
        raise RuntimeError("The external calibre converter isn't available")
    if sys.platform == 'win32':
        raise NotImplementedError(
            'No support for using Calibre on Windows available')

    # NOTE(review): the copy is presumably needed because ebook-convert
    # looks at the file extension - confirm.  It is not cleaned up here.
    if not html_filename.endswith('.html'):
        shutil.copy(html_filename, html_filename + '.html')
        html_filename += '.html'

    if not output_filename:
        output_filename = newTempfile(suffix='.epub')

    options = list()
    for k, v in calibre_options.items():
        if v is None:
            options.append('--%s ' % k)
        else:
            options.append('--%s="%s" ' % (k, v))

    options = ' '.join(options)
    options = options + ' ' + cmdopts
    cmd = '"ebook-convert" "%s" "%s" %s' % (
        html_filename, output_filename, options)

    status, output = runcmd(cmd)
    if status != 0:
        raise ConversionError('Error executing: %s' % cmd, output)

    return dict(output_filename=output_filename,
                status=status,
                output=output)
def convert(options, args):
    """ Run the converter self-test when ``options.test_mode`` is set.

    Each of the bundled sample documents test1.html, test2.html and
    test3.html is copied from the package resources to a temporary
    ``<tmp>.html`` file and converted with every converter listed by
    ``registry.availableConverters()``.  For each run the result, or
    ``FAILED (<error>)`` if the converter raised, is printed to stdout.

    ``args`` is not used by the code in this block.  Nothing is done when
    ``options.test_mode`` is false.
    """

    if options.test_mode:

        import pkg_resources

        # NOTE: single-argument print(...) calls produce the same output
        # under a Python 2 print statement and the Python 3 function.
        print('Entering testmode')
        for fn in ('test1.html', 'test2.html', 'test3.html'):
            tmpf = newTempfile()
            print(fn)
            print('-' * len(fn))

            # the context manager guarantees the sample is completely
            # written and closed before any converter opens it
            sample = pkg_resources.resource_string('zopyx.convert2.tests.data', fn)
            with open(tmpf + '.html', 'wb') as fp:
                fp.write(sample)

            for name in registry.availableConverters():
                cls = registry.converter_registry[name]
                print('%s: %s.html -> %s.%s' % (name, tmpf, tmpf, cls.output_format))
                C = Converter(tmpf + '.html', verbose=True)
                try:
                    result = C(name, output_filename=tmpf + '.' + cls.output_format)
                    print(result)
                except Exception as e:
                    # deliberately broad: keep testing the remaining
                    # converters after one of them failed
                    print('FAILED (%s)' % e)
            print('')
def convert(self, filename, encoding="utf-8", tidy=True, output_filename=None, **kw):
    """ Convert a HTML file stored as 'filename' to FO using CSS2XSLFO.

    filename        -- path of the HTML input file
    encoding        -- passed on to tidyhtml() when 'tidy' is set
    tidy            -- if true, the input is first run through
                       tidyhtml(); the intermediate file is removed
                       after a successful conversion
    output_filename -- name of the FO file; a temporary '.fo' name is
                       requested from newTempfile() when omitted
    kw              -- every keyword is appended to the command line as
                       key="value"; 'strip_base' is additionally handed
                       to tidyhtml()

    Returns the name of the post-processed FO file.  Raises IOError if
    css2xslfo.jar does not exist and ConversionError if the Java command
    exits with a non-zero status.
    """

    if tidy:
        filename = tidyhtml(filename, encoding, strip_base=kw.get("strip_base", False))

    if output_filename:
        fo_filename = output_filename
    else:
        fo_filename = newTempfile(suffix=".fo")

    csstoxslfo = os.path.abspath(os.path.join(dirname, "lib", "csstoxslfo", "css2xslfo.jar"))
    if not os.path.exists(csstoxslfo):
        raise IOError("%s does not exist" % csstoxslfo)

    cmd = '"%s" -Duser.language=en -Xms256m -Xmx256m -jar "%s" "%s" -fo "%s"' % (
        java,
        csstoxslfo,
        filename,
        fo_filename,
    )
    for k in kw:
        cmd += ' %s="%s"' % (k, kw[k])

    status, output = runcmd(cmd)
    if status != 0:
        raise ConversionError("Error executing: %s" % cmd, output)

    # remove tidy-ed file
    if tidy:
        os.unlink(filename)

    # remove some stuff from the generated FO file causing
    # some conversion trouble either with XINC or XFC
    E = parse(fo_filename)

    # a set keeps the duplicate check cheap for large documents
    ids_seen = set()
    for node in E.getiterator():
        get = node.attrib.get

        # ensure that ID attributes are unique
        node_id = get("id")
        if node_id is not None:
            if node_id in ids_seen:
                del node.attrib["id"]
            ids_seen.add(node_id)

        # drop attributes that carry exactly these values
        for k, v in (
            ("footnote", "reset"),
            ("unicode-bidi", "embed"),
            ("writing-mode", "lr-tb"),
            ("font-selection-strategy", "character-by-character"),
            ("line-height-shift-adjustment", "disregard-shifts"),
            ("page-break-after", "avoid"),
            ("page-break-before", "avoid"),
            ("page-break-inside", "avoid"),
        ):
            if get(k) == v:
                del node.attrib[k]

        # give unit-less zero margins and paddings an explicit unit
        for attr in (
            "margin-left",
            "margin-right",
            "margin-top",
            "margin-bottom",
            "padding-left",
            "padding-right",
            "padding-top",
            "padding-bottom",
        ):
            if get(attr) == "0":
                node.attrib[attr] = "0em"

        if get("page-break-after") == "always":
            del node.attrib["page-break-after"]
            node.attrib["break-after"] = "page"

        if get("text-transform"):
            del node.attrib["text-transform"]

        if get("white-space") == "pre":
            del node.attrib["white-space"]
            # node.text is None for an element without leading text
            node.text = "\n" + (node.text or "").lstrip()
            for k, v in {
                "white-space-treatment": "preserve",
                "white-space-collapse": "false",
                "wrap-option": "no-wrap",
                "linefeed-treatment": "preserve",
            }.items():
                node.attrib[k] = v

    fo_text = tostring(E.getroot())
    # avoid a linebreak through <li><p> (XFC)
    fo_text = fo_text.replace("<ns0:block ", '<ns0:block margin-top="0" margin-bottom="0" ')
    # NOTE: also removing plain '<ns0:block/>' elements causes a crash
    # with XINC, so only the blocks rewritten above are dropped
    fo_text = fo_text.replace('<ns0:block margin-top="0" margin-bottom="0" />', "")

    # close the file explicitly so the FO is on disk when we return
    with open(fo_filename, "wb") as fp:
        fp.write(fo_text)
    return fo_filename
def convert(self, filename, encoding='utf-8', tidy=True, output_filename=None, **kw):
    """ Convert a HTML file stored as 'filename' to FO using CSS2XSLFO.

    filename        -- path of the HTML input file
    encoding        -- passed on to tidyhtml() when 'tidy' is set
    tidy            -- if true, the input is first run through
                       tidyhtml(); the intermediate file is removed
                       after a successful conversion
    output_filename -- name of the FO file; a temporary '.fo' name is
                       requested from newTempfile() when omitted
    kw              -- every keyword is appended to the command line as
                       key="value"; 'strip_base' is additionally handed
                       to tidyhtml()

    Returns the name of the post-processed FO file.  Raises IOError if
    css2xslfo.jar does not exist and ConversionError if the Java command
    exits with a non-zero status.
    """

    if tidy:
        filename = tidyhtml(filename,
                            encoding,
                            strip_base=kw.get('strip_base', False))

    if output_filename:
        fo_filename = output_filename
    else:
        fo_filename = newTempfile(suffix='.fo')

    csstoxslfo = os.path.abspath(
        os.path.join(dirname, 'lib', 'csstoxslfo', 'css2xslfo.jar'))
    if not os.path.exists(csstoxslfo):
        raise IOError('%s does not exist' % csstoxslfo)

    cmd = '"%s" -Duser.language=en -Xms256m -Xmx256m -jar "%s" "%s" -fo "%s"' % (
        java, csstoxslfo, filename, fo_filename)
    for k in kw:
        cmd += ' %s="%s"' % (k, kw[k])

    status, output = runcmd(cmd)
    if status != 0:
        raise ConversionError('Error executing: %s' % cmd, output)

    # remove tidy-ed file
    if tidy:
        os.unlink(filename)

    # remove some stuff from the generated FO file causing
    # some conversion trouble either with XINC or XFC
    E = parse(fo_filename)

    # a set keeps the duplicate check cheap for large documents
    ids_seen = set()
    for node in E.getiterator():
        get = node.attrib.get

        # ensure that ID attributes are unique
        node_id = get('id')
        if node_id is not None:
            if node_id in ids_seen:
                del node.attrib['id']
            ids_seen.add(node_id)

        # drop attributes that carry exactly these values
        for k, v in (('footnote', 'reset'),
                     ('unicode-bidi', 'embed'),
                     ('writing-mode', 'lr-tb'),
                     ('font-selection-strategy', 'character-by-character'),
                     ('line-height-shift-adjustment', 'disregard-shifts'),
                     ('page-break-after', 'avoid'),
                     ('page-break-before', 'avoid'),
                     ('page-break-inside', 'avoid')):
            if get(k) == v:
                del node.attrib[k]

        # give unit-less zero margins and paddings an explicit unit
        for attr in ('margin-left', 'margin-right',
                     'margin-top', 'margin-bottom',
                     'padding-left', 'padding-right',
                     'padding-top', 'padding-bottom'):
            if get(attr) == '0':
                node.attrib[attr] = '0em'

        if get('page-break-after') == 'always':
            del node.attrib['page-break-after']
            node.attrib['break-after'] = 'page'

        if get('text-transform'):
            del node.attrib['text-transform']

        if get('white-space') == 'pre':
            del node.attrib['white-space']
            # node.text is None for an element without leading text
            node.text = '\n' + (node.text or '').lstrip()
            for k, v in {
                    'white-space-treatment': 'preserve',
                    'white-space-collapse': 'false',
                    'wrap-option': 'no-wrap',
                    'linefeed-treatment': 'preserve'
            }.items():
                node.attrib[k] = v

    fo_text = tostring(E.getroot())
    # avoid a linebreak through <li><p> (XFC)
    fo_text = fo_text.replace(
        '<ns0:block ', '<ns0:block margin-top="0" margin-bottom="0" ')
    # NOTE: also removing plain '<ns0:block/>' elements causes a crash
    # with XINC, so only the blocks rewritten above are dropped
    fo_text = fo_text.replace(
        '<ns0:block margin-top="0" margin-bottom="0" />', '')

    # close the file explicitly so the FO is on disk when we return
    with open(fo_filename, 'wb') as fp:
        fp.write(fo_text)
    return fo_filename
def convert(self, filename, encoding='utf-8', tidy=True, output_filename=None, **kw):
    """ Convert a HTML file stored as 'filename' to FO using CSS2XSLFO.

    filename        -- path of the HTML input file
    encoding        -- passed on to tidyhtml() when 'tidy' is set
    tidy            -- if true, the input is first run through
                       tidyhtml(); the intermediate file is removed
                       after a successful conversion
    output_filename -- name of the FO file; a temporary '.fo' name is
                       requested from newTempfile() when omitted
    kw              -- every keyword is appended to the command line as
                       key="value"; 'strip_base' is additionally handed
                       to tidyhtml()

    Returns the name of the post-processed FO file.  Raises IOError if
    css2xslfo.jar does not exist and ConversionError if the Java command
    exits with a non-zero status.
    """

    if tidy:
        filename = tidyhtml(filename, encoding, strip_base=kw.get('strip_base', False))

    if output_filename:
        fo_filename = output_filename
    else:
        fo_filename = newTempfile(suffix='.fo')

    csstoxslfo = os.path.abspath(os.path.join(dirname, 'lib', 'csstoxslfo', 'css2xslfo.jar'))
    if not os.path.exists(csstoxslfo):
        raise IOError('%s does not exist' % csstoxslfo)

    cmd = '"%s" -Duser.language=en -Xms256m -Xmx256m -jar "%s" "%s" -fo "%s"' % \
          (java, csstoxslfo, filename, fo_filename)
    for k in kw:
        cmd += ' %s="%s"' % (k, kw[k])

    status, output = runcmd(cmd)
    if status != 0:
        raise ConversionError('Error executing: %s' % cmd, output)

    # remove tidy-ed file
    if tidy:
        os.unlink(filename)

    # remove some stuff from the generated FO file causing
    # some conversion trouble either with XINC or XFC
    E = parse(fo_filename)

    # a set keeps the duplicate check cheap for large documents
    ids_seen = set()
    for node in E.getiterator():
        get = node.attrib.get

        # ensure that ID attributes are unique
        node_id = get('id')
        if node_id is not None:
            if node_id in ids_seen:
                del node.attrib['id']
            ids_seen.add(node_id)

        # drop attributes that carry exactly these values
        for k, v in (('footnote', 'reset'),
                     ('unicode-bidi', 'embed'),
                     ('writing-mode', 'lr-tb'),
                     ('font-selection-strategy', 'character-by-character'),
                     ('line-height-shift-adjustment', 'disregard-shifts'),
                     ('page-break-after', 'avoid'),
                     ('page-break-before', 'avoid'),
                     ('page-break-inside', 'avoid')):
            if get(k) == v:
                del node.attrib[k]

        # give unit-less zero margins and paddings an explicit unit
        for attr in ('margin-left', 'margin-right', 'margin-top', 'margin-bottom',
                     'padding-left', 'padding-right', 'padding-top', 'padding-bottom'):
            if get(attr) == '0':
                node.attrib[attr] = '0em'

        if get('page-break-after') == 'always':
            del node.attrib['page-break-after']
            node.attrib['break-after'] = 'page'

        if get('text-transform'):
            del node.attrib['text-transform']

        if get('white-space') == 'pre':
            del node.attrib['white-space']
            # node.text is None for an element without leading text
            node.text = '\n' + (node.text or '').lstrip()
            for k, v in {'white-space-treatment': 'preserve',
                         'white-space-collapse': 'false',
                         'wrap-option': 'no-wrap',
                         'linefeed-treatment': 'preserve'}.items():
                node.attrib[k] = v

    fo_text = tostring(E.getroot())
    # avoid a linebreak through <li><p> (XFC)
    fo_text = fo_text.replace('<ns0:block ', '<ns0:block margin-top="0" margin-bottom="0" ')
    # NOTE: also removing plain '<ns0:block/>' elements causes a crash
    # with XINC, so only the blocks rewritten above are dropped
    fo_text = fo_text.replace('<ns0:block margin-top="0" margin-bottom="0" />', '')

    # close the file explicitly so the FO is on disk when we return
    with open(fo_filename, 'wb') as fp:
        fp.write(fo_text)
    return fo_filename