def reflow_pre(xhtml):
    """ Make <pre> reflowable.

    This helps a lot with readers like Sony's that cannot
    scroll horizontally.

    """

    def nbsp(matchobj):
        # Keep alignment: turn all but the last space of a run into
        # non-breaking spaces, leaving one ordinary space to wrap on.
        return ('\u00a0' * (len(matchobj.group(0)) - 1)) + ' '

    for pre in xpath(xhtml, "//xhtml:pre"):
        # white-space: pre-wrap would do fine
        # but it is not supported by OEB
        try:
            pre.tag = NS.xhtml.div
            writers.HTMLishWriter.add_class(pre, 'pgmonospaced')
            m = parsers.RE_GUTENBERG.search(pre.text)
            if m:
                writers.HTMLishWriter.add_class(pre, 'pgheader')

            tail = pre.tail
            s = etree.tostring(pre, encoding=six.text_type, with_tail=False)
            s = s.replace('>\n', '>')  # eliminate that empty first line
            s = s.replace('\n', '<br/>')
            s = re.sub('  +', nbsp, s)  # only runs of two or more spaces
            div = etree.fromstring(s)
            div.tail = tail

            pre.getparent().replace(pre, div)

        except etree.XMLSyntaxError as what:
            exception("%s\n%s" % (s, what))
            raise
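# A minimal, self-contained sketch (illustration only, not part of the writer)
# of the whitespace trick reflow_pre() relies on: in every run of two or more
# spaces, all but the last become non-breaking, so preformatted alignment
# survives while the line as a whole can still wrap.

def _demo_nbsp_reflow():
    import re

    def nbsp(matchobj):
        return ('\u00a0' * (len(matchobj.group(0)) - 1)) + ' '

    # three spaces -> two NBSPs plus one ordinary (breakable) space
    assert re.sub('  +', nbsp, 'total:   42') == 'total:\u00a0\u00a0 42'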
def shipout(self, job, parsers, ncx):
    """ Build the zip file. """

    try:
        ocf = OEBPSContainer(
            os.path.join(job.outputdir, job.outputfile),
            ('%d/' % options.ebook if options.ebook else None))

        opf = ContentOPF()

        opf.metadata_item(job.dc)

        # write out parsers

        for p in parsers:
            try:
                ocf.add_bytes(self.url2filename(p.attribs.url),
                              p.serialize(), p.mediatype())
                if p.mediatype() == mt.xhtml:
                    opf.spine_item_from_parser(p)
                else:
                    opf.manifest_item_from_parser(p)
            except Exception as what:
                error("Could not process file %s: %s" % (p.attribs.url, what))

        # toc

        for t in ncx.toc:
            if t[1].lower().strip(' .') in TOC_HEADERS:
                opf.guide_item(t[0], 'toc', t[1])
                break

        opf.toc_item('toc.ncx')
        ocf.add_unicode('toc.ncx', six.text_type(ncx))

        for p in parsers:
            if 'coverpage' in p.attribs.rel:
                opf.add_coverpage(ocf, p.attribs.url)
                break

        # Adobe page-map
        # opf.pagemap_item('page-map.xml')
        # ocf.add_unicode('page-map.xml', six.text_type(AdobePageMap(ncx)))

        # content.opf
        # debug(etree.tostring(opf.manifest, encoding=six.text_type, pretty_print=True))

        opf.rewrite_links(self.url2filename)
        ocf.add_unicode('content.opf', six.text_type(opf))

        ocf.commit()

    except Exception as what:
        exception("Error building Epub: %s" % what)
        ocf.rollback()
        raise
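# Illustration only: shipout() recognizes the TOC entry by normalizing the
# entry title and testing membership in TOC_HEADERS. The header set below is
# a hypothetical stand-in; the real TOC_HEADERS is defined elsewhere in this
# module.

def _demo_toc_header_match():
    TOC_HEADERS = ('contents', 'table of contents')  # assumed values
    title = ' CONTENTS. '
    # lower-casing and stripping spaces/periods reduces ' CONTENTS. '
    # to 'contents', which matches
    assert title.lower().strip(' .') in TOC_HEADERS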
def build(self, job): """ Build HTML file. """ htmlfilename = os.path.join(job.outputdir, job.outputfile) try: os.remove(htmlfilename) except OSError: pass try: info("Creating HTML file: %s" % htmlfilename) for p in job.spider.parsers: # Do html only. The images were copied earlier by PicsDirWriter. xhtml = None if hasattr(p, 'rst2html'): xhtml = p.rst2html(job) elif hasattr(p, 'xhtml'): p.parse() xhtml = copy.deepcopy(p.xhtml) if xhtml is not None: self.make_links_relative(xhtml, p.attribs.url) self.add_dublincore(job, xhtml) # makes iphones zoom in self.add_meta(xhtml, 'viewport', 'width=device-width') self.add_meta_generator(xhtml) # This writer has currently to deal only with RST # input. The RST writer has a workaround that # avoids writing empty elements. So we don't need # the same ugly workaround as the EPUB writer, # that has to deal with HTML input too. html = etree.tostring(xhtml, method='xml', doctype=gg.XHTML_DOCTYPE, encoding='utf-8', pretty_print=True, xml_declaration=True) self.write_with_crlf(htmlfilename, html) # self.copy_aux_files (job.outputdir) info("Done HTML file: %s" % htmlfilename) except Exception as what: exception("Error building HTML %s: %s" % (htmlfilename, what)) if os.access(htmlfilename, os.W_OK): os.remove(htmlfilename) raise what
def build(self, job): """ Build epub """ ncx = TocNCX(job.dc) parsers = [] css_count = 0 # add CSS parser self.add_external_css(job.spider, None, PRIVATE_CSS, 'pgepub.css') try: chunker = HTMLChunker.HTMLChunker() coverpage_url = None # do images early as we need the new dimensions later for p in job.spider.parsers: if hasattr(p, 'resize_image'): if 'coverpage' in p.attribs.rel: if job.maintype == 'kindle': np = p.resize_image(MAX_IMAGE_SIZE_KINDLE, MAX_COVER_DIMEN_KINDLE, 'jpeg') else: np = p.resize_image(MAX_IMAGE_SIZE, MAX_COVER_DIMEN) np.id = p.attribs.get('id', 'coverpage') coverpage_url = p.attribs.url else: if job.maintype == 'kindle': np = p.resize_image(MAX_IMAGE_SIZE_KINDLE, MAX_IMAGE_DIMEN_KINDLE) else: np = p.resize_image(MAX_IMAGE_SIZE, MAX_IMAGE_DIMEN) np.id = p.attribs.get('id') parsers.append(np) for p in job.spider.parsers: if p.mediatype() in OPS_CONTENT_DOCUMENTS: debug("URL: %s" % p.attribs.url) if hasattr(p, 'rst2epub2'): xhtml = p.rst2epub2(job) if options.verbose >= 2: # write html to disk for debugging debugfilename = os.path.join( job.outputdir, job.outputfile) debugfilename = os.path.splitext (debugfilename)[0] + '.' + \ job.maintype + '.debug.html' with open(debugfilename, 'wb') as fp: fp.write( etree.tostring(xhtml, encoding='utf-8')) else: # make a copy so we can mess around p.parse() xhtml = copy.deepcopy(p.xhtml) strip_classes = self.get_classes_that_float(xhtml) strip_classes = strip_classes.intersection(STRIP_CLASSES) if strip_classes: self.strip_pagenumbers(xhtml, strip_classes) # build up TOC # has side effects on xhtml ncx.toc += p.make_toc(xhtml) self.insert_root_div(xhtml) self.fix_charset(xhtml) self.fix_style_elements(xhtml) self.reflow_pre(xhtml) # strip all links to items not in manifest p.strip_links(xhtml, job.spider.dict_urls_mediatypes()) self.strip_links(xhtml, job.spider.dict_urls_mediatypes()) self.strip_noepub(xhtml) # self.strip_rst_dropcaps (xhtml) self.fix_html_image_dimensions(xhtml) if coverpage_url: self.remove_coverpage(xhtml, coverpage_url) # externalize and fix CSS for style in xpath(xhtml, '//xhtml:style'): self.add_external_css(job.spider, xhtml, style.text, "%d.css" % css_count) css_count += 1 style.drop_tree() self.add_external_css(job.spider, xhtml, None, 'pgepub.css') self.add_meta_generator(xhtml) debug("Splitting %s ..." % p.attribs.url) chunker.next_id = 0 chunker.split(xhtml, p.attribs) for p in job.spider.parsers: if hasattr(p, 'sheet'): self.fix_incompatible_css(p.sheet) p.rewrite_links(self.url2filename) parsers.append(p) # after splitting html into chunks we have to rewrite all # internal links in HTML chunker.rewrite_internal_links() # also in the TOC if not ncx.toc: ncx.toc.append([job.spider.parsers[0].attribs.url, 'Start', 1]) chunker.rewrite_internal_links_toc(ncx.toc) # make absolute links zip-filename-compatible chunker.rewrite_links(self.url2filename) ncx.rewrite_links(self.url2filename) # Do away with the chunker and copy all chunks into new parsers. # These are fake parsers that never actually parsed anything, # we just use them to just hold our data. for chunk, attribs in chunker.chunks: p = ParserFactory.ParserFactory.get(attribs) p.xhtml = chunk parsers.append(p) self.shipout(job, parsers, ncx) except Exception as what: exception("Error building Epub: %s" % what) raise
def do_job(job):
    """ Do one job. """

    log_handler = None
    Logger.ebook = job.ebook
    if job.logfile:
        log_handler = open_log(
            os.path.join(os.path.abspath(job.outputdir), job.logfile))

    debug('=== Building %s ===' % job.type)
    start_time = datetime.datetime.now()

    try:
        if job.url:
            spider = Spider.Spider()
            dirpath = os.path.dirname(job.url)  # platform native path
            spider.include_urls += (options.include_urls or
                                    [parsers.webify_url(dirpath) + '/*'])  # use for parser only
            spider.include_mediatypes += options.include_mediatypes
            if job.subtype == '.images' or job.type == 'rst.gen':
                spider.include_mediatypes.append('image/*')

            spider.exclude_urls += options.exclude_urls
            spider.exclude_mediatypes += options.exclude_mediatypes
            spider.max_depth = options.max_depth or six.MAXSIZE

            for rewrite in options.rewrite:
                from_url, to_url = rewrite.split('>')
                spider.add_redirection(from_url, to_url)

            attribs = parsers.ParserAttributes()
            attribs.url = parsers.webify_url(job.url)
            attribs.id = 'start'

            if options.input_mediatype:
                attribs.orig_mediatype = attribs.HeaderElement.from_str(
                    options.input_mediatype)

            spider.recursive_parse(attribs)
            elect_coverpage(spider, job.url)
            job.url = spider.redirect(job.url)
            job.base_url = job.url
            job.spider = spider

        writer = WriterFactory.create(job.maintype)
        writer.build(job)

        if options.validate:
            writer.validate(job)

        packager = PackagerFactory.create(options.packager, job.type)
        if packager:
            packager.package(job)

        if job.type == 'html.images':
            # FIXME: hack for push packager
            options.html_images_list = list(job.spider.aux_file_iter())

    except SkipOutputFormat as what:
        warning("%s" % what)

    except Exception as what:
        exception("%s" % what)

    end_time = datetime.datetime.now()
    info(' %s made in %s' % (job.type, end_time - start_time))

    if log_handler:
        close_log(log_handler)
        log_handler = None
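# Illustration only: do_job() expects each --rewrite option as a
# 'from_url>to_url' pair separated by '>'. The URLs below are hypothetical.

def _demo_rewrite_option():
    rewrite = 'https://example.org/old.html>https://example.org/new.html'
    from_url, to_url = rewrite.split('>')
    assert from_url == 'https://example.org/old.html'
    assert to_url == 'https://example.org/new.html'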