def open_file(cls, url, attribs):
    """ Open a local file for parsing. """

    def open_file_from_path(path):
        try:
            return open(path, 'rb')
        except FileNotFoundError:
            error('Missing file: %s' % path)
        except IsADirectoryError:
            error('Missing file is a directory: %s' % path)
        return None

    if re.search(r'^([a-zA-Z]:|/)', url):
        # absolute path, including Windows drive letters
        fp = open_file_from_path(url)
    else:
        try:
            # handles all the flavors of file: urls, including on windows
            fp = urllib.request.urlopen(url)
        except urllib.error.URLError as what:
            fp = None
            error('Missing file: %s' % what.reason)
        except ValueError:
            # just a relative path?
            fp = open_file_from_path(url)

    attribs.orig_mediatype = attribs.HeaderElement(
        MediaTypes.guess_type(url))
    debug("... got mediatype %s from guess_type" % str(attribs.orig_mediatype))
    attribs.orig_url = attribs.url = url
    return fp

def pre_parse(self):
    """ Parse a RST file as link list. """

    debug("RSTParser: Pre-parsing %s" % self.attribs.url)

    default_style = self.get_resource('mydocutils.parsers', 'default_style.rst')

    source = docutils.io.StringInput(default_style + self.unicode_content())
    reader = docutils.readers.standalone.Reader()
    parser = gutenberg_parsers.Parser()

    overrides = {
        'get_resource': self.get_resource,
        'get_image_size': self.get_image_size_from_parser,
        'base_url': self.attribs.url,
    }

    doc = reader.read(source, parser, self.get_settings((reader, parser), overrides))
    self.document1 = doc
    self.rewrite_links(partial(urllib.parse.urljoin, self.attribs.url))

    debug("RSTParser: Done pre-parsing %s" % self.attribs.url)

def _make_coverpage_link(self):
    """ Insert a <link rel="coverpage"> in the html head.

    First we determine the coverpage url.  In HTML we find the
    coverpage by applying these rules:

      1. the image specified in <link rel='coverpage'>,
      2. the image with an id of 'coverpage',
      3. the image with an url containing 'cover', or
      4. the image with an url containing 'title'.

    If a rule returns images we take the first one in document order,
    else we proceed with the next rule.

    """

    coverpages = xpath(self.xhtml, "//xhtml:link[@rel='coverpage']")
    for coverpage in coverpages:
        url = coverpage.get('href')
        debug("Found link to coverpage %s." % url)
        return  # already provided by user

    # look for a suitable candidate
    coverpages = xpath(self.xhtml, "//xhtml:img[@id='coverpage']")
    if not coverpages:
        coverpages = xpath(self.xhtml, "//xhtml:img[contains(@src, 'cover')]")
    if not coverpages:
        coverpages = xpath(self.xhtml, "//xhtml:img[contains(@src, 'title')]")

    for coverpage in coverpages:
        for head in xpath(self.xhtml, "/xhtml:html/xhtml:head"):
            url = coverpage.get('src')
            head.append(parsers.em.link(rel='coverpage', href=url))
            debug("Inserted link to coverpage %s." % url)

def _full_parse(self, writer, overrides):
    """ Full parse from scratch. """

    debug("RSTParser: Full-parsing %s" % self.attribs.url)

    default_style = self.get_resource('mydocutils.parsers', 'default_style.rst')

    source = docutils.io.StringInput(
        default_style + self.unicode_content(), self.attribs.url, 'unicode')
    reader = docutils.readers.standalone.Reader()
    parser = gutenberg_parsers.Parser()

    doc = reader.read(
        source, parser, self.get_settings((reader, parser, writer), overrides))
    self.document1 = doc
    self.rewrite_links(partial(urllib.parse.urljoin, self.attribs.url))

    doc.transformer.populate_from_components(
        (source, reader, parser, writer))
    doc.transformer.apply_transforms()

    debug("RSTParser: Done full-parsing %s" % self.attribs.url)
    return doc

def __unicode__(self):
    """ Serialize content.opf as unicode string. """

    assert len(self.manifest), 'No manifest item in content.opf.'
    assert len(self.spine), 'No spine item in content.opf.'
    assert 'toc' in self.spine.attrib, 'No TOC item in content.opf.'

    package = self.opf.package(**{
        'version': '2.0',
        'unique-identifier': 'id'
    })
    package.append(self.metadata)
    package.append(self.manifest)
    package.append(self.spine)
    if len(self.guide):
        package.append(self.guide)

    content_opf = "%s\n\n%s" % (
        gg.XML_DECLARATION,
        etree.tostring(package, encoding=six.text_type, pretty_print=True))

    # FIXME: remove this when lxml is fixed
    # now merge xmlns:opf and xmlns:
    content_opf = content_opf.replace('lxml-bug-workaround', '')

    if options.verbose >= 2:
        debug(content_opf)
    return content_opf

def run(self):
    debug('Endsection directive state: %s' % self.state)

    # back out of lists, etc.
    if isinstance(self.state, states.SpecializedBody):
        debug('Backing out of list')
        self.state_machine.previous_line(2)  # why do we need 2 ???

    raise EOFError

def copy_aux_files(self, job, dest_dir):
    """ Copy image files to dest_dir. Use image data cached in parsers. """

    for p in job.spider.parsers:
        if hasattr(p, 'resize_image'):
            src_uri = p.attribs.url
            fn_dest = gg.make_url_relative(webify_url(job.base_url), src_uri)
            fn_dest = os.path.join(dest_dir, fn_dest)
            # debug('base_url = %s, src_uri = %s' % (job.base_url, src_uri))

            if gg.is_same_path(src_uri, fn_dest):
                debug('Not copying %s to %s: same file' % (src_uri, fn_dest))
                continue
            debug('Copying %s to %s' % (src_uri, fn_dest))

            fn_dest = gg.normalize_path(fn_dest)
            gg.mkdir_for_filename(fn_dest)
            try:
                with open(fn_dest, 'wb') as fp_dest:
                    fp_dest.write(p.serialize())
            except IOError as what:
                error('Cannot copy %s to %s: %s' % (src_uri, fn_dest, what))

def decode(self, charset):
    """ Try to decode document contents to unicode. """
    if charset is None:
        return None

    charset = charset.lower().strip()

    if charset in BOGUS_CHARSET_NAMES:
        charset = BOGUS_CHARSET_NAMES[charset]

    if charset == 'utf-8':
        charset = 'utf_8_sig'

    try:
        debug("Trying to decode document with charset %s ..." % charset)
        buffer = self.bytes_content()
        buffer = REB_PG_CHARSET.sub(b'', buffer)
        buffer = buffer.decode(charset)
        self.attribs.orig_mediatype.params['charset'] = charset
        return buffer
    except LookupError as what:
        # unknown charset
        error("Invalid charset name: %s (%s)" % (charset, what))
    except UnicodeError as what:
        # mis-stated charset, did not decode
        error("Text not in charset %s (%s)" % (charset, what))

    return None

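# Illustrative sketch, not part of the original module: why decode() above
# swaps 'utf-8' for 'utf_8_sig'.  The _sig codec strips a leading byte
# order mark, which a plain utf-8 decode would keep as U+FEFF.  Sample
# bytes below are made up for the demonstration.
bom_bytes = b'\xef\xbb\xbfhello'
assert bom_bytes.decode('utf-8') == '\ufeffhello'
assert bom_bytes.decode('utf_8_sig') == 'hello'
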
def f(url):
    """ Remap function """
    ur, frag = urllib.parse.urldefrag(url)
    if ur in url_map:
        debug("Rewriting redirected url: %s to %s" % (ur, url_map[ur]))
        ur = url_map[ur]
    return "%s#%s" % (ur, frag) if frag else ur

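# Illustrative sketch, not from the original source: the urldefrag() call
# above splits any fragment off the url before the lookup in url_map, and
# the fragment is re-attached only when one was present.  The urls below
# are hypothetical sample values.
import urllib.parse

ur, frag = urllib.parse.urldefrag('https://example.org/old.html#ch2')
assert (ur, frag) == ('https://example.org/old.html', 'ch2')

ur, frag = urllib.parse.urldefrag('https://example.org/plain.html')
assert frag == ''   # no fragment -> empty string, so the remap returns the bare url
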
def validate(self, job):
    """ Validate generated epub using external tools. """

    debug("Validating %s ..." % job.outputfile)

    filename = os.path.join(job.outputdir, job.outputfile)
    for validator in (options.config.EPUB_VALIDATOR, options.config.EPUB_PREFLIGHT):
        if validator is not None:
            params = validator.split() + [filename]
            checker = subprocess.Popen(params,
                                       stdin=subprocess.PIPE,
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.PIPE)
            (dummy_stdout, stderr) = checker.communicate()
            if stderr:
                error(stderr)
                return 1
                # raise AssertionError(
                #     "%s does not validate." % job.outputfile)

    debug("%s validates ok." % job.outputfile)
    return 0

def is_included_mediatype(self, attribs):
    """ Return True if this document is eligible. """

    mediatype = self.get_mediatype(attribs)
    if not mediatype:
        warning('Mediatype could not be determined from url %s' % attribs.url)
        return True  # always include if mediatype unknown

    included = any([fnmatch.fnmatch(mediatype, pattern)
                    for pattern in self.include_mediatypes])
    excluded = any([fnmatch.fnmatch(mediatype, pattern)
                    for pattern in self.exclude_mediatypes])

    if included and not excluded:
        return True

    if excluded:
        debug("Dropping excluded mediatype %s" % mediatype)
    if not included:
        debug("Dropping not included mediatype %s" % mediatype)

    return False

def setUp(self):
    config()
    Logger.set_log_level(options.verbose)
    options.types = options.types or ['all']
    options.types = CommonCode.add_dependencies(options.types, DEPENDENCIES, BUILD_ORDER)
    debug("Building types: %s" % ' '.join(options.types))

def __unicode__(self):
    """ Serialize toc.ncx as unicode string. """
    ncx = self.ncx
    tocdepth = 1

    if self.toc:
        # normalize toc so that it starts with an h1 and doesn't jump
        # down more than one level at a time
        fixer = OutlineFixer()
        for t in self.toc:
            t[2] = fixer.level(t[2])

        # flatten toc if it contains only one top-level entry
        top_level_entries = sum(t[2] == 1 for t in self.toc)
        if top_level_entries < 2:
            for t in self.toc:
                if t[2] != -1:
                    t[2] = max(1, t[2] - 1)

        tocdepth = max(t[2] for t in self.toc)

    head = ncx.head(
        ncx.meta(name='dtb:uid', content=self.dc.opf_identifier),
        ncx.meta(name='dtb:depth', content=str(tocdepth)),
        ncx.meta(name='dtb:generator', content=GENERATOR % VERSION),
        ncx.meta(name='dtb:totalPageCount', content='0'),
        ncx.meta(name='dtb:maxPageNumber', content='0'))

    doc_title = ncx.docTitle(ncx.text(self.dc.title))

    self.seen_urls = {}
    has_pages = False
    for url, dummy_title, depth in self.toc:
        # navPoints and pageTargets referencing the same element
        # must have the same playOrder
        if url not in self.seen_urls:
            self.seen_urls[url] = str(len(self.seen_urls) + 1)
        if depth == -1:
            has_pages = True

    params = {'version': '2005-1'}
    if self.dc.languages:
        params[NS.xml.lang] = self.dc.languages[0].id

    ncx = ncx.ncx(head, doc_title, self._make_navmap(self.toc), **params)

    if has_pages:
        ncx.append(self._make_pagelist(self.toc))

    # Ugly workaround for error: "Serialisation to unicode must not
    # request an XML declaration"
    toc_ncx = "%s\n\n%s" % (
        gg.XML_DECLARATION,
        etree.tostring(ncx,
                       doctype=gg.NCX_DOCTYPE,
                       encoding=six.text_type,
                       pretty_print=True))

    if options.verbose >= 3:
        debug(toc_ncx)

    return toc_ncx

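# Illustrative sketch with made-up toc data, not from the original: how the
# single-top-level flattening above rewrites depths.  Entries whose depth
# is -1 are page targets and are left untouched.
toc = [['ch.html', 'The Book', 1],
       ['ch.html#s1', 'Part I', 2],
       ['ch.html#p5', '5', -1]]
if sum(t[2] == 1 for t in toc) < 2:   # only one top-level entry
    for t in toc:
        if t[2] != -1:
            t[2] = max(1, t[2] - 1)
# toc is now [['ch.html', 'The Book', 1], ['ch.html#s1', 'Part I', 1],
#             ['ch.html#p5', '5', -1]]
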
def shipout_chunk(self, attribs, chunk_id=None, comment=None):
    """ ready chunk to be shipped """

    attribs = copy.copy(attribs)

    if self.chunk_size > MAX_CHUNK_SIZE:
        self.split(self.chunk, attribs)
        return

    url = normalize_uri(attribs.url)
    chunk_name = self._make_name(url)

    # the url of the whole page
    if url not in self.idmap:
        self.idmap[url] = chunk_name

    # fragments of the page
    for e in xpath(self.chunk, '//xhtml:*[@id]'):
        id_ = e.attrib['id']
        old_id = "%s#%s" % (url, id_)
        # key is unicode string,
        # value is uri-escaped byte string
        # if ids get cloned while chunking, map to the first one only
        if old_id not in self.idmap:
            self.idmap[old_id] = "%s#%s" % (chunk_name, urllib.parse.quote(id_))

    attribs.url = chunk_name
    attribs.id = chunk_id
    attribs.comment = comment
    self.chunks.append((self.chunk, attribs))

    debug("Adding chunk %s (%d bytes) %s" % (chunk_name, self.chunk_size, chunk_id))

def is_included_mediatype(self, attribs):
    """ Return True if this document is eligible. """

    if attribs.orig_mediatype is None:
        mediatype = MediaTypes.guess_type(attribs.url)
        if mediatype:
            attribs.orig_mediatype = attribs.HeaderElement(mediatype)
        else:
            return True  # always include if mediatype unknown

    mediatype = attribs.orig_mediatype.value

    included = any([fnmatch.fnmatch(mediatype, pattern)
                    for pattern in self.include_mediatypes])
    excluded = any([fnmatch.fnmatch(mediatype, pattern)
                    for pattern in self.exclude_mediatypes])

    if included and not excluded:
        return True

    if excluded:
        debug("Dropping excluded mediatype %s" % mediatype)
    if not included:
        debug("Dropping not included mediatype %s" % mediatype)

    return False

def get_charset_from_content_type(self):
    """ Get charset from server content-type. """

    charset = self.attribs.orig_mediatype.params.get('charset')
    if charset:
        debug('Got charset %s from server' % charset)
        return charset
    return None

def unpack_media_handheld(sheet):
    """ unpack a @media handheld rule """
    for rule in sheet:
        if rule.type == rule.MEDIA_RULE:
            if rule.media.mediaText.find('handheld') > -1:
                debug("Unpacking CSS @media handheld rule.")
                rule.media.mediaText = 'all'
                rule.insertRule(cssutils.css.CSSComment('/* was @media handheld */'), 0)

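# Illustrative usage sketch with a hypothetical sample stylesheet, not from
# the original source: parse a stylesheet with cssutils and let the
# function above rewrite its @media handheld block so it applies everywhere.
import cssutils

sheet = cssutils.parseString('@media handheld { p { margin: 0 } }')
unpack_media_handheld(sheet)
print(sheet.cssText.decode('utf-8'))   # rule is now "@media all" plus the comment
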
def is_included_relation(self, attribs):
    """ Return True if this document is eligible. """

    keep = attribs.rel.intersection(('coverpage', 'important'))
    if keep:
        debug("Not dropping after all because of rel.")
    return keep

def main():
    """ Main program. """

    try:
        config()
    except configparser.Error as what:
        error("Error in configuration file: %s", str(what))
        return 1

    Logger.set_log_level(options.verbose)

    options.types = options.types or ['all']
    options.types = CommonCode.add_dependencies(options.types, DEPENDENCIES, BUILD_ORDER)
    debug("Building types: %s" % ' '.join(options.types))

    ParserFactory.load_parsers()
    WriterFactory.load_writers()
    PackagerFactory.load_packagers()

    if options.is_job_queue:
        job_queue = cPickle.load(sys.stdin.buffer)  # read bytes
    else:
        options.dc = get_dc(options.url)
        job_queue = []
        output_files = dict()
        for type_ in options.types:
            job = CommonCode.Job(type_)
            job.url = options.url
            job.ebook = options.ebook
            job.dc = options.dc
            job.outputdir = options.outputdir
            job.outputfile = options.outputfile or make_output_filename(type_, options.dc)
            output_files[type_] = job.outputfile

            if job.type == 'kindle.images':
                job.url = os.path.join(job.outputdir, output_files['epub.images'])
            elif job.type == 'kindle.noimages':
                job.url = os.path.join(job.outputdir, output_files['epub.noimages'])

            job_queue.append(job)

    for j in job_queue:
        do_job(j)

    packager = PackagerFactory.create(options.packager, 'push')
    if packager:
        # HACK: the WWers only ever convert one ebook at a time
        job = job_queue[0]
        job.outputfile = '%d-final.zip' % (options.dc.project_gutenberg_id)
        packager.package(job)

    return 0

def _make_coverpage_link(self, coverpage_url=None):
    """ Insert a <link rel="coverpage"> in the html head using the
        image specified by the --cover command-line option """

    if coverpage_url:
        for head in xpath(self.xhtml, "/xhtml:html/xhtml:head"):
            head.append(parsers.em.link(rel='coverpage', href=coverpage_url))
            debug("Inserted link to coverpage %s." % coverpage_url)
        return

def guess_charset_from_body(self):
    """ Guess charset from text. """
    # http://chardet-matthickford.readthedocs.org/en/latest/usage.html
    result = chardet.detect(self.bytes_content())
    charset = result.get('encoding')
    if charset:
        debug('Got charset %s from text sniffing' % charset)
        return charset
    return None

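# Illustrative sketch, not from the original source: chardet.detect()
# returns a dict with the guessed encoding and a confidence value, so the
# method above only needs the 'encoding' key.  The sample bytes are made up.
import chardet

result = chardet.detect('Schöne Grüße aus Köln, sagte der Bürgermeister.'.encode('latin-1'))
print(result['encoding'], result['confidence'])   # typically some latin-1 family guess
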
def remove_coverpage(self, xhtml, url):
    """ Remove coverpage from flow.

    EPUB readers will display the coverpage from the manifest and
    if we don't remove it from flow it will be displayed twice.

    """
    for img in xpath(xhtml, '//xhtml:img[@src = $url]', url=url):
        debug("remove_coverpage: dropping <img> %s from flow" % url)
        img.drop_tree()
        return  # only the first one though

def bytes_content(self):
    """ Get document content as raw bytes. """

    if self.buffer is None:
        try:
            debug("Fetching %s ..." % self.attribs.url)
            self.buffer = self.fp.read()
            self.fp.close()
        except IOError as what:
            error(what)

    return self.buffer

def get_charset_from_meta(self):
    """ Parse text for hints about charset. """
    # .. -*- coding: utf-8 -*-
    charset = None
    rst = self.bytes_content()

    match = REB_EMACS_CHARSET.search(rst)
    if match:
        charset = match.group(1).decode('ascii')
        debug('Got charset %s from emacs comment' % charset)

    return charset

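# Illustrative sketch: REB_EMACS_CHARSET is defined elsewhere in the
# module; the stand-in pattern below is an assumption, shown only to make
# the kind of match concrete.  It picks the coding name out of an
# emacs-style cookie such as ".. -*- coding: utf-8 -*-".
import re

REB_EMACS_CHARSET_GUESS = re.compile(br'-\*-\s*coding:\s*([-\w.]+)\s*-\*-')

m = REB_EMACS_CHARSET_GUESS.search(b'.. -*- coding: utf-8 -*-\n')
assert m and m.group(1) == b'utf-8'
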
def fix_incompatible_css(sheet):
    """ Strip CSS properties and values that are not EPUB compatible. """

    # debug("enter fix_incompatible_css")

    for rule in sheet:
        if rule.type == rule.STYLE_RULE:
            for p in list(rule.style):
                if p.name == 'float':
                    debug("Dropping property %s" % p.name)
                    rule.style.removeProperty('float')
                    rule.style.removeProperty('width')
                    rule.style.removeProperty('height')
                elif p.name == 'position':
                    debug("Dropping property %s" % p.name)
                    rule.style.removeProperty('position')
                    rule.style.removeProperty('left')
                    rule.style.removeProperty('right')
                    rule.style.removeProperty('top')
                    rule.style.removeProperty('bottom')
                elif p.name in ('background-image', 'background-position',
                                'background-attachment', 'background-repeat'):
                    debug("Dropping property %s" % p.name)
                    rule.style.removeProperty(p.name)
                elif 'border' not in p.name and 'px' in p.value:
                    debug("Dropping property with px value %s" % p.name)
                    rule.style.removeProperty(p.name)

def open_file(cls, orig_url, attribs):
    """ Open a local file for parsing. """

    url = orig_url
    if url.startswith('file://'):
        fp = open(url[7:], "rb")
    else:
        fp = open(url, "rb")

    attribs.orig_mediatype = attribs.HeaderElement(MediaTypes.guess_type(url))
    debug("... got mediatype %s from guess_type" % str(attribs.orig_mediatype))
    attribs.orig_url = orig_url
    attribs.url = url
    return fp

def fix_incompatible_css(sheet):
    """ Strip CSS properties and values that are not EPUB compatible. """

    cssclass = re.compile(r'\.(-?[_a-zA-Z]+[_a-zA-Z0-9-]*)')

    for rule in sheet:
        if rule.type == rule.STYLE_RULE:
            ruleclasses = list(cssclass.findall(rule.selectorList.selectorText))
            for p in list(rule.style):
                if p.name == 'float' and "x-ebookmaker" not in ruleclasses:
                    debug("Dropping property %s" % p.name)
                    rule.style.removeProperty('float')
                    rule.style.removeProperty('width')
                    rule.style.removeProperty('height')
                elif p.name == 'position':
                    debug("Dropping property %s" % p.name)
                    rule.style.removeProperty('position')
                    rule.style.removeProperty('left')
                    rule.style.removeProperty('right')
                    rule.style.removeProperty('top')
                    rule.style.removeProperty('bottom')
                elif p.name in ('background-image', 'background-position',
                                'background-attachment', 'background-repeat'):
                    debug("Dropping property %s" % p.name)
                    rule.style.removeProperty(p.name)
                elif 'border' not in p.name and 'px' in p.value:
                    debug("Dropping property with px value %s" % p.name)
                    rule.style.removeProperty(p.name)

def fix_style_elements(xhtml):
    """ Fix CSS style elements.  Make sure they are utf-8. """

    # debug("enter fix_style_elements")

    for style in xpath(xhtml, "//xhtml:style"):
        p = parsers.CSSParser.Parser()
        p.parse_string(style.text)
        try:
            # pylint: disable=E1103
            style.text = p.sheet.cssText.decode('utf-8')
        except (ValueError, UnicodeError):
            debug("CSS:\n%s" % p.sheet.cssText)
            raise

def load_parsers():
    """ See what types we can parse. """

    for fn in resource_listdir('ebookmaker.parsers', ''):
        modulename, ext = os.path.splitext(fn)
        if ext == '.py':
            if modulename.endswith('Parser'):
                module = __import__('ebookmaker.parsers.' + modulename,
                                    fromlist=[modulename])
                debug("Loading parser from module: %s for mediatypes: %s" % (
                    modulename, ', '.join(module.mediatypes)))
                for mediatype in module.mediatypes:
                    parser_modules[mediatype] = module

    return parser_modules.keys()

def _full_parse_2(self, writer, destination, overrides):
    """ Full parse from pickled doctree.

    Doesn't work yet.  It turned out pickling a doctree is much
    harder than I thought.

    """

    debug("Full-parsing %s" % self.attribs.url)

    source = docutils.io.StringInput(self.unicode_content())
    reader = docutils.readers.standalone.Reader()
    parser = gutenberg_parsers.Parser()

    doc = reader.read(
        source, parser, self.get_settings((reader, parser, writer), overrides))
    self.document1 = doc
    self.rewrite_links(partial(urllib.parse.urljoin, self.attribs.url))

    # make it picklable
    reporter = doc.reporter  # = None
    # doc.reporter = None
    transformer = doc.transformer
    doc.settings = None

    from docutils.parsers.rst.directives.html import MetaBody

    # for metanode in doc.traverse(MetaBody.meta):
    for pending in doc.traverse(nodes.pending):
        # pending.transform = None
        # docutils' meta nodes aren't picklable because the class is nested
        if 'nodes' in pending.details:
            # in pending['nodes']
            if isinstance(pending.details['nodes'][0], MetaBody.meta):
                pending.details['nodes'][0].__class__ = mynodes.meta

    from six.moves import cPickle as pickle
    pickled = pickle.dumps(doc)
    doc = pickle.loads(pickled)

    # doc.transformer.populate_from_components(
    #     (source, reader, parser, writer))
    doc.transformer = transformer
    doc.reporter = reporter
    doc.settings = self.get_settings((reader, parser, writer), overrides)

    doc.transformer.apply_transforms()
    return writer.write(doc, destination)