Code example #1
    def open_file(cls, url, attribs):
        """ Open a local file for parsing. """
        def open_file_from_path(path):
            try:
                return open(path, 'rb')
            except FileNotFoundError:
                error('Missing file: %s' % path)
            except IsADirectoryError:
                error('File is a directory: %s' % path)
            return None

        if re.search(r'^([a-zA-Z]:|/)', url):
            fp = open_file_from_path(url)
        else:
            try:
                # handles all the flavors of file: urls, including on windows
                fp = urllib.request.urlopen(url)
            except urllib.error.URLError as what:
                fp = None
                error('Missing file: %s' % what.reason)
            except ValueError:  # just a relative path?
                fp = open_file_from_path(url)

        attribs.orig_mediatype = attribs.HeaderElement(
            MediaTypes.guess_type(url))

        debug("... got mediatype %s from guess_type" %
              str(attribs.orig_mediatype))
        attribs.orig_url = attribs.url = url
        return fp
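The dispatch above treats anything starting with a Windows drive letter or a slash as a filesystem path and hands everything else to urllib first. A minimal standalone sketch of that test (the helper name looks_like_path is invented for illustration):

    import re

    def looks_like_path(url):
        # absolute posix path ('/...') or windows drive letter ('C:...')
        return bool(re.search(r'^([a-zA-Z]:|/)', url))

    for candidate in ('/tmp/book.html', 'C:/books/book.html',
                      'file:///tmp/book.html', 'http://example.com/book.html'):
        print(candidate, '->', 'path' if looks_like_path(candidate) else 'url')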
Code example #2
    def pre_parse(self):
        """ Parse a RST file as link list. """

        debug("RSTParser: Pre-parsing %s" % self.attribs.url)

        default_style = self.get_resource('mydocutils.parsers',
                                          'default_style.rst')

        source = docutils.io.StringInput(default_style +
                                         self.unicode_content())
        reader = docutils.readers.standalone.Reader()
        parser = gutenberg_parsers.Parser()

        overrides = {
            'get_resource': self.get_resource,
            'get_image_size': self.get_image_size_from_parser,
            'base_url': self.attribs.url,
        }

        doc = reader.read(source, parser,
                          self.get_settings((reader, parser), overrides))
        self.document1 = doc

        self.rewrite_links(partial(urllib.parse.urljoin, self.attribs.url))

        debug("RSTParser: Done pre-parsing %s" % self.attribs.url)
Code example #3
    def _make_coverpage_link (self):
        """ Insert a <link rel="coverpage"> in the html head.

        First we determine the coverpage url.  In HTML we find the
        coverpage by applying these rules:

          1. the image specified in <link rel='coverpage'>,
          2. the image with an id of 'coverpage',
          3. the image with a url containing 'cover', or
          4. the image with a url containing 'title'.

        If a rule returns images, we take the first one in document
        order; otherwise we proceed with the next rule.
        """

        coverpages = xpath (self.xhtml, "//xhtml:link[@rel='coverpage']")
        for coverpage in coverpages:
            url = coverpage.get ('href')
            debug ("Found link to coverpage %s." % url)
            return   # already provided by user

        # look for a suitable candidate
        coverpages = xpath (self.xhtml, "//xhtml:img[@id='coverpage']")
        if not coverpages:
            coverpages = xpath (self.xhtml, "//xhtml:img[contains (@src, 'cover')]")
        if not coverpages:
            coverpages = xpath (self.xhtml, "//xhtml:img[contains (@src, 'title')]")

        for coverpage in coverpages:
            url = coverpage.get ('src')
            for head in xpath (self.xhtml, "/xhtml:html/xhtml:head"):
                head.append (parsers.em.link (rel = 'coverpage', href = url))
                debug ("Inserted link to coverpage %s." % url)
            return  # take only the first candidate, per the rules above
Code example #4
    def _full_parse(self, writer, overrides):
        """ Full parse from scratch. """

        debug("RSTParser: Full-parsing %s" % self.attribs.url)

        default_style = self.get_resource('mydocutils.parsers',
                                          'default_style.rst')

        source = docutils.io.StringInput(
            default_style + self.unicode_content(), self.attribs.url,
            'unicode')
        reader = docutils.readers.standalone.Reader()
        parser = gutenberg_parsers.Parser()

        doc = reader.read(
            source, parser,
            self.get_settings((reader, parser, writer), overrides))
        self.document1 = doc

        self.rewrite_links(partial(urllib.parse.urljoin, self.attribs.url))

        doc.transformer.populate_from_components(
            (source, reader, parser, writer))
        doc.transformer.apply_transforms()
        debug("RSTParser: Done full-parsing %s" % self.attribs.url)

        return doc
Code example #5
    def __unicode__(self):
        """ Serialize content.opf as unicode string. """

        assert len(self.manifest), 'No manifest item in content.opf.'
        assert len(self.spine), 'No spine item in content.opf.'
        assert 'toc' in self.spine.attrib, 'No TOC item in content.opf.'

        package = self.opf.package(**{
            'version': '2.0',
            'unique-identifier': 'id'
        })
        package.append(self.metadata)
        package.append(self.manifest)
        package.append(self.spine)
        if len(self.guide):
            package.append(self.guide)

        content_opf = "%s\n\n%s" % (
            gg.XML_DECLARATION,
            etree.tostring(package, encoding=six.text_type, pretty_print=True))

        # FIXME: remove this when lxml is fixed
        # now merge xmlns:opf and xmlns:
        content_opf = content_opf.replace('lxml-bug-workaround', '')

        if options.verbose >= 2:
            debug(content_opf)
        return content_opf
Code example #6
    def run(self):
        debug('Endsection directive state: %s' % self.state)
        # back out of lists, etc.
        if isinstance(self.state, states.SpecializedBody):
            debug('Backing out of list')
            self.state_machine.previous_line(2)  # why do we need 2 ???
        raise EOFError
Code example #7
    def copy_aux_files(self, job, dest_dir):
        """ Copy image files to dest_dir. Use image data cached in parsers. """

        for p in job.spider.parsers:
            if hasattr(p, 'resize_image'):
                src_uri = p.attribs.url
                fn_dest = gg.make_url_relative(webify_url(job.base_url),
                                               src_uri)
                fn_dest = os.path.join(dest_dir, fn_dest)

                # debug ('base_url =  %s, src_uri = %s' % (job.base_url, src_uri))

                if gg.is_same_path(src_uri, fn_dest):
                    debug('Not copying %s to %s: same file' %
                          (src_uri, fn_dest))
                    continue
                debug('Copying %s to %s' % (src_uri, fn_dest))

                fn_dest = gg.normalize_path(fn_dest)
                gg.mkdir_for_filename(fn_dest)
                try:
                    with open(fn_dest, 'wb') as fp_dest:
                        fp_dest.write(p.serialize())
                except IOError as what:
                    error('Cannot copy %s to %s: %s' %
                          (src_uri, fn_dest, what))
Code example #8
File: __init__.py Project: verdetamadachi/ebookmaker
    def decode(self, charset):
        """ Try to decode document contents to unicode. """
        if charset is None:
            return None

        charset = charset.lower().strip()

        if charset in BOGUS_CHARSET_NAMES:
            charset = BOGUS_CHARSET_NAMES[charset]

        if charset == 'utf-8':
            charset = 'utf_8_sig'

        try:
            debug("Trying to decode document with charset %s ..." % charset)
            buffer = self.bytes_content()
            buffer = REB_PG_CHARSET.sub(b'', buffer)
            buffer = buffer.decode(charset)
            self.attribs.orig_mediatype.params['charset'] = charset
            return buffer
        except LookupError as what:
            # unknown charset,
            error("Invalid charset name: %s (%s)" % (charset, what))
        except UnicodeError as what:
            # mis-stated charset, did not decode
            error("Text not in charset %s (%s)" % (charset, what))
        return None
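The utf-8 to utf_8_sig swap above works because Python's utf_8_sig codec transparently strips a leading byte-order mark, while plain utf-8 decodes it into the text. A quick demonstration:

    # utf_8_sig eats a leading UTF-8 BOM; plain utf-8 keeps it as U+FEFF
    bom_text = b'\xef\xbb\xbfHello'
    print(repr(bom_text.decode('utf-8')))      # '\ufeffHello'
    print(repr(bom_text.decode('utf_8_sig')))  # 'Hello'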
Code example #9
File: __init__.py Project: verdetamadachi/ebookmaker
    def f(url):
        """ Remap function """
        ur, frag = urllib.parse.urldefrag(url)
        if ur in url_map:
            debug("Rewriting redirected url: %s to %s" % (ur, url_map[ur]))
            ur = url_map[ur]
        return "%s#%s" % (ur, frag) if frag else ur
Code example #10
    def validate(self, job):
        """ Validate generated epub using external tools. """

        debug("Validating %s ..." % job.outputfile)

        filename = os.path.join(job.outputdir, job.outputfile)

        for validator in (options.config.EPUB_VALIDATOR,
                          options.config.EPUB_PREFLIGHT):
            if validator is not None:
                params = validator.split() + [filename]
                checker = subprocess.Popen(params,
                                           stdin=subprocess.PIPE,
                                           stdout=subprocess.PIPE,
                                           stderr=subprocess.PIPE)

                (dummy_stdout, stderr) = checker.communicate()
                if stderr:
                    error(stderr)
                    return 1
                    #raise AssertionError (
                    #    "%s does not validate." % job.outputfile)

        debug("%s validates ok." % job.outputfile)
        return 0
Code example #11
File: Spider.py Project: aredwing/ebookmaker
    def is_included_mediatype(self, attribs):
        """ Return True if this document is eligible. """

        mediatype = self.get_mediatype(attribs)
        if not mediatype:
            warning('Mediatype could not be determined from url %s' %
                    attribs.url)
            return True  # always include if mediatype unknown

        included = any([
            fnmatch.fnmatch(mediatype, pattern)
            for pattern in self.include_mediatypes
        ])
        excluded = any([
            fnmatch.fnmatch(mediatype, pattern)
            for pattern in self.exclude_mediatypes
        ])

        if included and not excluded:
            return True

        if excluded:
            debug("Dropping excluded mediatype %s" % mediatype)
        if not included:
            debug("Dropping not included mediatype %s" % mediatype)

        return False
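The include/exclude lists are shell-style globs, so a pattern like 'text/*' covers every text mediatype. A self-contained illustration (the pattern lists here are made up):

    import fnmatch

    include_mediatypes = ['text/*', 'image/*']
    exclude_mediatypes = ['image/svg*']

    for mt in ('text/html', 'image/png', 'image/svg+xml', 'video/mp4'):
        included = any(fnmatch.fnmatch(mt, p) for p in include_mediatypes)
        excluded = any(fnmatch.fnmatch(mt, p) for p in exclude_mediatypes)
        print(mt, included and not excluded)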
Code example #12
File: test_setup.py Project: aredwing/ebookmaker
    def setUp(self):
        config()
        Logger.set_log_level(options.verbose)
        options.types = options.types or ['all']
        options.types = CommonCode.add_dependencies(options.types,
                                                    DEPENDENCIES, BUILD_ORDER)
        debug("Building types: %s" % ' '.join(options.types))
Code example #13
File: EpubWriter.py Project: aredwing/ebookmaker
    def __unicode__(self):
        """ Serialize toc.ncx as unicode string. """
        ncx = self.ncx
        tocdepth = 1

        if self.toc:
            # normalize toc so that it starts with an h1 and doesn't jump down more than one
            # level at a time
            fixer = OutlineFixer()
            for t in self.toc:
                t[2] = fixer.level(t[2])

            # flatten toc if it contains only one top-level entry
            top_level_entries = sum(t[2] == 1 for t in self.toc)
            if top_level_entries < 2:
                for t in self.toc:
                    if t[2] != -1:
                        t[2] = max(1, t[2] - 1)

            tocdepth = max(t[2] for t in self.toc)

        head = ncx.head(
            ncx.meta(name='dtb:uid', content=self.dc.opf_identifier),
            ncx.meta(name='dtb:depth', content=str(tocdepth)),
            ncx.meta(name='dtb:generator', content=GENERATOR % VERSION),
            ncx.meta(name='dtb:totalPageCount', content='0'),
            ncx.meta(name='dtb:maxPageNumber', content='0'))

        doc_title = ncx.docTitle(ncx.text(self.dc.title))

        self.seen_urls = {}
        has_pages = False
        for url, dummy_title, depth in self.toc:
            # navPoints and pageTargets referencing the same element
            # must have the same playOrder
            if url not in self.seen_urls:
                self.seen_urls[url] = str(len(self.seen_urls) + 1)
            if depth == -1:
                has_pages = True

        params = {'version': '2005-1'}
        if self.dc.languages:
            params[NS.xml.lang] = self.dc.languages[0].id

        ncx = ncx.ncx(head, doc_title, self._make_navmap(self.toc), **params)

        if has_pages:
            ncx.append(self._make_pagelist(self.toc))

        # Ugly workaround for error: "Serialisation to unicode must not
        # request an XML declaration"

        toc_ncx = "%s\n\n%s" % (gg.XML_DECLARATION,
                                etree.tostring(ncx,
                                               doctype=gg.NCX_DOCTYPE,
                                               encoding=six.text_type,
                                               pretty_print=True))
        if options.verbose >= 3:
            debug(toc_ncx)
        return toc_ncx
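The "ugly workaround" exists because lxml refuses to emit an XML declaration when serializing to unicode, so the declaration has to be prepended by hand. A quick demonstration of the underlying lxml behavior:

    from lxml import etree

    root = etree.XML('<ncx/>')
    # unicode serialisation never includes an XML declaration ...
    print(etree.tostring(root, encoding=str))  # <ncx/>
    # ... and asking for one raises ValueError
    try:
        etree.tostring(root, encoding=str, xml_declaration=True)
    except ValueError as what:
        print(what)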
Code example #14
    def shipout_chunk(self, attribs, chunk_id=None, comment=None):
        """ ready chunk to be shipped """

        attribs = copy.copy(attribs)

        if self.chunk_size > MAX_CHUNK_SIZE:
            self.split(self.chunk, attribs)
            return

        url = normalize_uri(attribs.url)
        chunk_name = self._make_name(url)

        # the url of the whole page
        if url not in self.idmap:
            self.idmap[url] = chunk_name

        # fragments of the page
        for e in xpath(self.chunk, '//xhtml:*[@id]'):
            id_ = e.attrib['id']
            old_id = "%s#%s" % (url, id_)
            # key is unicode string,
            # value is uri-escaped byte string
            # if ids get cloned while chunking, map to the first one only
            if old_id not in self.idmap:
                self.idmap[old_id] = "%s#%s" % (chunk_name,
                                                urllib.parse.quote(id_))

        attribs.url = chunk_name
        attribs.id = chunk_id
        attribs.comment = comment
        self.chunks.append((self.chunk, attribs))

        debug("Adding chunk %s (%d bytes) %s" %
              (chunk_name, self.chunk_size, chunk_id))
Code example #15
File: Spider.py Project: verdetamadachi/ebookmaker
    def is_included_mediatype(self, attribs):
        """ Return True if this document is eligible. """

        if attribs.orig_mediatype is None:
            mediatype = MediaTypes.guess_type(attribs.url)
            if mediatype:
                attribs.orig_mediatype = attribs.HeaderElement(mediatype)
            else:
                return True  # always include if mediatype unknown

        mediatype = attribs.orig_mediatype.value

        included = any([
            fnmatch.fnmatch(mediatype, pattern)
            for pattern in self.include_mediatypes
        ])
        excluded = any([
            fnmatch.fnmatch(mediatype, pattern)
            for pattern in self.exclude_mediatypes
        ])

        if included and not excluded:
            return True

        if excluded:
            debug("Dropping excluded mediatype %s" % mediatype)
        if not included:
            debug("Dropping not included mediatype %s" % mediatype)

        return False
Code example #16
File: __init__.py Project: verdetamadachi/ebookmaker
    def get_charset_from_content_type(self):
        """ Get charset from server content-type. """

        charset = self.attribs.orig_mediatype.params.get('charset')
        if charset:
            debug('Got charset %s from server' % charset)
            return charset
        return None
Code example #17
    def unpack_media_handheld (sheet):
        """ Unpack a @media handheld rule. """
        for rule in sheet:
            if rule.type == rule.MEDIA_RULE:
                if rule.media.mediaText.find ('handheld') > -1:
                    debug ("Unpacking CSS @media handheld rule.")
                    rule.media.mediaText = 'all'
                    rule.insertRule (cssutils.css.CSSComment ('/* was @media handheld */'), 0)
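A hedged usage sketch, assuming the standard cssutils API (the stylesheet text is invented); the rule's media query is rewritten to 'all' and a CSS comment records what it was:

    import cssutils

    sheet = cssutils.parseString('@media handheld { p { font-size: small } }')
    unpack_media_handheld(sheet)  # the function defined in the example above
    print(sheet.cssText.decode('utf-8'))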
Code example #18
File: Spider.py Project: verdetamadachi/ebookmaker
    def is_included_relation(self, attribs):
        """ Return True if this document is eligible. """

        keep = attribs.rel.intersection(('coverpage', 'important'))
        if keep:
            debug("Not dropping after all because of rel.")

        return keep
Code example #19
def main():
    """ Main program. """

    try:
        config()
    except configparser.Error as what:
        error("Error in configuration file: %s", str(what))
        return 1

    Logger.set_log_level(options.verbose)

    options.types = options.types or ['all']
    options.types = CommonCode.add_dependencies(options.types, DEPENDENCIES,
                                                BUILD_ORDER)
    debug("Building types: %s" % ' '.join(options.types))

    ParserFactory.load_parsers()
    WriterFactory.load_writers()
    PackagerFactory.load_packagers()

    if options.is_job_queue:
        job_queue = cPickle.load(sys.stdin.buffer)  # read bytes
    else:
        options.dc = get_dc(options.url)
        job_queue = []
        output_files = dict()
        for type_ in options.types:
            job = CommonCode.Job(type_)
            job.url = options.url
            job.ebook = options.ebook
            job.dc = options.dc
            job.outputdir = options.outputdir
            job.outputfile = options.outputfile or make_output_filename(
                type_, options.dc)
            output_files[type_] = job.outputfile

            if job.type == 'kindle.images':
                job.url = os.path.join(job.outputdir,
                                       output_files['epub.images'])
            elif job.type == 'kindle.noimages':
                job.url = os.path.join(job.outputdir,
                                       output_files['epub.noimages'])

            job_queue.append(job)

    for j in job_queue:
        do_job(j)

    packager = PackagerFactory.create(options.packager, 'push')
    if packager:
        # HACK: the WWers only ever convert one ebook at a time
        job = job_queue[0]
        job.outputfile = '%d-final.zip' % (options.dc.project_gutenberg_id)
        packager.package(job)

    return 0
Code example #20
    def _make_coverpage_link(self, coverpage_url=None):
        """ Insert a <link rel="coverpage"> in the html head
        using the image specified by the --cover command-line option
        """

        if coverpage_url:
            for head in xpath(self.xhtml, "/xhtml:html/xhtml:head"):
                head.append(
                    parsers.em.link(rel='coverpage', href=coverpage_url))
                debug("Inserted link to coverpage %s." % coverpage_url)
            return
Code example #21
File: __init__.py Project: verdetamadachi/ebookmaker
    def guess_charset_from_body(self):
        """ Guess charset from text. """

        # http://chardet-matthickford.readthedocs.org/en/latest/usage.html

        result = chardet.detect(self.bytes_content())
        charset = result.get('encoding')
        if charset:
            debug('Got charset %s from text sniffing' % charset)
            return charset
        return None
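chardet.detect returns a dict with at least 'encoding' and 'confidence' keys; on short buffers the guess can be unreliable, which is why sniffing is the fallback of last resort here. A minimal usage sketch:

    import chardet

    result = chardet.detect('Grüße aus Köln'.encode('latin-1'))
    print(result.get('encoding'), result.get('confidence'))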
Code example #22
    def remove_coverpage(self, xhtml, url):
        """ Remove coverpage from flow.

        EPUB readers will display the coverpage from the manifest and
        if we don't remove it from flow it will be displayed twice.

        """
        for img in xpath(xhtml, '//xhtml:img[@src = $url]', url=url):
            debug("remove_coverpage: dropping <img> %s from flow" % url)
            img.drop_tree()
            return  # only the first one though
Code example #23
File: __init__.py Project: verdetamadachi/ebookmaker
    def bytes_content(self):
        """ Get document content as raw bytes. """

        if self.buffer is None:
            try:
                debug("Fetching %s ..." % self.attribs.url)
                self.buffer = self.fp.read()
                self.fp.close()
            except IOError as what:
                error(what)

        return self.buffer
Code example #24
    def get_charset_from_meta(self):
        """ Parse text for hints about charset. """
        # .. -*- coding: utf-8 -*-

        charset = None
        rst = self.bytes_content()

        match = REB_EMACS_CHARSET.search(rst)
        if match:
            charset = match.group(1).decode('ascii')
            debug('Got charset %s from emacs comment' % charset)

        return charset
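REB_EMACS_CHARSET itself is outside this excerpt; a plausible byte-level pattern for an emacs-style coding cookie might look like the following sketch (an assumption, not necessarily the project's actual regex):

    import re

    # hypothetical: match '-*- coding: utf-8 -*-' style cookies in raw bytes
    REB_EMACS_CHARSET = re.compile(br'-\*-.*coding:\s*(\S+?)\s*(?:;.*?)?-\*-')

    match = REB_EMACS_CHARSET.search(b'.. -*- coding: utf-8 -*-\n')
    if match:
        print(match.group(1).decode('ascii'))  # utf-8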
Code example #25
File: EpubWriter.py Project: cpeel/ebookmaker
    def fix_incompatible_css(sheet):
        """ Strip CSS properties and values that are not EPUB compatible. """

        # debug("enter fix_incompatible_css")

        for rule in sheet:
            if rule.type == rule.STYLE_RULE:
                for p in list(rule.style):
                    if p.name == 'float':
                        debug("Dropping property %s" % p.name)
                        rule.style.removeProperty('float')
                        rule.style.removeProperty('width')
                        rule.style.removeProperty('height')
                    elif p.name == 'position':
                        debug("Dropping property %s" % p.name)
                        rule.style.removeProperty('position')
                        rule.style.removeProperty('left')
                        rule.style.removeProperty('right')
                        rule.style.removeProperty('top')
                        rule.style.removeProperty('bottom')
                    elif p.name in ('background-image', 'background-position',
                                    'background-attachment',
                                    'background-repeat'):
                        debug("Dropping property %s" % p.name)
                        rule.style.removeProperty(p.name)
                    elif 'border' not in p.name and 'px' in p.value:
                        debug("Dropping property with px value %s" % p.name)
                        rule.style.removeProperty(p.name)
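A hedged usage sketch, assuming cssutils and calling the helper directly with a parsed sheet (the stylesheet text is invented):

    import cssutils

    sheet = cssutils.parseString(
        'p { float: left; width: 10em } h1 { position: absolute; top: 0 }')
    fix_incompatible_css(sheet)  # the function defined in the example above
    print(sheet.cssText.decode('utf-8'))  # float/position blocks stripped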
Code example #26
    def open_file (cls, orig_url, attribs):
        """ Open a local file for parsing. """

        url = orig_url
        if url.startswith ('file://'):
            fp = open (url[7:], "rb")
        else:
            fp = open (url, "rb")
        attribs.orig_mediatype = attribs.HeaderElement (MediaTypes.guess_type (url))

        debug ("... got mediatype %s from guess_type" % str (attribs.orig_mediatype))
        attribs.orig_url = orig_url
        attribs.url = url
        return fp
Code example #27
File: EpubWriter.py Project: aredwing/ebookmaker
    def fix_incompatible_css(sheet):
        """ Strip CSS properties and values that are not EPUB compatible. """

        cssclass = re.compile(r'\.(-?[_a-zA-Z]+[_a-zA-Z0-9-]*)')

        for rule in sheet:
            if rule.type == rule.STYLE_RULE:
                ruleclasses = list(
                    cssclass.findall(rule.selectorList.selectorText))
                for p in list(rule.style):
                    if p.name == 'float' and "x-ebookmaker" not in ruleclasses:
                        debug("Dropping property %s" % p.name)
                        rule.style.removeProperty('float')
                        rule.style.removeProperty('width')
                        rule.style.removeProperty('height')
                    elif p.name == 'position':
                        debug("Dropping property %s" % p.name)
                        rule.style.removeProperty('position')
                        rule.style.removeProperty('left')
                        rule.style.removeProperty('right')
                        rule.style.removeProperty('top')
                        rule.style.removeProperty('bottom')
                    elif p.name in ('background-image', 'background-position',
                                    'background-attachment',
                                    'background-repeat'):
                        debug("Dropping property %s" % p.name)
                        rule.style.removeProperty(p.name)
                    elif 'border' not in p.name and 'px' in p.value:
                        debug("Dropping property with px value %s" % p.name)
                        rule.style.removeProperty(p.name)
Code example #28
    def fix_style_elements(xhtml):
        """ Fix CSS style elements.  Make sure they are utf-8. """

        # debug ("enter fix_style_elements")

        for style in xpath(xhtml, "//xhtml:style"):
            p = parsers.CSSParser.Parser()
            p.parse_string(style.text)
            try:
                # pylint: disable=E1103
                style.text = p.sheet.cssText.decode('utf-8')
            except (ValueError, UnicodeError):
                debug("CSS:\n%s" % p.sheet.cssText)
                raise
Code example #29
def load_parsers ():
    """ See what types we can parse. """

    for fn in resource_listdir ('ebookmaker.parsers', ''):
        modulename, ext = os.path.splitext (fn)
        if ext == '.py':
            if modulename.endswith ('Parser'):
                module = __import__ ('ebookmaker.parsers.' + modulename, fromlist = [modulename])
                debug ("Loading parser from module: %s for mediatypes: %s" % (
                    modulename, ', '.join (module.mediatypes)))
                for mediatype in module.mediatypes:
                    parser_modules[mediatype] = module

    return parser_modules.keys ()
Code example #30
    def _full_parse_2(self, writer, destination, overrides):
        """ Full parser from pickled doctree.

        Doesn't work yet. It turned out pickling a doctree is much
        harder than I thought. """

        debug("Full-parsing %s" % self.attribs.url)

        source = docutils.io.StringInput(self.unicode_content())
        reader = docutils.readers.standalone.Reader()
        parser = gutenberg_parsers.Parser()

        doc = reader.read(
            source, parser,
            self.get_settings((reader, parser, writer), overrides))
        self.document1 = doc

        self.rewrite_links(partial(urllib.parse.urljoin, self.attribs.url))

        # make it picklable
        reporter = doc.reporter  #  = None
        # doc.reporter = None
        transformer = doc.transformer
        doc.settings = None
        from docutils.parsers.rst.directives.html import MetaBody

        #for metanode in doc.traverse (MetaBody.meta):
        for pending in doc.traverse(nodes.pending):
            # pending.transform = None
            # docutils' meta nodes aren't picklable because the class is nested
            # in pending['nodes']
            if 'nodes' in pending.details:
                if isinstance(pending.details['nodes'][0], MetaBody.meta):
                    pending.details['nodes'][0].__class__ = mynodes.meta
        from six.moves import cPickle as pickle
        pickled = pickle.dumps(doc)

        doc = pickle.loads(pickled)

        #doc.transformer.populate_from_components (
        #    (source, reader, parser, writer))

        doc.transformer = transformer
        doc.reporter = reporter
        doc.settings = self.get_settings((reader, parser, writer), overrides)

        doc.transformer.apply_transforms()

        return writer.write(doc, destination)