Example No. 1
    def _make_navmap(self, toc):
        """ Build the toc. """
        ncx = self.ncx

        root = ncx.navMap()
        last_np_with_depth = {0: root}

        count = 0
        for url, title, depth in toc:
            if depth > -1:
                count += 1
                np = ncx.navPoint(
                    ncx.navLabel(ncx.text(title)), ncx.content(src=url), **{
                        'id': "np-%d" % count,
                        'playOrder': self.seen_urls[url]
                    })

                try:
                    parent = last_np_with_depth[depth - 1]
                    parent.append(np)
                    last_np_with_depth[depth] = np
                except KeyError:
                    warning("Bogus depth %d in TOC" % depth)

        return root
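
The depth bookkeeping above can be hard to see through the NCX boilerplate; here is a minimal, hypothetical illustration of the same idea using plain dicts (not the project's API):

# Simplified, hypothetical sketch of the last_np_with_depth bookkeeping:
# every entry at depth d becomes a child of the most recent entry at depth d - 1.
toc = [('ch1.html', 'Chapter 1', 1),
       ('ch1-1.html', 'Section 1.1', 2),
       ('ch2.html', 'Chapter 2', 1)]

root = {'children': []}
last_with_depth = {0: root}
for url, title, depth in toc:
    node = {'url': url, 'title': title, 'children': []}
    last_with_depth[depth - 1]['children'].append(node)
    last_with_depth[depth] = node

# 'Section 1.1' is now nested under 'Chapter 1'; 'Chapter 2' is its sibling.
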
Example No. 2
    def __parse(self, html):
        # remove xml decl and doctype, we will add the correct one before serializing
        # html = re.compile ('^.*<html ', re.I | re.S).sub ('<html ', html)
        # FIXME: do not remove doctype because we need it to load the dtd

        # remove xml declaration because of parser error: "Unicode
        # strings with encoding declaration are not supported. Please
        # use bytes input or XML fragments without declaration."
        re_xml_decl = re.compile(r'^.*?<\?xml.*?\?>', re.S | re.U)
        html = re_xml_decl.sub('', html)
        try:
            return etree.fromstring(html,
                                    lxml.html.XHTMLParser(huge_tree=True),
                                    base_url=self.attribs.url)
        except etree.ParseError as what:
            # cannot try HTML parser because we depend on correct xhtml namespace
            m = re.search(r"Entity '([^']+)'", str(what))
            if m:
                warning("Missing entity: '%s'" % m.group(1))
            else:
                error("Failed to parse file because: %s" % what)
            m = re.search(r'line\s(\d+),', str(what))
            if m:
                lineno = int(m.group(1))
                error("Line %d: %s" % (lineno, html.splitlines()[lineno - 1]))
            raise
Example No. 3
    def is_included_mediatype(self, attribs):
        """ Return True if this document is eligible. """

        mediatype = self.get_mediatype(attribs)
        if not mediatype:
            warning('Mediatype could not be determined from url %s' %
                    attribs.url)
            return True  # always include if mediatype unknown

        included = any([
            fnmatch.fnmatch(mediatype, pattern)
            for pattern in self.include_mediatypes
        ])
        excluded = any([
            fnmatch.fnmatch(mediatype, pattern)
            for pattern in self.exclude_mediatypes
        ])

        if included and not excluded:
            return True

        if excluded:
            debug("Dropping excluded mediatype %s" % mediatype)
        if not included:
            debug("Dropping not included mediatype %s" % mediatype)

        return False
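
For context, a standalone sketch of the glob matching used above; the pattern lists here are hypothetical, not the project's defaults:

import fnmatch

include_mediatypes = ['text/*', 'image/*']   # hypothetical patterns
exclude_mediatypes = ['image/tiff']

mediatype = 'image/png'
included = any(fnmatch.fnmatch(mediatype, p) for p in include_mediatypes)
excluded = any(fnmatch.fnmatch(mediatype, p) for p in exclude_mediatypes)
print(included and not excluded)             # True: image/png is kept
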
Example No. 4
    def tidy (html):
        """ Pipe html thru w3c tidy. """

        html = parsers.RE_RESTRICTED.sub ('', html)
        html = RE_XMLDECL.sub ('', html)
        html = parsers.RE_HTML_CHARSET.sub ('; charset=utf-8', html)

        # convert to xhtml
        tidy = subprocess.Popen (
            ["tidy",
             "-utf8",
             "-clean",
             "--wrap",             "0",
             # "--drop-font-tags",   "y",
             # "--drop-proprietary-attributes", "y",
             # "--add-xml-space",    "y",
             "--output-xhtml",     "y",
             "--numeric-entities", "y",
             "--merge-divs",       "n", # keep poetry indentation
             "--merge-spans",      "n",
             "--add-xml-decl",     "n",
             "--doctype",          "strict",
             "--anchor-as-name",   "n",
             "--enclose-text",     "y" ],

            stdin = subprocess.PIPE,
            stdout = subprocess.PIPE,
            stderr = subprocess.PIPE)

        # print (html.encode ('utf-8'))
        # sys.exit ()

        (html, stderr) = tidy.communicate (html.encode ('utf-8'))

        regex = re.compile (r'(Info:|Warning:|Error:)\s*', re.I)

        # pylint: disable=E1103
        msg = stderr.decode (sys.stderr.encoding).strip ()
        for line in msg.splitlines ():
            match = regex.search (line)
            if match:
                sline = regex.sub ("", line)
                g = match.group (1).lower ()
                if g == 'info:':
                    info ("tidy: %s" % sline)
                elif g == 'warning:':
                    warning ("tidy: %s" % sline)
                elif g == 'error:':
                    error ("tidy: %s" % sline)
                else:
                    error (line)

        if tidy.returncode == 2:
            raise ValueError (stderr)

        return html.decode ('utf-8')
Example No. 5
    def add(zip_, filename, memberfilename):
        """ Add one file to the zip. """

        try:
            os.stat(filename)
            dummy_name, ext = os.path.splitext(filename)
            info('  Adding file: %s as %s' % (filename, memberfilename))
            zip_.write(
                filename, memberfilename, zipfile.ZIP_STORED
                if ext in ['.zip', '.png'] else zipfile.ZIP_DEFLATED)
        except OSError:
            warning('ZipPackager: Cannot add file %s', filename)
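
A hypothetical usage sketch, assuming the add() helper above and the project's info()/warning() log functions are in scope; already-compressed formats (.zip, .png) are stored rather than deflated again:

import zipfile

# Hypothetical file names, for illustration only.
with zipfile.ZipFile('ebook.zip', 'w') as zip_:
    add(zip_, 'build/book.html', 'book.html')          # deflated
    add(zip_, 'build/cover.png', 'images/cover.png')   # stored as-is (.png)
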
Example No. 6
def elect_coverpage(spider):
    """ Find first coverpage candidate that is not too small. """

    coverpage_found = False

    for p in spider.parsers:
        if 'coverpage' in p.attribs.rel:
            if coverpage_found:
                # keep the first one found, reset all others
                p.attribs.rel.remove('coverpage')
                continue
            if hasattr(p, 'get_image_dimen'):
                dimen = p.get_image_dimen()
                if (dimen[0] * dimen[1]) < COVERPAGE_MIN_AREA:
                    p.attribs.rel.remove('coverpage')
                    warning(
                        "removed coverpage candidate %s because too small (%d x %d)"
                        % (p.url, dimen[0], dimen[1]))
                    continue
            coverpage_found = True
Example No. 7
    def get_default_width(self, uri):
        """Calculate a sensible default width for images.

        Assume images are processed for a viewport 980px wide, the
        same as the iPhone browser assumes.

        """

        if (self.document.settings.get_image_size
                and six.callable(self.document.settings.get_image_size)):

            size = self.document.settings.get_image_size(uri)
            if size is not None:
                w = int(float(size[0]) / (980.0 * 0.8) * 100.0 + 0.5)
                width = "%d%%" % min(100, w)
                debug('Got dimension of image: %s: %s' % (uri, width))
                return width

        warning('Could not get dimension of image: %s' % uri)
        return '100%'
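
A worked example of the width heuristic above, with a hypothetical image size; the usable viewport is taken as 980 × 0.8 = 784 px:

# Hypothetical image reported as 588 px wide.
size = (588, 800)                                       # (width, height)
w = int(float(size[0]) / (980.0 * 0.8) * 100.0 + 0.5)   # 588 / 784 * 100 = 75
width = "%d%%" % min(100, w)
print(width)                                            # '75%'
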
Example No. 8
    def enqueue(self, queue, depth, attribs, is_doc):
        """ Enqueue url for parsing."""
        if is_doc:
            if not self.is_included_url(attribs):
                warning('External link in %s: %s' %
                        (attribs.referrer, attribs.url))
                return
            if depth >= self.max_depth:
                error('Omitted file %s due to depth > max_depth' % attribs.url)
                return
        if (not self.is_included_mediatype(attribs)
                and not self.is_included_relation(attribs)):
            return
        elif (not self.is_included_url(attribs)
                and not self.is_included_relation(attribs)):
            error(
                'Failed for embedded media in %s from disallowed location: %s'
                % (attribs.referrer, attribs.url))
            return

        queue.append((depth, attribs))
Example No. 9
def elect_coverpage(spider, url):
    """ Find first coverpage candidate that is not too small. """

    coverpage_found = False
    for p in spider.parsers:
        if 'coverpage' in p.attribs.rel:
            if coverpage_found:
                # keep the first one found, reset all others
                p.attribs.rel.remove('coverpage')
                continue
            if hasattr(p, 'get_image_dimen'):
                dimen = p.get_image_dimen()
                if (dimen[0] * dimen[1]) < COVERPAGE_MIN_AREA:
                    p.attribs.rel.remove('coverpage')
                    p_url = p.url if hasattr(p, 'url') else ''
                    warning(
                        "removed coverpage candidate %s because too small (%d x %d)"
                        % (p_url, dimen[0], dimen[1]))
                    continue
            coverpage_found = True
    if spider.parsers and not coverpage_found and options.generate_cover:
        if not hasattr(Cover, 'cairo'):
            warning('Cairo not installed, cover generation disabled')
            return
        if options.outputdir:
            dir = options.outputdir
        elif url.startswith('file://'):
            dir = os.path.dirname(os.path.abspath(url[7:]))
        elif url.startswith('file:'):
            dir = os.path.dirname(os.path.abspath(url[5:]))
        else:
            dir = os.path.dirname(os.path.abspath(url))
        debug('generating cover in %s' % dir)
        cover_url = generate_cover(dir)
        if cover_url:
            cover_parser = ParserFactory.ParserFactory.create(cover_url)
            cover_parser.attribs.rel.add('coverpage')
            cover_parser.pre_parse()
            spider.parsers.append(cover_parser)
Example No. 10
    def build(self, job):
        """ Build PDF file. """

        inputfilename = job.url
        outputfilename = os.path.join(os.path.abspath(job.outputdir),
                                      job.outputfile)

        debug("Inputfile: %s" % inputfilename)
        info("Creating PDF file: %s" % outputfilename)

        parser = ParserFactory.ParserFactory.create(inputfilename)

        if not hasattr(parser, 'rst2xetex'):
            warning('Skipping PDF Output because input mediatype is %s' %
                    parser.mediatype())
            raise SkipOutputFormat

        # Brain-dead xetex doesn't understand unix pipes
        # so we have to write a temp file

        texfilename = os.path.splitext(outputfilename)[0] + '.tex'
        auxfilename = os.path.splitext(outputfilename)[0] + '.aux'
        logfilename = os.path.splitext(outputfilename)[0] + '.log'

        try:
            os.remove(auxfilename)
        except OSError:
            pass

        tex = parser.rst2xetex(job)
        with open(texfilename, 'wb') as fp:
            fp.write(tex)

        try:
            cwd = os.getcwd()
            os.chdir(os.path.abspath(job.outputdir))

            _xetex = subprocess.Popen(
                [options.config.XELATEX,
                 "-output-directory", job.outputdir,
                 "-interaction", "nonstopmode",
                 texfilename],
                stdin=subprocess.PIPE,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE)
        except OSError as what:
            os.chdir(cwd)
            error("PDFWriter: %s %s" % (options.config.XELATEX, what))
            raise SkipOutputFormat

        (dummy_stdout, dummy_stderr) = _xetex.communicate()

        with open(logfilename, encoding='utf-8') as fp:
            for line in fp:
                line = line.strip()
                if 'Error:' in line:
                    error("xetex: %s" % line)
                if options.verbose >= 1:
                    if 'Warning:' in line:
                        warning("xetex: %s" % line)

        if options.verbose < 2:
            try:
                os.remove(texfilename)
                os.remove(logfilename)
                os.remove(auxfilename)
            except OSError:
                pass

        os.chdir(cwd)

        info("Done PDF file: %s" % outputfilename)
Example No. 11
    def build(self, job):
        """ Build kindle file from epub using amazon kindlegen. """

        info("Creating Kindle file: %s" %
             os.path.join(job.outputdir, job.outputfile))
        info("            ... from: %s" % job.url)

        try:
            cwd = os.getcwd()
            os.chdir(job.outputdir)

            kindlegen = subprocess.Popen(
                [options.config.MOBIGEN,
                 '-o', os.path.basename(job.outputfile),
                 job.url],
                stdin=subprocess.PIPE,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE)

        except OSError as what:
            os.chdir(cwd)
            error("KindleWriter: %s %s" % (options.config.MOBIGEN, what))
            raise SkipOutputFormat

        (stdout, stderr) = kindlegen.communicate()

        os.chdir(cwd)

        if kindlegen.returncode > 0:
            regex = re.compile(r'^(\w+)\(prcgen\):')

            # pylint: disable=E1103
            msg = stderr.rstrip()
            if msg:
                msg = msg.decode(sys.stderr.encoding)
                error(msg)
            msg = stdout.rstrip()
            msg = msg.decode(sys.stdout.encoding)
            for line in msg.splitlines():
                match = regex.match(line)
                if match:
                    sline = regex.sub("", line)
                    g = match.group(1).lower()
                    if g == 'info':
                        if sline == 'MOBI File generated with WARNINGS!':
                            # we knew that already
                            continue
                        # info ("kindlegen: %s" % sline)
                    elif g == 'warning':
                        if sline.startswith('Cover is too small'):
                            continue
                        if sline == 'Cover not specified':
                            continue
                        warning("kindlegen: %s" % sline)
                    elif g == 'error':
                        error("kindlegen: %s" % sline)
                    else:
                        error(line)

        info("Done Kindle file: %s" %
             os.path.join(job.outputdir, job.outputfile))
Example No. 12
    def groff(self, job, nroff, encoding='utf-8'):
        """ Process thru groff.

        Takes and returns unicode strings!

        """

        device = {
            'utf-8': 'utf8',
            'iso-8859-1': 'latin1',
            'us-ascii': 'ascii'
        }[encoding]

        nroff = nroff.encode(encoding)
        nrofffilename = os.path.join(
            os.path.abspath(job.outputdir),
            os.path.splitext(job.outputfile)[0] + '.nroff')

        # write nroff file for debugging
        if options.verbose >= 2:
            with open(nrofffilename, 'wb') as fp:
                fp.write(nroff)
        else:
            try:
                # remove debug files from previous runs
                os.remove(nrofffilename)
            except OSError:
                pass

        # call groff
        try:
            _groff = subprocess.Popen(
                [options.config.GROFF,
                 "-t",           # preprocess with tbl
                 "-K", device,   # input encoding
                 "-T", device],  # output device
                stdin=subprocess.PIPE,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE)
        except OSError:
            error("TxtWriter: executable not found: %s" % options.config.GROFF)
            raise SkipOutputFormat

        (txt, stderr) = _groff.communicate(nroff)

        # pylint: disable=E1103
        for line in stderr.splitlines():
            line = line.decode(sys.stderr.encoding)
            line = line.strip()
            if 'error' in line:
                error("groff: %s" % line)
            elif 'warn' in line:
                if options.verbose >= 1:
                    warning("groff: %s" % line)

        txt = txt.decode(encoding)
        return txt.translate(u2u)  # fix nroff idiosyncrasies
Example No. 13
class ParagraphMetrics(object):
    """ Calculates some metrics. """

    words = None

    try:
        fn = options.config.RHYMING_DICT
        if fn is not None:
            from six.moves import dbm_gnu
            words = dbm_gnu.open(fn)
    except ImportError:
        warning("No gnu dbm support found. Rhyming dictionary not used.")
    except dbm_gnu.error:
        warning("File containing rhyming dictionary not found: %s" % fn)

    def __init__(self, par):
        """ Calculate metrics about this paragraph. """
        lines = par.lines

        self.cnt_lines = len(lines)

        self.lengths = list(map(len, lines))
        self.centers = list(map(self._center, lines))
        self.indents = list(map(self._indent, lines))

        self.titles = list(map(self._istitle, lines))
        self.uppers = list(map(six.text_type.isupper, lines))

        # skip last line, which is almost always shorter
        self.length = MinMaxAvg(self.lengths[:-1])
        self.length.last = self.lengths[-1]
        # skip first line, which sometimes is indented on every par
        self.indent = MinMaxAvg(self.indents[1:])
        self.indent.first = self.indents[0]
        # all lines must be centered
        self.center = MinMaxAvg(self.centers)

        self.stems = None
        self.rhymes = None
        if self.words:
            self._init_rhymes(par)

    @staticmethod
    def _indent(line):
        """ Find out how much a line is left-indented. """
        return len(line) - len(line.lstrip())

    @staticmethod
    def _center(line):
        """ Find the center pos of a line. """
        len_ = len(line)
        indent = len_ - len(line.lstrip())
        return (len_ + indent) / 2

    @staticmethod
    def _istitle(line):
        """ Return True if the first char is uppercase. """
        m = re.search(r'\w', line)
        return m and m.group(0).isupper()

    def _rhyme_stemmer(self, line):
        """ Return the stem of the rhyme.

        See comments in: rhyme_compiler.py

        """

        line = re.sub(r'\W*$', '', line)

        words = re.split('[- ]+', line)
        try:
            last_word = words[-1].lower()
            return self.words[last_word.encode('utf-8')]
        except (IndexError, KeyError):
            last_word = re.sub('^(un|in)', '', last_word)
            try:
                return self.words[last_word.encode('utf-8')]
            except (IndexError, KeyError):
                return None

    def _init_rhymes(self, par):
        """ Get rhyme stems and see which lines do rhyme. """
        self.stems = list(map(self._rhyme_stemmer, par.lines))
        self.rhymes = len(self.stems) * [0]

        go_back = 8  # how many lines to consider

        for i, stem in enumerate(self.stems):
            if stem is None:
                continue
            try:
                j = self.stems.index(stem, max(0, i - go_back), i)
                self.rhymes[j] = 1
                self.rhymes[i] = 1
            except ValueError:
                pass
Example No. 14
    def strip_pagenumbers(xhtml, strip_classes):
        """ Strip dp page numbers.

        Rationale: DP implements page numbers either with float or
        with absolute positioning. Float is not supported by Kindle.
        Absolute positioning is not allowed in epub.

        If we'd leave these in, they would show up as numbers in the
        middle of the text.

        To still keep links working, we replace all page number
        contraptions we can find with empty <a>'s.

        """

        # look for elements with a class that is in strip_classes

        for class_ in strip_classes:
            xp = "//xhtml:*[@class and contains(concat(' ', normalize-space(@class), ' '), ' %s ')]" % class_

            count = 0
            for elem in xpath(xhtml, xp):

                # save textual content
                text = gg.normalize(
                    etree.tostring(elem,
                                   method="text",
                                   encoding=six.text_type,
                                   with_tail=False))
                if len(text) > 10:
                    # safeguard against removing things that are not pagenumbers
                    continue

                if not text:
                    text = elem.get('title')

                # look for id anywhere inside element
                id_ = elem.xpath(".//@id")

                # transmogrify element into empty <a>
                tail = elem.tail
                elem.clear()
                elem.tag = NS.xhtml.a
                if id_:
                    # some blockheaded PPers include more than
                    # one page number in one span. take the last id
                    # because the others represent empty pages.
                    elem.set('id', id_[-1])

                if class_ in DP_PAGENUMBER_CLASSES:
                    # mark element as rewritten pagenumber. we
                    # actually don't use this class for styling
                    # because it is on an empty element
                    elem.set('class', 'x-ebookmaker-pageno')

                if text:
                    elem.set('title', text)
                elem.tail = tail
                count += 1

                # The OPS Spec 2.0 is very clear: "Reading Systems
                # must be XML processors as defined in XML 1.1."
                # Nevertheless many browser-plugin ebook readers use
                # the HTML parsers of the browser.  But HTML parsers
                # don't grok the minimized form of empty elements.
                #
                # This will force lxml to output the non-minimized form
                # of the element.
                elem.text = ''

            if count:
                warning("%d elements having class %s have been rewritten." %
                        (count, class_))
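
A standalone sketch (hypothetical two-span document, namespaces omitted for brevity) of the class-matching XPath idiom used above: padding @class with spaces makes the test match whole class tokens only.

from lxml import etree

# Hypothetical document: one real page number span, one span whose class
# merely starts with the same prefix.
doc = etree.fromstring(
    '<div><span class="pagenum">[12]</span>'
    '<span class="pagenumber-note">note</span></div>')
hits = doc.xpath(
    "//*[@class and contains(concat(' ', normalize-space(@class), ' '), ' pagenum ')]")
print([e.text for e in hits])   # ['[12]'] -- the 'pagenumber-note' span is not matched
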
Example No. 15
    def recursive_parse(self, root_attribs):
        """ Do a recursive parse starting from url.

        Do a breadth-first traversal. Assuming the first page contains
        a linked TOC, this will get us a more natural ordering of the
        pages than a depth-first traversal.

        """

        queue = []

        debug("Start of retrieval")

        # enqueue root url

        self.enqueue(queue, 0, root_attribs, True)

        while queue:
            depth, attribs = queue.pop(0)

            url = self.redirect(attribs.url)
            if url in self.parsed_urls:
                continue

            parser = ParserFactory.create(url, attribs)

            # Maybe the url was redirected to something we already have?
            url = parser.attribs.url
            if url in self.parsed_urls:
                continue
            self.parsed_urls.add(url)

            self.add_redirection(parser.attribs.orig_url, url)
            parser.pre_parse()
            self.parsers.append(parser)

            # look for more documents to add to the queue
            debug("Requesting iterlinks for: %s ..." % url)
            for url, elem in parser.iterlinks():

                if elem.get('rel') == 'nofollow':
                    # remove link to content not followed
                    elem.tag = 'span'
                    elem.set('data-nofollow-href', elem.get('href'))
                    del elem.attrib['href']
                    del elem.attrib['rel']
                    warning('not followed: %s' % url)
                    continue

                new_attribs = parsers.ParserAttributes()
                new_attribs.url = urllib.parse.urldefrag(url)[0]
                new_attribs.referrer = parser.attribs.url

                for k, v in elem.items():
                    if k in ('id', 'title'):
                        setattr(new_attribs, k, v)
                    elif k == 'type':
                        new_attribs.orig_mediatype = new_attribs.HeaderElement.from_str(
                            v)
                    elif k == 'rel':
                        new_attribs.rel.update(v.lower().split())

                tag = elem.tag
                if tag == NS.xhtml.a:
                    if self.is_image(new_attribs) and self.is_included_url(new_attribs) and \
                            self.is_included_mediatype(new_attribs):
                        # need to wrap an image
                        wrapper_parser = parsers.WrapperParser.Parser(
                            new_attribs)
                        if wrapper_parser.attribs.url not in self.parsed_urls:
                            ParserFactory.parsers[
                                wrapper_parser.attribs.url] = wrapper_parser
                            self.parsers.append(wrapper_parser)
                            self.parsed_urls.add(wrapper_parser.attribs.url)
                        elem.set('href', wrapper_parser.attribs.url)
                        new_attribs.referrer = wrapper_parser.attribs.url
                        elem.set('title', wrapper_parser.attribs.title)
                        self.enqueue(queue, depth + 1, new_attribs, False)
                    else:
                        self.enqueue(queue, depth + 1, new_attribs, True)

                elif tag == NS.xhtml.img:
                    self.enqueue(queue, depth, new_attribs, False)
                elif tag == NS.xhtml.link:
                    if new_attribs.rel.intersection(
                        ('stylesheet', 'coverpage')):
                        self.enqueue(queue, depth, new_attribs, False)
                    else:
                        self.enqueue(queue, depth + 1, new_attribs, True)
                elif tag == NS.xhtml.object:
                    self.enqueue(queue, depth, new_attribs, False)

        debug("End of retrieval")

        # rewrite redirected urls
        if self.redirection_map:
            for parser in self.parsers:
                parser.remap_links(self.redirection_map)

        self.topological_sort()
Example No. 16
    def build(self, job):
        """ Build kindle file from epub using amazon kindlegen or calibre. """

        mobimaker = None  # default, so the check below cannot hit an unbound name
        if job.dc.languages:
            if job.dc.languages[0].id in no_kindlegen_langs:
                mobimaker = options.config.MOBILANG
            else:
                mobimaker = options.config.MOBIGEN
        if not mobimaker:
            info('no mobimaker available')
            return

        # kindlegen needs localized paths
        outputdir = os.path.abspath(job.outputdir)

        info("Creating Kindle file: %s" % os.path.join(outputdir, job.outputfile))
        info("            ... from: %s" % job.url)

        try:
            cwd = os.getcwd()
            os.chdir(outputdir)
            if 'ebook-convert' in mobimaker:
                kindlegen = subprocess.Popen(
                    [
                        mobimaker,
                        job.url,
                        os.path.basename(job.outputfile),
                        '--personal-doc="[EBOK]"',
                    ],
                    stdin=subprocess.PIPE,
                    stdout=subprocess.PIPE,
                    stderr=subprocess.PIPE
                )
            else:
                kindlegen = subprocess.Popen(
                    [
                        mobimaker,
                        '-o', os.path.basename(job.outputfile),
                        job.url
                    ],
                    stdin=subprocess.PIPE,
                    stdout=subprocess.PIPE,
                    stderr=subprocess.PIPE
                )

        except OSError as what:
            os.chdir(cwd)
            error("KindleWriter: %s %s" % (mobimaker, what))
            raise SkipOutputFormat

        (stdout, stderr) = kindlegen.communicate()

        os.chdir(cwd)

        if kindlegen.returncode > 0:
            regex = re.compile(r'^(\w+)\(prcgen\):')

            # pylint: disable=E1103
            msg = stderr.rstrip()
            if msg:
                msg = msg.decode(sys.stderr.encoding)
                error(msg)
            msg = stdout.rstrip()
            msg = msg.decode(sys.stdout.encoding)
            for line in msg.splitlines():
                match = regex.match(line)
                if match:
                    sline = regex.sub("", line)
                    g = match.group(1).lower()
                    if g == 'info':
                        if sline == 'MOBI File generated with WARNINGS!':
                            # we knew that already
                            continue
                        # info("kindlegen: %s" % sline)
                    elif g == 'warning':
                        if sline.startswith('Cover is too small'):
                            continue
                        if sline == 'Cover not specified':
                            continue
                        warning("kindlegen: %s" % sline)
                    elif g == 'error':
                        error("kindlegen: %s" % sline)
                    else:
                        error(line)

        info("Done Kindle file: %s" % os.path.join(outputdir, job.outputfile))
Example No. 17
def do_job(job):
    """ Do one job. """

    log_handler = None
    Logger.ebook = job.ebook
    if job.logfile:
        log_handler = open_log(
            os.path.join(os.path.abspath(job.outputdir), job.logfile))

    debug('=== Building %s ===' % job.type)
    start_time = datetime.datetime.now()
    try:
        if job.url:
            spider = Spider.Spider()
            dirpath = os.path.dirname(job.url)  # platform native path
            spider.include_urls += (options.include_urls
                                    or [parsers.webify_url(dirpath) + '/*']
                                    )  # use for parser only

            spider.include_mediatypes += options.include_mediatypes
            if job.subtype == '.images' or job.type == 'rst.gen':
                spider.include_mediatypes.append('image/*')

            spider.exclude_urls += options.exclude_urls

            spider.exclude_mediatypes += options.exclude_mediatypes

            spider.max_depth = options.max_depth or six.MAXSIZE

            for rewrite in options.rewrite:
                from_url, to_url = rewrite.split('>')
                spider.add_redirection(from_url, to_url)

            attribs = parsers.ParserAttributes()
            attribs.url = parsers.webify_url(job.url)
            attribs.id = 'start'

            if options.input_mediatype:
                attribs.orig_mediatype = attribs.HeaderElement.from_str(
                    options.input_mediatype)

            spider.recursive_parse(attribs)
            elect_coverpage(spider, job.url)
            job.url = spider.redirect(job.url)
            job.base_url = job.url
            job.spider = spider

        writer = WriterFactory.create(job.maintype)
        writer.build(job)

        if options.validate:
            writer.validate(job)

        packager = PackagerFactory.create(options.packager, job.type)
        if packager:
            packager.package(job)

        if job.type == 'html.images':
            # FIXME: hack for push packager
            options.html_images_list = list(job.spider.aux_file_iter())

    except SkipOutputFormat as what:
        warning("%s" % what)

    except Exception as what:
        exception("%s" % what)

    end_time = datetime.datetime.now()
    info(' %s made in %s' % (job.type, end_time - start_time))

    if log_handler:
        close_log(log_handler)
        log_handler = None