Exemple #1
0
def get_download_filename_from_response(response):
    '''
    Work out a file name for a downloaded resource.

    The ``filename*`` parameter of the Content-Disposition header is
    preferred, then the plain ``filename`` parameter. When neither yields a
    usable name, the last segment of the final URL path is returned instead.

    :param response: object providing ``geturl()`` and ``info()``; the
        result of ``info()`` must support ``get(name, default)``.
    :return: the percent-decoded file name (may be an empty string).

    Any exception raised while inspecting the response is printed with
    ``traceback.print_exc()`` and swallowed; whatever had been determined
    up to that point is returned.
    '''
    from polyglot.urllib import unquote, urlparse
    filename = last_part_name = ''
    try:
        purl = urlparse(response.geturl())
        last_part_name = unquote(purl.path.split('/')[-1])
        disposition = response.info().get('Content-disposition', '')
        if isinstance(disposition, bytes):
            disposition = disposition.decode('utf-8', 'replace')
        plain = extended = None
        # Look at every header parameter on its own, so that parameters
        # following the file name (size=..., creation-date=...) can never
        # be mistaken for it.
        for p in disposition.split(';'):
            key, sep, value = p.partition('=')
            if not sep:
                continue
            key, value = key.strip().lower(), value.strip()
            if key == 'filename*':
                if extended is None:
                    # Value has the form charset'language'encoded-name,
                    # only the part after the last apostrophe is the name
                    extended = value.split('\'')[-1]
            elif key == 'filename' and plain is None:
                plain = value
        candidate = extended or plain or ''
        # Slicing (rather than indexing) keeps an empty value from raising
        if candidate[:1] in ('\'', '"'):
            candidate = candidate[1:]
        if candidate[-1:] in ('\'', '"'):
            candidate = candidate[:-1]
        filename = unquote(candidate)
    except Exception:
        import traceback
        traceback.print_exc()
    return filename or last_part_name
Exemple #2
0
def get_download_filename_from_response(response):
    '''
    Work out a file name for a downloaded resource.

    The ``filename*`` parameter of the Content-Disposition header is
    preferred, then the plain ``filename`` parameter. When neither yields a
    usable name, the last segment of the final URL path is returned instead.

    :param response: object providing ``geturl()`` and ``info()``; the
        result of ``info()`` must support ``get(name, default)``.
    :return: the percent-decoded file name (may be an empty string).

    Any exception raised while inspecting the response is printed with
    ``traceback.print_exc()`` and swallowed; whatever had been determined
    up to that point is returned.
    '''
    from polyglot.urllib import unquote, urlparse
    filename = last_part_name = ''
    try:
        purl = urlparse(response.geturl())
        last_part_name = unquote(purl.path.split('/')[-1])
        disposition = response.info().get('Content-disposition', '')
        if isinstance(disposition, bytes):
            disposition = disposition.decode('utf-8', 'replace')
        plain = extended = None
        # Look at every header parameter on its own, so that parameters
        # following the file name (size=..., creation-date=...) can never
        # be mistaken for it.
        for p in disposition.split(';'):
            key, sep, value = p.partition('=')
            if not sep:
                continue
            key, value = key.strip().lower(), value.strip()
            if key == 'filename*':
                if extended is None:
                    # Value has the form charset'language'encoded-name,
                    # only the part after the last apostrophe is the name
                    extended = value.split('\'')[-1]
            elif key == 'filename' and plain is None:
                plain = value
        candidate = extended or plain or ''
        # Slicing (rather than indexing) keeps an empty value from raising
        if candidate[:1] in ('\'', '"'):
            candidate = candidate[1:]
        if candidate[-1:] in ('\'', '"'):
            candidate = candidate[:-1]
        filename = unquote(candidate)
    except Exception:
        import traceback
        traceback.print_exc()
    return filename or last_part_name
Exemple #3
0
 def __init__(self, href_or_path, basedir=getattr(os, 'getcwdu', os.getcwd)(), is_path=True):
     '''
     Record a reference to a resource given either as a filesystem path or
     as an href.

     :param href_or_path: a filesystem path when ``is_path`` is true,
         otherwise an href/URL.
     :param basedir: directory used to resolve relative paths. The default
         is the working directory at the time this function is defined
         (``os.getcwdu`` where it exists, ``os.getcwd`` otherwise, since
         ``os.getcwdu`` is not available on Python 3).
     :param is_path: whether ``href_or_path`` is a path rather than an href.

     After construction ``self.path`` holds an absolute path for local
     resources, ``self._href`` holds the original href for non-file URLs,
     ``self.fragment`` the decoded URL fragment and ``self.mime_type`` the
     guessed MIME type (``application/octet-stream`` when unknown).
     '''
     self._href = None
     self._basedir = basedir
     self.path = None
     self.fragment = ''
     try:
         self.mime_type = guess_type(href_or_path)[0]
     except Exception:
         # Guessing is best effort, fall through to the generic type
         self.mime_type = None
     if self.mime_type is None:
         self.mime_type = 'application/octet-stream'
     if is_path:
         path = href_or_path
         if not os.path.isabs(path):
             path = os.path.abspath(os.path.join(basedir, path))
         if isinstance(path, bytes):
             path = path.decode(sys.getfilesystemencoding())
         self.path = path
     else:
         url = urlparse(href_or_path)
         if url[0] not in ('', 'file'):
             # Any scheme other than file: is kept verbatim as an href
             self._href = href_or_path
         else:
             pc = url[2]
             # unquote() is applied to the UTF-8 bytes of the path and the
             # result decoded again
             if isinstance(pc, unicode_type):
                 pc = pc.encode('utf-8')
             pc = unquote(pc).decode('utf-8')
             self.path = os.path.abspath(os.path.join(basedir, pc.replace('/', os.sep)))
             self.fragment = unquote(url[-1])
Exemple #4
0
 def __init__(self, href_or_path, basedir=getattr(os, 'getcwdu', os.getcwd)(), is_path=True):
     '''
     Record a reference to a resource given either as a filesystem path or
     as an href.

     :param href_or_path: a filesystem path when ``is_path`` is true,
         otherwise an href/URL.
     :param basedir: directory used to resolve relative paths. The default
         is the working directory at the time this function is defined
         (``os.getcwdu`` where it exists, ``os.getcwd`` otherwise, since
         ``os.getcwdu`` is not available on Python 3).
     :param is_path: whether ``href_or_path`` is a path rather than an href.

     After construction ``self.path`` holds an absolute path for local
     resources, ``self._href`` holds the original href for non-file URLs,
     ``self.fragment`` the decoded URL fragment and ``self.mime_type`` the
     guessed MIME type (``application/octet-stream`` when unknown).
     '''
     self._href = None
     self._basedir = basedir
     self.path = None
     self.fragment = ''
     try:
         self.mime_type = guess_type(href_or_path)[0]
     except Exception:
         # Guessing is best effort, fall through to the generic type
         self.mime_type = None
     if self.mime_type is None:
         self.mime_type = 'application/octet-stream'
     if is_path:
         path = href_or_path
         if not os.path.isabs(path):
             path = os.path.abspath(os.path.join(basedir, path))
         if isinstance(path, bytes):
             path = path.decode(sys.getfilesystemencoding())
         self.path = path
     else:
         url = urlparse(href_or_path)
         if url[0] not in ('', 'file'):
             # Any scheme other than file: is kept verbatim as an href
             self._href = href_or_path
         else:
             pc = url[2]
             # unquote() is applied to the UTF-8 bytes of the path and the
             # result decoded again
             if isinstance(pc, unicode_type):
                 pc = pc.encode('utf-8')
             pc = unquote(pc).decode('utf-8')
             self.path = os.path.abspath(
                 os.path.join(basedir, pc.replace('/', os.sep)))
             self.fragment = unquote(url[-1])
Exemple #5
0
        def process_navpoint(np, dest):
            '''
            Add the entry described by `np` to `dest`, then recurse into the
            elements returned by ``np_path(np)``.

            An item is only added when `np` has both label text and a content
            element; otherwise the children are attached directly to `dest`.
            The play order comes from ``get_attr(np, 1)`` and falls back to 1
            when that value cannot be turned into an int.
            '''
            try:
                play_order = int(get_attr(np, 1))
            except:
                play_order = 1
            parent = dest
            labels = nl_path(np)
            if labels:
                # Only the first label is used; its text pieces are joined
                text = ''.join(
                    etree.tostring(piece, method='text', encoding='unicode',
                                   with_tail=False)
                    for piece in txt_path(labels[0]))
                content = content_path(np)
                if content and text:
                    purl = urlparse(content[0].get('src'))
                    href = unquote(purl[2])
                    fragment = unquote(purl[5])
                    parent = dest.add_item(href, fragment, text)
                    parent.play_order = play_order

            for child in np_path(np):
                process_navpoint(child, parent)
Exemple #6
0
        def process_navpoint(np, dest):
            '''
            Add the entry described by `np` to `dest`, then recurse into the
            elements returned by ``np_path(np)``.

            An item is only added when `np` has both label text and a content
            element; otherwise the children are attached directly to `dest`.
            The play order comes from ``get_attr(np, 1)`` and falls back to 1
            when that value cannot be turned into an int.
            '''
            try:
                play_order = int(get_attr(np, 1))
            except:
                play_order = 1
            parent = dest
            labels = nl_path(np)
            if labels:
                # Only the first label is used; its text pieces are joined
                text = u''.join(
                    etree.tostring(piece, method='text', encoding='unicode',
                                   with_tail=False)
                    for piece in txt_path(labels[0]))
                content = content_path(np)
                if content and text:
                    purl = urlparse(content[0].get('src'))
                    href = unquote(purl[2])
                    fragment = unquote(purl[5])
                    parent = dest.add_item(href, fragment, text)
                    parent.play_order = play_order

            for child in np_path(np):
                process_navpoint(child, parent)
def parse_uri(uri, parse_query=True):
    '''
    Break a Request-URI into ``(scheme, path, query)``.

    :param uri: the raw URI, handed to ``parse_request_uri``.
    :param parse_query: when true the query string is parsed with
        ``MultiDict.create_from_query_string``; otherwise ``query`` is None.
    :return: the ASCII-decoded scheme (left untouched when falsy), a tuple of
        the non-empty decoded path segments, and the parsed query.
    :raises HTTPSimpleResponse: with ``BAD_REQUEST`` when there is no path,
        the path contains ``#``, or the scheme, query string or path cannot
        be decoded.
    '''
    scheme, authority, path = parse_request_uri(uri)
    if path is None:
        raise HTTPSimpleResponse(http_client.BAD_REQUEST, "No path component")
    if b'#' in path:
        raise HTTPSimpleResponse(
            http_client.BAD_REQUEST, "Illegal #fragment in Request-URI.")

    if scheme:
        try:
            scheme = scheme.decode('ascii')
        except ValueError:
            raise HTTPSimpleResponse(
                http_client.BAD_REQUEST, 'Un-decodeable scheme')

    path, _, qs = path.partition(b'?')
    query = None
    if parse_query:
        try:
            query = MultiDict.create_from_query_string(qs)
        except Exception:
            raise HTTPSimpleResponse(
                http_client.BAD_REQUEST, 'Unparseable query string')

    # Pieces separated by whatever quoted_slash matches are decoded one by
    # one and re-joined with a literal %2F, so those separators survive the
    # split on '/' below and only become '/' inside a finished segment.
    try:
        decoded = [
            unquote(piece).decode('utf-8')
            for piece in quoted_slash.split(path)]
    except ValueError as e:
        raise HTTPSimpleResponse(http_client.BAD_REQUEST, as_unicode(e))
    segments = tuple(
        seg.replace('%2F', '/')
        for seg in '%2F'.join(decoded).split('/') if seg)

    return scheme, segments, query
Exemple #8
0
def parse_uri(uri, parse_query=True):
    '''
    Break a Request-URI into ``(scheme, path, query)``.

    :param uri: the raw URI, handed to ``parse_request_uri``.
    :param parse_query: when true the query string is parsed with
        ``MultiDict.create_from_query_string``; otherwise ``query`` is None.
    :return: the ASCII-decoded scheme (left untouched when falsy), a tuple of
        the non-empty decoded path segments, and the parsed query.
    :raises HTTPSimpleResponse: with ``BAD_REQUEST`` when there is no path,
        the path contains ``#``, or the scheme, query string or path cannot
        be decoded.
    '''
    scheme, authority, path = parse_request_uri(uri)
    if path is None:
        raise HTTPSimpleResponse(http_client.BAD_REQUEST, "No path component")
    if b'#' in path:
        raise HTTPSimpleResponse(http_client.BAD_REQUEST, "Illegal #fragment in Request-URI.")

    if scheme:
        try:
            scheme = scheme.decode('ascii')
        except ValueError:
            raise HTTPSimpleResponse(http_client.BAD_REQUEST, 'Un-decodeable scheme')

    path, _, qs = path.partition(b'?')
    query = None
    if parse_query:
        try:
            query = MultiDict.create_from_query_string(qs)
        except Exception:
            raise HTTPSimpleResponse(http_client.BAD_REQUEST, 'Unparseable query string')

    # Pieces separated by whatever quoted_slash matches are decoded one by
    # one and re-joined with a literal %2F, so those separators survive the
    # split on '/' below and only become '/' inside a finished segment.
    try:
        decoded = [unquote(piece).decode('utf-8') for piece in quoted_slash.split(path)]
    except ValueError as e:
        raise HTTPSimpleResponse(http_client.BAD_REQUEST, as_unicode(e))
    segments = tuple(seg.replace('%2F', '/') for seg in '%2F'.join(decoded).split('/') if seg)

    return scheme, segments, query
Exemple #9
0
def dnd_get_image(md, image_exts=None):
    '''
    Extract an image from the QMimeData object md.

    Sources are tried in this order: image data carried by md, an
    application/octet-stream payload, local files named by the URLs in md,
    and finally remote URLs.

    :param image_exts: allowed image file extensions, defaults to
                       image_extensions()
    :return: None, None if no image is found
             QPixmap, None if an image is found, the pixmap is guaranteed not null
             url, filename if a URL that points to an image is found
    '''
    if md.hasImage():
        # Only the first image/* format is tried
        image_format = next(
            (fmt for fmt in map(unicode_type, md.formats())
             if fmt.startswith('image/')), None)
        if image_format is not None:
            pmap = QPixmap()
            pmap.loadFromData(bytes(md.data(image_format)))
            if not pmap.isNull():
                return pmap, None
    if md.hasFormat('application/octet-stream'):
        pmap = QPixmap()
        pmap.loadFromData(bytes(md.data('application/octet-stream')))
        if not pmap.isNull():
            return pmap, None

    if image_exts is None:
        image_exts = image_extensions()

    # No embedded image data, fall back to URLs that point at an image
    urls = urls_from_md(md)
    paths = [path_from_qurl(u) for u in urls]
    # Local files with an image extension are tried first
    candidates = [
        xi for xi in paths
        if posixpath.splitext(unquote(xi))[1][1:].lower() in image_exts
        and os.path.exists(xi)
    ]
    p = QPixmap()
    for candidate in candidates:
        try:
            with open(candidate, 'rb') as f:
                p.loadFromData(f.read())
        except Exception:
            continue
        if not p.isNull():
            return p, None

    # Nothing usable locally, so look at remote URLs, Firefox drops first
    rurl, fname = get_firefox_rurl(md, image_exts)
    if rurl and fname:
        return rurl, fname
    # Otherwise the first remaining remote URL wins
    for remote_url, filename in remote_urls_from_qurl(urls, image_exts):
        return remote_url, filename

    return None, None
Exemple #10
0
def path_from_qurl(qurl):
    '''
    Return the percent-decoded path of `qurl` as text.

    The URL is encoded with the scheme, password, user info, port, authority,
    query and fragment removed. When ``iswindows`` is true a leading ``/`` is
    dropped from the result.
    '''
    formatting = (
        QUrl.PreferLocalFile | QUrl.RemoveScheme | QUrl.RemovePassword |
        QUrl.RemoveUserInfo | QUrl.RemovePort | QUrl.RemoveAuthority |
        QUrl.RemoveQuery | QUrl.RemoveFragment)
    encoded = bytes(qurl.toEncoded(formatting))
    path = as_unicode_polyglot(unquote(encoded), errors='replace')
    return path[1:] if iswindows and path.startswith('/') else path
Exemple #11
0
def dnd_get_image(md, image_exts=None):
    '''
    Extract an image from the QMimeData object md.

    Sources are tried in this order: image data carried by md, an
    application/octet-stream payload, local files named by the URLs in md,
    and finally remote URLs.

    :param image_exts: allowed image file extensions, defaults to
                       image_extensions()
    :return: None, None if no image is found
             QPixmap, None if an image is found, the pixmap is guaranteed not null
             url, filename if a URL that points to an image is found
    '''
    if md.hasImage():
        # Only the first image/* format is tried
        image_format = next(
            (fmt for fmt in map(unicode_type, md.formats())
             if fmt.startswith('image/')), None)
        if image_format is not None:
            pmap = QPixmap()
            pmap.loadFromData(bytes(md.data(image_format)))
            if not pmap.isNull():
                return pmap, None
    if md.hasFormat('application/octet-stream'):
        pmap = QPixmap()
        pmap.loadFromData(bytes(md.data('application/octet-stream')))
        if not pmap.isNull():
            return pmap, None

    if image_exts is None:
        image_exts = image_extensions()

    # No embedded image data, fall back to URLs that point at an image
    urls = urls_from_md(md)
    paths = [path_from_qurl(u) for u in urls]
    # Local files with an image extension are tried first
    candidates = [
        xi for xi in paths
        if posixpath.splitext(unquote(xi))[1][1:].lower() in image_exts
        and os.path.exists(xi)
    ]
    p = QPixmap()
    for candidate in candidates:
        try:
            with open(candidate, 'rb') as f:
                p.loadFromData(f.read())
        except Exception:
            continue
        if not p.isNull():
            return p, None

    # Nothing usable locally, so look at remote URLs, Firefox drops first
    rurl, fname = get_firefox_rurl(md, image_exts)
    if rurl and fname:
        return rurl, fname
    # Otherwise the first remaining remote URL wins
    for remote_url, filename in remote_urls_from_qurl(urls, image_exts):
        return remote_url, filename

    return None, None
Exemple #12
0
def download_one(tdir, timeout, progress_report, data_uri_map, url):
    '''
    Copy the resource at `url` into a new temporary file inside `tdir`.

    ``file:`` URLs are read from disk, ``data:`` URLs are decoded in memory
    and everything else is fetched with ``urlopen``.

    :param tdir: directory in which the temporary file is created.
    :param timeout: timeout passed to ``urlopen`` for remote URLs.
    :param progress_report: callable invoked as
        ``progress_report(url, 0, size)`` before copying starts and also
        handed to ``ProgressTracker``.
    :param data_uri_map: dict mapping decoded ``data:`` payloads to the file
        they were already saved to; updated when a new payload is stored.
    :param url: the URL to download.
    :return: ``(True, (url, filename, path, mime_type))`` on success, or
        ``(False, (url, error_message))`` if anything goes wrong. No
        exception derived from ``Exception`` escapes.
    '''
    try:
        purl = urlparse(url)
        data_url_key = None
        with NamedTemporaryFile(dir=tdir, delete=False) as df:
            if purl.scheme == 'file':
                path = unquote(purl.path)
                if iswindows and path.startswith('/'):
                    path = path[1:]
                src = lopen(path, 'rb')
                filename = os.path.basename(path)
                # Seek to the end to measure the file, then rewind
                sz = (src.seek(0, os.SEEK_END), src.tell(), src.seek(0))[1]
            elif purl.scheme == 'data':
                prefix, payload = purl.path.split(',', 1)
                parts = prefix.split(';')
                if parts and parts[-1].lower() == 'base64':
                    payload = re.sub(r'\s+', '', payload)
                    payload = from_base64_bytes(payload)
                else:
                    payload = payload.encode('utf-8')
                # The file name must be known before the duplicate check
                # below, because that check returns it.
                ext = 'unknown'
                for x in parts:
                    # A part with a slash and no '=' is taken as the MIME type
                    if '=' not in x and '/' in x:
                        exts = mimetypes.guess_all_extensions(x)
                        if exts:
                            ext = exts[0]
                            break
                filename = 'data-uri.' + ext
                seen_before = data_uri_map.get(payload)
                if seen_before is not None:
                    # Identical payload already saved, reuse that file
                    return True, (url, filename, seen_before,
                                  guess_type(seen_before))
                data_url_key = payload
                src = BytesIO(payload)
                sz = len(payload)
            else:
                src = urlopen(url, timeout=timeout)
                filename = get_filename(purl, src)
                sz = get_content_length(src)
            progress_report(url, 0, sz)
            dest = ProgressTracker(df, url, sz, progress_report)
            with closing(src):
                shutil.copyfileobj(src, dest)
            if data_url_key is not None:
                data_uri_map[data_url_key] = dest.name
            filename = sanitize_file_name(filename)
            mt = guess_type(filename)
            if mt in OEB_DOCS:
                raise ValueError(
                    'The external resource {} looks like a HTML document ({})'.
                    format(url, filename))
            if not mt or mt == 'application/octet-stream' or '.' not in filename:
                raise ValueError(
                    'The external resource {} is not of a known type'.format(
                        url))
            return True, (url, filename, dest.name, mt)
    except Exception as err:
        return False, (url, as_unicode(err))
Exemple #13
0
 def _cover_from_html(self, hcover):
     '''
     Render the HTML cover `hcover` to image data and register it in the
     manifest as a JPEG item.

     The book is written to a temporary directory and the file for
     ``hcover.href`` is passed to ``render_html_svg_workaround``. When that
     produces nothing, empty bytes are stored instead.

     :return: the manifest item that was added.
     '''
     from calibre.ebooks import render_html_svg_workaround
     with TemporaryDirectory('_html_cover') as tdir:
         OEBWriter()(self.oeb, tdir)
         cover_path = os.path.join(tdir, unquote(hcover.href))
         data = render_html_svg_workaround(cover_path, self.logger) or b''
     item_id, href = self.oeb.manifest.generate('cover', 'cover.jpg')
     return self.oeb.manifest.add(item_id, href, JPEG_MIME, data=data)
Exemple #14
0
 def add_links(self):
     '''
     Create a link annotation on the relevant page for every entry in
     ``self.links``.

     A link is local when it has no href or its resolved target file is a
     key of ``self.anchors``; it then points at the anchor for its fragment,
     falling back to the file's ``None`` anchor. Any other link becomes a
     URI action, provided it has a scheme other than ``file``. Links for
     which neither a destination nor an action could be set are reported via
     ``self.pdf.debug`` and not added.
     '''
     for (path, href, frag), page, rect in self.links:
         target_dir = os.path.dirname(path)
         combined_path = os.path.normcase(os.path.abspath(
             os.path.join(target_dir, *unquote(href).split('/'))))
         annot = Dictionary({
             'Type': Name('Annot'),
             'Subtype': Name('Link'),
             'Rect': rect,
             'Border': Array([0, 0, 0]),
         })
         if self.mark_links:
             annot.update({
                 'Border': Array([16, 16, 1]),
                 'C': Array([1.0, 0, 0])
             })
         if not href or combined_path in self.anchors:
             if href:
                 path = combined_path
             try:
                 annot['Dest'] = self.anchors[path][frag]
             except KeyError:
                 # Unknown anchor, try the file's default destination
                 try:
                     annot['Dest'] = self.anchors[path][None]
                 except KeyError:
                     pass
         else:
             url = (href + '#' + frag) if frag else href
             try:
                 purl = urlparse(url)
             except Exception:
                 self.pdf.debug('Ignoring unparseable URL: %r' % url)
                 continue
             if purl.scheme and purl.scheme != 'file':
                 # Do not try to normalize/quote/unquote this URL as if it
                 # has a query part, it will get corrupted
                 annot['A'] = Dictionary({
                     'Type': Name('Action'),
                     'S': Name('URI'),
                     'URI': String(url),
                 })
         if 'A' not in annot and 'Dest' not in annot:
             self.pdf.debug(
                 'Could not find destination for link: %s in file %s' %
                 (href, path))
             continue
         if 'Annots' not in page:
             page['Annots'] = Array()
         page['Annots'].append(self.pdf.objects.add(annot))
Exemple #15
0
 def _cover_from_html(self, hcover):
     '''
     Render the HTML cover `hcover` to image data and register it in the
     manifest as a JPEG item.

     The book is written to a temporary directory and the file for
     ``hcover.href`` is passed to ``render_html_svg_workaround``. When that
     produces nothing, empty bytes are stored instead, so that the item's
     data is binary in every case.

     :return: the manifest item that was added.
     '''
     from calibre.ebooks import render_html_svg_workaround
     with TemporaryDirectory('_html_cover') as tdir:
         writer = OEBWriter()
         writer(self.oeb, tdir)
         path = os.path.join(tdir, unquote(hcover.href))
         data = render_html_svg_workaround(path, self.logger)
         if not data:
             # Image data is binary, so the empty fallback is bytes too
             data = b''
     item_id, href = self.oeb.manifest.generate('cover', 'cover.jpg')
     item = self.oeb.manifest.add(item_id, href, JPEG_MIME, data=data)
     return item
Exemple #16
0
    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        '''
        Write `oeb_book` out as individual files below `output_path`.

        The directory is created if missing. The OPF, NCX and page-map
        documents returned by ``oeb_book.to_opf2`` are serialized first
        (the OPF after the cover workarounds, language-code migration and
        MIME type adjustment), then every manifest item is written to the
        location given by its href and unloaded from memory.

        `input_plugin` is not used here.
        '''
        from polyglot.urllib import unquote
        from lxml import etree

        self.log, self.opts = log, opts
        if not os.path.exists(output_path):
            os.makedirs(output_path)
        from calibre.ebooks.oeb.base import OPF_MIME, NCX_MIME, PAGE_MAP_MIME, OEB_STYLES
        from calibre.ebooks.oeb.normalize_css import condense_sheet
        with CurrentDir(output_path):
            results = oeb_book.to_opf2(page_map=True)
            for key in (OPF_MIME, NCX_MIME, PAGE_MAP_MIME):
                href, root = results.pop(key, [None, None])
                if root is None:
                    continue
                is_opf = key == OPF_MIME
                if is_opf:
                    # The workarounds are best effort: failures are logged
                    # and the conversion carries on
                    try:
                        self.workaround_nook_cover_bug(root)
                    except:
                        self.log.exception(
                            'Something went wrong while trying to'
                            ' workaround Nook cover bug, ignoring')
                    try:
                        self.workaround_pocketbook_cover_bug(root)
                    except:
                        self.log.exception(
                            'Something went wrong while trying to'
                            ' workaround Pocketbook cover bug, ignoring')
                    self.migrate_lang_code(root)
                    self.adjust_mime_types(root)
                raw = etree.tostring(
                    root, pretty_print=True, encoding='utf-8',
                    xml_declaration=True)
                if is_opf:
                    # Needed as I can't get lxml to output opf:role and
                    # not output <opf:metadata> as well
                    raw = re.sub(br'(<[/]{0,1})opf:', br'\1', raw)
                with lopen(href, 'wb') as f:
                    f.write(raw)

            for item in oeb_book.manifest:
                wants_condensing = (
                    not self.opts.expand_css
                    and item.media_type in OEB_STYLES
                    and hasattr(item.data, 'cssText')
                    and 'nook' not in self.opts.output_profile.short_name)
                if wants_condensing:
                    condense_sheet(item.data)
                path = os.path.abspath(unquote(item.href))
                parent = os.path.dirname(path)
                if not os.path.exists(parent):
                    os.makedirs(parent)
                with lopen(path, 'wb') as f:
                    f.write(item.bytes_representation)
                item.unload_data_from_memory(memory=path)
Exemple #17
0
    def get_toc(self):
        '''
        Read the table of contents from ``self.stream``.

        The 32-bit value at byte 24 gives the offset of the table. The table
        starts with an entry count, followed by one record per entry: a
        32-byte NUL-padded name and three 32-bit values (size, offset,
        flags).

        :return: an ``RBToc`` holding one ``RBToc.Item`` per entry.
        '''
        self.stream.seek(24)
        self.stream.seek(self.read_i32())
        entry_count = self.read_i32()

        toc = RBToc()
        for _ in range(entry_count):
            name = unquote(self.stream.read(32).strip(b'\x00'))
            size = self.read_i32()
            offset = self.read_i32()
            flags = self.read_i32()
            toc.append(RBToc.Item(name=name, size=size, offset=offset, flags=flags))

        return toc
Exemple #18
0
    def get_toc(self):
        '''
        Read the table of contents from ``self.stream``.

        The 32-bit value at byte 24 gives the offset of the table. The table
        starts with an entry count, followed by one record per entry: a
        32-byte NUL-padded name and three 32-bit values (size, offset,
        flags).

        :return: an ``RBToc`` holding one ``RBToc.Item`` per entry.
        '''
        self.stream.seek(24)
        self.stream.seek(self.read_i32())
        entry_count = self.read_i32()

        toc = RBToc()
        for _ in range(entry_count):
            name = unquote(self.stream.read(32).strip(b'\x00'))
            size = self.read_i32()
            offset = self.read_i32()
            flags = self.read_i32()
            toc.append(RBToc.Item(name=name, size=size, offset=offset, flags=flags))

        return toc
Exemple #19
0
    def read_from_opf(self, opfreader):
        '''
        Locate the table of contents referenced by an OPF and load it.

        The reference is taken from the ``toc`` attribute of the spine
        element, else from the guide reference of type ``toc``, else from
        the first manifest item whose href contains ``toc``. A reference of
        ``ncx`` or ``ncxtoc`` (case-insensitive) is resolved through
        ``opfreader.manifest.item`` and read with ``read_ncx_toc``; any other
        reference is treated as the location of an HTML file and read with
        ``read_html_toc``.

        Most failures are only reported with a printed warning.
        '''
        toc = opfreader.soup.find('spine', toc=True)
        if toc is not None:
            toc = toc['toc']
        if toc is None:
            try:
                toc = opfreader.soup.find('guide').find('reference',
                                                        attrs={'type':
                                                               'toc'})['href']
            except:
                # No usable guide entry: fall back to the first manifest
                # item whose href mentions "toc"
                for item in opfreader.manifest:
                    if 'toc' in item.href().lower():
                        toc = item.href()
                        break

        if toc is not None:
            if toc.lower() not in ('ncx', 'ncxtoc'):
                # Reduce the reference to its path component and turn it
                # into a filesystem path relative to self.base_path
                toc = urlparse(unquote(toc))[2]
                toc = toc.replace('/', os.sep)
                if not os.path.isabs(toc):
                    toc = os.path.join(self.base_path, toc)
                try:
                    if not os.path.exists(toc):
                        bn = os.path.basename(toc)
                        bn = bn.replace('_top.htm',
                                        '_toc.htm')  # Bug in BAEN OPF files
                        toc = os.path.join(os.path.dirname(toc), bn)

                    self.read_html_toc(toc)
                except:
                    print(
                        'WARNING: Could not read Table of Contents. Continuing anyway.'
                    )
            else:
                path = opfreader.manifest.item(toc.lower())
                # The lookup result may be an object with a .path attribute
                # or already be the path itself
                path = getattr(path, 'path', path)
                if path and os.access(path, os.R_OK):
                    try:
                        self.read_ncx_toc(path)
                    except Exception as err:
                        print('WARNING: Invalid NCX file:', err)
                    return
                # Manifest entry missing or unreadable: use the first .ncx
                # file found directly in the base directory, if any
                cwd = os.path.abspath(self.base_path)
                m = glob.glob(os.path.join(cwd, '*.ncx'))
                if m:
                    toc = m[0]
                    self.read_ncx_toc(toc)
Exemple #20
0
    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        '''
        Write `oeb_book` out as individual files below `output_path`.

        The directory is created if missing. The OPF, NCX and page-map
        documents returned by ``oeb_book.to_opf2`` are serialized first
        (the OPF after the cover workarounds and language-code migration),
        then every manifest item is written to the location given by its
        href and unloaded from memory.

        `input_plugin` is not used here.
        '''
        from polyglot.urllib import unquote
        from lxml import etree

        self.log, self.opts = log, opts
        if not os.path.exists(output_path):
            os.makedirs(output_path)
        from calibre.ebooks.oeb.base import OPF_MIME, NCX_MIME, PAGE_MAP_MIME, OEB_STYLES
        from calibre.ebooks.oeb.normalize_css import condense_sheet
        with CurrentDir(output_path):
            results = oeb_book.to_opf2(page_map=True)
            for key in (OPF_MIME, NCX_MIME, PAGE_MAP_MIME):
                href, root = results.pop(key, [None, None])
                if root is None:
                    continue
                is_opf = key == OPF_MIME
                if is_opf:
                    # The workarounds are best effort: failures are logged
                    # and the conversion carries on
                    try:
                        self.workaround_nook_cover_bug(root)
                    except:
                        self.log.exception(
                            'Something went wrong while trying to'
                            ' workaround Nook cover bug, ignoring')
                    try:
                        self.workaround_pocketbook_cover_bug(root)
                    except:
                        self.log.exception(
                            'Something went wrong while trying to'
                            ' workaround Pocketbook cover bug, ignoring')
                    self.migrate_lang_code(root)
                raw = etree.tostring(
                    root, pretty_print=True, encoding='utf-8',
                    xml_declaration=True)
                if is_opf:
                    # Needed as I can't get lxml to output opf:role and
                    # not output <opf:metadata> as well
                    raw = re.sub(br'(<[/]{0,1})opf:', br'\1', raw)
                with open(href, 'wb') as f:
                    f.write(raw)

            for item in oeb_book.manifest:
                wants_condensing = (
                    not self.opts.expand_css
                    and item.media_type in OEB_STYLES
                    and hasattr(item.data, 'cssText')
                    and 'nook' not in self.opts.output_profile.short_name)
                if wants_condensing:
                    condense_sheet(item.data)
                path = os.path.abspath(unquote(item.href))
                parent = os.path.dirname(path)
                if not os.path.exists(parent):
                    os.makedirs(parent)
                with open(path, 'wb') as f:
                    f.write(item.bytes_representation)
                item.unload_data_from_memory(memory=path)
Exemple #21
0
    def insert_cover(self):
        '''
        Make sure the book starts with a title page showing the cover.

        If the guide already has a ``titlepage`` entry, the manifest item it
        refers to is used. Otherwise a ``titlepage.xhtml`` item is generated
        from the SVG or non-SVG template (depending on
        ``self.no_svg_cover``), using the guide's cover or
        ``self.default_cover()``; nothing is done when no cover is
        available. The item is inserted at the start of the spine and the
        guide's ``cover`` and ``titlepage`` references, plus the TOC entry
        referring to the cover if any, are pointed at it.
        '''
        from calibre.ebooks.oeb.base import urldefrag
        g, m = self.oeb.guide, self.oeb.manifest
        item = None
        if 'titlepage' in g:
            item = self.oeb.manifest.hrefs[urldefrag(
                self.oeb.guide['titlepage'].href)[0]]
        else:
            href = g['cover'].href if 'cover' in g else self.default_cover()
            if href is None:
                return
            width, height = self.inspect_cover(href)
            if width == -1 or height == -1:
                self.log.warning('Failed to read cover dimensions')
                width, height = 600, 800
            # Fill the size placeholders of the SVG template
            substitutions = (
                ('__viewbox__', '0 0 %d %d' % (width, height)),
                ('__width__', unicode_type(width)),
                ('__height__', unicode_type(height)),
            )
            for placeholder, value in substitutions:
                self.svg_template = self.svg_template.replace(
                    placeholder, value)

            if self.no_svg_cover:
                templ = self.non_svg_template
            else:
                templ = self.svg_template
            tp = templ % unquote(href)
            item_id, href = m.generate('titlepage', 'titlepage.xhtml')
            item = m.add(item_id, href, guess_type('t.xhtml')[0],
                         data=etree.fromstring(tp))
        if item is not None:
            self.oeb.spine.insert(0, item, True)
            refs = self.oeb.guide.refs
            if 'cover' not in refs:
                self.oeb.guide.add('cover', 'Title Page', 'a')
            self.oeb.guide.refs['cover'].href = item.href
            if 'titlepage' in self.oeb.guide.refs:
                self.oeb.guide.refs['titlepage'].href = item.href
            titem = getattr(self.oeb.toc, 'item_that_refers_to_cover', None)
            if titem is not None:
                titem.href = item.href
Exemple #22
0
def parse_html_toc(data):
    """Yield one ``(href, fragment, text)`` tuple per link in an HTML ToC.

    ``data`` may be bytes (converted to text with ``xml_to_unicode``) or text.
    Every element whose local name is ``a`` and which carries an ``href``
    attribute is reported. ``href`` is the stripped path component of the
    unquoted link, ``fragment`` is the stripped fragment or ``None`` when the
    link has none, and ``text`` is the element serialized with lxml's
    ``method='text'``.
    """
    from html5_parser import parse
    from calibre.utils.cleantext import clean_xml_chars
    from lxml import etree
    if isinstance(data, bytes):
        data = xml_to_unicode(data, strip_encoding_pats=True, resolve_entities=True)[0]
    root = parse(clean_xml_chars(data), maybe_xhtml=True, keep_doctype=False, sanitize_names=True)
    for anchor in root.xpath('//*[@href and local-name()="a"]'):
        parsed = urlparse(unquote(anchor.get('href')))
        # Index 2 is the path component, index 5 the fragment
        raw_fragment = parsed[5]
        fragment = raw_fragment.strip() if raw_fragment else None
        href = parsed[2].strip()
        text = etree.tostring(anchor, method='text', encoding='unicode')
        yield href, fragment, text
Exemple #23
0
    def insert_cover(self):
        """Put a title page at the front of the spine and point the guide at it.

        When the guide already has a ``titlepage`` reference, the manifest item
        it refers to is reused. Otherwise a new ``titlepage.xhtml`` item is
        generated from the guide's ``cover`` reference (or, failing that, from
        ``self.default_cover()``), using the non-SVG template when
        ``self.no_svg_cover`` is set and the SVG template otherwise. Returns
        without changing anything when no cover href is available.
        """
        from calibre.ebooks.oeb.base import urldefrag
        guide, manifest = self.oeb.guide, self.oeb.manifest
        if 'titlepage' in guide:
            titlepage = manifest.hrefs[urldefrag(guide['titlepage'].href)[0]]
        else:
            cover_href = guide['cover'].href if 'cover' in guide else self.default_cover()
            if cover_href is None:
                return
            width, height = self.inspect_cover(cover_href)
            if width == -1 or height == -1:
                self.log.warning('Failed to read cover dimensions')
                width, height = 600, 800
            # Fill in the size placeholders; the filled-in template is stored
            # back on the instance
            substitutions = (
                ('__viewbox__', '0 0 %d %d'%(width, height)),
                ('__width__', str(width)),
                ('__height__', str(height)),
            )
            for placeholder, value in substitutions:
                self.svg_template = self.svg_template.replace(placeholder, value)

            if self.no_svg_cover:
                template = self.non_svg_template
            else:
                template = self.svg_template
            markup = template%unquote(cover_href)
            item_id, item_href = manifest.generate('titlepage', u'titlepage.xhtml')
            titlepage = manifest.add(item_id, item_href, guess_type('t.xhtml')[0],
                    data=etree.fromstring(markup))
        if titlepage is None:
            return

        self.oeb.spine.insert(0, titlepage, True)
        # The guide's cover (and titlepage, if present) must reference the page
        # that was just placed first in the spine
        if 'cover' not in guide.refs:
            guide.add('cover', 'Title Page', 'a')
        guide.refs['cover'].href = titlepage.href
        if 'titlepage' in guide.refs:
            guide.refs['titlepage'].href = titlepage.href
        toc_entry = getattr(self.oeb.toc, 'item_that_refers_to_cover', None)
        if toc_entry is not None:
            toc_entry.href = titlepage.href
 def build_node(current_node, parent=None):
     """Recursively render the children of ``current_node`` as nested lists.

     With no ``parent`` a fresh ``<ul>`` root is created; otherwise a ``<ul>``
     is appended to ``parent`` only when ``current_node`` has children. Each
     child becomes an ``<li>`` holding an ``<a>`` whose href is the child's
     unquoted href made relative to the directory of ``ref_url`` and whose
     text is the child's title with whitespace runs collapsed to one space.
     Returns the list element the children were appended to.
     """
     def as_text(value):
         # bytes values are decoded as UTF-8, anything else passes through
         return value.decode('utf-8') if isinstance(value, bytes) else value

     if parent is None:
         parent = etree.Element('ul')
     elif len(current_node.nodes):
         parent = element(parent, ('ul'))
     for child in current_node.nodes:
         entry = element(parent, 'li')
         target = as_text(relpath(abspath(unquote(child.href)), dirname(ref_url)))
         anchor = element(entry, 'a', href=clean_xml_chars(target))
         label = as_text(child.title)
         if label:
             label = re.sub(r'\s+', ' ', label)
         anchor.text = clean_xml_chars(label)
         build_node(child, entry)
     return parent
Exemple #25
0
 def build_node(current_node, parent=None):
     """Append the ToC subtree below ``current_node`` to an HTML list.

     A new ``<ul>`` root is created when ``parent`` is None; when a parent is
     given, a nested ``<ul>`` is added to it only if ``current_node`` has
     child nodes. Every child node yields an ``<li>`` with an ``<a>`` link
     (href relative to the directory of ``ref_url``, text taken from the
     node title) and is then processed recursively with that ``<li>`` as
     parent. Returns the list element that received the children.
     """
     if parent is None:
         parent = etree.Element('ul')
     elif len(current_node.nodes):
         parent = element(parent, ('ul'))
     for sub_node in current_node.nodes:
         list_item = element(parent, 'li')
         link_target = relpath(abspath(unquote(sub_node.href)), dirname(ref_url))
         if isinstance(link_target, bytes):
             link_target = link_target.decode('utf-8')
         anchor = element(list_item, 'a', href=clean_xml_chars(link_target))
         caption = sub_node.title
         if isinstance(caption, bytes):
             caption = caption.decode('utf-8')
         # Collapse runs of whitespace in non-empty titles
         caption = re.sub(r'\s+', ' ', caption) if caption else caption
         anchor.text = clean_xml_chars(caption)
         build_node(sub_node, list_item)
     return parent
Exemple #26
0
    def read_from_opf(self, opfreader):
        """Find the table of contents referenced by an OPF and load it.

        The ToC reference is looked up, in order, in the ``toc`` attribute of
        the ``<spine>`` element, in the guide's ``reference`` of type ``toc``,
        and finally in the first manifest item whose href contains ``toc``.

        A reference other than ``ncx``/``ncxtoc`` is treated as the path of an
        HTML ToC (resolved against ``self.base_path`` when relative) and read
        with ``self.read_html_toc``. Otherwise the reference is looked up as a
        manifest id and read with ``self.read_ncx_toc``; if that item is not
        readable, the first ``*.ncx`` file in ``self.base_path`` is used.

        Failures while reading an HTML ToC or an NCX file from the manifest
        only print a warning. Only ``Exception`` subclasses are handled that
        way, so ``KeyboardInterrupt`` and ``SystemExit`` propagate.

        :param opfreader: OPF reader exposing ``soup`` and ``manifest``
        """
        toc = opfreader.soup.find('spine', toc=True)
        if toc is not None:
            toc = toc['toc']
        if toc is None:
            try:
                toc = opfreader.soup.find('guide').find('reference', attrs={'type':'toc'})['href']
            except Exception:
                # No usable guide reference: fall back to the first manifest
                # item whose href mentions "toc"
                for item in opfreader.manifest:
                    if 'toc' in item.href().lower():
                        toc = item.href()
                        break

        if toc is None:
            return

        if toc.lower() not in ('ncx', 'ncxtoc'):
            # Index 2 of the parse result is the path component
            toc = urlparse(unquote(toc))[2]
            toc = toc.replace('/', os.sep)
            if not os.path.isabs(toc):
                toc = os.path.join(self.base_path, toc)
            try:
                if not os.path.exists(toc):
                    bn  = os.path.basename(toc)
                    bn  = bn.replace('_top.htm', '_toc.htm')  # Bug in BAEN OPF files
                    toc = os.path.join(os.path.dirname(toc), bn)

                self.read_html_toc(toc)
            except Exception:
                print('WARNING: Could not read Table of Contents. Continuing anyway.')
            return

        path = opfreader.manifest.item(toc.lower())
        path = getattr(path, 'path', path)
        if path and os.access(path, os.R_OK):
            try:
                self.read_ncx_toc(path)
            except Exception as err:
                print('WARNING: Invalid NCX file:', err)
            return
        # The manifest entry is missing or unreadable: use the first NCX file
        # found in the base directory, if any
        cwd = os.path.abspath(self.base_path)
        m = glob.glob(os.path.join(cwd, '*.ncx'))
        if m:
            toc = m[0]
            self.read_ncx_toc(toc)
Exemple #27
0
 def _build_manifest(self):
     """Serialize the manifest and store it as the ``/manifest`` file.

     Items are grouped into four states: ``linear`` and ``nonlinear`` for
     items with a spine position, ``css`` for style sheets and ``images``
     for image types; anything else is left out. The output starts with a
     packed ``<Bc`` header, followed for each state by a 32-bit item count
     and, for each item in ``sort_key`` order, a 32-bit offset, the id, the
     unquoted href and the media type (each preceded by its length written
     as a single UTF-8 encoded code point) and a terminating NUL byte.
     As a side effect every written item gets an ``offset`` attribute.
     """
     states = ['linear', 'nonlinear', 'css', 'images']
     manifest = {state: [] for state in states}
     for item in self._oeb.manifest.values():
         if item.spine_position is not None:
             bucket = 'linear' if item.linear else 'nonlinear'
         elif item.media_type in OEB_STYLES:
             bucket = 'css'
         elif item.media_type in LIT_IMAGES:
             bucket = 'images'
         else:
             continue
         manifest[bucket].append(item)

     data = io.BytesIO()
     data.write(pack('<Bc', 1, b'\\'))
     offset = 0
     for state in states:
         in_spine = state in ('linear', 'nonlinear')
         ordered = sorted(manifest[state], key=attrgetter('sort_key'))
         data.write(pack('<I', len(ordered)))
         for item in ordered:
             item_id, media_type = item.id, item.media_type
             if media_type in OEB_DOCS:
                 # Needs to have 'html' in media-type
                 media_type = XHTML_MIME
             elif media_type in OEB_STYLES:
                 media_type = CSS_MIME
             href = unquote(item.href)
             # Only spine items carry a running offset; the rest get 0
             item.offset = offset if in_spine else 0
             data.write(pack('<I', item.offset))
             fields = []
             for value in (item_id, href, media_type):
                 fields.append(codepoint_to_chr(len(value)))
                 fields.append(unicode_type(value))
             data.write(''.join(fields).encode('utf-8'))
             data.write(b'\0')
             offset += item.size
     self._add_file('/manifest', data.getvalue())
Exemple #28
0
def dnd_get_local_image_and_pixmap(md, image_exts=None):
    """Extract an image from the mime data object ``md``.

    Sources are tried in this order: the ``image/*`` formats of ``md`` (only
    when ``md.hasImage()``), its ``application/octet-stream`` payload, and
    finally existing local files referenced by the URLs in ``md`` whose
    extension is listed in ``image_exts`` (``image_extensions()`` when not
    given). Files that cannot be read are skipped.

    :return: ``(pixmap, raw_bytes)`` for the first source that loads into a
             non-null QPixmap, ``(None, None)`` if there is none
    """
    def as_pixmap(raw):
        pixmap = QPixmap()
        pixmap.loadFromData(raw)
        return pixmap

    if md.hasImage():
        for fmt in map(unicode_type, md.formats()):
            if not fmt.startswith('image/'):
                continue
            raw = bytes(md.data(fmt))
            pixmap = as_pixmap(raw)
            if not pixmap.isNull():
                return pixmap, raw
    if md.hasFormat('application/octet-stream'):
        raw = bytes(md.data('application/octet-stream'))
        pixmap = as_pixmap(raw)
        if not pixmap.isNull():
            return pixmap, raw

    if image_exts is None:
        image_exts = image_extensions()

    def has_image_extension(path):
        return posixpath.splitext(unquote(path))[1][1:].lower() in image_exts

    # No image data in md itself: look for URLs pointing to local image files
    paths = [path_from_qurl(u) for u in urls_from_md(md)]
    candidates = [p for p in paths if has_image_extension(p)]
    candidates = [p for p in candidates if os.path.exists(p)]
    for candidate in candidates:
        try:
            with open(candidate, 'rb') as f:
                raw = f.read()
        except Exception:
            continue
        pixmap = as_pixmap(raw)
        if not pixmap.isNull():
            return pixmap, raw

    return None, None
Exemple #29
0
    def rewrite_links(self, url):
        """Return the replacement for ``url`` according to ``self.map``.

        The URL is split from its fragment, made absolute relative to
        ``self.current_item`` and normalized. If the result is a key of
        ``self.map``, the new target is looked up by fragment (``None`` when
        the link has no fragment) and returned relative to the current item,
        with the original fragment re-attached. In every other case,
        including a ``ValueError`` from resolving or normalizing the URL,
        ``url`` is returned unchanged.
        """
        href, frag = urldefrag(url)
        try:
            # ValueError here means either an unparsable URL (abshref) or an
            # href with non utf-8 quoting (urlnormalize)
            href = urlnormalize(self.current_item.abshref(href))
        except ValueError:
            return url
        if href not in self.map:
            return url
        replacement = self.map[href][frag or None]
        replacement = self.current_item.relhref(replacement)
        if frag:
            replacement = '#'.join((unquote(replacement), frag))
        return replacement
Exemple #30
0
def dnd_get_files(md, exts, allow_all_extensions=False, filter_exts=()):
    '''
    Find the files referenced by the QMimeData object md whose extension is
    one of the extensions in exts. Extensions in filter_exts are always
    rejected; with allow_all_extensions any other non-empty extension is
    accepted.

    :return: None, None if no file is found
             [paths], None if local files are found
             [urls], [filenames] if URLs that point to files are found
    '''
    urls = urls_from_md(md)

    def extension_accepted(path):
        ext = posixpath.splitext(path)[1][1:].lower()
        if ext in filter_exts:
            return False
        if allow_all_extensions and ext:
            return True
        return ext in exts

    # Local files take precedence over anything remote
    candidates = [path_from_qurl(u) for u in urls]
    candidates = [c for c in candidates if extension_accepted(unquote(c))]
    local_files = [c for c in candidates if os.path.exists(c)]
    if local_files:
        return local_files, None

    # Nothing local: a Firefox drag is checked before the remaining URLs
    rurl, fname = get_firefox_rurl(md, exts)
    if rurl and fname:
        return [rurl], [fname]

    remote = list(remote_urls_from_qurl(urls, exts))
    if remote:
        return [r for r, _ in remote], [name for _, name in remote]

    return None, None
Exemple #31
0
def dnd_get_files(md, exts, allow_all_extensions=False, filter_exts=()):
    '''
    Collect the files in the QMimeData object md that have an extension
    listed in exts (or, when allow_all_extensions is set, any non-empty
    extension), never accepting an extension listed in filter_exts.

    :return: None, None if no file is found
             [paths], None if local files are found
             [urls], [filenames] if URLs that point to files are found
    '''
    urls = urls_from_md(md)

    def wanted(path):
        ext = posixpath.splitext(path)[1][1:].lower()
        excluded = ext in filter_exts
        if allow_all_extensions and ext and not excluded:
            return True
        return ext in exts and not excluded

    # Existing local files win over remote URLs
    local_files = []
    for path in [path_from_qurl(u) for u in urls]:
        if wanted(unquote(path)):
            local_files.append(path)
    local_files = [path for path in local_files if os.path.exists(path)]
    if local_files:
        return local_files, None

    # No local files; see if this is a drag from Firefox
    rurl, fname = get_firefox_rurl(md, exts)
    if rurl and fname:
        return [rurl], [fname]

    # Otherwise go through all remaining URLs
    rurls, filenames = [], []
    for rurl, fname in remote_urls_from_qurl(urls, exts):
        rurls.append(rurl)
        filenames.append(fname)
    if not rurls:
        return None, None
    return rurls, filenames
Exemple #32
0
    def read_html_toc(self, toc):
        """Add the links found in the HTML file ``toc`` as ToC items.

        ``self.base_path`` is set to the directory containing ``toc``. For
        every ``<a>`` tag with an ``href`` attribute, the path and fragment
        of the unquoted link and the concatenated, stripped text of the tag
        are passed to ``self.add_item``, unless an item with the same href
        and fragment is already present in ``self.flat()``.

        :param toc: path of the HTML file to read
        """
        self.base_path = os.path.dirname(toc)
        # Read through a context manager so the file handle is closed as soon
        # as the data is in memory instead of whenever it is garbage collected
        with open(toc, 'rb') as f:
            raw = f.read()
        soup = BeautifulSoup(raw, convertEntities=BeautifulSoup.HTML_ENTITIES)
        for a in soup.findAll('a'):
            if not a.has_key('href'):  # noqa
                continue
            purl = urlparse(unquote(a['href']))
            # Index 2 is the path component, index 5 the fragment
            href = purl[2].strip()
            fragment = purl[5].strip() if purl[5] else None

            txt = ''.join([unicode_type(s).strip() for s in a.findAll(text=True)])
            # self.flat() is re-evaluated for every link so that items added
            # earlier in this loop are taken into account
            already_present = any(
                i.href == href and i.fragment == fragment for i in self.flat())
            if not already_present:
                self.add_item(href, fragment, txt)
Exemple #33
0
    def __init__(self, title=_('Choose Files'),
                 filters=[],
                 add_all_files_filter=True,
                 parent=None,
                 modal=True,
                 name='',
                 mode=QFileDialog.ExistingFiles,
                 default_dir=u'~',
                 no_save_dir=False,
                 combine_file_and_saved_dir=False
                 ):
        """Show a file dialog and record what the user picked.

        The dialog is shown immediately. Afterwards ``self.selected_files``
        holds the chosen paths (an empty list if nothing usable was chosen)
        and ``self.accepted`` tells whether anything was selected.

        :param title: window title, also used to build the dialog name
        :param filters: sequence of ``(text, extensions)`` pairs
        :param add_all_files_filter: append an "All files (*)" filter
        :param parent: parent widget passed to the Qt dialog
        :param modal: not used by this method
        :param name: used together with ``title`` to build the dialog name
                     under which the last used directory is stored
        :param mode: a ``QFileDialog`` file mode selecting the kind of dialog
        :param default_dir: directory used when no stored directory applies
        :param no_save_dir: neither read nor store the last used directory
        :param combine_file_and_saved_dir: start in the stored directory
               joined with the base name of ``default_dir``
        """
        from calibre.gui2 import dynamic, sanitize_env_vars
        QObject.__init__(self)

        # Filter string has the form "Text (*.a *.b);;Other (*.c);;All files (*)"
        filter_specs = []
        for text, extensions in (filters or ()):
            patterns = ['*'+(e if e.startswith('.') else '.'+e) for e in extensions]
            filter_specs.append('%s (%s)'%(text, ' '.join(patterns)))
        if add_all_files_filter or not filter_specs:
            filter_specs.append('All files (*)')
        ftext = ';;'.join(filter_specs)

        self.dialog_name = dialog_name(name, title)
        self.selected_files = None
        self.fd = None

        if combine_file_and_saved_dir:
            bn = os.path.basename(default_dir)
            prev = dynamic.get(self.dialog_name, expanduser(u'~'))
            if not os.path.exists(prev):
                prev = expanduser(u'~')
            elif os.path.isfile(prev):
                prev = os.path.dirname(prev)
            initial_dir = os.path.join(prev, bn)
        elif no_save_dir:
            initial_dir = expanduser(default_dir)
        else:
            initial_dir = dynamic.get(self.dialog_name, expanduser(default_dir))
        if not isinstance(initial_dir, string_or_bytes):
            initial_dir = expanduser(default_dir)
        # A save dialog that does not use the stored directory as-is may start
        # at a path that does not exist yet
        missing_ok = mode == QFileDialog.AnyFile and (
            no_save_dir or combine_file_and_saved_dir)
        if not initial_dir or not (os.path.exists(initial_dir) or missing_ok):
            initial_dir = select_initial_dir(initial_dir)

        self.selected_files = []
        # Must be read before the environment is sanitized below
        use_native_dialog = 'CALIBRE_NO_NATIVE_FILEDIALOGS' not in os.environ
        with sanitize_env_vars():
            opts = QFileDialog.Option()
            if not use_native_dialog:
                opts |= QFileDialog.DontUseNativeDialog
            if mode == QFileDialog.AnyFile:
                result = QFileDialog.getSaveFileName(
                    parent, title, initial_dir, ftext, "", opts)
                if result and result[0]:
                    self.selected_files.append(result[0])
            elif mode == QFileDialog.ExistingFile:
                result = QFileDialog.getOpenFileName(
                    parent, title, initial_dir, ftext, "", opts)
                if result and result[0] and os.path.exists(result[0]):
                    self.selected_files.append(result[0])
            elif mode == QFileDialog.ExistingFiles:
                result = QFileDialog.getOpenFileNames(
                    parent, title, initial_dir, ftext, "", opts)
                if result and result[0]:
                    for chosen in result[0]:
                        chosen = unicode_type(chosen)
                        if not chosen:
                            continue
                        if not os.path.exists(chosen):
                            # QFileDialog for some reason quotes spaces
                            # on linux if there is more than one space in a row
                            chosen = unquote(chosen)
                        if chosen and os.path.exists(chosen):
                            self.selected_files.append(chosen)
            else:
                if mode == QFileDialog.Directory:
                    opts |= QFileDialog.ShowDirsOnly
                chosen = unicode_type(QFileDialog.getExistingDirectory(
                    parent, title, initial_dir, opts))
                if os.path.exists(chosen):
                    self.selected_files.append(chosen)

        if self.selected_files:
            self.selected_files = [unicode_type(q) for q in self.selected_files]
            # Remember the directory of the first selection for the next time
            saved_loc = self.selected_files[0]
            if os.path.isfile(saved_loc):
                saved_loc = os.path.dirname(saved_loc)
            if not no_save_dir:
                dynamic[self.dialog_name] = saved_loc
        self.accepted = bool(self.selected_files)
Exemple #34
0
    def __init__(
        self, title=_('Choose Files'),
        filters=[],
        add_all_files_filter=True,
        parent=None,
        modal=True,
        name='',
        mode=QFileDialog.ExistingFiles,
        default_dir=u'~',
        no_save_dir=False,
        combine_file_and_saved_dir=False
    ):
        """Run a file dialog and store the selection on the instance.

        The dialog is shown right away, inside the ``adapt_menubar`` context
        obtained from the main GUI (a ``Dummy()`` when there is no GUI).
        On return ``self.selected_files`` lists the chosen paths (empty if
        nothing usable was chosen) and ``self.accepted`` is True when that
        list is non-empty.

        :param title: window title, also used to build the dialog name
        :param filters: sequence of ``(text, extensions)`` pairs
        :param add_all_files_filter: append an "All files (*)" filter
        :param parent: parent widget passed to the Qt dialog
        :param modal: not used by this method
        :param name: used together with ``title`` to build the dialog name
                     under which the last used directory is stored
        :param mode: a ``QFileDialog`` file mode selecting the kind of dialog
        :param default_dir: directory used when no stored directory applies
        :param no_save_dir: neither read nor store the last used directory
        :param combine_file_and_saved_dir: start in the stored directory
               joined with the base name of ``default_dir``
        """
        from calibre.gui2 import dynamic, sanitize_env_vars
        from calibre.gui2.ui import get_gui
        gui = get_gui()
        if gui is not None:
            adapt_menubar = gui.bars_manager.adapt_menu_bar_for_dialog
        else:
            adapt_menubar = Dummy()
        QObject.__init__(self)

        # One "Text (*.ext ...)" entry per filter, joined with ";;"
        entries = []
        if filters:
            for text, extensions in filters:
                globs = ' '.join(
                    '*'+(ext if ext.startswith('.') else '.'+ext)
                    for ext in extensions)
                entries.append('%s (%s)'%(text, globs))
        if add_all_files_filter or not entries:
            entries.append('All files (*)')
        ftext = ';;'.join(entries)

        self.dialog_name = dialog_name(name, title)
        self.selected_files = None
        self.fd = None

        if combine_file_and_saved_dir:
            bn = os.path.basename(default_dir)
            prev = dynamic.get(self.dialog_name, os.path.expanduser(u'~'))
            if not os.path.exists(prev):
                prev = os.path.expanduser(u'~')
            elif os.path.isfile(prev):
                prev = os.path.dirname(prev)
            initial_dir = os.path.join(prev, bn)
        elif no_save_dir:
            initial_dir = os.path.expanduser(default_dir)
        else:
            initial_dir = dynamic.get(
                self.dialog_name, os.path.expanduser(default_dir))
        if not isinstance(initial_dir, string_or_bytes):
            initial_dir = os.path.expanduser(default_dir)
        if not initial_dir:
            initial_dir = select_initial_dir(initial_dir)
        elif not os.path.exists(initial_dir):
            # Only a save dialog that bypasses or combines the stored
            # directory may start at a path that does not exist
            if not (mode == QFileDialog.AnyFile and (
                    no_save_dir or combine_file_and_saved_dir)):
                initial_dir = select_initial_dir(initial_dir)

        self.selected_files = []
        # Must be read before the environment is sanitized below
        use_native_dialog = 'CALIBRE_NO_NATIVE_FILEDIALOGS' not in os.environ
        with sanitize_env_vars():
            opts = QFileDialog.Option()
            if not use_native_dialog:
                opts |= QFileDialog.DontUseNativeDialog
            if mode == QFileDialog.AnyFile:
                with adapt_menubar:
                    answer = QFileDialog.getSaveFileName(
                        parent, title, initial_dir, ftext, "", opts)
                if answer and answer[0]:
                    self.selected_files.append(answer[0])
            elif mode == QFileDialog.ExistingFile:
                with adapt_menubar:
                    answer = QFileDialog.getOpenFileName(
                        parent, title, initial_dir, ftext, "", opts)
                if answer and answer[0] and os.path.exists(answer[0]):
                    self.selected_files.append(answer[0])
            elif mode == QFileDialog.ExistingFiles:
                with adapt_menubar:
                    answer = QFileDialog.getOpenFileNames(
                        parent, title, initial_dir, ftext, "", opts)
                if answer and answer[0]:
                    for picked in answer[0]:
                        picked = unicode_type(picked)
                        if not picked:
                            continue
                        if not os.path.exists(picked):
                            # QFileDialog for some reason quotes spaces
                            # on linux if there is more than one space in a row
                            picked = unquote(picked)
                        if picked and os.path.exists(picked):
                            self.selected_files.append(picked)
            else:
                if mode == QFileDialog.Directory:
                    opts |= QFileDialog.ShowDirsOnly
                with adapt_menubar:
                    picked = unicode_type(QFileDialog.getExistingDirectory(
                        parent, title, initial_dir, opts))
                if os.path.exists(picked):
                    self.selected_files.append(picked)

        if self.selected_files:
            self.selected_files = [unicode_type(q) for q in self.selected_files]
            # Remember the directory of the first selection for the next time
            saved_loc = self.selected_files[0]
            if os.path.isfile(saved_loc):
                saved_loc = os.path.dirname(saved_loc)
            if not no_save_dir:
                dynamic[self.dialog_name] = saved_loc
        self.accepted = bool(self.selected_files)
Exemple #35
0
    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        """Export ``oeb_book`` as a zip archive of templated HTML pages.

        An index page and a ``<name>_files`` directory holding one rendered
        page per spine item plus all other manifest items are built inside a
        temporary directory and then added to the zip file at
        ``output_path``. When ``opts.extract_to`` is set, that directory is
        deleted if it exists, recreated, and the archive is extracted into
        it. The temporary directory is removed at the end.

        :param oeb_book: the book to export; its manifest, spine, toc and
                         metadata are read
        :param output_path: path of the zip file to create
        :param input_plugin: not used by this method
        :param opts: options; ``template_html_index``, ``template_html``,
                     ``template_css`` (paths of custom templates or None)
                     and ``extract_to`` are read
        :param log: logger, stored as ``self.log``
        """
        from lxml import etree
        from calibre.utils import zipfile
        from templite import Templite
        from polyglot.urllib import unquote
        from calibre.ebooks.html.meta import EasyMeta

        def read_template(custom_path, default_resource):
            # A user supplied template wins over the bundled resource; the
            # file is closed as soon as it has been read
            if custom_path is not None:
                with open(custom_path, 'rb') as tf:
                    return tf.read()
            return P(default_resource, data=True)

        # read template files
        template_html_index_data = read_template(
            opts.template_html_index, 'templates/html_export_default_index.tmpl')
        template_html_data = read_template(
            opts.template_html, 'templates/html_export_default.tmpl')
        template_css_data = read_template(
            opts.template_css, 'templates/html_export_default.css')

        template_html_index_data = template_html_index_data.decode('utf-8')
        template_html_data = template_html_data.decode('utf-8')
        template_css_data = template_css_data.decode('utf-8')

        self.log  = log
        self.opts = opts
        meta = EasyMeta(oeb_book.metadata)

        tempdir = os.path.realpath(PersistentTemporaryDirectory())
        output_file = os.path.join(tempdir,
                basename(re.sub(r'\.zip', '', output_path)+'.html'))
        output_dir = re.sub(r'\.html', '', output_file)+'_files'

        if not exists(output_dir):
            os.makedirs(output_dir)

        css_path = output_dir+os.sep+'calibreHtmlOutBasicCss.css'
        with open(css_path, 'wb') as f:
            f.write(template_css_data.encode('utf-8'))

        # index page, linking to the first spine item
        with open(output_file, 'wb') as f:
            html_toc = self.generate_html_toc(oeb_book, output_file, output_dir)
            templite = Templite(template_html_index_data)
            nextLink = oeb_book.spine[0].href
            nextLink = relpath(output_dir+os.sep+nextLink, dirname(output_file))
            cssLink = relpath(abspath(css_path), dirname(output_file))
            tocUrl = relpath(output_file, dirname(output_file))
            t = templite.render(has_toc=bool(oeb_book.toc.count()),
                    toc=html_toc, meta=meta, nextLink=nextLink,
                    tocUrl=tocUrl, cssLink=cssLink,
                    firstContentPageLink=nextLink)
            if isinstance(t, unicode_type):
                t = t.encode('utf-8')
            f.write(t)

        with CurrentDir(output_dir):
            for item in oeb_book.manifest:
                path = abspath(unquote(item.href))
                item_dir = dirname(path)
                if not exists(item_dir):
                    os.makedirs(item_dir)
                if item.spine_position is not None:
                    # Spine items only get an empty file here; their rendered
                    # content is written by the loop over the spine below
                    with open(path, 'wb') as f:
                        pass
                else:
                    with open(path, 'wb') as f:
                        f.write(item.bytes_representation)
                    item.unload_data_from_memory(memory=path)

            for item in oeb_book.spine:
                path = abspath(unquote(item.href))
                item_dir = dirname(path)
                root = item.data.getroottree()

                # get & clean HTML <HEAD>-data
                head = root.xpath('//h:head', namespaces={'h': 'http://www.w3.org/1999/xhtml'})[0]
                head_content = etree.tostring(head, pretty_print=True, encoding='unicode')
                head_content = re.sub(r'\<\/?head.*\>', '', head_content)
                head_content = re.sub(re.compile(r'\<style.*\/style\>', re.M|re.S), '', head_content)
                head_content = re.sub(r'<(title)([^>]*)/>', r'<\1\2></\1>', head_content)

                # get & clean HTML <BODY>-data
                body = root.xpath('//h:body', namespaces={'h': 'http://www.w3.org/1999/xhtml'})[0]
                ebook_content = etree.tostring(body, pretty_print=True, encoding='unicode')
                ebook_content = re.sub(r'\<\/?body.*\>', '', ebook_content)
                ebook_content = re.sub(r'<(div|a|span)([^>]*)/>', r'<\1\2></\1>', ebook_content)

                # generate link to next page
                if item.spine_position+1 < len(oeb_book.spine):
                    nextLink = oeb_book.spine[item.spine_position+1].href
                    nextLink = relpath(abspath(nextLink), item_dir)
                else:
                    nextLink = None

                # generate link to previous page
                if item.spine_position > 0:
                    prevLink = oeb_book.spine[item.spine_position-1].href
                    prevLink = relpath(abspath(prevLink), item_dir)
                else:
                    prevLink = None

                cssLink = relpath(abspath(css_path), item_dir)
                tocUrl = relpath(output_file, item_dir)
                firstContentPageLink = oeb_book.spine[0].href

                # render template; the ToC is passed as a callable so it is
                # only generated if the template asks for it
                templite = Templite(template_html_data)
                toc = lambda: self.generate_html_toc(oeb_book, path, output_dir)
                t = templite.render(ebookContent=ebook_content,
                        prevLink=prevLink, nextLink=nextLink,
                        has_toc=bool(oeb_book.toc.count()), toc=toc,
                        tocUrl=tocUrl, head_content=head_content,
                        meta=meta, cssLink=cssLink,
                        firstContentPageLink=firstContentPageLink)

                # write html to file
                with open(path, 'wb') as f:
                    f.write(t.encode('utf-8'))
                item.unload_data_from_memory(memory=path)

        zfile = zipfile.ZipFile(output_path, "w")
        try:
            zfile.add_dir(output_dir, basename(output_dir))
            zfile.write(output_file, basename(output_file), zipfile.ZIP_DEFLATED)

            if opts.extract_to:
                if os.path.exists(opts.extract_to):
                    shutil.rmtree(opts.extract_to)
                os.makedirs(opts.extract_to)
                zfile.extractall(opts.extract_to)
                self.log('Zip file extracted to', opts.extract_to)
        finally:
            # Close the archive even if packaging or extraction failed
            zfile.close()

        # cleanup temp dir
        shutil.rmtree(tempdir)
Exemple #36
0
    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        from lxml import etree
        from calibre.utils import zipfile
        from templite import Templite
        from polyglot.urllib import unquote
        from calibre.ebooks.html.meta import EasyMeta

        # read template files
        if opts.template_html_index is not None:
            template_html_index_data = open(opts.template_html_index,
                                            'rb').read()
        else:
            template_html_index_data = P(
                'templates/html_export_default_index.tmpl', data=True)

        if opts.template_html is not None:
            template_html_data = open(opts.template_html, 'rb').read()
        else:
            template_html_data = P('templates/html_export_default.tmpl',
                                   data=True)

        if opts.template_css is not None:
            template_css_data = open(opts.template_css, 'rb').read()
        else:
            template_css_data = P('templates/html_export_default.css',
                                  data=True)

        template_html_index_data = template_html_index_data.decode('utf-8')
        template_html_data = template_html_data.decode('utf-8')
        template_css_data = template_css_data.decode('utf-8')

        self.log = log
        self.opts = opts
        meta = EasyMeta(oeb_book.metadata)

        tempdir = os.path.realpath(PersistentTemporaryDirectory())
        output_file = os.path.join(
            tempdir, basename(re.sub(r'\.zip', '', output_path) + '.html'))
        output_dir = re.sub(r'\.html', '', output_file) + '_files'

        if not exists(output_dir):
            os.makedirs(output_dir)

        css_path = output_dir + os.sep + 'calibreHtmlOutBasicCss.css'
        with open(css_path, 'wb') as f:
            f.write(template_css_data.encode('utf-8'))

        with open(output_file, 'wb') as f:
            html_toc = self.generate_html_toc(oeb_book, output_file,
                                              output_dir)
            templite = Templite(template_html_index_data)
            nextLink = oeb_book.spine[0].href
            nextLink = relpath(output_dir + os.sep + nextLink,
                               dirname(output_file))
            cssLink = relpath(abspath(css_path), dirname(output_file))
            tocUrl = relpath(output_file, dirname(output_file))
            t = templite.render(has_toc=bool(oeb_book.toc.count()),
                                toc=html_toc,
                                meta=meta,
                                nextLink=nextLink,
                                tocUrl=tocUrl,
                                cssLink=cssLink,
                                firstContentPageLink=nextLink)
            if isinstance(t, unicode_type):
                t = t.encode('utf-8')
            f.write(t)

        with CurrentDir(output_dir):
            for item in oeb_book.manifest:
                path = abspath(unquote(item.href))
                dir = dirname(path)
                if not exists(dir):
                    os.makedirs(dir)
                if item.spine_position is not None:
                    with open(path, 'wb') as f:
                        pass
                else:
                    with open(path, 'wb') as f:
                        f.write(str(item))
                    item.unload_data_from_memory(memory=path)

            for item in oeb_book.spine:
                path = abspath(unquote(item.href))
                dir = dirname(path)
                root = item.data.getroottree()

                # get & clean HTML <HEAD>-data
                head = root.xpath(
                    '//h:head',
                    namespaces={'h': 'http://www.w3.org/1999/xhtml'})[0]
                head_content = etree.tostring(head,
                                              pretty_print=True,
                                              encoding='utf-8')
                head_content = re.sub(r'\<\/?head.*\>', '', head_content)
                head_content = re.sub(
                    re.compile(r'\<style.*\/style\>', re.M | re.S), '',
                    head_content)
                head_content = re.sub(r'<(title)([^>]*)/>', r'<\1\2></\1>',
                                      head_content)

                # get & clean HTML <BODY>-data
                body = root.xpath(
                    '//h:body',
                    namespaces={'h': 'http://www.w3.org/1999/xhtml'})[0]
                ebook_content = etree.tostring(body,
                                               pretty_print=True,
                                               encoding='utf-8')
                ebook_content = re.sub(r'\<\/?body.*\>', '', ebook_content)
                ebook_content = re.sub(r'<(div|a|span)([^>]*)/>',
                                       r'<\1\2></\1>', ebook_content)

                # generate link to next page
                if item.spine_position + 1 < len(oeb_book.spine):
                    nextLink = oeb_book.spine[item.spine_position + 1].href
                    nextLink = relpath(abspath(nextLink), dir)
                else:
                    nextLink = None

                # generate link to previous page
                if item.spine_position > 0:
                    prevLink = oeb_book.spine[item.spine_position - 1].href
                    prevLink = relpath(abspath(prevLink), dir)
                else:
                    prevLink = None

                cssLink = relpath(abspath(css_path), dir)
                tocUrl = relpath(output_file, dir)
                firstContentPageLink = oeb_book.spine[0].href

                # render template
                templite = Templite(template_html_data)
                toc = lambda: self.generate_html_toc(oeb_book, path, output_dir
                                                     )
                t = templite.render(ebookContent=ebook_content,
                                    prevLink=prevLink,
                                    nextLink=nextLink,
                                    has_toc=bool(oeb_book.toc.count()),
                                    toc=toc,
                                    tocUrl=tocUrl,
                                    head_content=head_content,
                                    meta=meta,
                                    cssLink=cssLink,
                                    firstContentPageLink=firstContentPageLink)

                # write html to file
                with open(path, 'wb') as f:
                    f.write(t)
                item.unload_data_from_memory(memory=path)

        zfile = zipfile.ZipFile(output_path, "w")
        zfile.add_dir(output_dir, basename(output_dir))
        zfile.write(output_file, basename(output_file), zipfile.ZIP_DEFLATED)

        if opts.extract_to:
            if os.path.exists(opts.extract_to):
                shutil.rmtree(opts.extract_to)
            os.makedirs(opts.extract_to)
            zfile.extractall(opts.extract_to)
            self.log('Zip file extracted to', opts.extract_to)

        zfile.close()

        # cleanup temp dir
        shutil.rmtree(tempdir)
Exemple #37
0
 def add_links(self):
     """Turn every entry of ``self.links`` into a PDF link annotation.

     Each entry is unpacked as ``((path, href, frag), page, rect)``.

     A link is treated as internal when ``href`` is empty or when it
     resolves, relative to the directory of ``path``, to a key of
     ``self.anchors``. Internal links receive a ``Dest`` looked up in
     ``self.anchors``, trying ``frag`` first and the ``None`` entry as a
     fallback. Every other link receives a ``URI`` action, but only if
     its URL carries a scheme other than ``file``.

     An annotation that ended up with neither a destination nor an
     action is not attached to the page; a debug message is sent to
     ``self.pdf.debug`` instead.
     """
     for link in self.links:
         (path, href, frag), page, rect = link

         # Resolve the (unquoted) href against the directory of the file
         # that contains the link.
         base_dir = os.path.dirname(path)
         href_parts = unquote(href).split('/')
         target = os.path.normcase(
             os.path.abspath(os.path.join(base_dir, *href_parts)))
         internal = not href or target in self.anchors

         annotation = Dictionary({
             'Type':Name('Annot'), 'Subtype':Name('Link'),
             'Rect':rect, 'Border':Array([0,0,0]),
         })
         if self.mark_links:
             # Replace the border and add a colour entry when links are
             # to be marked.
             annotation.update({
                 'Border':Array([16, 16, 1]),
                 'C':Array([1.0, 0, 0]),
             })

         if internal:
             if href:
                 path = target
             # Try the exact fragment first, then the None entry.
             for anchor_key in (frag, None):
                 try:
                     annotation['Dest'] = self.anchors[path][anchor_key]
                 except KeyError:
                     continue
                 break
         else:
             fragment_suffix = ('#'+frag) if frag else ''
             url = href + fragment_suffix
             try:
                 parsed = urlparse(url)
             except Exception:
                 self.pdf.debug('Ignoring unparseable URL: %r' % url)
                 continue
             if parsed.scheme and parsed.scheme != 'file':
                 uri_action = Dictionary({
                     'Type':Name('Action'), 'S':Name('URI'),
                 })
                 # The URL is stored verbatim: normalizing, quoting or
                 # unquoting it would corrupt a query part if present.
                 uri_action['URI'] = String(url)
                 annotation['A'] = uri_action

         if 'A' not in annotation and 'Dest' not in annotation:
             self.pdf.debug('Could not find destination for link: %s in file %s'%
                            (href, path))
             continue

         if 'Annots' not in page:
             page['Annots'] = Array()
         page['Annots'].append(self.pdf.objects.add(annotation))