Example #1
0
 def request_sync(self, tagname, href, lnum):
     """Handle a sync request for the element under the editor cursor.

     If the element is a link into the book, navigate the preview to the
     link's destination; otherwise ask the preview to sync to the current
     editor position (``lnum``).
     """
     if self.current_name:
         c = current_container()
         if tagname == 'a' and href:
             # href is guaranteed non-empty here, so the original's extra
             # truthiness checks on it were dead code and have been removed.
             if href.startswith('#'):
                 name = self.current_name
             else:
                 name = c.href_to_name(href, self.current_name)
             if name == self.current_name:
                 # Link within the same file: jump straight to the anchor.
                 return self.view.page().go_to_anchor(urlparse(href).fragment, lnum)
             if name and c.exists(name) and c.mime_map[name] in OEB_DOCS:
                 # Link to another text file in the book: open it there.
                 return self.link_clicked.emit(name, urlparse(href).fragment or TOP)
         self.sync_requested.emit(self.current_name, lnum)
Example #2
0
def parse_ncx(container, ncx_name):
    """Build a TOC tree from an NCX file, also extracting the language,
    the dtb:uid and the page list."""
    root = container.parsed(ncx_name)
    toc_root = TOC()
    navmaps = root.xpath('//*[calibre:lower-case(local-name()) = "navmap"]')
    if navmaps:
        process_ncx_node(container, navmaps[0], toc_root, ncx_name)
    toc_root.lang = toc_root.uid = None
    # The first root attribute whose name ends with "lang" (e.g. xml:lang) wins.
    for attr_name, attr_value in iteritems(root.attrib):
        if attr_name.endswith('lang'):
            toc_root.lang = unicode_type(attr_value)
            break
    # The first non-empty dtb:uid meta wins.
    for uid in root.xpath('//*[calibre:lower-case(local-name()) = "meta" and @name="dtb:uid"]/@content'):
        if uid:
            toc_root.uid = unicode_type(uid)
            break
    # Collect the page list, mapping declared page numbers to destinations.
    for plist in root.xpath('//*[calibre:lower-case(local-name()) = "pagelist"]'):
        for target in plist.xpath('descendant::*[calibre:lower-case(local-name()) = "pagetarget"]'):
            page_number = target.get('value')
            if not page_number:
                continue
            sources = target.xpath('descendant::*[calibre:lower-case(local-name()) = "content"]/@src')
            if not sources:
                continue
            toc_root.page_list.append({
                'dest': container.href_to_name(sources[0], base=ncx_name),
                'pagenum': page_number,
                'frag': urlparse(sources[0]).fragment or None,
            })
    return toc_root
Example #3
0
    def __init__(self, url, fname, parent):
        # Modal progress dialog shown while downloading a single file.
        QDialog.__init__(self, parent)
        self.setWindowTitle(_('Download %s')%fname)
        self.l = QVBoxLayout(self)
        # The parsed URL is kept so the host name can be shown in the message.
        self.purl = urlparse(url)
        self.msg = QLabel(_('Downloading <b>%(fname)s</b> from %(url)s')%dict(
            fname=fname, url=self.purl.netloc))
        self.msg.setWordWrap(True)
        self.l.addWidget(self.msg)
        self.pb = QProgressBar(self)
        self.pb.setMinimum(0)
        # min == max == 0 puts the progress bar into indeterminate ("busy") mode.
        self.pb.setMaximum(0)
        self.l.addWidget(self.pb)
        self.bb = QDialogButtonBox(QDialogButtonBox.Cancel, Qt.Horizontal, self)
        self.l.addWidget(self.bb)
        self.bb.rejected.connect(self.reject)
        sz = self.sizeHint()
        self.resize(max(sz.width(), 400), sz.height())

        # Download into a persistent temp file (survives this object); only
        # the name is kept, the handle is closed immediately.
        fpath = PersistentTemporaryFile(os.path.splitext(fname)[1])
        fpath.close()
        self.fpath = fpath.name

        # The worker performs the actual download; results come via the Queue.
        self.worker = Worker(url, self.fpath, Queue())
        # NOTE(review): this shadows QDialog's `rejected` signal attribute
        # with a plain bool flag -- presumably intentional; confirm.
        self.rejected = False
Example #4
0
def get_download_filename_from_response(response):
    '''Extract a filename for a downloaded file from *response*.

    Prefers the Content-Disposition header, falling back to the last
    component of the final URL's path. Returns the empty string only when
    both sources are empty.
    '''
    from polyglot.urllib import unquote, urlparse
    filename = last_part_name = ''
    try:
        purl = urlparse(response.geturl())
        # Fallback: the last path component of the final (post-redirect) URL.
        last_part_name = unquote(purl.path.split('/')[-1])
        disposition = response.info().get('Content-disposition', '')
        if isinstance(disposition, bytes):
            disposition = disposition.decode('utf-8', 'replace')
        for p in disposition.split(';'):
            if 'filename' in p:
                if '*=' in disposition:
                    # RFC 5987 extended syntax: filename*=charset'lang'value
                    parts = disposition.split('*=')[-1]
                    filename = parts.split('\'')[-1]
                else:
                    filename = disposition.split('=')[-1]
                # Strip at most one surrounding quote from each end. Guard
                # against an empty filename: the original indexed blindly,
                # raising IndexError that the broad except below swallowed.
                if filename and filename[0] in ('\'', '"'):
                    filename = filename[1:]
                if filename and filename[-1] in ('\'', '"'):
                    filename = filename[:-1]
                filename = unquote(filename)
                break
    except Exception:
        import traceback
        traceback.print_exc()
    return filename or last_part_name
Example #5
0
 def __init__(self, href_or_path, basedir=os.getcwdu(), is_path=True):
     # Represent a link target either as a filesystem path or as an href.
     # NOTE(review): the basedir default is evaluated once at import time
     # (and os.getcwdu is Python 2 era) -- confirm callers pass basedir
     # explicitly when the working directory may have changed.
     self._href = None
     self._basedir = basedir
     self.path = None
     self.fragment = ''
     try:
         self.mime_type = guess_type(href_or_path)[0]
     except:
         self.mime_type = None
     if self.mime_type is None:
         # Unknown type: fall back to generic binary.
         self.mime_type = 'application/octet-stream'
     if is_path:
         # Treat the argument as a filesystem path, made absolute via basedir.
         path = href_or_path
         if not os.path.isabs(path):
             path = os.path.abspath(os.path.join(basedir, path))
         if isinstance(path, bytes):
             path = path.decode(sys.getfilesystemencoding())
         self.path = path
     else:
         url = urlparse(href_or_path)
         if url[0] not in ('', 'file'):
             # Remote URL: keep the href verbatim.
             self._href = href_or_path
         else:
             # Local URL: turn the (possibly percent-encoded) path component
             # into an absolute filesystem path, preserving the fragment.
             pc = url[2]
             if isinstance(pc, unicode_type):
                 pc = pc.encode('utf-8')
             pc = unquote(pc).decode('utf-8')
             self.path = os.path.abspath(os.path.join(basedir, pc.replace('/', os.sep)))
             self.fragment = unquote(url[-1])
Example #6
0
 def create_toc_from_links(self):
     # Build a TOC by scanning every <a href> element in the spine, adding
     # at most opts.max_toc_links entries (when that limit is > 0).
     num = 0
     for item in self.oeb.spine:
         for a in XPath('//h:a[@href]')(item.data):
             href = a.get('href')
             try:
                 purl = urlparse(href)
             except ValueError:
                 self.log.warning('Ignoring malformed URL:', href)
                 continue
             # Only local (no scheme or file:) links become TOC entries.
             if not purl[0] or purl[0] == 'file':
                 # Make the href absolute and re-attach its fragment.
                 href, frag = purl.path, purl.fragment
                 href = item.abshref(href)
                 if frag:
                     href = '#'.join((href, frag))
                 if not self.oeb.toc.has_href(href):
                     # Entry title: first 100 chars of the link's text.
                     text = xml2text(a)
                     text = text[:100].strip()
                     if (not self.opts.duplicate_links_in_toc and
                             self.oeb.toc.has_text(text)):
                         continue
                     try:
                         self.oeb.toc.add(text, href,
                             play_order=self.oeb.toc.next_play_order())
                         num += 1
                     except ValueError:
                         self.oeb.log.exception('Failed to process link: %r' % href)
                         continue  # Most likely an incorrectly URL encoded link
                     if self.opts.max_toc_links > 0 and \
                             num >= self.opts.max_toc_links:
                         self.log('Maximum TOC links reached, stopping.')
                         return
Example #7
0
    def serialize_hyperlink(self, parent, link):
        # Serialize one hyperlink into a DOCX w:hyperlink element,
        # distinguishing internal (bookmark) links from external
        # (relationship) links.
        item, url, tooltip = link
        purl = urlparse(url)
        href = purl.path

        def make_link(parent, anchor=None, id=None, tooltip=None):
            # Build the w:hyperlink element, targeting either a bookmark
            # anchor (internal) or a relationship id (external).
            kw = {}
            if anchor is not None:
                kw['w_anchor'] = anchor
            elif id is not None:
                kw['r_id'] = id
            if tooltip:
                kw['w_tooltip'] = tooltip
            return self.namespace.makeelement(parent, 'w:hyperlink', **kw)

        if not purl.scheme:
            # Internal link: resolve to a bookmark created during conversion.
            href = item.abshref(href)
            if href in self.document_hrefs:
                key = (href, purl.fragment or self.top_anchor)
                if key in self.anchor_map:
                    bmark = self.anchor_map[key]
                else:
                    # Unknown fragment: fall back to the top of the target file.
                    bmark = self.anchor_map[(href, self.top_anchor)]
                return make_link(parent, anchor=bmark, tooltip=tooltip)
            else:
                self.log.warn('Ignoring internal hyperlink with href (%s) pointing to unknown destination' % url)
        if purl.scheme in {'http', 'https', 'ftp'}:
            # External link: create (or reuse) a document relationship for it.
            if url not in self.external_links:
                self.external_links[url] = self.document_relationships.add_relationship(url, self.namespace.names['LINKS'], target_mode='External')
            return make_link(parent, id=self.external_links[url], tooltip=tooltip)
        return parent
Example #8
0
    def __init__(self, *args):
        # Web view used for browsing a store; wires up cookies, proxying
        # and download handling.
        QWebView.__init__(self, *args)
        self.gui = None
        self.tags = ''
        self.create_browser = None

        self._page = NPWebPage()
        self.setPage(self._page)
        self.cookie_jar = QNetworkCookieJar()
        self.page().networkAccessManager().setCookieJar(self.cookie_jar)

        # Honor the user's HTTP proxy settings, if any are configured.
        http_proxy = get_proxies().get('http', None)
        if http_proxy:
            proxy_parts = urlparse(http_proxy)
            proxy = QNetworkProxy()
            proxy.setType(QNetworkProxy.HttpProxy)
            if proxy_parts.username:
                proxy.setUser(proxy_parts.username)
            if proxy_parts.password:
                proxy.setPassword(proxy_parts.password)
            if proxy_parts.hostname:
                proxy.setHostName(proxy_parts.hostname)
            if proxy_parts.port:
                proxy.setPort(proxy_parts.port)
            self.page().networkAccessManager().setProxy(proxy)

        # Route downloads and unsupported content through start_download.
        # NOTE(review): SSL errors are ignored wholesale via
        # ignore_ssl_errors -- presumably deliberate; confirm.
        self.page().setForwardUnsupportedContent(True)
        self.page().unsupportedContent.connect(self.start_download)
        self.page().downloadRequested.connect(self.start_download)
        self.page().networkAccessManager().sslErrors.connect(self.ignore_ssl_errors)
Example #9
0
        def process_navpoint(np, dest):
            # Recursively convert an NCX <navPoint> subtree into TOC items
            # attached under *dest*.
            try:
                play_order = int(get_attr(np, 1))
            except Exception:
                # Bare except replaced: a missing/invalid playOrder attribute
                # should not swallow SystemExit/KeyboardInterrupt.
                play_order = 1
            href = fragment = text = None
            nd = dest
            nl = nl_path(np)
            if nl:
                nl = nl[0]
                # Concatenate all text nodes of the navLabel.
                text = u''
                for txt in txt_path(nl):
                    text += etree.tostring(txt, method='text',
                            encoding='unicode', with_tail=False)
                content = content_path(np)
                if content and text:
                    content = content[0]
                    # Split the src attribute into unquoted path + fragment.
                    purl = urlparse(content.get('src'))
                    href, fragment = unquote(purl[2]), unquote(purl[5])
                    nd = dest.add_item(href, fragment, text)
                    nd.play_order = play_order

            # Children attach to the node just created, or to dest when this
            # navPoint produced no item of its own.
            for c in np_path(np):
                process_navpoint(c, nd)
Example #10
0
 def link_replacer(base, url):
     # Rewrite a link found in file *base*, routing book-internal resources
     # through resource_template and flagging missing ones.
     if url.startswith('#'):
         # Fragment-only link: keep it pointing at *base* itself.
         frag = urlunquote(url[1:])
         if not frag:
             return url
         changed.add(base)
         return resource_template.format(encode_url(base, frag))
     purl = urlparse(url)
     # External or query-bearing URLs are left untouched.
     if purl.netloc or purl.query:
         return url
     if purl.scheme and purl.scheme != 'file':
         return url
     # Empty or absolute paths cannot be resolved relative to the book.
     if not purl.path or purl.path.startswith('/'):
         return url
     url, frag = purl.path, purl.fragment
     name = self.href_to_name(url, base)
     if name:
         if self.has_name_and_is_not_empty(name):
             frag = urlunquote(frag)
             url = resource_template.format(encode_url(name, frag))
         else:
             # Resource missing from the book: emit a recognizable
             # "missing:" URL instead.
             if isinstance(name, unicode_type):
                 name = name.encode('utf-8')
             url = 'missing:' + force_unicode(quote(name), 'utf-8')
         changed.add(base)
     return url
Example #11
0
    def fetch_url(self, url):
        # Fetch *url* and return its data as a response object, honoring the
        # configured inter-request delay and retrying once after transient
        # network errors.
        data = None
        self.log.debug('Fetching', url)
        st = time.time()

        # Check for a URL pointing to the local filesystem and special case it
        # for efficiency and robustness. Bypasses delay checking as it does not
        # apply to local fetches. Ensures that unicode paths that are not
        # representable in the filesystem_encoding work.
        is_local = 0
        if url.startswith('file://'):
            is_local = 7
        elif url.startswith('file:'):
            is_local = 5
        if is_local > 0:
            # Strip the file: prefix (and a leading slash on windows).
            url = url[is_local:]
            if iswindows and url.startswith('/'):
                url = url[1:]
            with open(url, 'rb') as f:
                data = response(f.read())
                data.newurl = 'file:'+url  # This is what mechanize does for
                # local URLs
            self.log.debug('Fetched %s in %.1f seconds' % (url, time.time() - st))
            return data

        # Rate limit remote fetches via self.delay.
        delta = time.time() - self.last_fetch_at
        if delta < self.delay:
            time.sleep(self.delay - delta)
        # mechanize does not handle quoting automatically
        if re.search(r'\s+', url) is not None:
            if isinstance(url, unicode_type):
                url = url.encode('utf-8')
            purl = list(urlparse(url))
            # Quote the path, params, query and fragment components.
            for i in range(2, 6):
                purl[i] = quote(purl[i])
            url = urlunparse(purl).decode('utf-8')
        open_func = getattr(self.browser, 'open_novisit', self.browser.open)
        try:
            with closing(open_func(url, timeout=self.timeout)) as f:
                # NOTE(review): the double read is present in the original;
                # presumably the second read drains any data a partial first
                # read left behind -- confirm before simplifying.
                data = response(f.read()+f.read())
                data.newurl = f.geturl()
        except URLError as err:
            if hasattr(err, 'code') and err.code in responses:
                raise FetchError(responses[err.code])
            if getattr(err, 'reason', [0])[0] == 104 or \
                getattr(getattr(err, 'args', [None])[0], 'errno', None) in (-2,
                        -3):  # Connection reset by peer or Name or service not known
                # Transient failure: wait a second and retry exactly once.
                self.log.debug('Temporary error, retrying in 1 second')
                time.sleep(1)
                with closing(open_func(url, timeout=self.timeout)) as f:
                    data = response(f.read()+f.read())
                    data.newurl = f.geturl()
            else:
                raise err
        finally:
            self.last_fetch_at = time.time()
        self.log.debug('Fetched %s in %f seconds' % (url, time.time() - st))
        return data
Example #12
0
def add_from_li(container, li, parent, nav_name):
    """Convert a single nav <li> into a TOC entry under *parent*.

    Only the first <a>/<span> child is examined; its text (or title
    attributes) provides the label and its href, if any, the destination.
    """
    dest = frag = text = None
    first = next(li.iterchildren(XHTML('a'), XHTML('span')), None)
    if first is not None:
        label = etree.tostring(first, method='text', encoding='unicode', with_tail=False).strip()
        if not label:
            label = ' '.join(first.xpath('descendant-or-self::*/@title')).strip()
        text = label
        href = first.get('href')
        if href:
            if href.startswith('#'):
                # Fragment-only link: stays in the nav document itself.
                dest = nav_name
            else:
                dest = container.href_to_name(href, base=nav_name)
            frag = urlparse(href).fragment or None
    return parent.add(text or None, dest or None, frag or None)
Example #13
0
def canonicalize_url(url):
    """Percent-quote URLs containing whitespace, since mechanize will not
    quote them for us."""
    # mechanize does not handle quoting automatically
    if re.search(r'\s+', url) is None:
        return url
    if isinstance(url, unicode_type):
        url = url.encode('utf-8')
    parts = list(urlparse(url))
    # Quote the path, params, query and fragment components.
    for idx in (2, 3, 4, 5):
        parts[idx] = as_bytes(quote(parts[idx]))
    return urlunparse(parts).decode('utf-8')
Example #14
0
def add_from_li(container, li, parent, nav_name):
    # Convert a single nav <li> into a TOC entry under *parent*.
    dest = frag = text = None
    for x in li.iterchildren(XHTML('a'), XHTML('span')):
        # Label: the element's text, falling back to any title attributes.
        text = etree.tostring(x, method='text', encoding='unicode', with_tail=False).strip() or ' '.join(x.xpath('descendant-or-self::*/@title')).strip()
        href = x.get('href')
        if href:
            # Fragment-only links stay in the nav document itself.
            dest = nav_name if href.startswith('#') else container.href_to_name(href, base=nav_name)
            frag = urlparse(href).fragment or None
        # Only the first <a>/<span> child is considered.
        break
    return parent.add(text or None, dest or None, frag or None)
Example #15
0
def check_external_links(container, progress_callback=(lambda num, total:None), check_anchors=True):
    # Check all http(s) links in the book's text and stylesheets in parallel,
    # optionally verifying that HTML fragment anchors exist on the target
    # page. Returns a list of (locations, error, url) tuples for bad links.
    progress_callback(0, 0)
    external_links = defaultdict(list)
    for name, mt in iteritems(container.mime_map):
        if mt in OEB_DOCS or mt in OEB_STYLES:
            for href, lnum, col in container.iterlinks(name):
                purl = urlparse(href)
                if purl.scheme in ('http', 'https'):
                    external_links[href].append((name, href, lnum, col))
    if not external_links:
        return []
    items = Queue()
    ans = []
    tuple(map(items.put, iteritems(external_links)))
    progress_callback(0, len(external_links))
    done = []
    downloaded_html_ids = {}

    def check_links():
        # Worker: pull links off the shared queue until it is empty.
        br = browser(honor_time=False, verify_ssl_certificates=False)
        while True:
            try:
                full_href, locations = items.get_nowait()
            except Empty:
                return
            href, frag = full_href.partition('#')[::2]
            try:
                res = br.open(href, timeout=10)
            except Exception as e:
                ans.append((locations, e, full_href))
            else:
                if frag and check_anchors:
                    ct = res.info().get('Content-Type')
                    if ct and ct.split(';')[0].lower() in {'text/html', XHTML_MIME}:
                        # Cache the set of ids per page, so each page is
                        # downloaded and parsed at most once.
                        ids = downloaded_html_ids.get(href)
                        if ids is None:
                            try:
                                ids = downloaded_html_ids[href] = get_html_ids(res.read())
                            except Exception:
                                ids = downloaded_html_ids[href] = frozenset()
                        if frag not in ids:
                            ans.append((locations, ValueError('HTML anchor {} not found on the page'.format(frag)), full_href))
                res.close()
            finally:
                done.append(None)
                progress_callback(len(done), len(external_links))

    # Use at most 10 worker threads.
    workers = [Thread(name="CheckLinks", target=check_links) for i in range(min(10, len(external_links)))]
    for w in workers:
        w.daemon = True
        w.start()

    for w in workers:
        w.join()
    return ans
Example #16
0
def check_external_links(container, progress_callback=(lambda num, total:None), check_anchors=True):
    # Check all http(s) links in the book's text and stylesheets in parallel,
    # optionally verifying that HTML fragment anchors exist on the target
    # page. Returns a list of (locations, error, url) tuples for bad links.
    progress_callback(0, 0)
    external_links = defaultdict(list)
    for name, mt in iteritems(container.mime_map):
        if mt in OEB_DOCS or mt in OEB_STYLES:
            for href, lnum, col in container.iterlinks(name):
                purl = urlparse(href)
                if purl.scheme in ('http', 'https'):
                    external_links[href].append((name, href, lnum, col))
    if not external_links:
        return []
    items = Queue()
    ans = []
    tuple(map(items.put, iteritems(external_links)))
    progress_callback(0, len(external_links))
    done = []
    downloaded_html_ids = {}

    def check_links():
        # Worker: pull links off the shared queue until it is empty.
        br = browser(honor_time=False, verify_ssl_certificates=False)
        while True:
            try:
                full_href, locations = items.get_nowait()
            except Empty:
                return
            href, frag = full_href.partition('#')[::2]
            try:
                res = br.open(href, timeout=10)
            except Exception as e:
                ans.append((locations, e, full_href))
            else:
                if frag and check_anchors:
                    ct = res.info().get('Content-Type')
                    if ct and ct.split(';')[0].lower() in {'text/html', XHTML_MIME}:
                        # Cache the set of ids per page, so each page is
                        # downloaded and parsed at most once.
                        ids = downloaded_html_ids.get(href)
                        if ids is None:
                            try:
                                ids = downloaded_html_ids[href] = get_html_ids(res.read())
                            except Exception:
                                ids = downloaded_html_ids[href] = frozenset()
                        if frag not in ids:
                            ans.append((locations, ValueError('HTML anchor {} not found on the page'.format(frag)), full_href))
                res.close()
            finally:
                done.append(None)
                progress_callback(len(done), len(external_links))

    # Use at most 10 worker threads.
    workers = [Thread(name="CheckLinks", target=check_links) for i in range(min(10, len(external_links)))]
    for w in workers:
        w.daemon = True
        w.start()

    for w in workers:
        w.join()
    return ans
Example #17
0
def download_one(tdir, timeout, progress_report, data_uri_map, url):
    """Download a single external resource into *tdir*.

    Handles file://, data: and remote URLs. Returns
    ``(True, (url, filename, path, mime_type))`` on success and
    ``(False, (url, error_message))`` on failure.
    """
    try:
        purl = urlparse(url)
        data_url_key = None
        with NamedTemporaryFile(dir=tdir, delete=False) as df:
            if purl.scheme == 'file':
                src = lopen(purl.path, 'rb')
                # Fix: basename of the path, not of the file object (the
                # original passed the file object, raising TypeError).
                filename = os.path.basename(purl.path)
                # Size via seek-to-end, packed into one statement.
                sz = (src.seek(0, os.SEEK_END), src.tell(), src.seek(0))[1]
            elif purl.scheme == 'data':
                prefix, payload = purl.path.split(',', 1)
                parts = prefix.split(';')
                if parts and parts[-1].lower() == 'base64':
                    payload = re.sub(r'\s+', '', payload)
                    payload = from_base64_bytes(payload)
                else:
                    payload = payload.encode('utf-8')
                # Derive the filename *before* the dedup check below, so the
                # early return does not reference an unbound variable (the
                # original raised NameError on this path).
                ext = 'unknown'
                for x in parts:
                    if '=' not in x and '/' in x:
                        exts = mimetypes.guess_all_extensions(x)
                        if exts:
                            ext = exts[0]
                            break
                filename = 'data-uri.' + ext
                seen_before = data_uri_map.get(payload)
                if seen_before is not None:
                    # Identical data URI already downloaded: reuse it.
                    return True, (url, filename, seen_before,
                                  guess_type(seen_before))
                data_url_key = payload
                src = BytesIO(payload)
                sz = len(payload)
            else:
                src = urlopen(url, timeout=timeout)
                filename = get_filename(purl, src)
                sz = get_content_length(src)
            progress_report(url, 0, sz)
            dest = ProgressTracker(df, url, sz, progress_report)
            with closing(src):
                shutil.copyfileobj(src, dest)
            if data_url_key is not None:
                data_uri_map[data_url_key] = dest.name
            filename = sanitize_file_name(filename)
            mt = guess_type(filename)
            if mt in OEB_DOCS:
                raise ValueError(
                    'The external resource {} looks like a HTML document ({})'.
                    format(url, filename))
            if not mt or mt == 'application/octet-stream' or '.' not in filename:
                raise ValueError(
                    'The external resource {} is not of a known type'.format(
                        url))
            return True, (url, filename, dest.name, mt)
    except Exception as err:
        return False, (url, as_unicode(err))
Example #18
0
 def __call__(self, url):
     """Rewrite links whose fragment was moved into the bottom half of a
     split file, so they point at the new (bottom) file instead."""
     if url and url.startswith('#'):
         # Same-file fragment links need no rewriting.
         return url
     if self.container.href_to_name(url, self.base) != self.top_name:
         return url
     frag = urlparse(url).fragment
     if frag and frag in self.bottom_anchors:
         self.replaced = True
         return self.container.name_to_href(self.bottom_name, self.base) + '#' + frag
     return url
Example #19
0
def localize_website_link(url):
    """Return *url* rewritten to point at the localized website, when a
    translation for the current UI language exists."""
    lc = lang_as_iso639_1(get_lang())
    if lc == 'en' or lc not in website_languages():
        return url
    from polyglot.urllib import urlparse, urlunparse
    parts = list(urlparse(url))
    # Prefix the path with the language code, e.g. /fr/...
    parts[2] = '/{}{}'.format(lc, parts[2])
    return urlunparse(parts)
Example #20
0
def localize_website_link(url):
    # Rewrite *url* to point at the localized version of the website, when
    # a translation for the current UI language exists.
    lc = lang_as_iso639_1(get_lang())
    langs = website_languages()
    if lc == 'en' or lc not in langs:
        return url
    from polyglot.urllib import urlparse, urlunparse
    parts = urlparse(url)
    # Prefix the path with the language code, e.g. /fr/...
    path = '/{}{}'.format(lc, parts.path)
    parts = list(parts)
    parts[2] = path
    return urlunparse(parts)
Example #21
0
 def __call__(self, url):
     # Rewrite links whose fragment was moved into the bottom half of a
     # split file, so they point at the new (bottom) file instead.
     if url and url.startswith('#'):
         # Same-file fragment links need no rewriting.
         return url
     name = self.container.href_to_name(url, self.base)
     if name != self.top_name:
         return url
     purl = urlparse(url)
     if purl.fragment and purl.fragment in self.bottom_anchors:
         url = self.container.name_to_href(self.bottom_name,
                                           self.base) + '#' + purl.fragment
         self.replaced = True
     return url
Example #22
0
def localize_user_manual_link(url):
    """Point a user manual URL at the translated manual for the configured
    manual language, when there is one."""
    lc = lang_code_for_user_manual()
    if not lc:
        return url
    from polyglot.urllib import urlparse, urlunparse
    parts = list(urlparse(url))
    # Swap the generated-docs language segment, then prefix the path with
    # the language code.
    localized = re.sub(r'/generated/[a-z]+/', '/generated/%s/' % lc, parts[2] or '')
    parts[2] = '/%s%s' % (lc, localized)
    return urlunparse(parts)
Example #23
0
 def add_links(self):
     # Convert the collected link rectangles into PDF link annotations,
     # resolving internal links to named destinations and external ones to
     # URI actions.
     for link in self.links:
         path, href, frag = link[0]
         page, rect = link[1:]
         # Normalized absolute filesystem path of the link target.
         combined_path = os.path.normcase(
             os.path.abspath(
                 os.path.join(os.path.dirname(path),
                              *unquote(href).split('/'))))
         is_local = not href or combined_path in self.anchors
         annot = Dictionary({
             'Type': Name('Annot'),
             'Subtype': Name('Link'),
             'Rect': rect,
             'Border': Array([0, 0, 0]),
         })
         if self.mark_links:
             # Debug mode: draw a visible red border around each link.
             annot.update({
                 'Border': Array([16, 16, 1]),
                 'C': Array([1.0, 0, 0])
             })
         if is_local:
             path = combined_path if href else path
             try:
                 annot['Dest'] = self.anchors[path][frag]
             except KeyError:
                 try:
                     # Fragment not found: fall back to the file's top anchor.
                     annot['Dest'] = self.anchors[path][None]
                 except KeyError:
                     pass
         else:
             url = href + (('#' + frag) if frag else '')
             try:
                 purl = urlparse(url)
             except Exception:
                 self.pdf.debug('Ignoring unparseable URL: %r' % url)
                 continue
             if purl.scheme and purl.scheme != 'file':
                 action = Dictionary({
                     'Type': Name('Action'),
                     'S': Name('URI'),
                 })
                 # Do not try to normalize/quote/unquote this URL as if it
                 # has a query part, it will get corrupted
                 action['URI'] = String(url)
                 annot['A'] = action
         # Only emit the annotation if it resolved to an action or dest.
         if 'A' in annot or 'Dest' in annot:
             if 'Annots' not in page:
                 page['Annots'] = Array()
             page['Annots'].append(self.pdf.objects.add(annot))
         else:
             self.pdf.debug(
                 'Could not find destination for link: %s in file %s' %
                 (href, path))
Example #24
0
    def isRelURL(self, url):
        """
        Identify relative urls.

        >>> t = Textile()
        >>> t.isRelURL("http://www.google.com/")
        False
        >>> t.isRelURL("/foo")
        True

        """
        # A URL is relative iff it has neither a scheme nor a netloc.
        parsed = urlparse(url)
        return not (parsed.scheme or parsed.netloc)
Example #25
0
    def isRelURL(self, url):
        """
        Identify relative urls.

        >>> t = Textile()
        >>> t.isRelURL("http://www.google.com/")
        False
        >>> t.isRelURL("/foo")
        True

        """
        # A URL is relative iff it has neither a scheme nor a netloc.
        (scheme, netloc) = urlparse(url)[0:2]
        return not scheme and not netloc
Example #26
0
 def __call__(self, url):
     # Rewrite a link so it points into the merged master file, remapping
     # its fragment through the per-file anchor map.
     if url and url.startswith('#'):
         # Same-file fragment links need no rewriting.
         return url
     name = self.container.href_to_name(url, self.base)
     amap = self.anchor_map.get(name, None)
     if amap is None:
         # Target file was not merged into the master file.
         return url
     purl = urlparse(url)
     frag = purl.fragment or ''
     frag = amap.get(frag, frag)
     url = self.container.name_to_href(self.master, self.base) + '#' + frag
     self.replaced = True
     return url
Example #27
0
 def __call__(self, url):
     """Rewrite a link so it points into the merged master file, remapping
     its fragment through the per-file anchor map."""
     if url and url.startswith('#'):
         # Same-file fragment links need no rewriting.
         return url
     name = self.container.href_to_name(url, self.base)
     amap = self.anchor_map.get(name)
     if amap is None:
         # Target file was not merged into the master file.
         return url
     fragment = urlparse(url).fragment or ''
     fragment = amap.get(fragment, fragment)
     self.replaced = True
     return self.container.name_to_href(self.master, self.base) + '#' + fragment
Example #28
0
 def replace_link(url):
     """Resolve internal calibre-pdf-anchor URLs to concrete locations,
     returning the location tuple or None when the anchor is unknown."""
     purl = urlparse(url)
     if purl.scheme != 'https' or purl.netloc not in ('calibre-pdf-anchor.a', 'calibre-pdf-anchor.n'):
         return
     if purl.netloc == 'calibre-pdf-anchor.a':
         # Direct anchor link.
         loc = anchor_locations.get(purl.fragment)
     else:
         # Link to a file: go via the file's implicit top anchor.
         loc = anchor_locations.get(name_anchor_map.get(purl.fragment))
     if loc is None:
         log.warn('Anchor location for link to {} not found'.format(purl.fragment))
         return None
     return loc.as_tuple
Example #29
0
def download_one(tdir, timeout, progress_report, data_uri_map, url):
    # Download a single external resource (file://, data: or remote URL)
    # into tdir, returning (True, (url, filename, path, mime_type)) on
    # success or (False, (url, error_message)) on failure.
    try:
        purl = urlparse(url)
        data_url_key = None
        with NamedTemporaryFile(dir=tdir, delete=False) as df:
            if purl.scheme == 'file':
                src = lopen(purl.path, 'rb')
                # NOTE(review): os.path.basename() of a *file object* raises
                # TypeError (swallowed by the broad except below); this
                # should probably be os.path.basename(purl.path).
                filename = os.path.basename(src)
                # Size via seek-to-end, packed into one statement.
                sz = (src.seek(0, os.SEEK_END), src.tell(), src.seek(0))[1]
            elif purl.scheme == 'data':
                prefix, payload = purl.path.split(',', 1)
                parts = prefix.split(';')
                if parts and parts[-1].lower() == 'base64':
                    payload = re.sub(r'\s+', '', payload)
                    payload = from_base64_bytes(payload)
                else:
                    payload = payload.encode('utf-8')
                seen_before = data_uri_map.get(payload)
                # NOTE(review): filename is not yet bound on this early
                # return path (NameError, swallowed by the except below);
                # it is only assigned further down.
                if seen_before is not None:
                    return True, (url, filename, seen_before, guess_type(seen_before))
                data_url_key = payload
                src = BytesIO(payload)
                sz = len(payload)
                # Guess an extension from the data URI's media type.
                ext = 'unknown'
                for x in parts:
                    if '=' not in x and '/' in x:
                        exts = mimetypes.guess_all_extensions(x)
                        if exts:
                            ext = exts[0]
                            break
                filename = 'data-uri.' + ext
            else:
                src = urlopen(url, timeout=timeout)
                filename = get_filename(purl, src)
                sz = get_content_length(src)
            progress_report(url, 0, sz)
            dest = ProgressTracker(df, url, sz, progress_report)
            with closing(src):
                shutil.copyfileobj(src, dest)
            if data_url_key is not None:
                # Remember where this data URI landed so duplicates reuse it.
                data_uri_map[data_url_key] = dest.name
            filename = sanitize_file_name(filename)
            mt = guess_type(filename)
            if mt in OEB_DOCS:
                raise ValueError('The external resource {} looks like a HTML document ({})'.format(url, filename))
            if not mt or mt == 'application/octet-stream' or '.' not in filename:
                raise ValueError('The external resource {} is not of a known type'.format(url))
            return True, (url, filename, dest.name, mt)
    except Exception as err:
        return False, (url, as_unicode(err))
Example #30
0
def localize_user_manual_link(url):
    """Point a user manual URL at the translated manual, but only when the
    translation is reasonably complete (at least 30% translated)."""
    lc = lang_as_iso639_1(get_lang())
    if lc == 'en':
        return url
    if user_manual_stats().get(lc, 0) < 0.3:
        return url
    from polyglot.urllib import urlparse, urlunparse
    parts = list(urlparse(url))
    # Swap the generated-docs language segment, then prefix the path with
    # the language code.
    localized = re.sub(r'/generated/[a-z]+/', '/generated/%s/' % lc, parts[2] or '')
    parts[2] = '/%s%s' % (lc, localized)
    return urlunparse(parts)
Example #31
0
def localize_user_manual_link(url):
    '''Return a localized variant of the user-manual *url* when the UI
    language has a sufficiently complete (>= 30%) manual translation;
    otherwise return *url* unchanged.'''
    code = lang_as_iso639_1(get_lang())
    if code == 'en':
        return url
    coverage = user_manual_stats().get(code, 0)
    if coverage < 0.3:
        return url
    from polyglot.urllib import urlparse, urlunparse
    p = urlparse(url)
    # Replace any existing /generated/<lang>/ segment, then prefix the language
    localized = '/%s%s' % (code, re.sub(r'/generated/[a-z]+/', '/generated/%s/' % code, p.path or ''))
    components = list(p)
    components[2] = localized
    return urlunparse(components)
 def process_toc_node(self, toc, level=0):
     # Walk the TOC tree depth-first, recording a TOCItem (with its bookmark)
     # for every entry whose href points into one of the rendered documents.
     target = toc.href
     if target:
         parsed = urlparse(target)
         target = parsed.path
         if target in self.document_hrefs:
             key = (target, parsed.fragment or self.top_anchor)
             if key in self.anchor_map:
                 bmark = self.anchor_map[key]
             else:
                 # Unknown fragment: fall back to the top-of-file anchor
                 bmark = self.anchor_map[(target, self.top_anchor)]
             self.toc.append(TOCItem(toc.title, bmark, level))
     for node in toc:
         self.process_toc_node(node, level + 1)
Example #33
0
 def __init__(self, url, base):
     '''
     Represent a link to *url*, resolved relative to *base*.

     :param url:  The url this link points to. Must be an unquoted unicode string.
     :param base: The base directory that relative URLs are with respect to.
                  Must be a unicode string.
     '''
     assert isinstance(url, unicode_type) and isinstance(base, unicode_type)
     self.url = url
     self.parsed_url = urlparse(url)
     # No scheme (or an explicit file://) means the link targets the local
     # filesystem; an empty path on top of that makes it a fragment-only
     # link internal to the current document.
     self.is_local = self.parsed_url.scheme in ('', 'file')
     self.is_internal = self.is_local and not bool(self.parsed_url.path)
     self.fragment = urlunquote(self.parsed_url.fragment)
     if self.is_local and not self.is_internal:
         self.path = self.url_to_local_path(self.parsed_url, base)
     else:
         self.path = None
Example #34
0
 def __init__(self, url, base):
     '''
     :param url:  The url this link points to. Must be an unquoted unicode string.
     :param base: The base folder that relative URLs are with respect to.
                  Must be a unicode string.
     '''
     assert isinstance(url, str) and isinstance(base, str)
     parsed = urlparse(url)
     self.url = url
     self.parsed_url = parsed
     # Links without a scheme (or with file://) live on the local
     # filesystem; those with no path at all are internal fragment links.
     self.is_local = parsed.scheme in ('', 'file')
     self.is_internal = self.is_local and not bool(parsed.path)
     self.fragment = urlunquote(parsed.fragment)
     self.path = None
     if self.is_local and not self.is_internal:
         self.path = self.url_to_local_path(parsed, base)
Example #35
0
 def __call__(self, url):
     # Rewrite a single link so it stays valid after the file rename;
     # sets self.replaced when the returned href differs from the input.
     if url and url.startswith('#'):
         # Same-document fragment links need no rewriting
         return url
     fragment = urlparse(url).fragment
     target = self.container.href_to_name(url, self.old_name)
     if not target:
         return url
     if target == self.old_name:
         target = self.new_name
     new_href = self.container.name_to_href(target, self.new_name)
     if fragment:
         new_href = new_href + '#' + fragment
     if new_href != url:
         self.replaced = True
     return new_href
Example #36
0
 def __call__(self, url):
     '''Map *url* (a link found in self.old_name) to the equivalent href
     relative to self.new_name, recording whether anything changed.'''
     if url and url.startswith('#'):
         return url
     parts = urlparse(url)
     name = self.container.href_to_name(url, self.old_name)
     if not name:
         return url
     dest = self.new_name if name == self.old_name else name
     href = self.container.name_to_href(dest, self.new_name)
     if parts.fragment:
         href = '%s#%s' % (href, parts.fragment)
     if href != url:
         self.replaced = True
     return href
Example #37
0
    def read_from_opf(self, opfreader):
        '''
        Populate this TOC from the metadata in an OPF file.

        Looks for a TOC in this order: the <spine toc="..."> attribute, a
        <guide> reference of type "toc", or any manifest item whose href
        contains "toc". HTML TOCs are handed to read_html_toc(); NCX TOCs
        to read_ncx_toc(). Parse failures are reported, never raised.
        '''
        toc = opfreader.soup.find('spine', toc=True)
        if toc is not None:
            toc = toc['toc']
        if toc is None:
            # Was a bare "except:"; narrowed so SystemExit/KeyboardInterrupt
            # are not swallowed while probing for the guide reference.
            try:
                toc = opfreader.soup.find('guide').find('reference',
                                                        attrs={'type':
                                                               'toc'})['href']
            except Exception:
                # No guide/reference entry: fall back to scanning the
                # manifest for anything that looks like a TOC.
                for item in opfreader.manifest:
                    if 'toc' in item.href().lower():
                        toc = item.href()
                        break

        if toc is not None:
            if toc.lower() not in ('ncx', 'ncxtoc'):
                # An HTML TOC referenced by path: resolve it relative to
                # the base path and normalize separators for this OS.
                toc = urlparse(unquote(toc))[2]
                toc = toc.replace('/', os.sep)
                if not os.path.isabs(toc):
                    toc = os.path.join(self.base_path, toc)
                try:
                    if not os.path.exists(toc):
                        bn = os.path.basename(toc)
                        bn = bn.replace('_top.htm',
                                        '_toc.htm')  # Bug in BAEN OPF files
                        toc = os.path.join(os.path.dirname(toc), bn)

                    self.read_html_toc(toc)
                except Exception:
                    print(
                        'WARNING: Could not read Table of Contents. Continuing anyway.'
                    )
            else:
                # An NCX TOC: prefer the manifest entry, else glob for
                # any *.ncx file next to the OPF.
                path = opfreader.manifest.item(toc.lower())
                path = getattr(path, 'path', path)
                if path and os.access(path, os.R_OK):
                    try:
                        self.read_ncx_toc(path)
                    except Exception as err:
                        print('WARNING: Invalid NCX file:', err)
                    return
                cwd = os.path.abspath(self.base_path)
                m = glob.glob(os.path.join(cwd, '*.ncx'))
                if m:
                    toc = m[0]
                    self.read_ncx_toc(toc)
Example #38
0
def add_from_navpoint(container, navpoint, parent, ncx_name):
    '''Create a TOC node under *parent* for a single NCX <navPoint>,
    extracting its label text and the destination name/fragment from the
    <content src="..."> child. Returns the newly added node.'''
    dest = frag = text = None
    labels = child_xpath(navpoint, 'navlabel')
    if labels:
        # Concatenate the text of every <text> child of the first navLabel
        pieces = [etree.tostring(t, method='text', encoding='unicode', with_tail=False)
                  for t in child_xpath(labels[0], 'text')]
        text = ''.join(pieces)
    content = child_xpath(navpoint, 'content')
    if content:
        src = content[0].get('src', None)
        if src:
            dest = container.href_to_name(src, base=ncx_name)
            frag = urlparse(src).fragment or None
    return parent.add(text or None, dest or None, frag or None)
Example #39
0
    def url_replacer(self, orig_url):
        '''Rewrite *orig_url* (a link inside the current item) so that it
        follows any renames recorded in self.rename_map; non-local URLs
        are returned untouched.'''
        url = urlnormalize(orig_url)
        if urlparse(url).scheme:
            # Only rewrite local URLs
            return orig_url
        path, frag = urldefrag(url)
        # Resolve relative to the item's *original* location if it was renamed
        orig_item = self.current_item
        if self.renamed_items_map:
            orig_item = self.renamed_items_map.get(self.current_item.href, orig_item)

        absolute = orig_item.abshref(path)
        replacement = self.current_item.relhref(self.rename_map.get(absolute, absolute))
        if frag:
            replacement = replacement + '#' + frag
        return replacement
Example #40
0
def add_from_navpoint(container, navpoint, parent, ncx_name):
    '''Add one entry under *parent* for the NCX <navPoint> element
    *navpoint*, returning the created child node. Missing pieces (label,
    destination, fragment) are passed through as None.'''
    dest = frag = text = None
    nl = child_xpath(navpoint, 'navlabel')
    if nl:
        # Build the human readable label from all <text> children
        text = ''
        for txt in child_xpath(nl[0], 'text'):
            text += etree.tostring(txt, method='text', encoding='unicode', with_tail=False)
    content = child_xpath(navpoint, 'content')
    if content:
        href = content[0].get('src', None)
        if href:
            # Destination is stored as (container name, optional fragment)
            dest = container.href_to_name(href, base=ncx_name)
            frag = urlparse(href).fragment or None
    return parent.add(text or None, dest or None, frag or None)
Example #41
0
    def __call__(self, container):
        '''Point every link in self.name that referenced self.href at the
        corrected file name instead. Returns True if anything changed.'''
        frag = urlparse(self.href).fragment
        corrected_href = container.name_to_href(self.corrected_name, self.name)
        if frag:
            corrected_href += '#' + frag
        original = self.href

        class LinkReplacer(object):
            # replace_links() callers inspect this flag afterwards
            replaced = False

            def __call__(self, url):
                if url == original:
                    self.replaced = True
                    return corrected_href
                return url
        replacer = LinkReplacer()
        container.replace_links(self.name, replacer)
        return replacer.replaced
Example #42
0
def parse_html_toc(data):
    '''Yield (href, fragment, text) triples for every <a href> element in
    the HTML TOC *data*. fragment is None when absent, otherwise stripped
    of surrounding whitespace.'''
    from html5_parser import parse
    from calibre.utils.cleantext import clean_xml_chars
    from lxml import etree
    if isinstance(data, bytes):
        data = xml_to_unicode(data, strip_encoding_pats=True, resolve_entities=True)[0]
    root = parse(clean_xml_chars(data), maybe_xhtml=True, keep_doctype=False, sanitize_names=True)
    for anchor in root.xpath('//*[@href and local-name()="a"]'):
        parts = urlparse(unquote(anchor.get('href')))
        href, fragment = parts[2], parts[5]
        fragment = fragment.strip() if fragment else None
        text = etree.tostring(anchor, method='text', encoding='unicode')
        yield href.strip(), fragment, text
Example #43
0
    def __call__(self, container):
        '''Repair links in self.name: every occurrence of self.href is
        replaced by an href to self.corrected_name (fragment preserved).
        Returns True iff at least one link was rewritten.'''
        fragment = urlparse(self.href).fragment
        fixed = container.name_to_href(self.corrected_name, self.name)
        if fragment:
            fixed = fixed + '#' + fragment
        target = self.href

        class LinkReplacer:
            replaced = False

            def __call__(self, url):
                if url != target:
                    return url
                self.replaced = True
                return fixed

        replacer = LinkReplacer()
        container.replace_links(self.name, replacer)
        return replacer.replaced
Example #44
0
 def add_links(self):
     '''Create a PDF Link annotation for every hyperlink collected during
     rendering, resolving internal links to named destinations and
     external ones to URI actions.'''
     for link in self.links:
         path, href, frag = link[0]
         page, rect = link[1:]
         # Resolve the href relative to the source file to decide whether
         # it targets one of the files rendered into this PDF
         combined_path = os.path.normcase(os.path.abspath(os.path.join(os.path.dirname(path), *unquote(href).split('/'))))
         is_local = not href or combined_path in self.anchors
         annot = Dictionary({
             'Type':Name('Annot'), 'Subtype':Name('Link'),
             'Rect':rect, 'Border':Array([0,0,0]),
         })
         if self.mark_links:
             # Debug aid: draw a visible red border around every link
             annot.update({'Border':Array([16, 16, 1]), 'C':Array([1.0, 0,
                                                                   0])})
         if is_local:
             path = combined_path if href else path
             try:
                 annot['Dest'] = self.anchors[path][frag]
             except KeyError:
                 try:
                     # Unknown fragment: degrade to the top of the file
                     annot['Dest'] = self.anchors[path][None]
                 except KeyError:
                     pass
         else:
             url = href + (('#'+frag) if frag else '')
             try:
                 purl = urlparse(url)
             except Exception:
                 self.pdf.debug('Ignoring unparseable URL: %r' % url)
                 continue
             if purl.scheme and purl.scheme != 'file':
                 action = Dictionary({
                     'Type':Name('Action'), 'S':Name('URI'),
                 })
                 # Do not try to normalize/quote/unquote this URL as if it
                 # has a query part, it will get corrupted
                 action['URI'] = String(url)
                 annot['A'] = action
         if 'A' in annot or 'Dest' in annot:
             if 'Annots' not in page:
                 page['Annots'] = Array()
             page['Annots'].append(self.pdf.objects.add(annot))
         else:
             self.pdf.debug('Could not find destination for link: %s in file %s'%
                            (href, path))
Example #45
0
 def replace_link(url):
     '''Map an internal calibre-pdf-anchor URL to the concrete location
     (as a tuple) of its target in the rendered PDF, or None when the URL
     is external or its anchor cannot be found.'''
     purl = urlparse(url)
     if purl.scheme != 'https' or purl.netloc not in ('calibre-pdf-anchor.a', 'calibre-pdf-anchor.n'):
         return
     if purl.netloc == 'calibre-pdf-anchor.a':
         # Link to an anchor (element id) somewhere in the book
         loc = anchor_locations.get(purl.fragment)
     else:
         # Link to the top of a spine file, resolved via the name anchor map
         loc = anchor_locations.get(name_anchor_map.get(purl.fragment))
     if loc is None:
         log.warn(f'Anchor location for link to {purl.fragment} not found')
         return None
     if loc.pagenum > pc:
         log.warn(f'Anchor location for link to {purl.fragment} is past the end of the document, moving it to last page')
         loc.pagenum = pc
     return loc.as_tuple
Example #46
0
    def url_replacer(self, orig_url):
        '''Return the replacement for *orig_url* after applying the rename
        map; URLs with a scheme (i.e. non-local) pass through unchanged.'''
        normalized = urlnormalize(orig_url)
        if urlparse(normalized).scheme:
            # Only rewrite local URLs
            return orig_url
        path, frag = urldefrag(normalized)
        if self.renamed_items_map:
            # Resolve relative to where the item originally lived
            source_item = self.renamed_items_map.get(self.current_item.href,
                                                     self.current_item)
        else:
            source_item = self.current_item

        # Absolute href of the target, then its (possibly renamed) relative form
        target = source_item.abshref(path)
        new_url = self.current_item.relhref(self.rename_map.get(target, target))
        if frag:
            new_url = new_url + '#' + frag
        return new_url
Example #47
0
    def read_from_opf(self, opfreader):
        '''
        Load this TOC from an OPF file, trying in order: the spine's toc
        attribute, a guide reference of type "toc", and finally any
        manifest item whose href contains "toc". Parse failures are
        reported as warnings rather than raised.
        '''
        toc = opfreader.soup.find('spine', toc=True)
        if toc is not None:
            toc = toc['toc']
        if toc is None:
            # Was a bare "except:"; narrowed so SystemExit/KeyboardInterrupt
            # are not swallowed while probing for the guide reference.
            try:
                toc = opfreader.soup.find('guide').find('reference', attrs={'type':'toc'})['href']
            except Exception:
                # Missing guide or reference element: scan the manifest
                for item in opfreader.manifest:
                    if 'toc' in item.href().lower():
                        toc = item.href()
                        break

        if toc is not None:
            if toc.lower() not in ('ncx', 'ncxtoc'):
                # HTML TOC referenced by path: resolve relative to base_path
                toc = urlparse(unquote(toc))[2]
                toc = toc.replace('/', os.sep)
                if not os.path.isabs(toc):
                    toc = os.path.join(self.base_path, toc)
                try:
                    if not os.path.exists(toc):
                        bn  = os.path.basename(toc)
                        bn  = bn.replace('_top.htm', '_toc.htm')  # Bug in BAEN OPF files
                        toc = os.path.join(os.path.dirname(toc), bn)

                    self.read_html_toc(toc)
                except Exception:
                    print('WARNING: Could not read Table of Contents. Continuing anyway.')
            else:
                path = opfreader.manifest.item(toc.lower())
                path = getattr(path, 'path', path)
                if path and os.access(path, os.R_OK):
                    try:
                        self.read_ncx_toc(path)
                    except Exception as err:
                        print('WARNING: Invalid NCX file:', err)
                    return
                # No usable manifest entry: look for any NCX file on disk
                cwd = os.path.abspath(self.base_path)
                m = glob.glob(os.path.join(cwd, '*.ncx'))
                if m:
                    toc = m[0]
                    self.read_ncx_toc(toc)
Example #48
0
    def serialize_hyperlink(self, parent, link):
        '''Serialize one hyperlink as a DOCX w:hyperlink element under
        *parent*. Internal links become bookmark anchors; http/https/ftp
        links become external relationships; anything else is dropped and
        *parent* is returned unchanged.'''
        item, url, tooltip = link
        purl = urlparse(url)
        href = purl.path

        def make_link(parent, anchor=None, id=None, tooltip=None):
            # Build the w:hyperlink element, targeting either an internal
            # bookmark (anchor) or an external relationship (id)
            kw = {}
            if anchor is not None:
                kw['w_anchor'] = anchor
            elif id is not None:
                kw['r_id'] = id
            if tooltip:
                kw['w_tooltip'] = tooltip
            return self.namespace.makeelement(parent, 'w:hyperlink', **kw)

        if not purl.scheme:
            # No scheme: an internal link into one of the converted documents
            href = item.abshref(href)
            if href not in self.document_hrefs:
                href = urlquote(href)
            if href in self.document_hrefs:
                key = (href, purl.fragment or self.top_anchor)
                if key in self.anchor_map:
                    bmark = self.anchor_map[key]
                else:
                    # Unknown fragment: fall back to the file's top anchor
                    bmark = self.anchor_map[(href, self.top_anchor)]
                return make_link(parent, anchor=bmark, tooltip=tooltip)
            else:
                self.log.warn(
                    'Ignoring internal hyperlink with href (%s) pointing to unknown destination'
                    % url)
        if purl.scheme in {'http', 'https', 'ftp'}:
            # External links are stored as document relationships, one per
            # unique URL, and referenced by relationship id
            if url not in self.external_links:
                self.external_links[
                    url] = self.document_relationships.add_relationship(
                        url,
                        self.namespace.names['LINKS'],
                        target_mode='External')
            return make_link(parent,
                             id=self.external_links[url],
                             tooltip=tooltip)
        return parent
Example #49
0
def check_link_destination(container, dest_map, name, href, a, errors):
    '''Validate that *href* (found in file *name* on element *a*) points at
    an existing document and, when it has a fragment, at an existing id or
    name in that document. Problems are appended to *errors*; *dest_map*
    caches the set of valid anchors per target file.'''
    if href.startswith('#'):
        target = name
    else:
        try:
            target = container.href_to_name(href, name)
        except ValueError:
            target = None  # Absolute links to files on another drive in windows cause this
    if not target or target not in container.mime_map:
        return
    if container.mime_map[target] not in OEB_DOCS:
        errors.append(BadDestinationType(name, target, a))
        return
    root = container.parsed(target)
    if not hasattr(root, 'xpath'):
        errors.append(BadDestinationType(name, target, a))
        return
    if target not in dest_map:
        dest_map[target] = set(root.xpath('//*/@id|//*/@name'))
    fragment = urlparse(href).fragment
    if fragment and fragment not in dest_map[target]:
        errors.append(BadDestinationFragment(name, target, a, fragment))
Example #50
0
def get_proxy_info(proxy_scheme, proxy_string):
    '''
    Parse all proxy information from a proxy string (as returned by
    get_proxies). Members that cannot be determined from the string are set
    to None in the returned dict; if parsing fails entirely, None is
    returned instead of a dict.
    '''
    from polyglot.urllib import urlparse
    try:
        parsed = urlparse('%s://%s' % (proxy_scheme, proxy_string))
        # Accessing .port can raise for malformed strings, which is why the
        # dict is built inside the try block
        return {
            'scheme': parsed.scheme,
            'hostname': parsed.hostname,
            'port': parsed.port,
            'username': parsed.username,
            'password': parsed.password,
        }
    except Exception:
        return None
Example #51
0
def get_proxy_info(proxy_scheme, proxy_string):
    '''
    Parse all proxy information from a proxy string (as returned by
    get_proxies). The returned dict will have members set to None when the info
    is not available in the string. If an exception occurs parsing the string
    this method returns None.
    '''
    from polyglot.urllib import urlparse
    try:
        info = urlparse(proxy_scheme + '://' + proxy_string)
        # .port may raise ValueError on malformed input, hence inside the try
        ans = {field: getattr(info, field)
               for field in ('scheme', 'hostname', 'port', 'username', 'password')}
    except Exception:
        return None
    return ans
Example #52
0
def check_link_destination(container, dest_map, name, href, a, errors):
    '''Check one <a href> link for a valid destination, recording
    BadDestinationType / BadDestinationFragment errors in *errors*.
    *dest_map* memoizes the anchors (ids and names) of each target file.'''
    if href.startswith('#'):
        tname = name
    else:
        try:
            tname = container.href_to_name(href, name)
        except ValueError:
            tname = None  # Absolute links to files on another drive in windows cause this
    if not (tname and tname in container.mime_map):
        return
    if container.mime_map[tname] not in OEB_DOCS:
        errors.append(BadDestinationType(name, tname, a))
        return
    root = container.parsed(tname)
    if not hasattr(root, 'xpath'):
        errors.append(BadDestinationType(name, tname, a))
        return
    anchors = dest_map.get(tname)
    if anchors is None:
        anchors = dest_map[tname] = set(root.xpath('//*/@id|//*/@name'))
    frag = urlparse(href).fragment
    if frag and frag not in anchors:
        errors.append(BadDestinationFragment(name, tname, a, frag))
Example #53
0
    def read_html_toc(self, toc):
        '''Parse the HTML file at path *toc* and add an entry for every
        <a href> anchor in it, skipping (href, fragment) pairs already
        present in this TOC. Also sets self.base_path to the TOC's folder.'''
        self.base_path = os.path.dirname(toc)
        soup = BeautifulSoup(open(toc, 'rb').read(), convertEntities=BeautifulSoup.HTML_ENTITIES)
        for a in soup.findAll('a'):
            if not a.has_key('href'):  # noqa
                continue
            purl = urlparse(unquote(a['href']))
            href, fragment = purl[2], purl[5]
            if not fragment:
                fragment = None
            else:
                fragment = fragment.strip()
            href = href.strip()

            # The link text becomes the TOC entry title
            txt = ''.join([unicode_type(s).strip() for s in a.findAll(text=True)])
            add = True
            # Skip links whose (href, fragment) pair is already present
            for i in self.flat():
                if i.href == href and i.fragment == fragment:
                    add = False
                    break
            if add:
                self.add_item(href, fragment, txt)
Example #54
0
 def __call__(self, url):
     '''Rewrite *url* according to self.link_map (renamed files) and
     self.frag_map (renamed fragments), setting self.replaced whenever
     the result differs from the input.'''
     if url and url.startswith('#'):
         # Same-file link: only the fragment may need renaming
         new_frag = self.frag_map(self.base, url[1:])
         if not new_frag or new_frag == url[1:]:
             return url
         self.replaced = True
         return '#' + new_frag
     name = self.container.href_to_name(url, self.base)
     if not name:
         return url
     renamed = self.link_map.get(name, None)
     if not renamed:
         return url
     fragment = urlparse(url).fragment
     href = self.container.name_to_href(renamed, self.base)
     if fragment:
         mapped = self.frag_map(name, fragment)
         if mapped:
             href += '#%s' % mapped
     if href != url:
         self.replaced = True
     return href
Example #55
0
 def __call__(self, url):
     '''Rewrite fragment identifiers in *url* using self.id_map, which maps
     file name -> {old id: new id}. Marks self.replaced on any change.'''
     if url and url.startswith('#'):
         replacement = self.id_map.get(self.base, {}).get(url[1:])
         if replacement is None or replacement == url[1:]:
             return url
         self.replaced = True
         return '#' + replacement
     name = self.container.href_to_name(url, self.base)
     if not name:
         return url
     mapping = self.id_map.get(name)
     if mapping is None:
         return url
     parts = urlparse(url)
     new_frag = mapping.get(parts.fragment)
     if new_frag is None:
         return url
     href = urlunparse(parts._replace(fragment=new_frag))
     if href != url:
         self.replaced = True
     return href
Example #56
0
    def __init__(self, format):
        '''
        Create a query object by passing it the url format obtained
        from the opensearch Description.
        '''
        self.format = format
        # unpack the url to a tuple
        self.url_parts = urlparse(format)
        # unpack the query string to a dictionary
        self.query_string = parse_qs(self.url_parts[4])

        # Map opensearch macro names to the service specific query keys,
        # e.g. q={searchTerms} maps searchTerms -> q
        self.macro_map = {}
        for key, values in self.query_string.items():
            # TODO eventually optional/required params should be
            # distinguished somehow (the ones with/without trailing ?
            macro_name = values[0].translate({ord(ch): None for ch in '{}?'})
            if macro_name in Query.standard_macros:
                self.macro_map[macro_name] = key
Example #57
0
 def __init__(self, opts):
     '''Connect calibredb to a library: either a remote calibre Content
     server (when the library path looks like an http(s) URL) or a local
     library folder. Exits via SystemExit when neither is possible.'''
     self.library_path = opts.library_path or prefs['library_path']
     self.timeout = opts.timeout
     self.url = None
     if self.library_path is None:
         raise SystemExit(
             'No saved library path, either run the GUI or use the'
             ' --with-library option')
     if self.library_path.partition(':')[0] in ('http', 'https'):
         # Remote library: the URL fragment selects a library id on the server
         parts = urlparse(self.library_path)
         self.library_id = parts.fragment or None
         self.url = urlunparse(parts._replace(fragment='')).rstrip('/')
         self.br = browser(handle_refresh=False,
                           user_agent='{} {}'.format(
                               __appname__, __version__))
         self.is_remote = True
         username, password = read_credentials(opts)
         self.has_credentials = False
         if username and password:
             self.br.add_password(self.url, username, password)
             self.has_credentials = True
         if self.library_id == '-':
             # Special library id '-' means: just list available libraries
             self.list_libraries()
             raise SystemExit()
     else:
         # Local library: require exclusive access to avoid concurrent writes
         self.library_path = os.path.expanduser(self.library_path)
         if not singleinstance('db'):
             ext = '.exe' if iswindows else ''
             raise SystemExit(
                 _('Another calibre program such as {} or the main calibre program is running.'
                   ' Having multiple programs that can make changes to a calibre library'
                   ' running at the same time is a bad idea. calibredb can connect directly'
                   ' to a running calibre Content server, to make changes through it, instead.'
                   ' See the documentation of the {} option for details.').
                 format('calibre-server' + ext, '--with-library'))
         self._db = None
         self.is_remote = False
Example #58
0
def get_download_filename_from_response(response):
    '''Work out a filename for a downloaded resource, preferring the
    Content-disposition header and falling back to the last path component
    of the response URL. Returns an empty string only when both fail.'''
    from polyglot.urllib import unquote, urlparse
    filename = last_part_name = ''
    try:
        parsed = urlparse(response.geturl())
        last_part_name = unquote(parsed.path.split('/')[-1])
        disposition = response.info().get('Content-disposition', '')
        for part in disposition.split(';'):
            if 'filename' not in part:
                continue
            if '*=' in disposition:
                # RFC 5987 style: filename*=charset'lang'value
                filename = disposition.split('*=')[-1].split('\'')[-1]
            else:
                filename = disposition.split('=')[-1]
            # Strip one layer of surrounding quotes, if present
            if filename[0] in ('\'', '"'):
                filename = filename[1:]
            if filename[-1] in ('\'', '"'):
                filename = filename[:-1]
            filename = unquote(filename)
            break
    except Exception:
        import traceback
        traceback.print_exc()
    return filename or last_part_name