def request_sync(self, tagname, href, lnum):
    if self.current_name:
        c = current_container()
        if tagname == 'a' and href:
            if href and href.startswith('#'):
                name = self.current_name
            else:
                name = c.href_to_name(href, self.current_name) if href else None
            if name == self.current_name:
                return self.view.page().go_to_anchor(urlparse(href).fragment, lnum)
            if name and c.exists(name) and c.mime_map[name] in OEB_DOCS:
                return self.link_clicked.emit(name, urlparse(href).fragment or TOP)
        self.sync_requested.emit(self.current_name, lnum)
def parse_ncx(container, ncx_name):
    root = container.parsed(ncx_name)
    toc_root = TOC()
    navmaps = root.xpath('//*[calibre:lower-case(local-name()) = "navmap"]')
    if navmaps:
        process_ncx_node(container, navmaps[0], toc_root, ncx_name)
    toc_root.lang = toc_root.uid = None
    for attr, val in iteritems(root.attrib):
        if attr.endswith('lang'):
            toc_root.lang = unicode_type(val)
            break
    for uid in root.xpath('//*[calibre:lower-case(local-name()) = "meta" and @name="dtb:uid"]/@content'):
        if uid:
            toc_root.uid = unicode_type(uid)
            break
    for pl in root.xpath('//*[calibre:lower-case(local-name()) = "pagelist"]'):
        for pt in pl.xpath('descendant::*[calibre:lower-case(local-name()) = "pagetarget"]'):
            pagenum = pt.get('value')
            if pagenum:
                href = pt.xpath('descendant::*[calibre:lower-case(local-name()) = "content"]/@src')
                if href:
                    dest = container.href_to_name(href[0], base=ncx_name)
                    frag = urlparse(href[0]).fragment or None
                    toc_root.page_list.append({'dest': dest, 'pagenum': pagenum, 'frag': frag})
    return toc_root
def __init__(self, url, fname, parent):
    QDialog.__init__(self, parent)
    self.setWindowTitle(_('Download %s')%fname)
    self.l = QVBoxLayout(self)
    self.purl = urlparse(url)
    self.msg = QLabel(_('Downloading <b>%(fname)s</b> from %(url)s')%dict(
        fname=fname, url=self.purl.netloc))
    self.msg.setWordWrap(True)
    self.l.addWidget(self.msg)
    self.pb = QProgressBar(self)
    self.pb.setMinimum(0)
    self.pb.setMaximum(0)
    self.l.addWidget(self.pb)
    self.bb = QDialogButtonBox(QDialogButtonBox.Cancel, Qt.Horizontal, self)
    self.l.addWidget(self.bb)
    self.bb.rejected.connect(self.reject)
    sz = self.sizeHint()
    self.resize(max(sz.width(), 400), sz.height())
    fpath = PersistentTemporaryFile(os.path.splitext(fname)[1])
    fpath.close()
    self.fpath = fpath.name
    self.worker = Worker(url, self.fpath, Queue())
    self.rejected = False
def get_download_filename_from_response(response):
    from polyglot.urllib import unquote, urlparse
    filename = last_part_name = ''
    try:
        purl = urlparse(response.geturl())
        last_part_name = unquote(purl.path.split('/')[-1])
        disposition = response.info().get('Content-disposition', '')
        if isinstance(disposition, bytes):
            disposition = disposition.decode('utf-8', 'replace')
        for p in disposition.split(';'):
            if 'filename' in p:
                if '*=' in disposition:
                    parts = disposition.split('*=')[-1]
                    filename = parts.split('\'')[-1]
                else:
                    filename = disposition.split('=')[-1]
                    if filename[0] in ('\'', '"'):
                        filename = filename[1:]
                    if filename[-1] in ('\'', '"'):
                        filename = filename[:-1]
                    filename = unquote(filename)
                break
    except Exception:
        import traceback
        traceback.print_exc()
    return filename or last_part_name
def __init__(self, href_or_path, basedir=os.getcwdu(), is_path=True):
    self._href = None
    self._basedir = basedir
    self.path = None
    self.fragment = ''
    try:
        self.mime_type = guess_type(href_or_path)[0]
    except:
        self.mime_type = None
    if self.mime_type is None:
        self.mime_type = 'application/octet-stream'
    if is_path:
        path = href_or_path
        if not os.path.isabs(path):
            path = os.path.abspath(os.path.join(basedir, path))
        if isinstance(path, bytes):
            path = path.decode(sys.getfilesystemencoding())
        self.path = path
    else:
        url = urlparse(href_or_path)
        if url[0] not in ('', 'file'):
            self._href = href_or_path
        else:
            pc = url[2]
            if isinstance(pc, unicode_type):
                pc = pc.encode('utf-8')
            pc = unquote(pc).decode('utf-8')
            self.path = os.path.abspath(os.path.join(basedir, pc.replace('/', os.sep)))
            self.fragment = unquote(url[-1])
def create_toc_from_links(self):
    num = 0
    for item in self.oeb.spine:
        for a in XPath('//h:a[@href]')(item.data):
            href = a.get('href')
            try:
                purl = urlparse(href)
            except ValueError:
                self.log.warning('Ignoring malformed URL:', href)
                continue
            if not purl[0] or purl[0] == 'file':
                href, frag = purl.path, purl.fragment
                href = item.abshref(href)
                if frag:
                    href = '#'.join((href, frag))
                if not self.oeb.toc.has_href(href):
                    text = xml2text(a)
                    text = text[:100].strip()
                    if (not self.opts.duplicate_links_in_toc and
                            self.oeb.toc.has_text(text)):
                        continue
                    try:
                        self.oeb.toc.add(text, href,
                            play_order=self.oeb.toc.next_play_order())
                        num += 1
                    except ValueError:
                        self.oeb.log.exception('Failed to process link: %r' % href)
                        continue  # Most likely an incorrectly URL encoded link
                    if self.opts.max_toc_links > 0 and \
                            num >= self.opts.max_toc_links:
                        self.log('Maximum TOC links reached, stopping.')
                        return
def serialize_hyperlink(self, parent, link):
    item, url, tooltip = link
    purl = urlparse(url)
    href = purl.path

    def make_link(parent, anchor=None, id=None, tooltip=None):
        kw = {}
        if anchor is not None:
            kw['w_anchor'] = anchor
        elif id is not None:
            kw['r_id'] = id
        if tooltip:
            kw['w_tooltip'] = tooltip
        return self.namespace.makeelement(parent, 'w:hyperlink', **kw)

    if not purl.scheme:
        href = item.abshref(href)
        if href in self.document_hrefs:
            key = (href, purl.fragment or self.top_anchor)
            if key in self.anchor_map:
                bmark = self.anchor_map[key]
            else:
                bmark = self.anchor_map[(href, self.top_anchor)]
            return make_link(parent, anchor=bmark, tooltip=tooltip)
        else:
            self.log.warn('Ignoring internal hyperlink with href (%s) pointing to unknown destination' % url)
    if purl.scheme in {'http', 'https', 'ftp'}:
        if url not in self.external_links:
            self.external_links[url] = self.document_relationships.add_relationship(
                url, self.namespace.names['LINKS'], target_mode='External')
        return make_link(parent, id=self.external_links[url], tooltip=tooltip)
    return parent
def __init__(self, *args):
    QWebView.__init__(self, *args)
    self.gui = None
    self.tags = ''
    self.create_browser = None
    self._page = NPWebPage()
    self.setPage(self._page)
    self.cookie_jar = QNetworkCookieJar()
    self.page().networkAccessManager().setCookieJar(self.cookie_jar)
    http_proxy = get_proxies().get('http', None)
    if http_proxy:
        proxy_parts = urlparse(http_proxy)
        proxy = QNetworkProxy()
        proxy.setType(QNetworkProxy.HttpProxy)
        if proxy_parts.username:
            proxy.setUser(proxy_parts.username)
        if proxy_parts.password:
            proxy.setPassword(proxy_parts.password)
        if proxy_parts.hostname:
            proxy.setHostName(proxy_parts.hostname)
        if proxy_parts.port:
            proxy.setPort(proxy_parts.port)
        self.page().networkAccessManager().setProxy(proxy)
    self.page().setForwardUnsupportedContent(True)
    self.page().unsupportedContent.connect(self.start_download)
    self.page().downloadRequested.connect(self.start_download)
    self.page().networkAccessManager().sslErrors.connect(self.ignore_ssl_errors)
def process_navpoint(np, dest):
    try:
        play_order = int(get_attr(np, 1))
    except:
        play_order = 1
    href = fragment = text = None
    nd = dest
    nl = nl_path(np)
    if nl:
        nl = nl[0]
        text = u''
        for txt in txt_path(nl):
            text += etree.tostring(txt, method='text',
                    encoding='unicode', with_tail=False)
        content = content_path(np)
        if content and text:
            content = content[0]
            # if get_attr(content, attr='src'):
            purl = urlparse(content.get('src'))
            href, fragment = unquote(purl[2]), unquote(purl[5])
            nd = dest.add_item(href, fragment, text)
            nd.play_order = play_order
    for c in np_path(np):
        process_navpoint(c, nd)
def link_replacer(base, url):
    if url.startswith('#'):
        frag = urlunquote(url[1:])
        if not frag:
            return url
        changed.add(base)
        return resource_template.format(encode_url(base, frag))
    purl = urlparse(url)
    if purl.netloc or purl.query:
        return url
    if purl.scheme and purl.scheme != 'file':
        return url
    if not purl.path or purl.path.startswith('/'):
        return url
    url, frag = purl.path, purl.fragment
    name = self.href_to_name(url, base)
    if name:
        if self.has_name_and_is_not_empty(name):
            frag = urlunquote(frag)
            url = resource_template.format(encode_url(name, frag))
        else:
            if isinstance(name, unicode_type):
                name = name.encode('utf-8')
            url = 'missing:' + force_unicode(quote(name), 'utf-8')
        changed.add(base)
    return url
def fetch_url(self, url):
    data = None
    self.log.debug('Fetching', url)
    st = time.time()

    # Check for a URL pointing to the local filesystem and special case it
    # for efficiency and robustness. Bypasses delay checking as it does not
    # apply to local fetches. Ensures that unicode paths that are not
    # representable in the filesystem_encoding work.
    is_local = 0
    if url.startswith('file://'):
        is_local = 7
    elif url.startswith('file:'):
        is_local = 5
    if is_local > 0:
        url = url[is_local:]
        if iswindows and url.startswith('/'):
            url = url[1:]
        with open(url, 'rb') as f:
            data = response(f.read())
            data.newurl = 'file:'+url  # This is what mechanize does for
            # local URLs
        self.log.debug('Fetched %s in %.1f seconds' % (url, time.time() - st))
        return data

    delta = time.time() - self.last_fetch_at
    if delta < self.delay:
        time.sleep(self.delay - delta)
    # mechanize does not handle quoting automatically
    if re.search(r'\s+', url) is not None:
        if isinstance(url, unicode_type):
            url = url.encode('utf-8')
        purl = list(urlparse(url))
        for i in range(2, 6):
            purl[i] = quote(purl[i])
        url = urlunparse(purl).decode('utf-8')
    open_func = getattr(self.browser, 'open_novisit', self.browser.open)
    try:
        with closing(open_func(url, timeout=self.timeout)) as f:
            data = response(f.read()+f.read())
            data.newurl = f.geturl()
    except URLError as err:
        if hasattr(err, 'code') and err.code in responses:
            raise FetchError(responses[err.code])
        if getattr(err, 'reason', [0])[0] == 104 or \
                getattr(getattr(err, 'args', [None])[0], 'errno', None) in (-2, -3):
            # Connection reset by peer or Name or service not known
            self.log.debug('Temporary error, retrying in 1 second')
            time.sleep(1)
            with closing(open_func(url, timeout=self.timeout)) as f:
                data = response(f.read()+f.read())
                data.newurl = f.geturl()
        else:
            raise err
    finally:
        self.last_fetch_at = time.time()
    self.log.debug('Fetched %s in %f seconds' % (url, time.time() - st))
    return data
def add_from_li(container, li, parent, nav_name):
    dest = frag = text = None
    for x in li.iterchildren(XHTML('a'), XHTML('span')):
        text = etree.tostring(
            x, method='text', encoding='unicode', with_tail=False).strip() or ' '.join(
                x.xpath('descendant-or-self::*/@title')).strip()
        href = x.get('href')
        if href:
            dest = nav_name if href.startswith('#') else container.href_to_name(href, base=nav_name)
            frag = urlparse(href).fragment or None
        break
    return parent.add(text or None, dest or None, frag or None)
def canonicalize_url(url):
    # mechanize does not handle quoting automatically
    if re.search(r'\s+', url) is not None:
        if isinstance(url, unicode_type):
            url = url.encode('utf-8')
        purl = list(urlparse(url))
        for i in range(2, 6):
            purl[i] = as_bytes(quote(purl[i]))
        url = urlunparse(purl).decode('utf-8')
    return url
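
# Hedged, stdlib-only sketch of the quoting step used by canonicalize_url above
# (illustration only, not calibre code): when a URL contains whitespace, the
# path, params, query and fragment components (indexes 2-5 of the parsed tuple)
# are percent-encoded, while scheme and netloc are left untouched.
from urllib.parse import quote, urlparse, urlunparse

_url = 'http://example.com/some file.html'
_parts = list(urlparse(_url))
for _i in range(2, 6):
    _parts[_i] = quote(_parts[_i])
assert urlunparse(_parts) == 'http://example.com/some%20file.html'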
def check_external_links(container, progress_callback=(lambda num, total: None), check_anchors=True):
    progress_callback(0, 0)
    external_links = defaultdict(list)
    for name, mt in iteritems(container.mime_map):
        if mt in OEB_DOCS or mt in OEB_STYLES:
            for href, lnum, col in container.iterlinks(name):
                purl = urlparse(href)
                if purl.scheme in ('http', 'https'):
                    external_links[href].append((name, href, lnum, col))
    if not external_links:
        return []
    items = Queue()
    ans = []
    tuple(map(items.put, iteritems(external_links)))
    progress_callback(0, len(external_links))
    done = []
    downloaded_html_ids = {}

    def check_links():
        br = browser(honor_time=False, verify_ssl_certificates=False)
        while True:
            try:
                full_href, locations = items.get_nowait()
            except Empty:
                return
            href, frag = full_href.partition('#')[::2]
            try:
                res = br.open(href, timeout=10)
            except Exception as e:
                ans.append((locations, e, full_href))
            else:
                if frag and check_anchors:
                    ct = res.info().get('Content-Type')
                    if ct and ct.split(';')[0].lower() in {'text/html', XHTML_MIME}:
                        ids = downloaded_html_ids.get(href)
                        if ids is None:
                            try:
                                ids = downloaded_html_ids[href] = get_html_ids(res.read())
                            except Exception:
                                ids = downloaded_html_ids[href] = frozenset()
                        if frag not in ids:
                            ans.append((locations, ValueError('HTML anchor {} not found on the page'.format(frag)), full_href))
                res.close()
            finally:
                done.append(None)
                progress_callback(len(done), len(external_links))

    workers = [Thread(name="CheckLinks", target=check_links) for i in range(min(10, len(external_links)))]
    for w in workers:
        w.daemon = True
        w.start()
    for w in workers:
        w.join()
    return ans
def download_one(tdir, timeout, progress_report, data_uri_map, url):
    try:
        purl = urlparse(url)
        data_url_key = None
        with NamedTemporaryFile(dir=tdir, delete=False) as df:
            if purl.scheme == 'file':
                src = lopen(purl.path, 'rb')
                # basename of the local file path (the original passed the open
                # file object, which os.path.basename() cannot accept)
                filename = os.path.basename(purl.path)
                sz = (src.seek(0, os.SEEK_END), src.tell(), src.seek(0))[1]
            elif purl.scheme == 'data':
                prefix, payload = purl.path.split(',', 1)
                parts = prefix.split(';')
                if parts and parts[-1].lower() == 'base64':
                    payload = re.sub(r'\s+', '', payload)
                    payload = from_base64_bytes(payload)
                else:
                    payload = payload.encode('utf-8')
                seen_before = data_uri_map.get(payload)
                if seen_before is not None:
                    return True, (url, filename, seen_before, guess_type(seen_before))
                data_url_key = payload
                src = BytesIO(payload)
                sz = len(payload)
                ext = 'unknown'
                for x in parts:
                    if '=' not in x and '/' in x:
                        exts = mimetypes.guess_all_extensions(x)
                        if exts:
                            ext = exts[0]
                            break
                filename = 'data-uri.' + ext
            else:
                src = urlopen(url, timeout=timeout)
                filename = get_filename(purl, src)
                sz = get_content_length(src)
            progress_report(url, 0, sz)
            dest = ProgressTracker(df, url, sz, progress_report)
            with closing(src):
                shutil.copyfileobj(src, dest)
            if data_url_key is not None:
                data_uri_map[data_url_key] = dest.name
            filename = sanitize_file_name(filename)
            mt = guess_type(filename)
            if mt in OEB_DOCS:
                raise ValueError('The external resource {} looks like a HTML document ({})'.format(url, filename))
            if not mt or mt == 'application/octet-stream' or '.' not in filename:
                raise ValueError('The external resource {} is not of a known type'.format(url))
            return True, (url, filename, dest.name, mt)
    except Exception as err:
        return False, (url, as_unicode(err))
def __call__(self, url):
    if url and url.startswith('#'):
        return url
    name = self.container.href_to_name(url, self.base)
    if name != self.top_name:
        return url
    purl = urlparse(url)
    if purl.fragment and purl.fragment in self.bottom_anchors:
        url = self.container.name_to_href(self.bottom_name, self.base) + '#' + purl.fragment
        self.replaced = True
    return url
def localize_website_link(url):
    lc = lang_as_iso639_1(get_lang())
    langs = website_languages()
    if lc == 'en' or lc not in langs:
        return url
    from polyglot.urllib import urlparse, urlunparse
    parts = urlparse(url)
    path = '/{}{}'.format(lc, parts.path)
    parts = list(parts)
    parts[2] = path
    return urlunparse(parts)
def localize_user_manual_link(url):
    lc = lang_code_for_user_manual()
    if not lc:
        return url
    from polyglot.urllib import urlparse, urlunparse
    parts = urlparse(url)
    path = re.sub(r'/generated/[a-z]+/', '/generated/%s/' % lc, parts.path or '')
    path = '/%s%s' % (lc, path)
    parts = list(parts)
    parts[2] = path
    return urlunparse(parts)
def add_links(self):
    for link in self.links:
        path, href, frag = link[0]
        page, rect = link[1:]
        combined_path = os.path.normcase(os.path.abspath(
            os.path.join(os.path.dirname(path), *unquote(href).split('/'))))
        is_local = not href or combined_path in self.anchors
        annot = Dictionary({
            'Type': Name('Annot'),
            'Subtype': Name('Link'),
            'Rect': rect,
            'Border': Array([0, 0, 0]),
        })
        if self.mark_links:
            annot.update({'Border': Array([16, 16, 1]), 'C': Array([1.0, 0, 0])})
        if is_local:
            path = combined_path if href else path
            try:
                annot['Dest'] = self.anchors[path][frag]
            except KeyError:
                try:
                    annot['Dest'] = self.anchors[path][None]
                except KeyError:
                    pass
        else:
            url = href + (('#' + frag) if frag else '')
            try:
                purl = urlparse(url)
            except Exception:
                self.pdf.debug('Ignoring unparseable URL: %r' % url)
                continue
            if purl.scheme and purl.scheme != 'file':
                action = Dictionary({
                    'Type': Name('Action'),
                    'S': Name('URI'),
                })
                # Do not try to normalize/quote/unquote this URL as if it
                # has a query part, it will get corrupted
                action['URI'] = String(url)
                annot['A'] = action
        if 'A' in annot or 'Dest' in annot:
            if 'Annots' not in page:
                page['Annots'] = Array()
            page['Annots'].append(self.pdf.objects.add(annot))
        else:
            self.pdf.debug('Could not find destination for link: %s in file %s' % (href, path))
def isRelURL(self, url):
    """
    Identify relative urls.

    >>> t = Textile()
    >>> t.isRelURL("http://www.google.com/")
    False
    >>> t.isRelURL("/foo")
    True

    """
    (scheme, netloc) = urlparse(url)[0:2]
    return not scheme and not netloc
def __call__(self, url):
    if url and url.startswith('#'):
        return url
    name = self.container.href_to_name(url, self.base)
    amap = self.anchor_map.get(name, None)
    if amap is None:
        return url
    purl = urlparse(url)
    frag = purl.fragment or ''
    frag = amap.get(frag, frag)
    url = self.container.name_to_href(self.master, self.base) + '#' + frag
    self.replaced = True
    return url
def replace_link(url):
    purl = urlparse(url)
    if purl.scheme != 'https' or purl.netloc not in ('calibre-pdf-anchor.a', 'calibre-pdf-anchor.n'):
        return
    loc = None
    if purl.netloc == 'calibre-pdf-anchor.a':
        loc = anchor_locations.get(purl.fragment)
        if loc is None:
            log.warn('Anchor location for link to {} not found'.format(purl.fragment))
    else:
        loc = anchor_locations.get(name_anchor_map.get(purl.fragment))
        if loc is None:
            log.warn('Anchor location for link to {} not found'.format(purl.fragment))
    return None if loc is None else loc.as_tuple
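
# Stdlib-only sketch (not calibre code) of how the synthetic anchor URLs that
# replace_link above inspects decompose: the marker host ends up in netloc and
# the anchor id in fragment ('chapter-3' here is a made-up anchor id).
from urllib.parse import urlparse

_p = urlparse('https://calibre-pdf-anchor.a/#chapter-3')
assert (_p.scheme, _p.netloc, _p.fragment) == ('https', 'calibre-pdf-anchor.a', 'chapter-3')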
def localize_user_manual_link(url):
    lc = lang_as_iso639_1(get_lang())
    if lc == 'en':
        return url
    stats = user_manual_stats()
    if stats.get(lc, 0) < 0.3:
        return url
    from polyglot.urllib import urlparse, urlunparse
    parts = urlparse(url)
    path = re.sub(r'/generated/[a-z]+/', '/generated/%s/' % lc, parts.path or '')
    path = '/%s%s' % (lc, path)
    parts = list(parts)
    parts[2] = path
    return urlunparse(parts)
def process_toc_node(self, toc, level=0):
    href = toc.href
    if href:
        purl = urlparse(href)
        href = purl.path
        if href in self.document_hrefs:
            key = (href, purl.fragment or self.top_anchor)
            if key in self.anchor_map:
                bmark = self.anchor_map[key]
            else:
                bmark = self.anchor_map[(href, self.top_anchor)]
            self.toc.append(TOCItem(toc.title, bmark, level))
    for child in toc:
        self.process_toc_node(child, level+1)
def __init__(self, url, base):
    '''
    :param url: The url this link points to. Must be an unquoted unicode string.
    :param base: The base directory that relative URLs are with respect to.
                 Must be a unicode string.
    '''
    assert isinstance(url, unicode_type) and isinstance(base, unicode_type)
    self.url = url
    self.parsed_url = urlparse(self.url)
    self.is_local = self.parsed_url.scheme in ('', 'file')
    self.is_internal = self.is_local and not bool(self.parsed_url.path)
    self.path = None
    self.fragment = urlunquote(self.parsed_url.fragment)
    if self.is_local and not self.is_internal:
        self.path = self.url_to_local_path(self.parsed_url, base)
def __init__(self, url, base):
    '''
    :param url: The url this link points to. Must be an unquoted unicode string.
    :param base: The base folder that relative URLs are with respect to.
                 Must be a unicode string.
    '''
    assert isinstance(url, str) and isinstance(base, str)
    self.url = url
    self.parsed_url = urlparse(self.url)
    self.is_local = self.parsed_url.scheme in ('', 'file')
    self.is_internal = self.is_local and not bool(self.parsed_url.path)
    self.path = None
    self.fragment = urlunquote(self.parsed_url.fragment)
    if self.is_local and not self.is_internal:
        self.path = self.url_to_local_path(self.parsed_url, base)
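
# Stdlib-only sketch (not calibre code) of the scheme/path tests the Link
# constructors above rely on: a bare fragment parses with an empty path (an
# "internal" link), while scheme-less and file: URLs both count as local.
from urllib.parse import urlparse

assert urlparse('#note1').scheme == '' and urlparse('#note1').path == ''
assert urlparse('chap2.html').scheme == ''            # local, but not internal
assert urlparse('file:///tmp/x.html').scheme == 'file'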
def __call__(self, url):
    if url and url.startswith('#'):
        return url
    purl = urlparse(url)
    frag = purl.fragment
    name = self.container.href_to_name(url, self.old_name)
    if not name:
        return url
    if name == self.old_name:
        name = self.new_name
    href = self.container.name_to_href(name, self.new_name)
    if frag:
        href += '#' + frag
    if href != url:
        self.replaced = True
    return href
def read_from_opf(self, opfreader):
    toc = opfreader.soup.find('spine', toc=True)
    if toc is not None:
        toc = toc['toc']
    if toc is None:
        try:
            toc = opfreader.soup.find('guide').find('reference', attrs={'type': 'toc'})['href']
        except:
            for item in opfreader.manifest:
                if 'toc' in item.href().lower():
                    toc = item.href()
                    break
    if toc is not None:
        if toc.lower() not in ('ncx', 'ncxtoc'):
            toc = urlparse(unquote(toc))[2]
            toc = toc.replace('/', os.sep)
            if not os.path.isabs(toc):
                toc = os.path.join(self.base_path, toc)
            try:
                if not os.path.exists(toc):
                    bn = os.path.basename(toc)
                    bn = bn.replace('_top.htm', '_toc.htm')  # Bug in BAEN OPF files
                    toc = os.path.join(os.path.dirname(toc), bn)
                self.read_html_toc(toc)
            except:
                print('WARNING: Could not read Table of Contents. Continuing anyway.')
        else:
            path = opfreader.manifest.item(toc.lower())
            path = getattr(path, 'path', path)
            if path and os.access(path, os.R_OK):
                try:
                    self.read_ncx_toc(path)
                except Exception as err:
                    print('WARNING: Invalid NCX file:', err)
                return
            cwd = os.path.abspath(self.base_path)
            m = glob.glob(os.path.join(cwd, '*.ncx'))
            if m:
                toc = m[0]
                self.read_ncx_toc(toc)
def add_from_navpoint(container, navpoint, parent, ncx_name):
    dest = frag = text = None
    nl = child_xpath(navpoint, 'navlabel')
    if nl:
        nl = nl[0]
        text = ''
        for txt in child_xpath(nl, 'text'):
            text += etree.tostring(txt, method='text', encoding='unicode', with_tail=False)
    content = child_xpath(navpoint, 'content')
    if content:
        content = content[0]
        href = content.get('src', None)
        if href:
            dest = container.href_to_name(href, base=ncx_name)
            frag = urlparse(href).fragment or None
    return parent.add(text or None, dest or None, frag or None)
def url_replacer(self, orig_url):
    url = urlnormalize(orig_url)
    parts = urlparse(url)
    if parts.scheme:
        # Only rewrite local URLs
        return orig_url
    path, frag = urldefrag(url)
    if self.renamed_items_map:
        orig_item = self.renamed_items_map.get(self.current_item.href, self.current_item)
    else:
        orig_item = self.current_item
    href = orig_item.abshref(path)
    replacement = self.current_item.relhref(self.rename_map.get(href, href))
    if frag:
        replacement += '#' + frag
    return replacement
def __call__(self, container):
    frag = urlparse(self.href).fragment
    nhref = container.name_to_href(self.corrected_name, self.name)
    if frag:
        nhref += '#' + frag
    orig_href = self.href

    class LinkReplacer(object):
        replaced = False

        def __call__(self, url):
            if url != orig_href:
                return url
            self.replaced = True
            return nhref

    replacer = LinkReplacer()
    container.replace_links(self.name, replacer)
    return replacer.replaced
def parse_html_toc(data):
    from html5_parser import parse
    from calibre.utils.cleantext import clean_xml_chars
    from lxml import etree
    if isinstance(data, bytes):
        data = xml_to_unicode(data, strip_encoding_pats=True, resolve_entities=True)[0]
    root = parse(clean_xml_chars(data), maybe_xhtml=True, keep_doctype=False, sanitize_names=True)
    for a in root.xpath('//*[@href and local-name()="a"]'):
        purl = urlparse(unquote(a.get('href')))
        href, fragment = purl[2], purl[5]
        if not fragment:
            fragment = None
        else:
            fragment = fragment.strip()
        href = href.strip()
        txt = etree.tostring(a, method='text', encoding='unicode')
        yield href, fragment, txt
def __call__(self, container):
    frag = urlparse(self.href).fragment
    nhref = container.name_to_href(self.corrected_name, self.name)
    if frag:
        nhref += '#' + frag
    orig_href = self.href

    class LinkReplacer:
        replaced = False

        def __call__(self, url):
            if url != orig_href:
                return url
            self.replaced = True
            return nhref

    replacer = LinkReplacer()
    container.replace_links(self.name, replacer)
    return replacer.replaced
def replace_link(url):
    purl = urlparse(url)
    if purl.scheme != 'https' or purl.netloc not in ('calibre-pdf-anchor.a', 'calibre-pdf-anchor.n'):
        return
    loc = None
    if purl.netloc == 'calibre-pdf-anchor.a':
        loc = anchor_locations.get(purl.fragment)
        if loc is None:
            log.warn(f'Anchor location for link to {purl.fragment} not found')
    else:
        loc = anchor_locations.get(name_anchor_map.get(purl.fragment))
        if loc is None:
            log.warn(f'Anchor location for link to {purl.fragment} not found')
    if loc is None:
        return None
    if loc.pagenum > pc:
        log.warn(f'Anchor location for link to {purl.fragment} is past the end of the document, moving it to last page')
        loc.pagenum = pc
    return loc.as_tuple
def serialize_hyperlink(self, parent, link):
    item, url, tooltip = link
    purl = urlparse(url)
    href = purl.path

    def make_link(parent, anchor=None, id=None, tooltip=None):
        kw = {}
        if anchor is not None:
            kw['w_anchor'] = anchor
        elif id is not None:
            kw['r_id'] = id
        if tooltip:
            kw['w_tooltip'] = tooltip
        return self.namespace.makeelement(parent, 'w:hyperlink', **kw)

    if not purl.scheme:
        href = item.abshref(href)
        if href not in self.document_hrefs:
            href = urlquote(href)
        if href in self.document_hrefs:
            key = (href, purl.fragment or self.top_anchor)
            if key in self.anchor_map:
                bmark = self.anchor_map[key]
            else:
                bmark = self.anchor_map[(href, self.top_anchor)]
            return make_link(parent, anchor=bmark, tooltip=tooltip)
        else:
            self.log.warn('Ignoring internal hyperlink with href (%s) pointing to unknown destination' % url)
    if purl.scheme in {'http', 'https', 'ftp'}:
        if url not in self.external_links:
            self.external_links[url] = self.document_relationships.add_relationship(
                url, self.namespace.names['LINKS'], target_mode='External')
        return make_link(parent, id=self.external_links[url], tooltip=tooltip)
    return parent
def check_link_destination(container, dest_map, name, href, a, errors):
    if href.startswith('#'):
        tname = name
    else:
        try:
            tname = container.href_to_name(href, name)
        except ValueError:
            tname = None  # Absolute links to files on another drive in windows cause this
    if tname and tname in container.mime_map:
        if container.mime_map[tname] not in OEB_DOCS:
            errors.append(BadDestinationType(name, tname, a))
        else:
            root = container.parsed(tname)
            if hasattr(root, 'xpath'):
                if tname not in dest_map:
                    dest_map[tname] = set(root.xpath('//*/@id|//*/@name'))
                purl = urlparse(href)
                if purl.fragment and purl.fragment not in dest_map[tname]:
                    errors.append(BadDestinationFragment(name, tname, a, purl.fragment))
            else:
                errors.append(BadDestinationType(name, tname, a))
def get_proxy_info(proxy_scheme, proxy_string):
    '''
    Parse all proxy information from a proxy string (as returned by
    get_proxies). The returned dict will have members set to None when the
    info is not available in the string. If an exception occurs parsing the
    string this method returns None.
    '''
    from polyglot.urllib import urlparse
    try:
        proxy_url = '%s://%s'%(proxy_scheme, proxy_string)
        urlinfo = urlparse(proxy_url)
        ans = {
            'scheme': urlinfo.scheme,
            'hostname': urlinfo.hostname,
            'port': urlinfo.port,
            'username': urlinfo.username,
            'password': urlinfo.password,
        }
    except Exception:
        return None
    return ans
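
# Hypothetical usage sketch for get_proxy_info above (the proxy host and
# credentials are made up, not taken from any real configuration):
#
#     info = get_proxy_info('http', 'user:secret@proxy.example.com:3128')
#     # info == {'scheme': 'http', 'hostname': 'proxy.example.com',
#     #          'port': 3128, 'username': 'user', 'password': 'secret'}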
def read_html_toc(self, toc):
    self.base_path = os.path.dirname(toc)
    soup = BeautifulSoup(open(toc, 'rb').read(),
            convertEntities=BeautifulSoup.HTML_ENTITIES)
    for a in soup.findAll('a'):
        if not a.has_key('href'):  # noqa
            continue
        purl = urlparse(unquote(a['href']))
        href, fragment = purl[2], purl[5]
        if not fragment:
            fragment = None
        else:
            fragment = fragment.strip()
        href = href.strip()
        txt = ''.join([unicode_type(s).strip() for s in a.findAll(text=True)])
        add = True
        for i in self.flat():
            if i.href == href and i.fragment == fragment:
                add = False
                break
        if add:
            self.add_item(href, fragment, txt)
def __call__(self, url):
    if url and url.startswith('#'):
        repl = self.frag_map(self.base, url[1:])
        if not repl or repl == url[1:]:
            return url
        self.replaced = True
        return '#' + repl
    name = self.container.href_to_name(url, self.base)
    if not name:
        return url
    nname = self.link_map.get(name, None)
    if not nname:
        return url
    purl = urlparse(url)
    href = self.container.name_to_href(nname, self.base)
    if purl.fragment:
        nfrag = self.frag_map(name, purl.fragment)
        if nfrag:
            href += '#%s' % nfrag
    if href != url:
        self.replaced = True
    return href
def __call__(self, url):
    if url and url.startswith('#'):
        repl = self.id_map.get(self.base, {}).get(url[1:])
        if repl is None or repl == url[1:]:
            return url
        self.replaced = True
        return '#' + repl
    name = self.container.href_to_name(url, self.base)
    if not name:
        return url
    id_map = self.id_map.get(name)
    if id_map is None:
        return url
    purl = urlparse(url)
    nfrag = id_map.get(purl.fragment)
    if nfrag is None:
        return url
    purl = purl._replace(fragment=nfrag)
    href = urlunparse(purl)
    if href != url:
        self.replaced = True
    return href
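
# Stdlib-only sketch (not calibre code) of the fragment-rewrite idiom used in
# the __call__ above: ParseResult is a namedtuple, so _replace() swaps just the
# fragment and urlunparse() re-serializes the URL with everything else intact.
from urllib.parse import urlparse, urlunparse

_purl = urlparse('chapter1.html?x=1#old-anchor')
assert urlunparse(_purl._replace(fragment='new-anchor')) == 'chapter1.html?x=1#new-anchor'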
def __init__(self, format):
    '''
    Create a query object by passing it the url format obtained
    from the opensearch Description.
    '''
    self.format = format

    # unpack the url to a tuple
    self.url_parts = urlparse(format)

    # unpack the query string to a dictionary
    self.query_string = parse_qs(self.url_parts[4])

    # look for standard macros and create a mapping of the
    # opensearch names to the service specific ones
    # so q={searchTerms} will result in a mapping between searchTerms and q
    self.macro_map = {}
    for key, values in self.query_string.items():
        # TODO eventually optional/required params should be
        # distinguished somehow (the ones with/without trailing ?
        macro = values[0].replace('{', '').replace('}', '').replace('?', '')
        if macro in Query.standard_macros:
            self.macro_map[macro] = key
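
# Stdlib-only sketch (not part of the class above) of the macro mapping it
# builds: given a made-up OpenSearch URL template, each {macro} placeholder in
# the query string is mapped back to the service-specific parameter name.
from urllib.parse import parse_qs, urlparse

_fmt = 'https://example.com/search?q={searchTerms}&pw={startPage?}'
_qs = parse_qs(urlparse(_fmt)[4])
_macro_map = {v[0].strip('{}?'): k for k, v in _qs.items()}
assert _macro_map == {'searchTerms': 'q', 'startPage': 'pw'}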
def __init__(self, opts):
    self.library_path = opts.library_path or prefs['library_path']
    self.timeout = opts.timeout
    self.url = None
    if self.library_path is None:
        raise SystemExit(
            'No saved library path, either run the GUI or use the'
            ' --with-library option')
    if self.library_path.partition(':')[0] in ('http', 'https'):
        parts = urlparse(self.library_path)
        self.library_id = parts.fragment or None
        self.url = urlunparse(parts._replace(fragment='')).rstrip('/')
        self.br = browser(handle_refresh=False, user_agent='{} {}'.format(__appname__, __version__))
        self.is_remote = True
        username, password = read_credentials(opts)
        self.has_credentials = False
        if username and password:
            self.br.add_password(self.url, username, password)
            self.has_credentials = True
        if self.library_id == '-':
            self.list_libraries()
            raise SystemExit()
    else:
        self.library_path = os.path.expanduser(self.library_path)
        if not singleinstance('db'):
            ext = '.exe' if iswindows else ''
            raise SystemExit(_(
                'Another calibre program such as {} or the main calibre program is running.'
                ' Having multiple programs that can make changes to a calibre library'
                ' running at the same time is a bad idea. calibredb can connect directly'
                ' to a running calibre Content server, to make changes through it, instead.'
                ' See the documentation of the {} option for details.'
            ).format('calibre-server' + ext, '--with-library'))
        self._db = None
        self.is_remote = False
def get_download_filename_from_response(response):
    from polyglot.urllib import unquote, urlparse
    filename = last_part_name = ''
    try:
        purl = urlparse(response.geturl())
        last_part_name = unquote(purl.path.split('/')[-1])
        disposition = response.info().get('Content-disposition', '')
        for p in disposition.split(';'):
            if 'filename' in p:
                if '*=' in disposition:
                    parts = disposition.split('*=')[-1]
                    filename = parts.split('\'')[-1]
                else:
                    filename = disposition.split('=')[-1]
                    if filename[0] in ('\'', '"'):
                        filename = filename[1:]
                    if filename[-1] in ('\'', '"'):
                        filename = filename[:-1]
                    filename = unquote(filename)
                break
    except Exception:
        import traceback
        traceback.print_exc()
    return filename or last_part_name