def process_stylesheets(self, soup, baseurl):
    diskpath = unicode_path(os.path.join(self.current_dir, 'stylesheets'))
    if not os.path.exists(diskpath):
        os.mkdir(diskpath)
    for c, tag in enumerate(soup.findAll(name=['link', 'style'])):
        try:
            mtype = tag['type']
        except KeyError:
            mtype = 'text/css' if tag.name.lower() == 'style' else ''
        if mtype.lower() != 'text/css':
            continue
        if tag.has_attr('href'):
            iurl = tag['href']
            if not urlsplit(iurl).scheme:
                iurl = urljoin(baseurl, iurl, False)
            found_cached = False
            with self.stylemap_lock:
                if iurl in self.stylemap:
                    tag['href'] = self.stylemap[iurl]
                    found_cached = True
            if found_cached:
                continue
            try:
                data = self.fetch_url(iurl)
            except Exception:
                self.log.exception('Could not fetch stylesheet ', iurl)
                continue
            stylepath = os.path.join(diskpath, 'style' + str(c) + '.css')
            with self.stylemap_lock:
                self.stylemap[iurl] = stylepath
            with open(stylepath, 'wb') as x:
                x.write(data)
            tag['href'] = stylepath
        else:
            for ns in tag.findAll(text=True):
                src = str(ns)
                m = self.__class__.CSS_IMPORT_PATTERN.search(src)
                if m:
                    iurl = m.group(1)
                    if not urlsplit(iurl).scheme:
                        iurl = urljoin(baseurl, iurl, False)
                    found_cached = False
                    with self.stylemap_lock:
                        if iurl in self.stylemap:
                            ns.replaceWith(src.replace(m.group(1), self.stylemap[iurl]))
                            found_cached = True
                    if found_cached:
                        continue
                    try:
                        data = self.fetch_url(iurl)
                    except Exception:
                        self.log.exception('Could not fetch stylesheet ', iurl)
                        continue
                    c += 1
                    stylepath = os.path.join(diskpath, 'style' + str(c) + '.css')
                    with self.stylemap_lock:
                        self.stylemap[iurl] = stylepath
                    with open(stylepath, 'wb') as x:
                        x.write(data)
                    ns.replaceWith(src.replace(m.group(1), stylepath))

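# The @import handling above relies on CSS_IMPORT_PATTERN, a regex defined on
# the class elsewhere in the module. A minimal sketch of the shape such a
# pattern could take (illustrative, not necessarily the exact definition used
# here):
#
#   import re
#   CSS_IMPORT_PATTERN = re.compile(r'@import\s+url\((.*?)\)', re.IGNORECASE)
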
def basename(url):
    try:
        parts = urlsplit(url)
        path = url2pathname(parts.path)
        res = os.path.basename(path)
    except Exception:
        global bad_url_counter
        bad_url_counter += 1
        return 'bad_url_%d.html' % bad_url_counter
    if not os.path.splitext(res)[1]:
        return 'index.html'
    return res

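# Behaviour sketch with hypothetical URLs: a path with a file extension keeps
# its basename, an extensionless path falls back to 'index.html', and a URL
# that urlsplit()/url2pathname() cannot handle yields a synthetic name from
# the module-level bad_url_counter:
#
#   basename('https://example.com/a/page.html')  # -> 'page.html'
#   basename('https://example.com/a/b/')         # -> 'index.html'
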
def absurl(self, baseurl, tag, key, filter=True):
    iurl = tag[key]
    parts = urlsplit(iurl)
    if not parts.netloc and not parts.path and not parts.query:
        return None
    if not parts.scheme:
        iurl = urljoin(baseurl, iurl, False)
    if not self.is_link_ok(iurl):
        self.log.debug('Skipping invalid link:', iurl)
        return None
    if filter and not self.is_link_wanted(iurl, tag):
        self.log.debug('Filtered link: ' + iurl)
        return None
    return iurl

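# Resolution sketch (hypothetical tag and baseurl): a relative href such as
# '/news/story.html' resolved against 'https://example.com/index.html' comes
# back absolute as 'https://example.com/news/story.html'; an entirely empty
# URL, or a link rejected by is_link_ok()/is_link_wanted(), returns None.
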
def localize_link(self, tag, key, path):
    parts = urlsplit(tag[key])
    suffix = ('#' + parts.fragment) if parts.fragment else ''
    tag[key] = path + suffix

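# Example (hypothetical values): with tag['href'] set to
# 'https://example.com/page.html#part-2' and path 'article_3/index.html',
# the href is rewritten to 'article_3/index.html#part-2', preserving the
# fragment so in-page anchors keep working after localization.
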
def normurl(self, url):
    parts = list(urlsplit(url))
    parts[4] = ''
    return urlunsplit(parts)

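# Sketch (hypothetical URL): only element 4 of the urlsplit() tuple, the
# fragment, is cleared, so links that differ only in their #anchor collapse
# to the same canonical URL:
#
#   self.normurl('https://example.com/p.html?q=1#top')
#   # -> 'https://example.com/p.html?q=1'
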
def get_https_resource_securely(
        url, cacerts='calibre-ebook-root-CA.crt', timeout=60, max_redirects=5,
        ssl_version=None, headers=None, get_response=False):
    '''
    Download the resource pointed to by url using https securely (verify
    server certificate). Ensures that redirects, if any, are also downloaded
    securely. Needs a CA certificates bundle (in PEM format) to verify the
    server's certificates. You can pass cacerts=None to download using SSL
    but without verifying the server certificate.
    '''
    if ssl_version is None:
        try:
            ssl_version = ssl.PROTOCOL_TLSv1_2
        except AttributeError:
            ssl_version = ssl.PROTOCOL_TLSv1  # old python
    cert_file = None
    if cacerts is not None:
        cert_file = P(cacerts, allow_user_override=False)
    p = urlsplit(url)
    if p.scheme != 'https':
        raise ValueError('URL %s scheme must be https, not %r' % (url, p.scheme))

    hostname, port = p.hostname, p.port
    proxies = get_proxies()
    has_proxy = False
    for q in ('https', 'http'):
        if q in proxies:
            try:
                h, po = proxies[q].rpartition(':')[::2]
                po = int(po)
                if h:
                    hostname, port, has_proxy = h, po, True
                    break
            except Exception:
                # Invalid proxy, ignore
                pass

    c = HTTPSConnection(ssl_version, hostname, port, cert_file=cert_file, timeout=timeout)
    if has_proxy:
        c.set_tunnel(p.hostname, p.port)
    with closing(c):
        c.connect()  # This is needed for proxy connections
        path = p.path or '/'
        if p.query:
            path += '?' + p.query
        c.request('GET', path, headers=headers or {})
        response = c.getresponse()
        if response.status in (httplib.MOVED_PERMANENTLY, httplib.FOUND, httplib.SEE_OTHER):
            if max_redirects <= 0:
                raise ValueError('Too many redirects, giving up')
            newurl = response.getheader('Location', None)
            if newurl is None:
                raise ValueError('%s returned a redirect response with no Location header' % url)
            return get_https_resource_securely(
                newurl, cacerts=cacerts, timeout=timeout,
                max_redirects=max_redirects - 1, ssl_version=ssl_version,
                get_response=get_response)
        if response.status != httplib.OK:
            raise HTTPError(url, response.status)
        if get_response:
            return response
        return response.read()

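# A minimal usage sketch, assuming the calibre environment this function
# lives in (the P() resource helper and the bundled root CA certificate);
# the URL below is illustrative:
#
#   raw = get_https_resource_securely('https://code.calibre-ebook.com/latest')
#
# With get_response=True the HTTPResponse object is returned instead of the
# body, leaving the caller responsible for reading it.
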
def process_images(self, soup, baseurl):
    diskpath = unicode_path(os.path.join(self.current_dir, 'images'))
    if not os.path.exists(diskpath):
        os.mkdir(diskpath)
    c = 0
    for tag in soup.findAll('img', src=True):
        iurl = tag['src']
        if iurl.startswith('data:'):
            try:
                data = urlopen(iurl).read()
            except Exception:
                self.log.exception('Failed to decode embedded image')
                continue
        else:
            if callable(self.image_url_processor):
                iurl = self.image_url_processor(baseurl, iurl)
            if not urlsplit(iurl).scheme:
                iurl = urljoin(baseurl, iurl, False)
            found_in_cache = False
            with self.imagemap_lock:
                if iurl in self.imagemap:
                    tag['src'] = self.imagemap[iurl]
                    found_in_cache = True
            if found_in_cache:
                continue
            try:
                data = self.fetch_url(iurl)
                if data == b'GIF89a\x01':
                    # Skip empty GIF files as PIL errors on them anyway
                    continue
            except Exception:
                self.log.exception('Could not fetch image ', iurl)
                continue
        c += 1
        fname = ascii_filename('img' + str(c))
        data = self.preprocess_image_ext(data, iurl) if self.preprocess_image_ext is not None else data
        if data is None:
            continue
        itype = what(None, data)
        if itype == 'svg' or (itype is None and b'<svg' in data[:1024]):
            # SVG image
            imgpath = os.path.join(diskpath, fname + '.svg')
            with self.imagemap_lock:
                self.imagemap[iurl] = imgpath
            with open(imgpath, 'wb') as x:
                x.write(data)
            tag['src'] = imgpath
        else:
            from calibre.utils.img import image_from_data, image_to_data
            try:
                # Ensure image is valid
                img = image_from_data(data)
                if itype not in {'png', 'jpg', 'jpeg'}:
                    itype = 'png' if itype == 'gif' else 'jpeg'
                    data = image_to_data(img, fmt=itype)
                if self.compress_news_images and itype in {'jpg', 'jpeg'}:
                    try:
                        data = self.rescale_image(data)
                    except Exception:
                        self.log.exception('failed to compress image ' + iurl)
                # Moon+ apparently cannot handle .jpeg files
                if itype == 'jpeg':
                    itype = 'jpg'
                imgpath = os.path.join(diskpath, fname + '.' + itype)
                with self.imagemap_lock:
                    self.imagemap[iurl] = imgpath
                with open(imgpath, 'wb') as x:
                    x.write(data)
                tag['src'] = imgpath
            except Exception:
                traceback.print_exc()
                continue

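# Hook sketch (hypothetical names): preprocess_image_ext, when set on the
# fetcher, receives the raw image bytes and the image URL, and may return
# transformed bytes or None to drop the image entirely, as the
# `if data is None: continue` check above shows:
#
#   def drop_tiny_images(data, url):
#       return None if len(data) < 100 else data
#
#   fetcher.preprocess_image_ext = drop_tiny_images  # 'fetcher' is illustrative
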