Example #1
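A method from an HTML fetcher class (note the self.fetch_url and self.log helpers) that localizes a page's stylesheets: external <link> sheets are downloaded into a local stylesheets directory, inline <style> text is scanned for @import URLs, and self.stylemap acts as a download cache shared across threads under stylemap_lock.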
 def process_stylesheets(self, soup, baseurl):
     diskpath = unicode_path(os.path.join(self.current_dir, 'stylesheets'))
     if not os.path.exists(diskpath):
         os.mkdir(diskpath)
     for c, tag in enumerate(soup.findAll(name=['link', 'style'])):
         try:
             mtype = tag['type']
         except KeyError:
             mtype = 'text/css' if tag.name.lower() == 'style' else ''
         if mtype.lower() != 'text/css':
             continue
         if tag.has_attr('href'):
             iurl = tag['href']
             if not urlsplit(iurl).scheme:
                 iurl = urljoin(baseurl, iurl, False)
             # Look up the shared cache under the lock; fall through to a
             # download only if no other thread has already fetched this URL.
             found_cached = False
             with self.stylemap_lock:
                 if iurl in self.stylemap:
                     tag['href'] = self.stylemap[iurl]
                     found_cached = True
             if found_cached:
                 continue
             try:
                 data = self.fetch_url(iurl)
             except Exception:
                 self.log.exception('Could not fetch stylesheet ', iurl)
                 continue
             stylepath = os.path.join(diskpath, 'style'+str(c)+'.css')
             with self.stylemap_lock:
                 self.stylemap[iurl] = stylepath
             with open(stylepath, 'wb') as x:
                 x.write(data)
             tag['href'] = stylepath
         else:
             for ns in tag.findAll(text=True):
                 src = str(ns)
                 m = self.__class__.CSS_IMPORT_PATTERN.search(src)
                 if m:
                     iurl = m.group(1)
                     if not urlsplit(iurl).scheme:
                         iurl = urljoin(baseurl, iurl, False)
                     found_cached = False
                     with self.stylemap_lock:
                         if iurl in self.stylemap:
                             ns.replaceWith(src.replace(m.group(1), self.stylemap[iurl]))
                             found_cached = True
                     if found_cached:
                         continue
                     try:
                         data = self.fetch_url(iurl)
                     except Exception:
                         self.log.exception('Could not fetch stylesheet ', iurl)
                         continue
                     # Advance the counter so each @import found in this tag
                     # gets its own file name.
                     c += 1
                     stylepath = os.path.join(diskpath, 'style'+str(c)+'.css')
                     with self.stylemap_lock:
                         self.stylemap[iurl] = stylepath
                     with open(stylepath, 'wb') as x:
                         x.write(data)
                     ns.replaceWith(src.replace(m.group(1), stylepath))
Example #2
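An older variant of the same method, written for Python 2 and BeautifulSoup 3: it filters tags with has_key() inside the findAll predicate (hence the # noqa markers), and the cache-hit continue statements sit inside the with self.stylemap_lock: blocks rather than after them as in Example #1.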
 def process_stylesheets(self, soup, baseurl):
     diskpath = unicode_path(os.path.join(self.current_dir, 'stylesheets'))
     if not os.path.exists(diskpath):
         os.mkdir(diskpath)
     for c, tag in enumerate(soup.findAll(
             lambda tag: tag.name.lower() in ['link', 'style'] and
             tag.has_key('type') and tag['type'].lower() == 'text/css')):  # noqa
         if tag.has_key('href'):  # noqa
             iurl = tag['href']
             if not urlsplit(iurl).scheme:
                 iurl = urljoin(baseurl, iurl, False)
             with self.stylemap_lock:
                 if self.stylemap.has_key(iurl):  # noqa
                     tag['href'] = self.stylemap[iurl]
                     continue
             try:
                 data = self.fetch_url(iurl)
             except Exception:
                 self.log.exception('Could not fetch stylesheet ', iurl)
                 continue
             stylepath = os.path.join(diskpath, 'style' + str(c) + '.css')
             with self.stylemap_lock:
                 self.stylemap[iurl] = stylepath
             with open(stylepath, 'wb') as x:
                 x.write(data)
             tag['href'] = stylepath
         else:
             for ns in tag.findAll(text=True):
                 src = str(ns)
                 m = self.__class__.CSS_IMPORT_PATTERN.search(src)
                 if m:
                     iurl = m.group(1)
                     if not urlsplit(iurl).scheme:
                         iurl = urljoin(baseurl, iurl, False)
                     with self.stylemap_lock:
                         if self.stylemap.has_key(iurl):  # noqa
                             ns.replaceWith(
                                 src.replace(m.group(1),
                                             self.stylemap[iurl]))
                             continue
                     try:
                         data = self.fetch_url(iurl)
                     except Exception:
                         self.log.exception('Could not fetch stylesheet ',
                                            iurl)
                         continue
                     c += 1
                     stylepath = os.path.join(diskpath,
                                              'style' + str(c) + '.css')
                     with self.stylemap_lock:
                         self.stylemap[iurl] = stylepath
                     with open(stylepath, 'wb') as x:
                         x.write(data)
                     ns.replaceWith(src.replace(m.group(1), stylepath))
Example #3
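A helper that derives a local file name from a URL: the path component is converted to a filesystem path and its basename taken; a parse failure yields a numbered bad_url_%d.html fallback via a module-level counter, and a name without an extension is replaced by index.html.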
def basename(url):
    try:
        parts = urlsplit(url)
        path = url2pathname(parts.path)
        res = os.path.basename(path)
    except Exception:
        global bad_url_counter
        bad_url_counter += 1
        return 'bad_url_%d.html' % bad_url_counter
    if not os.path.splitext(res)[1]:
        return 'index.html'
    return res
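A short usage sketch; the imports and the bad_url_counter global are assumptions supplied here, since the snippet above omits them:

import os
from urllib.parse import urlsplit
from urllib.request import url2pathname

bad_url_counter = 0  # assumed module-level counter used by basename()

# With basename() from the example above in scope:
print(basename('http://example.com/css/style.css'))  # -> 'style.css'
print(basename('http://example.com/articles/'))      # -> 'index.html' (no extension)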
Example #4
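This method resolves tag[key] to an absolute URL. It returns None for empty links, joins scheme-less URLs onto baseurl, and then rejects the result if is_link_ok fails or, when filter is true, if is_link_wanted declines it, logging the reason in both cases.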
 def absurl(self, baseurl, tag, key, filter=True):
     iurl = tag[key]
     parts = urlsplit(iurl)
     if not parts.netloc and not parts.path and not parts.query:
         return None
     if not parts.scheme:
         iurl = urljoin(baseurl, iurl, False)
     if not self.is_link_ok(iurl):
         self.log.debug('Skipping invalid link:', iurl)
         return None
     if filter and not self.is_link_wanted(iurl, tag):
         self.log.debug('Filtered link: ' + iurl)
         return None
     return iurl
Example #5
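A small helper that repoints a link at a local path while preserving any #fragment from the original URL.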
 def localize_link(self, tag, key, path):
     parts = urlsplit(tag[key])
     suffix = ('#' + parts.fragment) if parts.fragment else ''
     tag[key] = path + suffix
Example #6
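Normalizes a URL by clearing the fragment (index 4 of the tuple returned by urlsplit) and reassembling the rest with urlunsplit, so links that differ only in their fragment refer to the same resource.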
 def normurl(self, url):
     parts = list(urlsplit(url))
     parts[4] = ''
     return urlunsplit(parts)
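A quick sketch of the effect, written as a standalone copy of the method (minus self) since only urlsplit and urlunsplit from the standard library are needed:

from urllib.parse import urlsplit, urlunsplit

def normurl(url):
    parts = list(urlsplit(url))
    parts[4] = ''  # clear the fragment component
    return urlunsplit(parts)

print(normurl('http://example.com/page#section-2'))  # -> 'http://example.com/page'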
Example #7
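An image-downloading counterpart to Example #2, again in the Python 2 / BeautifulSoup 3 style: data:image/ URIs are base64-decoded inline, remote images go through self.fetch_url with an imagemap cache, and anything that is not already PNG or JPEG is converted (with a what() helper used for type sniffing) before being written to disk.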
 def process_images(self, soup, baseurl):
     diskpath = unicode_path(os.path.join(self.current_dir, 'images'))
     if not os.path.exists(diskpath):
         os.mkdir(diskpath)
     c = 0
     for tag in soup.findAll(
             lambda tag: tag.name.lower() == 'img' and tag.has_key('src')):  # noqa
         iurl = tag['src']
         if iurl.startswith('data:image/'):
             try:
                 data = b64decode(iurl.partition(',')[-1])
             except Exception:
                 self.log.exception('Failed to decode embedded image')
                 continue
         else:
             if callable(self.image_url_processor):
                 iurl = self.image_url_processor(baseurl, iurl)
             if not urlsplit(iurl).scheme:
                 iurl = urljoin(baseurl, iurl, False)
             with self.imagemap_lock:
                 if self.imagemap.has_key(iurl):  # noqa
                     tag['src'] = self.imagemap[iurl]
                     continue
             try:
                 data = self.fetch_url(iurl)
                 if data == 'GIF89a\x01':
                     # Skip empty GIF files as PIL errors on them anyway
                     continue
             except Exception:
                 self.log.exception('Could not fetch image ', iurl)
                 continue
         c += 1
         fname = ascii_filename('img' + str(c))
         if isinstance(fname, unicode_type):
             fname = fname.encode('ascii', 'replace')
         data = self.preprocess_image_ext(
             data, iurl) if self.preprocess_image_ext is not None else data
         if data is None:
             continue
         itype = what(None, data)
         if itype == 'svg' or (itype is None and b'<svg' in data[:1024]):
             # SVG image
             imgpath = os.path.join(diskpath, fname + '.svg')
             with self.imagemap_lock:
                 self.imagemap[iurl] = imgpath
             with open(imgpath, 'wb') as x:
                 x.write(data)
             tag['src'] = imgpath
         else:
             try:
                 # Ensure image is valid
                 img = image_from_data(data)
                 if itype not in {'png', 'jpg', 'jpeg'}:
                     itype = 'png' if itype == 'gif' else 'jpeg'
                     data = image_to_data(img, fmt=itype)
                 if self.compress_news_images and itype in {'jpg', 'jpeg'}:
                     try:
                         data = self.rescale_image(data)
                     except Exception:
                         self.log.exception('failed to compress image ' +
                                            iurl)
                 # Moon+ apparently cannot handle .jpeg files
                 if itype == 'jpeg':
                     itype = 'jpg'
                 imgpath = os.path.join(diskpath, fname + '.' + itype)
                 with self.imagemap_lock:
                     self.imagemap[iurl] = imgpath
                 with open(imgpath, 'wb') as x:
                     x.write(data)
                 tag['src'] = imgpath
             except Exception:
                 traceback.print_exc()
                 continue
Example #8
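A standalone helper from calibre that downloads an https resource while verifying the server certificate: it picks the newest available TLS protocol, honours any configured proxy via a CONNECT tunnel, follows up to max_redirects redirects recursively, and returns either the response body or, with get_response=True, the response object itself.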
def get_https_resource_securely(
        url, cacerts='calibre-ebook-root-CA.crt', timeout=60, max_redirects=5,
        ssl_version=None, headers=None, get_response=False):
    '''
    Download the resource pointed to by url using https securely (verify server
    certificate).  Ensures that redirects, if any, are also downloaded
    securely. Needs a CA certificates bundle (in PEM format) to verify the
    server's certificates.

    You can pass cacerts=None to download using SSL but without verifying the server certificate.
    '''
    if ssl_version is None:
        try:
            ssl_version = ssl.PROTOCOL_TLSv1_2
        except AttributeError:
            ssl_version = ssl.PROTOCOL_TLSv1  # old python
    cert_file = None
    if cacerts is not None:
        cert_file = P(cacerts, allow_user_override=False)
    p = urlsplit(url)
    if p.scheme != 'https':
        raise ValueError('URL %s scheme must be https, not %r' % (url, p.scheme))

    hostname, port = p.hostname, p.port
    proxies = get_proxies()
    has_proxy = False
    for q in ('https', 'http'):
        if q in proxies:
            try:
                h, po = proxies[q].rpartition(':')[::2]
                po = int(po)
                if h:
                    hostname, port, has_proxy = h, po, True
                    break
            except Exception:
                # Invalid proxy, ignore
                pass

    c = HTTPSConnection(ssl_version, hostname, port, cert_file=cert_file, timeout=timeout)
    if has_proxy:
        c.set_tunnel(p.hostname, p.port)

    with closing(c):
        c.connect()  # This is needed for proxy connections
        path = p.path or '/'
        if p.query:
            path += '?' + p.query
        c.request('GET', path, headers=headers or {})
        response = c.getresponse()
        if response.status in (httplib.MOVED_PERMANENTLY, httplib.FOUND, httplib.SEE_OTHER):
            if max_redirects <= 0:
                raise ValueError('Too many redirects, giving up')
            newurl = response.getheader('Location', None)
            if newurl is None:
                raise ValueError('%s returned a redirect response with no Location header' % url)
            return get_https_resource_securely(
                newurl, cacerts=cacerts, timeout=timeout, max_redirects=max_redirects-1, ssl_version=ssl_version, get_response=get_response)
        if response.status != httplib.OK:
            raise HTTPError(url, response.status)
        if get_response:
            return response
        return response.read()
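A minimal usage sketch; the URL is a placeholder, and running this requires the calibre internals the function depends on (P(), HTTPSConnection, get_proxies), which are not shown in the snippet:

# Fetch a resource, verifying the server certificate against the CA bundle.
raw = get_https_resource_securely('https://example.com/some/resource')

# Keep the response object instead, to inspect headers before reading.
response = get_https_resource_securely('https://example.com/some/resource',
                                       get_response=True)
print(response.getheader('Content-Type'))
body = response.read()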
Example #9
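The modern (Python 3 / BeautifulSoup 4) revision of Example #7: findAll('img', src=True) replaces the has_key predicate, data: URIs are decoded through urlopen, the empty-GIF sentinel is compared as a bytes literal, and the image helpers are imported from calibre.utils.img at the point of use.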
 def process_images(self, soup, baseurl):
     diskpath = unicode_path(os.path.join(self.current_dir, 'images'))
     if not os.path.exists(diskpath):
         os.mkdir(diskpath)
     c = 0
     for tag in soup.findAll('img', src=True):
         iurl = tag['src']
         if iurl.startswith('data:'):
             try:
                 data = urlopen(iurl).read()
             except Exception:
                 self.log.exception('Failed to decode embedded image')
                 continue
         else:
             if callable(self.image_url_processor):
                 iurl = self.image_url_processor(baseurl, iurl)
             if not urlsplit(iurl).scheme:
                 iurl = urljoin(baseurl, iurl, False)
             found_in_cache = False
             with self.imagemap_lock:
                 if iurl in self.imagemap:
                     tag['src'] = self.imagemap[iurl]
                     found_in_cache = True
             if found_in_cache:
                 continue
             try:
                 data = self.fetch_url(iurl)
                 if data == b'GIF89a\x01':
                     # Skip empty GIF files as PIL errors on them anyway
                     continue
             except Exception:
                 self.log.exception('Could not fetch image ', iurl)
                 continue
         c += 1
         fname = ascii_filename('img' + str(c))
         data = self.preprocess_image_ext(
             data, iurl) if self.preprocess_image_ext is not None else data
         if data is None:
             continue
         itype = what(None, data)
         if itype == 'svg' or (itype is None and b'<svg' in data[:1024]):
             # SVG image
             imgpath = os.path.join(diskpath, fname + '.svg')
             with self.imagemap_lock:
                 self.imagemap[iurl] = imgpath
             with open(imgpath, 'wb') as x:
                 x.write(data)
             tag['src'] = imgpath
         else:
             from calibre.utils.img import image_from_data, image_to_data
             try:
                 # Ensure image is valid
                 img = image_from_data(data)
                 if itype not in {'png', 'jpg', 'jpeg'}:
                     itype = 'png' if itype == 'gif' else 'jpeg'
                     data = image_to_data(img, fmt=itype)
                 if self.compress_news_images and itype in {'jpg', 'jpeg'}:
                     try:
                         data = self.rescale_image(data)
                     except Exception:
                         self.log.exception('failed to compress image ' +
                                            iurl)
                 # Moon+ apparently cannot handle .jpeg files
                 if itype == 'jpeg':
                     itype = 'jpg'
                 imgpath = os.path.join(diskpath, fname + '.' + itype)
                 with self.imagemap_lock:
                     self.imagemap[iurl] = imgpath
                 with open(imgpath, 'wb') as x:
                     x.write(data)
                 tag['src'] = imgpath
             except Exception:
                 traceback.print_exc()
                 continue