Ejemplo n.º 1
0
def save_soup(soup, target):
    '''
    Serialize ``soup`` to ``target`` as UTF-8 encoded HTML.

    A ``<meta http-equiv="Content-Type" ...>`` tag declaring UTF-8 replaces
    any existing charset-declaring meta tag (or is inserted at the top of
    ``<head>`` when none is present), and absolute local file paths in
    ``img``/``link``/``a`` tags are rewritten relative to ``target``.

    :param soup: The parsed BeautifulSoup document to save.
    :param target: Path of the output HTML file.
    '''
    ns = BeautifulSoup(
        '<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />'
    )
    nm = ns.find('meta')
    metas = soup.findAll('meta', content=True)
    added = False
    for meta in metas:
        if 'charset' in meta.get('content', '').lower():
            # NOTE: if several charset-declaring metas exist, nm is moved to
            # the last match, which effectively removes the earlier ones.
            meta.replaceWith(nm)
            added = True
    if not added:
        head = soup.find('head')
        if head is not None:
            head.insert(0, nm)

    selfdir = os.path.dirname(target)

    for tag in soup.findAll(['img', 'link', 'a']):
        for key in ('src', 'href'):
            path = tag.get(key, None)
            # os.path.isfile() already implies existence, so the redundant
            # os.path.exists() check from the original has been dropped.
            if path and os.path.isfile(path) and os.path.isabs(path):
                tag[key] = unicode_path(
                    relpath(path, selfdir).replace(os.sep, '/'))

    html = unicode_type(soup)
    with open(target, 'wb') as f:
        f.write(html.encode('utf-8'))
Ejemplo n.º 2
0
def save_soup(soup, target):
    '''Write soup to target as UTF-8 HTML, forcing a UTF-8 charset meta tag
    and rewriting absolute local resource paths relative to target.'''
    charset_doc = BeautifulSoup('<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />')
    charset_meta = charset_doc.find('meta')
    replaced = False
    for existing in soup.findAll('meta', content=True):
        if 'charset' in existing.get('content', '').lower():
            existing.replaceWith(charset_meta)
            replaced = True
    if not replaced:
        head = soup.find('head')
        if head is not None:
            head.insert(0, charset_meta)

    base_dir = os.path.dirname(target)

    for element in soup.findAll(['img', 'link', 'a']):
        for attr in ('src', 'href'):
            value = element.get(attr, None)
            if not value:
                continue
            if os.path.isfile(value) and os.path.exists(value) and os.path.isabs(value):
                element[attr] = unicode_path(relpath(value, base_dir).replace(os.sep, '/'))

    markup = unicode(soup)
    with open(target, 'wb') as out:
        out.write(markup.encode('utf-8'))
Ejemplo n.º 3
0
 def process_stylesheets(self, soup, baseurl):
     '''
     Download every CSS resource referenced by ``soup`` and rewrite the
     document to point at local copies saved under
     ``<current_dir>/stylesheets``.

     :param soup: Parsed HTML whose <link>/<style> tags are processed.
     :param baseurl: URL used to resolve scheme-less stylesheet references.
     '''
     diskpath = unicode_path(os.path.join(self.current_dir, 'stylesheets'))
     if not os.path.exists(diskpath):
         os.mkdir(diskpath)
     for c, tag in enumerate(soup.findAll(name=['link', 'style'])):
         try:
             mtype = tag['type']
         except KeyError:
             # A <style> tag with no type attribute defaults to CSS; a
             # <link> without one is skipped below.
             mtype = 'text/css' if tag.name.lower() == 'style' else ''
         if mtype.lower() != 'text/css':
             continue
         if tag.has_attr('href'):
             # External stylesheet: fetch once, cache the local path.
             iurl = tag['href']
             if not urlsplit(iurl).scheme:
                 iurl = urljoin(baseurl, iurl, False)
             found_cached = False
             # Hold the lock only for the cache lookup; the continue happens
             # outside so the lock is never held across network fetches.
             with self.stylemap_lock:
                 if iurl in self.stylemap:
                     tag['href'] = self.stylemap[iurl]
                     found_cached = True
             if found_cached:
                 continue
             try:
                 data = self.fetch_url(iurl)
             except Exception:
                 self.log.exception('Could not fetch stylesheet ', iurl)
                 continue
             stylepath = os.path.join(diskpath, 'style'+str(c)+'.css')
             with self.stylemap_lock:
                 self.stylemap[iurl] = stylepath
             with open(stylepath, 'wb') as x:
                 x.write(data)
             tag['href'] = stylepath
         else:
             # Inline <style>: rewrite any @import URL found in its text.
             for ns in tag.findAll(text=True):
                 src = str(ns)
                 m = self.__class__.CSS_IMPORT_PATTERN.search(src)
                 if m:
                     iurl = m.group(1)
                     if not urlsplit(iurl).scheme:
                         iurl = urljoin(baseurl, iurl, False)
                     found_cached = False
                     with self.stylemap_lock:
                         if iurl in self.stylemap:
                             ns.replaceWith(src.replace(m.group(1), self.stylemap[iurl]))
                             found_cached = True
                     if found_cached:
                         continue
                     try:
                         data = self.fetch_url(iurl)
                     except Exception:
                         self.log.exception('Could not fetch stylesheet ', iurl)
                         continue
                     # Bump c so imported sheets get unique file names that
                     # do not collide with the enumerate() counter.
                     c += 1
                     stylepath = os.path.join(diskpath, 'style'+str(c)+'.css')
                     with self.stylemap_lock:
                         self.stylemap[iurl] = stylepath
                     with open(stylepath, 'wb') as x:
                         x.write(data)
                     ns.replaceWith(src.replace(m.group(1), stylepath))
Ejemplo n.º 4
0
    def __init__(self,
                 path_to_html_file,
                 level,
                 encoding,
                 verbose,
                 referrer=None):
        '''
        Read an HTML file from disk, detect its encoding, decide whether it
        is binary, extract its title and collect the links it contains.

        :param level: The level of this file. Should be 0 for the root file.
        :param encoding: Use `encoding` to decode HTML.
        :param referrer: The :class:`HTMLFile` that first refers to this file.

        :raises OSError: if the root file (level 0) cannot be read.
        :raises IgnoreFile: if a non-root file cannot be read.
        :raises ValueError: if the root file is empty.
        '''
        self.path = unicode_path(path_to_html_file, abs=True)
        # Fallback title: the file name without its extension.
        self.title = os.path.splitext(os.path.basename(self.path))[0]
        self.base = os.path.dirname(self.path)
        self.level = level
        self.referrer = referrer
        self.links = []

        try:
            with open(self.path, 'rb') as f:
                # Sniff only the first 4K for encoding / binary detection.
                src = header = f.read(4096)
                # NOTE(review): this overwrites the encoding argument with
                # the detected value -- confirm that is intended.
                encoding = detect_xml_encoding(src)[1]
                if encoding:
                    try:
                        header = header.decode(encoding, errors='replace')
                    except ValueError:
                        pass
                self.is_binary = False
                if level > 0:
                    # Choose the bytes or text pattern depending on whether
                    # the header could be decoded above.
                    pat = self.HTML_PAT_BIN if isinstance(
                        header, bytes) else self.HTML_PAT
                    self.is_binary = not bool(pat.search(header))
                if not self.is_binary:
                    # Read the remainder only for files we will parse.
                    src += f.read()
        except OSError as err:
            msg = 'Could not read from file: %s with error: %s' % (
                self.path, as_unicode(err))
            if level == 0:
                raise OSError(msg)
            raise IgnoreFile(msg, err.errno)

        if not src:
            if level == 0:
                raise ValueError('The file %s is empty' % self.path)
            self.is_binary = True

        if not self.is_binary:
            if not encoding:
                encoding = detect_xml_encoding(src[:4096], verbose=verbose)[1]
                self.encoding = encoding
            else:
                self.encoding = encoding

            src = src.decode(encoding, 'replace')
            match = self.TITLE_PAT.search(src)
            self.title = match.group(1) if match is not None else self.title
            self.find_links(src)
Ejemplo n.º 5
0
 def process_stylesheets(self, soup, baseurl):
     '''
     Download every CSS resource referenced by ``soup`` and rewrite the
     document to point at local copies under ``<current_dir>/stylesheets``.

     NOTE(review): Python 2 era code -- ``has_key`` is gone in Python 3.

     :param soup: Parsed HTML whose <link>/<style> tags are processed.
     :param baseurl: URL used to resolve scheme-less stylesheet references.
     '''
     diskpath = unicode_path(os.path.join(self.current_dir, 'stylesheets'))
     if not os.path.exists(diskpath):
         os.mkdir(diskpath)
     for c, tag in enumerate(
             soup.findAll(lambda tag: tag.name.lower() in ['link', 'style']
                          and tag.has_key('type') and tag['type'].lower(
                          ) == 'text/css')):  # noqa
         if tag.has_key('href'):  # noqa
             # External stylesheet: fetch once, cache the local path.
             iurl = tag['href']
             if not urlsplit(iurl).scheme:
                 iurl = urljoin(baseurl, iurl, False)
             with self.stylemap_lock:
                 # continue from inside the with block still releases the
                 # lock before the next iteration.
                 if self.stylemap.has_key(iurl):  # noqa
                     tag['href'] = self.stylemap[iurl]
                     continue
             try:
                 data = self.fetch_url(iurl)
             except Exception:
                 self.log.exception('Could not fetch stylesheet ', iurl)
                 continue
             stylepath = os.path.join(diskpath, 'style' + str(c) + '.css')
             with self.stylemap_lock:
                 self.stylemap[iurl] = stylepath
             with open(stylepath, 'wb') as x:
                 x.write(data)
             tag['href'] = stylepath
         else:
             # Inline <style>: rewrite any @import URL found in its text.
             for ns in tag.findAll(text=True):
                 src = str(ns)
                 m = self.__class__.CSS_IMPORT_PATTERN.search(src)
                 if m:
                     iurl = m.group(1)
                     if not urlsplit(iurl).scheme:
                         iurl = urljoin(baseurl, iurl, False)
                     with self.stylemap_lock:
                         if self.stylemap.has_key(iurl):  # noqa
                             ns.replaceWith(
                                 src.replace(m.group(1),
                                             self.stylemap[iurl]))
                             continue
                     try:
                         data = self.fetch_url(iurl)
                     except Exception:
                         self.log.exception('Could not fetch stylesheet ',
                                            iurl)
                         continue
                     # Bump c so imported sheets get unique file names.
                     c += 1
                     stylepath = os.path.join(diskpath,
                                              'style' + str(c) + '.css')
                     with self.stylemap_lock:
                         self.stylemap[iurl] = stylepath
                     with open(stylepath, 'wb') as x:
                         x.write(data)
                     ns.replaceWith(src.replace(m.group(1), stylepath))
Ejemplo n.º 6
0
 def process_stylesheets(self, soup, baseurl):
     '''
     Download every CSS resource referenced by ``soup`` and rewrite the
     document to point at local copies under ``<current_dir>/stylesheets``.

     :param soup: Parsed HTML whose <link>/<style> tags are processed.
     :param baseurl: URL used to resolve scheme-less stylesheet references.
     '''
     diskpath = unicode_path(os.path.join(self.current_dir, 'stylesheets'))
     if not os.path.exists(diskpath):
         os.mkdir(diskpath)
     for c, tag in enumerate(soup.findAll(name=['link', 'style'])):
         try:
             mtype = tag['type']
         except KeyError:
             # <style> with no type attribute defaults to CSS; <link>
             # without one is skipped below.
             mtype = 'text/css' if tag.name.lower() == 'style' else ''
         if mtype.lower() != 'text/css':
             continue
         if tag.has_attr('href'):
             # External stylesheet: fetch once, cache the local path.
             iurl = tag['href']
             if not urlsplit(iurl).scheme:
                 iurl = urljoin(baseurl, iurl, False)
             with self.stylemap_lock:
                 # continue from inside the with block still releases the
                 # lock before the next iteration.
                 if iurl in self.stylemap:
                     tag['href'] = self.stylemap[iurl]
                     continue
             try:
                 data = self.fetch_url(iurl)
             except Exception:
                 self.log.exception('Could not fetch stylesheet ', iurl)
                 continue
             stylepath = os.path.join(diskpath, 'style'+str(c)+'.css')
             with self.stylemap_lock:
                 self.stylemap[iurl] = stylepath
             with open(stylepath, 'wb') as x:
                 x.write(data)
             tag['href'] = stylepath
         else:
             # Inline <style>: rewrite any @import URL found in its text.
             for ns in tag.findAll(text=True):
                 src = str(ns)
                 m = self.__class__.CSS_IMPORT_PATTERN.search(src)
                 if m:
                     iurl = m.group(1)
                     if not urlsplit(iurl).scheme:
                         iurl = urljoin(baseurl, iurl, False)
                     with self.stylemap_lock:
                         if iurl in self.stylemap:
                             ns.replaceWith(src.replace(m.group(1), self.stylemap[iurl]))
                             continue
                     try:
                         data = self.fetch_url(iurl)
                     except Exception:
                         self.log.exception('Could not fetch stylesheet ', iurl)
                         continue
                     # Bump c so imported sheets get unique file names.
                     c += 1
                     stylepath = os.path.join(diskpath, 'style'+str(c)+'.css')
                     with self.stylemap_lock:
                         self.stylemap[iurl] = stylepath
                     with open(stylepath, 'wb') as x:
                         x.write(data)
                     ns.replaceWith(src.replace(m.group(1), stylepath))
Ejemplo n.º 7
0
 def process_images(self, soup, baseurl):
     '''
     Download (or decode, for data: URIs) every <img> referenced by ``soup``
     and rewrite the tags to point at local copies saved under
     ``<current_dir>/images``.

     NOTE(review): Python 2 era code -- ``has_key``, ``unicode``,
     ``urlparse`` module and bare ``except:`` are all gone/discouraged in
     Python 3.

     :param soup: Parsed HTML whose <img> tags are processed.
     :param baseurl: URL used to resolve scheme-less image references.
     '''
     diskpath = unicode_path(os.path.join(self.current_dir, 'images'))
     if not os.path.exists(diskpath):
         os.mkdir(diskpath)
     c = 0
     for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
         iurl = tag['src']
         if iurl.startswith('data:image/'):
             # Inline base64 image: decode the payload after the comma.
             try:
                 data = b64decode(iurl.partition(',')[-1])
             except:
                 self.log.exception('Failed to decode embedded image')
                 continue
         else:
             if callable(self.image_url_processor):
                 iurl = self.image_url_processor(baseurl, iurl)
             if not urlparse.urlsplit(iurl).scheme:
                 iurl = urlparse.urljoin(baseurl, iurl, False)
             # Cache check under the lock; continue releases it.
             with self.imagemap_lock:
                 if self.imagemap.has_key(iurl):
                     tag['src'] = self.imagemap[iurl]
                     continue
             try:
                 data = self.fetch_url(iurl)
                 if data == 'GIF89a\x01':
                     # Skip empty GIF files as PIL errors on them anyway
                     continue
             except Exception:
                 self.log.exception('Could not fetch image ', iurl)
                 continue
         c += 1
         fname = ascii_filename('img'+str(c))
         if isinstance(fname, unicode):
             fname = fname.encode('ascii', 'replace')
         imgpath = os.path.join(diskpath, fname+'.jpg')
         # No recognized raster format but an <svg> marker near the start:
         # save verbatim as an SVG file instead of converting.
         if (imghdr.what(None, data) is None and b'<svg' in data[:1024]):
             # SVG image
             imgpath = os.path.join(diskpath, fname+'.svg')
             with self.imagemap_lock:
                 self.imagemap[iurl] = imgpath
             with open(imgpath, 'wb') as x:
                 x.write(data)
             tag['src'] = imgpath
         else:
             # Re-encode everything else to JPEG via PIL.
             try:
                 im = Image.open(StringIO(data)).convert('RGBA')
                 with self.imagemap_lock:
                     self.imagemap[iurl] = imgpath
                 with open(imgpath, 'wb') as x:
                     im.save(x, 'JPEG')
                 tag['src'] = imgpath
             except:
                 traceback.print_exc()
                 continue
Ejemplo n.º 8
0
    def __init__(self, path_to_html_file, level, encoding, verbose, referrer=None):
        '''
        Read an HTML file from disk, detect its encoding, decide whether it
        is binary, extract its title and collect the links it contains.

        :param level: The level of this file. Should be 0 for the root file.
        :param encoding: Use `encoding` to decode HTML.
        :param referrer: The :class:`HTMLFile` that first refers to this file.

        :raises IOError: if the root file (level 0) cannot be read.
        :raises IgnoreFile: if a non-root file cannot be read.
        :raises ValueError: if the root file is empty.
        '''
        self.path     = unicode_path(path_to_html_file, abs=True)
        # Fallback title: the file name without its extension.
        self.title    = os.path.splitext(os.path.basename(self.path))[0]
        self.base     = os.path.dirname(self.path)
        self.level    = level
        self.referrer = referrer
        self.links    = []

        try:
            with open(self.path, 'rb') as f:
                # Sniff only the first 4K for encoding / binary detection.
                src = header = f.read(4096)
                # NOTE(review): this overwrites the encoding argument with
                # the detected value -- confirm that is intended.
                encoding = detect_xml_encoding(src)[1]
                if encoding:
                    try:
                        header = header.decode(encoding)
                    except ValueError:
                        pass
                # Only non-root files can be marked binary here.
                self.is_binary = level > 0 and not bool(self.HTML_PAT.search(header))
                if not self.is_binary:
                    # Read the remainder only for files we will parse.
                    src += f.read()
        except IOError as err:
            msg = 'Could not read from file: %s with error: %s'%(self.path, as_unicode(err))
            if level == 0:
                raise IOError(msg)
            raise IgnoreFile(msg, err.errno)

        if not src:
            if level == 0:
                raise ValueError('The file %s is empty'%self.path)
            self.is_binary = True

        if not self.is_binary:
            if not encoding:
                encoding = detect_xml_encoding(src[:4096], verbose=verbose)[1]
                self.encoding = encoding
            else:
                self.encoding = encoding

            src = src.decode(encoding, 'replace')
            match = self.TITLE_PAT.search(src)
            self.title = match.group(1) if match is not None else self.title
            self.find_links(src)
Ejemplo n.º 9
0
    def process_images(self, soup, baseurl):
        '''
        Download every <img> referenced by ``soup`` and rewrite the tags to
        point at local copies saved under ``<current_dir>/images``, using the
        handlers in ``self._image_formats`` to pick the output format.

        NOTE(review): Python 2 era code -- ``has_key``, ``unicode`` and the
        ``urlparse`` module are gone/renamed in Python 3.

        :param soup: Parsed HTML whose <img> tags are processed.
        :param baseurl: URL used to resolve scheme-less image references.
        '''
        diskpath = unicode_path(os.path.join(self.current_dir, 'images'))
        if not os.path.exists(diskpath):
            os.mkdir(diskpath)
        c = 0
        for tag in soup.findAll(
                lambda tag: tag.name.lower() == 'img' and tag.has_key('src')):
            iurl = tag['src']
            if callable(self.image_url_processor):
                iurl = self.image_url_processor(baseurl, iurl)
            if not urlparse.urlsplit(iurl).scheme:
                iurl = urlparse.urljoin(baseurl, iurl, False)
            # Cache check under the lock; continue releases it.
            with self.imagemap_lock:
                if self.imagemap.has_key(iurl):
                    tag['src'] = self.imagemap[iurl]
                    continue

            #==== Changes begin here ====
            try:
                data = self.fetch_url(iurl)
            except Exception:
                self.log.exception('Could not fetch image ', iurl)
                continue

            c += 1
            fname = ascii_filename('img' + str(c))
            # Hm. Does ascii_filename return unicode names? Not touching.
            if isinstance(fname, unicode):
                fname = fname.encode('ascii', 'replace')

            # NOTE(review): _image_formats appears to be an ordered list of
            # handler objects exposing magic(), save() and extension --
            # confirm against the class definition.
            for image_format in self._image_formats:
                # Use the last format as a fallback
                if image_format.magic(
                        data) or image_format == self._image_formats[-1]:
                    imgpath = os.path.join(
                        diskpath, fname + "." + image_format.extension)
                    try:
                        with self.imagemap_lock:
                            self.imagemap[iurl] = imgpath
                        # save() returning falsy aborts without rewriting src.
                        if not image_format.save(imgpath, data):
                            break
                    except:
                        traceback.print_exc()
                        break

                    tag['src'] = imgpath
                    break
Ejemplo n.º 10
0
 def process_images(self, soup, baseurl):
     '''
     Download (or decode, for data: URIs) every <img> referenced by ``soup``,
     re-encode it to JPEG via PIL and rewrite the tags to point at local
     copies saved under ``<current_dir>/images``.

     NOTE(review): Python 2 era code -- ``has_key``, ``unicode``,
     ``urlparse`` module and bare ``except:`` are gone/discouraged in
     Python 3.

     :param soup: Parsed HTML whose <img> tags are processed.
     :param baseurl: URL used to resolve scheme-less image references.
     '''
     diskpath = unicode_path(os.path.join(self.current_dir, 'images'))
     if not os.path.exists(diskpath):
         os.mkdir(diskpath)
     c = 0
     for tag in soup.findAll(
             lambda tag: tag.name.lower() == 'img' and tag.has_key('src')):
         iurl = tag['src']
         if iurl.startswith('data:image/'):
             # Inline base64 image: decode the payload after the comma.
             try:
                 data = b64decode(iurl.partition(',')[-1])
             except:
                 self.log.exception('Failed to decode embedded image')
                 continue
         else:
             if callable(self.image_url_processor):
                 iurl = self.image_url_processor(baseurl, iurl)
             if not urlparse.urlsplit(iurl).scheme:
                 iurl = urlparse.urljoin(baseurl, iurl, False)
             # Cache check under the lock; continue releases it.
             with self.imagemap_lock:
                 if self.imagemap.has_key(iurl):
                     tag['src'] = self.imagemap[iurl]
                     continue
             try:
                 data = self.fetch_url(iurl)
                 if data == 'GIF89a\x01':
                     # Skip empty GIF files as PIL errors on them anyway
                     continue
             except Exception:
                 self.log.exception('Could not fetch image ', iurl)
                 continue
         c += 1
         fname = ascii_filename('img' + str(c))
         if isinstance(fname, unicode):
             fname = fname.encode('ascii', 'replace')
         imgpath = os.path.join(diskpath, fname + '.jpg')
         # Everything is re-encoded to JPEG regardless of source format.
         try:
             im = Image.open(StringIO(data)).convert('RGBA')
             with self.imagemap_lock:
                 self.imagemap[iurl] = imgpath
             with open(imgpath, 'wb') as x:
                 im.save(x, 'JPEG')
             tag['src'] = imgpath
         except:
             traceback.print_exc()
             continue
Ejemplo n.º 11
0
    def process_images(self, soup, baseurl):
        '''
        Download every <img> referenced by ``soup`` and rewrite the tags to
        point at local copies saved under ``<current_dir>/images``, using the
        handlers in ``self._image_formats`` to pick the output format.

        NOTE(review): Python 2 era code -- ``has_key``, ``unicode`` and the
        ``urlparse`` module are gone/renamed in Python 3.

        :param soup: Parsed HTML whose <img> tags are processed.
        :param baseurl: URL used to resolve scheme-less image references.
        '''
        diskpath = unicode_path(os.path.join(self.current_dir, 'images'))
        if not os.path.exists(diskpath):
            os.mkdir(diskpath)
        c = 0
        for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
            iurl = tag['src']
            if callable(self.image_url_processor):
                iurl = self.image_url_processor(baseurl, iurl)
            if not urlparse.urlsplit(iurl).scheme:
                iurl = urlparse.urljoin(baseurl, iurl, False)
            # Cache check under the lock; continue releases it.
            with self.imagemap_lock:
                if self.imagemap.has_key(iurl):
                    tag['src'] = self.imagemap[iurl]
                    continue

            #==== Changes begin here ====
            try:
                data = self.fetch_url(iurl)
            except Exception:
                self.log.exception('Could not fetch image ', iurl)
                continue

            c += 1
            fname = ascii_filename('img'+str(c))
            # Hm. Does ascii_filename return unicode names? Not touching.
            if isinstance(fname, unicode):
                fname = fname.encode('ascii', 'replace')

            # NOTE(review): _image_formats appears to be an ordered list of
            # handler objects exposing magic(), save() and extension --
            # confirm against the class definition.
            for image_format in self._image_formats:
                # Use the last format as a fallback
                if image_format.magic(data) or image_format == self._image_formats[-1]:
                    imgpath = os.path.join(diskpath, fname + "." + image_format.extension)
                    try:
                        with self.imagemap_lock:
                            self.imagemap[iurl] = imgpath
                        # save() returning falsy aborts without rewriting src.
                        if not image_format.save(imgpath, data):
                            break
                    except:
                        traceback.print_exc()
                        break

                    tag['src'] = imgpath
                    break
Ejemplo n.º 12
0
def save_soup(soup, target):
    '''Serialize soup to target as UTF-8 HTML: drop any charset-declaring
    meta tags, insert a fresh <meta charset="utf-8"> at the top of <head>,
    and relativize absolute local paths in img/link/a tags.'''
    for candidate in soup.findAll('meta', content=True):
        if 'charset' in candidate['content'].lower():
            candidate.extract()
    for candidate in soup.findAll('meta', charset=True):
        candidate.extract()
    head = soup.find('head')
    if head is not None:
        head.insert(0, soup.new_tag('meta', charset='utf-8'))

    base_dir = os.path.dirname(target)

    for element in soup.findAll(['img', 'link', 'a']):
        for attr in ('src', 'href'):
            value = element.get(attr, None)
            if not value:
                continue
            if os.path.isfile(value) and os.path.exists(value) and os.path.isabs(value):
                element[attr] = unicode_path(relpath(value, base_dir).replace(os.sep, '/'))

    with open(target, 'wb') as out:
        out.write(str(soup).encode('utf-8'))
Ejemplo n.º 13
0
def save_soup(soup, target):
    '''Serialize soup to target as UTF-8 HTML: drop any charset-declaring
    meta tags, insert a fresh <meta charset="utf-8"> at the top of <head>,
    and relativize absolute local paths in img/link/a tags.'''
    for stale in soup.findAll('meta', content=True):
        if 'charset' in stale['content'].lower():
            stale.extract()
    for stale in soup.findAll('meta', charset=True):
        stale.extract()
    head = soup.find('head')
    if head is not None:
        head.insert(0, soup.new_tag('meta', charset='utf-8'))

    out_dir = os.path.dirname(target)

    for node in soup.findAll(['img', 'link', 'a']):
        for attr in ('src', 'href'):
            location = node.get(attr, None)
            if not location:
                continue
            if os.path.isfile(location) and os.path.exists(location) and os.path.isabs(location):
                node[attr] = unicode_path(relpath(location, out_dir).replace(os.sep, '/'))

    serialized = unicode_type(soup)
    with open(target, 'wb') as handle:
        handle.write(serialized.encode('utf-8'))
Ejemplo n.º 14
0
 def process_images(self, soup, baseurl):
     '''
     Download (or decode, for data: URIs) every <img> referenced by ``soup``
     and rewrite the tags to point at local copies saved under
     ``<current_dir>/images``. SVG is saved verbatim; raster formats are
     normalized to png/jpg and optionally recompressed.

     :param soup: Parsed HTML whose <img> tags are processed.
     :param baseurl: URL used to resolve scheme-less image references.
     '''
     diskpath = unicode_path(os.path.join(self.current_dir, 'images'))
     if not os.path.exists(diskpath):
         os.mkdir(diskpath)
     c = 0
     for tag in soup.findAll(lambda tag: tag.name.lower() == 'img' and tag.
                             has_key('src')):  # noqa
         iurl = tag['src']
         if iurl.startswith('data:image/'):
             # Inline base64 image: decode the payload after the comma.
             try:
                 data = b64decode(iurl.partition(',')[-1])
             except:
                 self.log.exception('Failed to decode embedded image')
                 continue
         else:
             if callable(self.image_url_processor):
                 iurl = self.image_url_processor(baseurl, iurl)
             if not urlsplit(iurl).scheme:
                 iurl = urljoin(baseurl, iurl, False)
             # Cache check under the lock; continue releases it.
             with self.imagemap_lock:
                 if self.imagemap.has_key(iurl):  # noqa
                     tag['src'] = self.imagemap[iurl]
                     continue
             try:
                 data = self.fetch_url(iurl)
                 if data == 'GIF89a\x01':
                     # Skip empty GIF files as PIL errors on them anyway
                     continue
             except Exception:
                 self.log.exception('Could not fetch image ', iurl)
                 continue
         c += 1
         fname = ascii_filename('img' + str(c))
         if isinstance(fname, unicode_type):
             fname = fname.encode('ascii', 'replace')
         # Optional hook lets subclasses transform or veto the image bytes.
         data = self.preprocess_image_ext(
             data, iurl) if self.preprocess_image_ext is not None else data
         if data is None:
             continue
         itype = what(None, data)
         if itype == 'svg' or (itype is None and b'<svg' in data[:1024]):
             # SVG image
             imgpath = os.path.join(diskpath, fname + '.svg')
             with self.imagemap_lock:
                 self.imagemap[iurl] = imgpath
             with open(imgpath, 'wb') as x:
                 x.write(data)
             tag['src'] = imgpath
         else:
             try:
                 # Ensure image is valid
                 img = image_from_data(data)
                 # Normalize exotic formats: gif -> png, everything else
                 # unrecognized -> jpeg.
                 if itype not in {'png', 'jpg', 'jpeg'}:
                     itype = 'png' if itype == 'gif' else 'jpeg'
                     data = image_to_data(img, fmt=itype)
                 if self.compress_news_images and itype in {'jpg', 'jpeg'}:
                     try:
                         data = self.rescale_image(data)
                     except Exception:
                         self.log.exception('failed to compress image ' +
                                            iurl)
                 # Moon+ apparently cannot handle .jpeg files
                 if itype == 'jpeg':
                     itype = 'jpg'
                 imgpath = os.path.join(diskpath, fname + '.' + itype)
                 with self.imagemap_lock:
                     self.imagemap[iurl] = imgpath
                 with open(imgpath, 'wb') as x:
                     x.write(data)
                 tag['src'] = imgpath
             except Exception:
                 traceback.print_exc()
                 continue
Ejemplo n.º 15
0
 def process_images(self, soup, baseurl):
     '''
     Download (or decode, for data: URIs) every <img> referenced by ``soup``
     and rewrite the tags to point at local copies saved under
     ``<current_dir>/images``. SVG is saved verbatim; raster formats are
     normalized to png/jpg and optionally recompressed.

     NOTE(review): Python 2 era code -- ``has_key``, ``unicode`` and the
     ``urlparse`` module are gone/renamed in Python 3.

     :param soup: Parsed HTML whose <img> tags are processed.
     :param baseurl: URL used to resolve scheme-less image references.
     '''
     diskpath = unicode_path(os.path.join(self.current_dir, 'images'))
     if not os.path.exists(diskpath):
         os.mkdir(diskpath)
     c = 0
     for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
         iurl = tag['src']
         if iurl.startswith('data:image/'):
             # Inline base64 image: decode the payload after the comma.
             try:
                 data = b64decode(iurl.partition(',')[-1])
             except:
                 self.log.exception('Failed to decode embedded image')
                 continue
         else:
             if callable(self.image_url_processor):
                 iurl = self.image_url_processor(baseurl, iurl)
             if not urlparse.urlsplit(iurl).scheme:
                 iurl = urlparse.urljoin(baseurl, iurl, False)
             # Cache check under the lock; continue releases it.
             with self.imagemap_lock:
                 if self.imagemap.has_key(iurl):
                     tag['src'] = self.imagemap[iurl]
                     continue
             try:
                 data = self.fetch_url(iurl)
                 if data == 'GIF89a\x01':
                     # Skip empty GIF files as PIL errors on them anyway
                     continue
             except Exception:
                 self.log.exception('Could not fetch image ', iurl)
                 continue
         c += 1
         fname = ascii_filename('img'+str(c))
         if isinstance(fname, unicode):
             fname = fname.encode('ascii', 'replace')
         itype = what(None, data)
         if itype is None and b'<svg' in data[:1024]:
             # SVG image
             imgpath = os.path.join(diskpath, fname+'.svg')
             with self.imagemap_lock:
                 self.imagemap[iurl] = imgpath
             with open(imgpath, 'wb') as x:
                 x.write(data)
             tag['src'] = imgpath
         else:
             try:
                 # Ensure image is valid
                 img = image_from_data(data)
                 # Normalize exotic formats: gif -> png, everything else
                 # unrecognized -> jpeg.
                 if itype not in {'png', 'jpg', 'jpeg'}:
                     itype = 'png' if itype == 'gif' else 'jpeg'
                     data = image_to_data(img, fmt=itype)
                 if self.compress_news_images and itype in {'jpg','jpeg'}:
                     try:
                         data = self.rescale_image(data)
                     except Exception:
                         self.log.exception('failed to compress image '+iurl)
                 # Moon+ apparently cannot handle .jpeg files
                 if itype == 'jpeg':
                     itype = 'jpg'
                 imgpath = os.path.join(diskpath, fname+'.'+itype)
                 with self.imagemap_lock:
                     self.imagemap[iurl] = imgpath
                 with open(imgpath, 'wb') as x:
                     x.write(data)
                 tag['src'] = imgpath
             except Exception:
                 traceback.print_exc()
                 continue
Ejemplo n.º 16
0
 def process_images(self, soup, baseurl):
     '''
     Download (or decode, for data: URIs) every <img> referenced by ``soup``
     and rewrite the tags to point at local copies saved under
     ``<current_dir>/images``. SVG is saved verbatim; raster formats are
     normalized to png/jpg and optionally recompressed.

     :param soup: Parsed HTML whose <img> tags are processed.
     :param baseurl: URL used to resolve scheme-less image references.
     '''
     diskpath = unicode_path(os.path.join(self.current_dir, 'images'))
     if not os.path.exists(diskpath):
         os.mkdir(diskpath)
     c = 0
     for tag in soup.findAll('img', src=True):
         iurl = tag['src']
         if iurl.startswith('data:'):
             # Inline data: URI -- urlopen handles the base64 decoding.
             try:
                 data = urlopen(iurl).read()
             except Exception:
                 self.log.exception('Failed to decode embedded image')
                 continue
         else:
             if callable(self.image_url_processor):
                 iurl = self.image_url_processor(baseurl, iurl)
             if not urlsplit(iurl).scheme:
                 iurl = urljoin(baseurl, iurl, False)
             found_in_cache = False
             # Hold the lock only for the cache lookup; the continue happens
             # outside so the lock is never held across network fetches.
             with self.imagemap_lock:
                 if iurl in self.imagemap:
                     tag['src'] = self.imagemap[iurl]
                     found_in_cache = True
             if found_in_cache:
                 continue
             try:
                 data = self.fetch_url(iurl)
                 if data == b'GIF89a\x01':
                     # Skip empty GIF files as PIL errors on them anyway
                     continue
             except Exception:
                 self.log.exception('Could not fetch image ', iurl)
                 continue
         c += 1
         fname = ascii_filename('img' + str(c))
         # Optional hook lets subclasses transform or veto the image bytes.
         data = self.preprocess_image_ext(
             data, iurl) if self.preprocess_image_ext is not None else data
         if data is None:
             continue
         itype = what(None, data)
         if itype == 'svg' or (itype is None and b'<svg' in data[:1024]):
             # SVG image
             imgpath = os.path.join(diskpath, fname + '.svg')
             with self.imagemap_lock:
                 self.imagemap[iurl] = imgpath
             with open(imgpath, 'wb') as x:
                 x.write(data)
             tag['src'] = imgpath
         else:
             from calibre.utils.img import image_from_data, image_to_data
             try:
                 # Ensure image is valid
                 img = image_from_data(data)
                 # Normalize exotic formats: gif -> png, everything else
                 # unrecognized -> jpeg.
                 if itype not in {'png', 'jpg', 'jpeg'}:
                     itype = 'png' if itype == 'gif' else 'jpeg'
                     data = image_to_data(img, fmt=itype)
                 if self.compress_news_images and itype in {'jpg', 'jpeg'}:
                     try:
                         data = self.rescale_image(data)
                     except Exception:
                         self.log.exception('failed to compress image ' +
                                            iurl)
                 # Moon+ apparently cannot handle .jpeg files
                 if itype == 'jpeg':
                     itype = 'jpg'
                 imgpath = os.path.join(diskpath, fname + '.' + itype)
                 with self.imagemap_lock:
                     self.imagemap[iurl] = imgpath
                 with open(imgpath, 'wb') as x:
                     x.write(data)
                 tag['src'] = imgpath
             except Exception:
                 traceback.print_exc()
                 continue