def save_soup(soup, target):
    ns = BeautifulSoup('<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />')
    nm = ns.find('meta')
    metas = soup.findAll('meta', content=True)
    added = False
    for meta in metas:
        if 'charset' in meta.get('content', '').lower():
            meta.replaceWith(nm)
            added = True
    if not added:
        head = soup.find('head')
        if head is not None:
            head.insert(0, nm)

    selfdir = os.path.dirname(target)

    for tag in soup.findAll(['img', 'link', 'a']):
        for key in ('src', 'href'):
            path = tag.get(key, None)
            if path and os.path.isfile(path) and os.path.exists(path) and os.path.isabs(path):
                tag[key] = unicode_path(relpath(path, selfdir).replace(os.sep, '/'))

    html = unicode_type(soup)
    with open(target, 'wb') as f:
        f.write(html.encode('utf-8'))
def save_soup(soup, target):
    ns = BeautifulSoup('<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />')
    nm = ns.find('meta')
    metas = soup.findAll('meta', content=True)
    added = False
    for meta in metas:
        if 'charset' in meta.get('content', '').lower():
            meta.replaceWith(nm)
            added = True
    if not added:
        head = soup.find('head')
        if head is not None:
            head.insert(0, nm)

    selfdir = os.path.dirname(target)

    for tag in soup.findAll(['img', 'link', 'a']):
        for key in ('src', 'href'):
            path = tag.get(key, None)
            if path and os.path.isfile(path) and os.path.exists(path) and os.path.isabs(path):
                tag[key] = unicode_path(relpath(path, selfdir).replace(os.sep, '/'))

    html = unicode(soup)
    with open(target, 'wb') as f:
        f.write(html.encode('utf-8'))
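# --- Illustration (not part of the original source) ---
# Both save_soup variants above rewrite absolute local file paths found in
# src/href attributes so they are relative to the saved file, with '/'
# separators regardless of OS. A minimal standalone sketch of that step,
# using plain os.path instead of calibre's relpath/unicode_path helpers
# (the real code also checks os.path.isfile before rewriting):
import os

target = '/downloads/news/index.html'
path = '/downloads/news/images/img1.jpg'
if os.path.isabs(path):
    print(os.path.relpath(path, os.path.dirname(target)).replace(os.sep, '/'))
    # -> images/img1.jpg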
def process_stylesheets(self, soup, baseurl):
    diskpath = unicode_path(os.path.join(self.current_dir, 'stylesheets'))
    if not os.path.exists(diskpath):
        os.mkdir(diskpath)
    for c, tag in enumerate(soup.findAll(name=['link', 'style'])):
        try:
            mtype = tag['type']
        except KeyError:
            mtype = 'text/css' if tag.name.lower() == 'style' else ''
        if mtype.lower() != 'text/css':
            continue
        if tag.has_attr('href'):
            iurl = tag['href']
            if not urlsplit(iurl).scheme:
                iurl = urljoin(baseurl, iurl, False)
            found_cached = False
            with self.stylemap_lock:
                if iurl in self.stylemap:
                    tag['href'] = self.stylemap[iurl]
                    found_cached = True
            if found_cached:
                continue
            try:
                data = self.fetch_url(iurl)
            except Exception:
                self.log.exception('Could not fetch stylesheet ', iurl)
                continue
            stylepath = os.path.join(diskpath, 'style'+str(c)+'.css')
            with self.stylemap_lock:
                self.stylemap[iurl] = stylepath
            with open(stylepath, 'wb') as x:
                x.write(data)
            tag['href'] = stylepath
        else:
            for ns in tag.findAll(text=True):
                src = str(ns)
                m = self.__class__.CSS_IMPORT_PATTERN.search(src)
                if m:
                    iurl = m.group(1)
                    if not urlsplit(iurl).scheme:
                        iurl = urljoin(baseurl, iurl, False)
                    found_cached = False
                    with self.stylemap_lock:
                        if iurl in self.stylemap:
                            ns.replaceWith(src.replace(m.group(1), self.stylemap[iurl]))
                            found_cached = True
                    if found_cached:
                        continue
                    try:
                        data = self.fetch_url(iurl)
                    except Exception:
                        self.log.exception('Could not fetch stylesheet ', iurl)
                        continue
                    c += 1
                    stylepath = os.path.join(diskpath, 'style'+str(c)+'.css')
                    with self.stylemap_lock:
                        self.stylemap[iurl] = stylepath
                    with open(stylepath, 'wb') as x:
                        x.write(data)
                    ns.replaceWith(src.replace(m.group(1), stylepath))
def __init__(self, path_to_html_file, level, encoding, verbose, referrer=None):
    '''
    :param level: The level of this file. Should be 0 for the root file.
    :param encoding: Use `encoding` to decode HTML.
    :param referrer: The :class:`HTMLFile` that first refers to this file.
    '''
    self.path = unicode_path(path_to_html_file, abs=True)
    self.title = os.path.splitext(os.path.basename(self.path))[0]
    self.base = os.path.dirname(self.path)
    self.level = level
    self.referrer = referrer
    self.links = []

    try:
        with open(self.path, 'rb') as f:
            src = header = f.read(4096)
            encoding = detect_xml_encoding(src)[1]
            if encoding:
                try:
                    header = header.decode(encoding, errors='replace')
                except ValueError:
                    pass
            self.is_binary = False
            if level > 0:
                pat = self.HTML_PAT_BIN if isinstance(header, bytes) else self.HTML_PAT
                self.is_binary = not bool(pat.search(header))
            if not self.is_binary:
                src += f.read()
    except OSError as err:
        msg = 'Could not read from file: %s with error: %s' % (self.path, as_unicode(err))
        if level == 0:
            raise OSError(msg)
        raise IgnoreFile(msg, err.errno)

    if not src:
        if level == 0:
            raise ValueError('The file %s is empty' % self.path)
        self.is_binary = True

    if not self.is_binary:
        if not encoding:
            encoding = detect_xml_encoding(src[:4096], verbose=verbose)[1]
            self.encoding = encoding
        else:
            self.encoding = encoding

        src = src.decode(encoding, 'replace')
        match = self.TITLE_PAT.search(src)
        self.title = match.group(1) if match is not None else self.title
        self.find_links(src)
def process_stylesheets(self, soup, baseurl):
    diskpath = unicode_path(os.path.join(self.current_dir, 'stylesheets'))
    if not os.path.exists(diskpath):
        os.mkdir(diskpath)
    for c, tag in enumerate(soup.findAll(
            lambda tag: tag.name.lower() in ['link', 'style'] and
            tag.has_key('type') and tag['type'].lower() == 'text/css')):  # noqa
        if tag.has_key('href'):  # noqa
            iurl = tag['href']
            if not urlsplit(iurl).scheme:
                iurl = urljoin(baseurl, iurl, False)
            with self.stylemap_lock:
                if self.stylemap.has_key(iurl):  # noqa
                    tag['href'] = self.stylemap[iurl]
                    continue
            try:
                data = self.fetch_url(iurl)
            except Exception:
                self.log.exception('Could not fetch stylesheet ', iurl)
                continue
            stylepath = os.path.join(diskpath, 'style' + str(c) + '.css')
            with self.stylemap_lock:
                self.stylemap[iurl] = stylepath
            with open(stylepath, 'wb') as x:
                x.write(data)
            tag['href'] = stylepath
        else:
            for ns in tag.findAll(text=True):
                src = str(ns)
                m = self.__class__.CSS_IMPORT_PATTERN.search(src)
                if m:
                    iurl = m.group(1)
                    if not urlsplit(iurl).scheme:
                        iurl = urljoin(baseurl, iurl, False)
                    with self.stylemap_lock:
                        if self.stylemap.has_key(iurl):  # noqa
                            ns.replaceWith(src.replace(m.group(1), self.stylemap[iurl]))
                            continue
                    try:
                        data = self.fetch_url(iurl)
                    except Exception:
                        self.log.exception('Could not fetch stylesheet ', iurl)
                        continue
                    c += 1
                    stylepath = os.path.join(diskpath, 'style' + str(c) + '.css')
                    with self.stylemap_lock:
                        self.stylemap[iurl] = stylepath
                    with open(stylepath, 'wb') as x:
                        x.write(data)
                    ns.replaceWith(src.replace(m.group(1), stylepath))
def process_stylesheets(self, soup, baseurl):
    diskpath = unicode_path(os.path.join(self.current_dir, 'stylesheets'))
    if not os.path.exists(diskpath):
        os.mkdir(diskpath)
    for c, tag in enumerate(soup.findAll(name=['link', 'style'])):
        try:
            mtype = tag['type']
        except KeyError:
            mtype = 'text/css' if tag.name.lower() == 'style' else ''
        if mtype.lower() != 'text/css':
            continue
        if tag.has_attr('href'):
            iurl = tag['href']
            if not urlsplit(iurl).scheme:
                iurl = urljoin(baseurl, iurl, False)
            with self.stylemap_lock:
                if iurl in self.stylemap:
                    tag['href'] = self.stylemap[iurl]
                    continue
            try:
                data = self.fetch_url(iurl)
            except Exception:
                self.log.exception('Could not fetch stylesheet ', iurl)
                continue
            stylepath = os.path.join(diskpath, 'style'+str(c)+'.css')
            with self.stylemap_lock:
                self.stylemap[iurl] = stylepath
            with open(stylepath, 'wb') as x:
                x.write(data)
            tag['href'] = stylepath
        else:
            for ns in tag.findAll(text=True):
                src = str(ns)
                m = self.__class__.CSS_IMPORT_PATTERN.search(src)
                if m:
                    iurl = m.group(1)
                    if not urlsplit(iurl).scheme:
                        iurl = urljoin(baseurl, iurl, False)
                    with self.stylemap_lock:
                        if iurl in self.stylemap:
                            ns.replaceWith(src.replace(m.group(1), self.stylemap[iurl]))
                            continue
                    try:
                        data = self.fetch_url(iurl)
                    except Exception:
                        self.log.exception('Could not fetch stylesheet ', iurl)
                        continue
                    c += 1
                    stylepath = os.path.join(diskpath, 'style'+str(c)+'.css')
                    with self.stylemap_lock:
                        self.stylemap[iurl] = stylepath
                    with open(stylepath, 'wb') as x:
                        x.write(data)
                    ns.replaceWith(src.replace(m.group(1), stylepath))
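# --- Illustration (not part of the original source) ---
# The process_stylesheets variants above all rely on a class-level
# CSS_IMPORT_PATTERN that this section never shows. A plausible definition
# (an assumption, not the real module's) and the string surgery it drives:
import re

CSS_IMPORT_PATTERN = re.compile(r'@import\s+url\((.*?)\)', re.IGNORECASE)

src = '@import url(http://example.com/site.css);'
m = CSS_IMPORT_PATTERN.search(src)
if m:
    # Same replacement used by ns.replaceWith(src.replace(m.group(1), stylepath))
    print(src.replace(m.group(1), 'stylesheets/style1.css'))
    # -> @import url(stylesheets/style1.css);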
def process_images(self, soup, baseurl):
    diskpath = unicode_path(os.path.join(self.current_dir, 'images'))
    if not os.path.exists(diskpath):
        os.mkdir(diskpath)
    c = 0
    for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
        iurl = tag['src']
        if iurl.startswith('data:image/'):
            try:
                data = b64decode(iurl.partition(',')[-1])
            except:
                self.log.exception('Failed to decode embedded image')
                continue
        else:
            if callable(self.image_url_processor):
                iurl = self.image_url_processor(baseurl, iurl)
            if not urlparse.urlsplit(iurl).scheme:
                iurl = urlparse.urljoin(baseurl, iurl, False)
            with self.imagemap_lock:
                if self.imagemap.has_key(iurl):
                    tag['src'] = self.imagemap[iurl]
                    continue
            try:
                data = self.fetch_url(iurl)
                if data == 'GIF89a\x01':
                    # Skip empty GIF files as PIL errors on them anyway
                    continue
            except Exception:
                self.log.exception('Could not fetch image ', iurl)
                continue
        c += 1
        fname = ascii_filename('img'+str(c))
        if isinstance(fname, unicode):
            fname = fname.encode('ascii', 'replace')
        imgpath = os.path.join(diskpath, fname+'.jpg')
        if imghdr.what(None, data) is None and b'<svg' in data[:1024]:
            # SVG image
            imgpath = os.path.join(diskpath, fname+'.svg')
            with self.imagemap_lock:
                self.imagemap[iurl] = imgpath
            with open(imgpath, 'wb') as x:
                x.write(data)
            tag['src'] = imgpath
        else:
            try:
                im = Image.open(StringIO(data)).convert('RGBA')
                with self.imagemap_lock:
                    self.imagemap[iurl] = imgpath
                with open(imgpath, 'wb') as x:
                    im.save(x, 'JPEG')
                tag['src'] = imgpath
            except:
                traceback.print_exc()
                continue
def __init__(self, path_to_html_file, level, encoding, verbose, referrer=None):
    '''
    :param level: The level of this file. Should be 0 for the root file.
    :param encoding: Use `encoding` to decode HTML.
    :param referrer: The :class:`HTMLFile` that first refers to this file.
    '''
    self.path = unicode_path(path_to_html_file, abs=True)
    self.title = os.path.splitext(os.path.basename(self.path))[0]
    self.base = os.path.dirname(self.path)
    self.level = level
    self.referrer = referrer
    self.links = []

    try:
        with open(self.path, 'rb') as f:
            src = header = f.read(4096)
            encoding = detect_xml_encoding(src)[1]
            if encoding:
                try:
                    header = header.decode(encoding)
                except ValueError:
                    pass
            self.is_binary = level > 0 and not bool(self.HTML_PAT.search(header))
            if not self.is_binary:
                src += f.read()
    except IOError as err:
        msg = 'Could not read from file: %s with error: %s' % (self.path, as_unicode(err))
        if level == 0:
            raise IOError(msg)
        raise IgnoreFile(msg, err.errno)

    if not src:
        if level == 0:
            raise ValueError('The file %s is empty' % self.path)
        self.is_binary = True

    if not self.is_binary:
        if not encoding:
            encoding = detect_xml_encoding(src[:4096], verbose=verbose)[1]
            self.encoding = encoding
        else:
            self.encoding = encoding

        src = src.decode(encoding, 'replace')
        match = self.TITLE_PAT.search(src)
        self.title = match.group(1) if match is not None else self.title
        self.find_links(src)
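# --- Illustration (not part of the original source) ---
# HTML_PAT / HTML_PAT_BIN are class attributes not included in this
# section. A hypothetical shape, consistent with how the two __init__
# variants apply them to a decoded (or still-bytes) 4KB header:
import re

HTML_PAT = re.compile(r'<\s*html', re.IGNORECASE)
HTML_PAT_BIN = re.compile(br'<\s*html', re.IGNORECASE)

header = b'<?xml version="1.0"?>\n<html xmlns="http://www.w3.org/1999/xhtml">'
pat = HTML_PAT_BIN if isinstance(header, bytes) else HTML_PAT
print(not bool(pat.search(header)))  # False -> looks like HTML, not binary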
def process_images(self, soup, baseurl):
    diskpath = unicode_path(os.path.join(self.current_dir, 'images'))
    if not os.path.exists(diskpath):
        os.mkdir(diskpath)
    c = 0
    for tag in soup.findAll(lambda tag: tag.name.lower() == 'img' and tag.has_key('src')):
        iurl = tag['src']
        if callable(self.image_url_processor):
            iurl = self.image_url_processor(baseurl, iurl)
        if not urlparse.urlsplit(iurl).scheme:
            iurl = urlparse.urljoin(baseurl, iurl, False)
        with self.imagemap_lock:
            if self.imagemap.has_key(iurl):
                tag['src'] = self.imagemap[iurl]
                continue
        # ==== Changes begin here ====
        try:
            data = self.fetch_url(iurl)
        except Exception:
            self.log.exception('Could not fetch image ', iurl)
            continue
        c += 1
        fname = ascii_filename('img' + str(c))
        # Hm. Does ascii_filename return unicode names? Not touching.
        if isinstance(fname, unicode):
            fname = fname.encode('ascii', 'replace')
        for image_format in self._image_formats:
            # Use the last format as a fallback
            if image_format.magic(data) or image_format == self._image_formats[-1]:
                imgpath = os.path.join(diskpath, fname + "." + image_format.extension)
                try:
                    with self.imagemap_lock:
                        self.imagemap[iurl] = imgpath
                    if not image_format.save(imgpath, data):
                        break
                except:
                    traceback.print_exc()
                    break
                tag['src'] = imgpath
                break
def process_images(self, soup, baseurl):
    diskpath = unicode_path(os.path.join(self.current_dir, 'images'))
    if not os.path.exists(diskpath):
        os.mkdir(diskpath)
    c = 0
    for tag in soup.findAll(lambda tag: tag.name.lower() == 'img' and tag.has_key('src')):
        iurl = tag['src']
        if iurl.startswith('data:image/'):
            try:
                data = b64decode(iurl.partition(',')[-1])
            except:
                self.log.exception('Failed to decode embedded image')
                continue
        else:
            if callable(self.image_url_processor):
                iurl = self.image_url_processor(baseurl, iurl)
            if not urlparse.urlsplit(iurl).scheme:
                iurl = urlparse.urljoin(baseurl, iurl, False)
            with self.imagemap_lock:
                if self.imagemap.has_key(iurl):
                    tag['src'] = self.imagemap[iurl]
                    continue
            try:
                data = self.fetch_url(iurl)
                if data == 'GIF89a\x01':
                    # Skip empty GIF files as PIL errors on them anyway
                    continue
            except Exception:
                self.log.exception('Could not fetch image ', iurl)
                continue
        c += 1
        fname = ascii_filename('img' + str(c))
        if isinstance(fname, unicode):
            fname = fname.encode('ascii', 'replace')
        imgpath = os.path.join(diskpath, fname + '.jpg')
        try:
            im = Image.open(StringIO(data)).convert('RGBA')
            with self.imagemap_lock:
                self.imagemap[iurl] = imgpath
            with open(imgpath, 'wb') as x:
                im.save(x, 'JPEG')
            tag['src'] = imgpath
        except:
            traceback.print_exc()
            continue
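# --- Illustration (not part of the original source) ---
# The PIL-era variants above funnel every raster image through JPEG via
# Image.open(...).convert('RGBA'). Current Pillow refuses to write RGBA
# as JPEG, so a modern sketch of that normalization step converts to RGB:
from io import BytesIO
from PIL import Image

def to_jpeg_bytes(data):
    # Decode whatever format was fetched, then re-encode as baseline JPEG.
    im = Image.open(BytesIO(data)).convert('RGB')
    out = BytesIO()
    im.save(out, 'JPEG')
    return out.getvalue()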
def process_images(self, soup, baseurl):
    diskpath = unicode_path(os.path.join(self.current_dir, 'images'))
    if not os.path.exists(diskpath):
        os.mkdir(diskpath)
    c = 0
    for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
        iurl = tag['src']
        if callable(self.image_url_processor):
            iurl = self.image_url_processor(baseurl, iurl)
        if not urlparse.urlsplit(iurl).scheme:
            iurl = urlparse.urljoin(baseurl, iurl, False)
        with self.imagemap_lock:
            if self.imagemap.has_key(iurl):
                tag['src'] = self.imagemap[iurl]
                continue
        # ==== Changes begin here ====
        try:
            data = self.fetch_url(iurl)
        except Exception:
            self.log.exception('Could not fetch image ', iurl)
            continue
        c += 1
        fname = ascii_filename('img'+str(c))
        # Hm. Does ascii_filename return unicode names? Not touching.
        if isinstance(fname, unicode):
            fname = fname.encode('ascii', 'replace')
        for image_format in self._image_formats:
            # Use the last format as a fallback
            if image_format.magic(data) or image_format == self._image_formats[-1]:
                imgpath = os.path.join(diskpath, fname + "." + image_format.extension)
                try:
                    with self.imagemap_lock:
                        self.imagemap[iurl] = imgpath
                    if not image_format.save(imgpath, data):
                        break
                except:
                    traceback.print_exc()
                    break
                tag['src'] = imgpath
                break
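# --- Illustration (not part of the original source) ---
# The two variants above walk self._image_formats, a list of handler
# objects exposing magic(), extension and save(); the last entry doubles
# as a catch-all fallback. A minimal hypothetical handler showing the
# interface the loop expects (PNGFormat is an assumed name):
class PNGFormat:
    extension = 'png'

    def magic(self, data):
        # PNG files open with a fixed 8-byte signature.
        return data[:8] == b'\x89PNG\r\n\x1a\n'

    def save(self, imgpath, data):
        # Return a truthy value so the caller keeps the rewritten src.
        with open(imgpath, 'wb') as f:
            f.write(data)
        return True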
def save_soup(soup, target):
    for meta in soup.findAll('meta', content=True):
        if 'charset' in meta['content'].lower():
            meta.extract()
    for meta in soup.findAll('meta', charset=True):
        meta.extract()
    head = soup.find('head')
    if head is not None:
        nm = soup.new_tag('meta', charset='utf-8')
        head.insert(0, nm)

    selfdir = os.path.dirname(target)

    for tag in soup.findAll(['img', 'link', 'a']):
        for key in ('src', 'href'):
            path = tag.get(key, None)
            if path and os.path.isfile(path) and os.path.exists(path) and os.path.isabs(path):
                tag[key] = unicode_path(relpath(path, selfdir).replace(os.sep, '/'))

    html = str(soup)
    with open(target, 'wb') as f:
        f.write(html.encode('utf-8'))
def save_soup(soup, target):
    for meta in soup.findAll('meta', content=True):
        if 'charset' in meta['content'].lower():
            meta.extract()
    for meta in soup.findAll('meta', charset=True):
        meta.extract()
    head = soup.find('head')
    if head is not None:
        nm = soup.new_tag('meta', charset='utf-8')
        head.insert(0, nm)

    selfdir = os.path.dirname(target)

    for tag in soup.findAll(['img', 'link', 'a']):
        for key in ('src', 'href'):
            path = tag.get(key, None)
            if path and os.path.isfile(path) and os.path.exists(path) and os.path.isabs(path):
                tag[key] = unicode_path(relpath(path, selfdir).replace(os.sep, '/'))

    html = unicode_type(soup)
    with open(target, 'wb') as f:
        f.write(html.encode('utf-8'))
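# --- Illustration (not part of the original source) ---
# Usage sketch for the bs4-era save_soup above. soup.new_tag() is a
# BeautifulSoup 4 API, so these variants require bs4; the calibre helpers
# the function expects are stubbed here so the sketch runs standalone:
import os
from bs4 import BeautifulSoup

unicode_type = str            # stand-in for the polyglot unicode_type alias
relpath = os.path.relpath     # stand-in for calibre's relpath


def unicode_path(p, abs=False):  # stand-in for calibre's unicode_path
    return os.path.abspath(p) if abs else p


soup = BeautifulSoup('<html><head></head><body>ok</body></html>', 'html.parser')
save_soup(soup, os.path.abspath('index.html'))
# index.html now opens with <meta charset="utf-8"/> and holds UTF-8 bytes.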
def process_images(self, soup, baseurl):
    diskpath = unicode_path(os.path.join(self.current_dir, 'images'))
    if not os.path.exists(diskpath):
        os.mkdir(diskpath)
    c = 0
    for tag in soup.findAll(lambda tag: tag.name.lower() == 'img' and tag.has_key('src')):  # noqa
        iurl = tag['src']
        if iurl.startswith('data:image/'):
            try:
                data = b64decode(iurl.partition(',')[-1])
            except:
                self.log.exception('Failed to decode embedded image')
                continue
        else:
            if callable(self.image_url_processor):
                iurl = self.image_url_processor(baseurl, iurl)
            if not urlsplit(iurl).scheme:
                iurl = urljoin(baseurl, iurl, False)
            with self.imagemap_lock:
                if self.imagemap.has_key(iurl):  # noqa
                    tag['src'] = self.imagemap[iurl]
                    continue
            try:
                data = self.fetch_url(iurl)
                if data == 'GIF89a\x01':
                    # Skip empty GIF files as PIL errors on them anyway
                    continue
            except Exception:
                self.log.exception('Could not fetch image ', iurl)
                continue
        c += 1
        fname = ascii_filename('img' + str(c))
        if isinstance(fname, unicode_type):
            fname = fname.encode('ascii', 'replace')
        data = self.preprocess_image_ext(data, iurl) if self.preprocess_image_ext is not None else data
        if data is None:
            continue
        itype = what(None, data)
        if itype == 'svg' or (itype is None and b'<svg' in data[:1024]):
            # SVG image
            imgpath = os.path.join(diskpath, fname + '.svg')
            with self.imagemap_lock:
                self.imagemap[iurl] = imgpath
            with open(imgpath, 'wb') as x:
                x.write(data)
            tag['src'] = imgpath
        else:
            try:
                # Ensure image is valid
                img = image_from_data(data)
                if itype not in {'png', 'jpg', 'jpeg'}:
                    itype = 'png' if itype == 'gif' else 'jpeg'
                    data = image_to_data(img, fmt=itype)
                if self.compress_news_images and itype in {'jpg', 'jpeg'}:
                    try:
                        data = self.rescale_image(data)
                    except Exception:
                        self.log.exception('failed to compress image ' + iurl)
                # Moon+ apparently cannot handle .jpeg files
                if itype == 'jpeg':
                    itype = 'jpg'
                imgpath = os.path.join(diskpath, fname + '.' + itype)
                with self.imagemap_lock:
                    self.imagemap[iurl] = imgpath
                with open(imgpath, 'wb') as x:
                    x.write(data)
                tag['src'] = imgpath
            except Exception:
                traceback.print_exc()
                continue
def process_images(self, soup, baseurl):
    diskpath = unicode_path(os.path.join(self.current_dir, 'images'))
    if not os.path.exists(diskpath):
        os.mkdir(diskpath)
    c = 0
    for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
        iurl = tag['src']
        if iurl.startswith('data:image/'):
            try:
                data = b64decode(iurl.partition(',')[-1])
            except:
                self.log.exception('Failed to decode embedded image')
                continue
        else:
            if callable(self.image_url_processor):
                iurl = self.image_url_processor(baseurl, iurl)
            if not urlparse.urlsplit(iurl).scheme:
                iurl = urlparse.urljoin(baseurl, iurl, False)
            with self.imagemap_lock:
                if self.imagemap.has_key(iurl):
                    tag['src'] = self.imagemap[iurl]
                    continue
            try:
                data = self.fetch_url(iurl)
                if data == 'GIF89a\x01':
                    # Skip empty GIF files as PIL errors on them anyway
                    continue
            except Exception:
                self.log.exception('Could not fetch image ', iurl)
                continue
        c += 1
        fname = ascii_filename('img'+str(c))
        if isinstance(fname, unicode):
            fname = fname.encode('ascii', 'replace')
        itype = what(None, data)
        if itype is None and b'<svg' in data[:1024]:
            # SVG image
            imgpath = os.path.join(diskpath, fname+'.svg')
            with self.imagemap_lock:
                self.imagemap[iurl] = imgpath
            with open(imgpath, 'wb') as x:
                x.write(data)
            tag['src'] = imgpath
        else:
            try:
                # Ensure image is valid
                img = image_from_data(data)
                if itype not in {'png', 'jpg', 'jpeg'}:
                    itype = 'png' if itype == 'gif' else 'jpeg'
                    data = image_to_data(img, fmt=itype)
                if self.compress_news_images and itype in {'jpg','jpeg'}:
                    try:
                        data = self.rescale_image(data)
                    except Exception:
                        self.log.exception('failed to compress image '+iurl)
                # Moon+ apparently cannot handle .jpeg files
                if itype == 'jpeg':
                    itype = 'jpg'
                imgpath = os.path.join(diskpath, fname+'.'+itype)
                with self.imagemap_lock:
                    self.imagemap[iurl] = imgpath
                with open(imgpath, 'wb') as x:
                    x.write(data)
                tag['src'] = imgpath
            except Exception:
                traceback.print_exc()
                continue
def process_images(self, soup, baseurl):
    diskpath = unicode_path(os.path.join(self.current_dir, 'images'))
    if not os.path.exists(diskpath):
        os.mkdir(diskpath)
    c = 0
    for tag in soup.findAll('img', src=True):
        iurl = tag['src']
        if iurl.startswith('data:'):
            try:
                data = urlopen(iurl).read()
            except Exception:
                self.log.exception('Failed to decode embedded image')
                continue
        else:
            if callable(self.image_url_processor):
                iurl = self.image_url_processor(baseurl, iurl)
            if not urlsplit(iurl).scheme:
                iurl = urljoin(baseurl, iurl, False)
            found_in_cache = False
            with self.imagemap_lock:
                if iurl in self.imagemap:
                    tag['src'] = self.imagemap[iurl]
                    found_in_cache = True
            if found_in_cache:
                continue
            try:
                data = self.fetch_url(iurl)
                if data == b'GIF89a\x01':
                    # Skip empty GIF files as PIL errors on them anyway
                    continue
            except Exception:
                self.log.exception('Could not fetch image ', iurl)
                continue
        c += 1
        fname = ascii_filename('img' + str(c))
        data = self.preprocess_image_ext(data, iurl) if self.preprocess_image_ext is not None else data
        if data is None:
            continue
        itype = what(None, data)
        if itype == 'svg' or (itype is None and b'<svg' in data[:1024]):
            # SVG image
            imgpath = os.path.join(diskpath, fname + '.svg')
            with self.imagemap_lock:
                self.imagemap[iurl] = imgpath
            with open(imgpath, 'wb') as x:
                x.write(data)
            tag['src'] = imgpath
        else:
            from calibre.utils.img import image_from_data, image_to_data
            try:
                # Ensure image is valid
                img = image_from_data(data)
                if itype not in {'png', 'jpg', 'jpeg'}:
                    itype = 'png' if itype == 'gif' else 'jpeg'
                    data = image_to_data(img, fmt=itype)
                if self.compress_news_images and itype in {'jpg', 'jpeg'}:
                    try:
                        data = self.rescale_image(data)
                    except Exception:
                        self.log.exception('failed to compress image ' + iurl)
                # Moon+ apparently cannot handle .jpeg files
                if itype == 'jpeg':
                    itype = 'jpg'
                imgpath = os.path.join(diskpath, fname + '.' + itype)
                with self.imagemap_lock:
                    self.imagemap[iurl] = imgpath
                with open(imgpath, 'wb') as x:
                    x.write(data)
                tag['src'] = imgpath
            except Exception:
                traceback.print_exc()
                continue
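# --- Illustration (not part of the original source) ---
# The sniffing above pairs an imghdr-style what() with a raw scan for
# '<svg', since SVG is XML and carries no magic number. A standalone
# approximation using the stdlib imghdr (deprecated in recent Pythons;
# calibre ships its own what(), which may behave differently):
from imghdr import what


def sniff_image_type(data):
    itype = what(None, data)
    if itype == 'svg' or (itype is None and b'<svg' in data[:1024]):
        return 'svg'
    return itype or 'unknown'


print(sniff_image_type(b'\x89PNG\r\n\x1a\n' + b'\x00' * 16))        # -> png
print(sniff_image_type(b'<?xml version="1.0"?><svg xmlns="x">'))    # -> svg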