def process_stylesheets(self, soup, baseurl):
    diskpath = unicode_path(os.path.join(self.current_dir, 'stylesheets'))
    if not os.path.exists(diskpath):
        os.mkdir(diskpath)
    for c, tag in enumerate(soup.findAll(name=['link', 'style'])):
        try:
            mtype = tag['type']
        except KeyError:
            mtype = 'text/css' if tag.name.lower() == 'style' else ''
        if mtype.lower() != 'text/css':
            continue
        if tag.has_attr('href'):
            iurl = tag['href']
            if not urlsplit(iurl).scheme:
                iurl = urljoin(baseurl, iurl, False)
            found_cached = False
            with self.stylemap_lock:
                if iurl in self.stylemap:
                    tag['href'] = self.stylemap[iurl]
                    found_cached = True
            if found_cached:
                continue
            try:
                data = self.fetch_url(iurl)
            except Exception:
                self.log.exception('Could not fetch stylesheet ', iurl)
                continue
            stylepath = os.path.join(diskpath, 'style'+str(c)+'.css')
            with self.stylemap_lock:
                self.stylemap[iurl] = stylepath
            with open(stylepath, 'wb') as x:
                x.write(data)
            tag['href'] = stylepath
        else:
            for ns in tag.findAll(text=True):
                src = str(ns)
                m = self.__class__.CSS_IMPORT_PATTERN.search(src)
                if m:
                    iurl = m.group(1)
                    if not urlsplit(iurl).scheme:
                        iurl = urljoin(baseurl, iurl, False)
                    found_cached = False
                    with self.stylemap_lock:
                        if iurl in self.stylemap:
                            ns.replaceWith(src.replace(m.group(1), self.stylemap[iurl]))
                            found_cached = True
                    if found_cached:
                        continue
                    try:
                        data = self.fetch_url(iurl)
                    except Exception:
                        self.log.exception('Could not fetch stylesheet ', iurl)
                        continue
                    c += 1
                    stylepath = os.path.join(diskpath, 'style'+str(c)+'.css')
                    with self.stylemap_lock:
                        self.stylemap[iurl] = stylepath
                    with open(stylepath, 'wb') as x:
                        x.write(data)
                    ns.replaceWith(src.replace(m.group(1), stylepath))
def process_stylesheets(self, soup, baseurl):
    diskpath = unicode_path(os.path.join(self.current_dir, 'stylesheets'))
    if not os.path.exists(diskpath):
        os.mkdir(diskpath)
    for c, tag in enumerate(soup.findAll(lambda tag: tag.name.lower() in ['link', 'style'] and
                                         tag.has_key('type') and tag['type'].lower() == 'text/css')):  # noqa
        if tag.has_key('href'):  # noqa
            iurl = tag['href']
            if not urlsplit(iurl).scheme:
                iurl = urljoin(baseurl, iurl, False)
            with self.stylemap_lock:
                if self.stylemap.has_key(iurl):  # noqa
                    tag['href'] = self.stylemap[iurl]
                    continue
            try:
                data = self.fetch_url(iurl)
            except Exception:
                self.log.exception('Could not fetch stylesheet ', iurl)
                continue
            stylepath = os.path.join(diskpath, 'style' + str(c) + '.css')
            with self.stylemap_lock:
                self.stylemap[iurl] = stylepath
            with open(stylepath, 'wb') as x:
                x.write(data)
            tag['href'] = stylepath
        else:
            for ns in tag.findAll(text=True):
                src = str(ns)
                m = self.__class__.CSS_IMPORT_PATTERN.search(src)
                if m:
                    iurl = m.group(1)
                    if not urlsplit(iurl).scheme:
                        iurl = urljoin(baseurl, iurl, False)
                    with self.stylemap_lock:
                        if self.stylemap.has_key(iurl):  # noqa
                            ns.replaceWith(src.replace(m.group(1), self.stylemap[iurl]))
                            continue
                    try:
                        data = self.fetch_url(iurl)
                    except Exception:
                        self.log.exception('Could not fetch stylesheet ', iurl)
                        continue
                    c += 1
                    stylepath = os.path.join(diskpath, 'style' + str(c) + '.css')
                    with self.stylemap_lock:
                        self.stylemap[iurl] = stylepath
                    with open(stylepath, 'wb') as x:
                        x.write(data)
                    ns.replaceWith(src.replace(m.group(1), stylepath))
def process_stylesheets(self, soup, baseurl):
    diskpath = unicode_path(os.path.join(self.current_dir, 'stylesheets'))
    if not os.path.exists(diskpath):
        os.mkdir(diskpath)
    for c, tag in enumerate(soup.findAll(name=['link', 'style'])):
        try:
            mtype = tag['type']
        except KeyError:
            mtype = 'text/css' if tag.name.lower() == 'style' else ''
        if mtype.lower() != 'text/css':
            continue
        if tag.has_attr('href'):
            iurl = tag['href']
            if not urlsplit(iurl).scheme:
                iurl = urljoin(baseurl, iurl, False)
            with self.stylemap_lock:
                if iurl in self.stylemap:
                    tag['href'] = self.stylemap[iurl]
                    continue
            try:
                data = self.fetch_url(iurl)
            except Exception:
                self.log.exception('Could not fetch stylesheet ', iurl)
                continue
            stylepath = os.path.join(diskpath, 'style'+str(c)+'.css')
            with self.stylemap_lock:
                self.stylemap[iurl] = stylepath
            with open(stylepath, 'wb') as x:
                x.write(data)
            tag['href'] = stylepath
        else:
            for ns in tag.findAll(text=True):
                src = str(ns)
                m = self.__class__.CSS_IMPORT_PATTERN.search(src)
                if m:
                    iurl = m.group(1)
                    if not urlsplit(iurl).scheme:
                        iurl = urljoin(baseurl, iurl, False)
                    with self.stylemap_lock:
                        if iurl in self.stylemap:
                            ns.replaceWith(src.replace(m.group(1), self.stylemap[iurl]))
                            continue
                    try:
                        data = self.fetch_url(iurl)
                    except Exception:
                        self.log.exception('Could not fetch stylesheet ', iurl)
                        continue
                    c += 1
                    stylepath = os.path.join(diskpath, 'style'+str(c)+'.css')
                    with self.stylemap_lock:
                        self.stylemap[iurl] = stylepath
                    with open(stylepath, 'wb') as x:
                        x.write(data)
                    ns.replaceWith(src.replace(m.group(1), stylepath))
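# The @import branch above relies on a CSS_IMPORT_PATTERN class attribute that is not shown
# in this section. The sketch below is a minimal, illustrative reconstruction of such a
# pattern (the exact regular expression on the class may differ): it captures the URL inside
# an @import url(...) declaration, which is what m.group(1) is used for above.
import re

CSS_IMPORT_PATTERN = re.compile(r'@import\s+url\((.*?)\)', re.IGNORECASE)

# Example: CSS_IMPORT_PATTERN.search('@import url(print.css);').group(1) -> 'print.css'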
def absurl(self, baseurl, tag, key, filter=True):
    iurl = tag[key]
    parts = urlsplit(iurl)
    if not parts.netloc and not parts.path and not parts.query:
        return None
    if not parts.scheme:
        iurl = urljoin(baseurl, iurl, False)
    if not self.is_link_ok(iurl):
        self.log.debug('Skipping invalid link:', iurl)
        return None
    if filter and not self.is_link_wanted(iurl, tag):
        self.log.debug('Filtered link: ' + iurl)
        return None
    return iurl
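# Hedged sketch: the URL resolution that absurl() and the process_* methods rely on is the
# standard-library urlsplit/urljoin pair, with allow_fragments=False passed as the third
# positional argument to urljoin. The helper name below is illustrative only, not part of
# the class above.
from urllib.parse import urljoin, urlsplit

def _resolve_demo(baseurl, href):
    # Mirror the pattern used above: only join when the href carries no scheme of its own.
    if not urlsplit(href).scheme:
        href = urljoin(baseurl, href, False)
    return href

# _resolve_demo('https://example.com/news/index.html', 'story.html')
# -> 'https://example.com/news/story.html'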
def process_links(self, soup, baseurl, recursion_level, into_dir='links'):
    res = ''
    diskpath = os.path.join(self.current_dir, into_dir)
    if not os.path.exists(diskpath):
        os.mkdir(diskpath)
    prev_dir = self.current_dir
    try:
        self.current_dir = diskpath
        tags = list(soup.findAll('a', href=True))

        for c, tag in enumerate(tags):
            if self.show_progress:
                print('.', end=' ')
                sys.stdout.flush()
            sys.stdout.flush()
            iurl = self.absurl(baseurl, tag, 'href', filter=recursion_level != 0)
            if not iurl:
                continue
            nurl = self.normurl(iurl)
            if self.filemap.has_key(nurl):  # noqa
                self.localize_link(tag, 'href', self.filemap[nurl])
                continue
            if self.files > self.max_files:
                return res
            linkdir = 'link' + str(c) if into_dir else ''
            linkdiskpath = os.path.join(diskpath, linkdir)
            if not os.path.exists(linkdiskpath):
                os.mkdir(linkdiskpath)
            try:
                self.current_dir = linkdiskpath
                dsrc = self.fetch_url(iurl)
                newbaseurl = dsrc.newurl
                if len(dsrc) == 0 or \
                        len(re.compile('<!--.*?-->', re.DOTALL).sub('', dsrc).strip()) == 0:
                    raise ValueError('No content at URL %r' % iurl)
                if callable(self.encoding):
                    dsrc = self.encoding(dsrc)
                elif self.encoding is not None:
                    dsrc = dsrc.decode(self.encoding, 'replace')
                else:
                    dsrc = xml_to_unicode(dsrc, self.verbose)[0]

                st = time.time()
                soup = self.get_soup(dsrc, url=iurl)
                self.log.debug('Parsed %s in %.1f seconds' % (iurl, time.time() - st))

                base = soup.find('base', href=True)
                if base is not None:
                    newbaseurl = base['href']
                self.log.debug('Processing images...')
                self.process_images(soup, newbaseurl)
                if self.download_stylesheets:
                    self.process_stylesheets(soup, newbaseurl)

                _fname = basename(iurl)
                if not isinstance(_fname, unicode_type):
                    _fname = _fname.decode('latin1', 'replace')
                _fname = _fname.encode('ascii', 'replace').replace('%', '').replace(os.sep, '')
                _fname = ascii_filename(_fname)
                _fname = os.path.splitext(_fname)[0][:120] + '.xhtml'
                res = os.path.join(linkdiskpath, _fname)
                self.downloaded_paths.append(res)
                self.filemap[nurl] = res
                if recursion_level < self.max_recursions:
                    self.log.debug('Processing links...')
                    self.process_links(soup, newbaseurl, recursion_level + 1)
                else:
                    self.process_return_links(soup, newbaseurl)
                    self.log.debug('Recursion limit reached. Skipping links in', iurl)

                if newbaseurl and not newbaseurl.startswith('/'):
                    for atag in soup.findAll('a', href=lambda x: x and x.startswith('/')):
                        atag['href'] = urljoin(newbaseurl, atag['href'], True)
                if callable(self.postprocess_html_ext):
                    soup = self.postprocess_html_ext(
                        soup,
                        c == 0 and recursion_level == 0 and not getattr(self, 'called_first', False),
                        self.job_info)
                    if c == 0 and recursion_level == 0:
                        self.called_first = True

                save_soup(soup, res)
                self.localize_link(tag, 'href', res)
            except Exception as err:
                if isinstance(err, AbortArticle):
                    raise
                self.failed_links.append((iurl, traceback.format_exc()))
                self.log.exception('Could not fetch link', iurl)
            finally:
                self.current_dir = diskpath
                self.files += 1
    finally:
        self.current_dir = prev_dir

    if self.show_progress:
        print()
    return res
def process_images(self, soup, baseurl):
    diskpath = unicode_path(os.path.join(self.current_dir, 'images'))
    if not os.path.exists(diskpath):
        os.mkdir(diskpath)
    c = 0
    for tag in soup.findAll(lambda tag: tag.name.lower() == 'img' and tag.has_key('src')):  # noqa
        iurl = tag['src']
        if iurl.startswith('data:image/'):
            try:
                data = b64decode(iurl.partition(',')[-1])
            except:
                self.log.exception('Failed to decode embedded image')
                continue
        else:
            if callable(self.image_url_processor):
                iurl = self.image_url_processor(baseurl, iurl)
            if not urlsplit(iurl).scheme:
                iurl = urljoin(baseurl, iurl, False)
            with self.imagemap_lock:
                if self.imagemap.has_key(iurl):  # noqa
                    tag['src'] = self.imagemap[iurl]
                    continue
            try:
                data = self.fetch_url(iurl)
                if data == 'GIF89a\x01':
                    # Skip empty GIF files as PIL errors on them anyway
                    continue
            except Exception:
                self.log.exception('Could not fetch image ', iurl)
                continue
        c += 1
        fname = ascii_filename('img' + str(c))
        if isinstance(fname, unicode_type):
            fname = fname.encode('ascii', 'replace')
        data = self.preprocess_image_ext(data, iurl) if self.preprocess_image_ext is not None else data
        if data is None:
            continue
        itype = what(None, data)
        if itype == 'svg' or (itype is None and b'<svg' in data[:1024]):
            # SVG image
            imgpath = os.path.join(diskpath, fname + '.svg')
            with self.imagemap_lock:
                self.imagemap[iurl] = imgpath
            with open(imgpath, 'wb') as x:
                x.write(data)
            tag['src'] = imgpath
        else:
            try:
                # Ensure image is valid
                img = image_from_data(data)
                if itype not in {'png', 'jpg', 'jpeg'}:
                    itype = 'png' if itype == 'gif' else 'jpeg'
                    data = image_to_data(img, fmt=itype)
                if self.compress_news_images and itype in {'jpg', 'jpeg'}:
                    try:
                        data = self.rescale_image(data)
                    except Exception:
                        self.log.exception('failed to compress image ' + iurl)
                # Moon+ apparently cannot handle .jpeg files
                if itype == 'jpeg':
                    itype = 'jpg'
                imgpath = os.path.join(diskpath, fname + '.' + itype)
                with self.imagemap_lock:
                    self.imagemap[iurl] = imgpath
                with open(imgpath, 'wb') as x:
                    x.write(data)
                tag['src'] = imgpath
            except Exception:
                traceback.print_exc()
                continue
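# The what(None, data) call above sniffs the image format from the raw bytes (in calibre it
# comes from calibre.utils.imghdr). The sketch below shows the same idea using only the
# well-known magic numbers for the raster formats handled above; it is illustrative, not the
# implementation the code actually imports, and the SVG case is covered separately by the
# b'<svg' substring check in the method itself.
def _sniff_image_type(data):
    # Return 'png', 'jpeg' or 'gif' based on the leading magic bytes, else None.
    if data.startswith(b'\x89PNG\r\n\x1a\n'):
        return 'png'
    if data.startswith(b'\xff\xd8\xff'):
        return 'jpeg'
    if data[:6] in (b'GIF87a', b'GIF89a'):
        return 'gif'
    return None

# _sniff_image_type(b'\x89PNG\r\n\x1a\n...') -> 'png'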
def process_links(self, soup, baseurl, recursion_level, into_dir='links'):
    res = ''
    diskpath = os.path.join(self.current_dir, into_dir)
    if not os.path.exists(diskpath):
        os.mkdir(diskpath)
    prev_dir = self.current_dir
    try:
        self.current_dir = diskpath
        tags = list(soup.findAll('a', href=True))

        for c, tag in enumerate(tags):
            if self.show_progress:
                print('.', end=' ')
                sys.stdout.flush()
            sys.stdout.flush()
            iurl = self.absurl(baseurl, tag, 'href', filter=recursion_level != 0)
            if not iurl:
                continue
            nurl = self.normurl(iurl)
            if nurl in self.filemap:
                self.localize_link(tag, 'href', self.filemap[nurl])
                continue
            if self.files > self.max_files:
                return res
            linkdir = 'link'+str(c) if into_dir else ''
            linkdiskpath = os.path.join(diskpath, linkdir)
            if not os.path.exists(linkdiskpath):
                os.mkdir(linkdiskpath)
            try:
                self.current_dir = linkdiskpath
                dsrc = self.fetch_url(iurl)
                newbaseurl = dsrc.newurl
                if len(dsrc) == 0 or \
                        len(re.compile(b'<!--.*?-->', re.DOTALL).sub(b'', dsrc).strip()) == 0:
                    raise ValueError('No content at URL %r' % iurl)
                if callable(self.encoding):
                    dsrc = self.encoding(dsrc)
                elif self.encoding is not None:
                    dsrc = dsrc.decode(self.encoding, 'replace')
                else:
                    dsrc = xml_to_unicode(dsrc, self.verbose)[0]

                st = time.time()
                soup = self.get_soup(dsrc, url=iurl)
                self.log.debug('Parsed %s in %.1f seconds' % (iurl, time.time() - st))

                base = soup.find('base', href=True)
                if base is not None:
                    newbaseurl = base['href']
                self.log.debug('Processing images...')
                self.process_images(soup, newbaseurl)
                if self.download_stylesheets:
                    self.process_stylesheets(soup, newbaseurl)

                _fname = basename(iurl)
                if not isinstance(_fname, unicode_type):
                    _fname = _fname.decode('latin1', 'replace')
                _fname = _fname.replace('%', '').replace(os.sep, '')
                _fname = ascii_filename(_fname)
                _fname = os.path.splitext(_fname)[0][:120] + '.xhtml'
                res = os.path.join(linkdiskpath, _fname)
                self.downloaded_paths.append(res)
                self.filemap[nurl] = res
                if recursion_level < self.max_recursions:
                    self.log.debug('Processing links...')
                    self.process_links(soup, newbaseurl, recursion_level+1)
                else:
                    self.process_return_links(soup, newbaseurl)
                    self.log.debug('Recursion limit reached. Skipping links in', iurl)

                if newbaseurl and not newbaseurl.startswith('/'):
                    for atag in soup.findAll('a', href=lambda x: x and x.startswith('/')):
                        atag['href'] = urljoin(newbaseurl, atag['href'], True)
                if callable(self.postprocess_html_ext):
                    soup = self.postprocess_html_ext(
                        soup,
                        c == 0 and recursion_level == 0 and not getattr(self, 'called_first', False),
                        self.job_info)
                    if c == 0 and recursion_level == 0:
                        self.called_first = True

                save_soup(soup, res)
                self.localize_link(tag, 'href', res)
            except Exception as err:
                if isinstance(err, AbortArticle):
                    raise
                self.failed_links.append((iurl, traceback.format_exc()))
                self.log.exception('Could not fetch link', iurl)
            finally:
                self.current_dir = diskpath
                self.files += 1
    finally:
        self.current_dir = prev_dir

    if self.show_progress:
        print()
    return res
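# Hedged sketch of the emptiness test used in process_links above: the fetched bytes are
# treated as "no content" when nothing but HTML comments and whitespace remains after
# stripping <!-- ... --> blocks. Standard-library re only; the helper name is illustrative.
import re

def _is_effectively_empty(dsrc):
    # dsrc is the raw bytes returned by the fetch; DOTALL lets comments span multiple lines.
    return len(dsrc) == 0 or len(re.compile(b'<!--.*?-->', re.DOTALL).sub(b'', dsrc).strip()) == 0

# _is_effectively_empty(b'  <!-- placeholder -->\n ')        -> True
# _is_effectively_empty(b'<html><body>hi</body></html>')     -> False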
def process_images(self, soup, baseurl):
    diskpath = unicode_path(os.path.join(self.current_dir, 'images'))
    if not os.path.exists(diskpath):
        os.mkdir(diskpath)
    c = 0
    for tag in soup.findAll('img', src=True):
        iurl = tag['src']
        if iurl.startswith('data:image/'):
            try:
                data = from_base64_bytes(iurl.partition(',')[-1])
            except Exception:
                self.log.exception('Failed to decode embedded image')
                continue
        else:
            if callable(self.image_url_processor):
                iurl = self.image_url_processor(baseurl, iurl)
            if not urlsplit(iurl).scheme:
                iurl = urljoin(baseurl, iurl, False)
            with self.imagemap_lock:
                if iurl in self.imagemap:
                    tag['src'] = self.imagemap[iurl]
                    continue
            try:
                data = self.fetch_url(iurl)
                if data == b'GIF89a\x01':
                    # Skip empty GIF files as PIL errors on them anyway
                    continue
            except Exception:
                self.log.exception('Could not fetch image ', iurl)
                continue
        c += 1
        fname = ascii_filename('img'+str(c))
        data = self.preprocess_image_ext(data, iurl) if self.preprocess_image_ext is not None else data
        if data is None:
            continue
        itype = what(None, data)
        if itype == 'svg' or (itype is None and b'<svg' in data[:1024]):
            # SVG image
            imgpath = os.path.join(diskpath, fname+'.svg')
            with self.imagemap_lock:
                self.imagemap[iurl] = imgpath
            with open(imgpath, 'wb') as x:
                x.write(data)
            tag['src'] = imgpath
        else:
            try:
                # Ensure image is valid
                img = image_from_data(data)
                if itype not in {'png', 'jpg', 'jpeg'}:
                    itype = 'png' if itype == 'gif' else 'jpeg'
                    data = image_to_data(img, fmt=itype)
                if self.compress_news_images and itype in {'jpg', 'jpeg'}:
                    try:
                        data = self.rescale_image(data)
                    except Exception:
                        self.log.exception('failed to compress image '+iurl)
                # Moon+ apparently cannot handle .jpeg files
                if itype == 'jpeg':
                    itype = 'jpg'
                imgpath = os.path.join(diskpath, fname+'.'+itype)
                with self.imagemap_lock:
                    self.imagemap[iurl] = imgpath
                with open(imgpath, 'wb') as x:
                    x.write(data)
                tag['src'] = imgpath
            except Exception:
                traceback.print_exc()
                continue
def process_images(self, soup, baseurl):
    diskpath = unicode_path(os.path.join(self.current_dir, 'images'))
    if not os.path.exists(diskpath):
        os.mkdir(diskpath)
    c = 0
    for tag in soup.findAll('img', src=True):
        iurl = tag['src']
        if iurl.startswith('data:'):
            try:
                data = urlopen(iurl).read()
            except Exception:
                self.log.exception('Failed to decode embedded image')
                continue
        else:
            if callable(self.image_url_processor):
                iurl = self.image_url_processor(baseurl, iurl)
            if not urlsplit(iurl).scheme:
                iurl = urljoin(baseurl, iurl, False)
            found_in_cache = False
            with self.imagemap_lock:
                if iurl in self.imagemap:
                    tag['src'] = self.imagemap[iurl]
                    found_in_cache = True
            if found_in_cache:
                continue
            try:
                data = self.fetch_url(iurl)
                if data == b'GIF89a\x01':
                    # Skip empty GIF files as PIL errors on them anyway
                    continue
            except Exception:
                self.log.exception('Could not fetch image ', iurl)
                continue
        c += 1
        fname = ascii_filename('img' + str(c))
        data = self.preprocess_image_ext(data, iurl) if self.preprocess_image_ext is not None else data
        if data is None:
            continue
        itype = what(None, data)
        if itype == 'svg' or (itype is None and b'<svg' in data[:1024]):
            # SVG image
            imgpath = os.path.join(diskpath, fname + '.svg')
            with self.imagemap_lock:
                self.imagemap[iurl] = imgpath
            with open(imgpath, 'wb') as x:
                x.write(data)
            tag['src'] = imgpath
        else:
            from calibre.utils.img import image_from_data, image_to_data
            try:
                # Ensure image is valid
                img = image_from_data(data)
                if itype not in {'png', 'jpg', 'jpeg'}:
                    itype = 'png' if itype == 'gif' else 'jpeg'
                    data = image_to_data(img, fmt=itype)
                if self.compress_news_images and itype in {'jpg', 'jpeg'}:
                    try:
                        data = self.rescale_image(data)
                    except Exception:
                        self.log.exception('failed to compress image ' + iurl)
                # Moon+ apparently cannot handle .jpeg files
                if itype == 'jpeg':
                    itype = 'jpg'
                imgpath = os.path.join(diskpath, fname + '.' + itype)
                with self.imagemap_lock:
                    self.imagemap[iurl] = imgpath
                with open(imgpath, 'wb') as x:
                    x.write(data)
                tag['src'] = imgpath
            except Exception:
                traceback.print_exc()
                continue
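# Hedged sketch of the two ways the embedded-image branch decodes a data: URI in the
# process_images variants above: manually base64-decoding the text after the first comma,
# versus handing the whole URI to urllib.request.urlopen, which resolves data: URLs natively
# in Python 3. Only base64-encoded data URIs are covered by the manual path; the constant
# and variable names below are illustrative.
from base64 import b64decode
from urllib.request import urlopen

DATA_URI = 'data:image/gif;base64,R0lGODlhAQABAAAAACw='

manual = b64decode(DATA_URI.partition(',')[-1])   # decode just the payload after the comma
via_urlopen = urlopen(DATA_URI).read()            # let urllib parse and decode the whole URI

assert manual == via_urlopen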