def save_soup(soup, target):
    """Serialize *soup* to the file *target* as UTF-8 encoded HTML.

    Ensures the document declares a UTF-8 charset via a
    ``<meta http-equiv="Content-Type">`` tag (replacing any existing
    charset-bearing meta, or inserting one at the top of ``<head>``),
    and rewrites absolute local filesystem paths in ``img``/``link``/``a``
    tags to paths relative to *target*'s directory.

    :param soup: a parsed BeautifulSoup document (mutated in place)
    :param target: output file path; its dirname is the base for relative links
    """
    ns = BeautifulSoup(
        '<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />'
    )
    nm = ns.find('meta')
    metas = soup.findAll('meta', content=True)
    added = False
    for meta in metas:
        if 'charset' in meta.get('content', '').lower():
            meta.replaceWith(nm)
            added = True
    if not added:
        head = soup.find('head')
        if head is not None:
            head.insert(0, nm)

    selfdir = os.path.dirname(target)
    for tag in soup.findAll(['img', 'link', 'a']):
        for key in ('src', 'href'):
            path = tag.get(key, None)
            # Fix: os.path.isfile() already implies existence, so the
            # original extra os.path.exists() call was a redundant stat.
            # isabs() is a pure string test, so run it first (cheapest).
            if path and os.path.isabs(path) and os.path.isfile(path):
                tag[key] = unicode_path(
                    relpath(path, selfdir).replace(os.sep, '/'))

    html = unicode_type(soup)
    with open(target, 'wb') as f:
        f.write(html.encode('utf-8'))
def href(self, basedir=None):
    '''
    Return a URL pointing to this resource. If it is a file on the
    filesystem the URL is relative to `basedir`.

    `basedir`: If None, the basedir of this resource is used (see
    :method:`set_basedir`). If this resource has no basedir, then the
    current working directory is used as the basedir.
    '''
    from urllib import quote
    if basedir is None:
        basedir = self._basedir if self._basedir else os.getcwdu()
    if self.path is None:
        return self._href
    # Percent-encode the fragment (UTF-8 bytes) when one is present.
    frag = ''
    if self.fragment:
        f = self.fragment
        if isinstance(f, unicode):
            f = f.encode('utf-8')
        frag = '#' + quote(f)
    if self.path == basedir:
        return '' + frag
    try:
        rpath = relpath(self.path, basedir)
    except OSError:
        # On windows path and basedir could be on different drives
        rpath = self.path
    if isinstance(rpath, unicode):
        rpath = rpath.encode('utf-8')
    return quote(rpath.replace(os.sep, '/')) + frag
def save_soup(soup, target):
    """Serialize *soup* to the file *target* as UTF-8 encoded HTML.

    Forces a UTF-8 ``Content-Type`` meta tag (replacing any meta whose
    ``content`` mentions a charset, else inserting one at the top of
    ``<head>``), and rewrites absolute local paths in ``img``/``link``/``a``
    tags to be relative to *target*'s directory.

    :param soup: a parsed BeautifulSoup document (mutated in place)
    :param target: output file path; its dirname is the base for relative links
    """
    ns = BeautifulSoup(
        '<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />'
    )
    nm = ns.find('meta')
    metas = soup.findAll('meta', content=True)
    added = False
    for meta in metas:
        if 'charset' in meta.get('content', '').lower():
            meta.replaceWith(nm)
            added = True
    if not added:
        head = soup.find('head')
        if head is not None:
            head.insert(0, nm)

    selfdir = os.path.dirname(target)
    for tag in soup.findAll(['img', 'link', 'a']):
        for key in ('src', 'href'):
            path = tag.get(key, None)
            # Fix: os.path.isfile() already implies existence, so the
            # original extra os.path.exists() call was a redundant stat.
            # isabs() is a pure string test, so run it first (cheapest).
            if path and os.path.isabs(path) and os.path.isfile(path):
                tag[key] = unicode_path(
                    relpath(path, selfdir).replace(os.sep, '/'))

    html = unicode(soup)
    with open(target, 'wb') as f:
        f.write(html.encode('utf-8'))
def save_soup(soup, target):
    """Serialize *soup* to the file *target* as UTF-8 encoded HTML.

    Removes every existing charset declaration (both ``content=``-style
    and ``charset=``-style metas), inserts a single ``<meta charset="utf-8">``
    at the top of ``<head>``, and rewrites absolute local paths in
    ``img``/``link``/``a`` tags to be relative to *target*'s directory.

    :param soup: a parsed BeautifulSoup (bs4) document (mutated in place)
    :param target: output file path; its dirname is the base for relative links
    """
    # Strip all pre-existing charset declarations so exactly one remains.
    for meta in soup.findAll('meta', content=True):
        if 'charset' in meta['content'].lower():
            meta.extract()
    for meta in soup.findAll('meta', charset=True):
        meta.extract()
    head = soup.find('head')
    if head is not None:
        nm = soup.new_tag('meta', charset='utf-8')
        head.insert(0, nm)

    selfdir = os.path.dirname(target)
    for tag in soup.findAll(['img', 'link', 'a']):
        for key in ('src', 'href'):
            path = tag.get(key, None)
            # Fix: os.path.isfile() already implies existence, so the
            # original extra os.path.exists() call was a redundant stat.
            # isabs() is a pure string test, so run it first (cheapest).
            if path and os.path.isabs(path) and os.path.isfile(path):
                tag[key] = unicode_path(
                    relpath(path, selfdir).replace(os.sep, '/'))

    html = str(soup)
    with open(target, 'wb') as f:
        f.write(html.encode('utf-8'))
def save_soup(soup, target):
    """Serialize *soup* to the file *target* as UTF-8 encoded HTML.

    Removes every existing charset declaration (both ``content=``-style
    and ``charset=``-style metas), inserts a single ``<meta charset="utf-8">``
    at the top of ``<head>``, and rewrites absolute local paths in
    ``img``/``link``/``a`` tags to be relative to *target*'s directory.

    :param soup: a parsed BeautifulSoup (bs4) document (mutated in place)
    :param target: output file path; its dirname is the base for relative links
    """
    # Strip all pre-existing charset declarations so exactly one remains.
    for meta in soup.findAll('meta', content=True):
        if 'charset' in meta['content'].lower():
            meta.extract()
    for meta in soup.findAll('meta', charset=True):
        meta.extract()
    head = soup.find('head')
    if head is not None:
        nm = soup.new_tag('meta', charset='utf-8')
        head.insert(0, nm)

    selfdir = os.path.dirname(target)
    for tag in soup.findAll(['img', 'link', 'a']):
        for key in ('src', 'href'):
            path = tag.get(key, None)
            # Fix: os.path.isfile() already implies existence, so the
            # original extra os.path.exists() call was a redundant stat.
            # isabs() is a pure string test, so run it first (cheapest).
            if path and os.path.isabs(path) and os.path.isfile(path):
                tag[key] = unicode_path(
                    relpath(path, selfdir).replace(os.sep, '/'))

    html = unicode_type(soup)
    with open(target, 'wb') as f:
        f.write(html.encode('utf-8'))