def opener(url, ienc=None, save=None, **ka):
    """Fetch the content behind *url* and return it, decoded when possible.

    Args:
        url: Location to read. When it starts with the module-level ``FILE``
            prefix, the remainder is opened as a local path; otherwise it is
            passed to ``url_opener``.
        ienc: Encoding used to decode local-file content. When falsy, the
            encoding is guessed with ``chardet``.
        save: When truthy, the raw bytes are also written to a file in the
            current directory named after *url* with ``/`` replaced by ``__``.
        **ka: Extra options, forwarded to ``url_opener`` as a single dict.

    Returns:
        ``str`` if an encoding is in effect, otherwise the raw ``bytes``.
    """
    if url.startswith(FILE):
        url = url[len(FILE) :]
        import io

        # Context manager so the handle is released even if read() fails.
        with io.open(url, "rb") as src:
            d = src.read()
        if not ienc:
            import chardet  # chardet.feedparser.org, python3-chardet, python-chardet

            # detect() also reports a "confidence" value, which is ignored here.
            ienc = chardet.detect(d)["encoding"]
    else:
        # NOTE(review): a caller-supplied ienc is discarded for non-file
        # URLs, so this branch always returns bytes -- confirm intended.
        ienc = None
        r = url_opener(url, ka)
        try:
            d = r.read()
        finally:
            # Always release the response, including when read() raises.
            r.close()

    if save:
        # For local files, url has already had the FILE prefix stripped.
        with open(url.replace("/", "__"), "wb") as dump:
            dump.write(d)

    # Remove a hanging, never-closed "<" that is followed by another "<"
    # before any ">" appears; everything after the stray "<" is kept.
    d = re.sub(b"<([^>]*?<)", rb"\1", d)

    if ienc:
        d = d.decode(ienc)
    return d
def loadUrl(self, url):
    """Record *url* on this object, fetch it, and load the markup.

    The page is fetched with PyQuery's ``url_opener`` (called with no extra
    options). The opener may return either ready-made text or a file-like
    response; a file-like response is read and then closed. Raw bytes are
    decoded before being handed to ``loadHtml``, because ``str(bytes)``
    would yield the ``"b'...'"`` repr instead of the page content.

    Args:
        url: Address of the page to fetch.
    """
    self.setUrl(url)

    # Use PyQuery's URL opener to properly handle content encoding
    response = url_opener(url, {})
    if hasattr(response, 'read'):
        try:
            html = response.read()
        finally:
            # Release the connection if the response supports closing.
            close = getattr(response, 'close', None)
            if close is not None:
                close()
    else:
        html = response

    if isinstance(html, bytes):
        # Prefer the charset declared in the response headers, if any;
        # fall back to UTF-8 when none is declared or it is unknown.
        headers = getattr(response, 'headers', None)
        get_charset = getattr(headers, 'get_content_charset', None)
        charset = (get_charset() if get_charset is not None else None) or 'utf-8'
        try:
            html = html.decode(charset, errors='replace')
        except LookupError:
            html = html.decode('utf-8', errors='replace')

    self.loadHtml(str(html))
def loadUrl(self, url):
    """Record *url* on this object, fetch it, and load the markup.

    The page is fetched with PyQuery's ``url_opener`` (called with no extra
    options). The opener may return either ready-made text or a file-like
    response; a file-like response is read and then closed. Raw bytes are
    decoded before being handed to ``loadHtml``, because ``str(bytes)``
    would yield the ``"b'...'"`` repr instead of the page content.

    Args:
        url: Address of the page to fetch.
    """
    self.setUrl(url)

    # Use PyQuery's URL opener to properly handle content encoding
    response = url_opener(url, {})
    if hasattr(response, 'read'):
        try:
            html = response.read()
        finally:
            # Release the connection if the response supports closing.
            close = getattr(response, 'close', None)
            if close is not None:
                close()
    else:
        html = response

    if isinstance(html, bytes):
        # Prefer the charset declared in the response headers, if any;
        # fall back to UTF-8 when none is declared or it is unknown.
        headers = getattr(response, 'headers', None)
        get_charset = getattr(headers, 'get_content_charset', None)
        charset = (get_charset() if get_charset is not None else None) or 'utf-8'
        try:
            html = html.decode(charset, errors='replace')
        except LookupError:
            html = html.decode('utf-8', errors='replace')

    self.loadHtml(str(html))