def grabber(url, header=None, *, referer=None, cookie=None, raise_429=True, params=None, done=None):
    """Request *url* and return the response object.

    A per-netloc ``requests.Session`` is created on first use and cached in
    the module-level ``sessions`` dict, so cookies and headers persist
    across requests to the same host.

    Args:
        url: Target URL; its netloc keys the session cache.
        header: Optional dict of extra headers merged into the session.
        referer: Optional referer URL, percent-quoted before sending.
        cookie: Optional cookie dict added to the session's cookie jar.
            NOTE(review): ``quote_unicode_dict(cookie)`` appears to mutate
            the caller's dict in place (its result is unused) — confirm.
        raise_429: Passed through to ``do_request``; presumably controls
            whether HTTP 429 raises — verify against ``do_request``.
        params: Query parameters passed through to ``do_request``.
        done: Optional callback ``done(session, response)`` invoked after
            the request completes.

    Returns:
        The response object produced by ``sync(do_request, ...)``.
    """
    # Only the netloc is needed; avoid unpacking the unused split parts.
    netloc = urlsplit(url).netloc
    # EAFP: one dict lookup instead of a membership test plus a fetch.
    try:
        s = sessions[netloc]
    except KeyError:
        s = requests.Session()
        s.headers.update(default_header)
        sessions[netloc] = s
    if header:
        s.headers.update(header)
    if referer:
        s.headers['referer'] = quote_unicode(referer)
    if cookie:
        quote_unicode_dict(cookie)
        requests.utils.add_dict_to_cookiejar(s.cookies, cookie)
    r = sync(do_request, s, url, params, raise_429)
    if done:
        done(s, r)
    return r
def grabimg(*args, **kwargs):
    """Download an image via ``grabber``.

    All arguments are forwarded to ``grabber``. The MIME type is sniffed
    from the ``Content-Type`` header (anything before the first ``;``),
    if present, and handed to ``get_ext`` together with the raw bytes.

    Returns:
        A ``(extension, content_bytes)`` tuple.
    """
    response = sync(grabber, *args, **kwargs)
    content = response.content
    mime = None
    content_type = response.headers.get("Content-Type")
    if content_type is not None:
        # Strip any parameters (e.g. "; charset=...") from the media type.
        mime = re.search("^(.*?)(;|$)", content_type).group(1).strip()
    return get_ext(mime, content), content
def grabhtml(*args, **kwargs):
    """Fetch a page via ``grabber`` and return its decoded text.

    All arguments are forwarded to ``grabber``. If the raw body declares a
    charset (e.g. in a ``<meta>`` tag), that encoding is applied before
    decoding; ``gb2312`` is widened to ``gbk``.

    Returns:
        The response body as a string.
    """
    response = sync(grabber, *args, **kwargs)
    charset_match = re.search(br"charset=[\"']?([^\"'>]+)", response.content)
    if charset_match:
        declared = charset_match.group(1).decode("latin-1")
        # gb2312 is a subset of gbk; use the superset to avoid decode errors.
        response.encoding = "gbk" if declared == "gb2312" else declared
    return response.text