def query(keyword, page=1):
    """Search the Sogou WeChat index for *keyword* and parse the result page.

    The request is sent through the proxy at 127.0.0.1:3333 with a user agent
    picked at random from ``WEB_USER_AGENT``. It is attempted up to three
    times: a falsy response or a redirect triggers a retry after a random
    pause of 0.2 to 2.0 seconds.

    Args:
        keyword: Search term, sent as the ``query`` request parameter.
        page: Result page number, sent as the ``page`` request parameter.

    Returns:
        A ``(results, url)`` tuple. ``results`` is ``parse(r.content)`` and
        ``url`` is ``r.url`` of the last response. When the last response is
        falsy the tuple is ``([], "")``.
    """
    max_attempts = 3
    headers = {
        "Host": "weixin.sogou.com",
        "User-Agent": random.choice(WEB_USER_AGENT),
    }
    proxies = {
        "http": "127.0.0.1:3333",
        "https": "127.0.0.1:3333",
    }
    url = "http://weixin.sogou.com/weixin"
    params = {"type": 1, "query": keyword, "ie": "utf8", "page": page}

    r = None
    for attempt in range(1, max_attempts + 1):
        r = http.get(url, params=params, headers=headers, proxies=proxies)
        if r and not (r.is_redirect or r.is_permanent_redirect):
            break
        # Pause only when another attempt follows; sleeping after the final
        # failure would just delay the caller.
        if attempt < max_attempts:
            time.sleep(random.randint(2, 20) * 0.1)
    else:
        # Reached only when no attempt produced a usable response.
        if r:
            # Truthy but still redirecting on every attempt.
            _logger.warning('sogou ban')
        else:
            _logger.warning('download exception')

    if not r:
        return [], ""
    # NOTE(review): a response that is still a redirect after the last attempt
    # is parsed as well -- confirm that parse() copes with that page.
    return parse(r.content), r.url
def download_image(url):
    """Download *url* and return the raw response body.

    Args:
        url: Address of the image to fetch.

    Returns:
        ``r.content`` on success, or ``None`` when the response is falsy or
        its status code is 400 or above.
    """
    # NOTE(review): ``headers`` is not defined inside this function, so it has
    # to exist at module level -- confirm, otherwise this raises NameError.
    r = http.get(url, headers=headers)
    if not r:
        return None
    if r.status_code >= 400:
        # Logger.warn is a deprecated alias; use warning() like query() does,
        # and let the logger do the formatting lazily.
        _logger.warning("download image error: %s", r.status_code)
        return None
    return r.content
def get_document(url):
    """Fetch *url* and return its body transcoded to UTF-8 bytes.

    The request carries a fixed desktop Chrome user agent and a ``host``
    header set to the network location of *url*.

    Args:
        url: Address of the document to fetch.

    Returns:
        The body re-encoded as UTF-8 when the status code is 200 and an
        encoding was detected; the untouched body when the status code is 200
        but no encoding was detected; an empty string when the response is
        falsy or the status code is not 200.
    """
    # NOTE(review): a later definition of get_document in this file replaces
    # this one at import time -- confirm which of the two is intended.
    netloc = urlparse(url).netloc
    headers = {
        "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36",
        "host": netloc,
    }
    r = http.get(url, headers=headers)
    if not r:
        return ''
    if r.status_code != 200:
        print("get document error: %s" % r.status_code)
        return ""

    encoding = r.apparent_encoding
    if not encoding:
        # The detector had no guess (presumably possible for an empty or
        # binary body); without a source encoding the bytes cannot be
        # transcoded, so hand them back unchanged instead of crashing on
        # None.lower().
        return r.content
    if encoding.lower() == "gb2312":
        # GB18030 is a superset of GB2312, so it also decodes characters that
        # fall outside the narrower charset.
        encoding = "GB18030"
    return r.content.decode(encoding).encode("utf-8")
def get_document(url):
    """Fetch *url* and return its body, transcoding GB2312 pages to UTF-8.

    The request carries the first entry of ``WEB_USER_AGENT`` as user agent
    and a ``host`` header set to the network location of *url*.

    Args:
        url: Address of the document to fetch.

    Returns:
        For a status code of 200: the body re-encoded as UTF-8 when the
        detected encoding is GB2312, otherwise the untouched body. An empty
        string when the response is falsy or the status code is not 200.
    """
    netloc = urlparse(url).netloc
    headers = {"user-agent": WEB_USER_AGENT[0], "host": netloc}
    r = http.get(url, headers=headers)
    if not r:
        return ''
    if r.status_code != 200:
        print("get document error: %s" % r.status_code)
        return ""

    encoding = r.apparent_encoding
    # ``encoding`` may be None when the detector has no guess; checking it
    # first avoids an AttributeError on None.lower() and falls through to
    # returning the raw body, like every other non-GB2312 result.
    if encoding and encoding.lower() == "gb2312":
        # GB18030 is a superset of GB2312, so it also decodes characters that
        # fall outside the narrower charset.
        return r.content.decode("GB18030").encode("utf-8")
    return r.content