Exemple #1
0
def query(keyword, page=1):
    """Run a Sogou Weixin search and return the parsed result with the final URL.

    The request goes to ``http://weixin.sogou.com/weixin`` through the local
    proxy at ``127.0.0.1:3333`` with a randomly chosen user agent. Up to three
    attempts are made; an attempt counts as successful when the response is
    truthy and is not a (permanent) redirect. A random pause of 0.2-2.0
    seconds follows every unsuccessful attempt.

    Args:
        keyword: Search term, sent as the ``query`` parameter.
        page: Result page number, sent as the ``page`` parameter.

    Returns:
        A ``(parse(response.content), response.url)`` tuple, or ``([], "")``
        when the last response is falsy.
    """
    request_headers = {
        "Host": "weixin.sogou.com",
        "User-Agent": random.choice(WEB_USER_AGENT),
    }
    proxy_map = dict.fromkeys(("http", "https"), "127.0.0.1:3333")
    endpoint = "http://weixin.sogou.com/weixin"
    query_args = {"type": 1, "query": keyword, "ie": "utf8", "page": page}

    response = None
    succeeded = False
    attempts_left = 3
    while attempts_left > 0:
        attempts_left -= 1
        response = http.get(
            endpoint,
            params=query_args,
            headers=request_headers,
            proxies=proxy_map,
        )
        if (
            response
            and not response.is_redirect
            and not response.is_permanent_redirect
        ):
            succeeded = True
            break
        time.sleep(random.randint(2, 20) * 0.1)

    if not succeeded:
        # A truthy response here means every attempt was answered with a
        # redirect; a falsy one means the download itself did not succeed.
        _logger.warning('sogou ban' if response else 'download exception')

    if not response:
        return [], ""
    return parse(response.content), response.url
Exemple #2
0
def download_image(url):
    """Download ``url`` and return the raw response body.

    The request is sent with ``headers``, a name this function does not
    define itself (presumably a module-level dict -- verify against the
    rest of the file).

    Args:
        url: Address of the image to fetch.

    Returns:
        The response's ``content`` on success, or ``None`` when ``http.get``
        yields a falsy result or the status code is 400 or above.
    """
    r = http.get(url, headers=headers)
    if not r:
        return None
    if r.status_code >= 400:
        # Logger.warn is a deprecated alias that was removed in Python 3.13;
        # Logger.warning emits the same message. Passing the status code as
        # an argument defers formatting until the record is actually handled.
        _logger.warning("download image error: %s", r.status_code)
        return None
    return r.content
def get_document(url):
    """Fetch ``url`` and return its body re-encoded as UTF-8 bytes.

    The request carries a fixed desktop Chrome user agent and a ``host``
    header taken from the URL's network location.

    Args:
        url: Address of the document to fetch.

    Returns:
        The body decoded with the response's apparent encoding and encoded
        as UTF-8 on HTTP 200. An empty string when ``http.get`` yields a
        falsy result or the status code is not 200.

    Raises:
        UnicodeDecodeError: If the body cannot be decoded with the chosen
            encoding.
    """
    netloc = urlparse(url).netloc
    headers = {"user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36",
               "host": netloc}
    r = http.get(url, headers=headers)
    if not r:
        return ''
    if r.status_code != 200:
        print("get document error: %s" % r.status_code)
        return ""

    # With a requests-style response, apparent_encoding is None when charset
    # detection fails (e.g. an empty body); calling .lower() on it would
    # raise AttributeError, so fall back to UTF-8 in that case.
    encoding = r.apparent_encoding or "utf-8"
    if encoding.lower() == "gb2312":
        # GB18030 is a superset of GB2312, so it also decodes pages that are
        # labelled GB2312 but contain characters outside that set.
        encoding = "GB18030"
    return r.content.decode(encoding).encode("utf-8")
Exemple #4
0
def get_document(url):
    """Fetch ``url`` and return its body, transcoding GB2312 pages to UTF-8.

    The request carries the first entry of ``WEB_USER_AGENT`` as user agent
    and a ``host`` header taken from the URL's network location.

    Args:
        url: Address of the document to fetch.

    Returns:
        On HTTP 200: the body decoded as GB18030 and re-encoded as UTF-8 when
        the apparent encoding is GB2312, otherwise the raw ``content``
        unchanged. An empty string when ``http.get`` yields a falsy result or
        the status code is not 200.

    Raises:
        UnicodeDecodeError: If a body detected as GB2312 cannot be decoded
            as GB18030.
    """
    netloc = urlparse(url).netloc
    headers = {"user-agent": WEB_USER_AGENT[0], "host": netloc}
    r = http.get(url, headers=headers)
    if not r:
        return ''
    if r.status_code != 200:
        print("get document error: %s" % r.status_code)
        return ""

    # With a requests-style response, apparent_encoding is None when charset
    # detection fails (e.g. an empty body). Guard before calling .lower() so
    # that case returns the raw content instead of raising AttributeError.
    encoding = r.apparent_encoding
    if encoding and encoding.lower() == "gb2312":
        # GB18030 is a superset of GB2312, so it also decodes pages that are
        # labelled GB2312 but contain characters outside that set.
        return r.content.decode("GB18030").encode("utf-8")
    return r.content