コード例 #1
0
def get_360_pic(keyword, data_path):
    """Collect image URLs for *keyword* from image.so.com and save them as JSON.

    The search result pages (plain, white and black colour filters, zoom
    levels 1-3) are fetched first, then the paginated JSON endpoint is walked
    for pages 1-79. The accumulated URL set is written to
    ``<data_path><keyword>/pic_360_<keyword>.txt`` after every request so that
    an interrupted run still leaves a usable file behind.

    Args:
        keyword: Search term; also used as the sub-directory and file name.
        data_path: Base directory prefix (concatenated as-is, so it should end
            with a path separator).
    """
    data_path = data_path + keyword + "/"
    # makedirs creates missing parents and tolerates an existing directory.
    os.makedirs(data_path, exist_ok=True)
    to_headers = b"""
        user-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36
        """
    headers = headers_raw_to_dict(to_headers)
    pic_url_set = set()
    # Colour filter suffixes, in the order the requests are issued.
    color_filters = ("", "&color=white", "&color=black")

    first_url_list = [
        f"http://image.so.com/i?q={keyword}&src=srp&zoom={zoom_type}{color}"
        for color in color_filters
        for zoom_type in range(1, 4)
    ]
    try:
        for first_url in first_url_list:
            # NOTE: `proxies` is not defined in this function; it is expected
            # to exist at module level.
            first_webdata = requests.get(first_url, headers=headers, proxies=proxies).text
            first_datas = etree.HTML(first_webdata)
            xpath_datas = json.loads(first_datas.xpath("//script[@id='initData']/text()")[0])
            for data in xpath_datas["list"]:
                pic_url_set.add(data["img"])
            time.sleep(random.randint(5, 10) / 10)
    except Exception as e:
        # Best effort: the paginated endpoint below is still tried.
        print(e)

    def url_process(url, proxies=None):
        """Fetch one JSON result page and return the set of image URLs in it."""
        web_data = requests.get(url, headers=headers, proxies=proxies).text
        if "您的电脑或所在局域网络对本站有异常访问" in web_data:
            raise InterruptedError("访问异常")
        datas = json.loads(web_data)
        return {data["img"] for data in datas["list"]}

    api_base = (
        f"https://image.so.com/j?q={keyword}&pd=1&pn=60&correct={keyword}"
        "&adstar=0&tab=all&sid=2e488cafefd0f95cc08342c9a979c788&ras=0&cn=0&gn=0&kn=50"
        "&crn=0&bxn=0&cuben=0&src=srp"
    )
    out_file = data_path + f"pic_360_{keyword}.txt"

    for i in range(1, 80):
        print(f"第{i}页,有{len(pic_url_set)}张图")
        url_list = [
            f"{api_base}&zoom={zoom_type}{color}&sn={50 + 60 * i}&pn=60"
            for color in color_filters
            for zoom_type in range(1, 4)
        ]
        for url in url_list:
            # One retry for the "blocked" and timeout cases; a second failure
            # is only reported so that one bad URL cannot abort the crawl.
            for _attempt in range(2):
                try:
                    pic_url_set |= url_process(url)
                    break
                except InterruptedError:
                    print("您的电脑或所在局域网络对本站有异常访问")
                except TimeoutError:
                    print('换代理ip')
                except Exception:
                    traceback.print_exc()
                    break
            time.sleep(random.randint(20, 25) / 10)
            # Checkpoint the result after every request.
            pic_dict = {"关键词": keyword, "图片数量": len(pic_url_set), "图片链接列表": list(pic_url_set)}
            with open(out_file, "w", encoding='utf-8') as f:
                f.write(json.dumps(pic_dict, ensure_ascii=False))
コード例 #2
0
 def get_headers(self):
     """Return ``self.headers`` extended with the raw headers and the XSRF token.

     The zhihu signup page is requested through ``self.s`` to obtain the
     ``_xsrf`` cookie. ``self.headers`` itself is updated in place and
     returned.

     Raises:
         KeyError: If the response carries no ``_xsrf`` cookie.
     """
     hdrs = self.headers
     signup_page = self.s.get(
         'https://www.zhihu.com/signup?next=%2F',
         headers=hdrs,
     )
     extra = headers_raw_to_dict(self.raw_headers)
     hdrs.update(extra)
     # A randomised User-Agent may leave response.cookies empty.
     xsrf_token = signup_page.cookies['_xsrf']
     hdrs['x-xsrftoken'] = xsrf_token
     return hdrs
コード例 #3
0
def get_cookies():
    """Request the lagou.com python job list page and return its cookies.

    Returns:
        dict: The result of ``response.cookies.get_dict()``.
    """
    raw_headers = '''
authority: www.lagou.com
method: GET
path: /jobs/list_python/p-city_0?
scheme: https
accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9
accept-encoding: gzip, deflate, br
accept-language: zh-CN,zh;q=0.9
cache-control: no-cache
pragma: no-cache
sec-fetch-dest: document
sec-fetch-mode: navigate
sec-fetch-site: none
sec-fetch-user: ?1
upgrade-insecure-requests: 1
user-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36
    '''
    # headers_raw_to_dict is fed bytes, so encode the pasted text first.
    request_headers = headers_raw_to_dict(raw_headers.encode("utf-8"))
    list_url = 'https://www.lagou.com/jobs/list_python/p-city_0?'
    page = requests.get(url=list_url, headers=request_headers)
    return page.cookies.get_dict()
コード例 #4
0
def main():
    """Fetch likes, comments and reposts for one weibo status and save them.

    The request headers (including a session cookie) are parsed from a pasted
    raw header block. Each category is fetched with its helper and written
    through ``write_file`` under the label '赞', '评论' or '转发'. Any
    exception is printed together with its traceback instead of propagating.
    """
    try:
        headers = b"""
        accept:text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8
        accept-encoding:gzip, deflate, br
        accept-language:zh-CN,zh;q=0.9
        cache-control:max-age=0
        cookie:_T_WM=7b01809efebbf4591817160024e1b6a6; WEIBOCN_WM=3333_2001; SCF=AteYQ3Fn2RHnmdtLq7-P7aC9_YRjVMYq1xxQNzuNXBhqmw4Z8DQJ3rPSMkWAtZuZSXpQxc9w_BI7FQldOaoGW6o.; SUB=_2A25xlxyODeRhGeNL61YW8yzIzTSIHXVTe6TGrDV6PUJbkdAKLWulkW1NSSa2EgDEwD6DyS0MdOhlDCS7DVAoMx1o; SUHB=0BLoPb1nuQTHiT; SSOLoginState=1553165534; MLOGIN=1; XSRF-TOKEN=58c003; M_WEIBOCN_PARAMS=featurecode%3Dnewtitle%26oid%3D4346211433851508%26luicode%3D10000011%26lfid%3D102803; WEIBOCN_FROM=1110106030
        upgrade-insecure-requests:1
        user-agent:Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36
        """

        # Convert the raw request-header block into a dict.
        headers = headers_raw_to_dict(headers)
        file_name = '#缺觉一代#环球时报'
        identity = '4362948325812636'
        page1 = get_page(identity, headers)
        page2 = get_attitudes_page(identity, headers)

        attitudes = get_attitudes(identity, page2, headers)
        write_file(file_name, attitudes, '赞')

        comments = get_comments(identity, page1, headers)
        write_file(file_name, comments, '评论')

        # Write the reposts themselves; previously the comments were written
        # a second time under the repost label.
        # NOTE(review): assumes get_statuses returns its results like the
        # other two getters do - confirm against its definition.
        statuses = get_statuses(identity, page1, headers)
        write_file(file_name, statuses, '转发')

    except Exception as e:
        print("Error: ", e)
        traceback.print_exc()
コード例 #5
0
 def download(self, url):
     """Return the body text of *url*, or ``None`` if the request fails.

     The request headers are parsed from the module-level raw ``headers``
     value and a proxy is obtained from ``proxt.proxy()``.
     """
     try:
         return requests.get(
             url,
             headers=copyheaders.headers_raw_to_dict(headers),
             proxies=proxt.proxy()).text
     except Exception:
         # Best effort by design, but do not swallow KeyboardInterrupt or
         # SystemExit the way a bare ``except:`` would.
         return None
コード例 #6
0
ファイル: test.py プロジェクト: TravelNes/Python
 def post_headers(self):
     """Build the POST headers for zhihu, including the XSRF and UDID tokens.

     The home page is fetched through ``self.session``. The XSRF value is
     sliced out of the ``Set-Cookie`` response header and the UDID is taken
     from the first ``xUDID":"..."`` occurrence in the page text.

     Raises:
         KeyError: If the response has no ``Set-Cookie`` header.
         IndexError: If the page text contains no xUDID value.
     """
     home = self.session.get('https://www.zhihu.com', verify=False)
     set_cookie = home.headers['Set-Cookie']
     # Text after the first '=', minus its last six characters.
     xsrf_token = set_cookie.split('=')[1][:-6]
     udid_pattern = re.compile('xUDID":"(.*?)"')
     udid = udid_pattern.findall(home.text)[0]
     result = headers_raw_to_dict(post_headers_raw)
     result.update({
         'authorization': 'oauth c3cef7c66a1843f8b3a9e6a1e3160e20',
         'X-UDID': udid,
         'X-Xsrftoken': xsrf_token,
     })
     return result
コード例 #7
0
ファイル: zhihulogin.py プロジェクト: q7695650/pachong
def getheaders():
    """Parse the UDID and XSRF token out of the zhihu home page source.

    Returns:
        The headers parsed from ``post_headers_raw`` with ``X-UDID`` and
        ``X-Xsrftoken`` added.
    """
    home = s.get('https://www.zhihu.com/')
    state_json = Selector(home.text).css('div#data::attr(data-state)').extract_first()
    # Both values live under the "token" key of the embedded JSON state.
    token = json.loads(state_json)['token']
    udid = token['xUDID']
    xsrf_token = token['xsrf']
    result = headers_raw_to_dict(post_headers_raw)
    result['X-UDID'] = udid
    result['X-Xsrftoken'] = xsrf_token
    return result
コード例 #8
0
def getheaders():
    """Read the UDID and XSRF token from the zhihu home page's embedded state.

    Returns:
        The headers parsed from ``post_headers_raw`` with ``X-UDID`` and
        ``X-Xsrftoken`` filled in.
    """
    landing = s.get('https://www.zhihu.com/')
    selector = Selector(landing.text)
    raw_state = selector.css('div#data::attr(data-state)').extract_first()
    token_info = json.loads(raw_state)['token']
    udid, xsrf_token = token_info['xUDID'], token_info['xsrf']
    post_headers = headers_raw_to_dict(post_headers_raw)
    post_headers['X-UDID'] = udid
    post_headers['X-Xsrftoken'] = xsrf_token
    return post_headers
コード例 #9
0
def get_header():
    """Return request headers built from a pasted cookie plus fixed fields.

    The added keys and values are bytes, and the User-Agent comes from
    ``FakeChromeUA.get_ua()``.
    """
    # Request header copied from the browser; it belongs to a secondary
    # account (captured 2019-07-03 21:25).
    raw_cookie_header = b'''
    Cookie: buvid3=CD3256C4-1351-43A2-98CC-88E12F177A6E40768infoc; LIVE_BUVID=AUTO4715623865079860; sid=i1w7ujpe; DedeUserID=440237330; DedeUserID__ckMd5=4f6573ba8b1afb25; SESSDATA=e816c21a%2C1564978587%2Cbe6fdc71; bili_jct=ea4f385dbad9ab257970aa2d35914653; CURRENT_FNVAL=16; stardustvideo=1
    '''
    result = headers_raw_to_dict(raw_cookie_header)
    result.update({
        b'Content-Type': b'application/x-www-form-urlencoded; charset=UTF-8',
        b'Connection': b'keep-alive',
        b'User-Agent': FakeChromeUA.get_ua().encode('utf8'),
    })
    return result
コード例 #10
0
    def get_content(self, url):
        """Fetch *url* after a delay, through a randomly chosen proxy.

        Sleeps for the duration returned by ``self.random()``, picks one entry
        from ``self.proxy`` and sends the request through it.

        Returns:
            str: The response body text.
        """
        delay = self.random()
        time.sleep(delay)
        item = random.choice(self.proxy)
        # requests expects a {scheme: proxy_url} mapping.
        proxy = {item['proxy_scheme']: item['proxy']}

        # The proxy mapping was built but never handed to requests, so every
        # request went out directly; pass it along.
        html = requests.get(
            url,
            headers=headers_raw_to_dict(self.headers),
            proxies=proxy,
        ).text

        return html
コード例 #11
0
 def get_xsrf(self):
     """Return the value of the ``_xsrf`` cookie set by the zhihu home page.

     Returns:
         The cookie value, or ``None`` when the response carries no
         ``_xsrf`` cookie.
     """
     z1 = self.s.get('https://www.zhihu.com/')
     for c in z1.cookies:
         if c.name == '_xsrf':
             print("c.value: ", c.value)
             return c.value
     # Make the "cookie not present" outcome explicit for callers.
     return None
コード例 #12
0
ファイル: LAGOU.py プロジェクト: LTG01/ArticleSpider
def get_cookies():
    """Request the lagou.com python job list page, then print and return its cookies.

    Returns:
        dict: The result of ``response.cookies.get_dict()``.
    """
    raw_headers = '''
user-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36
    '''
    # headers_raw_to_dict is fed bytes, so encode the pasted text first.
    request_headers = headers_raw_to_dict(raw_headers.encode("utf-8"))
    list_url = 'https://www.lagou.com/jobs/list_python/p-city_0?'
    page = requests.get(url=list_url, headers=request_headers)
    jar = page.cookies.get_dict()
    print(jar)
    return jar
コード例 #13
0
def getheaders():
    """Parse the UDID and XSRF token out of the zhihu home page source.

    The first ``data-state`` attribute found on any ``div`` is decoded as
    JSON and its ``token`` entry supplies both values.

    Returns:
        The headers parsed from ``post_headers_raw`` with ``X-UDID`` and
        ``X-Xsrftoken`` added.
    """
    home = s.get('https://www.zhihu.com/')

    tree = etree.HTML(home.text)
    state_attrs = tree.xpath('//div/@data-state')
    token = json.loads(state_attrs[0])['token']
    udid = token['xUDID']
    xsrf_token = token['xsrf']
    result = headers_raw_to_dict(post_headers_raw)
    # The UTF-8 encode/decode round trip is kept exactly as it was.
    result['X-UDID'] = udid.encode("utf-8").decode("utf-8")
    result['X-Xsrftoken'] = xsrf_token.encode("utf-8").decode("utf-8")
    return result
コード例 #14
0
ファイル: yelp_spider.py プロジェクト: whateversky/yelp
    def get_detail(self, url_suffix):
        """Fetch a yelp business page and extract address and category data.

        Args:
            url_suffix: Path (and query) appended to ``https://www.yelp.com``.

        Returns:
            dict: ``address_detail`` (decoded "addressLines" JSON value),
            ``street_detail`` (raw "formattedCrossStreets" text),
            ``label_list`` (last breadcrumb name of every BreadcrumbList
            JSON-LD script) and ``html`` (page text with line breaks removed).

        Raises:
            AttributeError: If an address pattern is not found in the page.
            Exception: Request and JSON decoding errors propagate unchanged
                so the caller can react (for example by switching proxy).
        """
        url = "https://www.yelp.com" + url_suffix
        headers_str = b"""
            authority: www.yelp.com
            method: GET
            path: /biz/district-social-new-york?hrid=295oISkILBXnFFf_ZVDTQw&osq=bars
            scheme: https
            accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9
            accept-encoding: gzip, deflate, br
            accept-language: zh-CN,zh;q=0.9
            cache-control: max-age=0
            cookie: __cfduid=d3207597b7f9964b5d7382fe9d4fea4361588046941; hl=en_US; wdi=1|FECB55555242BF09|0x1.7a9eb1776f972p+30|7f7c45b3a6763c87; _ga=GA1.2.FECB55555242BF09; _gid=GA1.2.1527499598.1588046945; __adroll_fpc=3ea8f1b4e033921303ce5044baa07e08-1588047141657; bse=9e5786973cd34b82b87984dd21b642b0; recentlocations=New+York; location=%7B%22max_longitude%22%3A+-73.7938%2C+%22address3%22%3A+%22%22%2C+%22min_longitude%22%3A+-74.1948%2C+%22neighborhood%22%3A+%22%22%2C+%22address1%22%3A+%22%22%2C+%22place_id%22%3A+%221208%22%2C+%22min_latitude%22%3A+40.5597%2C+%22county%22%3A+null%2C+%22unformatted%22%3A+%22New+York%2C+NY%2C+United+States%22%2C+%22display%22%3A+%22New+York%2C+NY%22%2C+%22borough%22%3A+%22%22%2C+%22polygons%22%3A+null%2C+%22max_latitude%22%3A+40.8523%2C+%22city%22%3A+%22New+York%22%2C+%22isGoogleHood%22%3A+false%2C+%22language%22%3A+null%2C+%22zip%22%3A+%22%22%2C+%22parent_id%22%3A+975%2C+%22country%22%3A+%22US%22%2C+%22provenance%22%3A+%22YELP_GEOCODING_ENGINE%22%2C+%22longitude%22%3A+-74.0072%2C+%22location_type%22%3A+%22locality%22%2C+%22confident%22%3A+null%2C+%22state%22%3A+%22NY%22%2C+%22latitude%22%3A+40.713%2C+%22usingDefaultZip%22%3A+false%2C+%22address2%22%3A+%22%22%2C+%22accuracy%22%3A+4%7D; sc=7861677975; adc=K_e7_aNgghFLZ2zvjFSNkQ%3ArHulTzqonV_UFGbrz301Tg%3A1588131876; xcj=1|nL8omGMlArkoAcUYz2qIiFk0PEVJdicJty4IkWleRno; __ar_v4=7YX6SJQ4RZAMPB6LZ7CHFF%3A20200428%3A12%7CQB5JPFIKRZDSBOZSULG4YB%3A20200428%3A12%7CBHPKS4B4ONEJJMGH4QCJZR%3A20200428%3A12; _gat_www=1
            sec-fetch-mode: navigate
            sec-fetch-site: none
            sec-fetch-user: ?1
            upgrade-insecure-requests: 1
            user-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36
        """
        headers = copyheaders.headers_raw_to_dict(headers_str)
        response = requests.get(url=url, headers=headers, timeout=10)
        text = response.text

        address_detail = re.search(
            r'"addressLines":(.*?),"formattedCrossStreets"', text).groups()[0]
        street_detail = re.search(
            r'"formattedCrossStreets":(.*?)\},', text).groups()[0]
        # Decode the HTML-escaped ampersand in the cross-street text.
        street_detail = street_detail.replace("&amp;", "&")

        detail = dict()
        # SECURITY: this fragment comes from a remote page, so it is decoded
        # with the JSON parser and never passed to eval().
        detail["address_detail"] = json.loads(address_detail)
        detail["street_detail"] = street_detail

        # Collapse the page onto one line so the non-DOTALL regex below can
        # match scripts that span several lines.
        text = text.replace("\n", "").replace("\r", "")
        script_list = re.findall(
            r'<script type="application/ld\+json">(.*?)</script>', text)
        label_list = list()
        for script_item in script_list:
            script_dic = json.loads(script_item)
            if script_dic["@type"] == "BreadcrumbList":
                # The last breadcrumb element is used as the label.
                label_list.append(
                    script_dic["itemListElement"][-1]["item"]["name"])
        detail["label_list"] = label_list
        detail["html"] = text
        return detail
コード例 #15
0
ファイル: client.py プロジェクト: zyws1000/Python-crawler
 def getHeaders(self):
     """Parse the UDID and XSRF token out of the zhihu home page source.

     The session's headers are first replaced by a dict holding only a
     fixed User-Agent, then the home page is fetched through ``self.s``.

     Returns:
         The headers parsed from ``self.post_headers_raw`` with ``X-UDID``
         and ``X-Xsrftoken`` added.
     """
     user_agent = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'
     self.s.headers = {'User-Agent': user_agent}
     home = self.s.get('https://www.zhihu.com/')
     state_json = Selector(home.text).css('div#data::attr(data-state)').extract_first()
     token = json.loads(state_json)['token']
     udid = token['xUDID']
     xsrf_token = token['xsrf']
     result = headers_raw_to_dict(self.post_headers_raw)
     result['X-UDID'] = udid
     result['X-Xsrftoken'] = xsrf_token
     return result
コード例 #16
0
 def __init__(self):
     """Parse the pasted meituan.com request headers into ``self.headers``."""
     raw_request_headers = b"""
     Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9
     Accept-Encoding: gzip, deflate, br
     Accept-Language: zh-CN,zh;q=0.9
     Cache-Control: max-age=0
     Connection: keep-alive
     Cookie: _lxsdk_s=16fb0ce3a0d-4cf-d9e-cf2%7C%7C1
     Host: www.meituan.com
     Sec-Fetch-Mode: navigate
     Sec-Fetch-Site: none
     Sec-Fetch-User: ?1
     Upgrade-Insecure-Requests: 1
     User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36
     """
     self.headers = headers_raw_to_dict(raw_request_headers)
コード例 #17
0
def getheaders():
    """Parse the UDID from the zhihu home page and the XSRF token from its cookies.

    The UDID comes from the JSON in the ``div#data`` element's ``data-state``
    attribute; the XSRF token is the ``_xsrf`` cookie set for domain
    ``.zhihu.com`` and path ``/``. Both values are printed.

    Returns:
        The headers parsed from ``post_headers_raw`` with ``X-UDID`` and
        ``X-Xsrftoken`` added.

    Raises:
        KeyError: If the response did not set the ``_xsrf`` cookie.
    """
    z1 = s.get('https://www.zhihu.com/')
    sel = Selector(z1.text)
    jsdata = sel.css('div#data::attr(data-state)').extract_first()
    xudid = json.loads(jsdata)['token']['xUDID']
    print(xudid)
    # Read the cookie value through the public cookie-jar API. Regexing the
    # Cookie's string form up to "for" would truncate any token that itself
    # contains "for", and relied on the private ``_cookies`` attribute.
    xsrf = z1.cookies.get('_xsrf', domain='.zhihu.com', path='/')
    if xsrf is None:
        raise KeyError('_xsrf')
    print(xsrf)
    headers = headers_raw_to_dict(post_headers_raw)
    headers['X-UDID'] = xudid
    headers['X-Xsrftoken'] = xsrf
    return headers
コード例 #18
0
ファイル: zhihulogin.py プロジェクト: chennanxu/crawler
def getheaders():
    """Read the XSRF token and UDID from the page at the module-level ``url``.

    The values come from the JSON stored in the ``data-state`` attribute of
    the ``div`` with id ``data``.

    Returns:
        The headers parsed from ``post_headers_raw`` with ``X-UDID`` and
        ``X-Xsrftoken`` added.
    """
    page = s.get(url)
    document = pq(page.text)

    state_attr = document('div').filter('#data').attr("data-state")
    token = json.loads(state_attr)['token']

    xsrf_token = token['xsrf']
    udid = token['xUDID']

    result = headers_raw_to_dict(post_headers_raw)
    result['X-UDID'] = udid
    result['X-Xsrftoken'] = xsrf_token
    return result
コード例 #19
0
ファイル: weibo.py プロジェクト: MuggleK/Decrypt_JS
def login():
    """POST the weibo login form to login.sina.com.cn and print the status code.

    The form values are read from the module-level ``item``, ``_su`` and
    ``_sp`` names, and the captcha answer comes from ``get_captcha()``.
    """
    endpoint = 'https://login.sina.com.cn/signup/signin.php?'
    payload = {
        'entry': 'weibo',
        'gateway': '1',
        'from': '',
        'savestate': '7',
        'qrcode_flag': 'false',
        'useticket': '1',
        'pagerefer': '',
        'wsseretry': 'servertime_error',
        'pcid': item['pcid'],
        'door': get_captcha(),
        'vsnf': '1',
        'su': _su,
        'service': 'miniblog',
        'servertime': item['servertime'],
        'nonce': item["nonce"],
        'pwencode': 'rsa2',
        'rsakv': item["rsakv"],
        'sp': _sp,
        'sr': '1920 * 1080',
        'encoding': 'UTF - 8',
        'cdult': '2',
        'domain': 'weibo.com',
        'prelt': '46',
        'returntype': 'TEXT',
    }
    raw_request_headers = b'''
    Accept: */*
    Accept-Encoding: gzip, deflate, br
    Accept-Language: zh-CN,zh;q=0.9
    Connection: keep-alive
    Content-Length: 613
    Content-Type: application/x-www-form-urlencoded
    Cookie: SINAGLOBAL=106.91.209.232_1579146295.789444; Apache=106.91.209.232_1579146295.789445; login=6ec7f45fe2ef20765c00bb7efc116593; U_TRS1=00000036.f4b94bb5.5e20083b.9d21c035; U_TRS2=00000036.f4c14bb5.5e20083b.39aa7d6d; ULOGIN_IMG=gz-4dbbdb6eda12f62d45b33c227fe2ad378491
    Host: login.sina.com.cn
    Origin: https://weibo.com
    Referer: https://weibo.com/login.php
    Sec-Fetch-Mode: cors
    Sec-Fetch-Site: cross-site
    User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36
    '''
    request_headers = headers_raw_to_dict(raw_request_headers)
    response = requests.post(url=login_url_or(endpoint), data=payload, headers=request_headers) if False else requests.post(url=endpoint, data=payload, headers=request_headers)
    print(response.status_code)
コード例 #20
0
 def get_headers(self):
     """Build the zhihu login headers, including the XSRF token.

     The home page is fetched and any ``_xsrf`` cookie value is printed; the
     token placed in the headers comes from ``self.get_xsrf()``.

     Returns:
         The headers parsed from ``post_headers_raw`` with ``content-type``,
         ``x-zse-83`` and ``x-xsrftoken`` set.
     """
     home = self.s.get('https://www.zhihu.com/')
     result = headers_raw_to_dict(post_headers_raw)
     xsrf_cookies = (cookie for cookie in home.cookies if cookie.name == '_xsrf')
     for cookie in xsrf_cookies:
         print("c.value: ", cookie.value)
     xsrf_token = self.get_xsrf()
     result['content-type'] = 'application/x-www-form-urlencoded'
     result['x-zse-83'] = '3_1.1'
     result['x-xsrftoken'] = xsrf_token
     return result
コード例 #21
0
def parse_header():
    """Print a pasted raw header block as a dict and as ``"key":"value",`` lines.

    Keys and values are decoded as UTF-8 for the line-by-line output.
    """
    # Replace this block with the headers to convert.
    raw_block = b'''
    Accept: */*
    Accept-Encoding: gzip, deflate, br
    Accept-Language: zh,en;q=0.9,en-US;q=0.8
    Cache-Control: no-cache
    Connection: keep-alive
    Cookie: _ga=GA1.2.1120330993.1533803771; device_id=45dc0a51a26fc3078e5d8636d5141178; aliyungf_tc=AQAAABUPpRGD+w0AOnFoypiKi1AgLha3; Hm_lvt_1db88642e346389874251b5a1eded6e3=1538060166,1539759418; s=ev17xxecme; _gid=GA1.2.489835841.1540172180; remember=1; remember.sig=K4F3faYzmVuqC0iXIERCQf55g2Y; xq_a_token=0a093a7b60eeaf5abb3468ebb1827ab37492829a; xq_a_token.sig=Ugrl-_BEM5Ed2K1tThP4B9xd-WI; xqat=0a093a7b60eeaf5abb3468ebb1827ab37492829a; xqat.sig=cC3oDwhUgpI-cY_nx4o-fIir8ag; xq_r_token=7147aa65f965bdfd68872710923386e22d547761; xq_r_token.sig=WZ_zkORdsy2K2ngXNlFRV6DkcCg; xq_is_login=1; xq_is_login.sig=J3LxgPVPUzbBg3Kee_PquUfih7Q; u=1733473480; u.sig=2sMTnVmBVOASyCZs6lbVBQ6Zfgs; bid=a8ec0ec01035c8be5606c595aed718d4_jnl1zufy; _gat_gtag_UA_16079156_4=1; Hm_lpvt_1db88642e346389874251b5a1eded6e3=1540258247
    Host: xueqiu.com
    Pragma: no-cache
    Referer: https://xueqiu.com/2227798650/115496801
    User-Agent: Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36
    X-Requested-With: XMLHttpRequest
    '''

    parsed = headers_raw_to_dict(raw_block)
    print(parsed)
    for raw_key, raw_value in parsed.items():
        key_text = str(raw_key, 'utf8')
        value_text = str(raw_value, 'utf8')
        print('"{}":"{}",'.format(key_text, value_text))
コード例 #22
0
def parse_header():
    """Convert a pasted raw header block and print it in two forms.

    First the parsed dict is printed, then one ``"key":"value",`` line per
    header with keys and values decoded as UTF-8.
    """
    # Replace this block with the headers to convert.
    pasted_headers = b'''
    Accept: */*
    Accept-Encoding: gzip, deflate, br
    Accept-Language: zh,en;q=0.9,en-US;q=0.8
    Cache-Control: no-cache
    Connection: keep-alive
    Cookie: _ga=GA1.2.1120330993.1533803771; device_id=45dc0a51a26fc3078e5d8636d5141178; aliyungf_tc=AQAAABUPpRGD+w0AOnFoypiKi1AgLha3; Hm_lvt_1db88642e346389874251b5a1eded6e3=1538060166,1539759418; s=ev17xxecme; _gid=GA1.2.489835841.1540172180; remember=1; remember.sig=K4F3faYzmVuqC0iXIERCQf55g2Y; xq_a_token=0a093a7b60eeaf5abb3468ebb1827ab37492829a; xq_a_token.sig=Ugrl-_BEM5Ed2K1tThP4B9xd-WI; xqat=0a093a7b60eeaf5abb3468ebb1827ab37492829a; xqat.sig=cC3oDwhUgpI-cY_nx4o-fIir8ag; xq_r_token=7147aa65f965bdfd68872710923386e22d547761; xq_r_token.sig=WZ_zkORdsy2K2ngXNlFRV6DkcCg; xq_is_login=1; xq_is_login.sig=J3LxgPVPUzbBg3Kee_PquUfih7Q; u=1733473480; u.sig=2sMTnVmBVOASyCZs6lbVBQ6Zfgs; bid=a8ec0ec01035c8be5606c595aed718d4_jnl1zufy; _gat_gtag_UA_16079156_4=1; Hm_lpvt_1db88642e346389874251b5a1eded6e3=1540258247
    Host: xueqiu.com
    Pragma: no-cache
    Referer: https://xueqiu.com/2227798650/115496801
    User-Agent: Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36
    X-Requested-With: XMLHttpRequest
    '''

    header_map = headers_raw_to_dict(pasted_headers)
    print(header_map)
    for name_bytes, value_bytes in header_map.items():
        name = str(name_bytes, 'utf8')
        value = str(value_bytes, 'utf8')
        print('"{}":"{}",'.format(name, value))
コード例 #23
0
def get_info_list(page=1):
    """Collect python job postings from lagou.com for pages 1..*page*.

    Every page is requested with fresh cookies from ``get_cookies()``. The
    decoded response is printed, and a page that lacks the expected structure
    is reported and skipped.

    Args:
        page: Number of result pages to request.

    Returns:
        list: A header row followed by one row per job: city, company name,
        company size, education, position name, salary and work experience.
    """
    raw_headers = '''
   referer: https://www.lagou.com/jobs/list_python/p-city_0?
    user-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36

    '''
    request_headers = headers_raw_to_dict(raw_headers.encode("utf-8"))

    rows = [['城市', '公司名', '公司规模', '学历', '职位名称', '薪资', '工作时间']]
    api_url = 'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false'
    # Response fields, in the same order as the header row above.
    field_names = (
        'city', 'companyFullName', 'companySize', 'education',
        'positionName', 'salary', 'workYear',
    )
    for page_no in range(1, page + 1):
        form = {"first": "true", "pn": str(page_no), "kd": "python"}
        payload = requests.post(api_url,
                                data=form,
                                headers=request_headers,
                                cookies=get_cookies()).json()
        try:
            print(payload)
            for job in payload['content']['positionResult']['result']:
                rows.append([job[name] for name in field_names])
        except Exception as e:
            print(str(e))
    return rows
コード例 #24
0
ファイル: yelp_spider.py プロジェクト: whateversky/yelp
 def get_page(self, start_number):
     """Request one yelp search-snippet page and return its decoded JSON.

     Args:
         start_number: Value substituted for the ``start`` query parameter.

     Raises:
         Exception: Request and JSON decoding errors are re-raised unchanged.
     """
     url = f"https://www.yelp.com/search/snippet?find_desc=bars&find_loc=New%20York%2C%20NY%2C%20United%20States&start={start_number}&parent_request_id=dfcaae5fb7b44685&request_origin=user"
     # NOTE(review): these look like captured response headers rather than
     # request headers - confirm that sending them is intended.
     raw_header_block = b"""
         cache-control: max-age=0, must-revalidate, no-cache, no-store, private
         cache-control: no-transform
         cf-cache-status: DYNAMIC
         cf-ray: 58b26184fbd76c86-SJC
         cf-request-id: 02635b471c00006c86b019b200000001
         content-encoding: gzip
         content-security-policy: report-uri https://www.yelp.com/csp_block?id=bf59639897830a99&page=enforced_by_default_directives&policy_hash=7b6f2d6630868fdb2698dac44731677c&site=www&timestamp=1588093661; object-src 'self'; base-uri 'self' https://*.yelpcdn.com https://*.adsrvr.org https://6372968.fls.doubleclick.net; font-src data: 'self' https://*.yelp.com https://*.yelpcdn.com https://fonts.gstatic.com https://connect.facebook.net https://cdnjs.cloudflare.com https://apis.google.com https://www.google-analytics.com https://use.typekit.net https://player.ooyala.com https://use.fontawesome.com https://maxcdn.bootstrapcdn.com https://fonts.googleapis.com
         content-security-policy-report-only: report-uri https://www.yelp.com/csp_report_only?id=bf59639897830a99&page=csp_report_frame_directives%2Cfull_site_ssl_csp_report_directives&policy_hash=9dd00a1a6fbb402584b7ce0c1fdb4d14&site=www&timestamp=1588093661; frame-ancestors 'self' https://*.yelp.com; default-src https:; img-src https: data: https://*.adsrvr.org; script-src https: data: 'unsafe-inline' 'unsafe-eval' blob:; style-src https: 'unsafe-inline' data:; connect-src https:; font-src data: 'self' https://*.yelp.com https://*.yelpcdn.com https://fonts.gstatic.com https://connect.facebook.net https://cdnjs.cloudflare.com https://apis.google.com https://www.google-analytics.com https://use.typekit.net https://player.ooyala.com https://use.fontawesome.com https://maxcdn.bootstrapcdn.com https://fonts.googleapis.com; frame-src https: yelp-webview://* yelp://* data:; child-src https: yelp-webview://* yelp://*; media-src https:; object-src 'self'; base-uri 'self' https://*.yelpcdn.com https://*.adsrvr.org https://6372968.fls.doubleclick.net; form-action https: 'self'
         content-type: application/json; charset=utf-8
         date: Tue, 28 Apr 2020 17:07:42 GMT
         expect-ct: max-age=604800, report-uri="https://report-uri.cloudflare.com/cdn-cgi/beacon/expect-ct"
         expires: Tue, 28 Apr 2020 17:07:41 GMT
         pragma: no-cache
         referrer-policy: origin-when-cross-origin
         server: cloudflare
         status: 200
         strict-transport-security: max-age=31536000; includeSubDomains; preload
         vary: User-Agent
         vary: Accept-Encoding
         x-b3-sampled: 0
         x-content-type-options: nosniff
         x-mode: ro
         x-node: www_all
         x-node: 10-69-179-105-uswest2bprod-9c0a6478-895a-11ea-98c5-b6d34d770
         x-proxied: 10-69-159-164-uswest2bprod
         x-routing-service: 10-69-187-145-uswest2bprod; site=www
         x-xss-protection: 1; report=https://www.yelp.com/xss_protection_report
         x-zipkin-id: 9a87fa4730749a04
     """
     request_headers = copyheaders.headers_raw_to_dict(raw_header_block)
     try:
         reply = requests.get(url=url, headers=request_headers, timeout=10)
         payload = json.loads(reply.text)
     except Exception:
         # Placeholder for switching proxy when one is configured.
         raise
     return payload
コード例 #25
0
def get_html(url, params):
    """Fetch *url* with the pasted browser headers and return the page text.

    The response encoding is set to ``apparent_encoding`` before the text is
    read.

    Args:
        url: Address to request.
        params: Query parameters passed to ``requests.get``.
    """
    raw_browser_headers = b'''
    accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9
    accept-language: zh-CN,zh;q=0.9
    cache-control: max-age=0
    cookie: x-zp-client-id=448f2b96-6b3a-48e3-e912-e6c8dd73e6cb; adfbid=0; adfbid2=0; Hm_lvt_38ba284938d5eddca645bb5e02a02006=1617108464; sajssdk_2015_cross_new_user=1; sts_deviceid=178832cf3f2680-0b20242883a4a9-6618207c-1296000-178832cf3f3780; sts_sg=1; sts_chnlsid=Unknown; zp_src_url=https%3A%2F%2Fwww.google.com.hk%2F; FSSBBIl1UgzbN7N443S=kc8_mcJe5xsW.UilCMHXpkoWeyQ8te3q7QhYV8Y8aA0Se9k9JJXcnQVvrOJ9NYDP; locationInfo_search={%22code%22:%22538%22%2C%22name%22:%22%E4%B8%8A%E6%B5%B7%22%2C%22message%22:%22%E5%8C%B9%E9%85%8D%E5%88%B0%E5%B8%82%E7%BA%A7%E7%BC%96%E7%A0%81%22}; zp_passport_deepknow_sessionId=a2ea7206sade7641768f38078ea6b45afef0; at=02a0ea392e1d4fd6a4d6003ac136aae0; rt=82f98e13344843d6b5bf3dadf38e8bb2; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%221071739258%22%2C%22first_id%22%3A%22178832cf3bd20f-0be4af1633ae3d-6618207c-1296000-178832cf3be4b8%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_referrer%22%3A%22%22%7D%2C%22%24device_id%22%3A%22178832cf3bd20f-0be4af1633ae3d-6618207c-1296000-178832cf3be4b8%22%7D; urlfrom=121126445; urlfrom2=121126445; adfcid=none; adfcid2=none; ZL_REPORT_GLOBAL={%22//www%22:{%22seid%22:%2202a0ea392e1d4fd6a4d6003ac136aae0%22%2C%22actionid%22:%2243ffc74e-c32e-42ee-ba04-1e24611fecde-cityPage%22}}; LastCity=%E4%B8%8A%E6%B5%B7; LastCity%5Fid=538; Hm_lpvt_38ba284938d5eddca645bb5e02a02006=1617111259; zpfe_probe_token=ae612f12s0feb44ac697a7434fe1f22af086; d4d6cd0b4a19fa72b8cc377185129bb7=ab637759-b57a-4214-a915-8dcbc5630065; selectCity_search=538; 
FSSBBIl1UgzbN7N443T=5pRoIYmxrZTzxVozDFEYjcClKKRpXbK9zf0gYH4zU5AyLqGUMT5fnVzyE0SMv7ZDGFLY0HV8o6iXLPBGBBTJhDhz3TIaQ3omm324Q2m4BSJzD0VgZzesPGIXudf636xQZkuag1QJmdqzgFLv6YPcKq.ukZPymp1IazfsOec5vBcMT9yemSrYb9UBk2XF.rZIeM3mIOBqpNii26kDRzjxHP5TsGLJzWaaZvklHnh61NT4acHPQt3Lq1.w2X4htg9ck.uGhzHt9w954igFEqhLCmggLi9OjPUaiU8TA4yn1oR1T5Qmjm1I5AA0PIu76e0T2u6w2f7thMkv6E7lkoDggrRMta0Z_uVEP3Y1sS8hJw7ycE2PTVtVassRyoN6UuTBHtSZ
    sec-ch-ua: "Google Chrome";v="89", "Chromium";v="89", ";Not A Brand";v="99"
    sec-ch-ua-mobile: ?0
    sec-fetch-dest: document
    sec-fetch-mode: navigate
    sec-fetch-site: same-origin
    sec-fetch-user: ?1
    upgrade-insecure-requests: 1
    user-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 11_2_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36
    '''
    # Turn the headers copied from the browser into a dict.
    request_headers = headers_raw_to_dict(raw_browser_headers)
    reply = requests.get(url, headers=request_headers, params=params)
    reply.encoding = reply.apparent_encoding
    return reply.text
コード例 #26
0
def get_html(url, headers, t=''):
    """Fetch a page, save a copy under ``./html`` and return its source.

    The saved file is named after the text of the page's ``.shop-name``
    element followed by *t*.

    Args:
        url: Address to request.
        headers: Raw request headers as text; encoded to UTF-8 bytes and
            parsed with ``headers_raw_to_dict``.
        t: Optional suffix appended to the file name.

    Returns:
        str: The response body text.
    """
    headers = headers_raw_to_dict(bytes(headers, encoding="utf-8"))
    html_response = requests.get(url=url, headers=headers).text

    # One call is enough: exist_ok makes it a no-op when the directory exists.
    os.makedirs('./html', exist_ok=True)

    selector = parsel.Selector(html_response)
    shopname = selector.css('.shop-name::text').get()
    if shopname is None:
        # No .shop-name element on the page; still save the page instead of
        # failing on the string concatenation below.
        shopname = 'unknown'

    name = './html/' + shopname + str(t) + '.html'
    with open(name, 'w', encoding='utf-8') as f:
        f.write(html_response)
    return html_response
コード例 #27
0
def get_pic(keyword):
    """
    Collect Bing image-result links for ``keyword`` and dump them to a file.

    The first result page is read from ``/images/search``; further pages are
    read from ``/images/async``. Every ``a.iusc`` href found is prefixed with
    ``https://cn.bing.com``. The collected links are printed and written as
    JSON to ``pic_bing_{keyword}.txt`` in the current directory.

    Request and parsing errors are printed and the affected page is skipped.

    :param keyword: search term.
    :return: None; the result is only printed and written to the file.
    """
    pic_url_list = []
    to_headers = b"""
    accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8
    accept-encoding: gzip, deflate, br
    accept-language: zh-CN,zh;q=0.9
    cache-control: max-age=0
    upgrade-insecure-requests: 1
    user-agent: Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36
    """
    headers = headers_raw_to_dict(to_headers)
    first_url = f"https://cn.bing.com/images/search?q={keyword}"
    try:
        first_webdata = requests.get(first_url, headers=headers).text
        first_datas = etree.HTML(first_webdata)
        xpath_datas = first_datas.xpath("//a[@class='iusc']/@href")
        for data in xpath_datas:
            img_url = "https://cn.bing.com" + data
            pic_url_list.append(img_url)
    except Exception as e:
        print(e)

    async_url = "https://cn.bing.com/images/async"
    page_size = 35
    for i in range(1, 30):
        try:
            # Search for the requested keyword and advance the offset on every
            # iteration instead of re-fetching one fixed page.
            # NOTE(review): assumes `first` is a 1-based result offset
            # (the previously hard-coded 71 equals 35 * 2 + 1) - confirm.
            params = {
                "q": keyword,
                "first": page_size * i + 1,
                "count": page_size,
                "relp": page_size,
                "lostate": "r",
                "mmasync": 1,
            }
            web_data = requests.get(async_url, params=params, headers=headers).text
            datas = etree.HTML(web_data)
            xpath_datas = datas.xpath("//a[@class='iusc']/@href")
            for href in xpath_datas:
                img_url = "https://cn.bing.com" + href
                pic_url_list.append(img_url)
        except Exception as e:
            # Best effort: report the failure and go on with the next page.
            print(e)
        # Short random pause (0.1-1.0 s) between page requests.
        time.sleep(random.randint(1, 10) / 10)
    print(pic_url_list)
    pic_dict = {"关键词": keyword, "图片数量": len(pic_url_list), "图片链接列表": pic_url_list}
    with open(f"pic_bing_{keyword}.txt", "w", encoding='utf-8') as f:
        f.write(json.dumps(pic_dict, ensure_ascii=False))
コード例 #28
0
ファイル: zhihu_login.py プロジェクト: lranc/Login-or-verify
    def getheaders(self, session):
        """
        Build the request headers, including the uuid and xsrf token.

        Fetches the zhihu home page through ``self.session``, reads the JSON
        stored in the ``data-state`` attribute of ``div#data`` and copies its
        ``token.xUDID`` and ``token.xsrf`` values into the ``X-UDID`` and
        ``X-Xsrftoken`` headers.

        :param session: not used by this method; the request is made with
            ``self.session``.
        :return: header dict produced by ``headers_raw_to_dict`` plus the two
            token headers.
        """
        home_page = self.session.get('https://www.zhihu.com/')
        state_json = Selector(home_page.text).css('div#data::attr(data-state)').extract_first()
        # Both values live under the same "token" object, so parse once.
        token = json.loads(state_json)['token']
        udid_value, xsrf_value = token['xUDID'], token['xsrf']

        post_headers_raw = b'''
        accept:application/json, text/plain, */*
        Accept-Encoding:gzip, deflate, br
        Accept-Language:zh-CN,zh;q=0.9,zh-TW;q=0.8
        authorization:oauth c3cef7c66a1843f8b3a9e6a1e3160e20
        Connection:keep-alive
        DNT:1
        Host:www.zhihu.com
        Origin:https://www.zhihu.com
        Referer:https://www.zhihu.com/signup?next=%2F
        User-Agent:Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36
        '''
        result = headers_raw_to_dict(post_headers_raw)
        result.update({'X-UDID': udid_value, 'X-Xsrftoken': xsrf_value})
        return result
コード例 #29
0
import requests as rq
from copyheaders import headers_raw_to_dict
from fontTools.ttLib import TTFont
from lxml import etree
import lxml.html as H

# Raw request headers as a bytes block, one "name: value" pair per line,
# including a captured session cookie.
# NOTE(review): presumably copied from a browser session; the cookie is
# hard-coded here - confirm it is not a live credential.
headers = b"""
accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8
accept-encoding: gzip, deflate, br
accept-language: zh-CN,zh;q=0.9
cache-control: max-age=0
cookie: f=n; commontopbar_new_city_info=5%7C%E8%8B%8F%E5%B7%9E%7Csu; commontopbar_ipcity=hz%7C%E6%9D%AD%E5%B7%9E%7C0; id58=c5/nn1xe7U4ywTecgnC7Ag==; 58tj_uuid=9f7077b1-1975-463b-85ac-9527b4a4ac3f; sessionid=6857625c-ff00-47aa-b1e2-2fc07112d5ab; param8616=0; param8716kop=1; JSESSIONID=AFADDF58F973FB4989A495AC949578A8; new_uv=2; utm_source=; spm=; init_refer=; jl_list_left_banner=1; als=0; Hm_lvt_a3013634de7e7a5d307653e15a0584cf=1554791615; Hm_lpvt_a3013634de7e7a5d307653e15a0584cf=1554791615; wmda_uuid=141ea88c5ba3413bc1a20a28f391118a; wmda_new_uuid=1; wmda_session_id_1731916484865=1554791616012-0246c2d1-6521-4700; wmda_visited_projects=%3B1731916484865; f=n; new_session=0; ppStore_fingerprint=CA099D7310246607A7B358E3962CB00581402866AC956EC4%EF%BC%BF1554791718315
user-agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36
"""

# Rebind `headers` from the raw bytes to the value returned by headers_raw_to_dict.
headers = headers_raw_to_dict(headers)


def generate_font_file(path="test.ttf"):
    """Decode the embedded base64 payloads and write them to disk.

    ``a`` is decoded and written to ``path``; ``b`` (an empty payload here)
    is decoded and written to ``"2set" + path``.

    :param path: name of the first output file; the second file name is this
        string prefixed with ``2set``.
    """
    # Base64 payload; its first bytes decode to "wOFF".
    # NOTE(review): presumably a WOFF web font despite the .ttf default name - confirm.
    a = b"d09GRgABAAAAABskAAsAAAAAJlgAAQAAAAAAAAAAAAAAAAAAAAAAAAAAAABHU1VCAAABCAAAADMAAABCsP6z7U9TLzIAAAE8AAAARAAAAFZtBmZoY21hcAAAAYAAAAHzAAAFThOUhbpnbHlmAAADdAAAFEMAABl0ai65rmhlYWQAABe4AAAALwAAADYYxE1/aGhlYQAAF+gAAAAcAAAAJBFsBhhobXR4AAAYBAAAADAAAAC8RisAAGxvY2EAABg0AAAAYAAAAGCQCpaObWF4cAAAGJQAAAAfAAAAIAFCAGFuYW1lAAAYtAAAAXIAAALQd5CEoXBvc3QAABooAAAA+wAAAdFb3rCyeJxjYGRgYOBikGPQYWB0cfMJYeBgYGGAAJAMY05meiJQDMoDyrGAaQ4gZoOIAgCKIwNPAHicY2Bk+8g4gYGVgYNVmD2FgYGxCkKzCjK0MO1kYGBiYGVmwAoC0lxTGBwYKn4c4Cj/+4LhM0c5kwRQmBEkBwDIaQw3eJzN1DtP22AYxfF/Ejf0Qtu0TWmb3unFvYW5YurCwBdATA1i4AvAwMDEwILYmLIhkNjZEAsSTEhsiIUFOzXGwsFIdoCNHvOwd4pUR7/ItuQ31nvOE+AWUJIf4ui0j4LOKFZ0t3B9v8Td6/tO4Y+uf/NLz7xh+nDeK3mut+EX/YY/5y/4zdZSay8YCoaD2aNmWA0Hw9Xj0ciJ1qKDk614PF6Jt9u1tns6llSSRrKcrJ/NpPV0JJ3KJrPdbL8z0dk5X7woX25eXel3ur1+946C9qh7n3z9ojJxlEKZHm5zR/nco5f7POAhFR7xmCdUeUofz3jOC2q85BWvldpb3vGefj7wkU98xuULX/nGd+VcZ0CLl7u6N//YuP/k6M2/ij9vrrQrTN/QKx7OG6WAVzL5BHmuyafI2zD5dPlFo7TwG0a54c8ZJYi/YJQlftPkU9daMsqX1p7J3y4YMsqcYNgofYJZox5w1DRqBGHVqBuEg0YtIVw16gvHo0bNIXKMOkS0ZtQmogOjXnGyZdQw4nGjrhGvGLWOeNuof7RrRk2k7Rp1ktMxo3aSVIx6StIwaizJslF3SdaNWszZjFGfSetGzSYdMeo46ZRR28kmjXpPtms0AWT7RrNAZ8JoKujsGM0H54sm/6e8KBvNDJebhoG/YToZWwB4nFVYD1hT57k/73dOzkFEmoQQ0HEZ4b+IjJGQIDfjQaZIGaM8ljIuZVzqGKOMUkspZZRS6liapogxjTRS6ihFR6mj1jnnlDq0PI4yb6vUS6ml1iqlmeUyRWcRznm530lAW8I5J+eQ8H3v977v78/HAMMsTjNJjIYhDGPSB2pCNbEM/QH5xG3hVzG+9IYwbISaCdAl67gtt/CfurNRoPlKYthxhRMLpHX4NajIALz8x6XvEZH/gfw9ncofhGRjpEmvIqkJn/78GQj+SbZifUJN0GD4now755fHEXzoON+TP28wRfACREB0TIQ2UBWh0oE2SG80gU7Fb7xPsUaD8DgSdQjv+ze4D943qARDIJwHwpHnDm+PX3iUa4t9tuoDUeBXLVxs2LRLz4XfuckwPI1RIYzzpxiBCWZCmVgmnkmkQ+tUEcm6wAiWDqPSRITHRMdo9UnJKp1Cp6KPQRcdY9IG6QJ1QA+WfhKaFGqXGOrKcvEX3Du6/TTHtjXMF4M/a5V87NIk64yPbd9gZH89l6c4J4ayVwYkF/cVBqeniRzbJxXe5jYV+TcQZQuWFPDmhzaaf/F4yyJzSBzp6/OsA94ULvJ/ZL5Pb3iBAToHnbwCyToVoetBp0vf6eWnjHDRkI8heBlG0NiJ0XBxUbqRayQbYI34CVveyZaLH0EagKEQniJbRHcrlkNHKxuMHMZmJ8BHZIUTKtDllLzjLj4uTHjHNek8w+l4gY4XEMQLSwMuH7Fsl2QmQ9L13/0xLgPW9PQkp8Opg0SUOPrQzFbM5SgAO+CD66czssEMw/EOC/RiYQ
cmdSBpZzg61kPCFP82o2RCGD2TSoeny8oLXJAQGcUromMiV4BnCt7hwGjiBV4RI19i1AFBhDUZTUEM0M8Gcgo2kR3acQjWf1O+yOD5GdwVm0IyoUysJamYy85hIUmcn8OqoWt+/G3sjnz19dOV0RePu46fvJn7gBMRY6ELQnm8jhbFX+yOE12vtlsP9b63OyvjJIzMJ4DbbsdgO+ldE52fHm52dt30W2GFcOg1YwdO8s+9nI2lkn9+1taMlDJPydP83RCc/ItMFJPGeJIVCDqWzj/GU1hBWpMxIMKTyOWLLhnUAXeTqqK1RpedZpwI0XyYxHFacYS1R/tmZx7L7/LXNjv61GqNu1RypySCmHulEvPjDdC0DWel6LECnAKDHUJwssBSX1dZa69yhShetjtQKfm68wsJX5Cv5vjj12BQqsLaXCNUkUSMx6GMNAiGehf6QBY0W8GK9Vbsz3JYy1utC7NcPtrHC2QYoLHdERr4B5nVtEp+QDPHgFKQi8RoMq6mGaM3d5sHPM3D0rjoSx+o875oNaugPj7RlZrSFBvGd8+19fsEn65om+8Aji2U0l09hGARHOiEA1hEuIUStkwaZ59A7qR7LNs8eLoXulmfO0OIHOHN5X4WhUPslCab2ZER66FD1hGrdAb88Bad6wpPH13m32CCmAgmhTEz6cwmJovJZfK91eZpedV3pqcCPc2Bnj6mL1h+fBcRfgQ6OZ0KekTpAteSZB3ryRV48IE/gDY7ttkJkTpJmZ00S82En3OPOAcHnSNwjSNOyMCT12t6fZTdTdULyMZCHdokhBpsY+vRBnWiVX6fWmRvLq1jU+pb0hyV8GF1c3M7q2xpdrubW6Rg0kUM0ln7goablmpra5unrnDp+f5Vo3DFarWixSLGlZWUV5YpBjPM5WlGxpuzxRWCm3+LrsJ/MhuZLXJN0u6hPR2ou9fY34a9ZKMpglV7U2ky0jOtThmAAomgVZjUQQKJol+PMZI5WPfDBFf8OiKJHz5cCDkumEX/aUuDn7q1ogn5Xz08l5B6O7dRGbKtwJYY4irKRrGdpFsje9jmpoG6HQeltB29LssBxc5WR9yzT7fd+QIKsdcuNVwjW/OVTaTIceW2OIldCf5+WwuCffxmu89ALITeqRHWtGJ61fjtkkXmTCGET+fNDRYdo7GyNOf/FK7z+xkVrc8oGqlepWTVcjsRGVQC1CylEUaQJx9DAzeCNkAgMTzf0zY3Yfufs19hYkIazPUf5yQn74thdRnmmhLSMwE+lp7Ozmb+aXRJH+OJRWauCb/E18+k5bghCeJJkaTcYcbR3ATjvvw6RfV8ZYui+AKOF4LhggcPvqR1qKDMGsPQZjcGeHCNyOu/1P8KvUqj1yVRUKNlJPBrIUyVzirjfKQCvlCc949mR0U/l5Qfn07yj0gajul2gNqqVvrkFGuUayiCHcB5du85YDo7F5lznVIxtm7KhjhSapdm99XVHA0JPVM2DBovt9wRpvlXGS1D20KeiFqlV7J0JmoFpRqWzieE3bBR7GN/ruhXsE+LljZWW54kPTXcKeljH4PHyeedsApUeBP/1Sl+sbgtHXjye+l1dBjvaYXNfAzlVgYM0RHhvJCcBvokLlDDC/5EbqdkQxqY0oB9aJ+1dOihnFM7RoF57eTp3xpYIibd33v46O+4p5/dte3hkw/kfzH41vxvLbWVOf0/tZx4p7H5lPzvFTSGq8Ik5algJoHZQLs6k67qMn7S3g38Nn6COshI+YLWLS9ExniYxRS9conR5Qwo6B9pgUd5vhQyWoAzEOxwuzPbjFVppY4Kyxp2umqfs7SjpeMKFCUa0XVxCi2xKVBzqa/2AAmJy7LkGneQHQs5EG07NGjOOEfCqi4ONfTDO1AOpTbYh9tsx41H8vNKbAsVXDz2DxVAP04ePQoJKXhgcKA0tbhpcMKRnrfj7CBkkgM1zlRbY2pr/CY8h3Fx4HOkyH2rYta7tosgnOWB8ZerSOYJRiFEgt
yYBsElGophBIx1OIJ5R8VrPPRgTjPOjnfPgM9CJV9MFQ+DC0IW/2NmJV25ECaMifToH8aLc14F5FVCKjUtS7UKPIpHdfeQBQFRkkwn/RV9nOjm85zzExzOG9MuKQbmJxSRaITDmOs9RJ/l96xpcnJhqLZWyJIc1dhNiac/Ix1iSQiZdS/EUt7pQjeWQrAVgmmcPnSenwnnPDokillL9ZlhCb09XJp8F7Dvccry7BUmTl4R+nc2iJNT7Y1ARutkz/whjDjFFhKKYXDZRWJdcBnDSLjYSNrbFyrb2bZU6IXwdLSJsyRWGucLsQf7Q+KggnMh7WPvIVUCIpEP9sl+66B1gP7QS784ZDZvq99eZm1SpA7OTza1F5tzCtmkARwaaB5An4ElbbA4xf0X7Q9fRk2zqGR0YSolExFO38rvdGEPgxIITOPX89MoDcMGMOAoniJu6ID94hTW4yNgg+elV8iT5EVvTbxE1+oN5r67mu2uThsg10QrW3+ErRNt5Nqd+nVsyQFSK7UeELu7lrBSon30Kq0FE/MjD1Z6vutFggjVEhhRqExWLRUGS1GLLBMFlTG6ALk0IiGIUZBzMx3sdun8q3Ao14Apu9+pHPBXW05nx1VlzUgW0uzEc2EwFuwfLG496qftL21Ra/0wOhWOoYYzQgHE1eFZ7GQfYHPEI3YMWsxJhweJTtpqqCrO2FRriLNllRAH2yiVplG0qzmOU3xOibKlE9aQStiKthEI2Y5TE964Fr/iHqBrHEwZj9Z3gMFoSgAZdpJCQcahUFD6Q1gCmDQ0CmOyITqMD9Bok4yG6HB+W1fTL+ueeP/myPYny5v2Nbnx86nmwb6dtrffwRuHUpwX9r782W44ZLmSuOFwRfXRil8dfbz8T6k/vILfnK+tHW21HXjzhea33yK5FXvazzl2efLzmHBScYMJuKepk7+lozOouikgfZIv3yXx5LiUSaVzyypS0UXMXV3SUJfk6lrC1cUvuVDqjShuUyCnKUjWhhmVFGAFGQpYUxLHheJHePHXr8Hmv3/4ruOpDeSCdD5s521YDZeu4rWNw+V/gtj9q1j1Qbyf//fSOoHQyP+B9pnXBZFvMyWV2OogD6ouQ6XMpUvyTptMbyI4TfbopMN2oHscR2OzIW1sfLRxvN9WIh4hsZk5trKUahJN0vbNVI75qbtrOjDB5eLeRwc24sho5ewUHppOyZ2hqU8AbRFOXgMGyh22xsyu2hQHHL3TgSIh/NZSDXEpqhaUSzw6xe9h1slYQOem5wVt0PKaftefqLRBXBS1E1GRrEwCJnWAxzWoKduzhJUFDQnp9HNy6oIeKnmH0A7bXbL2gD60yNc6JyDxj9Qgg76o6U+pjmVH648f7qxpbzt67JC1q/PIeFra7PGzT1H6Cz7oHKlguVZIaQVfvN2Kw614spWSBOH9i7IIp4E8xP4Svikfx/ASxEOOAWvxIMX3sTLQAo8fLDIZsRAGm6BoSWOfp3GupwgRziRThltiN69akzUrFdygM3keCR5hRvtTTo5Jtk/e/JmMqqgIlZ6f2obz5AyOGrOhES1WQ6qBrYQMI8ZyFulWGkw7pGIHuBxULpjHK3D6ugUrExPAUp0prZmdJTqwwJmWFvdZcVMzEStKzOTYfGvr/MIMaxEbWA4/wMu9Oee6bZcP4u2u7DTnRCtoR6QcubZoIB20Xu+jMoP6SZqcJK3ce4FyC3IRoPJ0no71XJW6Yfj72Vu7n9v3Ln5xGf99dPd+nBi+/tpb+Aq/6r3Xm4fWcup/dA5/oyjB7+189lPpMWly9/Pgu4ynmbTXZY3PmGg/q1V8DE8BNdnA6JMYCq668JiA5UaneuSvzc5zXxJSfuO9RQaCP/8KODyEH7/xxBNdOxsO7H/h2Z7RbGqBEgg5DX5jkxCFe3A/PohJyVxI51/+8PQfPj25hJ1XaY5ep7GlM5s9fuI7RUgUXsHBeQUHqzCaVHcFCkVWTZCg0tzzSrwAOipUCFLvcJv4dp
Ew6TL07UTxC8ih1XLiCt7Brrh0+KV06XFxzys78Tb4dly79uRj3ORw3nY//zpzwQsv8tul54tGp9mgFkhyyFbDgR/U4/a5d35emNHy7pmX0nNrj82C1bkuDt62QQs22vD5qMhS8/3SMxcIIfkFfkUkcf0PIBWHLl72rC0VWy/SHAoyu+iok2NpmQXQ7ongBVbx4mf7pGf2fEZyx/ZcW7lKsdLfLTt8fpWYTVq+/9PMaKnRW8s3hAnKMWEUY5Iojy8rNdkzRsouQl6w7yi29bQ2VFFBCnqnCPDaK885eLIWhyCrw+q/BlqgyXCkG9LwdKG9YXtzTXtddzjpZZUdYPbDYx0Sc6n4kq2iDQdKJ2srSiCIivT6NohW+mVnh5idwWocbruS11hXY68U41kLnh3bejqTu7CQGmoPlQ6SzEyupqyvuc6BI+WHKxu2efF3iltN60xHI9B6azYiWR+mMujChWSjnha3UuGpbRmbd/1+uOsEPvz8M/AUXn3D7vrHmes40vsefjQ/8QKwz7zWAtHdoFms/mvhh/vx3M849fvt5xeZrct1Nc5vpmsezkTf25kCWkzgBWa6ekbTMhwE6hgPBUcrjrgWGl384dRwqR6nj0gF2gyyb584weZrecouFon3UbJtoh3ihxcauRZ+s1STn78wlpddCE50tJmLGsDQWm5pbbWWtaKN98OxJb1Ck2ijNeBHHRUDS9aYGkTGsxNH2G63+6S0/hRYnkMtfFMxQ/MP3eiEqr2fEOMdEciVE/i6vO+G/yfU8ltoXBHUBa3z7h0ELEUn0IgomMviQucRF2HUCemIQo522XCD13BP8QbXvOhqIqMrV3Sv9GX3atTRODo44qOdgEqt0k+cv19aW0GcD0qbQUuM0gj5wHP2HPOifOW3iJn5+Z25pb/4CaITB3w2VQUfrrHh5nfte+EBwrndbqiemlryzvgvqrn/SqP/Hs099W+w5BtUSlkjr6ZOgrauzC30WYBJJhjwnNmjjoLcR9h1d67hROKm62zuRmkX+1rn2/0ndsPMneEqV2NtXzGUdlTkHUrlf51b3ZziQq1kxu6MLPAjVmKppqu3i/w5kcrlZhdXbaBTNRuglfjEowszEqB1CYe+Fkqpt7+PKowgz4rqVV4NH+HdulStJSpdL0sOsr49UsdBqbuH10gdpHzeTkqkbu4bOII54vYqKZi4q4iIHIhLe1j/Es7wb9KowxnZZkR5kCwqMkrtLUDv9gEBTpaEHLXUxdX4Z3wJP5aei0wlW+A/qKQftVZSQNMXbTyc9xPuTfzvn+HnULpLerT6l0DmIeKx9lPvXn0zs+j5d196kdr6NU68FTl/wSndwE/62Ms4Pl77NAQysm9ZXC3c4vdSPRfucQOpFHPlXQxWL8+F1VNK131bk95LCAh3bS212bIbCPJuX3jOCmMUT+MAQ8KIAVNJSVqTvx+PWOXwVdq3t/r4+WBYOskgo/btrroWKF1IJGPRs1CeHoddl86iIzId2i5gL5siDs+25RsdqVjcui3DmSGWumnpcZOoiJIukm6rOIZFeB3P8dn56r4J6vH6549DGbQkYgf224RN6bh1KM8MHFRjO142ZIMGhqQwW100XrgcB5rueJy8lASMJiEjLy0xc4nzHqZYpJVzA5TUCKunEKT2gI+H8pKC4FuUx87/puX9W0RRfeP96/j1x9N4Ex6BsP0lUt4bLzR3vfJSS48iMw278fz/4vwnV3ECnoAHqd+4ul6EvRcHnF2Hj9zz9isoFqxkmLVwT21xK8S/saFSH4mTxshv+FWHUfmO9I1nnuDhyNXMD+XKTJZdWKAi8Du7TT8C6iG8To6RGVO2s4zXuqWzndAgZnI9aJBNwUzdAK8+3FArlm1lo+1stGSmZs1piJUQB+3olsaDM4kPhEu58GTBoLVrck0rile4TeXqiivc2ZkZKsuKtmYchBSspYg3ccm8td8TkWdP5CrlqP2UoxjgFBRwZHiVSytALxcR65FZgU
D7noki/rAC9DC00PUIRpDVVdK/yZZHjz7K/klUVXGfzv/zEa5Nmra0baOyMQx+WrcbXnDOuUDtkpWhC2dctztxQ/pmF/jIVuX/AcSzTkYAeJxjYGRgYADimhbvinh+m68M3BwMIHDjUmMFgv53kYOB7SCQy8HABBIFADqiC2YAeJxjYGRg4Cj/+4LhMwcDCABJRgZUoA8AYu8DmXic42AAghQGBpaNEMx+kYGBg4F4DNKDTwzGRlfHyooQY12IXy2yWSxpCD4A7W4KaQAAAAAADAAoAEAAcgDEAQABNgGkAhICaALsA2IDrAPyBCIEVATQBPAFRAWyBdoF+AZkBqoGzgb4B1AHxAgmCFoIlgkICSoJjgnCCgoKLgqKCtoLBgtMC9QMEAwoDHwMunicY2BkYGDQZwhl4GQAASYg5gJCBob/YD4DABZbAaQAeJx9ks1Kw0AUhU9sVWxFQcGVyqxEUFN/du5E0W6K0EWh3aXpTI2kmTAZCz6H7+DT+Azik4gn6VWpQjPk8t1zz525AwNgC+8IMPv2+M84wCazGS9hFcfCNWzgQrhOvhJeRhP3wivUB8INHOFBuIltvHCHoL7G7BKvwgH28SG8xN5P4Rp2g3XhOvlQeBk7wY3wCvWBcAO9YCrcxEHw1lDq2unI65EaPitjM38SR84l2rHSSWJnC2u86kdtnXT1+CmN3I9aifNZT7sisZk6C0/nC3c60+77mGI6PvfeKOPsRN3yTJ2mVuXOPurYhw/e55etlhE9jO2EcyuuazhoRPCMI+ZDPDMaWGTUThCz5rgS1p30dJjFzCwK/oY+hT59bXoSdBnHeEJadf73/joX1XrVeQWpnEThDCFOF3bcMWZV19/bFJhyonOqnj3l7codJqRbuafmtClZIa9qj1Ri6iFfUdmV8920uMwff0gXd/oCX7iEuAAAeJxtkEdXxDAMhP0tLEvvfem9b+ISO8fdxPklXLhw4z1+PkTiiA/S02hG0tgMjL6x+f9NGDDHPEMWGLHIEsussMoa62ywyRbb7LDLHvsccMgRx4w54ZQzzrngkiuuueGWO+554JEnnnnhlTcmFIbv4dfnh4sSyz7699FvrKdt6nMVbdPnGMpOcm46xSvpB/+XXZiKrvJS+5yD8EIupF9b3WEFTbZUVjGTbuNVnRsvLNWkrN3OFoLK5OjSTCqZH2wpaGqj1DGFIC6scGrZZpPoveyMdSs+ndP7nW706tJFQXPt9T7b6vXR6a+kRtiVzFOH0Wflls6YH1WNYlgA"
    # Second payload is empty, so the "2set" file written below has no content.
    b = b""
    font_data_decode = base64.b64decode(a)

    with open(path, 'wb') as f:
        f.write(font_data_decode)

    font_data_decode2 = base64.b64decode(b)

    # The prefix is added to the whole `path` string, not just its file name.
    with open("2set" + path, 'wb') as f:
        f.write(font_data_decode2)
コード例 #30
0
                   'User-Agent="Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36"',
                   'Upgrade-Insecure-Requests=1',
                   'Accept="text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8"',
                   'Accept-Encoding="gzip, deflate"',
                   'Accept-Language="zh-CN,zh;q=0.9"']
    rq_header = b'''Host: www.gsxt.gov.cn
                Connection: keep-alive
                Cache-Control: max-age=0
                Upgrade-Insecure-Requests: 1
                User-Agent: Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36
                Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8
                Referer: http://www.gsxt.gov.cn/corp-query-search-1.html
                Accept-Encoding: gzip, deflate
                Accept-Language: zh-CN,zh;q=0.9
            '''
    rq_headers = headers_raw_to_dict(rq_header)

    test1 = CorpSearch(init_url, index_url, chm_headers, max_click)
    test1.main(search_list[0])
    index_cookiename = md5(search_list[0].encode("utf-8")).hexdigest() + ".json"
    cookie_html = test1.to_dict()
    search_result = SearchResultParse(cookie_html['page'], base_url, result_parse_rule)
    url_list = search_result.search_result_parse()
    detail_request = CookieRequest(cookie_html['cookies'], url_list=url_list, headers=rq_headers)
    detail_result = detail_request.cookie_requests()
    for pg in detail_result:
        pg_detail = PageDetailParse(pg, detail_parse_rule)
        detail = pg_detail.search_result_parse()
        print(detail)
    time.sleep(5)
    test1.driver.quit()
コード例 #31
0
ファイル: 测试.py プロジェクト: houzhimeng/WebSpider
from copyheaders import headers_raw_to_dict
import requests

# Raw request headers for a zhihu API call as a bytes block, including a
# captured session cookie.
# NOTE(review): the leading ":authority"/":method"/":path"/":scheme" lines are
# HTTP/2 pseudo-headers - confirm how headers_raw_to_dict treats names that
# start with a colon before the resulting dict is used for a request.
header = b'''
:authority: www.zhihu.com
:method: GET
:path: /api/v4/articles/35656001/contribute_requests
:scheme: https
accept: */*
accept-encoding: gzip, deflate, br
accept-language: zh-CN,zh;q=0.9,en;q=0.8
cookie: _zap=48356585-ec48-411b-9a66-d487b7ae607b; d_c0="ANDiMph6lw6PTvEQHMMifR0QmejqXGqazZ8=|1543463744"; l_cap_id="MjRjNWNkZDA1YWQ4NDY4MjkxMGRhYzc2OTkyNWZiNDI=|1544601803|6786e6673fe0f040c6a9223f5f4baed9721dc814"; r_cap_id="YmE5MzhmZTIyZTEzNDRmZmE2OTQwZWZjNDRjODNlNDE=|1544601803|815c2e4c7621df052c4f61ed8bfc68b51b758acd"; cap_id="MzM5MzBhMzA5ZGQ0NDkwNTg5MjFjNjkwNDgxZjM0MzA=|1544601803|6cfb19bf6f9133eff82dea3b17141cc2276e9a73"; _xsrf=h29qRM08rAMJXER3FsTSUGeR67a9bAAd; tst=r; capsion_ticket="2|1:0|10:1544685115|14:capsion_ticket|44:MWNiNmVhNzc4NzUzNDJhNDhlZjgyNzk0NjRjMTk1YWM=|727a2ae3d07f83ff43bdceeeef8990f2dc9d0442e52ae6a04b8702f38c716be5"; z_c0="2|1:0|10:1544685155|4:z_c0|92:Mi4xSkxzaEFnQUFBQUFBME9JeW1IcVhEaVlBQUFCZ0FsVk5ZMVRfWEFEUXRBdG1uLTlXQ0hnN2U5TExQUDJYYUNzTW13|d1aa7ab03e3c753f76a5697cb80e08da9d042a909458153bea0eadf0f3a665fa"; q_c1=5136b8abe99a460f941cbd9688908b60|1544960842000|1544960842000; tgw_l7_route=c919f0a0115842464094a26115457122
origin: https://zhuanlan.zhihu.com
referer: https://zhuanlan.zhihu.com/p/35656001
user-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36
x-ab-param: top_root=1;top_root_ac=1;top_sj=2;top_bill=0;ls_new_video=1;top_billupdate1=3;top_ebook=0;top_new_user_gift=0;top_recall_tb_short=61;top_v_album=1;se_majorob_style=0;se_prf=0;top_feedre_cpt=101;top_is_gr=0;top_recall_exp_v1=6;top_user_gift=0;top_cc_at=1;top_newuser_feed=0;top_yhgc=0;pf_creator_card=1;se_entity=on;top_gif=0;top_root_web=0;top_vd_gender=0;top_video_score=1;gw_guide=0;ls_is_use_zrec=0;top_ab_validate=3;tp_dis_version=0;top_fqai=2;se_boost=1;top_card=-1;se_search_feed=Y;se_correct_ab=0;se_engine=1;top_30=0;top_billab=0;se_billboardsearch=1;se_ltr_v1=0;top_login_card=1;top_new_user_rec=0;top_f_r_nb=1;top_feedre_itemcf=33;top_feedre=1;ls_topic_is_use_zrec=0;top_new_feed=1;top_recall_core_interest=81;tp_sft=a;se_consulting_price=n;top_follow_reason=0;top_tr=0;top_recall_tb_follow=71;pin_efs=orig;top_mt=0;top_recall_tb_long=51;pin_ef=a;top_feedre_rtt=41;se_backsearch=1;se_consulting_switch=off;se_major_onebox=major;top_no_weighing=1;top_recall_deep_user=1;top_rerank_reformat=2;top_billvideo=0;top_limit_num=0;top_new_video=-1;tp_sticky_android=0;se_websearch=1;top_quality=0;tp_qa_metacard=1;se_ad_index=4;se_auto_syn=0;top_universalebook=1;top_newfollow=0;ls_new_score=1;top_recall_follow_user=91;top_billpic=0;top_recall_tb=5;top_thank=1;zr_art_rec=base;se_gemini_service=content;top_deep_promo=0;top_wonderful=1;soc_brandquestion=1;top_root_few_topic=0;tp_discussion_feed_card_type=2;top_raf=n;se_click2=0;se_filter=0;top_distinction=0;tp_write_pin_guide=3;se_new_market_search=on;top_yc=0;top_ydyq=X;tp_favsku=c;zr_ans_rec=gbrank;se_time_search=origin;se_wiki_box=1;top_test_4_liguangyi=1;top_followtop=1;top_newfollowans=0;top_topic_feedre=21;qa_answerlist_ad=0;se_daxuechuisou=new;se_minor_onebox=d;tp_qa_metacard_top=0;se_mfq=0;top_ntr=1;top_scaled_score=0;top_nad=1;top_root_mg=1;top_rank=0;tp_discussion_feed_type_android=2;top_nucc=3;top_recall=0
x-requested-with: fetch
x-udid: ANDiMph6lw6PTvEQHMMifR0QmejqXGqazZ8='''

# Rebind `header` from the raw bytes to the value returned by headers_raw_to_dict.
header = headers_raw_to_dict(header)
コード例 #32
0
ファイル: HkHoldPuller.py プロジェクト: zhoupj/flwm
# Raw request headers for dcfm.eastmoney.com as a bytes block,
# one "Name:value" pair per line.
post_headers_raw = b'''
Accept:*/*
Accept-Encoding:gzip, deflate
Accept-Language:zh-CN,zh;q=0.9
Cache-Control:no-cache
Connection:keep-alive
DNT:1
Host:dcfm.eastmoney.com
Pragma:no-cache
Referer:http://data.eastmoney.com/hsgtcg/StockStatistics.aspx
User-Agent:Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36
X-Requested-With:XMLHttpRequest
'''
# Convert the raw header block into a dict.
header_dict = headers_raw_to_dict(post_headers_raw)

# Query parameters in the same "name: value" line format, kept as str
# (not bytes) and parsed by StrUtil.convert_to_dict below.
request_get_raw = '''
type: HSGTHDSTA
token: 70f12f2f4f091e459a279469fe49eca5
st: HDDATE,SHAREHOLDPRICE
sr: 3
p: 1
ps: 50
js: var LIkffNKM={pages:(tp),data:(x)}
filter: (MARKET in ('001','003'))(HDDATE=^2018-10-12^)
rt: 51317778
'''
# Earlier variant that parsed the parameters with headers_raw_to_dict:
#params_dict = headers_raw_to_dict(request_get_raw)
# NOTE(review): StrUtil.convert_to_dict is defined elsewhere; presumably it
# splits each line on the first ":" like headers_raw_to_dict - confirm.
params_dict = StrUtil.convert_to_dict(request_get_raw)