Exemple #1
0
def get_cookies_dict_from_response(response):
    """Return the cookies set by *response* as a flat ``{name: value}`` dict.

    ``make_cookies`` parses the response's ``Set-Cookie`` headers on its
    own, so the previous extra ``extract_cookies`` call (which merely
    populated a throwaway jar) has been removed.
    """
    jar = CookieJar()
    cookie_objs = jar.make_cookies(response, response.request)
    return {cookie.name: cookie.value for cookie in cookie_objs}
Exemple #2
0
    def parse(self, response):
        """
        Log in to chouti.

        :param response: response for the initial page request
        :return: generator yielding the login POST request
        """
        # Collect the cookies that came with the first response.
        jar = CookieJar()
        jar.extract_cookies(response, response.request)

        # Keep them on the spider instance for later use.
        self.cookie = jar._cookies

        # Issue the login request.  NOTE: the body must be a raw
        # urlencoded string here, not a dict.
        login_request = Request(
            url="https://dig.chouti.com/login",
            method="POST",
            headers={
                "content-type": "application/x-www-form-urlencoded; charset=UTF-8",
            },
            body="phone=886918207171&password=jamie851230&oneMonth=1",
            cookies=jar._cookies,
            callback=self.check_login,
        )
        yield login_request
Exemple #3
0
    def parse1(self, response):
        """Flatten the front-page cookies into a dict, then POST the login."""
        # response.text holds the full front-page markup.
        from scrapy.http.cookies import CookieJar
        import urllib.parse

        jar = CookieJar()
        jar.extract_cookies(response, response.request)
        # The jar nests cookies as domain -> path -> name; flatten it.
        for domain_cookies in jar._cookies.values():
            for path_cookies in domain_cookies.values():
                for name, cookie in path_cookies.items():
                    self.cookie_dict[name] = cookie.value

        credentials = {
            'phone': '8615131255089',
            'password': '******',
            'oneMonth': 1,
        }

        # POST the login form.
        yield Request(url="http://dig.chouti.com/login",
                      method='POST',
                      cookies=self.cookie_dict,
                      body=urllib.parse.urlencode(credentials),
                      headers={
                          'Content-Type':
                          'application/x-www-form-urlencoded; charset=UTF-8'
                      },
                      callback=self.parse2)
Exemple #4
0
    def parse(self, response):
        """Keep only the cookies 36kr's sign-in needs, then POST the form."""
        login_url = 'https://passport.36kr.com/passport/sign_in'
        wanted = ('aliyungf_tc', 'krnewsfrontss', 'device-uid', 'M-XSRF-TOKEN')

        jar = CookieJar()
        jar.extract_cookies(response, response.request)
        for domain_cookies in jar._cookies.values():
            for path_cookies in domain_cookies.values():
                for name, cookie in path_cookies.items():
                    if name in wanted:
                        self.cookie_jar[name] = cookie.value
        # Browser form fields, for reference:
        #   type:login  bind:false  needCaptcha:false
        #   username:18616561846  password:abcd.1234
        #   ok_url:https%3A%2F%2Frong.36kr.com%2Flist%2Fdetail%26%3FsortField%3DHOT_SCORE
        #   ktm_reghost:null
        yield scrapy.Request(
            url=login_url,
            method='POST',
            body=
            'type=login&bind=false&needCaptcha=false&username=18616561846&password=abcd.1234&\
                    ok_url=https%3A%2F%2Frong.36kr.com%2Flist%2Fdetail%26%3FsortField%3DHOT_SCORE&ktm_reghost=null',
            headers={'Content-Type': 'application/x-www-form-urlencoded'},
            cookies=self.cookie_jar,
            callback=self.login)
Exemple #5
0
    def parse(self, response):
        """Collect first-visit cookies, then log in with a POST request."""
        # response.text is the whole front page.
        jar = CookieJar()
        jar.extract_cookies(response, response.request)

        # Flatten the nested domain -> path -> name mapping into a dict.
        for domain_cookies in jar._cookies.values():
            for path_cookies in domain_cookies.values():
                for name, cookie in path_cookies.items():
                    self.cookie_dict[name] = cookie.value

        credentials = {
            'phone': '8615915455813',
            'password': '******',
            'oneMonth': 1,
        }

        # POST the credentials; parse2 is the success callback.
        yield Request(
            url='https://dig.chouti.com/login',
            method='POST',
            cookies=self.cookie_dict,
            headers={
                'Content-Type':
                'application/x-www-form-urlencoded; charset=UTF-8'
            },
            body=urllib.parse.urlencode(credentials),
            callback=self.parse2
        )
Exemple #6
0
    def parse(self, response):
        """Parse one listing page: yield a Request per article, then follow
        the next listing page (capped at page 900)."""
        # The jar is filled for debugging purposes only in this spider.
        cookie_obj = CookieJar()
        cookie_obj.extract_cookies(response, response.request)

        page = response.meta['page']
        next_page = page + 1
        logging.info('on parse')
        logging.info(f'next page ========== {next_page}')

        for article in response.xpath('//article[@class="excerpt"]'):
            item = AsyncSandboxItem()
            item['title'] = article.xpath('./header/h2/a[1]/text()').extract_first()
            item['category'] = article.xpath('./header/a[1]/text()').extract_first()
            article_url = article.xpath(
                './header/h2/a[1]/@href').extract_first()
            item['article_url'] = article_url

            yield Request(url=article_url,
                          callback=self.parse_item,
                          meta={'item': item})

        # Follow pagination until the arbitrary cap.
        if next_page < 900:
            yield Request(
                url=self.BASE_URL.format(next_page),
                meta={'page': next_page},
            )
 def __init__(self, start_url=None, history=True):
     """Prepare per-job request headers and POST parameters.

     :param start_url: legacy, unused (jobs come from ``self.start_urls``);
         kept for caller compatibility
     :param history: whether historical records should be crawled
     """
     super(Sdjnggzyjy, self).__init__()
     jobs = self.start_urls
     self.cookie_jar = CookieJar()
     self.count = 0
     # Initialize as an instance attribute so appends are not shared
     # across spider instances (and do not fail if never defined).
     self.post_params = []
     for job in jobs:
         # NOTE: headers is overwritten each iteration, so only the
         # Referer of the last job survives (pre-existing behaviour).
         self.headers = {
             # Fixed: the Accept value had been mangled into the key
             # ('Accept application/json, text/javascript, */*;': 'q=0.01').
             'Accept': 'application/json, text/javascript, */*; q=0.01',
             # 'Accept-Encoding': 'gzip,deflate',
             'Accept-Language':
             'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
             'Connection': 'keep-alive',
             'Content-Length': '90',
             'Content-Type': 'application/json',
             'Host': 'www.jnggzyjy.gov.cn',
             'Referer': job.split()[0],
             'Public-X-XSRF-TOKEN': '',
             # 'X-Requested-With': 'XMLHttpRequest',
         }
         # Each job line is "<url> <ba_type>".
         self.post_params.append({
             "url": job.split()[0],
             "ba_type": job.split()[1]
         })
     self.history = history
Exemple #8
0
	def login(self, response):
		"""Harvest the anonymous session cookies, then POST the login form."""
		from scrapy.http.cookies import CookieJar
		import urllib.parse

		jar = CookieJar()
		jar.extract_cookies(response, response.request)

		# Flatten domain -> path -> name into self.cookie_dict.
		for domain_cookies in jar._cookies.values():
			for path_cookies in domain_cookies.values():
				for name, cookie in path_cookies.items():
					self.cookie_dict[name] = cookie.value

		credentials = {
			"phone": "8618001999999",
			"password": "******",
			"oneMonth": 1,
		}

		yield Request(
			url="http://dig.chouti.com/login",
			method='POST',
			cookies=self.cookie_dict,
			body=urllib.parse.urlencode(credentials),
			headers={'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'},
			callback=self.parse1
		)
Exemple #9
0
    def login(self, response):
        """Grab the pre-auth cookies and send the login POST request."""
        jar = CookieJar()
        jar.extract_cookies(response, response.request)

        # The unauthenticated cookies must accompany the login call.
        for domain_cookies in jar._cookies.values():
            for path_cookies in domain_cookies.values():
                for name, cookie in path_cookies.items():
                    self.cookie_dict[name] = cookie.value

        form = {
            "phone": "8613476152416",
            "password": "******",
            "oneMonth": 1,
        }
        # urlencode -> phone=8613476152416&password=...&oneMonth=1
        login_request = Request(url='https://dig.chouti.com/login',
                                method='POST',
                                headers={
                                    'Content-Type':
                                    'application/x-www-form-urlencoded; charset=UTF-8'
                                },
                                body=urlencode(form),
                                cookies=self.cookie_dict,
                                callback=self.check_login)
        yield login_request
Exemple #10
0
    def parse1(self, response):
        """Log in from the front page.

        Bug fix: ``CookieJar.extract_cookies`` returns ``None`` -- the old
        code assigned that return value to ``self.cookie_jar``, so the
        login request was sent with ``cookies=None``.  The jar is now
        flattened into a ``{name: value}`` dict and that dict is sent.
        """
        # response.text holds the whole front page.
        from scrapy.http.cookies import CookieJar
        import urllib.parse

        cookie_jar = CookieJar()
        cookie_jar.extract_cookies(response, response.request)  # fills the jar in place
        self.cookie_jar = {cookie.name: cookie.value for cookie in cookie_jar}

        post_dict = {
            'phone': '8617748232617',
            'password': '******',
            'oneMonth': 1,
        }
        # urlencode -> phone=86123...&password=...&oneMonth=1
        data = urllib.parse.urlencode(post_dict)

        # POST the credentials to log in.
        yield Request(url='http://dig.chouti.com/login',
                      method='POST',
                      cookies=self.cookie_jar,
                      body=data,
                      headers={
                          'Content-Type':
                          'application/x-www-form-urlencoded; charset=UTF-8'
                      },
                      callback=self.parse2)
Exemple #11
0
    def login(self, response):
        '''Send the ajax login request.'''

        # Pull the cookie info out of the response.
        cookie_jar = CookieJar()
        cookie_jar.extract_cookies(response, response.request)

        # Restored: this loop was commented out, so self.cookie_dict stayed
        # empty and the login request carried no session cookies.
        for k, v in cookie_jar._cookies.items():
            for i, j in v.items():
                for m, n in j.items():
                    self.cookie_dict[m] = n.value

        login_req = Request(
            url='http://dig.chouti.com/login',
            method='POST',
            headers={
                'Content-Type':
                'application/x-www-form-urlencoded; charset=UTF-8'
            },
            # Fixed: 'oneMonth:1' is not valid x-www-form-urlencoded data;
            # the key/value separator must be '=', not ':'.
            body='phone=8618922795525&password=woaiwojia89&oneMonth=1',
            cookies=self.cookie_dict,
            callback=self.check_login,
        )
        print(self.cookie_dict)
        print('执行了login')
        yield login_req
Exemple #12
0
    def parse(self, response):
        """
        Handle the first response from chouti.

        :param response: the landing-page response
        :return: generator yielding the login POST request
        """
        # The response headers carry the session cookies; collect them
        # into the jar, then flatten them into a plain dict.
        jar = CookieJar()
        jar.extract_cookies(response, response.request)
        for domain_cookies in jar._cookies.values():
            for path_cookies in domain_cookies.values():
                for name, cookie in path_cookies.items():
                    self.cookie_dict[name] = cookie.value

        # POST body is a pre-encoded string (urlencode({...}) also works).
        yield Request(
            url='https://dig.chouti.com/login',
            method='POST',
            body="phone=8613121758648&password=woshiniba&oneMonth=1",
            cookies=self.cookie_dict,
            headers={
                'Content-Type':
                'application/x-www-form-urlencoded; charset=UTF-8'
            },
            callback=self.check_login)
Exemple #13
0
 def get_cookies(self, res):
     '''Persist the cookies from a successful login response to disk.'''
     jar = CookieJar()
     jar.extract_cookies(res, res.request)
     # The jar is iterable (dict-like), so write one cookie per line.
     with open('cookies.txt', 'w') as f:
         f.writelines(str(cookie) + '\n' for cookie in jar)
 def login_callback(self, response):
     """Remember the post-login cookie jar and queue the search page."""
     jar = CookieJar()
     jar.extract_cookies(response, response.request)
     self.cookiejar = jar

     search_url = ('http://www.baiinfo.com/Search/Index?wd=%E6%95%A3%E8%A3%85'
                   '%E8%BF%9B%E5%8F%A3%E6%B2%A5%E9%9D%92%E5%88%B0%E5%B2%B8%E4%BB%B7')
     return [scrapy.http.Request(url=search_url, callback=self.parse_index)]
Exemple #15
0
 def test_missing_final_slash(self):
     # Missing slash from request URL's abs_path should be assumed present.
     # (assertEquals/assert_ are deprecated aliases removed in Python 3.12;
     # replaced with assertEqual/assertIn.)
     url = "http://www.acme.com"
     c = CookieJar(DefaultCookiePolicy(rfc2965=True))
     interact_2965(c, url, "foo=bar; Version=1")
     req = Request(url)
     self.assertEqual(len(c), 1)
     c.add_cookie_header(req)
     self.assertIn('Cookie', req.headers)
Exemple #16
0
def get_cookies(response) -> dict:
    """Return every cookie set by *response* as a flat {name: value} dict."""
    jar = CookieJar()
    jar.extract_cookies(response, response.request)
    flat = {}
    # The jar nests cookies as domain -> path -> name.
    for by_path in jar._cookies.values():
        for by_name in by_path.values():
            for name, cookie in by_name.items():
                flat[name] = cookie.value
    return flat
Exemple #17
0
def extract_to_file(session=context.http_session):
    # Convert the session's cookies into a scrapy-style CookieJar and
    # pickle it to disk for a later run to pick up.
    # NOTE(review): the default argument is evaluated once at import time,
    # so this always binds the module-level http_session unless a session
    # is passed explicitly -- confirm that is intended.
    scrapy_jar = CookieJar(policy=session.cookies.get_policy())
    for cookie in session.cookies:
        scrapy_jar.set_cookie(cookie)

    with open(cookie_name(), 'wb') as io_writer:
        # save it in a way scrapy_cookies can gather later on
        pickle.dump({None: scrapy_jar}, io_writer)
    # Cookies are credentials: restrict the dump file to the owner.
    os.chmod(cookie_name(), mode=0o600)
Exemple #18
0
    def login_callback(self, response):
        """Store the login cookies, then schedule the first article page."""
        jar = CookieJar()
        jar.extract_cookies(response, response.request)
        self.cookiejar = jar

        logging.debug('enter login_callback')
        next_request = scrapy.http.Request(
            url='http://oil.chem99.com/news/28156799.html',
            callback=self.parse_content)
        return [next_request]
Exemple #19
0
def get_cookie_by_cookie_jar_from_response(response=None):
    """Return the cookies carried by *response* as a {name: value} dict.

    An empty dict is returned when *response* is falsy.
    """
    if not response:
        return {}
    jar = CookieJar()
    jar.extract_cookies(response, response.request)
    return {cookie.name: cookie.value for cookie in jar}
Exemple #20
0
 def saveCookies(self, response):
     """Record the login cookies, then request the discovery page."""
     jar = CookieJar()
     jar.extract_cookies(response, response.request)
     # Merge the extracted cookies into the spider-level store.
     self.cookies.update({cookie.name: cookie.value for cookie in jar})
     print('Log in completed!')
     return Request(self._discorveryUrl,
                    headers={'Referer': self._discorveryReferer},
                    cookies=self.cookies,
                    callback=self.parse)
Exemple #21
0
 def login_callback(self, response):
     """Keep the session cookies, then enqueue each news-list page."""
     jar = CookieJar()
     jar.extract_cookies(response, response.request)
     self.cookiejar = jar
     list_url_tmpl = 'http://www.baiinfo.com.cn/Orders/NewsList/104?pageid={}'
     # range(1, 2) covers just page 1; widen the bounds to crawl more.
     for page in range(1, 2):
         yield scrapy.http.Request(url=list_url_tmpl.format(page),
                                   callback=self.parse_articleList)
Exemple #22
0
 def parse_index(self, response):
     """Extract the response cookies into a flat dict and print it."""
     from scrapy.http.cookies import CookieJar
     jar = CookieJar()
     jar.extract_cookies(response, response.request)
     cookie_dict = {}
     # Walk the domain -> path -> name nesting of the jar.
     for by_path in jar._cookies.values():
         for by_name in by_path.values():
             for name, cookie in by_name.items():
                 cookie_dict[name] = cookie.value
     print(cookie_dict)
Exemple #23
0
 def create_request(self, url, response=None, **kwargs):
     """Build a Request that manages cookies through an explicit jar.

     The jar travels in request.meta, and dont_merge_cookies keeps the
     default CookiesMiddleware from interfering with it.
     """
     if response is None:
         jar = CookieJar()
     else:
         # Reuse (or create) the jar carried by the previous response.
         jar = response.meta.setdefault('cookie_jar', CookieJar())
         jar.extract_cookies(response, response.request)
     kwargs.update(meta={'dont_merge_cookies': True, 'cookie_jar': jar})
     new_request = Request(url, **kwargs)
     jar.add_cookie_header(new_request)
     return new_request
Exemple #24
0
 def parse(self, response):
     """Capture the initial cookies, then POST the login form."""
     jar = CookieJar()
     jar.extract_cookies(response, response.request)
     # Stash the captured cookies on the spider for later requests.
     self.cookie = jar._cookies
     yield Request(
         url="https://dig.chouti.com/login",
         method="POST",
         # POST payload must be a pre-encoded string for this endpoint.
         body="phone=8618938685515&password=avvcd123&oneMonth=1",
         headers={"Content-Type": 'application/x-www-form-urlencoded; charset=UTF-8'},
         callback=self.check_login
     )
Exemple #25
0
 def parse(self, response):
     """Retry until the meituan session cookie appears, then POST."""
     jar = CookieJar()
     jar.extract_cookies(response, response.request)
     domain = "i.waimai.meituan.com"
     if domain not in jar._cookies.keys():
         # Session cookie not issued yet: re-run the priming request.
         yield self.qual_pre_requests(response.meta["cookiejar"],
                                      response.meta["retry_times"] + 1)
         return None
     raw = jar._cookies[domain]["/"]
     cookies = {name: raw[name].value for name in raw}
     post_data = {"wm_poi_id": response.meta["cookiejar"]}
     yield self.contruct_request(response, post_data, cookies)
Exemple #26
0
	def parse_province(self, response):
		"""Extract the 'ajaxkey' cookie, then poll every province.

		Fixed: the original body mixed spaces and tabs (lines 3 and the
		final loop), which raises TabError/IndentationError in Python 3.
		Indentation is now tabs throughout, matching the def line.
		"""
		cookieJar = CookieJar()
		cookieJar.extract_cookies(response, response.request)
		self.logger.info('++++++++++++++++++++++++++%s****%d', cookieJar._cookies, len(cookieJar._cookies))
		# The site gates its AJAX endpoints behind this cookie.
		ajaxkey = cookieJar._cookies['www.ipe.org.cn']['/']['ajaxkey']
		self.cookie = '{0}={1}'.format(ajaxkey.name, ajaxkey.value)
		self.logger.info('self.cookie=%s', self.cookie)
		for k, v in provinces.items():
			frmdata = self.get_frmdata(1, k)
			yield scrapy.FormRequest(url=self.poll_url, formdata=frmdata, callback=self.parse_page, meta={'k': k, 'v': v})
Exemple #27
0
 def getTarget(self, response):
     """Debug helper: print the request headers and the extracted cookies."""
     print(response.request.headers)
     jar = CookieJar()
     jar.extract_cookies(response, response.request)
     print(jar)
     cookie_dict = {}
     for by_path in jar._cookies.values():
         for by_name in by_path.values():
             for name, cookie in by_name.items():
                 cookie_dict[name] = cookie.value
     for name, value in cookie_dict.items():
         print(name, '----------------', value)
Exemple #28
0
 def parse(self, response):
     """Print the crawl depth and collect cookies from the response.

     Fixed: a dangling ''' after the explanatory string literal opened an
     unterminated string and broke everything after this method; the
     explanatory text is now an ordinary comment.
     """
     print('输出结果如下')
     # Depth defaults to 0 on the very first response.
     print(response.meta.get('depth', 0))
     cookie_obj = CookieJar()  # parse the cookies out of the response
     cookie_obj.extract_cookies(response,
                                response.request)  # needs a cookie-jar object
     # The response headers carry the Set-Cookie values; they end up in
     # cookie_obj.  Form data is sent as k=v&k2=v2 ('data' style), while
     # json=... would send a dict instead.
 def create_request(self, url, response=None, **kwargs):
     """Build a Request carrying its own cookie jar in meta.

     (This could be replaced by using CookiesMiddleware instead.)
     """
     if response is not None:
         # Continue the jar from the previous hop, creating it on demand.
         jar = response.meta.setdefault('cookie_jar', CookieJar())
         jar.extract_cookies(response, response.request)
     else:
         jar = CookieJar()
     kwargs.update(meta={
         'dont_merge_cookies': True,
         'cookie_jar': jar
     })
     req = Request(url, **kwargs)
     jar.add_cookie_header(req)
     return req
Exemple #30
0
 def parse(self, response):
     """Capture first-visit cookies and POST the login credentials."""
     jar = CookieJar()
     jar.extract_cookies(response, response.request)
     self.cookie_dict = jar._cookies
     # Send username/password together with the captured cookies.
     yield Request(url="http://dig.chouti.com/login",
                   method='POST',
                   body="phone=8615131255089&password=woshiniba&oneMonth=1",
                   headers={
                       'Content-Type':
                       "application/x-www-form-urlencoded; charset=UTF-8"
                   },
                   cookies=jar._cookies,
                   callback=self.check_login)
Exemple #31
0
 def login_callback(self, response):
     """Persist the session jar and queue the product index pages."""
     jar = CookieJar()
     jar.extract_cookies(response, response.request)
     self.cookiejar = jar
     # Only non-ferrous metals for now; more sections can be appended,
     # e.g. ('http://www.baiinfo.com/tiehejin/tiehejin', u'铁合金').
     paths = [['http://www.baiinfo.com/youse/tong', u'有色金属']]
     for url, product in paths:
         yield scrapy.Request(url=url,
                              meta={
                                  'productid': product,
                                  'urls': url
                              },
                              callback=self.index_url)
Exemple #32
0
    def test_two_component_domain_ns(self):
        # Netscape: .www.bar.com, www.bar.com, .bar.com, bar.com, no domain
        # should all get accepted, as should .acme.com, acme.com and no domain
        # for 2-component domains like acme.com.
        # (assertEquals is a deprecated alias removed in Python 3.12;
        # replaced with assertEqual throughout.)
        c = CookieJar()

        # two-component V0 domain is OK
        interact_netscape(c, "http://foo.net/", 'ns=bar')
        self.assertEqual(len(c), 1)
        self.assertEqual(c._cookies["foo.net"]["/"]["ns"].value, "bar")
        self.assertEqual(interact_netscape(c, "http://foo.net/"), "ns=bar")
        # *will* be returned to any other domain (unlike RFC 2965)...
        self.assertEqual(interact_netscape(c, "http://www.foo.net/"),
                         "ns=bar")
        # ...unless requested otherwise
        pol = DefaultCookiePolicy(
            strict_ns_domain=DefaultCookiePolicy.DomainStrictNonDomain)
        c.set_policy(pol)
        self.assertEqual(interact_netscape(c, "http://www.foo.net/"), "")

        # unlike RFC 2965, even explicit two-component domain is OK,
        # because .foo.net matches foo.net
        interact_netscape(c, "http://foo.net/foo/",
                          'spam1=eggs; domain=foo.net')
        # even if starts with a dot -- in NS rules, .foo.net matches foo.net!
        interact_netscape(c, "http://foo.net/foo/bar/",
                          'spam2=eggs; domain=.foo.net')
        self.assertEqual(len(c), 3)
        self.assertEqual(c._cookies[".foo.net"]["/foo"]["spam1"].value,
                         "eggs")
        self.assertEqual(c._cookies[".foo.net"]["/foo/bar"]["spam2"].value,
                         "eggs")
        self.assertEqual(interact_netscape(c, "http://foo.net/foo/bar/"),
                         "spam2=eggs; spam1=eggs; ns=bar")

        # top-level domain is too general
        interact_netscape(c, "http://foo.net/", 'nini="ni"; domain=.net')
        self.assertEqual(len(c), 3)

        # The Netscape spec disallows non-special TLDs (such as co.uk) in
        # the domain attribute unless there are at least three dots, but
        # real implementations don't check this and real cookies rely on
        # that behaviour, so the cookie IS accepted here.
        interact_netscape(c, "http://foo.co.uk", 'nasty=trick; domain=.co.uk')
        self.assertEqual(len(c), 4)
Exemple #33
0
    def testMalformedCookieHeaderParsing(self):
        """Cookies without an '=value' part must still be parsed and echoed."""
        headers = Headers({'Set-Cookie': [
            'CUSTOMER=WILE_E_COYOTE; path=/; expires=Wednesday, 09-Nov-2100 23:12:40 GMT',
            'PART_NUMBER=ROCKET_LAUNCHER_0001; path=/',
            'SHIPPING=FEDEX; path=/foo',
            'COUNTRY=UY; path=/foo',
            'GOOD_CUSTOMER;',
            'NO_A_BOT;']})
        res = Response('http://www.perlmeister.com/foo', headers=headers)
        req = Request('http://www.perlmeister.com/foo')

        c = CookieJar()
        c.extract_cookies(res, req)
        c.add_cookie_header(req)
        # assertEquals was removed in Python 3.12; use assertEqual.
        self.assertEqual(req.headers.get('Cookie'),
                'COUNTRY=UY; SHIPPING=FEDEX; CUSTOMER=WILE_E_COYOTE; '
                'PART_NUMBER=ROCKET_LAUNCHER_0001; NO_A_BOT; GOOD_CUSTOMER')
Exemple #34
0
    def test_empty_path(self):
        # Test for empty path
        # Broken web-server ORION/1.3.38 returns to the client response like
        #
        #       Set-Cookie: JSESSIONID=ABCDERANDOM123; Path=
        #
        # ie. with Path set to nothing.
        # In this case, extract_cookies() must set cookie to / (root)
        # (assertEquals is a deprecated alias removed in Python 3.12;
        # replaced with assertEqual.)
        c = CookieJar(DefaultCookiePolicy(rfc2965=True))
        headers = Headers({'Set-Cookie': 'JSESSIONID=ABCDERANDOM123; Path='})

        req = Request("http://www.ants.com/")
        res = Response("http://www.ants.com/", headers=headers)
        c.extract_cookies(res, req)

        req = Request("http://www.ants.com/")
        c.add_cookie_header(req)

        self.assertEqual(req.headers.get("Cookie"),
                         "JSESSIONID=ABCDERANDOM123")
        self.assertEqual(req.headers.get("Cookie2"), '$Version="1"')

        # missing path in the request URI
        req = Request("http://www.ants.com:8080")
        c.add_cookie_header(req)

        self.assertEqual(req.headers.get("Cookie"),
                         "JSESSIONID=ABCDERANDOM123")
        self.assertEqual(req.headers.get("Cookie2"), '$Version="1"')
Exemple #35
0
    def start_requests(self):
        """Entry point: reuse saved cookies if still valid, otherwise drive
        PhantomJS through the zhihu sign-in form (handling an optional
        captcha).

        Fixes: the two bare ``except:`` clauses are narrowed to
        ``except Exception`` so KeyboardInterrupt/SystemExit are no longer
        swallowed, and the ``requset`` typo is corrected.
        """
        _driver = webdriver.PhantomJS(service_log_path='./phantomjs.log')
        _driver.set_window_size(GetSystemMetrics(0), GetSystemMetrics(1))
        self.driver = _driver
        (cookies, expires) = self.getcookies()
        if expires < time.time():
            expires = False
            print('cookie过期')
        if cookies and expires:
            # Saved cookies are still valid: attach them and skip login.
            self.cookiejar = CookieJar()
            for key in cookies:
                self.cookiejar.set_cookie(cookies[key])
            for url in self.start_urls:
                request = Request(url, headers=self.headers,
                                  meta={'dont_merge_cookies': True, 'cookiejar': 1},
                                  callback=self.parse_page)
                self.cookiejar.add_cookie_header(request)
                # NOTE: returns after the first start URL (pre-existing
                # behaviour, preserved).
                return [request]
        _driver.get("https://www.zhihu.com/#signin")
        time.sleep(8)  # wait for the page to finish loading
        _xsrf = _driver.find_element_by_xpath('//input[@name="_xsrf"]')
        _xsrf = _xsrf.get_attribute('value')
        print('_xsrf------->', _xsrf)
        input_wrapper = _driver.find_element_by_xpath('//div[@data-za-module="SignInForm"]')
        # The captcha widget comes in two markup variants; probe for both.
        try:
            input_captcha = input_wrapper.find_element_by_xpath('.//div[@class="input-wrapper captcha-module"]')
        except Exception:
            try:
                input_captcha = input_wrapper.find_element_by_xpath('.//div[@class="iCaptcha input-wrapper"]')
            except Exception:
                input_captcha = None
        if input_captcha:
            hasShow = input_captcha.is_displayed()
        else:
            hasShow = False
        print(input_captcha, '-----captcha_url----->', hasShow)

        if hasShow:
            # A captcha is displayed: download the image first.
            # TODO: fetching the image directly may make the server refresh
            # the captcha; needs a better approach.
            captcha_url = input_wrapper.find_element_by_xpath('.//img').get_attribute('src')
            print('captcha_url---->', captcha_url)
            _driver.close()
            return [Request(captcha_url, headers=self.headers, callback=self.download_captcha, meta={'_xsrf': _xsrf})]
        else:
            _driver.close()
            return [self.post_login(_xsrf)]
Exemple #36
0
    def test_netscape_example_2(self):
        # Second Example transaction sequence:
        #
        # Assume all mappings from above have been cleared.
        #
        # Client receives:
        #
        #       Set-Cookie: PART_NUMBER=ROCKET_LAUNCHER_0001; path=/
        #
        # When client requests a URL in path "/" on this server, it sends:
        #
        #       Cookie: PART_NUMBER=ROCKET_LAUNCHER_0001
        #
        # Client receives:
        #
        #       Set-Cookie: PART_NUMBER=RIDING_ROCKET_0023; path=/ammo
        #
        # When client requests a URL in path "/ammo" on this server, it sends:
        #
        #       Cookie: PART_NUMBER=RIDING_ROCKET_0023; PART_NUMBER=ROCKET_LAUNCHER_0001
        #
        #       NOTE: There are two name/value pairs named "PART_NUMBER" due to
        #       the inheritance of the "/" mapping in addition to the "/ammo" mapping.
        # (assertEquals/assert_ are deprecated aliases removed in Python
        # 3.12; replaced with assertEqual/assertTrue.)

        c = CookieJar()
        headers = Headers({'Set-Cookie': 'PART_NUMBER=ROCKET_LAUNCHER_0001; path=/'})

        req = Request("http://www.acme.com/")
        res = Response("http://www.acme.com/", headers=headers)

        c.extract_cookies(res, req)

        req = Request("http://www.acme.com/")
        c.add_cookie_header(req)

        self.assertEqual(req.headers.get("Cookie"), "PART_NUMBER=ROCKET_LAUNCHER_0001")

        headers.appendlist("Set-Cookie", "PART_NUMBER=RIDING_ROCKET_0023; path=/ammo")
        res = Response("http://www.acme.com/", headers=headers)
        c.extract_cookies(res, req)

        req = Request("http://www.acme.com/ammo")
        c.add_cookie_header(req)

        self.assertTrue(re.search(r"PART_NUMBER=RIDING_ROCKET_0023;\s*"
                                  "PART_NUMBER=ROCKET_LAUNCHER_0001",
                                  req.headers.get("Cookie")))
Exemple #37
0
 def test_secure(self):
     """The 'secure' attribute must round-trip for both NS and RFC2965."""
     for ns in True, False:
         for whitespace in " ", "":
             c = CookieJar()
             # Fixed: the local used to be named 'int', shadowing the
             # builtin; renamed to 'interact'.
             if ns:
                 pol = DefaultCookiePolicy(rfc2965=False)
                 interact = interact_netscape
                 vs = ""
             else:
                 pol = DefaultCookiePolicy(rfc2965=True)
                 interact = interact_2965
                 vs = "; Version=1"
             c.set_policy(pol)
             url = "http://www.acme.com/"
             interact(c, url, "foo1=bar%s%s" % (vs, whitespace))
             interact(c, url, "foo2=bar%s; secure%s" % (vs, whitespace))
             # assert_ was removed in Python 3.12; use assertFalse/assertTrue.
             self.assertFalse(
                 c._cookies["www.acme.com"]["/"]["foo1"].secure,
                 "non-secure cookie registered secure")
             self.assertTrue(
                 c._cookies["www.acme.com"]["/"]["foo2"].secure,
                 "secure cookie registered non-secure")
Exemple #38
0
    def test_session_cookies(self):
        year_plus_one = time.localtime()[0] + 1

        # Check session cookies are deleted properly by
        # CookieJar.clear_session_cookies method
        # (assert_ is a deprecated alias removed in Python 3.12; replaced
        # with assertTrue.)

        req = Request('http://www.perlmeister.com/scripts')
        headers = Headers()
        headers.appendlist("Set-Cookie", "s1=session;Path=/scripts")
        headers.appendlist("Set-Cookie", "p1=perm; Domain=.perlmeister.com;Path=/;expires=Fri, 02-Feb-%d 23:24:20 GMT" % year_plus_one)
        headers.appendlist("Set-Cookie", "p2=perm;Path=/;expires=Fri, 02-Feb-%d 23:24:20 GMT" % year_plus_one)
        headers.appendlist("Set-Cookie", "s2=session;Path=/scripts;" "Domain=.perlmeister.com")
        headers.appendlist('Set-Cookie2', 's3=session;Version=1;Discard;Path="/"')
        res = Response('http://www.perlmeister.com/scripts', headers=headers)

        c = CookieJar()
        c.extract_cookies(res, req)
        # How many session/permanent cookies do we have?
        counter = {"session_after": 0,
                   "perm_after": 0,
                   "session_before": 0,
                   "perm_before": 0}
        for cookie in c:
            key = "%s_before" % cookie.value
            counter[key] = counter[key] + 1
        c.clear_session_cookies()
        # How many now?
        for cookie in c:
            key = "%s_after" % cookie.value
            counter[key] = counter[key] + 1

        self.assertTrue(not (
            # a permanent cookie got lost accidentally
            counter["perm_after"] != counter["perm_before"] or
            # a session cookie hasn't been cleared
            counter["session_after"] != 0 or
            # we didn't have session cookies in the first place
            counter["session_before"] == 0))
Exemple #39
0
    def test_expires(self):
        # Py3 fix: time2netscape lives in http.cookiejar (the Py2 module
        # was called cookielib).  assertEquals/assert_ also replaced with
        # their modern names.
        from http.cookiejar import time2netscape

        # if expires is in future, keep cookie...
        c = CookieJar()
        future = time2netscape(time.time()+3600)
        interact_netscape(c, "http://www.acme.com/", 'spam="bar"; expires=%s' %
                          future)
        self.assertEqual(len(c), 1)
        now = time2netscape(time.time()-1)
        # ... and if in past or present, discard it
        interact_netscape(c, "http://www.acme.com/", 'foo="eggs"; expires=%s' %
                          now)
        h = interact_netscape(c, "http://www.acme.com/")
        self.assertEqual(len(c), 1)
        self.assertTrue('spam="bar"' in h and "foo" not in h)

        # max-age takes precedence over expires, and zero max-age is request to
        # delete both new cookie and any old matching cookie
        interact_netscape(c, "http://www.acme.com/", 'eggs="bar"; expires=%s' %
                          future)
        interact_netscape(c, "http://www.acme.com/", 'bar="bar"; expires=%s' %
                          future)
        self.assertEqual(len(c), 3)
        interact_netscape(c, "http://www.acme.com/", 'eggs="bar"; '
                          'expires=%s; max-age=0' % future)
        interact_netscape(c, "http://www.acme.com/", 'bar="bar"; '
                          'max-age=0; expires=%s' % future)
        h = interact_netscape(c, "http://www.acme.com/")
        self.assertEqual(len(c), 1)

        # test expiry at end of session for cookies with no expires attribute
        interact_netscape(c, "http://www.rhubarb.net/", 'whum="fizz"')
        self.assertEqual(len(c), 2)
        c.clear_session_cookies()
        self.assertEqual(len(c), 1)
        self.assertIn('spam="bar"', h)
Exemple #40
0
    def test_netscape_misc(self):
        # Some additional Netscape cookies tests.
        # (assert_ is a deprecated alias removed in Python 3.12; the
        # combined check is now two explicit assertIn calls.)
        c = CookieJar()
        headers = Headers()
        req = Request("http://foo.bar.acme.com/foo")

        # Netscape allows a host part that contains dots
        headers.appendlist("Set-Cookie", "Customer=WILE_E_COYOTE; domain=.acme.com")
        res = Response("http://www.acme.com/foo", headers=headers)
        c.extract_cookies(res, req)

        # and that the domain is the same as the host without adding a leading
        # dot to the domain.  Should not quote even if strange chars are used
        # in the cookie value.
        headers.appendlist("Set-Cookie", "PART_NUMBER=3,4; domain=foo.bar.acme.com")
        res = Response("http://www.acme.com/foo", headers=headers)
        c.extract_cookies(res, req)

        req = Request("http://foo.bar.acme.com/foo")
        c.add_cookie_header(req)
        self.assertIn("PART_NUMBER=3,4", req.headers.get("Cookie"))
        self.assertIn("Customer=WILE_E_COYOTE", req.headers.get("Cookie"))
Exemple #41
0
class zhihuCrawler(CrawlSpider):
    """Spider that logs into Zhihu, caches login cookies on disk, and crawls
    user profile pages to collect the topics each user follows.

    Login flow: if a fresh cookie file exists it is reused; otherwise a
    PhantomJS browser opens the sign-in page, scrapes the ``_xsrf`` token,
    optionally downloads a captcha for the operator to type in, and posts the
    credentials from ``projectsetting``.

    NOTE(review): depends on names imported elsewhere in this file
    (``webdriver``, ``GetSystemMetrics``, ``dispatcher``, ``signals``,
    ``projectsetting``, ``Cookie``, ``ZhihuItem``, ``unquote``).  The
    ``GetSystemMetrics`` and ``os.system('start ...')`` calls are
    Windows-only — confirm target platform.
    """
    allowed_domains = ["www.zhihu.com"]
    host_url = "https://www.zhihu.com"
    start_urls = [
        "https://www.zhihu.com"
    ]
    # Static headers attached to every request this spider issues.
    headers = {
        'Connection': 'Keep-Alive',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Accept': 'text / html, application / xhtml + xml, image / jxr, * / *',
        'Accept-Language': 'zh-Hans-CN,zh-Hans;q=0.5',
        'Accept-Encoding': 'gzip, deflate',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2486.0 Safari/537.36 Edge/13.10586',
        'Host': 'www.zhihu.com',
        'Referer': 'https://www.zhihu.com/'
        # ' Cookie': '_za=5852b28b-399a-4bd8-8282-59070203151f; _xsrf=7d7cdde47226ee4e485a9cc9925f2715; __utmc=51854390; q_c1=73b48dcd9e84486f81814ea556dac319|1468220250000|1468220250000; l_cap_id=NjFiN2M2YzBmYmMwNDRmODk3ZGU3NTQ0ODllMzYyYzY=|1468827275|ccd88305461b2a3f2d9c38ec5c651e1bfcba81de; cap_id=ZGQ1MjFjMzM5MGI2NDY5ZmFjMGQ5NzMxODI2M2EzNWM=|1468827275|e44ef0b232dd85e4a62077a6a67e83ccbe963692; _zap=82d8c931-4ad6-464b-8e3f-2e430cce84e0; d_c0=AIBAeHJDNgqPTo5KKrizojLF6zLSb8c38qo=|1468220251; login=ZGNlMjUwYzNjNmMxNDI0N2I1YjQyMjVlMDM3YjMwN2Y=|1468827275|1c4c6a2dd0dec9d3948653906728e6ceb22154b2; __utma=51854390.1408714905.1468575990.1468819740.1468824510.5; __utmz=51854390.1468824510.5.4.utmcsr=zhihu.com|utmccn=(referral)|utmcmd=referral|utmcct=/topic/19552832/top-answers; __utmv=51854390.000--|2=registration_date=20130613=1^3=entry_date=20160711=1; __utmb=51854390.8.10.1468824510; n_c=1'
    }

    name = 'zhihu'
    _xsrf = ''  # CSRF token scraped from the sign-in form
    cookiejar = CookieJar()  # session cookie jar (replaced after login)
    driver = None  # PhantomJS webdriver, created in start_requests
    login_cookies = None  # dict of Cookie objects restored from disk
    login_cookies_dict = None  # JSON-serializable form of the same cookies

    # handle_httpstatus_list = [302]

    # rules = (
    #     Rule(SgmlLinkExtractor(allow=(r'/question/\d+',)), follow=True),
    #     Rule(SgmlLinkExtractor(allow=(r'/people/(\w+-?)+$',)), callback='parse_page'),
    # )

    def __init__(self):
        # NOTE(review): super(CrawlSpider, self) skips CrawlSpider.__init__
        # and calls the grandparent's — confirm this is intentional.
        super(CrawlSpider, self).__init__()
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def spider_closed(self, spider):
        """Signal handler invoked when the spider closes.

        NOTE(review): the Request built here is never scheduled (it is not
        returned or yielded), so the logout is effectively a no-op.
        """
        Request('http://www.zhihu.com/logout', method='GET', callback=self.logout)
        pass

    def logout(self):
        """Callback intended to run after the logout request."""
        print('退出成功')
        pass

    def start_requests(self):
        """Entry point: reuse cached cookies if still valid, otherwise drive
        PhantomJS through the sign-in page (captcha included) to log in."""
        _driver = webdriver.PhantomJS(service_log_path='./phantomjs.log')
        # size the headless window to the primary screen (Windows-only API)
        _driver.set_window_size(GetSystemMetrics(0), GetSystemMetrics(1))
        self.driver = _driver
        (cookies, expires) = self.getcookies()
        if expires < time.time():
            expires = False
            print('cookie过期')
        if cookies and expires:
            # cached cookies are still valid: crawl directly, skipping login
            self.cookiejar = CookieJar()
            for key in cookies:
                self.cookiejar.set_cookie(cookies[key])
            for url in self.start_urls:
                requset = Request(url, headers=self.headers,
                                  meta={'dont_merge_cookies': True, 'cookiejar': 1},
                                  callback=self.parse_page)
                self.cookiejar.add_cookie_header(requset)
                return [requset]
        _driver.get("https://www.zhihu.com/#signin")
        # wait = WebDriverWait(driver, 12)  # explicit wait
        time.sleep(8)  # wait for the page to finish loading
        _xsrf = _driver.find_element_by_xpath('//input[@name="_xsrf"]')
        _xsrf = _xsrf.get_attribute('value')
        print('_xsrf------->', _xsrf)
        input_wrapper = _driver.find_element_by_xpath('//div[@data-za-module="SignInForm"]')
        # iCaptcha = True
        # wait for the captcha widget to finish loading
        try:
            # input_captcha = wait.until(
            #     EC.presence_of_element_located((By.XPATH, './/div[@class="input-wrapper captcha-module"]')))
            input_captcha = input_wrapper.find_element_by_xpath('.//div[@class="input-wrapper captcha-module"]')
        except:
            try:
                # input_captcha = wait.until(
                #     EC.presence_of_element_located((By.XPATH, './/div[@class="iCaptcha input-wrapper"]')))
                # iCaptcha = False
                input_captcha = input_wrapper.find_element_by_xpath('.//div[@class="iCaptcha input-wrapper"]')
            except:
                input_captcha = None
        if input_captcha:
            hasShow = input_captcha.is_displayed()
        else:
            hasShow = False
        print(input_captcha, '-----captcha_url----->', hasShow)

        if hasShow:
            # a captcha is shown: download it before posting the login form
            # TODO: rework this - fetching the captcha URL directly makes the
            # server refresh the captcha, invalidating the one displayed
            captcha_url = input_wrapper.find_element_by_xpath('.//img').get_attribute('src')
            print('captcha_url---->', captcha_url)
            _driver.close()
            return [Request(captcha_url, headers=self.headers, callback=self.download_captcha, meta={'_xsrf': _xsrf})]
        else:
            _driver.close()
            return [self.post_login(_xsrf)]

    def download_captcha(self, response):
        """Save the captcha image, show it to the operator, read the typed
        captcha from stdin and continue with the login POST."""
        # save the captcha image to disk
        with open('captcha.gif', 'wb') as fp:
            fp.write(response.body)
        # open the captcha image with the default viewer (Windows 'start')
        os.system('start captcha.gif')
        # prompt the operator to type the captcha
        print('Please enter captcha: ')
        captcha = input()
        return self.post_login(response.meta['_xsrf'], captcha)

    def post_login(self, _xsrf, captcha=None):
        """Build the login FormRequest using credentials from projectsetting;
        *captcha* is included only when one was required."""
        formdata = {'_xsrf': _xsrf,
                    'password': projectsetting.PASS_WORD,  # your password
                    'captcha_type': 'cn',
                    'remember_me': 'true',
                    'email': projectsetting.USER_NAME}  # your account

        if captcha != None:
            formdata['captcha'] = captcha
        return FormRequest("https://www.zhihu.com/login/email", method='POST', headers=self.headers,
                           callback=self.login_result,
                           meta={'dont_merge_cookies': True},
                           formdata=formdata)  # your account

        pass

    def login_result(self, response):
        """Handle the login response: on success (r == 0) harvest the session
        cookies, persist them, and kick off the crawl of start_urls."""
        body = json.loads(response.body.decode('utf-8'))
        print('content---->', body)
        if body.get('r') != 0:
            return
        self.cookiejar = response.meta.setdefault('cookiejar', CookieJar())
        self.cookiejar.extract_cookies(response, response.request)
        self.savecookies(self.cookiejar._cookies)
        for url in self.start_urls:
            requset = Request(url, headers=self.headers,
                              meta={'dont_merge_cookies': True, 'cookiejar': 1},
                              callback=self.parse_page)
            yield requset
        pass

    def savecookies(self, cookies):
        """Flatten the nested cookie mapping into plain dicts and dump it to
        login_cookie.json; also caches the flat dict on the instance."""
        copyCookie = dict()
        with open('login_cookie.json', 'w') as cookiesfile:
            # recursively walk the domain -> path -> name nesting, collecting
            # every Cookie object keyed by its cookie name
            def convterall(cookies):
                for key in cookies.keys():
                    value = cookies.get(key)
                    if isinstance(value, Cookie):
                        copyCookie[key] = self.class2str(value)
                    elif isinstance(value, dict):
                        convterall(value)

            convterall(cookies)
            self.login_cookies_dict = copyCookie
            cookiesfile.write(json.dumps(copyCookie))
        pass

    def class2str(self, dictdata):
        """Return a shallow dict copy of an object's __dict__ (JSON-friendly)."""
        dic = {}
        dic.update(dictdata.__dict__)
        return dic
        pass

    def dict2cookie(self, cookie_dict):
        """Rebuild Cookie objects from the flat dicts stored on disk.

        NOTE(review): builds a 'Cookie(...)' source string and eval()s it —
        a security risk if login_cookie.json can be tampered with; consider
        constructing Cookie with keyword arguments instead.
        """
        result = {}
        for item in cookie_dict.items():
            param = ''
            for key in item[1]:
                value = item[1][key]
                if type(value) == str:
                    value = "'" + value + "'"
                if key[0] == '_':
                    # strip the leading underscore from private attribute names
                    key = key[1:]
                param += '{0}={1},'.format(key, value)
            param = param[0:-1]
            evalstr = 'Cookie({0})'.format(param)
            result[item[0]] = eval(evalstr)
        return result

    def getcookies(self):
        """Return a ``(cookies, expires)`` tuple: in-memory cookies if already
        loaded, else cookies restored from login_cookie.json, else (None, 0)."""
        expires = 0
        if self.login_cookies:
            # already loaded this session: report the first cookie's expiry
            for key in self.login_cookies:
                expires = self.login_cookies[key].expires
                break
            return (self.login_cookies, expires)
        if not os.path.exists('login_cookie.json'):
            return (None, 0)
        with open('login_cookie.json', encoding='utf-8') as cookiesfile:
            cookiesstr = cookiesfile.read()
            if cookiesstr == '' or cookiesstr == None:
                return (None, 0)
            cookies = json.loads(cookiesstr)
            self.login_cookies_dict = cookies
            self.login_cookies = self.dict2cookie(cookies)
            expires = 0
            if self.login_cookies:
                # use the first cookie that carries a non-None expiry
                for key in self.login_cookies:
                    expires = self.login_cookies[key].expires
                    if expires != None:
                        break
            return (self.login_cookies, expires)
        pass

    def parse_page(self, response):
        """Parse the logged-in homepage: truncate users.json, then follow the
        link to the logged-in user's own profile page."""
        with open('users.json', 'w') as user:
            user.write('')
        sel = Selector(response)
        href = sel.xpath('//ul[@id="top-nav-profile-dropdown"]/li[1]/a/@href').extract()[0]
        print('href----->', href)
        cookiejar = response.meta['cookiejar']
        request = Request(self.host_url + href, headers=self.headers, meta={'cookiejar': cookiejar},
                          callback=self.people_page)
        return request
        pass

    def people_page(self, response):
        """Scrape one profile page as an item, then use the webdriver to
        scroll through the followees list and recurse into each profile."""
        yield self.parse_item(response)
        sel = Selector(response)
        # followees and followers panel
        following = sel.xpath('//div[@class="zm-profile-side-following zg-clear"]')

        # TODO: recursively collect every valid user's following data
        followings = following.xpath('.//a/@href').extract()
        for follow_link in followings:
            # yield self.cookiejar_addcookies(response, url=follow_link, callback=self.followees_page) # calling it this way gets redirected; not solved yet
            self.webdriver_addcookies(follow_link)
            browerHeight = self.driver.execute_script('return document.body.scrollHeight;')
            while True:
                # do the scrolling until the page height stops growing
                self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(1)  # wait for the lazy-loaded data to arrive
                scrollHeight = self.driver.execute_script('return document.body.scrollHeight;')
                if browerHeight == scrollHeight:
                    break
                browerHeight = scrollHeight
            peoplelinks = self.driver.find_elements_by_xpath('//a[@class="zm-item-link-avatar"]')
            for link in peoplelinks:
                href = link.get_attribute('href') # some user links are missing here; to be investigated
                yield self.cookiejar_addcookies(response, url=href, callback=self.people_page)
            pass
        # followees = followings[0]  # followees link
        # followers = followings[1]  # followers link
        pass

    def webdriver_addcookies(self, url):
        """Inject the saved login cookies into the webdriver, then load *url*
        (relative URLs are resolved against host_url)."""
        for key in self.login_cookies_dict:
            cookie = self.login_cookies_dict[key]
            self.driver.add_cookie({k: cookie[k] for k in ['name', 'value', 'domain', 'path']})
        if url.find('http://') > -1 or url.find('https://') > -1:
            pass
        else:
            url = self.host_url + url
        self.driver.get(url)
        pass

    def cookiejar_addcookies(self, response, url, callback):
        """Build a Request to *url* (resolved against host_url when relative)
        that reuses the response's cookiejar and handles 302 manually."""
        cookiejar = response.meta['cookiejar']
        if url.find('http://') > -1 or url.find('https://') > -1:
            pass
        else:
            url = self.host_url + url
        request = Request(url, headers=self.headers,
                          dont_filter=True,
                          meta={'cookiejar': cookiejar, 'dont_redirect': True, 'handle_httpstatus_list': [302]},
                          callback=callback)
        # cookiejar.add_cookie_header(request)
        return request
        pass

    def followees_page(self, response):
        """Parse a followees listing: follow a 302 Location manually if
        present, then recurse into each linked profile."""
        if response.status in (302,) and 'Location' in response.headers:
            url = unquote(response.headers['Location'].decode('utf-8'))
            self.logger.debug(
                "(followees_page) Location header: %r" % response.urljoin(url))
            yield self.cookiejar_addcookies(response, response.urljoin(url),
                                            self.followees_page)
        sel = Selector(response)
        peoplelinks = sel.xpath('//a[@class="zm-item-link-avatar"]/@href').extract()
        for link in peoplelinks:
            yield self.cookiejar_addcookies(response, url=link, callback=self.people_page)
        pass

    def parse_item(self, response):
        """Build a ZhihuItem for one profile: skip inactive accounts (zero
        followees+followers), then scroll the followed-topics page in the
        webdriver and collect name/business/location plus topic stats."""
        sel = Selector(response)
        following = sel.xpath('//div[@class="zm-profile-side-following zg-clear"]')
        followees_followers = following.xpath('.//strong/text()').extract()
        count = 0
        for follow in followees_followers:
            count += int(follow)
        if count == 0:
            print('这是一个僵尸号:', response.url.replace(self.host_url + '/people/', ''))
            return
        topics_link = sel.xpath('//a[@class="zg-link-litblue"]/@href').extract()
        for topics in topics_link:
            if topics.find('topics') > -1:
                topics_link = topics
        print('topics->>>>>>>>>>>', topics_link)
        # open the followed-topics page in the webdriver
        self.webdriver_addcookies(topics_link)
        browerHeight = self.driver.execute_script('return document.body.scrollHeight;')
        while True:
            # do the scrolling until the page height stops growing
            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(1)  # wait for the lazy-loaded data to arrive
            scrollHeight = self.driver.execute_script('return document.body.scrollHeight;')
            if browerHeight == scrollHeight:
                break
            browerHeight = scrollHeight

        topic_list = self.driver.find_element_by_id('zh-profile-topic-list')
        item = ZhihuItem()
        item['name'] = self.driver.find_element_by_xpath('//a[@class="name"]').text
        try:
            item['business'] = self.driver.find_element_by_xpath('//span[@class="business item"]').get_attribute(
                'title')
        except:
            item['business'] = ''
        try:
            item['location'] = self.driver.find_element_by_xpath('//span[@class="location item"]').text
        except:
            item['location'] = ''
        topics = []
        topic_divs = topic_list.find_elements_by_xpath('./div')
        for topic in topic_divs:
            section = topic.find_element_by_xpath('./div[@class="zm-profile-section-main"]')
            links = section.find_elements_by_tag_name('a')
            topicdata = links[1]
            topic_id = os.path.basename(topicdata.get_attribute('href'))
            topic_name = topicdata.find_element_by_tag_name('strong').text
            topic_answers = int(links.pop().text.replace(' 个回答', ''))
            topics.append({'topic_id': topic_id, 'topic_name': topic_name, 'topic_answers': topic_answers})
        item['topics'] = topics
        # temporarily write to a file for easy inspection
        # with the item pipeline the data is only viewable once it finishes; too much data
        # with codecs.open('users.json', 'a', encoding='utf-8') as user:
        #     line = json.dumps(dict(item)) + ','
        #     user.write(line.encode('latin-1').decode('unicode_escape'))
        return item
Exemple #42
0
    def test_domain_allow(self):
        """Cookies are stored only for domains that are in allowed_domains and
        not blocked, and a cookie force-set for a non-allowed domain is never
        emitted by add_cookie_header."""
        c = CookieJar(policy=DefaultCookiePolicy(
            blocked_domains=["acme.com"],
            allowed_domains=["www.acme.com"]))

        # acme.com is blocked, so the cookie is rejected
        req = Request("http://acme.com/")
        headers = {"Set-Cookie": "CUSTOMER=WILE_E_COYOTE; path=/"}
        res = Response("http://acme.com/", headers=headers)
        c.extract_cookies(res, req)
        self.assertEqual(len(c), 0)

        # www.acme.com is explicitly allowed
        req = Request("http://www.acme.com/")
        res = Response("http://www.acme.com/", headers=headers)
        c.extract_cookies(res, req)
        self.assertEqual(len(c), 1)

        # www.coyote.com is not in allowed_domains, so nothing new is stored
        req = Request("http://www.coyote.com/")
        res = Response("http://www.coyote.com/", headers=headers)
        c.extract_cookies(res, req)
        self.assertEqual(len(c), 1)

        # set a cookie with non-allowed domain...
        req = Request("http://www.coyote.com/")
        res = Response("http://www.coyote.com/", headers=headers)
        cookies = c.make_cookies(res, req)
        c.set_cookie(cookies[0])
        self.assertEqual(len(c), 2)
        # ... and check it doesn't get returned
        c.add_cookie_header(req)
        self.assertNotIn('Cookie', req.headers)
Exemple #43
0
    def test_domain_block(self):
        """blocked_domains semantics: '.acme.com' blocks subdomains only,
        'acme.com' also blocks the bare domain; a cookie force-set for a
        blocked domain is never emitted by add_cookie_header."""
        pol = DefaultCookiePolicy(
            rfc2965=True, blocked_domains=[".acme.com"])

        c = CookieJar(policy=pol)
        headers = {'Set-Cookie': 'CUSTOMER=WILE_E_COYOTE; path=/'}

        req = Request("http://www.acme.com/")
        res = Response('http://www.acme.com/', headers=headers)
        c.extract_cookies(res, req)
        self.assertEqual(len(c), 0)

        # set_blocked_domains returns None, so don't bind its result
        pol.set_blocked_domains(["acme.com"])
        c.extract_cookies(res, req)
        self.assertEqual(len(c), 1)

        c.clear()
        req = Request("http://www.roadrunner.net/")
        res = Response("http://www.roadrunner.net/", headers=headers)
        c.extract_cookies(res, req)
        self.assertEqual(len(c), 1)
        req = Request("http://www.roadrunner.net/")
        c.add_cookie_header(req)
        self.assertIn('Cookie', req.headers)
        self.assertIn('Cookie2', req.headers)

        c.clear()
        pol.set_blocked_domains([".acme.com"])
        c.extract_cookies(res, req)
        self.assertEqual(len(c), 1)

        # set a cookie with blocked domain...
        req = Request("http://www.acme.com/")
        res = Response("http://www.acme.com/", headers=headers)
        cookies = c.make_cookies(res, req)
        c.set_cookie(cookies[0])
        self.assertEqual(len(c), 2)
        # ... and check it doesn't get returned
        c.add_cookie_header(req)
        self.assertNotIn('Cookie', req.headers)
Exemple #44
0
 def cookiejar_from_cookie_headers(headers):
     """Return a CookieJar populated from *headers* applied to a fixed
     example.com request/response pair."""
     request = Request("http://www.example.com/")
     response = Response("http://www.example.com/", headers=headers)
     jar = CookieJar()
     jar.extract_cookies(response, request)
     return jar
Exemple #45
0
    def test_netscape_example_1(self):
        """Reproduce the original example from the Netscape cookie spec
        (http://www.netscape.com/newsref/std/cookie_spec.html)."""
        #-------------------------------------------------------------------
        # First we check that it works for the original example at
        # http://www.netscape.com/newsref/std/cookie_spec.html

        # Client requests a document, and receives in the response:
        #
        #       Set-Cookie: CUSTOMER=WILE_E_COYOTE; path=/; expires=Wednesday, 09-Nov-99 23:12:40 GMT
        #
        # When client requests a URL in path "/" on this server, it sends:
        #
        #       Cookie: CUSTOMER=WILE_E_COYOTE
        #
        # Client requests a document, and receives in the response:
        #
        #       Set-Cookie: PART_NUMBER=ROCKET_LAUNCHER_0001; path=/
        #
        # When client requests a URL in path "/" on this server, it sends:
        #
        #       Cookie: CUSTOMER=WILE_E_COYOTE; PART_NUMBER=ROCKET_LAUNCHER_0001
        #
        # Client receives:
        #
        #       Set-Cookie: SHIPPING=FEDEX; path=/fo
        #
        # When client requests a URL in path "/" on this server, it sends:
        #
        #       Cookie: CUSTOMER=WILE_E_COYOTE; PART_NUMBER=ROCKET_LAUNCHER_0001
        #
        # When client requests a URL in path "/foo" on this server, it sends:
        #
        #       Cookie: CUSTOMER=WILE_E_COYOTE; PART_NUMBER=ROCKET_LAUNCHER_0001; SHIPPING=FEDEX
        #
        # The last Cookie is buggy, because both specifications say that the
        # most specific cookie must be sent first.  SHIPPING=FEDEX is the
        # most specific and should thus be first.

        # use next year so the CUSTOMER cookie is never already expired
        year_plus_one = time.localtime()[0] + 1

        c = CookieJar(DefaultCookiePolicy(rfc2965 = True))

        #req = Request("http://1.1.1.1/",
        #              headers={"Host": "www.acme.com:80"})
        req = Request("http://www.acme.com:80/", headers={"Host": "www.acme.com:80"})

        headers = Headers()
        headers['Set-Cookie'] = 'CUSTOMER=WILE_E_COYOTE; path=/ ; expires=Wednesday, 09-Nov-%d 23:12:40 GMT' % year_plus_one
        res = Response("http://www.acme.com/", headers=headers)
        c.extract_cookies(res, req)

        req = Request("http://www.acme.com/")
        c.add_cookie_header(req)

        self.assertEqual(req.headers.get("Cookie"), "CUSTOMER=WILE_E_COYOTE")
        self.assertEqual(req.headers.get("Cookie2"), '$Version="1"')

        headers.appendlist("Set-Cookie", "PART_NUMBER=ROCKET_LAUNCHER_0001; path=/")
        res = Response("http://www.acme.com/", headers=headers)
        c.extract_cookies(res, req)

        req = Request("http://www.acme.com/foo/bar")
        c.add_cookie_header(req)

        # assertTrue replaces the deprecated assert_ alias (removed in Py3.12)
        h = req.headers.get("Cookie")
        self.assertTrue("PART_NUMBER=ROCKET_LAUNCHER_0001" in h and
                        "CUSTOMER=WILE_E_COYOTE" in h)

        headers.appendlist('Set-Cookie', 'SHIPPING=FEDEX; path=/foo')
        res = Response("http://www.acme.com", headers=headers)
        c.extract_cookies(res, req)

        # SHIPPING is path-restricted to /foo, so it must not appear for /
        req = Request("http://www.acme.com/")
        c.add_cookie_header(req)

        h = req.headers.get("Cookie")
        self.assertTrue("PART_NUMBER=ROCKET_LAUNCHER_0001" in h and
                        "CUSTOMER=WILE_E_COYOTE" in h and
                        "SHIPPING=FEDEX" not in h)

        # for /foo/ all three cookies apply, most specific (SHIPPING) first
        req = Request("http://www.acme.com/foo/")
        c.add_cookie_header(req)

        h = req.headers.get("Cookie")
        self.assertTrue(("PART_NUMBER=ROCKET_LAUNCHER_0001" in h and
                         "CUSTOMER=WILE_E_COYOTE" in h and
                         h.startswith("SHIPPING=FEDEX;")))