def parse_resp(self, resp):
        """Route a response to detail parsing or schedule the remaining listing pages.

        Every visited URL is appended to the module-level ``request_list``.

        * URLs containing ``Product-Details`` yield ``self.parse_detail(resp)``.
        * URLs containing ``productCategory=`` are treated as listing pages:
          the hit count is read from the page and one postback ``FormRequest``
          is yielded for each page from 2 up to the last one, carrying the
          hidden ``__VIEWSTATE``-style form fields scraped from the page.

        Yields ``None`` when the listing fits on a single page, and nothing at
        all when no paging postback link can be found on the page.
        """
        global request_list
        request_list.append(resp.url)
        if 'Product-Details' in resp.url:
            yield self.parse_detail(resp)
        elif 'productCategory=' in resp.url:
            text = resp.text
            html = text.encode('utf-8')
            root = lxml.html.fromstring(html)

            # Work out how many result pages there are.
            search_result = root.xpath('//span[@class="SearchResult"]/text()')
            count = util.intval(search_result[0]) if search_result else 0
            # float() forces true division: with two ints Python 2 floors the
            # quotient first, so ceil() would silently drop the last page.
            pages = int(math.ceil(float(count) / self.limit))
            print("O*O" * 20)
            print(pages)
            if pages <= 1:
                yield None
                return

            # Paging parameter: the postback target named in the first pager link.
            page_list = root.xpath('//tr[@class="Paging"]//a/@href')
            post_back_pattern = re.compile(r"'([^']+)','([^']+)'")
            match = post_back_pattern.search(page_list[0]) if page_list else None
            if match is None:
                # Without a postback target no paging form can be built.
                print("no paging postback target found for %s" % resp.url)
                return
            post_data = match.group(1)

            # Event parameter ctl00$scr: the panel id followed by '|'.
            # Searched in the decoded text so the str pattern also works on
            # Python 3, where the encoded page is a bytes object.
            match = re.search(r'(ctl00[^\"\',]+outerPanelPanel)', text)
            src = match.group() + '|' if match else ''

            form_data = {}

            # Event parameter __VIEWSTATE
            field1 = root.xpath('//input[@id="__VIEWSTATE"]/@value')
            form_data['__VIEWSTATE'] = field1[0] if field1 else ''

            # Event parameter __VIEWSTATEGENERATOR
            field2 = root.xpath('//input[@id="__VIEWSTATEGENERATOR"]/@value')
            form_data['__VIEWSTATEGENERATOR'] = field2[0] if field2 else ''

            # Event parameter __VIEWSTATEENCRYPTED: the request fails when
            # this field is missing, even though it is empty.
            form_data['__VIEWSTATEENCRYPTED'] = ''

            # Event parameter __EVENTVALIDATION
            field3 = root.xpath('//input[@id="__EVENTVALIDATION"]/@value')
            form_data['__EVENTVALIDATION'] = field3[0] if field3 else ''

            # These two fields are identical for every page.
            form_data['ctl00$scr'] = src + post_data
            form_data['__EVENTTARGET'] = post_data

            # Put the AJAX headers on a copy so the spider-wide headers used
            # by every other request are left untouched.
            ajax_headers = copy.copy(self.headers)
            ajax_headers.update({
                'Content-Type':
                'application/x-www-form-urlencoded; charset=UTF-8',
                'X-MicrosoftAjax': 'Delta=true',
                'Accept': '*/*',
            })

            # Build one paging form per remaining page. No callback is given,
            # so Scrapy hands the responses to the spider's default callback.
            for page_num in range(2, pages + 1):
                page_form = dict(form_data)
                page_form['__EVENTARGUMENT'] = 'Page${page_num}'.format(
                    page_num=page_num)
                yield FormRequest(url=resp.url,
                                  headers=ajax_headers,
                                  formdata=page_form,
                                  meta={
                                      'next_page': True,
                                      'page': page_num
                                  })
    def get_details(self, response):
        """Scrape the vehicle fields from the result page, or start captcha solving.

        When the page embeds the Google reCAPTCHA script, a ``userrecaptcha``
        job is posted to ``self.captcha_in_url`` (handled by
        ``self.get_captcha_id``) and nothing else happens.

        Otherwise each labelled field is read from the page, a ``FuRegnrItem``
        is yielded, and, once the consumer resumes the generator, the row is
        printed, handed to ``self.insert_row`` and
        ``self.total_scraping_done`` is set. A field that is not on the page
        is stored as an empty string.
        """
        if response.xpath(
                '//script[contains(@src, "https://www.google.com/recaptcha/api")]/@src'
        ):
            print(f"\t[{self.Registreringsnummer}] Captcha is found")

            formdata = {
                'key': self.api_key,
                'method': 'userrecaptcha',
                'googlekey': self.google_key,
                'pageurl': self.post_url,
                'proxytype': 'http'
            }
            yield FormRequest(url=self.captcha_in_url,
                              method='POST',
                              formdata=formdata,
                              headers=make_headers_1(),
                              callback=self.get_captcha_id,
                              errback=self.fail_captcha_id,
                              dont_filter=True,
                              meta={})
            return

        def text_at(query, index):
            """Return the index-th non-blank text node matched by query, or ''."""
            texts = [
                text.strip() for text in response.xpath(query).extract()
                if text.strip()
            ]
            try:
                return texts[index]
            except IndexError:
                # The label is missing or has too few text nodes.
                return ''

        # (item key, XPath of the candidate text nodes, index of the wanted node)
        # Some labels are matched on an ASCII fragment only ("kringsbolag",
        # "Senast god"), presumably to avoid non-ASCII letters in the XPath;
        # confirm against the page markup.
        field_specs = (
            ('Försäkringsbolag',
             '//strong[contains(text(), "kringsbolag")]/../text()', 0),
            ('Försäkringsdatum',
             '//strong[contains(text(), "kringsdatum")]/../text()', 0),
            ('Fordonsstatus',
             '//a[@href="#ts-fordonsstatus"]/../../text()', 0),
            # Same label as Besiktigas_senast below, but the second-to-last
            # text node instead of the last one.
            ('Besiktigas_senast_8',
             '//strong[contains(text(), "Besiktigas senast")]/../text()', -2),
            ('Upplysningar',
             '//strong[contains(text(), "Upplysningar")]/../text()', 0),
            ('Import_införsel',
             '//a[@href="#ts-import"]/../../text()', 0),
            ('Besiktigas_senast',
             '//strong[contains(text(), "Besiktigas senast")]/../text()', -1),
            ('Senast_godkända_besiktning',
             '//strong[contains(text(), "Senast god") and contains(text(), "besiktning")]/../text()',
             0),
            ('Mätarställning',
             '//a[@href="#ts-matarstallning"]/../../text()', 0),
        )
        values = [text_at(query, index) for _, query, index in field_specs]

        item = FuRegnrItem()
        item['Registreringsnummer'] = self.Registreringsnummer
        for (key, _, _), value in zip(field_specs, values):
            item[key] = value

        yield item

        # The row has the same column order as the item fields above.
        result_row = [self.Registreringsnummer] + values
        self.total_cnt += 1
        print("\t[Result {}] {}".format(self.total_cnt, result_row))
        self.insert_row(result_row=result_row)

        self.total_scraping_done = True
# Example #3
0
 def test_request_class(self):
     """Check that a FormRequest and a CustomRequest both pass the serialization check."""
     for request_cls in (FormRequest, CustomRequest):
         request = request_cls("http://www.example.com")
         self._assert_serializes_ok(request, spider=self.spider)
    def get_recaptchaClientToken(self, response):
        """Handle one poll of the captcha result URL.

        The response body is split on ``|``:

        * First field ``OK``: the second field is stored as
          ``self.recaptchaClientToken`` and the lookup form is posted to
          ``self.post_url`` (handled by ``self.get_details``).
        * Anything else: the attempt counter from ``response.meta['resp_cnt']``
          is incremented. While it is below ``self.max_resp_cnt`` the method
          sleeps ``self.resp_time`` and polls the result URL again; once the
          limit is reached a fresh ``userrecaptcha`` job is posted to
          ``self.captcha_in_url`` (handled by ``self.get_captcha_id``).
        """
        attempt = response.meta['resp_cnt']

        print("\t[{}] {}".format(attempt, response.text))
        reply = response.text.split('|')

        if reply[0] == 'OK':
            self.recaptchaClientToken = reply[1]

            browser_headers = {
                'user-agent':
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36',
                'Accept':
                'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
                'Content-Type': 'application/x-www-form-urlencoded',
                'Sec-Fetch-Mode': 'navigate',
                'Sec-Fetch-Site': 'same-origin',
                'Sec-Fetch-User': '******',
                'cookie': self.cookie
            }
            # Keep the plain attribute access on the double-underscore name:
            # if this method lives in a class body Python name-mangles it,
            # which a string-based getattr() would not do.
            lookup_form = {
                '__RequestVerificationToken': self.__RequestVerificationToken,
                'Registreringsnummer': self.Registreringsnummer,
                'recaptchaClientToken': self.recaptchaClientToken,
                'Captcha.CaptchaResponse': ''
            }
            yield FormRequest(url=self.post_url,
                              method='POST',
                              headers=browser_headers,
                              formdata=lookup_form,
                              callback=self.get_details,
                              errback=self.fail_details,
                              dont_filter=True,
                              meta={})
            return

        attempt += 1
        if attempt < self.max_resp_cnt:
            # NOTE(review): sleep is presumably time.sleep; if so it blocks the
            # calling thread for the whole wait - confirm this is acceptable
            # for the crawler's event loop.
            sleep(self.resp_time)
            poll_url = self.captcha_res_url.format(self.captcha_id)
            yield FormRequest(url=poll_url,
                              method='GET',
                              headers=make_headers_1(),
                              callback=self.get_recaptchaClientToken,
                              errback=self.fail_recaptchaClientToken,
                              dont_filter=True,
                              meta={
                                  'resp_cnt': attempt,
                              })
            return

        # Too many polls without a result: submit a new captcha job.
        new_job = {
            'key': self.api_key,
            'method': 'userrecaptcha',
            'googlekey': self.google_key,
            'pageurl': self.post_url,
            'proxytype': 'http'
        }
        yield FormRequest(url=self.captcha_in_url,
                          method='POST',
                          formdata=new_job,
                          headers=make_headers_1(),
                          callback=self.get_captcha_id,
                          errback=self.fail_captcha_id,
                          dont_filter=True,
                          meta={})