def parse_resp(self, resp):
    """Dispatch a response: route product-detail pages to ``parse_detail``
    and, for category listing pages, emit one ASP.NET post-back
    ``FormRequest`` per additional result page.

    :param resp: scrapy Response for a product-detail or category page.
    """
    global request_list
    request_list.append(resp.url)
    if 'Product-Details' in resp.url:
        yield self.parse_detail(resp)
    elif 'productCategory=' in resp.url:
        html = resp.text
        root = lxml.html.fromstring(html)
        # Total result count -> number of pages.
        search_result = root.xpath('//span[@class="SearchResult"]/text()')
        count = util.intval(search_result[0]) if search_result else 0
        # True division before ceil: integer floor division would
        # under-count pages (25 results / limit 10 must give 3 pages).
        pages = int(math.ceil(count / float(self.limit)))
        print("O*O" * 20)
        print(pages)
        if pages <= 1:
            return

        # __doPostBack target parsed out of the pager links, e.g.
        # javascript:__doPostBack('target','argument').
        page_list = root.xpath('//tr[@class="Paging"]//a/@href')
        post_back_pattern = re.compile(r'\'([^\']+)\',\'([^\']+)\'')
        match = post_back_pattern.search(page_list[0]) if page_list else None
        if match is None:
            # No recognizable post-back target: nothing to paginate.
            return
        post_data = match.group(1)

        # ScriptManager parameter ctl00$scr (UpdatePanel id + '|').
        match = re.search(r'(ctl00[^\"\',]+outerPanelPanel)', html)
        src = match.group() + '|' if match else ''

        # Hidden ASP.NET state fields required by the post-back.
        form_data = {}
        field1 = root.xpath('//input[@id="__VIEWSTATE"]/@value')
        form_data['__VIEWSTATE'] = field1[0] if field1 else ''
        field2 = root.xpath('//input[@id="__VIEWSTATEGENERATOR"]/@value')
        form_data['__VIEWSTATEGENERATOR'] = field2[0] if field2 else ''
        # Must be present (even empty) or the server rejects the request.
        form_data['__VIEWSTATEENCRYPTED'] = ''
        field3 = root.xpath('//input[@id="__EVENTVALIDATION"]/@value')
        form_data['__EVENTVALIDATION'] = field3[0] if field3 else ''

        # Copy before updating: mutating self.headers in place would leak
        # these AJAX-only headers into every other request of the spider.
        # The update is loop-invariant, so build the dict once.
        _headers = dict(self.headers)
        _headers.update({
            'Content-Type':
                'application/x-www-form-urlencoded; charset=UTF-8',
            'X-MicrosoftAjax': 'Delta=true',
            'Accept': '*/*'
        })

        # One post-back form per remaining page (2..pages).
        for x in range(2, pages + 1):
            form_data.update({
                'ctl00$scr': src + post_data,
                '__EVENTTARGET': post_data,
                '__EVENTARGUMENT': 'Page${page_num}'.format(page_num=x),
            })
            # deepcopy: form_data is mutated on the next iteration before
            # the scheduler serializes this request.
            yield FormRequest(url=resp.url,
                              headers=_headers,
                              formdata=copy.deepcopy(form_data),
                              meta={
                                  'next_page': True,
                                  'page': x
                              })
def get_details(self, response):
    """Scrape the vehicle-details page, or restart the captcha flow.

    If the page embeds the Google reCAPTCHA script, submit a new solving
    job to the captcha-in endpoint and stop. Otherwise extract the
    vehicle fields, yield a ``FuRegnrItem``, and persist the row via
    ``self.insert_row``.

    :param response: scrapy Response for the vehicle-details page.
    """
    if response.xpath(
            '//script[contains(@src, "https://www.google.com/recaptcha/api")]/@src'
    ):
        print(f"\t[{self.Registreringsnummer}] Captcha is found")
        formdata = {
            'key': self.api_key,
            'method': 'userrecaptcha',
            'googlekey': self.google_key,
            'pageurl': self.post_url,
            # 'proxy': 'http://1f7a28e9aa7446c491a11b8328a5ced7:@proxy.crawlera.com:8010/',
            'proxytype': 'http'
        }
        yield FormRequest(url=self.captcha_in_url,
                          method='POST',
                          formdata=formdata,
                          headers=make_headers_1(),
                          callback=self.get_captcha_id,
                          errback=self.fail_captcha_id,
                          dont_filter=True,
                          meta={})
        return

    def first_text(xpath, index=0):
        # Stripped, non-empty text node at *index* for *xpath*; returns
        # '' when the node list is too short (narrowed from bare except:).
        texts = [elm.strip()
                 for elm in response.xpath(xpath).extract() if elm.strip()]
        try:
            return texts[index].strip()
        except IndexError:
            return ''

    besiktigas_xpath = \
        '//strong[contains(text(), "Besiktigas senast")]/../text()'

    Försäkringsbolag = first_text(
        '//strong[contains(text(), "kringsbolag")]/../text()')
    Försäkringsdatum = first_text(
        '//strong[contains(text(), "kringsdatum")]/../text()')
    Fordonsstatus = first_text('//a[@href="#ts-fordonsstatus"]/../../text()')
    # Two values share the "Besiktigas senast" label: [-2] is the first
    # of the pair, [-1] (below) the second.
    Besiktigas_senast_8 = first_text(besiktigas_xpath, -2)
    Upplysningar = first_text(
        '//strong[contains(text(), "Upplysningar")]/../text()')
    Import_införsel = first_text('//a[@href="#ts-import"]/../../text()')
    Besiktigas_senast = first_text(besiktigas_xpath, -1)
    Senast_godkända_besiktning = first_text(
        '//strong[contains(text(), "Senast god") and contains(text(), "besiktning")]/../text()')
    Mätarställning = first_text(
        '//a[@href="#ts-matarstallning"]/../../text()')

    item = FuRegnrItem()
    item['Registreringsnummer'] = self.Registreringsnummer
    item['Försäkringsbolag'] = Försäkringsbolag
    item['Försäkringsdatum'] = Försäkringsdatum
    item['Fordonsstatus'] = Fordonsstatus
    item['Besiktigas_senast_8'] = Besiktigas_senast_8
    item['Upplysningar'] = Upplysningar
    item['Import_införsel'] = Import_införsel
    item['Besiktigas_senast'] = Besiktigas_senast
    item['Senast_godkända_besiktning'] = Senast_godkända_besiktning
    item['Mätarställning'] = Mätarställning
    yield item

    result_row = [
        self.Registreringsnummer, Försäkringsbolag, Försäkringsdatum,
        Fordonsstatus, Besiktigas_senast_8, Upplysningar, Import_införsel,
        Besiktigas_senast, Senast_godkända_besiktning, Mätarställning
    ]
    self.total_cnt += 1
    print("\t[Result {}] {}".format(self.total_cnt, result_row))
    self.insert_row(result_row=result_row)
    self.total_scraping_done = True
def test_request_class(self):
    """Both the stock FormRequest and the CustomRequest subclass must
    round-trip through serialization for this spider."""
    for request_cls in (FormRequest, CustomRequest):
        request = request_cls("http://www.example.com")
        self._assert_serializes_ok(request, spider=self.spider)
def get_recaptchaClientToken(self, response):
    """Poll the captcha-solver result endpoint.

    On 'OK' the solved token is POSTed to the target form; otherwise the
    poll is retried up to ``self.max_resp_cnt`` times, after which a
    brand-new solving job is submitted.

    :param response: scrapy Response from the captcha result endpoint,
        carrying the retry counter in ``meta['resp_cnt']``.
    """
    resp_cnt = response.meta['resp_cnt']
    print("\t[{}] {}".format(resp_cnt, response.text))

    if response.text.split('|')[0] == 'OK':
        # Solver answered: submit the token with the verification form.
        self.recaptchaClientToken = response.text.split('|')[1]
        submit_headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
            'Content-Type': 'application/x-www-form-urlencoded',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'same-origin',
            'Sec-Fetch-User': '******',
            'cookie': self.cookie
        }
        submit_payload = {
            '__RequestVerificationToken': self.__RequestVerificationToken,
            'Registreringsnummer': self.Registreringsnummer,
            'recaptchaClientToken': self.recaptchaClientToken,
            'Captcha.CaptchaResponse': ''
        }
        yield FormRequest(url=self.post_url,
                          method='POST',
                          headers=submit_headers,
                          formdata=submit_payload,
                          callback=self.get_details,
                          errback=self.fail_details,
                          dont_filter=True,
                          meta={})
        return

    resp_cnt += 1
    if resp_cnt >= self.max_resp_cnt:
        # Retry budget exhausted: start a fresh solving job.
        new_job_form = {
            'key': self.api_key,
            'method': 'userrecaptcha',
            'googlekey': self.google_key,
            'pageurl': self.post_url,
            # 'proxy': 'http://1f7a28e9aa7446c491a11b8328a5ced7:@proxy.crawlera.com:8010/',
            'proxytype': 'http'
        }
        yield FormRequest(url=self.captcha_in_url,
                          method='POST',
                          formdata=new_job_form,
                          headers=make_headers_1(),
                          callback=self.get_captcha_id,
                          errback=self.fail_captcha_id,
                          dont_filter=True,
                          meta={})
    else:
        # Not solved yet: wait, then poll the same job id again.
        sleep(self.resp_time)
        poll_url = self.captcha_res_url.format(self.captcha_id)
        yield FormRequest(url=poll_url,
                          method='GET',
                          headers=make_headers_1(),
                          callback=self.get_recaptchaClientToken,
                          errback=self.fail_recaptchaClientToken,
                          dont_filter=True,
                          meta={
                              'resp_cnt': resp_cnt,
                          })