def test_splash_form_request():
    """POST formdata is urlencoded into the body; GET formdata joins the URL."""
    req = SplashFormRequest('http://example.com', formdata={'foo': 'bar'})
    assert req.method == 'POST'
    assert req.body == b'foo=bar'
    assert req.meta['splash']['args']['url'] == 'http://example.com'

    req = SplashFormRequest('http://example.com', method='GET',
                            formdata={'foo': 'bar'}, endpoint='execute')
    assert req.method == 'GET'
    assert req.body == b''
    # For GET the Splash arg url must track the request url with the query.
    assert req.url == req.meta['splash']['args']['url'] == \
        'http://example.com?foo=bar'
    assert req.meta['splash']['endpoint'] == 'execute'
def parse(self, response):
    """Submit the site's login form through Splash once credentials exist."""
    if not self.credentials():
        self.logger.error('add the credentials first!')
        return
    login_fields = {
        'txtUser': self.txtUser,
        'txtPassword': self.txtPassword,
        'txtDependencia': self.txtDependencia,
        'ComboTipoDep': self.ComboTipoDep,
        'btnAceptar': self.btnAceptar,
    }
    splash_args = {
        'html': 1,
        'lua_source': self.getFileContentAsString(self.lua_dir + 'init.lua'),
        'wait': 5,
    }
    yield SplashFormRequest.from_response(
        response,
        formdata=login_fields,
        callback=self.parse_login,
        endpoint='execute',
        cache_args=['lua_source'],
        session_id='dummy',
        args=splash_args)
def start_requests(self):
    """Build the initial request for every start URL.

    Chooses plain scrapy vs. Splash-rendered requests via ``self.dynamic``
    and GET vs. POST via ``self.method``.
    """
    use_post = self.method.lower() == 'post'  # invariant, hoisted out of the loop
    for url in self.start_urls:
        if not self.dynamic:
            if use_post:
                request = FormRequest(url=url, formdata=self.form,
                                      headers=self.headers,
                                      cookies=self.cookies,
                                      callback=self.parse_first,
                                      dont_filter=True)
            else:
                request = Request(url=url, headers=self.headers,
                                  cookies=self.cookies,
                                  callback=self.parse_first)
        else:
            if use_post:
                request = SplashFormRequest(url=url, formdata=self.form,
                                            callback=self.parse_first,
                                            dont_filter=False,
                                            args=self.args_data)
            else:
                request = SplashRequest(url, callback=self.parse_first,
                                        endpoint='execute',
                                        args=self.args_data)
        yield request
def parse_subject(self, response):
    """Yield an item per subject <li>, then a Splash request per subject id."""
    selector = Selector(text=response.body)
    subject_nodes = selector.xpath('//li')
    field_id = response.meta['fieldId']
    if len(subject_nodes):
        for node in subject_nodes:
            item = subjectItem()
            item['id'] = node.xpath('@id').extract()[0]
            # Strip the private-use glyph the site embeds in names.
            item['name'] = re.sub('\ue6a2', '', node.xpath('./text()').extract()[0])
            item['fieldId'] = field_id
            yield item
        for node in subject_nodes:
            subject_id = node.xpath('@id').extract()[0]
            yield SplashFormRequest(
                self.base_major_url,
                formdata={'method': 'subCategoryXk', 'key': subject_id},
                callback=self.parse_major,
                meta={'subjectId': subject_id})
def login_me(self, response):
    """Solve the reCAPTCHA and post the login form via Splash."""
    verification_token = response.selector.xpath(
        "//form[@id='login-form']//input[@name='__RequestVerificationToken']/@value"
    ).get("")
    # The ReCaptcha site key lives on the widget container.
    sitekey = response.selector.xpath(
        "//div[@class='g-recaptcha']/@data-sitekey").get("")
    gcaptcha_txt = self.solve_captcha(sitekey, response.url)
    if not gcaptcha_txt:
        return
    frm_data = {
        'Empresa': '1',
        'Email': self.e_mail,
        'Senha': self.senha,
        'g-recaptcha-response': gcaptcha_txt,
        '__RequestVerificationToken': verification_token,
    }
    print(frm_data)
    yield SplashFormRequest(self.start_url,
                            formdata=frm_data,
                            callback=self.get_main_page,
                            errback=self.errback_func,
                            endpoint='execute',
                            cache_args=['lua_source'],
                            args={'lua_source': script_10_sec_wait},
                            dont_filter=True)
def login_me(self, response):
    """Copy the 'Entrar' form inputs, override the credential fields and
    submit the form through Splash."""
    frm_data = {}
    for field in response.selector.xpath("//form[.//input[@value='Entrar']]//input"):
        name = field.xpath("./@name").get("")
        value = field.xpath("./@value").get("")
        # Overrides for specific inputs, matched by substring of their name.
        if "urlRedirectLogin" in name:
            value = "/wps/portal/portaldetran/cidadao/infracoes/servicos/consultaMultas"
        elif "modalMensagem" in name:
            value = ("Para realizar a pesquisa de débitos e restrições de veículos do proprietário,<br /> "
                     "responda algumas perguntas, acesse com seu CPF e senha ou cadastre-se abaixo:")
        elif "numeroLogin" in name:
            value = self.cpf_cnpj
        elif "senhaLogin" in name:
            value = self.senha
        frm_data[name] = value
    url = "http://www.detran.sp.gov.br" + frm_data['javax.faces.encodedURL']
    yield SplashFormRequest(url,
                            formdata=frm_data,
                            callback=self.set_renavam,
                            errback=self.errback_func,
                            endpoint='execute',
                            cache_args=['lua_source'],
                            args={'lua_source': script_10_sec_wait},
                            dont_filter=True)
def parse_field(self, response):
    """Yield an item per field <li>, then a Splash request per field id."""
    selector = Selector(text=response.body)
    field_nodes = selector.xpath('//li')
    degree_id = response.meta['degreeId']
    if len(field_nodes):
        for node in field_nodes:
            item = fieldItem()
            item['id'] = node.xpath('@id').extract()[0]
            # Strip the private-use glyph the site embeds in names.
            item['name'] = re.sub('\ue6a2', '', node.xpath('./text()').extract()[0])
            item['degreeId'] = degree_id
            yield item
        for node in field_nodes:
            field_id = node.xpath('@id').extract()[0]
            yield SplashFormRequest(
                self.base_major_url,
                formdata={'method': 'subCategoryMl', 'key': field_id},
                callback=self.parse_subject,
                meta={'fieldId': field_id})
def parse_degree(self, response):
    """Yield an item per degree entry, then request each degree's fields."""
    soup = BeautifulSoup(response.body, 'lxml')
    listing = soup.find('div', attrs={'class': 'zyk-list'})
    entries = listing.find('ul', attrs={'class': 'zyk-cc-ul'}).findAll('li')
    for entry in entries:
        item = degreeItem()
        item['id'] = entry.attrs['id']
        # Strip the private-use glyph the site embeds in names.
        item['name'] = re.sub('\ue6a2', '', entry.text)
        yield item
    for entry in entries:
        degree_id = entry.attrs['id']
        yield SplashFormRequest(
            self.base_major_url,
            formdata={'method': 'subCategoryMl', 'key': degree_id},
            callback=self.parse_field,
            meta={'degreeId': degree_id})
def make_request_from_data(self, data):
    """Turn one serialized entry from the redis key into a scrapy request.

    :param data: raw bytes popped from redis (JSON-encoded ScheduledRequest kwargs)
    :return: a Request/FormRequest, or their Splash variants when the meta
        carries a 'splash' section
    :raises OSError: when the named callback cannot be resolved to a callable
    """
    scheduled = ScheduledRequest(
        **json.loads(bytes_to_str(data, self.redis_encoding)))
    callback, dont_filter = self.get_callback(scheduled.callback)
    if not callable(callback):
        raise OSError(f"{scheduled.callback}没有指定回调函数")
    params = {
        'url': scheduled.url,
        'method': scheduled.method,
        'meta': scheduled.meta,
        'dont_filter': dont_filter,
        'callback': callback,
    }
    is_post = scheduled.method == "POST"
    if 'splash' in scheduled.meta:
        splash_meta = scheduled.meta.get('splash')
        params['args'] = {
            'wait': splash_meta.get('wait', 2),
            # Images are skipped by default to speed up rendering.
            'images': splash_meta.get('images', 0),
        }
        if is_post:
            return SplashFormRequest(formdata=scheduled.body, **params)
        return SplashRequest(**params)
    if is_post:
        return FormRequest(formdata=scheduled.body, **params)
    return Request(**params)
def preparation(self, resp):
    """Re-submit the ASP.NET report form, asking the report viewer to
    export the current report as CSV.

    The DevExpress callback-state values must be echoed back verbatim;
    they are opaque JSON blobs captured from a real browser session.
    Fix: those JSON values were written as double-quoted literals with
    unescaped inner double quotes (a SyntaxError); they are now
    single-quoted so the embedded double quotes are legal.
    """
    form = resp.xpath(u'//*[@id="aspnetForm"]')
    form_id = form.xpath(u"./@id").extract_first()
    formdata = {
        # 'saveToWindow=format:html;' would export HTML instead of CSV.
        u"__EVENTARGUMENT": u"saveToWindow=format:csv;",
        u"__EVENTTARGET": u"ctl00$ctl00$ctl00$MainContent$MainContent$ReportOutputPlaceHolder$uxTabControl$uxReportViewer",
        u"ctl00_ctl00_ctl00_MainContent_MainContent_ReportOutputPlaceHolder_uxTabControl_uxReportToolbar_Menu_ITCNT5_PageNumber_VI": u"1",
        u"ctl00$ctl00$ctl00$MainContent$MainContent$ReportOutputPlaceHolder$uxTabControl$uxReportToolbar$Menu$ITCNT5$PageNumber$DDD$L": u"1",
        u"ctl00$ctl00$ctl00$MainContent$MainContent$ReportOutputPlaceHolder$uxTabControl$uxReportToolbar$Menu$ITCNT5$PageNumber": u"1",
        u"ctl00$ctl00$ctl00$MainContent$MainContent$ReportOutputPlaceHolder$uxTabControl$uxReportToolbar$Menu$ITCNT6$PageCount": u"25",
        u"ctl00_ctl00_ctl00_MainContent_MainContent_ReportOutputPlaceHolder_uxTabControl_uxReportToolbar_Menu_ITCNT11_SaveFormat_VI": u"csv",
        u"ctl00$ctl00$ctl00$MainContent$MainContent$ReportOutputPlaceHolder$uxTabControl$uxReportToolbar$Menu$ITCNT11$SaveFormat$DDD$L": u"csv",
        u"ctl00$ctl00$ctl00$MainContent$MainContent$ReportOutputPlaceHolder$uxTabControl$uxReportToolbar$Menu$ITCNT11$SaveFormat": u"Csv",
        u"ctl00$ctl00$ctl00$MainContent$MainContent$ReportOutputPlaceHolder$uxTabControl$uxReportToolbar$Menu$ITCNT5$PageNumber$DDDState": u'{"windowsState":"0:0:-1:0:0:0:-10000:-10000:1:0:0:0"}',
        u"ctl00$ctl00$ctl00$MainContent$MainContent$ReportOutputPlaceHolder$uxTabControl$uxReportToolbar$Menu$ITCNT5$PageNumber$DDD$L$State": u'{"CustomCallback":""}',
        u"ctl00$ctl00$ctl00$MainContent$MainContent$ReportOutputPlaceHolder$uxTabControl$uxReportToolbar$Menu$ITCNT11$SaveFormat$DDDState": u'{"windowsState":"0:0:-1:430:140:1:68:165:1:0:0:0"}',
        u"ctl00$ctl00$ctl00$MainContent$MainContent$ReportOutputPlaceHolder$uxTabControl": u'{"activeTabIndex":1}',
        u"ctl00$ctl00$ctl00$MainContent$MainContent$ReportOutputPlaceHolder$uxTabControl$uxReportToolbar$Menu": u'{"selectedItemIndexPath":"","checkedState":""}',
        u"ctl00$ctl00$ctl00$MainContent$MainContent$ReportOutputPlaceHolder$uxTabControl$uxReportViewer": u'{"drillDown":{},"parameters":{},"cacheKey":"","currentPageIndex":0}',
        u"ctl00$ctl00$ctl00$MainContent$MainContent$ReportOutputPlaceHolder$uxTabControl$uxErrorGrid": u'{"keys":[],"callbackState":"BwQHAwIERGF0YQcnAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABwAHAAcAAgtGb3JtYXRTdGF0ZQcAAgVTdGF0ZQc/BwAHAAcABwAHAAIABQAAAIAJAgtUaW1lT2ZFcnJvcgcACQIAAgADBwQCAAcAAgEHAAcAAgACAQcABwAHAAcAAg1TaG93RmlsdGVyUm93CgIB","selection":""}',
    }
    yield SplashFormRequest.from_response(
        response=resp,
        formid=form_id,
        formdata=formdata,
        callback=self.get_station_data,
        dont_click=True,
        endpoint=u"execute",
        cache_args=[u"lua_source"],
        args={
            u"http_method": u"POST",
            u"headers": {
                u"Content-Type": u"application/x-www-form-urlencoded",
            },
            u"lua_source": script,
        })
def parse(self, response):
    """Log in by posting the page's form together with its CSRF token."""
    csrf_token = response.xpath('//input[@name="csrf_token"]/@value').get()
    credentials = {
        'csrf_token': csrf_token,
        'username': '******',
        'password': '******',
    }
    yield SplashFormRequest.from_response(response,
                                          formxpath='//form',
                                          formdata=credentials,
                                          callback=self.after_login)
def start_requests(self):
    """Kick off the crawl by signing in through Splash."""
    sign_in_url = self.start_url + '/login/sign_in'
    yield SplashFormRequest(sign_in_url,
                            formdata={"email": self.e_mail,
                                      "password": self.senha},
                            callback=self.sign_in_me,
                            errback=self.errback_func,
                            endpoint='execute',
                            cache_args=['lua_source'],
                            args={'lua_source': script_10_sec_wait},
                            dont_filter=True)
def parse(self, response):
    """Demonstrate SplashFormRequest usage.

    Splash-related request meta (everything under ``meta['splash']``):
      * ``args``       — parameters forwarded to Splash.
      * ``endpoint``   — Splash endpoint; defaults to ``render.html``.
      * ``splash_url`` — overrides the Splash URL configured in settings.py.
      * ``splash_headers`` — add/override HTTP headers sent to the Splash
        server itself (not the headers sent to the remote web site).
      * ``dont_send_headers`` — set True to avoid passing headers to Splash.
      * ``slot_policy`` — customizes synchronization of Splash requests.
      * ``dont_process_response`` — when True, SplashMiddleware returns the
        default scrapy.Response untouched instead of a SplashResponse
        subclass such as SplashTextResponse.
      * ``magic_response`` — True by default; Splash auto-populates response
        attributes such as response.headers and response.body.
    """
    yield SplashFormRequest(response.url, self.next_parse,
                            formdata={'name': '111'})
def set_renavam(self, response):
    """Advance past the Renavam form: bail out on credential errors, solve
    the captcha, copy the form inputs and submit through Splash."""
    error_message = response.selector.xpath(
        "//ul[contains(@class,'alert-error') and not(@style)]/li/span/text()"
    ).get("")
    if error_message:
        error_msg = {"error_type": "WRONG_CREDENTIALS",
                     "details": error_message}
        self.errors.append(error_msg)
        self.logger.warning(error_msg)
        return
    # The ReCaptcha site key lives on the widget container.
    sitekey = response.selector.xpath(
        "//div[@class='g-recaptcha']/@data-sitekey").get("")
    gcaptcha_txt = self.solve_captcha(sitekey, response.url)
    if not gcaptcha_txt:
        return
    renavam_form = response.selector.xpath(
        "//form[.//td[contains(text(),'Renavam')]]")
    form_name = renavam_form.xpath("./@name").get("")
    frm_data = {
        # JSF uses <form name>:_idcl to identify the clicked control.
        "{}:_idcl".format(form_name): form_name.replace("form", "btAvancar"),
        'g-recaptcha-response': gcaptcha_txt,
    }
    for field in renavam_form.xpath(".//input"):
        name = field.xpath("./@name").get("")
        value = field.xpath("./@value").get("")
        if ":Renavam" in name:
            value = self.renavam
        frm_data[name] = value
    url = "http://www.detran.sp.gov.br" + renavam_form.xpath("./@action").get("")
    yield SplashFormRequest(url,
                            formdata=frm_data,
                            callback=self.get_main_page,
                            errback=self.errback_func,
                            endpoint='execute',
                            cache_args=['lua_source'],
                            args={'lua_source': script_10_sec_wait,
                                  'cookies': response.data['cookies']},
                            dont_filter=True)
def parse_affiliates(self, response):
    """Request the affiliate list filtered by each country, and scrape the
    affiliate table rows from the current page.

    Fixes: the original evaluated a bare ``countries`` expression (a no-op
    statement) and read table cells from the *country* loop variable — a
    string, so ``country.xpath(...)`` raised AttributeError — instead of
    the row selector.
    """
    for country in countries:
        yield SplashFormRequest(
            url='https://www.crossfit.com/affiliate-list',
            formxpath="//div[@class='form-group']/select[@id='countryFilter']",
            formdata={'option': country})
    rows = response.xpath("//table[@id='affiliateTable']/tbody/tr")
    for row in rows:
        yield {
            'gym name': row.xpath('.//td/a/text()').get(),
            'local': row.xpath('.//td/text()').get(),
            # NOTE(review): this is the last value of the country loop above,
            # not necessarily the country of this row — confirm intent.
            'country': country,
        }
def parse(self, response):
    """Submit the login form once a password has been configured."""
    if self.password == '':
        self.logger.error('first add a password')
        return
    return SplashFormRequest.from_response(
        response,
        formdata={'email': self.user_name, 'pass': self.password},
        callback=self.after_login,
        formid='loginform',
        # Leave the raw scrapy.Response untouched for the callback.
        dont_process_response=True)
def login_me(self, response):
    """Query the IPVA discount page for the configured renavam via Splash."""
    url = ('http://www.sefaz.ba.gov.br/scripts/ipva/dae/VeiculoCadastrado/'
           'ipva_texto_obter_desconto200.asp')
    frm_data = {'txt_renavam': self.renavam, 'txt_renavam1': ''}
    yield SplashFormRequest(url,
                            formdata=frm_data,
                            callback=self.get_main_page,
                            errback=self.errback_func,
                            endpoint='execute',
                            cache_args=['lua_source'],
                            args={'lua_source': script,
                                  'cookies': response.data['cookies']},
                            dont_filter=True)
def get_login_page(self, response):
    """Function to get request options to login.

    Collects the ReCaptcha token and the ASP.NET hidden-state fields, then
    posts the consultation form through Splash.
    """
    # The ReCaptcha site key lives on the widget container.
    sitekey = response.selector.xpath(
        "//div[@class='g-recaptcha']/@data-sitekey").get("")
    gcaptcha_txt = self.solve_captcha(sitekey, response.url)
    if not gcaptcha_txt:
        return
    # ASP.NET hidden state inputs must be echoed back on submit.
    state_fields = ('__EVENTTARGET', '__EVENTARGUMENT', '__VIEWSTATE',
                    '__VIEWSTATEGENERATOR', '__EVENTVALIDATION')
    frm_data = {
        name: response.selector.xpath(
            "//input[@id='{}']/@value".format(name)).get("")
        for name in state_fields
    }
    frm_data.update({
        'ctl00$conteudoPaginaPlaceHolder$txtRenavam': self.renavam,
        'ctl00$conteudoPaginaPlaceHolder$txtPlaca': self.placa,
        'g-recaptcha-response': gcaptcha_txt,
        'ctl00$conteudoPaginaPlaceHolder$btn_Consultar': 'Consultar',
    })
    yield SplashFormRequest(self.start_url,
                            formdata=frm_data,
                            callback=self.login_me,
                            errback=self.errback_func,
                            endpoint='execute',
                            cache_args=['lua_source'],
                            args={'lua_source': script_10_sec_wait,
                                  'cookies': response.data['cookies']},
                            dont_filter=True)
def get_debito_calculado_ipva(self, response):
    """Extract the computed IPVA values and either queue a PDF print
    request (``get_files``) or accumulate the row into ``self.result``."""
    valor_do_ipva = response.selector.xpath(
        "//font[contains(.,'Pagamento de cota')]/text()").get("").strip()
    ano_exercicio = response.selector.xpath(
        "//span[contains(.,'Ano Exercício')]/../input/@value").get("").strip()
    data_do_vencimento = response.selector.xpath(
        "//span[contains(.,'Data do Vencimento')]/../input/@value").get("").strip()
    valor_da_cota_unica = response.selector.xpath(
        "//span[contains(.,'Valor da Cota única')]/../input/@value").get("").strip()
    print(valor_da_cota_unica)
    if not valor_da_cota_unica:
        # Nothing computed on this page — nothing to record.
        return
    row_data = {
        'valor_do_ipva': valor_do_ipva,
        'ano_exercicio': ano_exercicio,
        'data_do_vencimento': data_do_vencimento,
        'valor_da_cota_unica': valor_da_cota_unica,
    }
    if self.get_files:
        url = ("http://www.sefaz.ba.gov.br/scripts/ipva/dae/"
               "VeiculoCadastrado/result_dae_avulso_ipva.asp")
        frm_data = {
            'Lnum_cnpj_cpf_base': '',
            'Lnum_cnpj_cpf_filial': '',
            'Lnum_cnpj_cpf_digito': '',
        }
        yield SplashFormRequest(url,
                                formdata=frm_data,
                                callback=self.print_html_to_pdf,
                                endpoint='execute',
                                cache_args=['lua_source'],
                                args={'lua_source': script,
                                      'cookies': response.data['cookies']},
                                meta={'row_data': row_data},
                                dont_filter=True)
    else:
        ipva_do_veiculo = self.result.get('ipva_do_veiculo', [])
        ipva_do_veiculo.append(row_data)
        self.result.update({'ipva_do_veiculo': ipva_do_veiculo})
def start_requests(self):
    """Query cninfo's announcement API for each ticker's annual reports.

    The exchange-specific 'column' value is derived from the ticker prefix
    ('0' -> Shenzhen, '6' -> Shanghai); any other prefix is skipped.
    Fix: the two per-exchange dict literals were identical except for
    'column', so the shared fields are now built once.
    """
    tickers = [
        '600001',
        '000777',
    ]
    column_by_prefix = {
        '0': 'szse_main',  # Shenzhen stock exchange
        '6': 'sse',        # Shanghai stock exchange
    }
    for ticker in tickers:  # try all tickers one by one
        column = column_by_prefix.get(ticker[0])
        if column is None:
            print("Wrong ticker")
            continue
        data = {  # prepare for the request
            'stock': ticker,
            'searchkey': '年年度报告',
            'category': 'category_ndbg_szsh',
            'pageNum': '1',
            'pageSize': '30',
            'column': column,
            'tabName': 'fulltext',
            'sortName': '',
            'sortType': '',
            'limit': '',
            'seDate': '',
        }
        yield SplashFormRequest(
            url='http://www.cninfo.com.cn/cninfo-new/announcement/query',
            formdata=data,
            callback=self.parse)
def login(self, response):
    """Submit the form parameters through Splash's execute endpoint."""
    yield SplashFormRequest.from_response(
        response=response,
        url=self.login_url,
        endpoint='execute',
        formdata={'user_agent': '', 'cookie': ''},
        args={
            'wait': 30,
            'lua_source': lg,
            'proxy': 'http://:1080',
            'search_key': self.search_key,
        },
        callback=self.after_login,
        errback=self.error_parse)
def get_result_debito_ipva(self, response):
    """Follow up the IPVA debit query using today's date as payment date."""
    # The cota3 token is carried on the query string of the previous page.
    cota3 = response.url.split("cota3=")[-1]
    url = ("http://www.sefaz.ba.gov.br/scripts/ipva/dae/VeiculoCadastrado/"
           "debito_calculado_ipva.asp?cota3={}").format(cota3)
    frm_data = {'txt_dtc_pagamento': dt.now().strftime("%d/%m/%Y")}
    yield SplashFormRequest(url,
                            formdata=frm_data,
                            callback=self.get_debito_calculado_ipva,
                            errback=self.errback_func,
                            endpoint='execute',
                            cache_args=['lua_source'],
                            args={'lua_source': script,
                                  'cookies': response.data['cookies']},
                            dont_filter=True)
def login_me(self, response):
    """Collect the OAuth hidden fields from the login page and post the
    credentials through Splash."""
    login_url = "https://auth.netcombo.com.br/login"

    def hidden(name):
        # Value of the <input> with the given name ('' when absent).
        return response.selector.xpath(
            "//input[@name='{}']/@value".format(name)).get("")

    frm_data = {
        'Username': self.login,
        'password': self.senha,
        'client_id': hidden('client_id'),
        'redirect_uri': hidden('redirect_uri'),
        'response_type': hidden('response_type'),
        'scope': hidden('scope'),
        'state': hidden('state'),
        'authMs': hidden('authMs'),
        'Auth_method': 'UP',
    }
    print(frm_data)
    yield SplashFormRequest(login_url,
                            formdata=frm_data,
                            callback=self.select_contract,
                            errback=self.errback_func,
                            endpoint='execute',
                            cache_args=['lua_source'],
                            args={'lua_source': script_30_sec_wait,
                                  'cookies': response.data['cookies'],
                                  'timeout': 60,
                                  'images': 0},
                            dont_filter=True)
def get_ipva_search(self, response):
    """Solve the ReCaptcha and post the renavam search form via Splash."""
    action = response.selector.xpath("//input[@id='action']/@value").get("")
    csrf_token = response.selector.xpath(
        "//input[@id='csrf_token']/@value").get("")
    # Site key is embedded in the recaptcha <script src=...render=KEY> tag.
    sitekey = response.selector.xpath(
        "//script[contains(@src,'recaptcha/api')]/@src"
    ).get("").split("render=")[-1]
    gcaptcha_txt = self.solve_captcha(sitekey, response.request.url,
                                      captcha_type=5,
                                      captcha_action='portal_consulta_renavam')
    if not gcaptcha_txt:
        return
    renavam = response.meta['renavam']
    frm_data = {
        'action': action,
        'renavam': renavam,
        'csrf_token': csrf_token,
        'recaptcha_response': gcaptcha_txt,
    }
    ipva_url = "https://portalcontribuinte.sefin.ro.gov.br/Publico/__Resultado_Renavam_.jsp"
    yield SplashFormRequest(ipva_url,
                            formdata=frm_data,
                            endpoint='render.json',
                            args=self.splash_args,
                            meta={'renavam': renavam},
                            callback=self.get_ipva_result,
                            dont_filter=True)
def login(self, response):
    """Submit the login form (user/password) via Splash's execute endpoint."""
    yield SplashFormRequest.from_response(
        response=response,
        url=self.login_url,
        formdata={'email': user, 'password': password},
        endpoint='execute',
        args={
            'wait': 30,
            'lua_source': log_lua_script,
            'user_name': user,
            'user_password': password,
            'proxy': 'http://:1080',
        },
        callback=self.after_login,
        errback=self.error_parse)
def login_me(self, response):
    """Function to get request options to login.

    Solves the ReCaptcha and posts renavam/placa to the start URL via Splash.
    """
    # The ReCaptcha site key lives on the widget container.
    sitekey = response.selector.xpath(
        "//div[@class='g-recaptcha']/@data-sitekey").get("")
    gcaptcha_txt = self.solve_captcha(sitekey, response.url)
    if not gcaptcha_txt:
        return
    frm_data = {
        'Renavam': self.renavam,
        'Placa': self.placa,
        'g-recaptcha-response': gcaptcha_txt,
    }
    yield SplashFormRequest(self.start_url,
                            formdata=frm_data,
                            callback=self.get_main_page,
                            errback=self.errback_func,
                            endpoint='execute',
                            cache_args=['lua_source'],
                            args={'lua_source': autos_detran_ro_script},
                            dont_filter=True)
def test_form_request_from_response():
    """from_response must honor clickdata and merge formdata overrides.

    Copied from scrapy tests (test_from_response_submit_not_first_clickable).
    Fix: ``cgi.parse_qs`` was deprecated since Python 2.6 and removed from
    the standard library; use ``urllib.parse.parse_qs`` instead.
    """
    from urllib.parse import parse_qs

    def _buildresponse(body, **kwargs):
        kwargs.setdefault('body', body)
        kwargs.setdefault('url', 'http://example.com')
        kwargs.setdefault('encoding', 'utf-8')
        return HtmlResponse(**kwargs)

    response = _buildresponse(
        """<form action="get.php" method="GET">
        <input type="submit" name="clickable1" value="clicked1">
        <input type="hidden" name="one" value="1">
        <input type="hidden" name="two" value="3">
        <input type="submit" name="clickable2" value="clicked2">
        </form>""")
    req = SplashFormRequest.from_response(
        response, formdata={'two': '2'}, clickdata={'name': 'clickable2'})
    assert req.method == 'GET'
    assert req.meta['splash']['args']['url'] == req.url
    fs = parse_qs(req.url.partition('?')[2], keep_blank_values=True)
    assert fs['clickable2'] == ['clicked2']
    assert 'clickable1' not in fs
    assert fs['one'] == ['1']
    assert fs['two'] == ['2']
def login(self, response):
    """Fill and submit the site's login form from the Splash-rendered page."""
    logging.debug('Screenshot: %s', response.data['png'])
    try:
        headers = json.loads(self.rpt_mp.headers)
    except json.decoder.JSONDecodeError:
        headers = {}
    formdata, url, method = fill_login_form(response.url, response.text,
                                            response.meta['username'],
                                            response.meta['password'])
    # Drop splash bookkeeping: the follow-up is a brand-new Splash request.
    response.meta.pop('splash')
    response.meta.pop('_splash_processed')
    return SplashFormRequest.from_response(response,
                                           endpoint='render.json',
                                           args=self.splash_args,
                                           headers=headers,
                                           dont_filter=True,
                                           formname='fm',
                                           formdata=formdata,
                                           meta=response.meta,
                                           callback=self.after_login)
def get_main_page(self, response):
    """Redirect to main page.

    Handles credential/captcha error banners, scrapes the vehicle summary
    fields and the debit/infraction tables into ``self.result``, then
    issues the DARE document and IPVA follow-up requests.
    """
    error_message = response.selector.xpath(
        "//div[@class='msgErro']/text()").get("")
    print(error_message)
    if "Nenhum registro encontrado, verifique os dados digitados" in error_message:
        error_msg = {"error_type": "WRONG_CREDENTIALS",
                     "details": error_message}
        self.errors.append(error_msg)
        self.logger.warning(error_msg)
        return
    elif "Confirme que você não é um robô." in error_message:
        self.incorrect_captcha_report(self.captcha_service, self.g_recaptcha_id)
        if self.incorrect_captcha_retries > 0:
            yield Request(self.start_url, callback=self.login_me,
                          meta={'dont_merge_cookies': True}, dont_filter=True)
        return

    regex = re.compile(r'\s+')

    def labeled(label):
        # Text node next to the <span> whose text equals *label*, with runs
        # of whitespace collapsed to single spaces.
        return regex.sub(" ", response.selector.xpath(
            "//span[text()='{}']/../text()".format(label)).get("").strip())

    # (result key, page label) pairs, in the original insertion order.
    simple_fields = [
        ('placa', 'Placa'),
        ('marca_modelo', 'Marca/Modelo'),
        ('fabricacao_modelo', 'Fabricacao/Modelo'),
        ('cor', 'Cor'),
        ('renavam', 'Renavam'),
        ('tipo', 'Tipo'),
        ('carroceria', 'Carroceria'),
        ('especie', 'Especie'),
        ('lugares', 'Lugares'),
        ('categoria', 'Categoria'),
        ('potencia', 'Potência'),
        ('combustivel', 'Combustível'),
        ('nome_do_proprietario', 'Nome do Proprietário'),
        ('situacao_lacre', 'Situação Lacre'),
        ('proprietario_anterior', 'Proprietário Anterior'),
        ('origem_dos_dados_do_veiculo', 'Origem dos Dados do Veículo'),
        ('placa_anterior', 'Placa Anterior'),
        ('municipio_de_emplacamento', 'Municipio de Emplacamento'),
        ('licenciado_ate', 'Licenciado ate'),
        ('adquirido_em', 'Adquirido em'),
        ('situacao', 'Situação'),
        ('restricao_a_venda', 'Restrição a Venda'),
        ('informacoes_pendentes_originadas_das_financeiras_via_sng_sistema_nacional_de_gravame',
         'Informações PENDENTES originadas das financeiras via SNG - Sistema Nacional de Gravame'),
        ('impedimentos', 'Impedimentos'),
    ]
    for key, label in simple_fields:
        self.result[key] = labeled(label)
    renavam = self.result['renavam']

    debitos = []
    for row in response.selector.xpath(
            "//div[@id='corpo_DebitosVeiculo']/div[@id='Integral']"
            "/table[@id='TabelaIntegral']/tbody/tr"):
        cells = [regex.sub(" ", row.xpath(
            "./td[{}]/text()".format(i)).get("").strip()) for i in range(1, 9)]
        debitos.append(dict(zip(
            ('descricao', 'vencimento', 'nominal_r', 'corrigido_r',
             'desconto_r', 'juros_r', 'multa_r', 'atual_r'), cells)))
    self.result.update({'debitos': debitos})

    def joined_cell(row, i):
        # Concatenate every text node of the i-th cell, whitespace collapsed.
        return regex.sub(" ", " ".join(
            row.xpath("./td[{}]/text()".format(i)).extract()).strip())

    infracoes_em_autuacao = []
    for row in response.selector.xpath(
            "//div[@id='corpo_AutuacoesVeiculo']//tbody/tr"):
        num_auto = " ".join([s.strip() for s in
                             row.xpath("./td[1]/text()").extract() if s.strip()])
        # Placeholder row "…veículo até o momento." means "none so far".
        if "veículo até o momento." in num_auto:
            break
        infracoes_em_autuacao.append({
            'num_auto': num_auto,
            'status': joined_cell(row, 2),
            'descricao': joined_cell(row, 3),
            'local_complemento': joined_cell(row, 4),
            'valor': joined_cell(row, 5)})
    self.result.update({'infracoes_em_autuacao': infracoes_em_autuacao})

    penalidades_multas = []
    for row in response.selector.xpath(
            "//div[@id='corpo_MultasVeiculo']//tbody/tr"):
        num_auto = joined_cell(row, 1)
        if "veículo até o momento." in num_auto:
            break
        penalidades_multas.append({
            'num_auto': num_auto,
            'status': joined_cell(row, 2),
            'descricao': joined_cell(row, 3),
            'local_complemento': joined_cell(row, 4),
            'valor': joined_cell(row, 5)})
    self.result.update({'penalidades_multas': penalidades_multas})

    recursos_infracao = []
    for row in response.selector.xpath(
            "//div[@id='corpo_RecursosInfracao']//tbody/tr"):
        processo = joined_cell(row, 1)
        if "veículo até o momento." in processo:
            break
        recursos_infracao.append({
            'processo': processo,
            'nº_proc_renainf': joined_cell(row, 2),
            'numero_do_auto': joined_cell(row, 3),
            'detalhamento_da_infracao': joined_cell(row, 4),
            'situacao_do_processo': joined_cell(row, 5)})
    self.result.update({'recursos_infracao': recursos_infracao})

    dare_btn = response.selector.xpath("//input[@id='BotaoIntegral']")
    if self.get_files and dare_btn:
        dare_url = ("https://consulta.detran.ro.gov.br"
                    "/CentralDeConsultasInternet/Internet/DARE.asp")
        frm_data = {
            'hdListaIdDebitos': response.selector.xpath(
                "//input[@name='hdListaIdDebitos']/@value").get(""),
            'hdPlaca': response.selector.xpath(
                "//input[@name='hdPlaca']/@value").get(""),
        }
        print(frm_data)
        yield SplashFormRequest(dare_url,
                                formdata=frm_data,
                                endpoint='render.json',
                                args=self.splash_args,
                                meta={'result_key': 'debitos'},
                                callback=self.print_html_to_pdf,
                                dont_filter=True)
    ipva_url = ("https://portalcontribuinte.sefin.ro.gov.br/Publico"
                "/ConsultaRenavam.jsp?renavam={}").format(renavam)
    yield Request(ipva_url, callback=self.get_ipva_search,
                  meta={'renavam': renavam}, dont_filter=True)
def parse(self, response):
    """Yield one patent item per row of the detail page, then request the
    patent document metadata for each publication number."""
    base_url = 'http://epub.sipo.gov.cn/'

    def info_li(n):
        # n-th <li> of the patent info list.
        return response.xpath(
            '//div[@class="cp_box"]/div[@class="cp_linr"]/ul/li[{}]'.format(n))

    def text(sel):
        # Full string value of a selector node.
        return sel.xpath("string(.)").extract_first()

    rows = zip(
        response.xpath("//h1"),  # patent title
        info_li(1),              # publication number
        info_li(2),              # publication date
        info_li(3),              # application number
        info_li(4),              # application date
        info_li(5),              # applicant
        info_li(6),              # inventor
        info_li(8),              # address
        info_li(9),              # classification number
        response.xpath('//div[@class="cp_box"]/div/div[@class="cp_jsh"]'),      # abstract
        response.xpath('//div[@class="cp_box"]/a/img/@src'),                    # QR code
        response.xpath('//div[@class="cp_box"]/div[@class="cp_img"]/img/@src'))  # thumbnail

    for ti, on, od, an, ad, ap, inv, addr, cf, su, qr, th in rows:
        item = ZlItem()
        item['title'] = text(ti).strip().split()[1]
        # Fixed-width label prefixes are sliced off each field's text.
        item['openNo'] = text(on)[6:]
        item['openDate'] = text(od)[6:]
        item['applyNo'] = text(an)[5:]
        item['applyDate'] = text(ad)[4:]
        item['applyPeople'] = text(ap)[4:]
        item['inventor'] = ''.join(text(inv)[4:].strip().split())
        item['address'] = text(addr)[3:]
        item['classifyNo'] = text(cf).split()[0][4:]
        item['summery'] = text(su).split()[1]
        item['qrcodeurls'] = base_url + qr.extract()
        item['thumb'] = base_url + th.extract()
        yield item

        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
        }
        # Look up the full document by publication number.
        yield SplashFormRequest(
            url='http://epub.sipo.gov.cn/pam.action',
            callback=self.downparse,
            method='POST',
            args={'wait': 5},
            formdata={
                "strWhere": "PN='{}'".format(item['openNo']),
                "strSources": "pip",
            },
            headers=headers)