def parse(self, response): # deleting files: try: if os.path.exists("newPost.txt"): os.remove("newPost.txt") if os.path.exists("newgetLinks.txt"): os.remove("newgetLinks.txt") if os.path.exists("scrappedurls.txt"): os.remove("scrappedurls.txt") except: pass # print "Status:",response.status # print "Request Headers" # print response.request.headers.items() # print "\n\n" # print "Response Headers" # print response.headers.items() # print "\n\n" login_user = self.credentials[response.request.url][0] print login_user login_pass = self.credentials[response.request.url][1] print login_pass args, url, method, name, number = fill_login_form(response.url, response.body, login_user, login_pass) if name: yield FormRequest.from_response( response, method=method, formdata=args, formname=name, callback=self.after_login ) else: yield FormRequest.from_response( response, method=method, formdata=args, formnumber=number, callback=self.after_login )
def parseVoteSearchResults(self, response):
    # Parse one page of vote search results and, on a "new table" pass
    # (meta['fetch'] truthy), queue ASP.NET GridView callback requests for
    # the next window of up to 11 result pages.
    sel = Selector(response)
    validationstr = self.getValidationString(response)
    # Parse the first page of results
    for voteItem in self.parseVoteTableResults(response):
        yield voteItem
    pages = 1
    # Grab the vote table
    voteTable = sel.css('#SelectVoteHistoryView_GridView1')
    rows = voteTable.css('tr')
    # The last row contains the page links
    paginationRow = rows[-1]
    firstCellElement = paginationRow.css('td>span::text')
    if not firstCellElement:
        # Can't find the navigate bar??
        return
    firstCellContent = firstCellElement.extract()[0]
    # Check if there are any pages.. (the current page is rendered as a bare
    # number in a <span>; GridView page numbers are 0-based server-side)
    if str(firstCellContent).isdigit():
        thisPage = int(firstCellContent)-1
        # The last cell contains some js to skip to the final page
        lastCell = paginationRow.css('td')[-1]
        lastPageElem = lastCell.css('a::attr(onclick)')
        if len(lastPageElem) != 1:
            # We're on the last set of pages, so there's no last page link
            return
        lastPageLink = lastPageElem[0].extract()
        # Need to pull page number out of this:
        #   javascript:__gvSelectVoteHistoryView_GridView1.callback("69|0|/wFlpZg1Fw7NYQJzL0vuw+Y57ZPx1ug=|"); return false;
        # It's the first value in the param string
        # TODO: Error handling...
        pages = int(re.split('\"|\|',lastPageLink)[1]) + 1
        # Only iterate over pages if this is a new table
        if response.meta['fetch']:
            # Request at most 11 pages ahead; the 11th request re-enables
            # 'fetch' so the following batch is scheduled from there.
            for i in range(thisPage,min([pages,thisPage+11])):
                fetch = False
                if i == thisPage + 10:
                    fetch = True
                if validationstr:
                    # The fact we have a validationstr means we need a surrogate form (this is a callback)
                    form = response.meta['form']
                    # NOTE(review): the hard-coded viewstate hashes in
                    # __CALLBACKPARAM look session-specific -- verify they
                    # are actually accepted across sessions.
                    yield FormRequest.from_response(form, callback=self.parseVoteSearchResults,
                        formdata={'__CALLBACKID':'SelectVoteHistoryView$GridView1',
                                  '__CALLBACKPARAM':str(i)+'|0|/wFlpZg1Fw7NYQJzL0vuw+Y57ZPx1ug=||' + str(thisPage) + '|0|/wFlpZg1Fw7NYQJzL0vuw+Y57ZPx1ug=|/wFk7SDVa18iduwa7ivhFHIM55t+AhI=',
                                  '__EVENTVALIDATION':validationstr},meta={'fetch':fetch,'form':form})
                else:
                    # Should only get here if this is first page
                    #inspect_response(response)
                    yield FormRequest.from_response(response, callback=self.parseVoteSearchResults,
                        formdata={'__CALLBACKID':'SelectVoteHistoryView$GridView1',
                                  '__CALLBACKPARAM':str(i)+'|0|/wFlpZg1Fw7NYQJzL0vuw+Y57ZPx1ug=||' + str(thisPage) + '|0|/wFlpZg1Fw7NYQJzL0vuw+Y57ZPx1ug=|/wFk7SDVa18iduwa7ivhFHIM55t+AhI='},meta={'fetch':fetch,'form':response})
def search(self, response):
    """Submit one search request per (key, value) pair from the input data.

    The remaining pairs travel along in ``request.meta['data_form']``; when
    only one pair is left the request's callback switches from ``search``
    (to keep draining pairs) to ``parse_results``.
    """
    # Membership test directly on the mapping -- no .keys() needed.
    if 'data_form' in response.meta:
        data_form = response.meta['data_form']
    else:
        data_form = self.get_input_data(self.input_file)
    if not data_form:
        return
    key, value = data_form.popitem()
    if not data_form:
        # Last pair: hand the final search off to the results parser.
        return FormRequest.from_response(
            response,
            formdata={"newNtk": key, "newNtt": value},
            callback=self.parse_results)
    # More pairs remain: re-enter search() carrying the rest via meta.
    form_request = FormRequest.from_response(
        response,
        formdata={"newNtk": key, "newNtt": value},
        callback=self.search)
    form_request.meta['data_form'] = data_form
    return form_request
def parse_start_url(self, response):
    """Submit the trainer-search form for the configured PROFESSION.

    Returns a single FormRequest with all other filters left at their
    catch-all defaults.
    """
    # NOTE: the original contained an unreachable loop over PROFESSIONS
    # after the return statement (which itself mistakenly used the global
    # PROFESSION instead of the loop variable).  That dead code has been
    # removed; current behavior -- one request for PROFESSION -- is kept.
    base_data = {
        'name': '',
        'country': '',
        '_state': '1',
        'language': '0',
        'qualification': '0--',
        'level': '0',
        'speciality': '0',
        '_isTrainer': 'on'
    }
    data = base_data.copy()
    data['profession'] = str(PROFESSION)
    return FormRequest.from_response(response, formdata=data)
def parse(self, response): # args, url, method = fill_login_form(response.url, response.body, self.login_user, self.login_pass) #return FormRequest(url, method=method, formdata=args, dont_filter=True,callback=self.after_login) args, url, method, name , number = fill_login_form(response.url, response.body, self.login_user, self.login_pass) credentials = list() tmpparam = dict() print args for a in args: if a[0].find("user") > -1 or a[0].find("admin") > -1: tmpparam["userid"] = a[0] if a[0].find("password") > -1: tmpparam["passwordid"] = a[0] # tmpparam["submit"] = "submit" tmpparam["url"] = self.start_urls[0] tmpparam["login"] = "" credentials.append(tmpparam) f = open("json/credentials.json", 'w') f.write(json.dumps(credentials,indent= 4, sort_keys = True)) f.close() if name: yield FormRequest.from_response(response, method=method, formdata=args, formname=name, dont_filter=True,callback=self.after_login) else: yield FormRequest.from_response(response, method=method, formdata=args, formnumber=number, dont_filter=True,callback=self.after_login)
def parse_from(self, response):
    # For the currently selected "leaving from" stop, submit the journey
    # search form once per destination stop, then submit one extra request
    # with a different callback to collect data for the origin stop itself.
    selector = Selector(response)
    # Every destination option except the "-1" placeholder entry.
    stop_ids = selector.css('#confirm1_ddlTravellingTo option[value!="-1"]::attr(value)').extract()
    for stop_id in stop_ids:
        request = FormRequest.from_response(response, formname='ctl01',
            formdata={
                'confirm1$ddlLeavingFromMap': response.meta['stop_from'],
                'confirm1$ddlTravellingTo': stop_id,
                'confirm1$btnSearch': 'Search',
            },
            callback=self.parse_to_stop)
        request.meta['stop_to'] = stop_id
        request.meta['stop_from'] = response.meta['stop_from']
        yield request
    # reapeat to get the first stop (from)
    # NOTE(review): this reuses `stop_id` after the loop, so it posts the
    # *last* destination id -- and raises NameError when stop_ids is empty.
    # Presumably any valid destination works for the parse_from_stop
    # callback; confirm the intent before changing.
    request = FormRequest.from_response(response, formname='ctl01',
        formdata={
            'confirm1$ddlLeavingFromMap': response.meta['stop_from'],
            'confirm1$ddlTravellingTo': stop_id,
            'confirm1$btnSearch': 'Search',
        },
        callback=self.parse_from_stop)
    request.meta['stop_to'] = stop_id
    request.meta['stop_from'] = response.meta['stop_from']
    yield request
def listing_parser(self, response):
    """ Given a paginated listing of alumni, parse members and reparse
    individual page and rerun on next pages """
    x = HtmlXPathSelector(response)
    # Postback targets (inside the href's quoted argument) and the anchor
    # nodes themselves; both come from the numbered pagination bar.
    pages_hrefs = [i.extract().split("'")[1]
                   for i in x.select("//div[contains(@class,'rgNumPart')]/a/@href")]
    pages = x.select("//div[contains(@class,'rgNumPart')]/a")
    requests = []
    # Robustness: a listing that fits on one page has no pagination bar at
    # all -- the original indexed pages[0] and crashed with IndexError.
    if not pages:
        return requests
    # If the first link is a "previous pages" jump, drop its href.
    if "Previous Pages" in pages[0].extract():
        del pages_hrefs[0]
    # If the last link is a "next pages" jump, pop it and rerun
    # listing_parser on it to pick up the following page group.
    if "Next Pages" in pages[-1].extract():
        requests.append(FormRequest.from_response(
            response,
            formdata={'__EVENTTARGET': pages_hrefs.pop()},
            callback=self.listing_parser))
    # Scrape every remaining page of this group, including the first.
    for href in pages_hrefs:
        requests.append(FormRequest.from_response(
            response,
            formdata={'__EVENTTARGET': href},
            callback=self.listing_scraper))
    return requests
def parse(self, response):
    """Fill in the login form on this page and submit it.

    ``fill_login_form`` returns the form arguments together with either a
    form name or a form index; whichever is available selects the form.
    """
    args, url, method, name, number = fill_login_form(
        response.url, response.body, self.login_user, self.login_pass)
    # Shared keyword arguments for both selection modes.
    request_kwargs = dict(
        method=method,
        formdata=args,
        dont_filter=True,
        callback=self.after_login,
    )
    if name:
        request_kwargs['formname'] = name
    else:
        request_kwargs['formnumber'] = number
    yield FormRequest.from_response(response, **request_kwargs)
def test_from_response_override_method(self):
    """GET is the default form method; an explicit method= overrides it."""
    response = _buildresponse(
        '''<html><body> <form action="/app"></form> </body></html>''')
    default_request = FormRequest.from_response(response)
    self.assertEqual(default_request.method, 'GET')
    overridden_request = FormRequest.from_response(response, method='POST')
    self.assertEqual(overridden_request.method, 'POST')
def test_from_response_override_method(self):
    """The form's implicit GET can be overridden via the method kwarg."""
    response = _buildresponse(
        """<html><body> <form action="/app"></form> </body></html>"""
    )
    implicit = FormRequest.from_response(response)
    self.assertEqual(implicit.method, "GET")
    explicit = FormRequest.from_response(response, method="POST")
    self.assertEqual(explicit.method, "POST")
def test_from_response_override_url(self):
    """url= overrides the form action; relative urls resolve against the page."""
    response = _buildresponse(
        '''<html><body> <form action="/app"></form> </body></html>''')
    # Default: the form action resolved against the response url.
    self.assertEqual(
        FormRequest.from_response(response).url,
        'http://example.com/app')
    # Absolute override replaces the action entirely.
    self.assertEqual(
        FormRequest.from_response(response, url='http://foo.bar/absolute').url,
        'http://foo.bar/absolute')
    # Relative override is joined with the response url.
    self.assertEqual(
        FormRequest.from_response(response, url='/relative').url,
        'http://example.com/relative')
def parse(self, response):
    """Kick off the crawl by submitting the search form with no filters,
    which makes the site return the complete data set."""
    # DEBUG: Uncomment following line if restarting from specific record number.
    #return self.parse_main_list(response)
    request = FormRequest.from_response(
        response,
        formdata={"p_request": "RECHR"},
        callback=self.parse_main_list,
    )
    return [request]
def parse(self, response):
    # Submit the page's form as a GET request and parse the results.
    # NOTE(review): `data` is not defined anywhere in this method --
    # presumably a module- or class-level dict of form fields.  Verify it
    # is in scope, otherwise this raises NameError at runtime.
    return FormRequest.from_response(
        response,
        formdata=data,
        method="GET",
        callback=self.search_result
    )
def login(self, response):
    """Read credentials from the UTF-8 JSON file at ``self.login_path``
    and post them through the form with id 'login-nav'."""
    with open(self.login_path, encoding='utf8') as fh:
        credentials = json.load(fh)
    yield FormRequest.from_response(
        response,
        formdata=credentials,
        formid='login-nav',
        callback=self.check_login,
    )
def login(self, response):
    """Snapshot the login page for debugging, then submit the login form
    with the spider's prepared formdata/headers and the session cookiejar.

    Backslash line continuations were removed: they are redundant inside
    parentheses and easy to break silently.
    """
    self._log_page(response, 'amazon_login.html')
    return [FormRequest.from_response(
        response,
        formdata=self.formdata,
        headers=self.headers,
        meta={'cookiejar': response.meta['cookiejar']},
        callback=self.parse_item)]  # success login
def parse(self, response):
    """If we were redirected to a login page, authenticate; otherwise do
    nothing (no requests are produced)."""
    if 'login' not in response.url:
        return
    login_request = FormRequest.from_response(
        response,
        formdata={'j_username': self.username,
                  'j_password': self.password},
        callback=self.after_login)
    return [login_request]
def after_login(self, response):
    """Once logged in (page shows 'BIN Lookup'), submit one search per
    configured BIN number."""
    # Guard clause: bail out early when login did not land on the lookup page.
    if 'BIN Lookup' not in response.body:
        return
    for raw_bin in self.bins:
        yield FormRequest.from_response(
            response,
            formdata={'search': raw_bin.strip()},
            callback=self.got_bin)
def parse(self, response):
    """Submit the 'frmConsulta' search form restricted to the configured
    date window [self.iDate, self.fDate]."""
    # Build the site's query expression for the date range up front.
    date_filter = ('(@DTDE >="' + self.iDate + '") E (@DTDE <="'
                   + self.fDate + '")')
    yield FormRequest.from_response(
        response,
        formname='frmConsulta',
        formdata={'data': date_filter, 'b': 'ACOR'},
        callback=self.parseSearchResults
    )
def course_terms_page(self, response):
    """Submit the term-selection form once per configured term, carrying
    the term along in meta for the next callback."""
    for current_term in self.terms:
        request = FormRequest.from_response(
            response,
            formxpath='/html/body/div[5]/form',
            formdata={'TERM': current_term},
            meta={'term': current_term})
        yield request
def parse(self, response): if parameter.login==True: args, url, method = fill_login_form(response.url, response.body, self.login_user, self.login_pass) #return FormRequest(url, method=method, formdata=args, callback=self.after_login) #args={'username':'******', 'password':'******'} #print args argsdict={} print response.headers for i in args: argsdict[i[0]]=i[1] print argsdict #print response.request return FormRequest.from_response( response, formdata=argsdict, dont_filter=True, meta = {'dont_merge_cookies': True}, callback=self.after_login ) else: #args, url, method = fill_login_form(response.url, response.body, self.login_user, self.login_pass) return Request( response.url, meta = {'dont_merge_cookies': True}, callback=self.parse_page )
def login(self, response):
    """Submit the login form with (placeholder) Google-style credentials.

    NOTE: 'Email' and 'Passwd' are intentionally left blank here.
    """
    request = FormRequest.from_response(
        response,
        formdata={'Email': '', 'Passwd': ''},
        callback=self.after_login)
    return [request]
def parse_workspecs_list(self, response):
    # Parse one page of the workspec listing: emit a WorkSpecItem plus a
    # detail-page Request per workspec anchor, then queue the next listing
    # page via the pagination form until the last page is reached.
    log.msg('parse_workspecs_list : [%s]' % response.url)
    hxs = HtmlXPathSelector(response)
    current_page = extract_current_page(hxs)
    total_pages = extract_total_pages(hxs)
    log.msg('Current page : %d / %d' % (current_page, total_pages))
    # Workspec links are recognised by a shared id substring.
    workspecs_a = hxs.select('//a[contains(@id, "%s")]' % WORKSPEC_A_ID)
    log.msg('Number of workspecs : %d' % len(workspecs_a))
    for a in workspecs_a:
        item = WorkSpecItem()
        item['shortname'] = a.select('text()').extract()[0]
        url = a.select('@href').extract()[0]
        item['url'] = url
        # The phabricator id is embedded in the href; phid_re pulls it out.
        item['phid'] = a.select('@href').re(phid_re)[0]
        yield item
        full_url = urljoin_rfc(response.url, url)
        self.crawler.stats.inc_value('workspec_pages_queued')
        # dont_filter: the same detail url may be reachable from several
        # listing pages and must still be fetched.
        yield Request(full_url, callback=self.parse_workspec_page,
                      dont_filter=True, meta={'phid':item['phid']})
    if current_page < total_pages:
        # Re-post the listing form with only the page input bumped;
        # dont_click avoids triggering any submit button's own action.
        formdata = dict(POST_DEFAULT_DATA)
        formdata[CURRENT_PAGE_INPUT] = str(current_page + 1)
        log.msg('Queueing next page : %d' % (current_page + 1))
        yield FormRequest.from_response(response, dont_click=True,
                                        formdata=formdata,
                                        callback=self.parse_workspecs_list)
def parse_send_email(self, response):
    """Submit the 'send email' form for the user carried in response.meta.

    If the page contains any of the known "can't email" markers, fall back
    to sending an on-site message instead.
    """
    user_id = response.meta["user_id"]
    username = response.meta["username"]
    # Collapsed the original redundant if/else flag assignment into a
    # single expression.
    can_send_email = not any(
        message in response.body
        for message in self.CANT_SEND_EMAIL_MESSAGES)
    if not can_send_email:
        self.log("Cannot send email to user_id=%s" % user_id)
        return self.make_send_message_request(user_id, username)
    return [FormRequest.from_response(
        response,
        formnumber=self.EMAIL_FORM_NUMBER,
        formdata={
            "emailsubject": settings["TITLE"],
            "message": settings["MESSAGE"] % username,
        },
        meta={"user_id": user_id, "dont_redirect": True},
        callback=self.do_nothing,
        dont_filter=True)]
def parse(self, response):
    # Trigger the ASP.NET WebForms postback for the first tab link of the
    # events table by re-posting the page's hidden state fields.
    hxs = HtmlXPathSelector(response)
    # Client-side ids use underscores but the server expects '$'-separated
    # control ids in __EVENTTARGET.
    id = hxs.select("//table[@class='lnkEvents']/tr/td[@class='tabUnselR']/a/@id").extract()[0].replace("_", "$")
    print id
    # NOTE(review): .extract() returns *lists*, and formdata values are
    # normally strings; also FormRequest.from_response already re-posts
    # hidden inputs, so most of these manual fields may be redundant.
    # __EVENTTARGET below is extracted but never used (the formdata sends
    # `id` instead).  Verify against the target site before changing.
    __EVENTTARGET = hxs.select("//input[@id='__EVENTTARGET']/@value").extract()
    __EVENTARGUMENT = hxs.select("//input[@id='__EVENTARGUMENT']/@value").extract()
    __LASTFOCUS = hxs.select("//input[@id='__LASTFOCUS']/@value").extract()
    __VIEWSTATE = hxs.select("//input[@id='__VIEWSTATE']/@value").extract()
    __PREVIOUSPAGE = hxs.select("//input[@id='__PREVIOUSPAGE']/@value").extract()
    __EVENTVALIDATION = hxs.select("//input[@id='__EVENTVALIDATION']/@value").extract()
    yield FormRequest.from_response(
        response,
        formdata={
            "__EVENTTARGET": id,
            "__EVENTARGUMENT": __EVENTARGUMENT,
            "__LASTFOCUS": __LASTFOCUS,
            "__VIEWSTATE": __VIEWSTATE,
            "__PREVIOUSPAGE": __PREVIOUSPAGE,
            "__EVENTVALIDATION": __EVENTVALIDATION,
        },
        callback=self.parse_event_list,
        # dont_click: we want the __EVENTTARGET postback, not a button click.
        dont_click=True,
        # dont_filter=True,
    )
def parse_send_message(self, response):
    """Submit the private-message form for the user in response.meta,
    unless the page shows one of the known "can't message" markers."""
    user_id = response.meta["user_id"]
    username = response.meta["username"]
    # One expression instead of the original `= True` followed by a
    # redundant if/else reassignment.
    can_send_message = not any(
        message in response.body
        for message in self.CANT_SEND_EMAIL_MESSAGES)
    if not can_send_message:
        self.log("Cannot send message to this user")
        return
    self.log("Send message to %s" % username)
    return [FormRequest.from_response(
        response,
        formnumber=self.MESSAGE_FORM_NUMBER,
        formdata={
            "title": settings["TITLE"],
            "message": settings["MESSAGE"] % username,
        },
        meta={"user_id": user_id, "dont_redirect": True},
        callback=self.do_nothing,
        dont_filter=True)]
def _handle_captcha(self, response, callback):
    """Attempt to solve the captcha on *response*.

    Returns a FormRequest that submits the guessed text (re-entering
    *callback* on success), or None when no guess could be produced.
    The try counter travels in meta['captcha_solve_try'].
    """
    attempt = response.meta.get('captcha_solve_try', 0)
    url = response.url
    self.log("Captcha challenge for %s (try %d)." % (url, attempt),
             level=INFO)
    guess = self._solve_captcha(response)
    if guess is None:
        self.log(
            "Failed to guess captcha for '%s' (try: %d)." % (url, attempt),
            level=ERROR
        )
        return None
    self.log(
        "On try %d, submitting captcha '%s' for '%s'." % (
            attempt, guess, url),
        level=INFO
    )
    # Carry the incremented attempt count forward with the page's meta.
    meta = response.meta.copy()
    meta['captcha_solve_try'] = attempt + 1
    return FormRequest.from_response(
        response,
        formname='',
        formdata={'field-keywords': guess},
        callback=callback,
        dont_filter=True,
        meta=meta)
def parse_send_message(self, response):
    """Submit the phpBB private-message form (including its CSRF
    form_token / creation_time fields) for the user in response.meta,
    unless the page shows a known "can't message" marker."""
    user_id = response.meta["user_id"]
    username = response.meta["username"]
    # One expression replaces the original `= True` plus redundant if/else.
    can_send_message = not any(
        message in response.body
        for message in self.CANT_SEND_EMAIL_MESSAGES)
    if not can_send_message:
        self.log("Cannot send message to this user")
        return
    hxs = HtmlXPathSelector(response)
    # phpBB anti-CSRF fields must be echoed back exactly.
    form_token = hxs.select('//input[@name="form_token"]/@value').extract()[0].strip()
    creation_time = hxs.select('//input[@name="creation_time"]/@value').extract()[0].strip()
    return [FormRequest.from_response(
        response,
        formnumber=self.MESSAGE_FORM_NUMBER,
        formdata={
            "subject": settings["TITLE"],
            "message": settings["MESSAGE"] % username,
            "post": "Submit",
            "creation_time": creation_time,
            "form_token": form_token,
        },
        meta={"user_id": user_id, "dont_redirect": True},
        callback=self.do_nothing,
        dont_filter=True)]
def parse(self, response):
    """Crawl PTT board pages: click through the over-18 gate (bounded by
    MAX_RETRY), save each listing page, queue post pages, and follow the
    previous-page link up to MAX_PAGES."""
    # Over-18 age gate: re-submit the confirmation form, at most MAX_RETRY times.
    if len(response.xpath('//div[@class="over18-notice"]')) > 0:
        if self._retries < PTTSpider.MAX_RETRY:
            self._retries += 1
            logging.warning('retry {} times...'.format(self._retries))
            yield FormRequest.from_response(response,
                                            formdata={'yes': 'yes'},
                                            callback=self.parse)
        else:
            logging.warning('!!!!!!!!!!!!!!!!!you cannot pass!!!!!!!!!!!!!!')
        return
    # Persist the raw listing page for offline inspection.
    filename = response.url.split('/')[-2] + '.html'
    with open(filename, 'wb') as f:
        f.write(response.body)
    self._pages += 1
    # Queue every post linked from this listing page.
    for href in response.css('.r-ent > div.title > a::attr(href)'):
        yield scrapy.Request(response.urljoin(href.extract()),
                             callback=self.parse_post)
    if self._pages >= PTTSpider.MAX_PAGES:
        logging.warning('max pages reached')
        return
    next_page = response.xpath(u'//div[@id="action-bar-container"]//a[contains(text(), "上頁")]/@href')
    logging.warning(next_page)
    logging.warning('231')
    if next_page:
        yield scrapy.Request(response.urljoin(next_page[0].extract()),
                             self.parse)
    else:
        logging.warning('no next page')
def parse(self, response):
    """Submit one search per Brazilian state (UF) option found in the
    '#sgUe' select box, skipping the empty placeholder value."""
    hxs = HtmlXPathSelector(response)
    states = hxs.select('//*[@id="sgUe"]/option/@value').extract()
    # Skip the blank placeholder option, search each real state code.
    for state in states:
        if state == '':
            continue
        yield FormRequest.from_response(
            response,
            formnumber=0,
            formdata={'acao': 'pesquisar',
                      'sgUe': state,
                      'candidatura': '6',
                      'parcial': '0'},
            callback=self.parseform,
            meta={'uf': state})
def parse_legislative(self, response):
    """Submit 'Form1' asking for all departments; the x/y coordinates
    emulate clicking the image-button named 'ir'."""
    form_fields = {
        'Deptos': 'TODOS',
        'ir.x': '14',
        'ir.y': '11',
    }
    yield FormRequest.from_response(
        response,
        formname='Form1',
        formdata=form_fields,
        callback=self.parse_politicianprofile)
def parse_home(self, response):
    '''
    This method has multiple purposes:
    1) Handle failed logins due to facebook 'save-device' redirection
    2) Set language interface, if not already provided
    3) Navigate to given page
    '''
    #handle 'save-device' redirection
    if response.xpath("//div/a[contains(@href,'save-device')]"):
        self.logger.info('Going through the "save-device" checkpoint')
        return FormRequest.from_response(
            response,
            formdata={'name_action_selected': 'dont_save'},
            callback=self.parse_home)
    #set language interface
    if self.lang == '_':
        # Data-driven replacement for the original if/elif chain: each
        # language is recognized by the search box's placeholder text.
        placeholders = (
            ('en', 'Search Facebook'),
            ('es', 'Buscar en Facebook'),
            ('fr', 'Rechercher sur Facebook'),
            ('it', 'Cerca su Facebook'),
            ('pt', 'Pesquisa no Facebook'),
        )
        for lang, placeholder in placeholders:
            if response.xpath("//input[@placeholder='%s']" % placeholder):
                self.logger.info('Language recognized: lang="%s"' % lang)
                self.lang = lang
                break
        else:
            raise AttributeError(
                'Language not recognized\n'
                'Change your interface lang from facebook '
                'and try again')
    #navigate to provided page
    href = response.urljoin(self.page)
    self.logger.info('Scraping facebook page {}'.format(href))
    return scrapy.Request(url=href, callback=self.parse_page,
                          meta={'index': 1})
def process_search_again(self, failure):
    """Retry errback for the paginated search.

    The server sometimes loses the paginator sequence and answers with an
    HTTP error (e.g. 411); when that happens the identical request is
    re-sent (it keeps retrying until it succeeds or the crawl is aborted).
    """
    if not failure.check(HttpError):
        return
    self.logger.info(
        '[ComprasVisibles] HTTPERROR se intetara scrapear de nuevo')
    retry_request = FormRequest.from_response(
        self.controlPage,
        formname='aspnetForm',
        formid='aspnetForm',
        formdata=self.formdata,
        callback=self.process_search,
        priority=10,
        errback=self.process_search_again,
        dont_filter=True)
    yield retry_request
def parse(self, response):
    """First callback: log the cookies the registration page set, then
    POST the login form with the stored credentials to authorize the
    session cookie."""
    # Cookies written into the browser on the first visit to the page.
    first_visit_cookies = response.headers.getlist('Set-Cookie')
    print(first_visit_cookies)
    print('登录中')
    # Second request: POST the form carrying the cookie jar, browser
    # headers and the user's login data.
    login_request = FormRequest.from_response(
        response,
        # Real POST endpoint (Spring Security login check).
        url='http://zhjw.scu.edu.cn/j_spring_security_check',
        meta={'cookiejar': response.meta['cookiejar']},
        headers=self.header,
        formdata=self.logindata,
        callback=self.next,
    )
    return [login_request]
def parse(self, response):
    """Log in to douban.  When the login page shows a captcha image, save
    it locally and ask the operator to type it in; otherwise post the
    credentials directly.

    The two nearly identical post dicts of the original were merged into
    one shared dict, with the captcha field added only when needed.
    """
    # Captcha image src, if the page has one (list is empty otherwise).
    captcha = response.xpath('//img[@id="captcha_image"]/@src').extract()
    # Fields common to both branches.
    data = {
        "form_email": "18606590295",
        "form_password": "******",
        # Redirect to the personal-center page after login.
        "redir": "https://www.douban.com/people/151968962/",
    }
    if len(captcha) > 0:
        print("此时有验证码")
        # Save the captcha image locally so the operator can look at it.
        localpath = "E:/python_down/captcha.png"
        urllib.request.urlretrieve(captcha[0], filename=localpath)
        print("请查看本地图片captcha.png并输入对应验证码:")
        # Block until the operator transcribes the captcha.
        data["captcha-solution"] = input()
    else:
        print("此时没有验证码")
    print("登录中…")
    return [FormRequest.from_response(
        response,
        # Carry the session cookie jar along.
        meta={"cookiejar": response.meta["cookiejar"]},
        formdata=data,
        callback=self.next,
    )]
def parse(self, response):
    """Pull the CSRF authenticity token from the login page and submit the
    (blank-credential) login form."""
    # Second hidden input holds the Rails-style authenticity token.
    token = response.xpath('//input[@type="hidden"]/@value')[1].extract()
    open_in_browser(response)
    print("*************\n\n The token is", token)
    login_fields = {
        'utf8': "✓",
        'authenticity_token': token,
        'redirect_to_ssl': "1",
        'pseudonym_session[unique_id]': "",
        'pseudonym_session[password]': "",
        'pseudonym_session[remember_me]': "0",
    }
    return FormRequest.from_response(
        response,
        formdata=login_fields,
        callback=self.scrape_pages,
        dont_filter=True)
def parse(self, response):
    """Douban login: when the page shows a captcha, save it locally and
    have the operator transcribe it; then POST the credentials.

    The original duplicated the whole post dict in both branches; here the
    shared fields are built once and the captcha-only fields ('source',
    'captcha-solution') are added conditionally, preserving behavior.
    """
    # Captcha image src, if present (empty list otherwise).
    captcha = response.xpath('//img[@id="captcha_image"]/@src').extract()
    data = {
        "form_email": "*****@*****.**",  # login account
        "form_password": "******",  # login password
        # After login, go to the personal-center page we want to crawl.
        "redir": "https://www.douban.com/people/182345352/",
    }
    if len(captcha) > 0:
        print("此时有验证码")
        # Store the captcha image locally for the operator to inspect.
        localpath = "L:/MyPythonProgr/SomePythonProjects/AboutSpider/loginpjt/captcha.png"
        urllib.request.urlretrieve(captcha[0], filename=localpath)
        print("请查看本地图片captcha.png并输入对应验证码: ")
        # Wait for the operator to type the captcha text.
        captcha_value = input()
        # These two fields were only sent in the captcha branch originally.
        data['source'] = 'index_nav'
        data["captcha-solution"] = captcha_value
    else:
        print("此时没有验证码")
    print("登录中...")
    return [
        FormRequest.from_response(
            response,
            # Carry the cookie jar for the session.
            meta={"cookiejar": response.meta["cookiejar"]},
            # Browser-like headers.
            headers=self.header,
            formdata=data,
            callback=self.next,
        )
    ]
def parse(self, response):
    """Douban login with optional operator-transcribed captcha.

    BUG FIX: the original xpath was 'img[@id="captcha_image"]/@src'
    (relative, matching only direct <img> children of the document node),
    so it could never find the captcha and the captcha branch was dead.
    It now uses the absolute '//img[...]' form like the sibling spiders.
    """
    captcha = response.xpath('//img[@id="captcha_image"]/@src').extract()
    # Fields shared by both branches; captcha added only when present.
    data = {
        # Login account.
        "form_email": "15963020715",
        "form_password": "******",
        # Page to redirect to after login.
        "redir": "https://www.douban.com/people/172627333/",
    }
    if len(captcha) > 0:
        print("此处有验证码:")
        # Save the captcha image locally for the operator.
        localpath = "E:\program_file\python\crawler_learn\loginpjt\captcha.png"
        urllib.request.urlretrieve(captcha[0], filename=localpath)
        print("请查看图片输入对应验证码:")
        data["captcha-solution"] = input()
    else:
        print("此处没有验证码")
    print("登陆中。。。")
    return [
        FormRequest.from_response(
            response,
            meta={"cookiejar": response.meta["cookiejar"]},
            headers=self.header,
            formdata=data,
            callback=self.next,
        )
    ]
def post_login(self, response): print 'Preparing login' # FormRequeset.from_response是Scrapy提供的一个函数, 用于post表单 # 登陆成功后, 会调用after_login回调函数,如果url跟Request页面的一样就省略掉 return [ FormRequest.from_response( response, url='https://hr.travelsky.net/hr/', meta={'cookiejar': response.meta['cookiejar']}, headers=self.headers, # 注意此处的headers formdata={ "act": "login", "staff_num": "", # 工号 "pass": "", # 密码 }, callback=self.after_login, dont_filter=True) ]
def post_login(self, response): print 'Preparing login' return [ FormRequest.from_response( response, meta={'cookiejar': response.meta['cookiejar']}, headers=self.headers, #注意此处的headers formdata={ 'log': 'admin', 'pwd': 'lngwordpress', 'wp-submit': '登录', 'redirect_to': 'http://121.42.223.111/wp-admin/index.php', 'testcookie': '1' }, callback=self.after_login, dont_filter=True) ]
def parse(self, response):
    """Pair the registration form's input names with canned values (in
    document order) and submit, keeping the filled form in meta."""
    # Input names in the order they appear inside the form table.
    field_names = response.xpath('//form//table//input/@name').extract()
    # NOTE(review): the email literal below contains no % placeholder, so
    # the % operator will raise TypeError -- the address was probably
    # redacted; restore a %s before running.
    field_values = [
        'gjw1',
        'gjw1',
        '*****@*****.**' % self.i,
        '123456',
        '123456',
        self._get_recaptcha(response),
    ]
    form = dict(zip(field_names, field_values))
    yield FormRequest.from_response(
        response,
        formdata=form,
        dont_filter=True,
        callback=self.check,
        meta={'_form': form})
def navigate_tabs(self, response):
    """Navigating results by submitting the search form.

    Reads the total item count from the results table, derives the number
    of 100-item tabs, and re-submits 'refreshForm' once per tab.
    """
    soup = BeautifulSoup(response.body)
    items_count = int(
        soup.find('table', class_='resultsTbl').findAll('b')[2].string)
    # Ceiling division with '//' keeps this an int under Python 3 too --
    # the original 'items_count / 100' would yield a float there and make
    # range() below raise TypeError.
    tabs_count = -(-items_count // 100)
    self.log('Items amount is {0}'.format(items_count))
    self.log('Tabs amount is {0}'.format(tabs_count))
    for page in range(1, tabs_count + 1):
        yield FormRequest.from_response(
            response,
            formname='refreshForm',
            formdata={
                'action': 'Advanced Search',
                'start': str(page)
            },
            callback=self.parse_page)
def parse(self, response):
    """Request historical classification stats for every league season.

    Season ids are the three-digit, zero-padded strings '028'..'113'.
    The original built them with three separate string-prefix loops
    ("0"+28..99, "10"+0..9, "1"+10..13); zfill expresses the identical
    sequence in one comprehension.
    """
    season_ids = [str(x).zfill(3) for x in range(28, 114)]
    for season in season_ids:
        yield FormRequest.from_response(
            response,
            url="http://www.lfp.es/includes/ajax.php",
            formname='estadisticas_historicas',
            formdata={
                'input_competicion': 'Primera',
                'input_temporada': season,
                'input_temporada_nombre': season,
                'action': 'estadisticas_historicas',
                'tipo': 'clasificacion'
            },
            callback=self.parse_classification)
def parse_home(self, response):
    """After login: decline Facebook's 'save-device' prompt if it appears
    (re-entering this callback), otherwise navigate to the target page."""
    # 'save-device' checkpoint: answer "don't save" and come back here.
    if response.xpath("//div/a[contains(@href,'save-device')]"):
        self.logger.info('Got stuck in "save-device" checkpoint')
        self.logger.info('I will now try to redirect to the correct page')
        return FormRequest.from_response(
            response,
            formdata={'name_action_selected': 'dont_save'},
            callback=self.parse_home)
    # Navigate to the page configured on the spider.
    target = response.urljoin(self.page)
    print(target)
    return scrapy.Request(url=target, callback=self.parse_page,
                          meta={'index': 1})