def start_requests(self):
    """Issue one search POST per sport configured in self.sports."""
    for sport in self.sports:
        self.data["league"] = sport
        req = FormRequest(self.url, formdata=self.data)
        # Tag the request so downstream callbacks know which sport it is for.
        req.meta["sport"] = sport
        yield req
def parse(self, response):
    """Fill the purchase form from the item page and submit it.

    Quantity defaults to the largest value offered by the dropdown, or 1
    when the dropdown is missing; contact details come from the cookies
    set earlier in the session.
    """
    hxs = HtmlXPathSelector(response)
    cookies = response.request.cookies
    try:
        # Buy the maximum quantity the dropdown offers.
        count = hxs.select(
            u'//select[@name="ddrQuantity"]/option/@value').extract()[-1]
    except Exception:
        count = 1
    formdata = {
        u'ddrQuantity': unicode(count),  # purchase quantity
        u'txtReceivingRole': cookies[QKYJConst.qkyj_config_Role],  # receiving character
        u'txtSureReceivingRole': cookies[QKYJConst.qkyj_config_Role],  # confirmation
        u'txtPhone': cookies[QKYJConst.qkyj_config_tel],  # contact phone
        u'txtQq': cookies[QKYJConst.qkyj_config_QQ],  # QQ
    }
    try:
        # BUG FIX: the second positional argument of from_response() is
        # ``formname`` -- the callable was being passed as a form name.
        # It must be supplied as the ``callback`` keyword.
        yield FormRequest.from_response(
            response,
            formdata=formdata,
            callback=ZXY_QKYJ_Deal_Spider().parse,
        )
    except Exception as e:
        self.log(u'%s' % str(e), log.INFO)
def _handle_captcha(self, response, callback):
    """Solve the captcha page and resubmit the blocked request.

    Returns a FormRequest carrying the original meta (plus an incremented
    try counter), or None when the captcha could not be solved.  Brought in
    line with the corrected sibling implementation: meta is copied and
    passed at construction, and dupe filtering is disabled because retries
    target the same URL and would otherwise be dropped.
    """
    captcha_solve_try = response.meta.get('captcha_solve_try', 0)
    product = response.meta['product']
    self.log("Captcha challenge for %s (try %d)."
             % (product['url'], captcha_solve_try),
             level=INFO)

    captcha = self._solve_captcha(response)

    if captcha is None:
        self.log(
            "Failed to guess captcha for '%s' (try: %d)." % (
                product['url'], captcha_solve_try),
            level=ERROR
        )
        result = None
    else:
        self.log(
            "On try %d, submitting captcha '%s' for '%s'." % (
                captcha_solve_try, captcha, product['url']),
            level=INFO
        )
        # Copy the whole meta forward instead of mutating the request's
        # meta after construction.
        meta = response.meta.copy()
        meta['captcha_solve_try'] = captcha_solve_try + 1
        meta['product'] = product
        result = FormRequest.from_response(
            response,
            formname='',
            formdata={'field-keywords': captcha},
            callback=callback,
            dont_filter=True,
            meta=meta)

    return result
def _handle_captcha(self, response, callback):
    """Solve the captcha page and resubmit the blocked request.

    Returns a FormRequest that re-enters *callback*, or None when the
    captcha solver failed.  ``captcha_solve_try`` counts attempts across
    redirects via meta.
    """
    # FIXME This is untested and wrong.
    captcha_solve_try = response.meta.get('captcha_solve_try', 0)
    url = response.url
    self.log("Captcha challenge for %s (try %d)."
             % (url, captcha_solve_try),
             level=INFO)

    captcha = self._solve_captcha(response)

    if captcha is None:
        self.log("Failed to guess captcha for '%s' (try: %d)." % (
            url, captcha_solve_try), level=ERROR)
        result = None
    else:
        self.log("On try %d, submitting captcha '%s' for '%s'." % (
            captcha_solve_try, captcha, url), level=INFO)
        # Carry the full meta forward with an incremented try counter.
        meta = response.meta.copy()
        meta['captcha_solve_try'] = captcha_solve_try + 1
        # dont_filter: the retry hits the same URL and must not be deduped.
        result = FormRequest.from_response(
            response,
            formname='',
            formdata={'field-keywords': captcha},
            callback=callback,
            dont_filter=True,
            meta=meta)

    return result
def parse(self, response):
    """Yield one form submission per day from self.start_date through today.

    Preserves the original do-while semantics: at least one request is
    issued even when start_date is in the future.  The unused lxml parse
    and ``__VIEWSTATE``/``cal_Date`` locals of the original were dead work
    and have been removed -- scrapy's from_response() re-reads the form
    fields itself.
    """
    one_day_time = timedelta(days=1)
    today = date.today()
    process_date = self.start_date
    data = {}
    while True:
        data.update({
            "cal_Date": process_date.strftime('%Y-%m-%d')
        })
        # Let scrapy pre-fill the target form from the response, then
        # override just the date field before submitting.
        yield FormRequest.from_response(
            response=response,
            formdata=data,
            formname="frmInfo",
            callback=self.parse_item
        )
        # Advance one day; stop once we pass today.
        process_date += one_day_time
        if process_date > today:
            break
def parse(self, response):
    """Log in by filling the page's second form with our credentials."""
    credentials = {'username': self.username, 'password': self.password}
    login_request = FormRequest.from_response(
        response,
        formdata=credentials,
        formnumber=1,
        callback=self.after_login,
    )
    return [login_request]
def _create_post_requests(self, response, asin):
    """Issue one AJAX reviews request per star rating for *asin*.

    BUG FIX: the original reused a single ``response.meta`` dict and
    overwrote ``_current_star`` on every iteration; since all queued
    requests held a reference to the same dict, they all ended up tagged
    with the *last* star.  Each request now gets its own meta copy.
    """
    url = ('http://www.amazon.com/ss/customer-reviews/ajax/reviews/get/'
           'ref=cm_cr_pr_viewopt_sr')
    for star in self.buyer_reviews_stars:
        args = {
            'asin': asin,
            'filterByStar': star,
            'filterByKeyword': '',
            'formatType': 'all_formats',
            'pageNumber': '1',
            'pageSize': '10',
            'sortBy': 'helpful',
            'reftag': 'cm_cr_pr_viewopt_sr',
            'reviewerType': 'all_reviews',
            'scope': 'reviewsAjax0',
        }
        meta = dict(response.meta)  # per-request copy -- see docstring
        meta['_current_star'] = star
        yield FormRequest(
            url=url,
            formdata=args,
            meta=meta,
            callback=self._get_rating_by_star_by_individual_request,
            dont_filter=True)
def do_get_code(self, response):
    """POST the session guid to /ValiCode/GetCode, then continue to do_list.

    Mirrors the site's own JS:

        Lawyee.CPWSW.List.js:
            $.ajax({ url: "/ValiCode/GetCode", type: "POST", async: false,
                     data: { "guid": guid1 },
                     success: function (data) { yzm1 = data; } });
    """
    try:
        updateHeaders = self.headers.copy()
        updateHeaders['Referer'] = self.start_url
        updateHeaders['Origin'] = self.origin
        self.guid = self.prepare_guid()
        logging.debug('vjkl5:%s' % (self.vjkl5))
        logging.debug('guid:%s' % (self.guid))
        logging.debug('pageno: %d' % (self.pageno))
        yield self.proxy_request(
            FormRequest(url=self.code_url,
                        method="POST",
                        headers=updateHeaders,
                        formdata={'guid': self.guid},
                        callback=self.do_list))
    # CONSISTENCY FIX: was py2-only ``except Exception, e`` (and the bound
    # name was unused); the rest of the file uses the modern form.
    except Exception:
        # Best-effort fallback: flag the proxy as bad and retry the query.
        self.exit_and_set(is_proxy=False, is_query=True)
def request(self, base_url, callback):
    """
    Return a FormRequest object whose formdata fields are fields in
    self.formdata_fields that are also in self.kwargs.
    """
    # Map each declared (form_field, kwarg_name) pair into the payload.
    for field_name, kwarg_name in self.formdata_fields:
        self.formdata[field_name] = self.kwargs[kwarg_name]
    return FormRequest(base_url + self.url,
                       formdata=self.formdata,
                       callback=callback)
def over_under_trends_get(self, response):
    """Submit the page's form, forwarding the team tag from the response."""
    req = FormRequest.from_response(
        response,
        callback=self.over_under_trends,
    )
    # Propagate which team this trends page belongs to.
    req.meta['team'] = response.meta['team']
    return req
def get_ext_requests_or_urls(self):
    """Build one unfiltered POST request per configured (url, form_data) pair."""
    requests = []
    for i, url in enumerate(self.urls):
        fd = self.form_data[i]
        requests.append(
            FormRequest(url=url,
                        formdata=fd,
                        meta={"fd": fd},  # parse() re-reads the payload
                        callback=self.parse,
                        errback=self.err_parse,
                        dont_filter=True))
    return requests
def form_request(self, form_data, callback):
    """POST *form_data* to the configured form URL, bypassing the dupe filter."""
    return FormRequest(
        url=self.form_url,
        method='POST',
        formdata=form_data,
        dont_filter=True,
        callback=callback,
        errback=self.handle_form_error,
    )
def parse(self, response):
    """Fill and submit the login form.

    NOTE(review): credentials are hard-coded in source -- move them to
    settings or a secrets store.
    """
    cookies = response.request.cookies
    name = u"535521469"
    pwd = u"Corleone1016@"
    # BUG FIX: from_response() has no ``parse`` keyword; the unknown kwarg
    # was forwarded to Request() and raised a TypeError.  The callable must
    # be passed as ``callback``.
    return FormRequest.from_response(response, u"loginform",
                                     formdata={u"u": name, u"p": pwd},
                                     callback=FOST_Login_Spider().parse,
                                     cookies=cookies)
def start_requests(self):
    """Start by logging in to the local test server."""
    # NOTE: credentials were scrubbed; replace the placeholders before use.
    credentials = {
        'username': '******',
        'password': '******'
    }
    login = FormRequest('http://localhost:5000/login',
                        formdata=credentials,
                        method='POST')
    return [login]
def start_requests(self):
    """POST one search request per result page for the fixed keyword."""
    keyword = '奥运会'
    url = '{url}?keyword={keyword}'.format(url=self.search_url, keyword=keyword)
    for page in range(self.max_page):
        # Page numbers are 1-based on the server side.
        payload = {'mp': str(100), 'page': str(page + 1)}
        yield FormRequest(url=url, callback=self.parse_index, formdata=payload)
def log_in(self, response):
    """Fill the login form with the configured e-mail/password.

    The ``url`` keyword overrides the form's own action attribute, so the
    credentials are POSTed to ``self.login_url`` instead.
    """
    # fill in username and password
    return FormRequest.from_response(response,
                                     url=self.login_url,
                                     formid="login_form",
                                     formdata={
                                         "email": self.login_user,
                                         "pass": self.login_pass
                                     },
                                     callback=self.after_login)
def start_requests(self): form_request = [ FormRequest("https://flixify.com/login", formdata={ 'email': '*****@*****.**', 'password': '******' }, callback=self.after_login) ] return form_request
def parse(self, response):
    """Submit the page's second form with our stored credentials."""
    login = FormRequest.from_response(
        response,
        formdata={'username': self.username,
                  'password': self.password},
        formnumber=1,
        callback=self.after_login,
    )
    return [login]
def parse(self, response):
    """If already authenticated, scrape directly; otherwise submit the login."""
    if 'Player List' in response.body:
        # Session is already live -- skip straight to the data page.
        return self.after_login(response)
    # dont_click avoids triggering the form's submit button handler;
    # dont_filter lets the login URL be revisited.
    login = FormRequest.from_response(
        response,
        formdata={'login': YAHOO_USERNAME, 'passwd': YAHOO_PASSWORD},
        callback=self.parse_page,
        dont_filter=True,
        dont_click=True)
    return [login]
def start_requests(self):
    """Page through every disclosure channel, 100 records per request.

    BUG FIX: ``limit`` was the *string* ``'100'``, so ``start * limit``
    performed string repetition ('', '100', '100100', ...) instead of
    computing a record offset.  It is now numeric and stringified only
    when building the form payload.
    """
    try:
        limit = 100
        xxpl_map = self.get_channel()
        for xxpl_item in xxpl_map:
            channel_id = xxpl_item.channel_id
            for start in range(1000):
                formdata = {
                    'start': str(start * limit),
                    'limit': str(limit),
                    'channelId': str(channel_id)
                }
                url = "http://www.shclearing.com/shchapp/web/disclosureForTrsServer/search"
                request_data = FormRequest(url=url,
                                           formdata=formdata,
                                           callback=self.parse_data)
                # Let the callback know which channel this page belongs to.
                request_data.meta['xxpl_item'] = xxpl_item
                yield request_data
    except Exception as e:
        logging.error(e, exc_info=True)
        logging.error("Error process: ")
def start_requests(self):
    """Walk months backwards from the current month down to 2010 and request
    the first page of bond announcements for each monthly date window."""
    # for update
    today = datetime.datetime.now()
    for year in range(today.year, 2009, -1):
        for month in range(12, 0, -1):
            # Skip months still in the future this year.
            if year == today.year and month > today.month:
                continue
            day1, ndays = calendar.monthrange(year, month)
            yearString = str(year).zfill(4)
            monthString = str(month).zfill(2)
            # For the current month, only query up to today.
            if year == today.year and month == today.month:
                ndays = today.day
            dayString = str(ndays).zfill(2)
            # Date window in the "YYYY-MM-01 ~ YYYY-MM-DD" form the API expects.
            seDate = yearString + "-" + monthString + "-" + "01" + " ~ " + yearString + "-" + monthString + "-" + dayString
            formdata = {
                'column': 'bond',
                'tabName': 'fulltext',
                'seDate': seDate,
                'pageNum': '1',
                'pageSize': '50'
            }
            # Bond announcements endpoint.
            url = "http://www.cninfo.com.cn/cninfo-new/announcement/query"
            request_bond_latest = FormRequest(url=url,
                                              formdata=formdata,
                                              callback=self.parseData)
            # parseData() uses these meta values to request subsequent pages.
            request_bond_latest.meta['pageNum'] = 1
            request_bond_latest.meta['sourceType'] = 10
            request_bond_latest.meta['url'] = url
            request_bond_latest.meta['seDate'] = seDate
            yield request_bond_latest
def parse(self, response):
    """Fire one __doPostBack form submission per party link in the tree view."""
    ids_re = r'__doPostBack\(\'(.*)\'\)'
    for postback_id in response.css('.AspNet-TreeView-Root a').re(ids_re):
        # The captured id decodes to "<target>','<argument>".
        parts = urllib.parse.unquote(postback_id).split("','")
        payload = {
            '__EVENTTARGET': parts[0],
            '__EVENTARGUMENT': parts[1],
        }
        yield FormRequest.from_response(response=response,
                                        formdata=payload,
                                        callback=self.takeEachParty,
                                        dont_click=True)
def _create_request(self, meta):
    """Build the search POST for the page and sort mode carried in *meta*."""
    order, direction = self.SORT_MODES[self._sort_order]
    browse_payload = WaitroseProductsSpider._DATA.format(
        search_term=meta['search_term'],
        page=meta['current_page'],
        order=order,
        direction=direction)
    return FormRequest(
        url=WaitroseProductsSpider.SEARCH_URL,
        formdata={'browse': browse_payload},
        meta=meta,
    )
def parse(self, response):
    """Submit the login form when it is present on the landing page."""
    self.log("Parsing", level=logging.INFO)
    login_form = response.xpath('//form[@id="loginForm"]')
    if login_form:
        self.log("Found login form", level=logging.INFO)
        yield FormRequest.from_response(response,
                                        formid='loginForm',
                                        formdata={'cust': self.member,
                                                  'pin': self.pin,
                                                  'rToken': 'null',
                                                  },
                                        callback=self.after_login)
    self.log("Done parsing main page", logging.INFO)
def get_request_object(self, params): """构造request对象""" formdata = params.get('formdata', {}) if formdata: if isinstance(formdata, dict): return FormRequest(**params) else: s = json.dumps(formdata, ensure_ascii=False) log.warning("formdata:{}格式不对, 无法制造FormRequest对象".format(s)) return None else: temp_params = copy.deepcopy(params) if 'formdata' in temp_params: del temp_params['formdata'] return Request(**temp_params)
def parse(self, response): print "FFFFFFFFFFFFFFF" fd= { "accountLocked":"null", "auth_mode":"BASIC", "orig_url":"null", "password":"******", "user":"******", "userId":"null", "userName":"******", "userNameCB":"on", } return FormRequest.from_response(response, formdata=fd,callback=self.navi_swarm)
def login_request(self):
    """Build the login POST -- CMR data is only available to logged-in users."""
    site_creds = CMRCredentials().load_credentials()
    payload = {
        'user': site_creds['username'],
        'pass': site_creds['password'],
    }
    # High priority so the login executes before any queued page requests;
    # dont_filter because the login URL may be requested again later.
    return FormRequest(self.login_url,
                       formdata=payload,
                       callback=self.verify_login,
                       priority=100,
                       dont_filter=True)
def parse_book(self, response):
    """Extract sermon items for one book and follow the ASP.NET pager."""
    # Book slug is the last path segment of the URL.
    book = response.url.split("/")[-1]
    for sel in response.xpath(
        '//table[@id="ctl00_ctl00_MainContent_SubMain_gvTitles"]/tr[not(@class="paging")]/td/li'
    ):
        sermon = SermonItem()
        sermon['book'] = book
        # ``(... or [None])[0]`` collapses an empty extraction to None.
        sermon['title'] = (sel.xpath('h5/a/text()').extract() or [None])[0]
        date_preached_val = (
            sel.xpath('p[1]/strong[@class="title"]/text()').extract() or
            [None])[0]
        date_preached = parse_date(
            date_preached_val) if date_preached_val else None
        sermon['date_preached'] = date_preached.strftime(
            "%Y-%m-%d") if date_preached else ''
        sermon['scripture'] = (
            sel.xpath('p[1]/strong[@class="date"]/text()').extract() or
            [None])[0]
        # NOTE(review): index [1] raises IndexError when the [None] fallback
        # is used -- confirm p[1] always carries at least two text nodes.
        sermon['ref'] = (sel.xpath('p[1]/text()').extract() or [None])[1]
        sermon['link'] = (
            sel.xpath('p[2]/strong[@class="date"]/a/@href').extract() or
            [None])[0]
        yield sermon
    # ASP.NET pager: resubmit the viewstate with a Page$N postback event to
    # fetch the next page of the grid.
    viewstate = response.xpath(
        "//input[@id='__VIEWSTATE']/@value").extract().pop()
    pages = response.xpath(
        '//table[@id="ctl00_ctl00_MainContent_SubMain_gvTitles"]/tr[@class="paging"][1]/td/table/tr/td'
    )
    # The current page is rendered as a <span>, other pages as links.
    current_page_elems = pages.xpath('span/text()').extract()
    if len(current_page_elems) > 0:
        current_page = int(current_page_elems[0])
        next_page = current_page + 1
        if next_page <= len(pages):
            argument = u"Page$%s" % str(next_page)
            data = {
                '__EVENTTARGET': u"ctl00$ctl00$MainContent$SubMain$gvTitles",
                '__EVENTARGUMENT': argument,
                '__LASTFOCUS': u'',
                '__EVENTVALIDATION': u'',
                '__VIEWSTATE': viewstate
            }
            yield FormRequest(response.url,
                              formdata=data,
                              callback=self.parse_book)
def parse(self, response): form_data = response.meta.get("fd", {}) self.logger1.info("start get pages in parse function, url:{}".format(response.url)) try: page_str = ''.join(response.xpath(".//div[@class='page']//a[last()]//@onclick").extract()) pages = ''.join(re.findall("(\d+)", page_str)) pages = int(pages) if pages else 1 # 翻页 for page in range(1, pages+1): form_data.update({"pageNo": str(page)}) yield FormRequest(url=response.url, formdata=form_data, callback=self.parse_page, meta={"pg": page}, errback=self.err_parse, dont_filter=True) except Exception: msg = traceback.format_exc() self.logger1.error("something goes wrong in parse function: {}".format(msg))
def parse(self, response):
    """Submit one region button per table row to drill into district pages."""
    hxs = HtmlXPathSelector(response)
    # The form's action attribute gives the POST target.
    url = self.base_url + "/" + hxs.select('/html/body/form[@id="Form1"]/@action').extract()[0]
    viewstate = hxs.select('//input[@id="__VIEWSTATE"]/@value').extract()[0]
    # Skip the header row and the trailing summary row.
    for tr in hxs.select('//div[@id="MarketRent1_pnlRegion"]/div/table/tr[position() > 1 and position() < last()]'):
        formdata = {}
        formdata['__VIEWSTATE'] = viewstate
        formdata['__EVENTTARGET'] = ''
        formdata['__EVENTARGUMENT'] = ''
        formdata['__VIEWSTATEENCRYPTED'] = ''
        # Simulate clicking this row's submit input.
        formdata[tr.select('td/input/@name').extract()[0]] = tr.select('td/input/@value').extract()[0]
        region = tr.select('td/input/@title').extract()[0]
        # The default argument freezes ``region`` per iteration; a plain
        # closure would late-bind and see only the last region.
        yield FormRequest(url, formdata=formdata, callback=lambda r, region=region:self.parse_district(r, region))
def login(self, response=None): print 'login' url = self.login_page if(response == None): return Request(url, method="POST", dont_filter=True, callback=self.login) name = html.fromstring(response.body).cssselect('#ithelpProfile h3') if(name): return self.make_requests() else: user = raw_input("enter username: "******"enter password: "******"username":user, "password":password} return FormRequest.from_response(response, formname="login", formdata=data, callback=self.login, dont_filter=True)
def parse_district(self, response, region):
    """Submit one district/area button per table row within *region*."""
    # self.log("Parsing Region: " + region + "...")
    hxs = HtmlXPathSelector(response)
    url = self.base_url + "/Utilities/marketrent/" + hxs.select('/html/body/form[@id="Form1"]/@action').extract()[0]
    viewstate = hxs.select('//input[@id="__VIEWSTATE"]/@value').extract()[0]
    # Skip the header row.
    for tr in hxs.select('//div[@id="MarketRent1_pnlLocation"]/div/table/tr[position() > 1]'):
        formdata = {}
        formdata['__VIEWSTATE'] = viewstate
        formdata['__EVENTTARGET'] = ''
        formdata['__EVENTARGUMENT'] = ''
        formdata['__VIEWSTATEENCRYPTED'] = ''
        # Simulate clicking this row's submit input.
        formdata[tr.select('td/input/@name').extract()[0]] = tr.select('td/input/@value').extract()[0]
        # The button title is formatted as "<district> - <area>".
        (district, area) = tr.select('td/input/@title').extract()[0].split(' - ')
        # Default arguments freeze the loop variables per iteration (avoids
        # the late-binding-closure pitfall).
        yield FormRequest(url, formdata=formdata, callback=lambda r, region=region, district=district, area=area:self.parse_area(r, region, district, area))
def make_request(url, method='GET', formdata=None, jsondata=None, headers=None, **kwargs):
    """Dispatch to FormRequest / JsonRequest / Request based on the payload.

    Supply at most one of ``formdata`` (form-encoded POST) or ``jsondata``
    (JSON body); with neither, a plain Request is returned.  Extra kwargs
    are forwarded to the request constructor.
    """
    if formdata:
        return FormRequest(url=url, method=method, formdata=formdata,
                           headers=headers, **kwargs)
    elif jsondata:
        # BUG FIX: JsonRequest takes its payload via ``data=``; there is no
        # ``jsondata`` keyword, so the original call raised a TypeError.
        return JsonRequest(url=url, method=method, data=jsondata,
                           headers=headers, **kwargs)
    else:
        return Request(url, method=method, headers=headers, **kwargs)
def parse(self, response):
    """Register a throw-away account via the checkout registration form."""
    # NOTE(review): this format string was scrubbed -- '*****@*****.**' has
    # no %s placeholder, so the '%' below raises TypeError.  Restore the
    # real e-mail template (e.g. '%s@example.com') before running.
    self.email = '*****@*****.**' % self.username
    fr = FormRequest.from_response(
        response=response,
        formxpath='//*[@id="registerForm"]',
        formdata={
            'action': 'register',
            'redirect': 'ajax',
            'source': 'Checkout',
            'SFproductID': '',
            'first_name': self.username,
            'last_name': self.username,
            'username': self.email,
            'password': self.password,
        },
        callback=self.account_created
    )
    # POST to the auth endpoint instead of the form's own action URL.
    fr = fr.replace(url="https://www.veritasprep.com/checkout/LIBRARY/auth/AEntry.php")
    yield fr
def parse_search_page(self, response): self.log("English selected", logging.INFO) # with codecs.open('index.html', 'w', 'utf-8') as f: # f.write(response.text) # inspect_response(response, self) # extract some info about the user user_name = response.xpath('//span[@class="header-name"]/text()').extract_first() user_number = response.xpath('//span[@class="header-number"]/text()').extract_first() user_mileage = response.xpath('//span[@class="header-mileage"]/text()').extract_first() self.log("name=" + user_name, logging.INFO) self.log("number=" + user_number, logging.INFO) self.log("mileage=" + user_mileage, logging.INFO) # submit search form to /adr/SearchProcess.do; this will redirect # to /adr/Results.do, which is responsible for displaying the progress # bar yield FormRequest.from_response(response, formxpath='//form[@name="onewayTravel"]', formdata={ 'currentTripTab': 'oneway', 'modifySearch': 'false', 'forceIkk': 'false', 'city1FromOneway': 'YYZ', 'city1FromOnewayCode': 'YYZ', 'city1ToOneway': 'LHR', 'city1ToOnewayCode': 'LHR', 'l1Oneway': '2016-07-09', 'l1OnewayDate': '2016-07-09', 'OnewayFlexibleDatesHidden': '0', 'OnewayAdultsNb': '1', 'OnewayChildrenNb': '0', 'OnewayTotalPassengerNb': '1', 'OnewayCabin': 'Business', }, callback=self.parse_results_do )
def _generate_request(self, index, rule, link):
    """
    Build a Request (or FormRequest for POST rules) for *link* per *rule*.

    :param index: rule index (stored in meta so the response can be routed
                  back to its rule)
    :param rule: rule object carrying method/headers/params/meta options
    :param link: link object with ``url`` and ``text``
    :return: new request object
    """
    # Append rule params to the URL query string when present.
    url = furl(link.url).add(rule.params).url if rule.params else link.url
    if rule.method == 'POST':
        r = FormRequest(url=url, formdata=rule.data, headers=rule.headers,
                        priority=rule.priority, dont_filter=rule.dont_filter,
                        callback=self._response_downloaded)
    else:
        r = Request(url=url, method=rule.method, headers=rule.headers,
                    priority=rule.priority, dont_filter=rule.dont_filter,
                    callback=self._response_downloaded)
    # update meta args
    r.meta.update(**rule.meta)
    # update rule index and link text
    r.meta.update(rule=index, link_text=link.text)
    meta_items = [
        'dont_redirect', 'dont_retry', 'handle_httpstatus_list',
        'handle_httpstatus_all', 'dont_cache', 'dont_obey_robotstxt',
        'download_timeout', 'max_retry_times', 'proxy', 'render'
    ]
    # Copy only the rule options that were explicitly set (non-None) into
    # the request meta, so scrapy middleware defaults stay untouched.
    meta_args = {
        meta_item: getattr(rule, meta_item)
        for meta_item in meta_items
        if not getattr(rule, meta_item) is None
    }
    # update extra meta args
    r.meta.update(**meta_args)
    return r
def parse(self, response):
    """Fill the purchase form from the item page and submit it.

    Quantity defaults to the largest dropdown value, falling back to 1;
    contact details come from the session cookies.
    """
    hxs = HtmlXPathSelector(response)
    cookies = response.request.cookies
    try:
        count = hxs.select(u'//select[@name="ddrQuantity"]/option/@value').extract()[-1]
    except Exception:
        count = 1
    formdata = {u'ddrQuantity': unicode(count),  # purchase quantity
                u'txtReceivingRole': cookies[QKYJConst.qkyj_config_Role],  # receiving character
                u'txtSureReceivingRole': cookies[QKYJConst.qkyj_config_Role],  # confirmation
                u'txtPhone': cookies[QKYJConst.qkyj_config_tel],  # contact phone
                u'txtQq': cookies[QKYJConst.qkyj_config_QQ],  # QQ
                }
    try:
        # BUG FIX: from_response()'s second positional argument is
        # ``formname``; the callable was being passed as a form name.
        # Pass it as ``callback`` instead.
        yield FormRequest.from_response(response,
                                        formdata=formdata,
                                        callback=ZXY_QKYJ_Deal_Spider().parse)
    except Exception as e:
        self.log(u'%s' % str(e), log.INFO)
def parse(self, response):
    """Save the current result page to disk and paginate via ASP.NET postbacks."""
    sel = Selector(response)
    if len(sel.extract()) < 10000:  # this is an empirical value to prevent error page
        # Suspiciously short page: retry the exact same request unfiltered.
        new_request = response.request.copy()
        new_request.dont_filter = True
        yield new_request
    else:
        # Persist the raw page for offline processing.
        file = open(
            "C:/Users/Dell/Desktop/test/page_%s.html" % str(self.page_count),
            "w")
        for line in sel.extract():
            file.write(line.encode("utf-8"))
        file.close()
        self.page_count = self.page_count + 1
        log.msg("page length is " + str(len(sel.extract())))
        next_formdata = self.formdata.copy()
        next_page_js = sel.css("a#_ctl0_m_DisplayCore_dpy2")
        hasNext = True
        if next_page_js:
            # "Next" link present: lift __EVENTTARGET/__EVENTARGUMENT out of
            # its javascript href and grab the page's viewstate.
            next_page_js = next_page_js.xpath("@href").extract()[0]
            mat = self.js_call_pat.match(next_page_js)
            next_formdata["__EVENTTARGET"] = urllib.unquote(mat.group(1))
            next_formdata["__EVENTARGUMENT"] = urllib.unquote(mat.group(2))
            viewstate = sel.css("input#__VIEWSTATE")[0].xpath(
                "@value").extract()[0]
        else:
            # Fallback: scrape the postback arguments and viewstate out of
            # the raw body (AJAX partial responses are not regular HTML).
            body = ""
            for line in sel.extract():
                body += line
            start = body.rfind("_ctl0_m_DisplayCore_dpy2")
            end = body.find("Next")
            if start >= end:
                hasNext = False
            else:
                try:
                    next_js = body[start:end]
                    mat = re.match(r".*?%5C'(.*?)%5C.*%5C'(.*?)%5C'", next_js)
                    next_formdata["__EVENTTARGET"] = urllib.unquote(
                        mat.group(1))
                    next_formdata["__EVENTARGUMENT"] = urllib.unquote(
                        mat.group(2))
                    # Viewstate sits between "__VIEWSTATE|" and the next "|".
                    start = body.find("__VIEWSTATE")
                    end = body.find("|", start + 12)
                    viewstate = body[start + 12:end]
                except Exception, e:
                    hasNext = False
                    print(e)
                    print(traceback.format_exc())
        if hasNext:
            next_formdata["__VIEWSTATE"] = urllib.unquote(viewstate)
            log.msg("Yield Next Request %d" % self.page_count, level=log.INFO)
            yield FormRequest(url=self.start_urls[0],
                              formdata=next_formdata,
                              callback=self.parse)
        contents = sel.css(
            "div.singleLineDisplay.ajax_display.d1085m_show table.d1085m2 td.d1085m10 a"
        )
        # One detail request per result-row link.
        # NOTE(review): ``viewstate`` is unbound here when no next page was
        # found -- confirm every result page defines it before this loop.
        for ele in contents:
            item_str = ele.extract()
            mat = self.js_call_pat.match(item_str)
            cur_formdata = self.formdata.copy()
            cur_formdata["__EVENTTARGET"] = urllib.unquote(mat.group(1))
            cur_formdata["__EVENTARGUMENT"] = urllib.unquote(mat.group(2))
            cur_formdata["__VIEWSTATE"] = urllib.unquote(viewstate)
            yield FormRequest(url=self.start_urls[0],
                              formdata=cur_formdata,
                              callback=self.parse_item)
def parseData(self, response):
    """Parse one JSON page of announcements, emit new items, queue next page."""
    try:
        result_json_string = response.body_as_unicode()
        sourceType = response.meta.get('sourceType')
        try:
            result_json = json.loads(result_json_string)
            announcements_json = result_json['classifiedAnnouncements']
        except:
            # not available site
            return
        existCount = 0
        dataJsonList = []
        if announcements_json is None:
            # Some responses carry a flat 'announcements' list instead of
            # the classified (list-of-lists) layout.
            if 'announcements' in result_json:
                announcements_json = result_json['announcements']
            if announcements_json is not None:
                for data_json in announcements_json:
                    dataJsonList.append(data_json)
        else:
            # Classified layout: flatten the nested lists.
            for array_json in announcements_json:
                for data_json in array_json:
                    dataJsonList.append(data_json)
        if len(dataJsonList) == 0:
            return
        for data_json in dataJsonList:
            dataItem = CninfoItem()
            self.fillItemByJson(data_json, dataItem)
            dataItem["sourceType"] = sourceType
            # Skip announcements already stored in the database; track a run
            # of consecutive duplicates (early-exit logic left disabled).
            if self.session.query(Cninfo).filter(
                    Cninfo.announcementId == dataItem['announcementId']).first():
                existCount += 1
            else:
                existCount = 0
                # Attachment download is disabled; emit without a file path.
                dataItem['file_path'] = None
                yield dataItem
        # Queue the next page with the same date window.
        seDate = response.meta.get('seDate')
        pageNum = response.meta.get('pageNum')
        pageNum += 1
        formdata = {
            'column': 'bond',
            'tabName': 'fulltext',
            'seDate': seDate,
            'pageNum': str(pageNum),
            'pageSize': '50'
        }
        url = response.meta.get('url')
        request = FormRequest(url=url, formdata=formdata, callback=self.parseData)
        request.meta['pageNum'] = pageNum
        request.meta['sourceType'] = sourceType
        request.meta['url'] = url
        yield request
    except Exception as e:
        logging.error(e, exc_info=True)
        logging.error("Error process: " + response.url)
def parse_start_url(self, response):
    """On the init page, submit its second form to continue the crawl."""
    if 'init' not in response.url:
        return
    yield FormRequest.from_response(response, formnumber=1)
def login(self, response):
    """Fill the board's login form (credential values left blank here)."""
    credentials = {
        'ips_username': '',  # your username
        'ips_password': '',  # your password
    }
    return FormRequest.from_response(
        response,
        formxpath='//ul[@class="ipsForm ipsForm_vertical ipsPad_double left"]',
        formdata=credentials,
        callback=self.check_login_response)
def login(self, response):
    """Generate a login request."""
    credentials = {'name': 'herman', 'password': '******'}
    return FormRequest.from_response(response,
                                     formdata=credentials,
                                     callback=self.check_login_response)
def parse(self, response):
    """Select the current term in the page's second form and submit it."""
    term_request = FormRequest.from_response(
        response,
        formnumber=1,
        formdata={'termChoice': curTerm},
        callback=self.parse_result_page)
    yield term_request
def do_list(self, response):
    """Compute the site's anti-scraping ``vl5x`` key by executing its own
    javascript getKey() via js2py, then POST the search-list request."""

    def getKeyFunc():
        # Concatenate the site's crypto helper scripts with the page's
        # inline getKey() definition into one executable JS source blob.
        idx1 = self.mainBody.find('function getKey()')
        idx2 = self.mainBody.rfind('</script>')
        md5 = open('../js/md5.js', 'r').read()
        base64 = open('../js/base64.js', 'r').read()
        sha1 = open('../js/sha1.js', 'r').read()
        lawye = open('../js/Lawyee.CPWSW.Common.js', 'r').read()
        # Flags allow individual helper scripts to be toggled off.
        flag_list = [1, 1, 1, 1]
        js_list = [md5, base64, sha1, lawye]
        cont = ''
        for idx in range(len(flag_list)):
            flag = flag_list[idx]
            if flag == 1:
                cont += '\n' + js_list[idx]
        cont += '\n'
        cont += self.mainBody[idx1:idx2]
        return cont

    def runJSFunc(jscode):
        """Execute *jscode* with js2py and return getKey()'s result."""
        # The JS reads the vjkl5 cookie; stub getCookie() to return ours.
        jscode += "\nfunction getCookie(xx){return '%s'}" % (self.vjkl5)
        import sys
        # js2py recurses deeply on this script; raise the limit first.
        sys.setrecursionlimit(100000)
        context = js2py.EvalJs()
        context.execute(jscode)
        return context.getKey()

    def getvkey():
        jscode = runJSFunc(getKeyFunc())
        return jscode

    try:
        self.number = response.body.decode(response.encoding)
        vkey = getvkey()
        param = self.generate_post_param()
        formdata = {
            'Param': param,
            'Index': '%d' % (self.pageno),
            'Page': '%d' % (self.num_per_page),
            'Order': '法院层级',
            'Direction': 'asc',
            'guid': self.guid,
            'number': self.number,
            'vl5x': vkey
        }
        logging.debug(formdata)
        updateHeaders = self.headers.copy()
        updateHeaders['Referer'] = self.start_url
        updateHeaders['Origin'] = self.origin
        yield self.proxy_request(
            FormRequest(url=self.list_url,
                        method="POST",
                        headers=updateHeaders,
                        formdata=formdata,
                        callback=self.after_list))
    except Exception, e:
        logging.error(e)
        # Flag the proxy as bad and retry the query.
        self.exit_and_set(is_proxy=False, is_query=True)
def login(self, response):
    """Generate a login request from the page's form."""
    credentials = {
        'username': self.username,
        'password': self.password,
    }
    return FormRequest.from_response(response,
                                     formdata=credentials,
                                     callback=self.check_login_response)
def login(self, response):
    """Submit the login form (credentials scrubbed to placeholders)."""
    payload = {'username': '******', 'password': '******'}
    return FormRequest.from_response(response,
                                     formdata=payload,
                                     callback=self.after_login)