def next_page(self, response, now_page, total_page, __VIEWSTATE, __EVENTTARGET):
    if int(total_page) > int(now_page):
        # if int(now_page) < 6:
        formdata_n = {
            '__EVENTTARGET': __EVENTTARGET,
            '__VIEWSTATE': __VIEWSTATE,
            'FManageDeptID': '-1',
            'FLevel': '0',
            'FIsWright': '-1'
        }
        return FormRequest(response.url, formdata=formdata_n, callback=self.parse)
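# A minimal sketch (not part of the original source) of a parse() callback that
# could drive next_page() above. The two selectors and the pager control name
# are hypothetical; only the __VIEWSTATE/__EVENTTARGET handling follows the
# ASP.NET postback convention the snippet relies on.
def parse(self, response):
    # ... extract items from the current page here ...
    viewstate = response.xpath('//input[@name="__VIEWSTATE"]/@value').extract_first()
    total_page = response.xpath('//span[@id="lblPageCount"]/text()').extract_first()  # hypothetical selector
    now_page = response.meta.get('now_page', 1)
    if viewstate and total_page:
        request = self.next_page(response, now_page, total_page,
                                 viewstate, 'AspNetPager1')  # assumed control ID
        if request is not None:
            request.meta['now_page'] = int(now_page) + 1
            yield request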
def start_requests(self):
    for i in self.keywords.keys():
        for j in range(self.keywords[i]):
            self.payload['search'] = i
            self.payload['page'] = str(j)
            url = 'https://www.laosiji.com/proxy/api'
            yield FormRequest(url=url, callback=self.parse, formdata=self.payload,
                              headers=self.headers, dont_filter=True)
def start_requests(self):
    yield FormRequest(self.url,
                      headers=self.headers,
                      formdata={
                          'first': 'false',
                          'pn': str(self.page),
                          'kd': 'Python',
                          'city': '广州'
                      },
                      callback=self.parse)
def start_requests(self):
    with open('../out/docs_simple4.json', 'r') as f:
        for line in f.readlines():
            doc = json.loads(line)
            if not os.path.isfile(get_path(doc)):
                # random sampling: keep roughly 7% of the missing documents
                if random.random() > 0.07:
                    continue
                yield FormRequest('http://ras.arbitr.ru/Ras/HtmlDocument/%s' % doc['doc_id'],
                                  formdata={'hilightText': 'null'},
                                  meta=doc,
                                  headers={'User-Agent': 'Wget/1.19.4 (linux-gnu)'})
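# A minimal sketch of the get_path() helper the snippet above assumes; its real
# definition is not shown, so the directory layout here is hypothetical:
# one file per document, named after doc_id.
def get_path(doc):
    return os.path.join('../out/docs', '%s.html' % doc['doc_id'])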
def start_requests(self):
    # Maximum page count
    # MAX_PAGE_COUNT = 64
    MAX_PAGE_COUNT = 1
    for page in range(0, MAX_PAGE_COUNT):
        url = list_origin_url % (page * 60)
        yield FormRequest(url, meta={'cookiejar': str(page)}, headers=headers,
                          cookies=cookies, callback=self.parse)
def run_basket_parse(self, response):
    formdata = {
        'dlw100$Update$1': '1',
        'ddlDeliveryCC': '36',
        'dlw$MatrixID': '1',
        '__EVENTTARGET': 'dlw100$DeliveryUpdate'
    }
    yield FormRequest(
        'http://www.prodirectsoccer.com/V3_1/V3_1_Basket.aspx',
        callback=self.parse_shipping,
        formdata=formdata)
def _parse_hg_mid(self, response):
    mid_categories = response.xpath(".//*[@class='yahei f14 rgt mr20']")
    for mid_category in mid_categories:
        page_url = mid_category.xpath("./@href").extract()[0]
        url = urljoin(self.base_url, page_url)
        request = FormRequest(url, callback=self._parse_first, dont_filter=True)
        request.meta["large_category"] = response.meta["large_category"]
        request.meta["callback"] = self._parse_hg
        yield request
def start_requests(self):
    data = {
        'start': '0',
        'length': '6',
        'pageLength': '6',
        '_order': '1:b'
    }
    yield FormRequest(url=self.start_url, formdata=data,
                      callback=self.dataset_list_parse, meta={'data': data})
def _parse_page_free(self, response):
    total_pages = int(
        clean_text(
            response.xpath(".//*[@class='pages']//a//text()").extract()[-2].strip()))
    first_url = response.meta["first_url"]
    request = FormRequest(first_url, callback=self._parse_free, dont_filter=True)
    request.meta["large_category"] = response.meta["large_category"]
    yield request
    if total_pages > 1:
        for i in range(1, total_pages):  # range, not the Python 2-only xrange
            next_page = first_url[:-5] + '-p' + str(i + 1) + '.html'
            request = FormRequest(next_page, callback=self._parse_free,
                                  dont_filter=True)
            request.meta["large_category"] = response.meta["large_category"]
            yield request
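# A minimal sketch of the clean_text() helper the snippet above calls; the
# original definition is not shown, so this is a guess at its behavior:
# collapse internal whitespace and strip the result.
def clean_text(text):
    return ' '.join(text.split())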
def start_requests(self):
    yield FormRequest(
        url='http://www.sanxianginvest.com/api.php?c=login&f=save&_noCache=0.8049219487167945',
        headers={'Referer': 'http://www.sanxianginvest.com'},
        formdata={
            'post_date': '2025-05-08 16:00:43',
            'pdip': '203.110.179.245',
            'user': self.username,
            'pass': self.password
        })
def parse_pre_login(self, response):
    yield FormRequest(url='http://www.shcfic.com/user.php?act=act_login',
                      formdata={'mobile': '13523794375',
                                'name': '123456',
                                'utf8': '✓',
                                'back_act': 'http://www.shcfic.com/index.php'
                                },
                      meta={
                          'handle_httpstatus_list': [302],
                      },
                      callback=self.parse_login)
def parse(self, response):
    areas = response.xpath('//*[@id="selArea"]/optgroup/option')
    for area in areas:
        formdata = {
            "areaID": area.xpath('.//@value').extract_first(),
            "areaNombre": area.xpath('.//text()').extract_first()
        }
        yield FormRequest(URL_API_MATERIAS, formdata=formdata,
                          callback=self.parse_areas, meta={'formdata': formdata})
def parse_link(self, response):
    id = response.meta['id']
    newsType = response.meta['newsType']
    url = 'http://xyxx.zjfda.gov.cn/ajax/ajax!detail_cjbhg_sp.do'
    post_request = FormRequest(url=url,
                               formdata={
                                   'queryBean.id': '%s' % id,
                                   'queryBean.newsType': '%s' % newsType
                               },
                               callback=self.parse_page)
    yield post_request
def start_requests(self):
    # Get the year to be crawled from the arguments.
    # The year is passed like this: scrapy crawl gazettes -a year=2017
    # Default to the current year if no year is passed in.
    try:
        year = self.year
    except AttributeError:
        year = datetime.now().strftime('%Y')
    url = 'https://dds.crl.edu/item/json'
    form_data = {'year': str(year), 'TitleLink': str(27040)}
    yield FormRequest(url, callback=self.parse, formdata=form_data)
def start_requests(self):
    keyword = '000001'
    url = '{url}?keyword={keyword}'.format(url=self.search_url, keyword=keyword)
    # for page in range(self.max_page + 1):
    for page in range(1):
        data = {'mp': str(self.max_page), 'page': str(page)}
        yield FormRequest(url=url, callback=self.parse_index, formdata=data,
                          dont_filter=True)
def parse_pre_login(self, response):
    yield FormRequest(url='http://www.sz-sgd.com/User.ashx',
                      formdata={'username': '******',
                                'password': '******',
                                'r': '0.9917246479356379',
                                'type': 'login',
                                },
                      meta={
                          'handle_httpstatus_list': [302],
                      },
                      callback=self.parse_login)
def start_requests(self):
    post_url = 'https://job.alibaba.com/zhaopin/socialPositionList/doList.json'
    for i in range(1, 804):
        data = {
            'pageSize': '10',
            't': '0.9258839192303483',
            'pageIndex': '%d' % i
        }
        yield FormRequest(url=post_url, formdata=data, callback=self.parse_json)
def start_requests(self):
    city = '北京'
    needAddtionalResult = 'false'
    url = self.url.format(city=city, needAddtionalResult=needAddtionalResult)
    first = 'true'
    for page in range(self.max_pn + 1):
        data = {'first': first, 'kd': self.kd, 'pn': str(page)}
        yield FormRequest(url, callback=self.parse_info, formdata=data)
        first = 'false'
def start_requests(self):
    for keyword in self.keywords:
        base_url = '{url}?keyword={keyword}'.format(url=self.search_url,
                                                    keyword=keyword)
        for page in range(self.max_page + 1):
            # build each page URL from the base, rather than appending
            # another &page= parameter onto the same string every iteration
            url = base_url + '&page={page}'.format(page=str(page))
            data = {
                'keyword': keyword,
                'page': str(page),
            }
            yield FormRequest(url, callback=self.parse, formdata=data)
def start_requests(self):
    end_date = date.today()
    periods_of_interest = [(dt.year, dt.month)
                           for dt in rrule(freq=MONTHLY, dtstart=self.start_date,
                                           until=end_date)]
    for year, month in periods_of_interest:
        data = dict(ano=str(year), mes=str(month), passo="1", enviar="")
        yield FormRequest(
            "http://www.pmf.sc.gov.br/governo/index.php?pagina=govdiariooficial",
            formdata=data,
        )
def start_requests(self):
    url = "https://space.bilibili.com/ajax/member/GetInfo"
    data = {
        "mid": "",
        "csrf": "578fcf3bca4387ff58c87f70040785e8",
    }
    for mid in range(1, 10000):
        if self.is_pause:
            break
        data["mid"] = str(mid)
        yield FormRequest(url=url, formdata=data, callback=self.parse)
def parse(self, response):
    # total_page = dict(response.xpath("//a[@sf='pagebar']/@*[name()='sf:data']").extract_first().strip("()")).get("pc")
    total_page = int(re.findall(
        r".*?pc:(\d+).*",
        response.xpath("//a[@sf='pagebar']/@*[name()='sf:data']").extract_first())[0])
    for page in range(1, total_page + 1):
        # for page in range(1, 50):
        formdata = {
            '$total': str(total_page),
            '$reload': '0',
            '$pg': str(page),
            '$pgsz': '15'
        }
        yield FormRequest(response.url, formdata=formdata,
                          callback=self.parse_companylist)
def start_requests(self):
    mformat = 'csi%Y%m%d.zip'
    end_date = datetime.now().strftime(mformat)
    start_date = self.get_nday_ago(end_date, 10, dformat=mformat)
    while start_date <= end_date:
        furl = self.start_url + start_date
        yield FormRequest(url=furl, method='GET', callback=self.parse,
                          errback=self.errback_httpbin)
        start_date = self.get_tomorrow_date(sdate=start_date, dformat=mformat)
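# Minimal sketches of the two date helpers the snippet above calls; their real
# definitions are not shown, so these are inferred from usage: both take and
# return strings in the given dformat (assumes
# `from datetime import datetime, timedelta`).
def get_nday_ago(self, sdate, n, dformat):
    return (datetime.strptime(sdate, dformat) - timedelta(days=n)).strftime(dformat)

def get_tomorrow_date(self, sdate, dformat):
    return (datetime.strptime(sdate, dformat) + timedelta(days=1)).strftime(dformat)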
def start_requests(self):
    workbook = xlrd.open_workbook(r'C:\Users\99329\Desktop\待测试数据.xls')
    for i in self.open_file():
        s = i.strip()
        # s = 'http://upload.51qianmai.com/20180126064925821.jpg'
        data = {'channel': 'abc', 'picturl': s}
        # data = {'channel': 'abc', 'picturl': i}
        yield FormRequest(url=self.start_url, formdata=data, callback=self.parse,
                          meta={'url': s})
def parse_topic(self, response):
    # Get the URL of each sub-topic.
    # Read the variables passed through meta.
    offset = response.meta.get("offset")
    topic_id = response.meta.get("topic_id")
    topic_name = response.meta.get("name")
    # Parse the response; json_info is a dict.
    json_info = json.loads(response.text)
    # The value under the "msg" key is a list.
    msg_info = json_info['msg']
    offset += len(msg_info)
    date = {"topic": topic_name}
    # If msg_info holds fewer than 20 entries, this is the last page and no
    # further listing request is issued (see the check after the loop).
    for x in msg_info:
        child_id = re.search(r'\/topic\/(\d+)', x).group()
        id = re.search(r'(\d+)', child_id).group()
        # Assemble the parameters needed for the request below.
        url1 = 'https://www.zhihu.com/api/v4/topics/'  # note: "topics", not "topic"
        url2 = '/feeds/essence?include=data%5B%3F%28target.type%3Dtopic_sticky_module%29%5D.target.data%5B%3F%28target.type%3Danswer%29%5D.target.content%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%3Bdata%5B%3F%28target.type%3Dtopic_sticky_module%29%5D.target.data%5B%3F%28target.type%3Danswer%29%5D.target.is_normal%2Ccomment_count%2Cvoteup_count%2Ccontent%2Crelevant_info%2Cexcerpt.author.badge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3Bdata%5B%3F%28target.type%3Dtopic_sticky_module%29%5D.target.data%5B%3F%28target.type%3Darticle%29%5D.target.content%2Cvoteup_count%2Ccomment_count%2Cvoting%2Cauthor.badge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3Bdata%5B%3F%28target.type%3Dtopic_sticky_module%29%5D.target.data%5B%3F%28target.type%3Dpeople%29%5D.target.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3Bdata%5B%3F%28target.type%3Danswer%29%5D.target.annotation_detail%2Ccontent%2Chermes_label%2Cis_labeled%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%3Bdata%5B%3F%28target.type%3Danswer%29%5D.target.author.badge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3Bdata%5B%3F%28target.type%3Darticle%29%5D.target.annotation_detail%2Ccontent%2Chermes_label%2Cis_labeled%2Cauthor.badge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3Bdata%5B%3F%28target.type%3Dquestion%29%5D.target.annotation_detail%2Ccomment_count%3B&limit=10&offset=0'
        # url2 = '/feeds/top_question?include=data%5B%3F%28target.type%3Dtopic_sticky_module%29%5D.target.data%5B%3F%28target.type%3Danswer%29%5D.target.content%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%3Bdata%5B%3F%28target.type%3Dtopic_sticky_module%29%5D.target.data%5B%3F%28target.type%3Danswer%29%5D.target.is_normal%2Ccomment_count%2Cvoteup_count%2Ccontent%2Crelevant_info%2Cexcerpt.author.badge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3Bdata%5B%3F%28target.type%3Dtopic_sticky_module%29%5D.target.data%5B%3F%28target.type%3Darticle%29%5D.target.content%2Cvoteup_count%2Ccomment_count%2Cvoting%2Cauthor.badge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3Bdata%5B%3F%28target.type%3Dtopic_sticky_module%29%5D.target.data%5B%3F%28target.type%3Dpeople%29%5D.target.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3Bdata%5B%3F%28target.type%3Danswer%29%5D.target.annotation_detail%2Ccontent%2Chermes_label%2Cis_labeled%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%3Bdata%5B%3F%28target.type%3Danswer%29%5D.target.author.badge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3Bdata%5B%3F%28target.type%3Darticle%29%5D.target.annotation_detail%2Ccontent%2Chermes_label%2Cis_labeled%2Cauthor.badge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3Bdata%5B%3F%28target.type%3Dquestion%29%5D.target.annotation_detail%2Ccomment_count%3B&limit=10&offset=10'
        # url2 = '/feeds/top_activity?include=data%5B%3F%28target.type%3Dtopic_sticky_module%29%5D.target.data%5B%3F%28target.type%3Danswer%29%5D.target.content%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%3Bdata%5B%3F%28target.type%3Dtopic_sticky_module%29%5D.target.data%5B%3F%28target.type%3Danswer%29%5D.target.is_normal%2Ccomment_count%2Cvoteup_count%2Ccontent%2Crelevant_info%2Cexcerpt.author.badge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3Bdata%5B%3F%28target.type%3Dtopic_sticky_module%29%5D.target.data%5B%3F%28target.type%3Darticle%29%5D.target.content%2Cvoteup_count%2Ccomment_count%2Cvoting%2Cauthor.badge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3Bdata%5B%3F%28target.type%3Dtopic_sticky_module%29%5D.target.data%5B%3F%28target.type%3Dpeople%29%5D.target.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3Bdata%5B%3F%28target.type%3Danswer%29%5D.target.annotation_detail%2Ccontent%2Chermes_label%2Cis_labeled%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%3Bdata%5B%3F%28target.type%3Danswer%29%5D.target.author.badge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3Bdata%5B%3F%28target.type%3Darticle%29%5D.target.annotation_detail%2Ccontent%2Chermes_label%2Cis_labeled%2Cauthor.badge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3Bdata%5B%3F%28target.type%3Dquestion%29%5D.target.annotation_detail%2Ccomment_count%3B&limit=5'
        url = url1 + id + url2
        yield Request(
            url=url,
            callback=self.parseQuestions,
            meta=date,
            dont_filter=True,
        )
    if not len(msg_info) < 20:
        yield FormRequest("https://www.zhihu.com/node/TopicsPlazzaListV2",
                          callback=self.parse_topic,
                          dont_filter=True,
                          meta={
                              "offset": offset,
                              "topic_id": topic_id,
                              "name": topic_name
                          },
                          formdata={
                              "method": "next",
                              "_xsrf": "anaUqgXhz0GbjNTjnykooNIwJJuQz0CY",
                              "params": json.dumps({
                                  "topic_id": topic_id,
                                  "offset": offset,
                                  "hash_id": "5d6d053d9cca5b5d463f76e7f866080a"
                              })
                          })
def start_requests(self):
    for i in range(1, self.endPageNum):
        form_data = {
            "VENUS_PAGE_NO_KEY_INPUT": str(i),
            "VENUS_PAGE_NO_KEY": str(i),
            # "VENUS_PAGE_COUNT_KEY": "2633",
            "VENUS_PAGE_SIZE_KEY": "15",
        }
        request = FormRequest(self.tmpl_url, callback=self.parse_page,
                              formdata=form_data)
        yield request
def get_currency(self, response):
    verification_token = response.xpath(
        '//input[@name="__RequestVerificationToken"]/@value').extract()[0]
    yield FormRequest('http://www.wiggle.fr/internationaloptions/update',
                      formdata={'__RequestVerificationToken': verification_token,
                                'langId': self._lang_form_lang_id,
                                'currencyId': self._lang_form_currencyID,
                                'countryId': self._lang_form_countryID,
                                'action': 'Update',
                                'returnUrl': '/',
                                'cancelUrl': '/'},
                      dont_filter=True,
                      callback=self.init_requests)
def parse(self, response):
    if self.sort_mode and not response.meta.get('sort_forced', False):
        formdata = {"ctl00$content$ddlProductListSort": self.sort_mode,
                    "__EVENTTARGET": "ctl00$content$ddlProductListSort"}
        self._post_set_viewstate(formdata, response)
        meta = response.meta.copy()
        meta['sort_forced'] = True
        yield FormRequest(response.url, formdata=formdata, meta=meta,
                          dont_filter=True)
    else:
        for item in super(BhinnekaProductsSpider, self).parse(response):
            yield item
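# A minimal sketch of the _post_set_viewstate() helper the snippet above relies
# on; the real definition is not shown. The assumption is that it copies
# ASP.NET's hidden state fields from the current page into the form data so the
# postback is accepted.
def _post_set_viewstate(self, formdata, response):
    for field in ('__VIEWSTATE', '__VIEWSTATEGENERATOR', '__EVENTVALIDATION'):
        value = response.xpath('//input[@name="%s"]/@value' % field).extract_first()
        if value:
            formdata[field] = value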
def start_requests(self): """Simulate login action by sending http post request to get the cookies. Args: Yields: scrapy.FormRequest: """ yield FormRequest(url=self.start_urls[0], formdata={'autpor': '57'}, callback=self.parse_main_page)
def umatk(self, response):
    jdata = json.loads(response.body.decode())
    if jdata['result_code'] == 0:
        print('umatk SUCCESS')
        formdata = {
            'tk': jdata['newapptk']
        }
        url = 'https://kyfw.12306.cn/otn/uamauthclient'
        print('uamtkclient START')
        return FormRequest(url, formdata=formdata,
                           meta={'cookiejar': self.cookiejar},
                           callback=self.umatkauthclient)
    else:
        print('umatk FAILED, please retry')