def _request_number_of_pages(self, date_str):
    url = self.base_url + '/consulta_paginarBusquedaVisitas'
    request = FormRequest(
        url=url,
        meta={'date': date_str},
        formdata={
            'fechaDesde': date_str,
            'fechaHasta': date_str,
            'paginaActual': '1',
            'visita.visitanteNombres': '',
            'visita.personalNombre': '',
            'visita.oficinaNombre': '',
            'visita.sedeId': '00',
            'visita.ano': '',
            'visita.mes': '',
            'visita.fechaIngreso': '',
            'paginaNueva': '0',
            'visita.visitanteId': '0',
            'visita.personalId': '0',
            'visita.oficinaId': '0',
        },
        dont_filter=True,
        callback=self.parse_initial_request)
    request.meta['date'] = date_str
    return request
def _parse_list(self, response):
    report_list = response.xpath("//div[@class=\"reportlist bluelink\"]/ul//a/@href")
    for report_url in report_list:
        request = FormRequest(urljoin(self.base_url, report_url.extract()),
                              callback=self.parse_item,
                              dont_filter=False)
        request.meta["large_category_name"] = response.meta["large_category_name"]
        request.meta["mid_category_name"] = response.meta["mid_category_name"]
        request.meta["report_type"] = response.meta["report_type"]
        yield request
def parse(self, response): large_categories = response.xpath(".//*[@class='rptmap']//strong//a") for large_category in large_categories: large_category_name = clean_text(large_category.xpath("./text()").extract()[0].strip()) page_url = large_category.xpath("./@href").extract()[0] url = urljoin(self.base_url, page_url) request = FormRequest(url, callback=self.parse_middle_category, dont_filter=True) request.meta["large_category"] = large_category_name yield request
def parse_middle_category(self, response):
    report_types = response.xpath(u"//li[contains(text(),'报告')]")
    for report_type in report_types:
        mid_category_url = urljoin(
            self.base_url,
            report_type.xpath(u"./preceding-sibling::span[1]/a/@href").extract()[0])
        request = FormRequest(mid_category_url, callback=self.parse_page, dont_filter=True)
        request.meta["large_category_name"] = response.meta["large_category_name"]
        request.meta["mid_category_name"] = response.meta["mid_category_name"]
        request.meta["report_type"] = clean_text(report_type.xpath("./text()").extract()[0].strip())
        request.meta["page_base_url"] = mid_category_url
        yield request
def parse_middle_category(self, response):
    mid_categories = response.xpath(".//*[@class='report2']//h2//a")
    for mid_category in mid_categories:
        mid_category_name = clean_text(mid_category.xpath("./text()").extract()[0].strip())
        page_url = mid_category.xpath("./@href").extract()[0]
        url = urljoin(self.base_url, page_url)
        request = FormRequest(url, callback=self._parse_item, dont_filter=True)
        request.meta["large_category"] = response.meta["large_category"]
        request.meta["mid_category"] = mid_category_name
        request.meta["first_url"] = url
        yield request
def parse(self, response): large_categories = response.xpath("//*[@class='tabContent bluelink']//*[contains(@style, 'padding')]/a") for large_category in large_categories: large_category_name = clean_text(large_category.xpath(".//text()").extract()[0].strip()) mid_categorys = large_category.xpath("./parent::*/following-sibling::*[1]/a") for mid_category in mid_categorys: mid_category_name = clean_text(mid_category.xpath("./text()").extract()[0]) mid_category_url = urljoin(self.base_url, mid_category.xpath("./@href").extract()[0]) request = FormRequest(mid_category_url, callback=self.parse_middle_category, dont_filter=True) request.meta["large_category_name"] = large_category_name request.meta["mid_category_name"] = mid_category_name yield request
def _parse_page_free(self, response):
    total_pages = int(clean_text(response.xpath(".//*[@class='pages']//a//text()").extract()[-2].strip()))
    first_url = response.meta["first_url"]
    request = FormRequest(first_url, callback=self._parse_free, dont_filter=True)
    request.meta["large_category"] = response.meta["large_category"]
    yield request
    if total_pages > 1:
        for i in range(1, total_pages):
            # Follow-up pages reuse the first page's URL with a '-p<n>.html' suffix.
            next_page = first_url[:-5] + '-p' + str(i + 1) + '.html'
            request = FormRequest(next_page, callback=self._parse_free, dont_filter=True)
            request.meta["large_category"] = response.meta["large_category"]
            yield request
def parse(self, response): large_categories = response.xpath(".//*[@class='shopleft_bt']//a") middle_categories = response.xpath(".//*[@class='shopnav2']") for i in xrange(len(large_categories)): large_category_name = clean_text(large_categories[i].xpath("./text()").extract()[0].strip()) middle_category_list = middle_categories[i].xpath(".//*[@class='shopleft_wt']") for middle_category in middle_category_list: middle_category_name = clean_text(middle_category.xpath(".//a/text()").extract()) page_url = middle_category.xpath(".//a//@href").extract()[0] url = urljoin(self.base_url, page_url) request = FormRequest(url, callback=self._parse_item, dont_filter=True) request.meta["large_category"] = large_category_name request.meta["mid_category"] = middle_category_name yield request
def _request_next_page(self, response, date_str, callback):
    current_page = int(response.meta['current_page'])
    total_string = response.css('#LblTotal').xpath('./text()').extract_first(default='')
    total = re.search(r'(\d+)', total_string)
    if total:
        # Deal with the next page.
        total = total.group(1)
        number_of_pages = self._get_number_of_pages(int(total))
        if current_page < number_of_pages:
            current_page += 1
            formdata = {
                'TxtFecha': date_str,
                'BtnBuscar': 'Buscar',
                'LwVisitasCR$DpVisitasCR$ctl02$ctl00.x': '1',
                'LwVisitasCR$DpVisitasCR$ctl02$ctl00.y': '1',
            }
            request = FormRequest.from_response(response,
                                                formdata=formdata,
                                                dont_click=True,
                                                dont_filter=True,
                                                callback=callback)
            request.meta['date'] = date_str
            request.meta['current_page'] = current_page
            return request
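# The _get_number_of_pages() helper used above is not part of this snippet.
# A minimal sketch, assuming the listing shows a fixed number of records per
# page (the page size below is a placeholder, not taken from the source):
import math

def _get_number_of_pages(self, total_records, records_per_page=10):
    """Return how many result pages are needed to cover total_records."""
    return int(math.ceil(total_records / float(records_per_page)))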
def parse(self, response): """ 这是默认的回调方法,得到response后: 1. 如果需要登录,则先通过FormRequest登录论坛; 2. 如果不需要登录,通过Request继续请求; :param response: :return: """ # 需要登录,使用FormRequest.from_response模拟登录 if 'id="lsform"' in response.body: logging.info('in parse, need to login, url: {0}'.format(response.url)) form_data = {'handlekey': 'ls', 'quickforward': 'yes', 'username': '******', 'password': '******'} request = FormRequest.from_response(response=response, headers=self.headers, formxpath='//form[contains(@id, "lsform")]', formdata=form_data, callback=self.parse_list ) else: logging.info('in parse, NOT need to login, url: {0}'.format(response.url)) request = Request(url=response.url, headers=self.headers, callback=self.parse_list, ) yield request
def parse(self, response):
    # test_urls = [
    #     "http://ntiaoji.kaoyan.com/tjadm/1.html",
    #     "http://ntiaoji.kaoyan.com/tjadm/2.html",
    #     "http://ntiaoji.kaoyan.com/tjadm/3.html",
    #     "http://ntiaoji.kaoyan.com/tjadm/4.html",
    #     "http://ntiaoji.kaoyan.com/tjadm/5.html",
    #     "http://ntiaoji.kaoyan.com/tjadm/6.html",
    #     "http://ntiaoji.kaoyan.com/tjadm/7.html"
    # ]
    #
    # for url in test_urls:
    #     print url
    #     time.sleep(2)
    #     self.headers['Referer'] = url
    #     yield FormRequest.from_response(response,
    #                                     headers=self.headers,
    #                                     formdata={
    #                                         'username': 'kytj1',
    #                                         'password': '6ujBJ4XQyLeGmJmB'
    #                                     },
    #                                     callback=self.download_page,
    #                                     dont_filter=True)
    return FormRequest.from_response(response,
                                     headers=self.headers,
                                     formdata={
                                         'username': 'kytj1',
                                         'password': '6ujBJ4XQyLeGmJmB'
                                     },
                                     callback=self.after_login,
                                     dont_filter=True)
def parse_page(self, response):
    request_list = self._parse_list(response)
    for r in request_list:
        yield r
    next_page = response.xpath(u"//*[@id='AspNetPager1']/a[text()=\"下一页\"]/@href")
    if len(next_page) > 0:
        next_page_url = urljoin(self.base_url, next_page.extract()[0])
        if not next_page_url.startswith(response.meta["page_base_url"]):
            if next_page_url.endswith("html"):
                next_page_url = (response.meta["page_base_url"]
                                 + next_page_url[next_page_url.rindex("/") + 1:len(next_page_url)])
        request = FormRequest(next_page_url, callback=self.parse_page, dont_filter=True)
        request.meta["large_category_name"] = response.meta["large_category_name"]
        request.meta["mid_category_name"] = response.meta["mid_category_name"]
        request.meta["report_type"] = response.meta["report_type"]
        request.meta["page_base_url"] = response.meta["page_base_url"]
        yield request
def parse(self, response):
    form_data = {'username': '******', 'password': '******', 'remember_me': '1'}
    return FormRequest.from_response(response,
                                     headers=self.headers,
                                     formxpath='//form[@class="form-login"]',
                                     formdata=form_data,
                                     callback=self.after_login)
def parse(self, response):
    ## page_count_text {string}
    # @example
    #   if(ANP_checkInput('AspNetPager1_input',3270,'页索引超出范围!','页索引不是有效的数值!'))
    #   {ANP_goToPage('AspNetPager1_input','page','http://www.bgku.cn/sitemap_1',
    #   'http://www.bgku.cn/sitemap_{page}','',3270,false);};return false;
    ##
    page_count_text = response.xpath('//*[@id="AspNetPager1_btn"]/@onclick').extract()[0]
    match = re.search(r',\d{4,},', page_count_text)
    page_count = int(match.group(0).strip(','))
    for page in range(1, page_count + 1):
        url = 'http://www.bgku.cn/sitemap_' + str(page)
        request = FormRequest(url, callback=self.parse_index_page, dont_filter=True)
        request.meta["page"] = page
        yield request
def parse(self, response):
    yield FormRequest.from_response(
        response,
        formname='aspnetForm',
        formdata={'Skin$body$FundingSourceChoices$0': '1',
                  'Skin$body$FundingSourceChoices$1': '0'},
        meta={'curr_listing_page': 1, 'flag': False},
        callback=self.after_login)
def parse(self, response): url = response.url if "research" in url: categories = response.xpath(".//*[@class='catec']") for i in xrange(len(categories)-1): large_categories = categories[i].xpath(".//*[@class='fl']") large_category_name = clean_text(large_categories.xpath(".//text()").extract()[0].strip()) mid_categories = categories[i].xpath(".//span") for mid_category in mid_categories: mid_category_name = clean_text(mid_category.xpath(".//text()").extract()[0].strip()) page_url = mid_category.xpath(".//@href").extract()[0] request = FormRequest(page_url, callback=self._parse_page_research, dont_filter=True) request.meta["large_category"] = large_category_name request.meta["mid_category"] = mid_category_name request.meta["first_url"] = page_url yield request elif "free" in url: large_categories = response.xpath(".//*[@class='tul2']//h2//a") for i in xrange(len(large_categories)): large_category_name = clean_text(large_categories[i].xpath(".//text()").extract()[0].strip()) page_url = large_categories[i].xpath("./@href").extract()[0] request = FormRequest(page_url, callback=self._parse_page_free, dont_filter=True) request.meta["large_category"] = large_category_name request.meta["first_url"] = page_url yield request
def parse(self, response):
    return FormRequest.from_response(response,
                                     headers=self.headers,
                                     formdata={
                                         'username': 'kytj1',
                                         'password': '6ujBJ4XQyLeGmJmB'
                                     },
                                     callback=self.after_login,
                                     dont_filter=True)
def parse(self, response):
    login_form = {
        'login': self.username,
        'password': self.password,
    }
    return FormRequest.from_response(
        response,
        formdata=login_form,
        callback=self.after_login
    )
def parse(self, response):
    yield FormRequest.from_response(response,
                                    formdata={
                                        'tanggal': '20160817#Rabu, 17 Agustus 2016',
                                        'origination': 'KAC#KIARACONDONG',
                                        'destination': 'MN#MADIUN',
                                        'adult': '1',
                                        'infant': '0'
                                    },
                                    callback=self.parseInfo)
def login(self, response):
    # login = requests.post(response.url,
    #                       headers=self.headers,
    #                       data={
    #                           'source': 'None',
    #                           'redir': 'https://www.douban.com/people/60012975/',
    #                           'form_email': '*****@*****.**',
    #                           'form_password': '******',
    #                           # 'remember': 'on',
    #                           'login': u'登录'
    #                       })
    hxs = Selector(response)
    if hxs.xpath('//*[@name="captcha-id"]/@value').extract():
        captchaID = hxs.xpath('//*[@name="captcha-id"]/@value').extract()[0]
        captchAdd = hxs.xpath('//*[@id="captcha_image"]/@src').extract()[0]
        urllib.urlretrieve(captchAdd, 'captcha.jpg')
        captch = raw_input('please input the captcha:')
        yield FormRequest.from_response(response,
                                        meta=response.meta,
                                        # headers=self.headers,
                                        formdata={'source': 'None',
                                                  'redir': 'https://www.douban.com/people/unlucky_strike/',
                                                  'form_email': '*****@*****.**',
                                                  'form_password': '******',
                                                  'captcha-solution': captch,
                                                  'captcha-id': captchaID,
                                                  'remember': 'on',
                                                  'login': u'登录'},
                                        callback=self.parse)
    else:
        yield FormRequest.from_response(response,
                                        meta={'cookiejar': response.meta['cookiejar']},
                                        # headers=self.headers,
                                        formdata={'source': 'None',
                                                  'redir': 'https://www.douban.com/people/unlucky_strike/',
                                                  'form_email': '*****@*****.**',
                                                  'form_password': '******',
                                                  'remember': 'on',
                                                  'login': u'登录'},
                                        callback=self.parse)
def parse(self, response):
    '''Parse login page'''
    return FormRequest.from_response(
        response,
        formxpath='//form[contains(@action, "login")]',
        formdata={
            'email': self.username,
            'pass': self.password,
        },
        callback=self.parse_home,
    )
def _get_page_request(self, response, page, date): request = FormRequest.from_response( response, formdata={"txtDesde": date, "__EVENTTARGET": "gvwConsulta", "__EVENTARGUMENT": "Page${}".format(page)}, dont_filter=True, callback=self.parse, ) request.meta["date"] = date return request
def currency_form(self, response):
    """ Currency form viewed and change to USD posted. """
    self.log('currency_form', level=logging.INFO)
    formdata = {
        'ddlCountry1': 'United States',
        'ddlCurrency': '503329C6-40CB-47E6-91D1-9F11AF63F706'
    }
    return FormRequest.from_response(response, formdata=formdata, callback=self.currency_changed)
def __engine_codes_request(self, series):
    url = 'https://' + self.allowed_domains[0] + '/v2/engine_code/selection'
    request = FormRequest(
        url=url,
        formdata={
            'manufacturer': series['link'],
            'body': series['model_family_id'],
            'litres': series['litres'],
            'fuel': series['fuel'],
            'freetext': series['text'],
            'vehicletype': series['vehicletype'],
            'module': 'RT'
        },
        callback=self.parse_engine_codes,
        meta={
            'series': series,
        },
        dont_filter=True
    )
    self.__prepare_request(request)
    request.method = 'POST'
    return request
def parse_home(self, response):
    '''Parse user news feed page'''
    if response.css('#approvals_code'):
        # Handle 'Approvals Code' checkpoint (ask user to enter code).
        if not self.code:
            # Show facebook messages via logs
            # and request user for approval code.
            message = response.css('._50f4::text').extract()[0]
            self.log(process_string(message))
            message = response.css('._3-8y._50f4').xpath('string()').extract()[0]
            self.log(process_string(message))
            self.code = input('Enter the code: ')
        self.code = str(self.code)
        if not (self.code and self.code.isdigit()):
            self.log('Bad approvals code detected.')
            return
        return FormRequest.from_response(
            response,
            formdata={'approvals_code': self.code},
            callback=self.parse_home,
        )
    elif response.css('input#u_0_1'):
        # Handle 'Save Browser' checkpoint.
        return FormRequest.from_response(
            response,
            formdata={'name_action_selected': 'dont_save'},
            callback=self.parse_home,
            dont_filter=True,
        )
    elif response.css('button#checkpointSubmitButton'):
        # Handle `Someone tried to log into your account` warning.
        return FormRequest.from_response(
            response,
            callback=self.parse_home,
            dont_filter=True,
        )
    # Else go to the user profile.
    href = response.css('a[title="Profile"]::attr(href)').extract()[0]
    return Request(
        response.urljoin(href),
        callback=self.parse_profile,
    )
def parse_start_url(self, response):
    sel = Selector(response)
    passwd = sel.xpath(r'/html/body/div[2]/form/div/input[2]/@name').extract_first()
    captchaUrl = sel.xpath(r'/html/body/div[2]/form/div/img[1]/@src').extract_first()
    code = requests.get(captchaUrl)
    with open('/home/shichangtai/code.gif', 'wb') as f:
        f.write(code.content)
    captcha = raw_input('please enter the captcha: ')
    # The meta on this first request creates the cookiejar; every later request
    # will carry this cookiejar along.
    return [FormRequest.from_response(response=response,
                                      formdata={'mobile': self.account, passwd: self.password, 'code': captcha},
                                      meta={'cookiejar': 1},  # do not set 'dont_merge_cookies' to True
                                      callback=self.after_log)]
def _get_page_request(self, response, page, date):
    request = FormRequest.from_response(
        response,
        formdata={
            'txtDesde': date,
            '__EVENTTARGET': 'gvwConsulta',
            '__EVENTARGUMENT': 'Page${}'.format(page),
        },
        dont_filter=True,
        callback=self.parse,
    )
    request.meta['date'] = date
    return request
def parse_initial_request(self, response):
    date = response.meta["date"]
    request = FormRequest.from_response(
        response,
        formdata={"txtDesde": date, "btnBuscar.x": "1", "btnBuscar.y": "1"},
        dont_filter=True,
        callback=self.parse_page,
    )
    request.meta["date"] = date
    yield request
def parse_category(self, response):
    items = response.xpath('//div[@class="datagrid"]//tr')
    for item in items:
        product = item.xpath('td//font/b/span[contains(@id, "main_GDVMain_lblProductName")]/text()').extract()
        # price = item.xpath('td//font/b/span[contains(@id, "main_GDVMain_lblHarga")]/text()').extract()
        # link =
        if len(product) > 0:
            print(product)
        pages = item.xpath('td[@colspan="3"]//a/@href').re(r"doPostBack\(([^)]+')")
        if len(pages) > 0:
            for page in pages:
                # Each match is assumed to look like "'<target>','<argument>'";
                # split it into the __EVENTTARGET / __EVENTARGUMENT pair.
                eventtarget, eventargument = [p.strip("'") for p in page.split(",")]
                yield FormRequest.from_response(response,
                                                formdata={'__EVENTTARGET': eventtarget,
                                                          '__EVENTARGUMENT': eventargument},
                                                callback=self.parse_items,
                                                dont_click=True)
def parse(self, response): """ Overwrites Spiders parse method. Fill in log in details in log in form and submit. :return: """ print('custom settings:') print(self._settings) return FormRequest.from_response( response, formxpath='//div[contains(concat(" ", normalize-space(@class), " "), " main-container ")]/descendant::form', formdata={'EmailOrUsername': self._settings['username'], 'Password': self._settings['password']}, callback=self.go_to_search_site )
def parse(self, response):
    yield FormRequest.from_response(
        response,
        formid='register',
        formdata={
            'username': '******',
            'password': '******',
            'redirect': '/',
            'debug_token': ' ',
            'login': '******',
            'user-agent': response.request.headers.get('User-Agent').decode('utf-8')
        },
        callback=self.after_login
    )
def start_requests(self): """The City Hall website publish the gazettes in a page with a form that allow users to browse through different years and months. This form sends requests via POST, so this method emulates a series of these POSTs. @url http://www.pmf.sc.gov.br/governo/index.php?pagina=govdiariooficial @returns requests 1 """ target = date.today() while target >= self.AVAILABLE_FROM: year, month = str(target.year), str(target.month) data = dict(ano=year, mes=month, passo='1', enviar='') yield FormRequest(url=self.URL, formdata=data, callback=self.parse) target = target + relativedelta(months=1)
def parse_pre_login(self, response):
    authenticity_token = response.xpath(
        '//input[@name="authenticity_token"]/@value').extract_first()
    yield FormRequest(
        url='http://www.qiyaoinvest.com/pc/login/submit_user',
        formdata={
            'utf8': '✓',
            'authenticity_token': authenticity_token,
            'login_name': '13916427906',
            'password': '******',
            'auto_login': '******',
            'login': '******'
        },
        callback=self.parse_login)
def login(self, response):
    # Parse all fields of the form.
    post_data = parse_form(response.text)
    post_data['email'] = '*****@*****.**'
    post_data['password'] = '******'
    # Submit the form to log in; equivalent to POSTing the login URL.
    return [
        FormRequest(
            'http://example.webscraping.com/places/default/user/login',
            formdata=post_data,
            headers=headers,
            callback=self.after_login)
    ]
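# parse_form() above is a project helper that is not shown in this listing.
# A minimal sketch, assuming it simply collects every <input> name/value pair
# (including hidden CSRF fields) from the page using lxml:
import lxml.html

def parse_form(html):
    """Return a dict mapping input names to values for the form fields on the page."""
    tree = lxml.html.fromstring(html)
    data = {}
    for e in tree.xpath('//form//input'):
        if e.get('name'):
            data[e.get('name')] = e.get('value') or ''
    return data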
def make_request(self, response):
    form_data = {}
    for i in response.xpath('//form//input'):
        name = i.xpath('./@name').extract_first()
        value = i.xpath('./@value').extract_first() or ''
        form_data[name] = value
    month_before = (datetime.now() - timedelta(days=30)).strftime('%m/%d/%Y')
    form_data['txtFromDate'] = month_before
    yield FormRequest(url=response.request.url,
                      formdata=form_data,
                      callback=self.parse_retraction_data)
def login(self, response):
    xsrf = response.xpath('.//input[@name="_xsrf"]/@value').extract_first()
    return FormRequest('https://www.zhihu.com/login/phone_num',
                       callback=self.after_login,
                       meta={
                           'cookiejar': response.meta['cookiejar'],
                           'xsrf': xsrf
                       },
                       formdata={
                           '_xsrf': xsrf,
                           'phone_num': get_project_settings().get('USER'),
                           'password': get_project_settings().get('PWD'),
                           'captcha_type': 'cn'
                       })
def log_in(self, response):
    cms = response.xpath(
        '//input[@name="__cmsform__"]/@value').extract_first()
    yield FormRequest(url='http://www.dapufund.com/Public/checkLogins',
                      formdata={
                          'name': '13916427906',
                          'pass': '******',
                          '__cmsform__': cms
                      },
                      meta={
                          'dont_redirect': True,
                          'handle_httpstatus_list': [302, 301]
                      },
                      callback=self.start_requests)
def start_requests(self):
    # start_urls = ['https://www.luxuryhomemarketing.com/real-estate-agents/advanced_search.html/']
    area_codes = ['AK', 'AL']
    for area_code in area_codes:
        yield FormRequest(
            url='https://www.luxuryhomemarketing.com/real-estate-agents/advanced_search.html/',
            formdata={
                'Country': 'US/CA',
                'State_prov': area_code
            },
            callback=self.parse,
            meta={"area_code": area_code},
            dont_filter=True)
def login_to_library(self, response):
    start_urls = response.meta['start_urls']
    login_form = response.xpath(self.login_form_xpath)
    if login_form:
        credentials = {
            'user': self.settings['CREDENTIALS']['user'],
            'pass': self.settings['CREDENTIALS']['pass'],
        }
        # open_in_browser(response)
        yield FormRequest.from_response(response,
                                        formxpath=self.login_form_xpath,
                                        formdata=credentials,
                                        meta={'start_urls': start_urls},
                                        dont_filter=True,
                                        callback=self.parse_journals)
    else:
        # If no login required
        # open_in_browser(response)
        yield scrapy.Request(response.url,
                             dont_filter=True,
                             meta={'start_urls': start_urls},
                             callback=self.parse_journals)
def start_requests(self):
    for i in range(1, self.pageIndex):
        form_data = {
            'page': str(i),
            'rows': '10',
        }
        request = FormRequest(self.tmpl_url,
                              callback=self.parse_page,
                              formdata=form_data,
                              dont_filter=True,
                              meta={
                                  "dynamic": True,
                              })
        yield request
def start_requests(self):
    for date in date_range(self.start_date, self.end_date):
        data = {
            'diaAtual': '{:02d}'.format(date.day),
            'mesAtual': '{:02d}'.format(date.month),
            'anoAtual': str(date.year),
        }
        yield FormRequest(
            method='POST',
            url=RECORDS_BY_DATE_URL,
            formdata=data,
            meta={'date': date},
            callback=self.parse_record_list,
        )
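# date_range() used above is not included in the snippet. A minimal sketch,
# assuming it yields one datetime.date per day from start to end inclusive:
from datetime import timedelta

def date_range(start, end):
    """Yield every date from start to end, inclusive."""
    current = start
    while current <= end:
        yield current
        current += timedelta(days=1)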
def parse(self, response): """解析省级地名""" try: data = json.loads(response.body) areaName_list = [item['Key'] for item in data[3]['Child']] for areaName in areaName_list: formdata = {'Param': '全文检索:诈骗,法院地域:%s' % (areaName)} yield FormRequest(url=self.court_url, formdata=formdata, meta={'areaName': areaName}, dont_filter=True, callback=self.parse_second) except: print '获取省级域名内容失败,请检查!'
def parse(self, response):
    _corp_id = response.meta['corp_id']
    if _corp_id != '00000000':
        # Parse the data.
        try:
            ret_data = json.loads(response.body_as_unicode())
        except Exception as e:
            print(f'{_corp_id}')
            print(e)
        else:
            corp_data = ret_data.get('Data')
            # Adjust the item definition.
            item = BussinessriskItem()
            item['Data'] = corp_data
            item['corp_id'] = _corp_id
            item['corp_str'] = response.meta['corp_str']
            yield item
    corp_strs = self.rds.srandmember(self.redis_key, 2)  # deleted in the pipeline
    # Issue the follow-up requests.
    if corp_strs is not None:
        for corp_str in corp_strs:
            corp_dict = json.loads(corp_str)
            corp_id = corp_dict.get('corp_id')
            corp_old_id = corp_dict.get('corp_old_id')
            data = {}
            yield FormRequest(url=self.post_url,
                              headers=self.headers,
                              formdata=data,
                              callback=self.parse,
                              meta={
                                  'corp_id': corp_id,
                                  'corp_str': corp_str
                              })
    else:
        print('queue empty, exit')
        return
def pre_parse(self, response):
    # Collect the parameters required for login.
    form = response.xpath("//form[@action='/session']")  # the login form
    keys = ['utf8', 'authenticity_token', 'commit']  # keys to read from the form
    values = []
    for key in keys:
        # Read the value of each key from the page.
        xpath = "//form[@action='/session']//input[@name='%s']/@value" % key
        value = response.xpath(xpath).extract()[0]
        values.append(value)
    postdata = dict(zip(keys, values))  # turn the pairs into a dict
    postdata['login'] = '******'
    postdata['password'] = '******'
    # POST the login form to GitHub.
    yield FormRequest(self.loginurls, formdata=postdata, callback=self.login_parse)
def start_requests(self):
    # self.start
    # self.end
    self.rnd = int(self.start)
    yield FormRequest(
        url='https://dhlottery.co.kr/gameResult.do?method=byWin',
        method='POST',
        formdata={
            'drwNo': str(self.rnd),
            'dwrNoList': str(self.rnd)
        },
        callback=self.parse)
def start_requests(self):
    yield FormRequest(url='http://www.jdoor.cn/api/customerlogin/',
                      method='POST',
                      body=json.dumps({
                          "phone": "13916427906",
                          "verifycode": self.verifycode
                      }),
                      headers={
                          'org': '1819b573-3e70-4adc-95f1-d8a2b8a09787',
                          'Content-Type': 'application/json',
                          'Pragma': 'no-cache',
                          'Accept': '*/*'
                      },
                      callback=self.parse_login)
def start_requests(self):
    page = 0
    post_group_data = dict(QType='queryv3', Data=self.group_data % page)
    post_free_data = dict(QType='queryv3', Data=self.free_data % page)
    post_ticket_data = dict(QType='queryv3', Data=self.ticket_data % page)
    post_oneday_data = dict(QType='queryv3', Data=self.oneday_data % page)
    post_cruise_data = dict(QType='queryv3', Data=self.cruise_data % page)
    # Request package tours.
    yield FormRequest(self.post_url, formdata=post_group_data, callback=self.parse_group, meta={'page': page})
    # Request independent travel.
    yield FormRequest(self.post_url, formdata=post_free_data, callback=self.parse_free, meta={'page': page})
    # Request tickets.
    yield FormRequest(self.post_url, formdata=post_ticket_data, callback=self.parse_ticket, meta={'page': page})
    # Request one-day tours.
    yield FormRequest(self.post_url, formdata=post_oneday_data, callback=self.parse_oneday, meta={'page': page})
    # Request cruise tours.
    yield FormRequest(self.post_url, formdata=post_cruise_data, callback=self.parse_cruise, meta={'page': page})
def start_requests(self):
    base_url = 'http://httpbin.org'
    module = 'user'  # sheet name
    file_name = cs.FILE_NAME  # test-case file name
    # eggfile = os.path.join(os.path.dirname(os.path.dirname(os.path.realpath(__file__))), 'case', file_name)
    # Even with the path, files inside the egg cannot be read directly.
    eggpath = os.path.dirname(
        os.path.dirname(os.path.dirname(
            os.path.realpath(__file__))))  # location of the temporary egg file
    zipf = zipfile.ZipFile(eggpath)
    # Copy the test-case xlsx out of the egg to the root directory (the path is copied along with it).
    zipf.extract('xlstest/case/' + file_name, '/')
    path = '/xlstest/case/' + file_name  # path of the extracted test-case xlsx
    # Read the xlsx file.
    excel.open_excel(path)
    sheet = excel.get_sheet(module)
    rows = excel.get_rows(sheet)
    # Read the request details of each test case row by row.
    for i in range(2, rows):
        testNumber = excel.get_content(sheet, i, cs.CASE_NUMBER)  # case number
        testName = excel.get_content(sheet, i, cs.CASE_NAME)  # case name
        testUrl = excel.get_content(sheet, i, cs.CASE_URL)  # case endpoint
        testUrl = base_url + testUrl
        testMethod = excel.get_content(sheet, i, cs.CASE_METHOD)  # request method
        testMethod = testMethod.strip().upper()  # strip whitespace and upper-case
        testdata = excel.get_content(sheet, i, cs.CASE_DATA)  # case parameters
        testdata = {} if testdata.strip() == '' else eval(testdata)  # empty dict when blank
        testHeaders = eval(excel.get_content(sheet, i, cs.CASE_HEADERS))
        testCode = str(excel.get_content(sheet, i, cs.CASE_CODE))  # expected value
        # Check the request method and enqueue the request.
        if testMethod == 'POST':
            yield FormRequest(testUrl,
                              self.parse,
                              headers=testHeaders,
                              formdata=testdata,
                              meta={
                                  'testCode': testCode,
                                  'testNumber': testNumber
                              })
        else:
            yield Request(testUrl,
                          self.parse,
                          meta={
                              'testCode': testCode,
                              'testNumber': testNumber
                          })
def parse(self, response):
    data = json.loads(response.text)
    feed = data['data']['viewer']['marketplace_feed_stories']
    for e in feed['edges']:
        node = e['node']
        node.pop('tracking')
        node.pop('id')
        listing = node.pop('listing', {})
        listing_name = listing.pop('__typename')
        primary_listing_photo = listing.pop('primary_listing_photo')
        formatted_price = listing.pop('formatted_price')
        location = listing.pop('location')
        custom_sub_titles = listing.pop('custom_sub_titles_with_rendering_flags')
        pre_recorded_videos = listing.pop('pre_recorded_videos')
        delivery_types = listing.pop('delivery_types')
        seller = listing.pop('marketplace_listing_seller')
        story = listing.pop('story')
        node.update(listing)
        node['listing_name'] = listing_name
        image = primary_listing_photo['image']['uri']
        node['image_urls'] = [image] if image else []
        node['price'] = formatted_price['text']
        geo_code = location['reverse_geocode']
        node['location'] = ', '.join([geo_code['city'], geo_code['state']])
        node['custom_sub_titles'] = ', '.join(e['subtitle'] for e in custom_sub_titles)
        node['delivery_types'] = ', '.join(delivery_types)
        node['seller_id'] = seller['id']
        node['seller_type'] = seller['__typename']
        node['seller_name'] = seller['name']
        node['story_url'] = story['url']
        yield node
    if not feed['page_info']['has_next_page']:
        print("No Next Page")
        return
    self.variables['cursor'] = feed['page_info']['end_cursor']
    self.data['variables'] = json.dumps(self.variables)
    yield FormRequest(url=self.graph_url_t, formdata=self.data, meta=response.meta)
def parse_doodle(self, response):
    """Parse other users' doodles on the picture."""
    data = json.loads(response.text)
    if data.get('data') and data.get('data').get('list'):
        doodles = data.get('data').get('list')
        if len(doodles) > 0:
            create_time = doodles[-1].get('create_time')
        else:
            create_time = datetime.now()
        pd_list = []
        for doodle in doodles:
            user = {}
            user['user_avatar'] = doodle.get('avatar')
            user['doodle_time'] = doodle.get('create_time')
            user['doodle_list'] = []
            for d in doodle.get('chartlet_list'):
                d_dict = {}
                d_dict['angle'] = d.get('angle')
                d_dict['center_x'] = d.get('center_x')
                d_dict['center_y'] = d.get('center_y')
                d_dict['zoom'] = d.get('zoom')
                d_dict['picture'] = d.get('picture')
                d_dict['is_turn'] = d.get('is_turn')
                d_dict['rect_upper_left_x'] = d.get('rect_upper_left_x')
                d_dict['rect_upper_left_y'] = d.get('rect_upper_left_y')
                d_dict['rect_width'] = d.get('rect_width')
                d_dict['rect_height'] = d.get('rect_height')
                d_dict['word'] = d.get('word')
                user['doodle_list'].append(d_dict)
            pd_list.append(user)
        meta_d = {
            'id': response.meta.get('id'),
            'tag': response.meta.get('tag'),
            'pet_pic': response.meta.get('pet_pic'),
            'like_num': response.meta.get('like_num'),
            'owner_name': response.meta.get('owner_name'),
            'owner_avatar': response.meta.get('owner_avatar'),
            'city': response.meta.get('city'),
            'v_second': response.meta.get('v_second'),
            'video': response.meta.get('video'),
            'doodle_list': pd_list,
            'create_time': create_time
        }
        yield FormRequest(url=self.ty_comment.format(cid=response.meta.get('id')),
                          meta=meta_d,
                          callback=self.parse_comment)
def follows_info(self, response):
    fb_dtsg = response.meta["fb_dtsg"]
    ajax = response.meta["ajax"]
    data = {
        "m_sess": "",
        "fb_dtsg": fb_dtsg,
        "__dyn": " ",
        "__req": " ",
        "__ajax__": ajax,
        "__user": "******"
    }
    html = copy.deepcopy(response.body.decode())
    next_cursor = re.findall(
        r'm_more_friends.*?href\\\":\\\"\\\\\\(.*?)\"', html)
    if len(next_cursor) != 0:
        real_cursor = self.base_url + re.sub(
            r'\\', '', re.sub(r"\\u0025", "%", next_cursor[0], 2))
        # print(next_cursor)
    r_html = re.sub(
        r'\\', '',
        re.sub(
            r'\\u003C', '<',
            re.findall(r'\"html\":\"(.*?)\",\"replaceifexists\"', html)[0]))
    # print(r_html)
    real_html = lxml.etree.HTML(HTMLParser().unescape(r_html)).xpath(
        "//div[@class='_55wo _55x2']//div[@class='_55wp _4g33 _5pxa']")
    for i in real_html:
        person_info = {}
        home_page = i.xpath("./div[@class='_5s61 _2b4m']/a/@href")
        if len(home_page) == 0:
            home_page = ''
        else:
            home_page = home_page[0]
        name = i.xpath("./div[@class='_5s61 _2b4m']/a/i/@aria-label")[0]
        person_info["name"] = name
        person_info["home_page"] = home_page
        yield person_info
    if len(next_cursor) != 0:
        yield FormRequest(url=real_cursor,
                          formdata=data,
                          callback=self.follows_info,
                          meta={
                              "fb_dtsg": fb_dtsg,
                              "ajax": ajax
                          },
                          headers=self.headers)
def start_requests(self):
    # uids = self.read_uid()
    uids = self.get_nickname()
    for uid in uids:
        result = self.db['users'].find_one({"NickName": uid}, {'_id': 0, 'NickName': 1})
        if not result:
            url = 'https://weibo.cn/search/'
            postdata = {'keyword': uid, 'suser': '******'}
            yield FormRequest(url,
                              formdata=postdata,
                              callback=self.parse_userurl,
                              meta={'uid': uid},
                              priority=22,
                              dont_filter=True)
            self.requestcount += 1
        else:
            uid = self.db['users'].find_one({"NickName": uid}, {'_id': 0, 'Id': 1})
            uid = uid["Id"]
            url = 'https://weibo.cn/{}/info'.format(uid)
            yield Request(url, callback=self.parse_user_info, priority=22, dont_filter=True)
def start_requests(self):
    yield FormRequest(
        url='http://www.ifc-cherami.com/index.php?g=portal&m=user&a=dologin',
        method='POST',
        formdata={
            'login_phone': self.username,
            'login_passwd': self.password,
            'checkbox': 'on'
        },
        headers={
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'
        },
    )
def get_page(self, response):
    for page in range(1, 31):
        form_data = {
            "city": self.city,
            # "kd": self.keyword,
            "kd": u"爬虫",
            "pn": str(page),
            'needAddtionalResult': 'False',
            'isSchoolJob': '0'
        }
        yield FormRequest(url=self.url,
                          formdata=form_data,
                          meta={'cookiejar': response.meta['cookiejar']},
                          headers=self.headers,
                          callback=self.get_job_url)
def post_login(self, response):
    html = pq(response.text)
    server_time = html("input[type='hidden']").eq(1).val()
    login_url = 'http://passport.lagou.com/login/login.html?ts={}'.format(server_time)
    form_data = {
        "isValidate": "true",
        "username": "******",
        "password": "******"
    }
    yield FormRequest(login_url,
                      formdata=form_data,
                      meta={'cookiejar': response.meta['cookiejar']},
                      headers=self.headers,
                      callback=self.get_page)
def get_pages(self, response: scrapy.http.Response) -> scrapy.FormRequest:
    """
    Manages the multiple pages for bigger queries

    :param response: Page
    :type response: scrapy.http.Response
    :return: concrete page FormRequest
    :rtype: scrapy.FormRequest
    """
    # Pagination row
    paginator = response.css(
        "#ctl00_cphMainContent_gvSearchResults tr.gridPager:first-child td table tr td"
    )
    # Multi-pages with 'Last' problem (more than paginator length) /// solved
    pg = 0
    while int(paginator.css("a::text")[-1].extract()) != pg:
        pg += 1
        yield scrapy.FormRequest(
            url=response.url,
            formdata={
                "ctl00_cphMainContent_txtLCSTartDate_dateInput_text": f"1/1/{CURRENT_YEAR}",
                "ctl00_cphMainContent_txtLCEndDate_dateInput_text": f"12/31/{CURRENT_YEAR}",
                "ctl00$cphMainContent$ddlLCDocumentType$vddlDropDown": "101627",
                '__VIEWSTATE': response.css('input#__VIEWSTATE::attr(value)').extract_first(),
                '__EVENTARGUMENT': f"Page${pg}",
                "__EVENTTARGET": "ctl00$cphMainContent$gvSearchResults",
            },
            callback=self.get_rows)
def start_requests(self): base_url = "http://www.natal.rn.gov.br/dom/" initial_date = date(self.start_date.year, self.start_date.month, 1) end_date = date.today() periods_of_interest = [(date.year, date.month) for date in rrule( freq=MONTHLY, dtstart=initial_date, until=end_date)] for year, month in periods_of_interest: data = { "ano": str(year), "mes": str(month).zfill(2), "list": "Listar" } yield FormRequest(url=base_url, formdata=data)
def parse(self, response):
    user_origin = response.xpath(
        '//input[@name="userOriginHook"]/@value').get()
    yield FormRequest.from_response(response,
                                    formdata={
                                        'gdToken': '',
                                        'userOriginHook': user_origin,
                                        'postLoginUrl': '',
                                        'emailOptOut': '',
                                        'user.email_x': '*****@*****.**',
                                        'user.password_x': 'glassdoorbot12345'
                                    },
                                    callback=self.after_login)
def parse(self, response): form = response.xpath('//form[@id="formDepAtual"]') congresspeople = form.xpath('.//select[@name="deputado"]/option[string(@value)]/@value').extract() for congressperson in congresspeople: #yield FormRequest( # url=form.attrib['action'] #) yield FormRequest.from_response( response=response, formid='formDepAtual', formdata={ 'deputado': congressperson }, callback=self.parse_congressperson )
def amazon_request_start(self, response):
    req = 1
    for i in self.product_list:
        # Compare with == rather than identity when checking for an empty cpu field.
        title = i['title'] + ' ' + i['ssd'] if i['cpu'] == '' else \
            i['title'] + ' ' + i['cpu'] + ' ' + i['year']
        yield FormRequest.from_response(
            response,
            formname='site-search',
            formdata={'field-keywords': title},
            meta={'item': i, 'depth': 10},
            callback=self.start_amazon_search
        )
        print('Started request #' + str(req))
        req += 1
        time.sleep(1)