Code example #1
    def _request_number_of_pages(self, date_str):
        url = self.base_url + '/consulta_paginarBusquedaVisitas'

        request = FormRequest(url=url,
                              meta={
                                  'date': date_str,
                              },
                              formdata={
                                  'fechaDesde': date_str,
                                  'fechaHasta': date_str,
                                  'paginaActual': '1',
                                  'visita.visitanteNombres': '',
                                  'visita.personalNombre': '',
                                  'visita.oficinaNombre': '',
                                  'visita.sedeId': '00',
                                  'visita.ano': '',
                                  'visita.mes': '',
                                  'visita.fechaIngreso': '',
                                  'paginaNueva': '0',
                                  'visita.visitanteId': '0',
                                  'visita.personalId': '0',
                                  'visita.oficinaId': '0',
                              },
                              dont_filter=True,
                              callback=self.parse_initial_request)

        request.meta['date'] = date_str  # redundant: 'date' is already set via meta= above
        return request
Code example #2
File: chinaidrSpider.py Project: hanwei2008/crawl
 def _parse_list(self, response):
     report_list = response.xpath("//div[@class=\"reportlist bluelink\"]/ul//a/@href")
     for report_url in report_list:
         request = FormRequest(urljoin(self.base_url, report_url.extract()), callback=self.parse_item, dont_filter=False)
         request.meta["large_category_name"] = response.meta["large_category_name"]
         request.meta["mid_category_name"] = response.meta["mid_category_name"]
         request.meta["report_type"] = response.meta["report_type"]
         yield request
Code example #3
 def parse(self, response):
     large_categories = response.xpath(".//*[@class='rptmap']//strong//a")
     for large_category in large_categories:
         large_category_name = clean_text(large_category.xpath("./text()").extract()[0].strip())
         page_url = large_category.xpath("./@href").extract()[0]
         url = urljoin(self.base_url, page_url)
         request = FormRequest(url, callback=self.parse_middle_category, dont_filter=True)
         request.meta["large_category"] = large_category_name
         yield request
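
Note: this and several of the following snippets call a project-level clean_text helper that is never shown. A minimal sketch of what it presumably does (whitespace normalization; the real implementation may differ):

import re

def clean_text(text):
    # hypothetical stand-in for the project's helper, which is not shown
    return re.sub(r'\s+', ' ', text).strip()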
Code example #4
File: chinaidrSpider.py Project: hanwei2008/crawl
 def parse_middle_category(self, response):
     report_types = response.xpath(u"//li[contains(text(),'报告')]")
     for report_type in report_types:
         mid_category_url = urljoin(self.base_url, report_type.xpath(u"./preceding-sibling::span[1]/a/@href").extract()[0])
         request = FormRequest(mid_category_url, callback=self.parse_page, dont_filter=True)
         request.meta["large_category_name"] = response.meta["large_category_name"]
         request.meta["mid_category_name"] = response.meta["mid_category_name"]
         request.meta["report_type"] = clean_text(report_type.xpath("./text()").extract()[0].strip())
         request.meta["page_base_url"] = mid_category_url
         yield request
Code example #5
 def parse_middle_category(self, response):
     mid_categories = response.xpath(".//*[@class='report2']//h2//a")
     for mid_category in mid_categories:
         mid_category_name = clean_text(mid_category.xpath("./text()").extract()[0].strip())
         page_url = mid_category.xpath("./@href").extract()[0]
         url = urljoin(self.base_url, page_url)
         request = FormRequest(url, callback=self._parse_item, dont_filter=True)
         request.meta["large_category"] = response.meta["large_category"]
         request.meta["mid_category"] = mid_category_name
         request.meta["first_url"] = url
         yield request
Code example #6
File: chinaidrSpider.py Project: hanwei2008/crawl
 def parse(self, response):
     large_categories = response.xpath("//*[@class='tabContent bluelink']//*[contains(@style, 'padding')]/a")
     for large_category in large_categories:
         large_category_name = clean_text(large_category.xpath(".//text()").extract()[0].strip())
         mid_categorys = large_category.xpath("./parent::*/following-sibling::*[1]/a")
         for mid_category in mid_categorys:
             mid_category_name = clean_text(mid_category.xpath("./text()").extract()[0])
             mid_category_url = urljoin(self.base_url, mid_category.xpath("./@href").extract()[0])
             request = FormRequest(mid_category_url, callback=self.parse_middle_category, dont_filter=True)
             request.meta["large_category_name"] = large_category_name
             request.meta["mid_category_name"] = mid_category_name
             yield request
Code example #7
 def _parse_page_free(self, response):
     total_pages = int(clean_text(response.xpath(".//*[@class='pages']//a//text()").extract()[-2].strip()))
     first_url = response.meta["first_url"]
     request = FormRequest(first_url, callback=self._parse_free, dont_filter=True)
     request.meta["large_category"] = response.meta["large_category"]
     yield request
     if total_pages > 1:
         for i in xrange(1, total_pages):
             next_page = first_url[:-5] + '-p' + str(i + 1) + '.html'
             request = FormRequest(next_page, callback=self._parse_free, dont_filter=True)
             request.meta["large_category"] = response.meta["large_category"]
             yield request
Code example #8
 def parse(self, response):
     large_categories = response.xpath(".//*[@class='shopleft_bt']//a")
     middle_categories = response.xpath(".//*[@class='shopnav2']")
     for i in xrange(len(large_categories)):
         large_category_name = clean_text(large_categories[i].xpath("./text()").extract()[0].strip())
         middle_category_list = middle_categories[i].xpath(".//*[@class='shopleft_wt']")
         for middle_category in middle_category_list:
             middle_category_name = clean_text(middle_category.xpath(".//a/text()").extract())
             page_url = middle_category.xpath(".//a//@href").extract()[0]
             url = urljoin(self.base_url, page_url)
             request = FormRequest(url, callback=self._parse_item, dont_filter=True)
             request.meta["large_category"] = large_category_name
             request.meta["mid_category"] = middle_category_name
             yield request
Code example #9
File: congreso.py Project: andyfires/manolo_scraper
    def _request_next_page(self, response, date_str, callback):
        current_page = int(response.meta['current_page'])

        total_string = response.css('#LblTotal').xpath('./text()').extract_first(default='')

        total = re.search(r'(\d+)', total_string)

        if total:
            # Deal with the next page.
            total = total.group(1)
            number_of_pages = self._get_number_of_pages(int(total))

            if current_page < number_of_pages:
                current_page += 1

                formdata = {
                    'TxtFecha': date_str,
                    'BtnBuscar': 'Buscar',
                    'LwVisitasCR$DpVisitasCR$ctl02$ctl00.x': '1',
                    'LwVisitasCR$DpVisitasCR$ctl02$ctl00.y': '1'
                }

                request = FormRequest.from_response(response,
                                                    formdata=formdata,
                                                    dont_click=True,
                                                    dont_filter=True,
                                                    callback=callback,
                                                    )

                request.meta['date'] = date_str
                request.meta['current_page'] = current_page

                return request
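
The snippet relies on a _get_number_of_pages helper that is not shown. A minimal sketch, assuming a fixed number of result rows per page (the actual page size is an assumption):

import math

def _get_number_of_pages(self, total, page_size=20):
    # page_size is a guess; the site's real rows-per-page value is not shown
    return int(math.ceil(total / float(page_size)))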
Code example #10
    def parse(self, response):
        """
        这是默认的回调方法,得到response后:
        1. 如果需要登录,则先通过FormRequest登录论坛;
        2. 如果不需要登录,通过Request继续请求;
        :param response:
        :return:
        """
        # 需要登录,使用FormRequest.from_response模拟登录
        if 'id="lsform"' in response.body:
            logging.info('in parse, need to login, url: {0}'.format(response.url))
            form_data = {'handlekey': 'ls', 'quickforward': 'yes', 'username': '******', 'password': '******'}
            request = FormRequest.from_response(response=response,
                                                headers=self.headers,
                                                formxpath='//form[contains(@id, "lsform")]',
                                                formdata=form_data,
                                                callback=self.parse_list
                                                )
        else:
            logging.info('in parse, NOT need to login, url: {0}'.format(response.url))
            request = Request(url=response.url,
                              headers=self.headers,
                              callback=self.parse_list,
                              )

        yield request
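
Note that the substring test 'id="lsform"' in response.body only works on Python 2; on Python 3 response.body is bytes, and testing a str against it raises TypeError. A Python 3-safe variant of the same check:

def needs_login(response):
    # select the login form instead of searching raw bytes
    return bool(response.css('form#lsform'))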
Code example #11
    def parse(self, response):
        # test_urls = [
        #     "http://ntiaoji.kaoyan.com/tjadm/1.html",
        #     "http://ntiaoji.kaoyan.com/tjadm/2.html",
        #     "http://ntiaoji.kaoyan.com/tjadm/3.html",
        #     "http://ntiaoji.kaoyan.com/tjadm/4.html",
        #     "http://ntiaoji.kaoyan.com/tjadm/5.html",
        #     "http://ntiaoji.kaoyan.com/tjadm/6.html",
        #     "http://ntiaoji.kaoyan.com/tjadm/7.html"
        # ]
        #
        # for url in test_urls:
        #     print url
        #     time.sleep(2)
        #     self.headers['Referer'] = url
        #     yield FormRequest.from_response(response,
        #         headers=self.headers,
        #         formdata={
        #             'username': 'kytj1',
        #             'password': '6ujBJ4XQyLeGmJmB'
        #         },
        #         callback=self.download_page,
        #         dont_filter=True
        #     )
        return FormRequest.from_response(response,
            headers=self.headers,
            formdata={
                'username': 'kytj1',
                'password': '6ujBJ4XQyLeGmJmB'
            },
            callback=self.after_login,
            dont_filter=True
        )
Code example #12
File: chinaidrSpider.py Project: hanwei2008/crawl
 def parse_page(self, response):
     request_list = self._parse_list(response)
     for r in request_list:
         yield r
     next_page = response.xpath(u"//*[@id='AspNetPager1']/a[text()=\"下一页\"]/@href")
     if len(next_page) > 0:
         next_page_url = urljoin(self.base_url, next_page.extract()[0])
         if not next_page_url.startswith(response.meta["page_base_url"]):
             if next_page_url.endswith("html"):
                 next_page_url = response.meta["page_base_url"] + next_page_url[next_page_url.rindex("/") + 1:len(next_page_url)]
         request = FormRequest(next_page_url, callback=self.parse_page, dont_filter=True)
         request.meta["large_category_name"] = response.meta["large_category_name"]
         request.meta["mid_category_name"] = response.meta["mid_category_name"]
         request.meta["report_type"] = response.meta["report_type"]
         request.meta["page_base_url"] = response.meta["page_base_url"]
         yield request
Code example #13
 def parse(self, response):
     form_data = {'username': '******', 'password': '******', 'remember_me': '1'}
     return FormRequest.from_response(response,
                                      headers=self.headers,
                                      formxpath='//form[@class="form-login"]',
                                      formdata=form_data,
                                      callback=self.after_login,
                                      )
Code example #14
File: bgkuSpider.py Project: hanwei2008/crawl
    def parse(self,response):
        ## page_count_text {string}
        # @example
        #
        # if(ANP_checkInput('AspNetPager1_input',3270,'页索引超出范围!','页索引不是有效的数值!'))
        # {ANP_goToPage('AspNetPager1_input','page','http://www.bgku.cn/sitemap_1',
        # 'http://www.bgku.cn/sitemap_{page}','',3270,false);};return false;
        ##

        page_count_text = response.xpath('//*[@id="AspNetPager1_btn"]/@onclick').extract()[0]
        match = re.search(r',\d{4,},', page_count_text)
        page_count = int(match.group(0).strip(','))
        for page in range(1, page_count + 1):
            url = 'http://www.bgku.cn/sitemap_' + str(page)
            request = FormRequest(url, callback=self.parse_index_page, dont_filter=True)
            request.meta["page"] = page
            yield request
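
The ,\d{4,}, pattern silently breaks on sites with fewer than 1000 pages. A sketch of a more anchored alternative, keyed to the ANP_goToPage(...) call quoted in the comment above (it assumes the same onclick markup):

import re

match = re.search(r"ANP_goToPage\([^)]*?,(\d+),(?:true|false)\)", page_count_text)
page_count = int(match.group(1))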
Code example #15
 def parse(self, response):
     yield FormRequest.from_response(
         response,
         formname='aspnetForm',
         formdata={'Skin$body$FundingSourceChoices$0': '1',
                   'Skin$body$FundingSourceChoices$1': '0'},
         meta={'curr_listing_page': 1,  'flag': False},
         callback=self.after_login)
Code example #16
 def parse(self, response):
     url  = response.url
     if "research" in url:
         categories = response.xpath(".//*[@class='catec']")
         for i in xrange(len(categories)-1):
             large_categories = categories[i].xpath(".//*[@class='fl']")
             large_category_name = clean_text(large_categories.xpath(".//text()").extract()[0].strip())
             mid_categories = categories[i].xpath(".//span")
             for mid_category in mid_categories:
                 mid_category_name = clean_text(mid_category.xpath(".//text()").extract()[0].strip())
                 page_url = mid_category.xpath(".//@href").extract()[0]
                 request = FormRequest(page_url, callback=self._parse_page_research, dont_filter=True)
                 request.meta["large_category"] = large_category_name
                 request.meta["mid_category"] = mid_category_name
                 request.meta["first_url"] = page_url
                 yield request
     elif "free" in url:
         large_categories = response.xpath(".//*[@class='tul2']//h2//a")
         for i in xrange(len(large_categories)):
             large_category_name = clean_text(large_categories[i].xpath(".//text()").extract()[0].strip())
             page_url = large_categories[i].xpath("./@href").extract()[0]
             request = FormRequest(page_url, callback=self._parse_page_free, dont_filter=True)
             request.meta["large_category"] = large_category_name
             request.meta["first_url"] = page_url
             yield request
Code example #17
    def parse(self, response):
        return FormRequest.from_response(response,
            headers=self.headers,
            formdata={
                'username': 'kytj1',
                'password': '6ujBJ4XQyLeGmJmB'
            },
            callback=self.after_login,
            dont_filter=True
        )
Code example #18
 def parse(self, response):
     login_form = {
         'login': self.username,
         'password': self.password,
     }
     return FormRequest.from_response(
         response,
         formdata=login_form,
         callback=self.after_login
     )
Code example #19
File: Spiders.py Project: adityamahesa/SpiderKai
 def parse(self, response):
     yield FormRequest.from_response(response,
                                     formdata={
                                         'tanggal': '20160817#Rabu, 17 Agustus 2016',
                                         'origination': 'KAC#KIARACONDONG',
                                         'destination': 'MN#MADIUN',
                                         'adult': '1',
                                         'infant': '0'
                                     },
                                     callback=self.parseInfo)
Code example #20
File: doubanmovie.py Project: Suluo/spider-Scrapy
    def login(self,response):
        # login = requests.post(response.url,
        #                       headers = self.headers,
        #                       data={
        #                              'source':'None',
        #                              'redir':'https://www.douban.com/people/60012975/',
        #                              'form_email':'*****@*****.**',
        #                              'form_password':'******',
        #
        #                              'remember':'on',
        #                              'login':u'登录'
        #                       })

        hxs = Selector(response)
        if hxs.xpath('//*[@name="captcha-id"]/@value').extract():
            captchaID = hxs.xpath('//*[@name="captcha-id"]/@value').extract()[0]
            captchAdd = hxs.xpath('//*[@id="captcha_image"]/@src').extract()[0]
            urllib.urlretrieve(captchAdd,'captcha.jpg')
            captch = raw_input('please input the captcha:')
            yield FormRequest.from_response(response,
                                            meta =response.meta,
                                            # headers = self.headers,
                                            formdata={'source':'None',
                                                      'redir':'https://www.douban.com/people/unlucky_strike/',
                                                      'form_email':'*****@*****.**',
                                                      'form_password':'******',
                                                      'captcha-solution':captch,
                                                      'captcha-id':captchaID,
                                                      'remember':'on',
                                                      'login':u'登录'},
                                            callback=self.parse)
        else:
            yield FormRequest.from_response(response,
                                            meta ={'cookiejar':response.meta['cookiejar']},
                                            # headers = self.headers,
                                            formdata={'source':'None',
                                                      'redir':'https://www.douban.com/people/unlucky_strike/',
                                                      'form_email':'*****@*****.**',
                                                      'form_password':'******',
                                                      'remember':'on',
                                                      'login':u'登录'},
                                            callback=self.parse)
Code example #21
 def parse(self, response):
     '''Parse login page'''
     return FormRequest.from_response(
         response,
         formxpath='//form[contains(@action, "login")]',
         formdata={
             'email': self.username,
             'pass': self.password,
         },
         callback=self.parse_home,
     )
Code example #22
File: ambiente.py Project: matiskay/manolo_scraper
    def _get_page_request(self, response, page, date):

        request = FormRequest.from_response(
            response,
            formdata={"txtDesde": date, "__EVENTTARGET": "gvwConsulta", "__EVENTARGUMENT": "Page${}".format(page)},
            dont_filter=True,
            callback=self.parse,
        )

        request.meta["date"] = date

        return request
Code example #23
File: oxygen.py Project: ercchy/oxygendemo
 def currency_form(self, response):
     """
     Currency form viewed and change to USD posted.
     """
     self.log('currency_form', level=logging.INFO)
     formdata = {
         'ddlCountry1': 'United States',
         'ddlCurrency': '503329C6-40CB-47E6-91D1-9F11AF63F706'
     }
     return FormRequest.from_response(response,
                                      formdata=formdata,
                                      callback=self.currency_changed)
Code example #24
File: working.py Project: gerich/autodata
 def __engine_codes_request(self, series):
     url = 'https://' + self.allowed_domains[0] + '/v2/engine_code/selection'
     request = FormRequest(
         url = url,
         formdata = {
             'manufacturer': series['link'],
             'body': series['model_family_id'],
             'litres': series['litres'],
             'fuel': series['fuel'],
             'freetext': series['text'],
             'vehicletype': series['vehicletype'],
             'module': 'RT'
         },
         callback = self.parse_engine_codes,
         meta = {
             'series': series,
         },
         dont_filter = True
     )
     self.__prepare_request(request)
     request.method = 'POST'  # redundant: FormRequest already defaults to POST
     return request
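
__prepare_request is defined elsewhere in that project and is not shown; a hypothetical sketch of the kind of session plumbing such a helper typically does (both headers here are assumptions):

def __prepare_request(self, request):
    # hypothetical: attach headers shared by every request in this spider
    request.headers.setdefault('X-Requested-With', 'XMLHttpRequest')
    request.headers.setdefault('Referer', 'https://' + self.allowed_domains[0] + '/')
    return request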
Code example #25
 def parse_home(self, response):
     '''Parse user news feed page'''
     if response.css('#approvals_code'):
         # Handle 'Approvals Code' checkpoint (ask user to enter code).
         if not self.code:
             # Show facebook messages via logs
             # and request user for approval code.
             message = response.css('._50f4::text').extract()[0]
             self.log(process_string(message))
             message = response.css('._3-8y._50f4').xpath('string()').extract()[0]
             self.log(process_string(message))
             self.code = input('Enter the code: ')
         self.code = str(self.code)
         if not (self.code and self.code.isdigit()):
             self.log('Bad approvals code detected.')
             return
         return FormRequest.from_response(
             response,
             formdata={'approvals_code': self.code},
             callback=self.parse_home,
         )
     elif response.css('input#u_0_1'):
         # Handle 'Save Browser' checkpoint.
         return FormRequest.from_response(
             response,
             formdata={'name_action_selected': 'dont_save'},
             callback=self.parse_home,
             dont_filter=True,
         )
     elif response.css('button#checkpointSubmitButton'):
         # Handle `Someone tried to log into your account` warning.
         return FormRequest.from_response(
             response, callback=self.parse_home, dont_filter=True,)
     # Else go to the user profile.
     href = response.css('a[title="Profile"]::attr(href)').extract()[0]
     return Request(
         response.urljoin(href),
         callback=self.parse_profile,
     )
Code example #26
File: Sina.py Project: shichangtai/ScrapySpider
 def parse_start_url(self, response):
     sel=Selector(response)
     passwd=sel.xpath(r'/html/body/div[2]/form/div/input[2]/@name').extract_first()
     captchaUrl=sel.xpath(r'/html/body/div[2]/form/div/img[1]/@src').extract_first()
     code=requests.get(captchaUrl)
     with open('/home/shichangtai/code.gif','wb') as f:
         f.write(code.content)
     captcha = raw_input('Enter the captcha: ')
     # the meta on this first request creates the cookie jar; every subsequent request will carry this cookiejar
     return [FormRequest.from_response(response=response,
                 formdata={'mobile': self.account, passwd: self.password, 'code': captcha},
                 meta={'cookiejar': 1},  # do not set 'dont_merge_cookies' to True
                 callback=self.after_log)]
Code example #27
 def _get_page_request(self, response, page, date):
     request = FormRequest.from_response(
         response,
         formdata={
             'txtDesde': date,
             '__EVENTTARGET': 'gvwConsulta',
             '__EVENTARGUMENT': 'Page${}'.format(page),
         },
         dont_filter=True,
         callback=self.parse,
     )
     request.meta['date'] = date
     return request
Code example #28
File: ambiente.py Project: matiskay/manolo_scraper
    def parse_initial_request(self, response):
        date = response.meta["date"]

        request = FormRequest.from_response(
            response,
            formdata={"txtDesde": date, "btnBuscar.x": "1", "btnBuscar.y": "1"},
            dont_filter=True,
            callback=self.parse_page,
        )

        request.meta["date"] = date

        yield request
Code example #29
File: waroengkom.py Project: trihatmaja/scraping
 def parse_category(self, response):
     items = response.xpath('//div[@class="datagrid"]//tr')
     for item in items:
         product = item.xpath('td//font/b/span[contains(@id, "main_GDVMain_lblProductName")]/text()').extract()
         #price = item.xpath('td//font/b/span[contains(@id, "main_GDVMain_lblHarga")]/text()').extract()
         #link = 
         if len(product) > 0:
             print product

         pages = item.xpath('td[@colspan="3"]//a/@href').re(r"doPostBack\(([^)]+')")
         if len(pages) > 0:
             for page in pages:
                 # each capture looks like "'target','argument'"; split it into the postback fields
                 eventtarget, eventargument = [p.strip().strip("'") for p in page.split(',', 1)]
                 yield FormRequest.from_response(response, formdata={'__EVENTTARGET': eventtarget, '__EVENTARGUMENT': eventargument}, callback=self.parse_items, dont_click=True)
Code example #30
 def parse(self, response):
     """
     Overwrites Spiders parse method. Fill in log in details in log in form and submit.
     :return:
     """
     print('custom settings:')
     print(self._settings)
     return FormRequest.from_response(
         response,
         formxpath='//div[contains(concat(" ", normalize-space(@class), " "), " main-container ")]/descendant::form',
         formdata={'EmailOrUsername': self._settings['username'], 'Password': self._settings['password']},
         callback=self.go_to_search_site
     )
Code example #31
 def parse(self, response):
     yield FormRequest.from_response(
         response,
         formid='register',
         formdata={
             'username':'******',
             'password':'******',
             'redirect' : '/',
             'debug_token':' ',
             'login':'******',
             'user-agent' : response.request.headers.get('User-Agent').decode('utf-8')
         },
         callback=self.after_login
     )
Code example #32
 def start_requests(self):
     """The City Hall website publish the gazettes in a page with a form
     that allow users to browse through different years and months. This
     form sends requests via POST, so this method emulates a series of these
     POSTs.
     @url http://www.pmf.sc.gov.br/governo/index.php?pagina=govdiariooficial
     @returns requests 1
     """
     target = date.today()
     while target >= self.AVAILABLE_FROM:
         year, month = str(target.year), str(target.month)
         data = dict(ano=year, mes=month, passo='1', enviar='')
         yield FormRequest(url=self.URL, formdata=data, callback=self.parse)
         # step backwards one month at a time; adding months would never terminate
         target = target - relativedelta(months=1)
Code example #33
 def parse_pre_login(self, response):
     authenticity_token = response.xpath(
         '//input[@name="authenticity_token"]/@value').extract_first()
     yield FormRequest(
         url='http://www.qiyaoinvest.com/pc/login/submit_user',
         formdata={
             'utf8': '✓',
             'authenticity_token': authenticity_token,
             'login_name': '13916427906',
             'password': '******',
             'auto_login': '******',
             'login': '******'
         },
         callback=self.parse_login)
Code example #34
    def login(self, response):
        # parse all of the form's fields
        post_data = parse_form(response.text)
        post_data['email'] = '*****@*****.**'
        post_data['password'] = '******'

        # submit the form to log in (equivalent to POSTing to the url)
        return [
            FormRequest(
                'http://example.webscraping.com/places/default/user/login',
                formdata=post_data,
                headers=headers,
                callback=self.after_login)
        ]
Code example #35
    def make_request(self, response):
        form_data = {}
        for i in response.xpath('//form//input'):
            name = i.xpath('./@name').extract_first()
            value = i.xpath('./@value').extract_first() or ''
            form_data[name] = value

        month_before = (datetime.now() -
                        timedelta(days=30)).strftime('%m/%d/%Y')
        form_data['txtFromDate'] = month_before

        yield FormRequest(url=response.request.url,
                          formdata=form_data,
                          callback=self.parse_retraction_data)
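
Collecting every <input> by hand duplicates what FormRequest.from_response already does. An equivalent sketch, assuming the page holds a single form that posts back to the same URL:

from datetime import datetime, timedelta
from scrapy import FormRequest

def make_request(self, response):
    # from_response pre-fills the form's <input> values automatically
    month_before = (datetime.now() - timedelta(days=30)).strftime('%m/%d/%Y')
    yield FormRequest.from_response(
        response,
        formdata={'txtFromDate': month_before},
        callback=self.parse_retraction_data,
    )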
Code example #36
 def login(self, response):
     xsrf = response.xpath('.//input[@name="_xsrf"]/@value').extract_first()
     return FormRequest('https://www.zhihu.com/login/phone_num',
                        callback=self.after_login,
                        meta={
                            'cookiejar': response.meta['cookiejar'],
                            'xsrf': xsrf
                        },
                        formdata={
                            '_xsrf': xsrf,
                            'phone_num': get_project_settings().get('USER'),
                            'password': get_project_settings().get('PWD'),
                            'captcha_type': 'cn'
                        })
Code example #37
File: DaPuAseet.py Project: qianbin0205/ggscrap
 def log_in(self, response):
     cms = response.xpath(
         '//input[@name="__cmsform__"]/@value').extract_first()
     yield FormRequest(url='http://www.dapufund.com/Public/checkLogins',
                       formdata={
                           'name': '13916427906',
                           'pass': '******',
                           '__cmsform__': cms
                       },
                       meta={
                           'dont_redirect': True,
                           'handle_httpstatus_list': [302, 301]
                       },
                       callback=self.start_requests)
Code example #38
 def start_requests(self):
     #start_urls = ['https://www.luxuryhomemarketing.com/real-estate-agents/advanced_search.html/']
     area_codes = ['AK', 'AL']
     for area_code in area_codes:
         yield FormRequest(
             url='https://www.luxuryhomemarketing.com/real-estate-agents/advanced_search.html/',
             formdata={
                 'Country': 'US/CA',
                 'State_prov': area_code
             },
             callback=self.parse,
             meta={"area_code": area_code},
             dont_filter=True)
Code example #39
 def login_to_library(self, response):
     start_urls = response.meta['start_urls']
     login_form = response.xpath(self.login_form_xpath)
     if login_form:
         credentials = {
             'user': self.settings['CREDENTIALS']['user'],
             'pass': self.settings['CREDENTIALS']['pass'],
         }
         # open_in_browser(response)
         yield FormRequest.from_response(response, formxpath=self.login_form_xpath, formdata=credentials, meta={'start_urls': start_urls}, dont_filter=True, callback=self.parse_journals)
     else:
         # If no login required
         # open_in_browser(response)
         yield scrapy.Request(response.url, dont_filter=True, meta={'start_urls': start_urls}, callback=self.parse_journals)
Code example #40
 def start_requests(self):
     for i in range(1, self.pageIndex):
         form_data = {
             'page': str(i),
             'rows': '10',
         }
         request = FormRequest(self.tmpl_url,
                               callback=self.parse_page,
                               formdata=form_data,
                               dont_filter=True,
                               meta={
                                   "dynamic": True,
                               })
         yield request
Code example #41
File: records.py Project: zntt/stf
 def start_requests(self):
     for date in date_range(self.start_date, self.end_date):
         data = {
             'diaAtual': '{:02d}'.format(date.day),
             'mesAtual': '{:02d}'.format(date.month),
             'anoAtual': str(date.year),
         }
         yield FormRequest(
             method='POST',
             url=RECORDS_BY_DATE_URL,
             formdata=data,
             meta={'date': date},
             callback=self.parse_record_list,
         )
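
date_range is imported from elsewhere in that project; a minimal day-by-day sketch of what it presumably yields:

from datetime import timedelta

def date_range(start_date, end_date):
    # hypothetical helper: yield every date from start_date to end_date inclusive
    current = start_date
    while current <= end_date:
        yield current
        current += timedelta(days=1)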
Code example #42
 def parse(self, response):
     """解析省级地名"""
     try:
         data = json.loads(response.body)
         areaName_list = [item['Key'] for item in data[3]['Child']]
         for areaName in areaName_list:
             formdata = {'Param': '全文检索:诈骗,法院地域:%s' % (areaName)}
             yield FormRequest(url=self.court_url,
                               formdata=formdata,
                               meta={'areaName': areaName},
                               dont_filter=True,
                               callback=self.parse_second)
     except:
         print 'Failed to fetch the province-level area names, please check!'
Code example #43
    def parse(self, response):

        _corp_id = response.meta['corp_id']

        if _corp_id != '00000000':

            # parse the returned JSON
            try:
                ret_data = json.loads(response.body_as_unicode())
            except Exception as e:
                print(f'{_corp_id}')
                print(e)

            else:

                corp_data = ret_data.get('Data')

                # build the item (the item definition was adjusted for this)
                item = BussinessriskItem()

                item['Data'] = corp_data
                item['corp_id'] = _corp_id
                item['corp_str'] = response.meta['corp_str']

                yield item

        corp_strs = self.rds.srandmember(self.redis_key, 2)  # removed from the queue in the pipeline

        # iterate the queue
        if corp_strs:  # srandmember returns an empty list, never None, when the set is empty

            for corp_str in corp_strs:
                corp_dict = json.loads(corp_str)
                corp_id = corp_dict.get('corp_id')
                corp_old_id = corp_dict.get('corp_old_id')

                data = {}

                yield FormRequest(url=self.post_url,
                                  headers=self.headers,
                                  formdata=data,
                                  callback=self.parse,
                                  meta={
                                      'corp_id': corp_id,
                                      'corp_str': corp_str
                                  })

        else:
            print('queue empty, exit')
            return
Code example #44
 def pre_parse(self, response):  # collect the parameters needed for login
     form = response.xpath("//form[@action='/session']")  # the login form
     keys = ['utf8', 'authenticity_token', 'commit']  # hidden fields to extract
     values = []
     for key in keys:  # read each key's value from the page
         xpath = "//form[@action='/session']//input[@name='%s']/@value" % key
         value = response.xpath(xpath).extract()[0]
         values.append(value)
     postdata = dict(zip(keys, values))  # zip into a dict
     postdata['login'] = '******'
     postdata['password'] = '******'
     yield FormRequest(self.loginurls,
                       formdata=postdata,
                       callback=self.login_parse)  # POST the form to GitHub
Code example #45
    def start_requests(self):

        # self.start
        # self.end
        self.rnd = int(self.start)

        yield FormRequest(
            url='https://dhlottery.co.kr/gameResult.do?method=byWin',
            method='POST',
            formdata={
                'drwNo': str(self.rnd),
                'dwrNoList': str(self.rnd)
            },
            callback=self.parse)
Code example #46
File: ShiyuInvest.py Project: xhsong2009/ggscrapy
 def start_requests(self):
     yield FormRequest(url='http://www.jdoor.cn/api/customerlogin/',
                       method='POST',
                       body=json.dumps({
                           "phone": "13916427906",
                           "verifycode": self.verifycode
                       }),
                       headers={
                           'org': '1819b573-3e70-4adc-95f1-d8a2b8a09787',
                           'Content-Type': 'application/json',
                           'Pragma': 'no-cache',
                           'Accept': '*/*'
                       },
                       callback=self.parse_login)
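
Because the payload is passed pre-serialized via body=, FormRequest's form-encoding is bypassed entirely, so a plain Request does the same job:

import json
from scrapy import Request

def start_requests(self):
    yield Request(url='http://www.jdoor.cn/api/customerlogin/',
                  method='POST',
                  body=json.dumps({
                      "phone": "13916427906",
                      "verifycode": self.verifycode
                  }),
                  headers={
                      'org': '1819b573-3e70-4adc-95f1-d8a2b8a09787',
                      'Content-Type': 'application/json',
                      'Pragma': 'no-cache',
                      'Accept': '*/*'
                  },
                  callback=self.parse_login)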
Code example #47
File: ctrip.py Project: LevyYuan/scrapy_ctrip
    def start_requests(self):
        page = 0
        post_group_data = dict(QType='queryv3', Data=self.group_data % page)
        post_free_data = dict(QType='queryv3', Data=self.free_data % page)
        post_ticket_data = dict(QType='queryv3', Data=self.ticket_data % page)
        post_oneday_data = dict(QType='queryv3', Data=self.oneday_data % page)
        post_cruise_data = dict(QType='queryv3', Data=self.cruise_data % page)

        # request group tours
        yield FormRequest(self.post_url,
                          formdata=post_group_data,
                          callback=self.parse_group,
                          meta={'page': page})

        # request independent travel
        yield FormRequest(self.post_url,
                          formdata=post_free_data,
                          callback=self.parse_free,
                          meta={'page': page})

        # request tickets
        yield FormRequest(self.post_url,
                          formdata=post_ticket_data,
                          callback=self.parse_ticket,
                          meta={'page': page})

        # request one-day tours
        yield FormRequest(self.post_url,
                          formdata=post_oneday_data,
                          callback=self.parse_oneday,
                          meta={'page': page})

        # request cruises
        yield FormRequest(self.post_url,
                          formdata=post_cruise_data,
                          callback=self.parse_cruise,
                          meta={'page': page})
Code example #48
    def start_requests(self):
        base_url = 'http://httpbin.org'
        module = 'user'  #sheet name
        file_name = cs.FILE_NAME  # test-case file name

        #eggfile = os.path.join(os.path.dirname(os.path.dirname(os.path.realpath(__file__))),'case',file_name)  # even with the path, files inside the egg cannot be read directly
        eggpath = os.path.dirname(
            os.path.dirname(os.path.dirname(
                os.path.realpath(__file__))))  # locate the temporary egg file
        zipf = zipfile.ZipFile(eggpath)
        zipf.extract('xlstest/case/' + file_name,
                     '/')  # copy the test-case xlsx out of the egg to the root (the path comes along)
        path = '/xlstest/case/' + file_name  # the copied test-case xlsx

        # read the xlsx file
        excel.open_excel(path)
        sheet = excel.get_sheet(module)
        rows = excel.get_rows(sheet)

        # fetch each test case's request info row by row
        for i in range(2, rows):
            testNumber = excel.get_content(sheet, i, cs.CASE_NUMBER)  # case number
            testName = excel.get_content(sheet, i, cs.CASE_NAME)  # case name
            testUrl = excel.get_content(sheet, i, cs.CASE_URL)  # endpoint path
            testUrl = base_url + testUrl
            testMethod = excel.get_content(sheet, i, cs.CASE_METHOD)  # request method
            testMethod = testMethod.strip().upper()  # strip whitespace and uppercase
            testdata = excel.get_content(sheet, i, cs.CASE_DATA)  # request data
            testdata = {} if testdata.strip() == '' else eval(
                testdata)  # an empty cell becomes an empty dict
            testHeaders = eval(excel.get_content(sheet, i, cs.CASE_HEADERS))
            testCode = str(excel.get_content(sheet, i, cs.CASE_CODE))  # expected status code
            # dispatch by request method
            if testMethod == 'POST':
                yield FormRequest(testUrl,
                                  self.parse,
                                  headers=testHeaders,
                                  formdata=testdata,
                                  meta={
                                      'testCode': testCode,
                                      'testNumber': testNumber
                                  })
            else:
                yield Request(testUrl,
                              self.parse,
                              meta={
                                  'testCode': testCode,
                                  'testNumber': testNumber
                              })
Code example #49
    def parse(self, response):
        data = json.loads(response.text)
        feed = data['data']['viewer']['marketplace_feed_stories']

        for e in feed['edges']:
            node = e['node']
            node.pop('tracking')
            node.pop('id')

            listing = node.pop('listing', {})
            listing_name = listing.pop('__typename')
            primary_listing_photo = listing.pop('primary_listing_photo')
            formatted_price = listing.pop('formatted_price')
            location = listing.pop('location')
            custom_sub_titles = listing.pop(
                'custom_sub_titles_with_rendering_flags')
            pre_recorded_videos = listing.pop('pre_recorded_videos')
            delivery_types = listing.pop('delivery_types')
            seller = listing.pop('marketplace_listing_seller')
            story = listing.pop('story')

            node.update(listing)
            node['listing_name'] = listing_name
            image = primary_listing_photo['image']['uri']
            node['image_urls'] = [image] if image else []
            node['price'] = formatted_price['text']

            geo_code = location['reverse_geocode']
            node['location'] = ', '.join([geo_code['city'], geo_code['state']])

            node['custom_sub_titles'] = ', '.join(e['subtitle']
                                                  for e in custom_sub_titles)
            node['delivery_types'] = ', '.join(delivery_types)

            node['seller_id'] = seller['id']
            node['seller_type'] = seller['__typename']
            node['seller_name'] = seller['name']
            node['story_url'] = story['url']

            yield node

        if not feed['page_info']['has_next_page']:
            print("No Next Page")
            return
        self.variables['cursor'] = feed['page_info']['end_cursor']
        self.data['variables'] = json.dumps(self.variables)
        yield FormRequest(url=self.graph_url_t,
                          formdata=self.data,
                          meta=response.meta)
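
The paging state (self.variables, self.data, self.graph_url_t) is initialized elsewhere in that spider and is not shown. A hypothetical sketch of its shape, with placeholder values:

import json

# hypothetical initial state; the real doc_id, count and URL are not shown
graph_url_t = 'https://www.facebook.com/api/graphql/'
variables = {'count': 24, 'cursor': None}
data = {
    'doc_id': '0000000000000000',  # placeholder GraphQL document id
    'variables': json.dumps(variables),
}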
Code example #50
    def parse_doodle(self, response):
        """解析其他用户对图片的涂鸦内容"""
        data = json.loads(response.text)
        if data.get('data') and data.get('data').get('list'):
            doodle_items = data.get('data').get('list')  # renamed: don't shadow the built-in list
            if len(doodle_items) > 0:
                create_time = doodle_items[-1].get('create_time')
            else:
                create_time = datetime.now()
            pd_list = []
            for doodle in doodle_items:
                user = {}
                user['user_avatar'] = doodle.get('avatar')
                user['doodle_time'] = doodle.get('create_time')
                user['doodle_list'] = []
                for d in doodle.get('chartlet_list'):
                    d_dict = {}
                    d_dict['angle'] = d.get('angle')
                    d_dict['center_x'] = d.get('center_x')
                    d_dict['center_y'] = d.get('center_y')
                    d_dict['zoom'] = d.get('zoom')
                    d_dict['picture'] = d.get('picture')
                    d_dict['is_turn'] = d.get('is_turn')
                    d_dict['rect_upper_left_x'] = d.get('rect_upper_left_x')
                    d_dict['rect_upper_left_y'] = d.get('rect_upper_left_y')
                    d_dict['rect_width'] = d.get('rect_width')
                    d_dict['rect_height'] = d.get('rect_height')
                    d_dict['word'] = d.get('word')

                    user['doodle_list'].append(d_dict)

                pd_list.append(user)

            meta_d = {
                'id': response.meta.get('id'),
                'tag': response.meta.get('tag'),
                'pet_pic': response.meta.get('pet_pic'),
                'like_num': response.meta.get('like_num'),
                'owner_name': response.meta.get('owner_name'),
                'owner_avatar': response.meta.get('owner_avatar'),
                'city': response.meta.get('city'),
                'v_second': response.meta.get('v_second'),
                'video': response.meta.get('video'),
                'doodle_list': pd_list,
                'create_time': create_time
            }
            yield FormRequest(url=self.ty_comment.format(cid=response.meta.get('id')),
                              meta=meta_d,
                              callback=self.parse_comment)
Code example #51
    def follows_info(self, response):
        fb_dtsg = response.meta["fb_dtsg"]
        ajax = response.meta["ajax"]
        data = {
            "m_sess": "",
            "fb_dtsg": fb_dtsg,
            "__dyn": " ",
            "__req": " ",
            "__ajax__": ajax,
            "__user": "******"
        }

        html = copy.deepcopy(response.body.decode())
        next_cursor = re.findall(
            r'm_more_friends.*?href\\\":\\\"\\\\\\(.*?)\"', html)
        if len(next_cursor) != 0:
            real_cursor = self.base_url + re.sub(
                r'\\', '', re.sub(r"\\u0025", "%", next_cursor[0], 2))
        # print(next_cursor)
        r_html = re.sub(
            r'\\', '',
            re.sub(
                r'\\u003C', '<',
                re.findall(r'\"html\":\"(.*?)\",\"replaceifexists\"',
                           html)[0]))
        # print(r_html)
        real_html = lxml.etree.HTML(HTMLParser().unescape(r_html)).xpath(
            "//div[@class='_55wo _55x2']//div[@class='_55wp _4g33 _5pxa']")
        for i in real_html:
            person_info = {}
            home_page = i.xpath("./div[@class='_5s61 _2b4m']/a/@href")
            if len(home_page) == 0:
                home_page = ''
            else:
                home_page = home_page[0]
            name = i.xpath("./div[@class='_5s61 _2b4m']/a/i/@aria-label")[0]
            person_info["name"] = name
            person_info["home_page"] = home_page
            yield person_info

        if len(next_cursor) != 0:
            yield FormRequest(url=real_cursor,
                              formdata=data,
                              callback=self.follows_info,
                              meta={
                                  "fb_dtsg": fb_dtsg,
                                  "ajax": ajax
                              },
                              headers=self.headers)
Code example #52
 def start_requests(self):
     #uids=self.read_uid()
     uids = self.get_nickname()
     for uid in uids:
         result=self.db['users'].find_one({"NickName": uid}, {'_id': 0, 'NickName': 1})
         if not result:
             url='https://weibo.cn/search/'
             postdata={'keyword':uid,'suser':'******'}
             yield FormRequest(url, formdata=postdata,callback=self.parse_userurl,meta={'uid':uid},priority=22,dont_filter=True)
             self.requestcount+=1
         else:
             uid=self.db['users'].find_one({"NickName":uid}, {'_id':0, 'Id': 1})
             uid=uid["Id"]
             url = 'https://weibo.cn/{}/info'.format(uid)
             yield Request(url, callback=self.parse_user_info,priority=22,dont_filter=True)
Code example #53
 def start_requests(self):
     yield FormRequest(
         url='http://www.ifc-cherami.com/index.php?g=portal&m=user&a=dologin',
         method='POST',
         formdata={
             'login_phone': self.username,
             'login_passwd': self.password,
             'checkbox': 'on'
         },
         headers={
             'Content-Type':
             'application/x-www-form-urlencoded; charset=UTF-8'
         },
     )
Code example #54
 def get_page(self, response):
     for page in range(1, 31):
         form_data = {
             "city": self.city,
             # "kd": self.keyword,
             "kd": u"爬虫",
             "pn": str(page),
             'needAddtionalResult': 'False',
             'isSchoolJob': '0'
         }
         yield FormRequest(url=self.url,
                           formdata=form_data,
                           meta={'cookiejar': response.meta['cookiejar']},
                           headers=self.headers,
                           callback=self.get_job_url)
Code example #55
 def post_login(self, response):
     html = pq(response.text)
     server_time = html("input[type='hidden']").eq(1).val()
     login_url = 'http://passport.lagou.com/login/login.html?ts={}'.format(
         server_time)
     form_data = {
         "isValidate": "true",
         "username": "******",
         "password": "******"
     }
     yield FormRequest(login_url,
                       formdata=form_data,
                       meta={'cookiejar': response.meta['cookiejar']},
                       headers=self.headers,
                       callback=self.get_page)
Code example #56
    def get_pages(self, response: scrapy.http.Response) -> scrapy.FormRequest:
        """
        Manages the multiple pages for bigger queries
        :param response: results page
        :type response: scrapy.http.Response
        :return: a FormRequest per concrete page
        :rtype: scrapy.FormRequest
        """

        # Pagination row
        paginator = response.css(
            "#ctl00_cphMainContent_gvSearchResults tr.gridPager:first-child td table tr td"
        )

        # Multi-pages with 'Last' problem (more than paginator length) /// solved
        pg = 0
        while int(paginator.css("a::text")[-1].extract()) != pg:
            pg += 1
            yield scrapy.FormRequest(
                url=response.url,
                formdata={
                    "ctl00_cphMainContent_txtLCSTartDate_dateInput_text":
                    f"1/1/{CURRENT_YEAR}",
                    "ctl00_cphMainContent_txtLCEndDate_dateInput_text":
                    f"12/31/{CURRENT_YEAR}",
                    "ctl00$cphMainContent$ddlLCDocumentType$vddlDropDown":
                    "101627",
                    '__VIEWSTATE':
                    response.css(
                        'input#__VIEWSTATE::attr(value)').extract_first(),
                    '__EVENTARGUMENT':
                    f"Page${pg}",
                    "__EVENTTARGET":
                    "ctl00$cphMainContent$gvSearchResults",
                },
                callback=self.get_rows)
Code example #57
    def start_requests(self):
        base_url = "http://www.natal.rn.gov.br/dom/"

        initial_date = date(self.start_date.year, self.start_date.month, 1)
        end_date = date.today()

        periods_of_interest = [(date.year, date.month) for date in rrule(
            freq=MONTHLY, dtstart=initial_date, until=end_date)]
        for year, month in periods_of_interest:
            data = {
                "ano": str(year),
                "mes": str(month).zfill(2),
                "list": "Listar"
            }
            yield FormRequest(url=base_url, formdata=data)
Code example #58
 def parse(self, response):
     user_origin = response.xpath(
         '//input[@name="userOriginHook"]/@value').get()
     yield FormRequest.from_response(response,
                                     formdata={
                                         'gdToken': '',
                                         'userOriginHook': user_origin,
                                         'postLoginUrl': '',
                                         'emailOptOut': '',
                                         'user.email_x':
                                         '*****@*****.**',
                                         'user.password_x':
                                         'glassdoorbot12345'
                                     },
                                     callback=self.after_login)
Code example #59
 def parse(self, response):
     form = response.xpath('//form[@id="formDepAtual"]')
     congresspeople = form.xpath('.//select[@name="deputado"]/option[string(@value)]/@value').extract()
     for congressperson in congresspeople:
         #yield FormRequest(
         #    url=form.attrib['action']
         #)
         yield FormRequest.from_response(
             response=response,
             formid='formDepAtual',
             formdata={
                 'deputado': congressperson
             },
             callback=self.parse_congressperson
         )
Code example #60
 def amazon_request_start(self, response):
     req = 1
     for i in self.product_list:
         title = i['title'] + ' ' + i['ssd'] if i['cpu'] == '' else \
             i['title'] + ' ' + i['cpu'] + ' ' + i['year']  # == instead of 'is': identity checks on string literals are unreliable
         yield FormRequest.from_response(
             response,
             formname='site-search',
             formdata={'field-keywords': title},
             meta={'item': i, 'depth': 10},
             callback=self.start_amazon_search
         )
         print('Started request #' + str(req))
         req += 1
         time.sleep(1)