def parse(self, response):
        # deleting files:
        try:
            if os.path.exists("newPost.txt"):
                os.remove("newPost.txt")
            if os.path.exists("newgetLinks.txt"):
                os.remove("newgetLinks.txt")
            if os.path.exists("scrappedurls.txt"):
                os.remove("scrappedurls.txt")
        except OSError:
            # ignore files that are missing or cannot be removed
            pass
        # print "Status:",response.status
        # print "Request Headers"
        # print response.request.headers.items()
        # print "\n\n"
        # print "Response Headers"
        # print response.headers.items()
        # print "\n\n"

        login_user = self.credentials[response.request.url][0]
        print(login_user)
        login_pass = self.credentials[response.request.url][1]
        print(login_pass)
        args, url, method, name, number = fill_login_form(response.url, response.body, login_user, login_pass)

        if name:
            yield FormRequest.from_response(
                response, method=method, formdata=args, formname=name, callback=self.after_login
            )
        else:
            yield FormRequest.from_response(
                response, method=method, formdata=args, formnumber=number, callback=self.after_login
            )
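The snippet above relies on fill_login_form. For reference, the upstream loginform package exposes fill_login_form(url, body, username, password) and returns a 3-tuple (args, url, method), so the 5-tuple unpacked above appears to come from a modified fork. A minimal sketch against the upstream API (an assumption, not the fork used here):

from loginform import fill_login_form
from scrapy.http import FormRequest

def build_login_request(response, user, password, callback):
    # Fill the first login form found in the page body and turn it into a request.
    args, url, method = fill_login_form(response.url, response.body, user, password)
    return FormRequest(url, method=method, formdata=args,
                       dont_filter=True, callback=callback)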
Example #2
    def parseVoteSearchResults(self, response):
        sel = Selector(response)
        
        validationstr = self.getValidationString(response)
        
        # Parse the first page of results
        for voteItem in self.parseVoteTableResults(response):
            yield voteItem
        
        pages = 1
        
        # Grab the vote table
        voteTable = sel.css('#SelectVoteHistoryView_GridView1')
        rows = voteTable.css('tr')
        
        # The last row contains the page links
        paginationRow = rows[-1]
        
        firstCellElement = paginationRow.css('td>span::text')
        if not firstCellElement:
            # Can't find the navigate bar??
            return
        firstCellContent = firstCellElement.extract()[0]
        
        # Check if there are any pages..
        if str(firstCellContent).isdigit():
            thisPage = int(firstCellContent)-1
            
            # The last cell contains some js to skip to the final page
            lastCell = paginationRow.css('td')[-1]
            lastPageElem = lastCell.css('a::attr(onclick)')
            if len(lastPageElem) != 1:
                # We're on the last set of pages, so there's no last page link
                return
            
            lastPageLink = lastPageElem[0].extract()
    
            # Need to pull page number out of this: javascript:__gvSelectVoteHistoryView_GridView1.callback("69|0|/wFlpZg1Fw7NYQJzL0vuw+Y57ZPx1ug=|"); return false;
            # It's the first value in the param string
            # TODO: Error handling...
            pages = int(re.split(r'"|\|', lastPageLink)[1]) + 1
            
            # Only iterate over pages if this is a new table
            if response.meta['fetch']:
                for i in range(thisPage,min([pages,thisPage+11])):
                    fetch = False
                    if i == thisPage + 10:
                        fetch = True

                    if validationstr:
                        # The fact we have a validationstr means we need a surrogate form (this is a callback)
                        form = response.meta['form']
                        yield FormRequest.from_response(
                            form, callback=self.parseVoteSearchResults,
                            formdata={
                                '__CALLBACKID': 'SelectVoteHistoryView$GridView1',
                                '__CALLBACKPARAM': str(i) + '|0|/wFlpZg1Fw7NYQJzL0vuw+Y57ZPx1ug=||' + str(thisPage) + '|0|/wFlpZg1Fw7NYQJzL0vuw+Y57ZPx1ug=|/wFk7SDVa18iduwa7ivhFHIM55t+AhI=',
                                '__EVENTVALIDATION': validationstr},
                            meta={'fetch': fetch, 'form': form})
                    else:
                        # Should only get here if this is first page
                        #inspect_response(response)
                        yield FormRequest.from_response(
                            response, callback=self.parseVoteSearchResults,
                            formdata={
                                '__CALLBACKID': 'SelectVoteHistoryView$GridView1',
                                '__CALLBACKPARAM': str(i) + '|0|/wFlpZg1Fw7NYQJzL0vuw+Y57ZPx1ug=||' + str(thisPage) + '|0|/wFlpZg1Fw7NYQJzL0vuw+Y57ZPx1ug=|/wFk7SDVa18iduwa7ivhFHIM55t+AhI='},
                            meta={'fetch': fetch, 'form': response})
    def search(self, response):
        # Fill in form (search field) with default term "data scientist" and search.

        data_form = None
        if 'data_form' in response.meta.keys():
            data_form = response.meta['data_form']
        else:
            data_form = self.get_input_data(self.input_file)

        if not data_form:
            return
        elif len(data_form) == 1:
            key, value = data_form.popitem()
            return FormRequest.from_response(response,
                        formdata={"newNtk": key,
                                  "newNtt": value},
                        callback=self.parse_results)
        else:
            key, value = data_form.popitem()
            form_request = FormRequest.from_response(response,
                    formdata={"newNtk": key,
                              "newNtt": value},
                    callback=self.search)
            form_request.meta['data_form'] = data_form
            return form_request
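search() above depends on a get_input_data helper that is not shown. A hypothetical sketch (the one-pair-per-line file format and the 'All' field name are assumptions) that returns the field/term dict the method pops entries from:

    def get_input_data(self, input_file):
        # Hypothetical helper (not part of the original spider): read one
        # tab-separated "field<TAB>term" pair per line into a dict; fall back
        # to the default "data scientist" search used by this spider.
        data = {}
        try:
            with open(input_file) as f:
                for line in f:
                    key, _, value = line.rstrip('\n').partition('\t')
                    if key:
                        data[key] = value
        except OSError:
            pass
        return data or {'All': 'data scientist'}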
    def parse_start_url(self, response):
        base_data = {
            'name': '',
            'country': '',
            '_state': '1',
            'language': '0',
            'qualification': '0--',
            'level': '0',
            'speciality': '0',
            '_isTrainer': 'on'
        }

        data = base_data.copy()
        data['profession'] = str(PROFESSION)
        form_request = FormRequest.from_response(response,
            formdata=data)
        return form_request

        # Unreachable as written (the function already returned above); alternative
        # variant that submits one request per profession in PROFESSIONS.
        requests = []
        for profession in PROFESSIONS:
            data = base_data.copy()
            data['profession'] = str(profession)
            form_request = FormRequest.from_response(response,
                formdata=data)
            requests.append(form_request)
        return requests
    def parse(self, response):
        # args, url, method = fill_login_form(response.url, response.body, self.login_user, self.login_pass)
        # return FormRequest(url, method=method, formdata=args, dont_filter=True, callback=self.after_login)
        args, url, method, name, number = fill_login_form(response.url, response.body, self.login_user, self.login_pass)
        credentials = list()
        tmpparam = dict()
        print(args)
        for a in args:
            if a[0].find("user") > -1 or a[0].find("admin") > -1:
                tmpparam["userid"] = a[0]
            if a[0].find("password") > -1:
                tmpparam["passwordid"] = a[0]

        # tmpparam["submit"] = "submit"
        tmpparam["url"] = self.start_urls[0]
        tmpparam["login"] = ""
        credentials.append(tmpparam)
        with open("json/credentials.json", 'w') as f:
            f.write(json.dumps(credentials, indent=4, sort_keys=True))

        if name:
            yield FormRequest.from_response(response, method=method, formdata=args, formname=name,
                                            dont_filter=True, callback=self.after_login)
        else:
            yield FormRequest.from_response(response, method=method, formdata=args, formnumber=number,
                                            dont_filter=True, callback=self.after_login)
    def parse_from(self, response):
        selector = Selector(response)
        stop_ids = selector.css('#confirm1_ddlTravellingTo option:not([value="-1"])::attr(value)').extract()

        for stop_id in stop_ids:
            request = FormRequest.from_response(response,
                            formname='ctl01',
                            formdata={
                                'confirm1$ddlLeavingFromMap': response.meta['stop_from'],
                                'confirm1$ddlTravellingTo': stop_id,
                                'confirm1$btnSearch': 'Search',
                            },
                            callback=self.parse_to_stop)
            request.meta['stop_to'] = stop_id
            request.meta['stop_from'] = response.meta['stop_from']
            yield request

        # repeat to get the first stop (from)
        request = FormRequest.from_response(response,
                        formname='ctl01',
                        formdata={
                            'confirm1$ddlLeavingFromMap': response.meta['stop_from'],
                            'confirm1$ddlTravellingTo': stop_id,
                            'confirm1$btnSearch': 'Search',
                        },
                        callback=self.parse_from_stop)
        request.meta['stop_to'] = stop_id
        request.meta['stop_from'] = response.meta['stop_from']
        yield request
  def listing_parser(self, response):
    """
    Given a paginated listing of alumni, parse members and reparse individual page and rerun on next
    pages
    """
    # Get all pages
    x = HtmlXPathSelector(response)

    # if has numbered responses
    pages_hrefs = [i.extract().split("'")[1] for i in x.select("//div[contains(@class,'rgNumPart')]/a/@href")]
    pages = x.select("//div[contains(@class,'rgNumPart')]/a")

    requests = []

    # if the first link is a previous pages, delete it
    if "Previous Pages" in pages[0].extract():
      del pages_hrefs[0]

    # if next pages, pop and rerun listing_parser on it
    if "Next Pages" in pages[-1].extract():
      requests.append(FormRequest.from_response(response,
          formdata = {'__EVENTTARGET' : pages_hrefs.pop(), },
          callback=self.listing_parser))

    # parse each page including the first with scraper
    for href in pages_hrefs:
      requests.append(FormRequest.from_response(response,
          formdata = {'__EVENTTARGET' : href, },
          callback=self.listing_scraper))

    return requests
 def parse(self, response):
     # args, url, method = fill_login_form(response.url, response.body, self.login_user, self.login_pass)
     # return FormRequest(url, method=method, formdata=args, dont_filter=True, callback=self.after_login)
     args, url, method, name, number = fill_login_form(response.url, response.body, self.login_user, self.login_pass)
     if name:
         yield FormRequest.from_response(response, method=method, formdata=args, formname=name,
                                         dont_filter=True, callback=self.after_login)
     else:
         yield FormRequest.from_response(response, method=method, formdata=args, formnumber=number,
                                         dont_filter=True, callback=self.after_login)
Example #9
 def test_from_response_override_method(self):
     response = _buildresponse(
             '''<html><body>
             <form action="/app"></form>
             </body></html>''')
     request = FormRequest.from_response(response)
     self.assertEqual(request.method, 'GET')
     request = FormRequest.from_response(response, method='POST')
     self.assertEqual(request.method, 'POST')
Example #10
 def test_from_response_override_method(self):
     response = _buildresponse(
         """<html><body>
             <form action="/app"></form>
             </body></html>"""
     )
     request = FormRequest.from_response(response)
     self.assertEqual(request.method, "GET")
     request = FormRequest.from_response(response, method="POST")
     self.assertEqual(request.method, "POST")
 def test_from_response_override_url(self):
     response = _buildresponse(
             '''<html><body>
             <form action="/app"></form>
             </body></html>''')
     request = FormRequest.from_response(response)
     self.assertEqual(request.url, 'http://example.com/app')
     request = FormRequest.from_response(response, url='http://foo.bar/absolute')
     self.assertEqual(request.url, 'http://foo.bar/absolute')
     request = FormRequest.from_response(response, url='/relative')
     self.assertEqual(request.url, 'http://example.com/relative')
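These override tests call a _buildresponse helper that is not included in this listing. A minimal sketch, assuming it simply wraps the given markup in an HtmlResponse rooted at http://example.com (as the URL assertions imply):

from scrapy.http import HtmlResponse

def _buildresponse(body, **kwargs):
    # Stand-in for the helper the tests above rely on.
    kwargs.setdefault("body", body)
    kwargs.setdefault("url", "http://example.com")
    kwargs.setdefault("encoding", "utf-8")
    return HtmlResponse(**kwargs)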
 def parse(self, response):
     # DEBUG: Uncomment following line if restarting from specific record number.
     #return self.parse_main_list(response)
     # Submit empty form to obtain all data
     return [FormRequest.from_response(response,
                                       formdata={ "p_request": "RECHR" },
                                       callback=self.parse_main_list)]
 def parse(self, response):
     return FormRequest.from_response(
                         response,
                         formdata=data,
                         method="GET",
                         callback=self.search_result
                         )
Example #14
 def login(self, response):
     with open(self.login_path, encoding='utf8') as f:
         login_data = json.load(f)
     yield FormRequest.from_response(response,
                                     formdata=login_data,
                                     formid='login-nav',
                                     callback=self.check_login)
Example #15
 def login(self, response):
     self._log_page(response, 'amazon_login.html')
     return [FormRequest.from_response(response,
                                       formdata=self.formdata,
                                       headers=self.headers,
                                       meta={'cookiejar': response.meta['cookiejar']},
                                       callback=self.parse_item)]  # successful login
Example #16
 def parse(self, response):
     if 'login' in response.url:
         return [FormRequest.from_response(
             response,
             formdata={'j_username': self.username,
                       'j_password': self.password},
             callback=self.after_login)]
Example #17
 def after_login(self, response):
     if 'BIN Lookup' in response.body:
         for binnum in self.bins:
             binnum = binnum.strip()
             yield FormRequest.from_response(
                 response, formdata={'search': binnum},
                 callback=self.got_bin)
 def parse(self, response):
     yield FormRequest.from_response(
         response,
         formname='frmConsulta',
         formdata={'data':'(@DTDE >="'+self.iDate+'") E (@DTDE <="'+self.fDate+'")', 'b':'ACOR'},
         callback=self.parseSearchResults
     )
Example #19
 def course_terms_page(self, response):
     for term in self.terms:
         yield FormRequest.from_response(
                 response,
                 formxpath='/html/body/div[5]/form',
                 formdata={'TERM': term},
                 meta={'term': term})
 def parse(self, response):
     if parameter.login:
         args, url, method = fill_login_form(response.url, response.body, self.login_user, self.login_pass)
         #return FormRequest(url, method=method, formdata=args, callback=self.after_login)
         #args={'username':'******', 'password':'******'}
         #print args
         argsdict = {}
         print(response.headers)
         for i in args:
             argsdict[i[0]] = i[1]
         print(argsdict)
         #print response.request
         return FormRequest.from_response(
             response,
             formdata=argsdict,
             dont_filter=True,
             meta = {'dont_merge_cookies': True},
             callback=self.after_login
         )
     else:
         #args, url, method = fill_login_form(response.url, response.body, self.login_user, self.login_pass)
         return Request(
             response.url,
             meta = {'dont_merge_cookies': True},
             callback=self.parse_page
         )
 def login(self, response):
     """
     This is where I am stuck.  Obviously response is not defined.
     """
     return [FormRequest.from_response(response,
         formdata={'Email': '', 'Passwd': ''},
         callback = self.after_login)]
Example #22
    def parse_workspecs_list(self, response):
        log.msg('parse_workspecs_list : [%s]' % response.url)
        hxs = HtmlXPathSelector(response)
        current_page = extract_current_page(hxs)
        total_pages = extract_total_pages(hxs)
        log.msg('Current page : %d / %d' % (current_page, total_pages))
        workspecs_a = hxs.select('//a[contains(@id, "%s")]' % WORKSPEC_A_ID)
        log.msg('Number of workspecs : %d' % len(workspecs_a))
        for a in workspecs_a:
            item = WorkSpecItem()
            item['shortname'] = a.select('text()').extract()[0]
            url = a.select('@href').extract()[0]
            item['url'] = url
            item['phid'] = a.select('@href').re(phid_re)[0]
            yield item
            full_url = urljoin_rfc(response.url, url)
            self.crawler.stats.inc_value('workspec_pages_queued')
            yield Request(full_url, callback=self.parse_workspec_page,
                          dont_filter=True,
                          meta={'phid':item['phid']})

        if current_page < total_pages:
            formdata = dict(POST_DEFAULT_DATA)
            formdata[CURRENT_PAGE_INPUT] = str(current_page + 1)
            log.msg('Queueing next page : %d' % (current_page + 1))
            yield FormRequest.from_response(response, dont_click=True,
                    formdata=formdata, callback=self.parse_workspecs_list)
    def parse_send_email(self, response):
        user_id = response.meta["user_id"]
        username = response.meta["username"]

        if any(message in response.body for message in self.CANT_SEND_EMAIL_MESSAGES):
            can_send_email = False
        else:
            can_send_email = True

        if can_send_email:
            return [FormRequest.from_response(response,
                        formnumber=self.EMAIL_FORM_NUMBER,
                        formdata={
                            "emailsubject" : settings["TITLE"],
                            "message" : settings["MESSAGE"] % username
                        },
                        meta = {
                            "user_id" : user_id,
                            "dont_redirect" : True
                        },
                        callback=self.do_nothing,
                        dont_filter=True)]
        else:
            self.log("Cannot send email to user_id=%s" % user_id)
            return self.make_send_message_request(user_id, username)
    def parse(self, response):

        hxs = HtmlXPathSelector(response)
        id = hxs.select("//table[@class='lnkEvents']/tr/td[@class='tabUnselR']/a/@id").extract()[0].replace("_", "$")
        print(id)

        __EVENTTARGET = hxs.select("//input[@id='__EVENTTARGET']/@value").extract()
        __EVENTARGUMENT = hxs.select("//input[@id='__EVENTARGUMENT']/@value").extract()
        __LASTFOCUS = hxs.select("//input[@id='__LASTFOCUS']/@value").extract()
        __VIEWSTATE = hxs.select("//input[@id='__VIEWSTATE']/@value").extract()
        __PREVIOUSPAGE = hxs.select("//input[@id='__PREVIOUSPAGE']/@value").extract()
        __EVENTVALIDATION = hxs.select("//input[@id='__EVENTVALIDATION']/@value").extract()

        yield FormRequest.from_response(
            response,
            formdata={
                "__EVENTTARGET": id,
                "__EVENTARGUMENT": __EVENTARGUMENT,
                "__LASTFOCUS": __LASTFOCUS,
                "__VIEWSTATE": __VIEWSTATE,
                "__PREVIOUSPAGE": __PREVIOUSPAGE,
                "__EVENTVALIDATION": __EVENTVALIDATION,
            },
            callback=self.parse_event_list,
            dont_click=True,
            #                                    dont_filter=True,
        )
    def parse_send_message(self, response):
        user_id = response.meta["user_id"]
        username = response.meta["username"]
        can_send_message = not any(
            message in response.body for message in self.CANT_SEND_EMAIL_MESSAGES)

        if can_send_message:
            self.log("Send message to %s" % username)
            return [FormRequest.from_response(response,
                        formnumber=self.MESSAGE_FORM_NUMBER,
                        formdata={
                            "title" : settings["TITLE"],
                            "message" : settings["MESSAGE"] % username
                        },
                        meta = {
                            "user_id" : user_id,
                            "dont_redirect" : True
                        },
                        callback=self.do_nothing,
                        dont_filter=True)]
        else:
            self.log("Cannot send message to this user")
Example #26
    def _handle_captcha(self, response, callback):
        captcha_solve_try = response.meta.get('captcha_solve_try', 0)
        url = response.url
        self.log("Captcha challenge for %s (try %d)."
                 % (url, captcha_solve_try),
                 level=INFO)

        captcha = self._solve_captcha(response)

        if captcha is None:
            self.log(
                "Failed to guess captcha for '%s' (try: %d)." % (
                    url, captcha_solve_try),
                level=ERROR
            )
            result = None
        else:
            self.log(
                "On try %d, submitting captcha '%s' for '%s'." % (
                    captcha_solve_try, captcha, url),
                level=INFO
            )
            meta = response.meta.copy()
            meta['captcha_solve_try'] = captcha_solve_try + 1
            result = FormRequest.from_response(
                response,
                formname='',
                formdata={'field-keywords': captcha},
                callback=callback,
                dont_filter=True,
                meta=meta)

        return result
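_handle_captcha above calls a _solve_captcha helper that is not shown. A hypothetical sketch, mirroring the manual save-and-type approach used by the Douban login examples further down this page (the captcha image selector is an assumption):

    def _solve_captcha(self, response):
        # Hypothetical helper: save the captcha image locally and ask the operator
        # to type it in; return None if it cannot be solved.
        import urllib.request
        src = response.xpath('//img[contains(@src, "captcha")]/@src').extract_first()
        if not src:
            return None
        urllib.request.urlretrieve(response.urljoin(src), 'captcha.jpg')
        return input('Enter the characters shown in captcha.jpg: ') or None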
    def parse_send_message(self, response):
        user_id = response.meta["user_id"]
        username = response.meta["username"]
        can_send_message = not any(
            message in response.body for message in self.CANT_SEND_EMAIL_MESSAGES)

        if can_send_message:
            hxs = HtmlXPathSelector(response)
            form_token = hxs.select('//input[@name="form_token"]/@value').extract()[0].strip()
            creation_time = hxs.select('//input[@name="creation_time"]/@value').extract()[0].strip()
            
            return [FormRequest.from_response(response,
                        formnumber=self.MESSAGE_FORM_NUMBER,
                        formdata={
                            "subject" : settings["TITLE"],
                            "message" : settings["MESSAGE"] % username,
                            "post" : "Submit",
                            "creation_time" : creation_time,
                            "form_token" : form_token
                        },
                        meta = {
                            "user_id" : user_id,
                            "dont_redirect" : True
                        },
                        callback=self.do_nothing,
                        dont_filter=True)]
        else:
            self.log("Cannot send message to this user")
    def parse(self, response):
        if len(response.xpath('//div[@class="over18-notice"]')) > 0:
            if self._retries < PTTSpider.MAX_RETRY:
                self._retries += 1
                logging.warning('retry {} times...'.format(self._retries))
                yield FormRequest.from_response(response,
                                                formdata={'yes': 'yes'},
                                                callback=self.parse)
            else:
                logging.warning('!!!!!!!!!!!!!!!!!you cannot pass!!!!!!!!!!!!!!')
        else:
                filename = response.url.split('/')[-2] + '.html' 
                

                with open(filename, 'wb') as f:
                    f.write(response.body)
                    self._pages += 1

                for href in response.css('.r-ent > div.title > a::attr(href)'):
                    url = response.urljoin(href.extract())
                    yield scrapy.Request(url, callback=self.parse_post)

                if self._pages < PTTSpider.MAX_PAGES:
                    next_page = response.xpath(u'//div[@id="action-bar-container"]//a[contains(text(), "上頁")]/@href')
                    logging.warning(next_page)
                    logging.warning('231')
                    if next_page:
                        url = response.urljoin(next_page[0].extract())
                        yield scrapy.Request(url, self.parse)
                    else:
                        logging.warning('no next page')
                else:
                    logging.warning('max pages reached')
	def parse(self, response):
		hxs = HtmlXPathSelector(response)
		
		UFs = hxs.select('//*[@id="sgUe"]/option/@value').extract()
		for UF in UFs:
			if UF != '':
				yield FormRequest.from_response(response, formnumber=0, formdata={'acao':'pesquisar','sgUe':UF,'candidatura':'6','parcial':'0'}, callback=self.parseform, meta={'uf':UF})
Example #30
 def parse_legislative(self, response):
     yield FormRequest.from_response(response,
                                     formname='Form1',
                                     formdata={'Deptos': 'TODOS',
                                               'ir.x': '14',
                                               'ir.y': '11',
                                               }, callback=self.parse_politicianprofile)
Example #31
    def parse_home(self, response):
        '''
        This method has multiple purposes:
        1) Handle failed logins due to facebook 'save-device' redirection
        2) Set language interface, if not already provided
        3) Navigate to given page 
        '''
        #handle 'save-device' redirection
        if response.xpath("//div/a[contains(@href,'save-device')]"):
            self.logger.info('Going through the "save-device" checkpoint')
            return FormRequest.from_response(
                response,
                formdata={'name_action_selected': 'dont_save'},
                callback=self.parse_home)

        #set language interface
        if self.lang == '_':
            if response.xpath("//input[@placeholder='Search Facebook']"):
                self.logger.info('Language recognized: lang="en"')
                self.lang = 'en'
            elif response.xpath("//input[@placeholder='Buscar en Facebook']"):
                self.logger.info('Language recognized: lang="es"')
                self.lang = 'es'
            elif response.xpath(
                    "//input[@placeholder='Rechercher sur Facebook']"):
                self.logger.info('Language recognized: lang="fr"')
                self.lang = 'fr'
            elif response.xpath("//input[@placeholder='Cerca su Facebook']"):
                self.logger.info('Language recognized: lang="it"')
                self.lang = 'it'
            elif response.xpath(
                    "//input[@placeholder='Pesquisa no Facebook']"):
                self.logger.info('Language recognized: lang="pt"')
                self.lang = 'pt'
            else:
                raise AttributeError(
                    'Language not recognized\n'
                    'Change your interface lang from facebook '
                    'and try again')

        #navigate to provided page
        href = response.urljoin(self.page)
        self.logger.info('Scraping facebook page {}'.format(href))
        return scrapy.Request(url=href,
                              callback=self.parse_page,
                              meta={'index': 1})
Example #32
 def process_search_again(self, failure):
     # sometimes the scraper loses its place in the paginator (the server returns a 411 error);
     # when that occurs, the same request is sent again
     # until it can be fulfilled or it is manually aborted.
     if failure.check(HttpError):
         self.logger.info(
             '[ComprasVisibles] HTTPERROR, the page will be scraped again')
         next_page_request = FormRequest.from_response(
             self.controlPage,
             formname='aspnetForm',
             formid='aspnetForm',
             formdata=self.formdata,
             callback=self.process_search,
             priority=10,
             errback=self.process_search_again,
             dont_filter=True)
         yield next_page_request
 def parse(self, response):  # parse callback
     # response cookies
     Cookie1 = response.headers.getlist(
         'Set-Cookie')  # inspect the cookies the backend set in the browser on the first visit to the login page
     print(Cookie1)
     print('Logging in')
     """Second request: a form POST carrying the cookies, browser headers and the user's login data, to log in and get the cookies authorized."""
     return [
         FormRequest.from_response(
             response,
             url='http://zhjw.scu.edu.cn/j_spring_security_check',  # the real POST address
             meta={'cookiejar': response.meta['cookiejar']},
             headers=self.header,
             formdata=self.logindata,
             callback=self.next,
         )
     ]
Example #34
 def parse(self, response):
     # Get the URL of the captcha image; captcha will be a list
     captcha = response.xpath('//img[@id="captcha_image"]/@src').extract()
     # The login page sometimes shows a captcha and sometimes does not, so check
     # whether one is needed: if the captcha list has an element, a captcha is present
     if len(captcha) > 0:
         print("A captcha is required")
         # Local path where the captcha image will be stored
         localpath = "E:/python_down/captcha.png"
         # Download the captcha image from the server so we can look at it locally
         urllib.request.urlretrieve(captcha[0], filename=localpath)
         print("Please open the local file captcha.png and type the captcha:")
         # Wait for input() and store the typed captcha in captcha_value
         captcha_value = input()
         # Build the POST data
         data = {
             # login account, as field name: value
             "form_email": "18606590295",
             # login password, as field name: value; replace the credentials with your own
             "form_password": "******",
             # captcha, as field name: value
             "captcha-solution": captcha_value,
             # URL to redirect to after login; we crawl the personal profile page, so redirect there
             "redir": "https://www.douban.com/people/151968962/",
         }
     # Otherwise the captcha list is empty, so no captcha is needed this time
     else:
         print("No captcha this time")
         # POST data without the captcha field
         data = {
             "form_email": "18606590295",
             "form_password": "******",
             "redir": "https://www.douban.com/people/151968962/",
         }
     print("Logging in…")
     # Log in via FormRequest.from_response()
     return [FormRequest.from_response(response,
                                       # pass the cookie jar
                                       meta={"cookiejar": response.meta["cookiejar"]},
                                       # headers to mimic a browser would go here
                                       # POST form data
                                       formdata=data,
                                       # callback, here next()
                                       callback=self.next,
                                       )]
Example #35
 def parse(self, response):
     token = response.xpath('//input[@type="hidden"]/@value')[1].extract()
     open_in_browser(response)
     print("*************\n\n The token is", token)
     return FormRequest.from_response(response,
                                      formdata={
                                          'utf8': "✓",
                                          'authenticity_token': token,
                                          'redirect_to_ssl': "1",
                                          'pseudonym_session[unique_id]':
                                          "",
                                          'pseudonym_session[password]': "",
                                          'pseudonym_session[remember_me]':
                                          "0"
                                      },
                                      callback=self.scrape_pages,
                                      dont_filter=True)
Example #36
 def parse(self, response):
     # Get the URL of the captcha image; captcha will be a list
     captcha = response.xpath('//img[@id="captcha_image"]/@src').extract()
     # The login page sometimes shows a captcha and sometimes does not, so check
     # whether one is needed: if the captcha list has an element, a captcha is present
     if len(captcha) > 0:
         print("A captcha is required")
         # Local path where the captcha image will be stored
         localpath = "L:/MyPythonProgr/SomePythonProjects/AboutSpider/loginpjt/captcha.png"
         # Download the captcha image from the server so we can look at it locally
         urllib.request.urlretrieve(captcha[0], filename=localpath)
         print("Please open the local file captcha.png and type the captcha: ")
         # Wait for input() and store the typed captcha in captcha_value
         captcha_value = input()
         data = {                                       # POST data to submit
             "form_email": "*****@*****.**",       # login account
             "form_password": "******",        # login password
             'source': 'index_nav',
             "captcha-solution": captcha_value,         # captcha
             # URL to redirect to after login; we crawl the personal profile page, so redirect there
             "redir": "https://www.douban.com/people/182345352/",
         }
     else:
         print("No captcha this time")
         # POST data to submit, without the captcha field this time
         data = {
             "form_email": "*****@*****.**",
             "form_password": "******",
             "redir": "https://www.douban.com/people/182345352/",
         }
     print("Logging in...")
     # Log in via FormRequest.from_response()
     return [
         FormRequest.from_response(
             response,
             # pass the cookie jar
             meta={"cookiejar": response.meta["cookiejar"]},
             # mimic a browser
             headers=self.header,
             # POST form data
             formdata=data,
             # callback, here next()
             callback=self.next,
         )
     ]
Example #37
 def parse(self, response):
     # Get the URL of the captcha image
     captcha = response.xpath('//img[@id="captcha_image"]/@src').extract()
     # If there is a captcha
     if len(captcha) > 0:
         print("A captcha is required:")
         # Save the captcha image locally
         localpath = r"E:\program_file\python\crawler_learn\loginpjt\captcha.png"
         urllib.request.urlretrieve(captcha[0], filename=localpath)
         print("Please look at the image and type the captcha:")
         captcha_value = input()
         # POST data to submit
         data = {
             # login account
             "form_email": "15963020715",
             "form_password": "******",
             # captcha
             "captcha-solution": captcha_value,
             # URL to redirect to after login
             "redir": "https://www.douban.com/people/172627333/",
         }
     else:
         print("No captcha this time")
         # POST data to submit
         data = {
             # login account
             "form_email": "15963020715",
             "form_password": "******",
             # URL to redirect to after login
             "redir": "https://www.douban.com/people/172627333/",
         }
     print("Logging in...")
     # Log in via FormRequest.from_response()
     # return [FormRequest.from_response(response, meta={"cookiejar": response.meta["cookiejar"]},
     #                                      formdata=data,
     #                                      callback=self.next)]
     return [
         FormRequest.from_response(
             response,
             meta={"cookiejar": response.meta["cookiejar"]},
             headers=self.header,
             formdata=data,
             callback=self.next,
         )
     ]
Example #38
 def post_login(self, response):
     print('Preparing login')
     # FormRequest.from_response is a helper Scrapy provides for posting a form
     # After a successful login the after_login callback is invoked; the url can be omitted if it is the same as the Request page
     return [
         FormRequest.from_response(
             response,
             url='https://hr.travelsky.net/hr/',
             meta={'cookiejar': response.meta['cookiejar']},
             headers=self.headers,  # note the headers here
             formdata={
                 "act": "login",
                 "staff_num": "",  # staff number
                 "pass": "",  # password
             },
             callback=self.after_login,
             dont_filter=True)
     ]
Example #39
    def post_login(self, response):
        print('Preparing login')

        return [
            FormRequest.from_response(
                response,
                meta={'cookiejar': response.meta['cookiejar']},
                headers=self.headers,  # note the headers here
                formdata={
                    'log': 'admin',
                    'pwd': 'lngwordpress',
                    'wp-submit': '登录',
                    'redirect_to': 'http://121.42.223.111/wp-admin/index.php',
                    'testcookie': '1'
                },
                callback=self.after_login,
                dont_filter=True)
        ]
Example #40
    def parse(self, response):
        # Collect the input names on the page
        keys = response.xpath('//form//table//input/@name').extract()

        # Registration values
        values = [
            'gjw1', 'gjw1',
            '*****@*****.**' % self.i, '123456', '123456',
            self._get_recaptcha(response)
        ]
        # Zip the two into a dict
        form = dict(zip(keys, values))
        # Pass the dict as the form data
        yield FormRequest.from_response(response,
                                        formdata=form,
                                        dont_filter=True,
                                        callback=self.check,
                                        meta={'_form': form})
    def navigate_tabs(self, response):
        """Navigating results by submitting the search form"""

        soup = BeautifulSoup(response.body)
        items_count = int(
            soup.find('table', class_='resultsTbl').findAll('b')[2].string)
        tabs_count = items_count // 100 + (1 if items_count % 100 else 0)
        self.log('Items amount is {0}'.format(items_count))
        self.log('Tabs amount is {0}'.format(tabs_count))

        for page in range(1, tabs_count + 1):
            yield FormRequest.from_response(response,
                                            formname='refreshForm',
                                            formdata={
                                                'action': 'Advanced Search',
                                                'start': str(page)
                                            },
                                            callback=self.parse_page)
 def parse(self, response):
     # Build season IDs
     season_ids = ["0" + str(x) for x in range(28, 100)]
     season_ids += ["10" + str(x) for x in range(0, 10)]
     season_ids += ["1" + str(x) for x in range(10, 14)]
     for season in season_ids:
         yield FormRequest.from_response(
             response,
             url="http://www.lfp.es/includes/ajax.php",
             formname='estadisticas_historicas',
             formdata={
                 'input_competicion': 'Primera',
                 'input_temporada': str(season),
                 'input_temporada_nombre': str(season),
                 'action': 'estadisticas_historicas',
                 'tipo': 'clasificacion'
             },
             callback=self.parse_classification)
Example #43
    def parse_home(self, response):
        if response.xpath("//div/a[contains(@href,'save-device')]"):
            self.logger.info('Got stuck in "save-device" checkpoint')
            self.logger.info('I will now try to redirect to the correct page')
            return FormRequest.from_response(
                response,
                formdata={'name_action_selected': 'dont_save'},
                callback=self.parse_home)

        # the page shown after login asks whether to save the device; a refusal is sent
        href = response.urljoin(self.page)
        # page = os.getenv("PAGE")
        # navigate to provided page
        # href = response.urljoin(page)
        print(href)
        return scrapy.Request(url=href,
                              callback=self.parse_page,
                              meta={'index': 1})