Example #1
 def get_next_page_request(self, response):
     # noinspection PyPropertyAccess
     br = self.br
     mechanize_response = response_scrapy2mechanize(response)
     br.set_response(mechanize_response)
     default_encoding = self._site_default_encoding
     encoding = default_encoding if default_encoding else response.encoding
     next_page_word = self.next_page_word.encode(encoding)
     next_page_link = self.get_next_page_link()
     # noinspection PyUnusedLocal
     try:
         if next_page_link:
             next_page_request = br.click_link(link=next_page_link)
         else:
             next_page_request = br.click_link(text=next_page_word)
         scrapy_request = request_mechanize2scrapy(next_page_request)
         scrapy_request.callback = self.query_callback
         return scrapy_request
     except LinkNotFoundError as e:
         return None
     except Exception as e:
         #reaching here usually means the parsed "next page" link is a javascript link
         #this log line is monitored by keyword
         log.msg('spider turn page error:%s' % str(e), level=log.INFO)
         return None
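
The helpers response_scrapy2mechanize and request_mechanize2scrapy used throughout these examples are project-local and not shown here. As a rough orientation only, here is a minimal sketch of what response_scrapy2mechanize might do, assuming all it has to carry over is the body, headers, url and status via mechanize.make_response:

 import mechanize

 def response_scrapy2mechanize(response):
     # Assumed helper, not the project's real one: wrap a scrapy Response
     # so that br.set_response() accepts it.
     # scrapy stores headers as {name: [values]}; flatten them into pairs.
     headers = [(name, value)
                for name, values in response.headers.items()
                for value in values]
     return mechanize.make_response(response.body, headers, response.url,
                                    response.status, 'OK')
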
Example #2
 def get_next_page_request(self, response):
     br = self.br
     mechanize_response = response_scrapy2mechanize(response)
     br.set_response(mechanize_response)
     encoding = response.encoding
     next_page_word = self.next_page_word.encode(encoding)
     next_page_link = self.get_next_page_link()
     try:
         if next_page_link:
             next_page_request = br.click_link(link=next_page_link)
         else:
             next_page_request = br.click_link(text=next_page_word)
         if next_page_request:
             # mechanize found the link, but build the next url ourselves by
             # bumping the 'page' query parameter of the current response url
             url = response.url
             query = get_url_query(url)
             page = str(int(query.get('page', '1')) + 1)
             query['page'] = page
             url = change_url_query(url, query)
             scrapy_request = Request(url=url, callback=self.query_callback)
             return scrapy_request
         else:
             return None
     except LinkNotFoundError as e:
         return None
     except Exception as e:
         self.log('spider turn page error:%s' % e, level=log.INFO)
         return None
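
get_url_query and change_url_query are also project-local. Judging only by how the examples use them (a plain dict of query parameters in, a rebuilt url out), they could be sketched with the standard library along these lines:

 import urllib
 import urlparse

 def get_url_query(url):
     # Assumed behaviour: return the query string of url as a plain dict.
     return dict(urlparse.parse_qsl(urlparse.urlparse(url).query))

 def change_url_query(url, query):
     # Assumed behaviour: replace the query string of url with the given dict.
     parts = list(urlparse.urlparse(url))
     parts[4] = urllib.urlencode(query)
     return urlparse.urlunparse(parts)
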
Example #3
 def get_query_request(self, response):
     intime = self.intime
     if intime == '全部时间':  # '全部时间' means "all time"; fall back to the default query request
         return super(SogouBbsSpider, self).get_query_request(response)
     # noinspection PyPropertyAccess
     br = self.br
     mechanize_response = response_scrapy2mechanize(response)
     br.set_response(mechanize_response)
     br.select_form(nr=self.search_form_order)
     query = response.meta['query']
     encoding = response.encoding
     query = query.encode(encoding)
     search_input_name = self.search_input_name.encode(encoding)
     br[search_input_name] = query
     br.submit()
     intime = intime.encode('utf8')
     query_request = br.click_link(text=intime)
     scrapy_request = request_mechanize2scrapy(query_request)
     scrapy_request.callback = self.query_callback
     url = scrapy_request.url
     query = get_url_query(url)
     query['num'] = 100
     new_url = change_url_query(url, query)
     new_request = scrapy_request.replace(url=new_url)
     return new_request
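
How these methods are driven is not shown in the snippets; presumably the spider's parse callback for the search landing page simply returns whatever request get_query_request builds. A hypothetical wiring sketch (parse and its placement are assumptions):

 def parse(self, response):
     # Hypothetical wiring: hand the landing page to get_query_request and
     # let scrapy schedule the Request it builds, if any.
     request = self.get_query_request(response)
     if request:
         yield request
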
Example #4
 def get_query_request(self, response):
     # noinspection PyPropertyAccess
     br = self.br
     mechanize_response = response_scrapy2mechanize(response)
     br.set_response(mechanize_response)
     br.select_form(nr=self.search_form_order)
     query = response.meta['query']
     default_encoding = self._site_default_encoding
     encoding = default_encoding if default_encoding else response.encoding
     query = query.encode(encoding)
     search_input_name = self.search_input_name.encode(encoding)
     br[search_input_name] = query
     query_request = br.click()
     scrapy_request = request_mechanize2scrapy(query_request)
     scrapy_request.callback = self.query_callback
     return scrapy_request
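
request_mechanize2scrapy goes the other way: br.click() returns a mechanize (urllib2-style) Request, which already carries everything a scrapy Request needs. Again this is only a sketch of an assumed implementation, not the project's actual helper:

 from scrapy.http import Request

 def request_mechanize2scrapy(mechanize_request):
     # Assumed helper: map the urllib2-style Request that mechanize builds
     # onto scrapy's Request. get_method() is 'GET' or 'POST'; get_data()
     # holds the urlencoded form body for POST submits, or None.
     return Request(url=mechanize_request.get_full_url(),
                    method=mechanize_request.get_method(),
                    body=mechanize_request.get_data() or '',
                    headers=dict(mechanize_request.header_items()),
                    dont_filter=True)

dont_filter is set here only because re-submitting the same search form twice would otherwise be dropped by scrapy's duplicate filter; whether the real helper does this is unknown.
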
Example #5
 def get_query_request(self, response):
     """
     填表单,构造相应请求
     """
     # noinspection PyPropertyAccess
     br = self.br
     mechanize_response = response_scrapy2mechanize(response)
     br.set_response(mechanize_response)
     br.select_form(nr=self.search_form_order)
     query = response.meta['query']
     encoding = response.encoding
     query = query.encode(encoding)
     search_input_name = self.search_input_name.encode(encoding)
     br[search_input_name] = query
     br[b'rn'] = [self.item_count_per_page]
     activate_controls(br.form)
     if not self.begin_date and not self.end_date:
         br[b's'] = [b'1']
         br[b'begin_date'] = b''
         br[b'end_date'] = b''
     else:
         br[b's'] = [b'2']
         br[b'begin_date'] = self.begin_date.encode(encoding)
         br[b'end_date'] = self.end_date.encode(encoding)
         y0, m0, d0 = self.begin_date.split('-')
         y1, m1, d1 = self.end_date.split('-')
         br[b'y0'] = y0.encode(encoding)
         br[b'm0'] = m0.encode(encoding)
         br[b'd0'] = d0.encode(encoding)
         br[b'y1'] = y1.encode(encoding)
         br[b'm1'] = m1.encode(encoding)
         br[b'd1'] = d1.encode(encoding)
         br[b'bt'] = str(int(time.mktime(time.strptime(self.begin_date, '%Y-%m-%d'))))
         br[b'et'] = str(int(time.mktime(time.strptime(self.end_date, '%Y-%m-%d'))))
     query_request = br.click()
     scrapy_request = request_mechanize2scrapy(query_request)
     scrapy_request.callback = self.query_callback
     return scrapy_request
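
activate_controls is another project-local helper. The usual trick it stands for in mechanize code is clearing the readonly/disabled flags on the selected form's controls so that hidden fields such as begin_date/end_date can be written to; a sketch under that assumption:

 def activate_controls(form):
     # Assumed helper: make every control of the mechanize form writable so
     # the hidden date fields above can be set without mechanize raising.
     for control in form.controls:
         control.readonly = False
         control.disabled = False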