def parse(self, response):
    """Parse a search-result page.

    Performs a one-time HBase/Thrift health check, yields a content
    request per extracted news item, and follows "next page" links.

    :param response: Scrapy Response for a search-result page.
    :yields: scrapy Request objects (item detail pages + next pages).
    :raises CloseSpider: when the HBase/Thrift backend is unreachable.
    """
    # One-time health check of the storage backend; abort the whole
    # crawl if HBase/Thrift cannot be reached.
    if self.test_hbase:
        try:
            self.htable = HBaseTest(host=self.tool.HOST_HBASE1, table='origin')
            self.htable.close_trans()
            self.test_hbase = False
        except Exception:
            # FIX: narrowed bare `except:` so SystemExit/KeyboardInterrupt
            # are not swallowed before raising CloseSpider.
            raise CloseSpider('no thrift or hbase server!')
    # Extract and parse the news entries from this result page.
    items = self.parse_items(response)
    # XPath selector over the page for link extraction.
    sel = Selector(response)
    # Build follow-up requests for the "next page" pagination links.
    # The page number is captured from the go(N) javascript call and
    # substituted into the current URL's &pn= parameter.
    requests = []
    for url in sel.xpath(
            u'//div[@class="long-pages"]/a[text()="下一页"]/@href').re(
            r'go\(([\d]*?)\)'):
        tp_url = re.sub(r'&pn=[\d]+?', '', response.url)
        requests.append(self.make_requests_from_url(tp_url + '&pn=' + url))
    for item in items:
        yield Request(url=item['url'], meta={'item': item},
                      callback=self.parse_content)
    # FIX: removed a stray `continue` that preceded `yield request`,
    # which made the pagination yield unreachable (next pages were
    # collected but never scheduled). If pagination was intentionally
    # disabled, gate it with a flag instead of dead code.
    for request in requests:
        yield request
def parse(self,response): # test the status of hbase and thrift server if self.test_hbase: try: self.htable=HBaseTest(host = self.tool.HOST_HBASE1, table = 'origin') self.htable.close_trans() self.test_hbase = False except: raise CloseSpider('no thrift or hbase server!') #print '====start %s==' %response.url #未成功获取query if response.url == self.domain_url: print 'error of query' return #抽取并解析新闻网页内容 items = self.parse_items(response) #构造一个Xpath的select对象,用来进行网页元素抽取 sel = Selector(response) #抽取搜索结果页详细页面链接 requests = [] for url in sel.xpath(u'//a[@class="np"]/@href').extract(): requests.append(self.make_requests_from_url(self.domain_url + url)) for item in items: yield Request(url=item['url'], meta={'item': item}, callback=self.parse_content) #return requests for request in requests: continue yield request
def parse(self, response):
    """Parse a search-result page (no pagination in this spider).

    Performs a one-time HBase/Thrift health check, then yields one
    content request per extracted news item.

    :param response: Scrapy Response for a search-result page.
    :yields: scrapy Request objects for item detail pages.
    :raises CloseSpider: when the HBase/Thrift backend is unreachable.
    """
    # One-time health check of the storage backend; abort the whole
    # crawl if HBase/Thrift cannot be reached.
    if self.test_hbase:
        try:
            self.htable = HBaseTest(host=self.tool.HOST_HBASE1, table='origin')
            self.htable.close_trans()
            self.test_hbase = False
        except Exception:
            # FIX: narrowed bare `except:` so SystemExit/KeyboardInterrupt
            # are not swallowed before raising CloseSpider.
            raise CloseSpider('no thrift or hbase server!')
    # Extract and parse the news entries, then schedule a detail-page
    # request for each, carrying the partially-filled item in meta.
    items = self.parse_items(response)
    for item in items:
        yield Request(url=item['url'], meta={'item': item},
                      callback=self.parse_content)
def parse(self, response): # test the status of hbase and thrift server if self.test_hbase: try: self.htable = HBaseTest(host=self.tool.HOST_HBASE1, table='origin') self.htable.close_trans() self.test_hbase = False except: raise CloseSpider('no thrift or hbase server!') print '====start %s==' % response.url #抽取并解析新闻网页内容 items = self.parse_items(response) #尝试寻找下一页 requests = [] if response.url.find('page') < 0: #构造一个Xpath的select对象,用来进行网页元素抽取 sel = Selector(response) page_num = sel.xpath('//div[@class="pg"]/label/span') if page_num: page_num = re.sub("<.*?>", "", page_num.extract()[0]) page_num = int(re.search("([\d]+)", page_num).group(1)) for idx in range(2, page_num + 1): url = response.url + ("&page=%d" % idx) requests.append(self.make_requests_from_url(url)) for item in items: yield Request(url=item['url'], meta={'item': item}, callback=self.parse_content) #return requests for request in requests: yield request