Example 1
    def parse(self, response):
        #print '====start %s==' %response.url
        # test the status of hbase and thrift server
        if self.test_hbase:
            try:
                self.htable = HBaseTest(host=self.tool.HOST_HBASE1,
                                        table='origin')
                self.htable.close_trans()
                self.test_hbase = False
            except:
                raise CloseSpider('no thrift or hbase server!')

        # extract and parse the news items from the results page
        items = self.parse_items(response)
        # build an XPath Selector object for extracting page elements
        sel = Selector(response)
        # extract the next-page links from the search results page

        requests = []
        for url in sel.xpath(
                u'//div[@class="long-pages"]/a[text()="下一页"]/@href').re(
                    r'go\((\d*?)\)'):
            tp_url = re.sub(r'&pn=\d+?', '', response.url)
            requests.append(self.make_requests_from_url(tp_url + '&pn=' + url))

        for item in items:
            yield Request(url=item['url'],
                          meta={'item': item},
                          callback=self.parse_content)

        #return requests
        # following next-page links is disabled here: the 'continue' skips the yield
        for request in requests:
            continue
            yield request
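The next-page handling above captures the page number N from a go(N) call in the link's href, strips any existing &pn= parameter from the current URL, and appends the new page number. A standalone sketch of that URL rewrite, with an invented URL rather than the spider's actual target:

import re

def next_page_url(current_url, onclick_href):
    # capture N from something like "javascript:go(3)"
    match = re.search(r'go\((\d+)\)', onclick_href)
    if match is None:
        return None
    # drop any existing &pn= parameter, then append the new page number
    base = re.sub(r'&pn=\d+', '', current_url)
    return base + '&pn=' + match.group(1)

# illustrative usage (the URL is made up):
print(next_page_url('http://search.example.com/news?q=test&pn=1', 'javascript:go(2)'))
# -> http://search.example.com/news?q=test&pn=2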
Example 2
    def parse(self, response):
        # test the status of hbase and thrift server
        if self.test_hbase:
            try:
                self.htable = HBaseTest(host=self.tool.HOST_HBASE1,
                                        table='origin')
                self.htable.close_trans()
                self.test_hbase = False
            except:
                raise CloseSpider('no thrift or hbase server!')

        #print '====start %s==' %response.url
        # the query was not obtained successfully
        if response.url == self.domain_url:
            print('error of query')
            return

        # extract and parse the news items from the results page
        items = self.parse_items(response)
        # build an XPath Selector object for extracting page elements
        sel = Selector(response)
        # extract the next-page links from the search results page

        requests = []
        for url in sel.xpath(u'//a[@class="np"]/@href').extract():
            requests.append(self.make_requests_from_url(self.domain_url + url))

        for item in items:
            yield Request(url=item['url'], meta={'item': item}, callback=self.parse_content)
                    
        #return requests
        # following next-page links is disabled here: the 'continue' skips the yield
        for request in requests:
            continue
            yield request
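Here the next-page link comes from the anchor with class "np", and the relative href is joined onto self.domain_url before being requested. A minimal sketch of that extraction on inline HTML, assuming only that Scrapy is installed; the HTML and domain are invented for illustration:

from scrapy.selector import Selector

html = u'<div id="page"><a class="np" href="/s?wd=news&amp;pn=10">next page</a></div>'
sel = Selector(text=html)

# relative hrefs are joined onto the spider's domain_url before being requested
for href in sel.xpath(u'//a[@class="np"]/@href').extract():
    print('http://www.example.com' + href)
# -> http://www.example.com/s?wd=news&pn=10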
Example 3
    def parse(self, response):
        # test the status of hbase and thrift server
        if self.test_hbase:
            try:
                self.htable = HBaseTest(host=self.tool.HOST_HBASE1,
                                        table='origin')
                self.htable.close_trans()
                self.test_hbase = False
            except:
                raise CloseSpider('no thrift or hbase server!')
                        
        #print '====start %s==' %response.url
        
        # extract and parse the news items from the results page
        items = self.parse_items(response)

        for item in items:
            yield Request(url=item['url'], meta={'item': item}, callback=self.parse_content)
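All of these examples attach the partially filled item to the detail-page request through meta={'item': item} and hand it to parse_content, which the snippets do not show. A plausible counterpart is sketched below; the 'content' field and the XPath are assumptions, not the project's actual code:

    def parse_content(self, response):
        # recover the item that parse() attached via meta={'item': item}
        item = response.meta['item']
        # fill in whatever the detail page provides (field name and XPath are assumed)
        item['content'] = ' '.join(response.xpath('//p//text()').extract())
        yield item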
Example 4
    def parse(self, response):
        # test the status of hbase and thrift server
        if self.test_hbase:
            try:
                self.htable = HBaseTest(host=self.tool.HOST_HBASE1,
                                        table='origin')
                self.htable.close_trans()
                self.test_hbase = False
            except:
                raise CloseSpider('no thrift or hbase server!')

        print('====start %s==' % response.url)

        # extract and parse the news items from the results page
        items = self.parse_items(response)

        # try to find the following pages
        requests = []
        if response.url.find('page') < 0:
            # build an XPath Selector object for extracting page elements
            sel = Selector(response)
            page_num = sel.xpath('//div[@class="pg"]/label/span')

            if page_num:
                # strip the tags from the span, then read the total page count
                page_num = re.sub(r"<.*?>", "", page_num.extract()[0])
                page_num = int(re.search(r"(\d+)", page_num).group(1))
                for idx in range(2, page_num + 1):
                    url = response.url + ("&page=%d" % idx)
                    requests.append(self.make_requests_from_url(url))

        for item in items:
            yield Request(url=item['url'],
                          meta={'item': item},
                          callback=self.parse_content)
        #return requests
        for request in requests:
            yield request
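Example 4 reads the total page count out of the span inside div.pg label (presumably a forum pager whose text contains the number of pages) and then builds one request per remaining page. A standalone sketch of the count extraction and URL construction, with the HTML and URL invented for illustration:

import re
from scrapy.selector import Selector

html = u'<div class="pg"><label><span title="共 12 页"> / 12 页</span></label></div>'
span = Selector(text=html).xpath('//div[@class="pg"]/label/span')

# same steps as the spider: strip the tags, then take the first number as the page count
text = re.sub(r"<.*?>", "", span.extract()[0])
page_num = int(re.search(r"(\d+)", text).group(1))

# pages 2..page_num are then requested as <current url>&page=N
for idx in range(2, page_num + 1):
    print('http://bbs.example.com/forum?fid=1&page=%d' % idx)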