class TestDepthMiddleware(TestCase):
    """Tests for DepthMiddleware: depth limiting and depth-stat collection.

    Fix: ``assertEquals`` is a deprecated alias removed in Python 3.12;
    replaced with ``assertEqual`` throughout.
    """

    def setUp(self):
        # Build a real crawler/stats pair so the middleware records into
        # a live StatsCollector; maxdepth=1, verbose_stats=True.
        crawler = get_crawler(Spider)
        self.spider = crawler._create_spider('scrapytest.org')

        self.stats = StatsCollector(crawler)
        self.stats.open_spider(self.spider)

        self.mw = DepthMiddleware(1, self.stats, True)

    def test_process_spider_output(self):
        req = Request('http://scrapytest.org')
        resp = Response('http://scrapytest.org')
        resp.request = req
        result = [Request('http://scrapytest.org')]

        # Depth 0 -> 1: request is within maxdepth and passes through.
        out = list(self.mw.process_spider_output(resp, result, self.spider))
        self.assertEqual(out, result)

        # Verbose stats record a per-depth counter.
        rdc = self.stats.get_value('request_depth_count/1', spider=self.spider)
        self.assertEqual(rdc, 1)

        # Depth 1 -> 2 exceeds maxdepth=1: request is dropped.
        req.meta['depth'] = 1
        out2 = list(self.mw.process_spider_output(resp, result, self.spider))
        self.assertEqual(out2, [])

        # Max depth actually reached stays at 1.
        rdm = self.stats.get_value('request_depth_max', spider=self.spider)
        self.assertEqual(rdm, 1)

    def tearDown(self):
        self.stats.close_spider(self.spider, '')
class TestDepthMiddleware(TestCase):
    """Tests for DepthMiddleware: depth limiting and depth-stat collection.

    Fix: ``assertEquals`` is a deprecated alias removed in Python 3.12;
    replaced with ``assertEqual`` throughout.
    """

    def setUp(self):
        # Build a real crawler/stats pair so the middleware records into
        # a live StatsCollector; maxdepth=1, verbose_stats=True.
        crawler = get_crawler(Spider)
        self.spider = crawler._create_spider("scrapytest.org")

        self.stats = StatsCollector(crawler)
        self.stats.open_spider(self.spider)

        self.mw = DepthMiddleware(1, self.stats, True)

    def test_process_spider_output(self):
        req = Request("http://scrapytest.org")
        resp = Response("http://scrapytest.org")
        resp.request = req
        result = [Request("http://scrapytest.org")]

        # Depth 0 -> 1: request is within maxdepth and passes through.
        out = list(self.mw.process_spider_output(resp, result, self.spider))
        self.assertEqual(out, result)

        # Verbose stats record a per-depth counter.
        rdc = self.stats.get_value("request_depth_count/1", spider=self.spider)
        self.assertEqual(rdc, 1)

        # Depth 1 -> 2 exceeds maxdepth=1: request is dropped.
        req.meta["depth"] = 1
        out2 = list(self.mw.process_spider_output(resp, result, self.spider))
        self.assertEqual(out2, [])

        # Max depth actually reached stays at 1.
        rdm = self.stats.get_value("request_depth_max", spider=self.spider)
        self.assertEqual(rdm, 1)

    def tearDown(self):
        self.stats.close_spider(self.spider, "")
def parse(self, response):
    """Parse a Zhihu JSON response into question items, then paginate.

    The response body is JSON whose ``msg[1]`` field holds an HTML
    fragment; each Question schema.org block becomes one ZhihuapiItem.
    When blocks are present, the last block's ``data-score`` is used as
    the offset for the next POST request.

    Fixes:
      * ``last_data_score`` was referenced after the if/else even when no
        question blocks were found, raising NameError — the empty case now
        returns instead of falling through.
      * ``question_blocks[len(question_blocks) - 1]`` simplified to
        negative indexing.
    """
    # Extract the HTML fragment embedded in the JSON payload and select
    # every schema.org Question block.
    question_blocks = Selector(
        text=json.loads(response.body.decode("utf-8"))['msg'][1]).xpath(
            '//div[contains(@itemtype, "http://schema.org/Question")]')

    for question_block in question_blocks:
        item = ZhihuapiItem()
        item['question_name'] = question_block.xpath(
            './/div/div/h2/a/text()').extract_first()
        item['question_url'] = question_block.xpath(
            './/div/div/h2/a/@href').extract_first()
        item['question_answer'] = question_block.xpath(
            './/div/div/div[1]/div[5]/div/a/@href').extract_first()
        item['question_answer_author_profile'] = question_block.xpath(
            './/div/div/div[1]/div[3]/span/span[1]/a/@href').extract_first()
        item['question_answer_author'] = question_block.xpath(
            './/div/div/div[1]/div[3]/span/span[1]/a/text()').extract_first()

        self.logger.info(
            'Question info: question name - {}, question answer - {}, question url - {}, question answer author profile - {}, question answer author - {}'
            .format(item['question_name'], item['question_answer'],
                    item['question_url'],
                    item['question_answer_author_profile'],
                    item['question_answer_author']))
        yield item

    if not question_blocks:
        self.logger.info("No more new questions, waiting to stop...")
        # NOTE(review): calling the unbound StatsCollector.close_spider with
        # the spider instance as `self` looks wrong — the usual way to stop
        # a crawl is raising CloseSpider; confirm intent before changing.
        StatsCollector.close_spider(self, spider=zhihuSpider,
                                    reason="No more questions...")
        # Stop here: there is no data-score to paginate from.
        return

    # Use the last block's data-score as the offset for the next page.
    last_data_score = question_blocks[-1].xpath('@data-score').extract_first()

    self.logger.info('Last Data Score is - {}'.format(last_data_score))
    yield scrapy.http.FormRequest(self.topic_url,
                                  method='POST',
                                  headers=self.headers,
                                  formdata={
                                      'start': '0',
                                      'offset': str(last_data_score)
                                  },
                                  callback=self.parse)