def _crawl_link(self, link, tor_port=9150):
    """Crawl a single link through a Tor session and persist the result.

    Builds a fresh Tor session, crawls *link*, stores the fetched page as
    a document and marks the link as crawled (successfully or not) in the
    manager.

    :param link: URL to crawl.
    :param tor_port: local Tor SOCKS port; defaults to 9150, the Tor
        Browser default (previously hard-coded).
    :return: the outbound links discovered by the spider when the crawl
        succeeded, otherwise an empty list.
    """
    spider = Spider(link, self.user_agent, get_tor_session(tor_port))
    spider.crawl()
    # Log a short prefix of the body. The original ``body[0::50]`` took
    # every 50th character — a sampling, not a snippet — which is almost
    # certainly not what a debug preview is meant to show.
    self.log.debug(
        'Creating document for: {0}, title {1}, body: {2}'.format(
            link, spider.title, spider.body[:50]))
    self._create_document(link, spider.title, spider.html)
    self._manager.mark_link_crawled(link, spider.success)
    return spider.links if spider.success else []
class Test(TestCase):
    """Integration test: crawl a patched term/major set into a scratch
    MongoDB and verify the resulting collection sizes."""

    def setUp(self):
        mongo = pymongo.MongoClient()
        mongo.drop_database('test')
        scratch_db = mongo['test']
        self.term_code = '021'
        self.major_code = '0120123111'
        # Restrict the crawl to exactly one (term, None) pair and one
        # (term, major) pair instead of iterating everything.
        self.p = mock.patch(
            'spider.spider.Spider.iter_term_and_major',
            lambda v: ((self.term_code, None),
                       (self.term_code, self.major_code))
        )
        self.p.start()
        self.shortcut = hfut.Student(2013217413, '123456789012', 'XC')
        self.job_manager = JobManager(pool_size=20)
        self.db_manager = DatabaseManager(scratch_db, batch_size=80)
        self.j = Spider(self.shortcut, self.job_manager, self.db_manager)

    def tearDown(self):
        self.p.stop()

    @profile
    def test_dfs_stability(self):
        self.j.crawl()
        self.check()

    def check(self):
        # 'major' and 'term' stay empty because iter_term_and_major is
        # patched out in setUp.
        expected_counts = {
            'major': 0,
            'term': 0,
            'course': 9,
            'plan': 9,
            'class': 201,
            'student': 2621,
            'class_student': 20236,
        }
        for collection, expected in expected_counts.items():
            self.assertEqual(
                self.db_manager.db[collection].count(), expected)