def _build(self, config):
    """Configure this object from a config dict.

    Reads ``config['name']``, registers and loads a logger under that
    name, then instantiates the scraper, the frontier and every handler
    named in ``config``. Each component class is looked up with
    ``Scraper.get`` / ``Frontier.get`` / ``Handler.get`` and constructed
    with this object as its only argument; an optional ``'args'`` entry
    is forwarded to the component's ``setargs``.

    :param config: dict with the keys ``'name'``, ``'scraper'``,
        ``'frontier'`` and ``'handlers'``; ``'debug'`` is optional.
    :raises PyCrawlerException: if a ``KeyError`` is raised while
        building (typically a missing config key).
    """
    try:
        self.name = config['name']
        Logger.register(self.name)
        Logger.load()
        self.logger = Logger(self.name)
        self.debug = config.get('debug', True)
        self.logger.info(self.name, 'Start building...')

        self.scraper = Scraper.get(config['scraper']['name'])(self)
        if 'args' in config['scraper']:
            self.scraper.setargs(config['scraper']['args'])

        self.frontier = Frontier.get(config['frontier']['name'])(self)
        if 'args' in config['frontier']:
            self.frontier.setargs(config['frontier']['args'])

        # Build into a fresh list and swap it in at the end, so calling
        # _build() a second time replaces the handlers instead of
        # appending duplicates to the previous set.
        handlers = []
        for each in config['handlers']:
            handler = Handler.get(each['name'])(self)
            if 'args' in each:
                handler.setargs(each['args'])
            handlers.append(handler)
        self.handlers = handlers

        self.logger.info(self.name, 'Build successful!')
    except KeyError as e:
        # format() instead of '+' so a non-string key (or a KeyError raised
        # without arguments) cannot turn this into a TypeError/IndexError
        # that hides the real problem.
        missing = e.args[0] if e.args else ''
        raise PyCrawlerException(
            "Key '{0}' missing in config dict".format(missing))
def test_register(self):
    """Logger.register('Test') must add the 'Test-*' entries to LoggingConfig."""
    Logger.register('Test')

    known_loggers = LoggingConfig['loggers']
    self.assertIn('Test-logger', known_loggers)

    known_handlers = LoggingConfig['handlers']
    self.assertIn('Test-file', known_handlers)
    self.assertIn('Test-console', known_handlers)
def test__fmt(self):
    """_fmt('test', 'test') must render as '[test] test'."""
    Logger.load()
    default_logger = Logger('Default')
    rendered = default_logger._fmt('test', 'test')
    self.assertEqual('[test] test', rendered)
def test_error(self):
    """Smoke test: error() on the 'Default' logger must not raise."""
    Logger.load()
    default_logger = Logger('Default')
    default_logger.error('unittest', 'testerror')
def test_warning(self):
    """Smoke test: warning() on the 'Default' logger must not raise."""
    Logger.load()
    default_logger = Logger('Default')
    default_logger.warning('unittest', 'testwarning')
def test_info(self):
    """Smoke test: info() on the 'Default' logger must not raise."""
    Logger.load()
    default_logger = Logger('Default')
    default_logger.info('unittest', 'testinfo')
def test_debug(self):
    """Smoke test: debug() on the 'Default' logger must not raise."""
    Logger.load()
    default_logger = Logger('Default')
    default_logger.debug('unittest', 'testdebug')
def test_load(self):
    """After Logger.load(), a Logger must wrap a standard logging.Logger."""
    Logger.load()
    wrapper = Logger('Default')
    underlying = wrapper._logger
    self.assertIsInstance(underlying, logging.Logger)
class Spider(Thread): def __init__(self, config, debug=True): super(Spider, self).__init__() self.name = '' self.config = config self.scraper = None self.frontier = None self.handlers = [] self.logger = None self.debug = debug self._build(config) self.keep = True self.ispaused = False def _build(self, config): try: self.name = config['name'] Logger.register(self.name) Logger.load() self.logger = Logger(self.name) self.debug = config.get('debug', True) self.logger.info(self.name, 'Start building...') self.scraper = Scraper.get(config['scraper']['name'])(self) if 'args' in config['scraper']: self.scraper.setargs(config['scraper']['args']) self.frontier = Frontier.get(config['frontier']['name'])(self) if 'args' in config['frontier']: self.frontier.setargs(config['frontier']['args']) for each in config['handlers']: handler = Handler.get(each['name'])(self) if 'args' in each: handler.setargs(each['args']) self.handlers.append(handler) self.logger.info(self.name, 'Build successful!') except KeyError as e: raise PyCrawlerException('Key \''+e.args[0]+'\' missing in config dict') def reload(self, config): self._build(config) def addtask(self, task): self.frontier.add(task) def run(self): self.logger.warning(self.name, 'Start crawling...') while self.frontier.hasnext() and self.keep: urls = self.frontier.next(100) results = self.scraper.fetch(urls) for url, body in results.iteritems(): for handler in self.handlers: items = handler.parse(body, url) if isinstance(items, list): for item in items: item.save() elif isinstance(items, Document): items.save() self._checkpause() self.logger.warning(self.name, 'Crawling finished!') def recover(self, filename): if not os.path.exists(filename): self.logger.info(self.name, 'File {0} not found'.format(filename)) else: self.logger.info(self.name, 'Recovering from '+filename) count = 0 with open(filename, 'r') as f: for each in f.readlines(): url, body = self.scraper.fetchone(each) if not body: continue count += 1 for handler in 
self.handlers: items = handler.parse(body, url) try: if isinstance(items, list): for item in items: item.save() elif isinstance(items, Document): items.save() except AttributeError: raise PyCrawlerException('Items must implement save() method.') self.logger.info(self.name, 'Recovered {0} urls'.format(count)) def clean(self, *args): self.frontier.clean(*args) def report(self): s = self.summary() results = [self.name+' report:', 'Todo urls: {0}'.format(s['todo']), 'Visited urls: {0}'.format(s['visited']), 'Failed urls: {0}'.format(s['failed'])] for each in results: print(each) def summary(self): result = {'todo': len(self.frontier), 'visited': len(self.frontier.visitednum()), 'failed': 'Not supported'} return result def pause(self): if self.isAlive() and not self.ispaused: self.ispaused = True def resume(self): if self.isAlive() and self.ispaused: self.ispaused = False def retire(self): if self.isAlive(): self.logger.info(self.name, 'Stopped by driver') self.ispaused = False self.keep = False def _checkpause(self): if self.ispaused: self.logger.info(self.name, 'Paused by driver') while self.ispaused: pass else: self.logger.info(self.name, 'Resumed by driver')