Beispiel #1
0
 def _build(self, config):
     """Configure this object from the ``config`` dict.

     Reads ``config['name']``, registers and loads the logging setup for
     that name, then instantiates the scraper, the frontier and every
     handler named in ``config``. Each component class is looked up via
     ``Scraper.get`` / ``Frontier.get`` / ``Handler.get`` and constructed
     with this object as its only argument; an optional ``'args'`` entry
     is forwarded to the component's ``setargs``.

     :param config: dict with the keys ``'name'``, ``'scraper'``,
         ``'frontier'`` and ``'handlers'``; ``'debug'`` is optional and
         defaults to ``True``.
     :raises PyCrawlerException: if a ``KeyError`` is raised anywhere
         during the build (typically a missing config key).
     """
     try:
         self.name = config['name']
         Logger.register(self.name)
         Logger.load()
         self.logger = Logger(self.name)
         self.debug = config.get('debug', True)
         self.logger.info(self.name, 'Start building...')

         scraper_conf = config['scraper']
         self.scraper = Scraper.get(scraper_conf['name'])(self)
         if 'args' in scraper_conf:
             self.scraper.setargs(scraper_conf['args'])

         frontier_conf = config['frontier']
         self.frontier = Frontier.get(frontier_conf['name'])(self)
         if 'args' in frontier_conf:
             self.frontier.setargs(frontier_conf['args'])

         # Collect into a fresh list and assign it at the end: building
         # twice must not duplicate handlers, and a failure part-way
         # through must not leave a half-extended list behind.
         handlers = []
         for spec in config['handlers']:
             handler = Handler.get(spec['name'])(self)
             if 'args' in spec:
                 handler.setargs(spec['args'])
             handlers.append(handler)
         self.handlers = handlers

         self.logger.info(self.name, 'Build successful!')
     except KeyError as e:
         # %-formatting instead of '+' so a non-string key (e.g. an int)
         # cannot turn the error report itself into a TypeError.
         missing = e.args[0] if e.args else ''
         raise PyCrawlerException(
             "Key '%s' missing in config dict" % (missing,))
Beispiel #2
0
 def test_register(self):
     """Registering 'Test' adds its logger and both handlers to LoggingConfig."""
     Logger.register('Test')
     # (expected entry, LoggingConfig section) pairs, checked in order.
     expected_entries = (
         ('Test-logger', 'loggers'),
         ('Test-file', 'handlers'),
         ('Test-console', 'handlers'),
     )
     for entry, section in expected_entries:
         self.assertIn(entry, LoggingConfig[section])
Beispiel #3
0
 def test__fmt(self):
     """``_fmt('test', 'test')`` on the 'Default' logger yields '[test] test'."""
     Logger.load()
     formatted = Logger('Default')._fmt('test', 'test')
     self.assertEqual('[test] test', formatted)
Beispiel #4
0
 def test_error(self):
     """Smoke test: ``error()`` on the 'Default' logger runs without raising."""
     Logger.load()
     # No assertion follows; the test passes as long as the call returns.
     Logger('Default').error('unittest', 'testerror')
Beispiel #5
0
 def test_warning(self):
     """Smoke test: ``warning()`` on the 'Default' logger runs without raising."""
     Logger.load()
     # No assertion follows; the test passes as long as the call returns.
     Logger('Default').warning('unittest', 'testwarning')
Beispiel #6
0
 def test_info(self):
     """Smoke test: ``info()`` on the 'Default' logger runs without raising."""
     Logger.load()
     # No assertion follows; the test passes as long as the call returns.
     Logger('Default').info('unittest', 'testinfo')
Beispiel #7
0
 def test_debug(self):
     """Smoke test: ``debug()`` on the 'Default' logger runs without raising."""
     Logger.load()
     # No assertion follows; the test passes as long as the call returns.
     Logger('Default').debug('unittest', 'testdebug')
Beispiel #8
0
 def test_load(self):
     """After ``Logger.load()``, a Logger wraps a standard ``logging.Logger``."""
     Logger.load()
     wrapped = Logger('Default')._logger
     self.assertIsInstance(wrapped, logging.Logger)
Beispiel #9
0
class Spider(Thread):
    """Crawler thread wiring a scraper, a frontier and handlers together.

    The spider is configured from a dict (see ``_build``). When run, it
    repeatedly takes a batch of URLs from its frontier, fetches them with
    its scraper, passes every fetched body to each handler and saves
    whatever the handlers return. A driver can pause, resume and retire
    the spider through the corresponding methods.
    """

    # Number of URLs requested from the frontier per crawl iteration.
    batch_size = 100
    # Seconds slept between checks of the pause flag while paused.
    pause_poll_interval = 0.1

    def __init__(self, config, debug=True):
        """Create the spider and build its components from ``config``.

        :param config: configuration dict, see ``_build``.
        :param debug: debug flag used when ``config`` has no ``'debug'`` key.
        :raises PyCrawlerException: if the build hits a ``KeyError``.
        """
        super(Spider, self).__init__()
        self.name = ''
        self.config = config
        self.scraper = None
        self.frontier = None
        self.handlers = []
        self.logger = None
        self.debug = debug
        self._build(config)
        self.keep = True       # cleared by retire() to stop the crawl loop
        self.ispaused = False  # toggled by pause() / resume()

    def _build(self, config):
        """Instantiate logger, scraper, frontier and handlers from ``config``.

        Component classes are looked up via ``Scraper.get`` /
        ``Frontier.get`` / ``Handler.get`` and constructed with this spider
        as their only argument; an optional ``'args'`` entry is forwarded
        to the component's ``setargs``.

        :param config: dict with the keys ``'name'``, ``'scraper'``,
            ``'frontier'`` and ``'handlers'``; ``'debug'`` is optional.
        :raises PyCrawlerException: if a ``KeyError`` is raised anywhere
            during the build (typically a missing config key).
        """
        try:
            self.name = config['name']
            Logger.register(self.name)
            Logger.load()
            self.logger = Logger(self.name)
            # Fall back to the current flag so the constructor's ``debug``
            # argument is honoured when the config does not set one.
            self.debug = config.get('debug', self.debug)
            self.logger.info(self.name, 'Start building...')

            scraper_conf = config['scraper']
            self.scraper = Scraper.get(scraper_conf['name'])(self)
            if 'args' in scraper_conf:
                self.scraper.setargs(scraper_conf['args'])

            frontier_conf = config['frontier']
            self.frontier = Frontier.get(frontier_conf['name'])(self)
            if 'args' in frontier_conf:
                self.frontier.setargs(frontier_conf['args'])

            # Build a fresh list so reload() replaces the handlers instead
            # of appending duplicates to the previous set.
            handlers = []
            for spec in config['handlers']:
                handler = Handler.get(spec['name'])(self)
                if 'args' in spec:
                    handler.setargs(spec['args'])
                handlers.append(handler)
            self.handlers = handlers

            self.logger.info(self.name, 'Build successful!')
        except KeyError as e:
            # %-formatting instead of '+' so a non-string key cannot turn
            # the error report itself into a TypeError.
            missing = e.args[0] if e.args else ''
            raise PyCrawlerException(
                "Key '%s' missing in config dict" % (missing,))

    def reload(self, config):
        """Rebuild all components from a new ``config`` dict."""
        self._build(config)

    def addtask(self, task):
        """Add ``task`` to the frontier."""
        self.frontier.add(task)

    @staticmethod
    def _save(items):
        """Save a handler result: every element of a list, or one Document.

        Results of any other type are ignored.
        """
        if isinstance(items, list):
            for item in items:
                item.save()
        elif isinstance(items, Document):
            items.save()

    def run(self):
        """Crawl until the frontier is exhausted or the spider is retired."""
        self.logger.warning(self.name, 'Start crawling...')
        while self.frontier.hasnext() and self.keep:
            urls = self.frontier.next(self.batch_size)
            results = self.scraper.fetch(urls)
            # items() rather than iteritems(): available on Python 2 and 3.
            for url, body in results.items():
                for handler in self.handlers:
                    self._save(handler.parse(body, url))
            self._checkpause()
        self.logger.warning(self.name, 'Crawling finished!')

    def recover(self, filename):
        """Re-fetch and re-process the URLs listed in ``filename``.

        The file is read line by line, one URL per line; blank lines are
        skipped. URLs for which the scraper returns an empty body are not
        counted or processed.

        :param filename: path of the URL list; a missing file is logged
            and otherwise ignored.
        :raises PyCrawlerException: if saving a handler result raises
            ``AttributeError``.
        """
        if not os.path.exists(filename):
            self.logger.info(self.name, 'File {0} not found'.format(filename))
            return
        self.logger.info(self.name, 'Recovering from '+filename)
        count = 0
        with open(filename, 'r') as f:
            for line in f:
                # Drop the trailing newline so it is not fetched as part
                # of the URL; skip lines that are empty afterwards.
                target = line.strip()
                if not target:
                    continue
                url, body = self.scraper.fetchone(target)
                if not body:
                    continue
                count += 1
                for handler in self.handlers:
                    items = handler.parse(body, url)
                    try:
                        self._save(items)
                    except AttributeError:
                        raise PyCrawlerException('Items must implement save() method.')
        self.logger.info(self.name, 'Recovered {0} urls'.format(count))

    def clean(self, *args):
        """Forward ``args`` to the frontier's ``clean``."""
        self.frontier.clean(*args)

    def report(self):
        """Print the summary() figures to stdout, one per line."""
        s = self.summary()
        results = [self.name+' report:',
                   'Todo urls: {0}'.format(s['todo']),
                   'Visited urls: {0}'.format(s['visited']),
                   'Failed urls: {0}'.format(s['failed'])]
        for each in results:
            print(each)

    def summary(self):
        """Return a dict with the keys ``'todo'``, ``'visited'``, ``'failed'``."""
        # NOTE(review): len() is applied to visitednum()'s result; if that
        # method returns a number rather than a collection this raises
        # TypeError - confirm against the Frontier implementation.
        result = {'todo': len(self.frontier),
                  'visited': len(self.frontier.visitednum()),
                  'failed': 'Not supported'}
        return result

    def pause(self):
        """Ask a running spider to pause after its current batch."""
        # is_alive() replaces isAlive(), which was removed in Python 3.9.
        if self.is_alive() and not self.ispaused:
            self.ispaused = True

    def resume(self):
        """Let a paused, running spider continue."""
        if self.is_alive() and self.ispaused:
            self.ispaused = False

    def retire(self):
        """Ask a running spider to stop; also releases a pending pause."""
        if self.is_alive():
            self.logger.info(self.name, 'Stopped by driver')
            self.ispaused = False
            self.keep = False

    def _checkpause(self):
        """Block the crawl loop for as long as the pause flag is set."""
        if not self.ispaused:
            return
        # Imported here so the block does not rely on a module-level import.
        import time
        self.logger.info(self.name, 'Paused by driver')
        while self.ispaused:
            # Sleep between polls instead of spinning at full CPU.
            time.sleep(self.pause_poll_interval)
        self.logger.info(self.name, 'Resumed by driver')