Exemple #1
0
    def run(self, queue_name='default', sleep_time=1, verbose=False):
        """Worker process: ties to the given queue and handle all incoming tasks

        @queue_name - name of the queue to tie with
        @sleep_time - pause time between two tasks"""
        queue = Queue()
        while not self.SHUTDOWN_IN_PROGRESS:
            try:
                if verbose:
                    print '[remaining tasks number: %s] ' % queue.length(queue_name),
                result = queue.pop(queue_name=queue_name, timeout=10)
            except Exception, e:
                result = None
                print >> sys.stderr, 'An exception occurs during the proc processing: %s' % e
            if verbose:
                print result
            time.sleep(sleep_time)
Exemple #2
0
    def __init__(self):
        """Open the log files and build the redis-backed URL containers.

        Sets up the delay/error log files (append mode), a redis client, the
        'downloaded' and 'to download' URL sets, the 'to download' URL queue
        and a gevent re-entrant lock.

        Raises LogFileException if either log file cannot be opened.
        """
        try:
            self._delay_logger = open(
                os.path.join(ROOT_LOG, DELAY_LOG + '.log'), 'a')
        except IOError:
            raise LogFileException('Failed to open log file')
        try:
            self._error_logger = open(
                os.path.join(ROOT_LOG, ERROR_LOG + '.log'), 'a')
        except IOError:
            # Do not leak the handle that was opened successfully above.
            self._delay_logger.close()
            raise LogFileException('Failed to open log file')

        # Convert once so the client and the queue receive the same integer
        # port (the queue used to get the raw, unconverted setting).
        redis_port = int(REDIS_PORT)
        self._redis_client = redis.Redis(host=REDIS_HOST, port=redis_port)
        self.downloaded_url = Rediset(
            hash_generated_keys=True,
            redis_client=self._redis_client).Set('downloaded_url_set')
        self.todownload_url_queue = Queue('todownload_url_queue',
                                          host=REDIS_HOST,
                                          port=redis_port)
        self.todownload_url_set = Rediset(
            hash_generated_keys=True,
            redis_client=self._redis_client).Set('todownload_url_set')

        self._rlock = gevent.coros.RLock()
Exemple #3
0
    def __init__(self):
        """Open both log files and create the redis-backed URL structures.

        Raises LogFileException when a log file cannot be opened for appending.
        """
        delay_path = os.path.join(ROOT_LOG, DELAY_LOG + '.log')
        error_path = os.path.join(ROOT_LOG, ERROR_LOG + '.log')
        try:
            self._delay_logger = open(delay_path, 'a')
        except IOError:
            raise LogFileException('Failed to open log file')
        try:
            self._error_logger = open(error_path, 'a')
        except IOError:
            # The delay log is already open; close it instead of leaking it.
            self._delay_logger.close()
            raise LogFileException('Failed to open log file')

        # One integer conversion shared by the client and the queue, which
        # previously received the raw REDIS_PORT value.
        port = int(REDIS_PORT)
        self._redis_client = redis.Redis(host=REDIS_HOST, port=port)
        self.downloaded_url = Rediset(hash_generated_keys=True,
                redis_client=self._redis_client).Set('downloaded_url_set')
        self.todownload_url_queue = Queue('todownload_url_queue',
                host=REDIS_HOST, port=port)
        self.todownload_url_set = Rediset(hash_generated_keys=True,
                redis_client=self._redis_client).Set('todownload_url_set')

        self._rlock = gevent.coros.RLock()
Exemple #4
0
class CrawlerEngine:
    '''
    Core of the Crawler

    Owns two append-mode log files, a redis client, the redis-backed
    "downloaded" / "to download" URL sets and the "to download" URL queue,
    and spawns CRAWLER_NUMBER greenlets that drain that queue.
    '''
    def __init__(self):
        '''
        Open the log files and build the redis-backed containers.

        Raises LogFileException if either log file cannot be opened.
        '''
        try:
            self._delay_logger = open(
                os.path.join(ROOT_LOG, DELAY_LOG + '.log'), 'a')
        except IOError:
            raise LogFileException('Failed to open log file')
        try:
            self._error_logger = open(
                os.path.join(ROOT_LOG, ERROR_LOG + '.log'), 'a')
        except IOError:
            # Do not leak the handle that was opened successfully above.
            self._delay_logger.close()
            raise LogFileException('Failed to open log file')

        # Convert once so the client and the queue get the same integer port
        # (the queue used to receive the raw, unconverted setting).
        redis_port = int(REDIS_PORT)
        self._redis_client = redis.Redis(host=REDIS_HOST, port=redis_port)
        self.downloaded_url = Rediset(
            hash_generated_keys=True,
            redis_client=self._redis_client).Set('downloaded_url_set')
        self.todownload_url_queue = Queue('todownload_url_queue',
                                          host=REDIS_HOST,
                                          port=redis_port)
        self.todownload_url_set = Rediset(
            hash_generated_keys=True,
            redis_client=self._redis_client).Set('todownload_url_set')

        self._rlock = gevent.coros.RLock()

    def __del__(self):
        # No cleanup is performed on destruction.
        pass

    def _push_to_queue(self, urls):
        '''
        Append every url to the download queue and add it to the pending set.

        Raises RedisQueueException when redis reports a connection error.
        '''
        try:
            for url in urls:
                self.todownload_url_queue.append(url)
                self.todownload_url_set.add(url)
        except redis.exceptions.ConnectionError:
            raise RedisQueueException('Failed to connect to redis server')

    def clear_data(self):
        '''
        Delete the redis keys behind both URL sets and the URL queue.

        Raises RedisQueueException when redis reports a connection error.
        '''
        try:
            self._redis_client.delete('downloaded_url_set',
                                      'todownload_url_set',
                                      'todownload_url_queue')
        except redis.exceptions.ConnectionError:
            raise RedisQueueException('Failed to connect to redis server')

    def set_delay_logger(self, directory):
        '''
        Switch the delay log to the file at `directory` (append mode).

        The new file is opened before the current one is closed, so a failed
        open propagates its error and leaves the current logger usable.
        '''
        new_logger = open(directory, 'a')
        self._delay_logger.close()
        self._delay_logger = new_logger

    def set_error_logger(self, directory):
        '''
        Switch the error log to the file at `directory` (append mode).

        The new file is opened before the current one is closed, so a failed
        open propagates its error and leaves the current logger usable.
        '''
        new_logger = open(directory, 'a')
        self._error_logger.close()
        self._error_logger = new_logger

    def clear_delay_logger(self, directory):
        # Placeholder: currently does nothing.
        pass

    def clear_error_logger(self, directory):
        # Placeholder: currently does nothing.
        pass

    def start(self, start_urls=None, contin=False):
        '''
        Run the crawl and block until every crawler greenlet has finished.

        start_urls -- list of seed urls; required unless `contin` is true
        contin     -- when true, keep the data already stored in redis instead
                      of clearing it and re-seeding the queue

        Raises TypeError if `start_urls` is not a list, and Exception if a
        fresh crawl is requested without any start url.
        '''
        if start_urls is None:
            start_urls = []
        if not isinstance(start_urls, list):
            raise TypeError("Parameter 'start_urls' should be a list")
        if not contin:
            if not start_urls:
                raise Exception('You should specify at least one start url')
            self.clear_data()
            self._push_to_queue(start_urls)

        greenlets = [gevent.spawn(self._run) for _ in xrange(CRAWLER_NUMBER)]
        gevent.joinall(greenlets)

        print('Hey buddy, I have finished my work.')

    def _run(self):
        '''
        Body of one crawler greenlet: pop urls and download them until the
        queue stays empty.
        '''
        downloader = Downloader(delay_logger=self._delay_logger,
                                error_logger=self._error_logger,
                                domain=DOMAIN)
        # NOTE(review): `urlextractor` and `data` are not used further in the
        # visible part of this method -- confirm whether the loop body is
        # complete.
        urlextractor = UrlExtractor(domain=DOMAIN)

        while True:
            try:
                url = self.todownload_url_queue.popleft()
            except IndexError:
                # Nothing to pop: wait, retry once, then stop this greenlet.
                gevent.sleep(5)
                try:
                    url = self.todownload_url_queue.popleft()
                except IndexError:
                    break

            self.todownload_url_set.remove(url)
            try:
                data = downloader.get(url)
            except Exception as e:
                # Adjacent literals instead of a backslash continuation, which
                # used to embed the source indentation in the message.
                print('Uncaptured exception: (%s) arise when getting url: (%s), '
                      'please check it out' % (e, url))
                continue
Exemple #5
0
class CrawlerEngine:
    '''
    Core of the Crawler

    Keeps the delay/error log files, the redis client, the redis-backed URL
    sets and URL queue, and runs CRAWLER_NUMBER greenlets over that queue.
    '''

    def __init__(self):
        '''
        Open both log files and create the redis-backed URL structures.

        Raises LogFileException when a log file cannot be opened for appending.
        '''
        delay_path = os.path.join(ROOT_LOG, DELAY_LOG + '.log')
        error_path = os.path.join(ROOT_LOG, ERROR_LOG + '.log')
        try:
            self._delay_logger = open(delay_path, 'a')
        except IOError:
            raise LogFileException('Failed to open log file')
        try:
            self._error_logger = open(error_path, 'a')
        except IOError:
            # The delay log is already open; close it instead of leaking it.
            self._delay_logger.close()
            raise LogFileException('Failed to open log file')

        # One integer conversion shared by the client and the queue, which
        # previously received the raw REDIS_PORT value.
        port = int(REDIS_PORT)
        self._redis_client = redis.Redis(host=REDIS_HOST, port=port)
        self.downloaded_url = Rediset(hash_generated_keys=True,
                redis_client=self._redis_client).Set('downloaded_url_set')
        self.todownload_url_queue = Queue('todownload_url_queue',
                host=REDIS_HOST, port=port)
        self.todownload_url_set = Rediset(hash_generated_keys=True,
                redis_client=self._redis_client).Set('todownload_url_set')

        self._rlock = gevent.coros.RLock()

    def __del__(self):
        # Nothing is released when the engine is destroyed.
        pass

    def _push_to_queue(self, urls):
        '''
        Queue each url for download and record it in the pending set.

        A redis ConnectionError is re-raised as RedisQueueException.
        '''
        try:
            for url in urls:
                self.todownload_url_queue.append(url)
                self.todownload_url_set.add(url)
        except redis.exceptions.ConnectionError:
            raise RedisQueueException('Failed to connect to redis server')

    def clear_data(self):
        '''
        Remove the three redis keys used by the crawler.

        A redis ConnectionError is re-raised as RedisQueueException.
        '''
        try:
            self._redis_client.delete('downloaded_url_set', 'todownload_url_set',
                                      'todownload_url_queue')
        except redis.exceptions.ConnectionError:
            raise RedisQueueException('Failed to connect to redis server')

    def set_delay_logger(self, directory):
        '''
        Replace the delay log with the file at `directory`, opened for append.

        The replacement is opened first; if that fails the error propagates
        and the current log file stays open.
        '''
        replacement = open(directory, 'a')
        self._delay_logger.close()
        self._delay_logger = replacement

    def set_error_logger(self, directory):
        '''
        Replace the error log with the file at `directory`, opened for append.

        The replacement is opened first; if that fails the error propagates
        and the current log file stays open.
        '''
        replacement = open(directory, 'a')
        self._error_logger.close()
        self._error_logger = replacement

    def clear_delay_logger(self, directory):
        # Stub: no behaviour yet.
        pass

    def clear_error_logger(self, directory):
        # Stub: no behaviour yet.
        pass

    def start(self, start_urls=None, contin=False):
        '''
        Launch the crawler greenlets and wait for all of them to finish.

        start_urls -- list of seed urls (mandatory for a fresh crawl)
        contin     -- true to continue with whatever is already in redis;
                      false to clear redis and seed the queue with start_urls

        Raises TypeError when `start_urls` is not a list and Exception when a
        fresh crawl has no start url.
        '''
        if start_urls is None:
            start_urls = []
        if not isinstance(start_urls, list):
            raise TypeError("Parameter 'start_urls' should be a list")
        if not contin:
            if not start_urls:
                raise Exception('You should specify at least one start url')
            self.clear_data()
            self._push_to_queue(start_urls)

        greenlets = [gevent.spawn(self._run) for _ in xrange(CRAWLER_NUMBER)]
        gevent.joinall(greenlets)

        print('Hey buddy, I have finished my work.')

    def _run(self):
        '''
        Worker loop of a single greenlet: take urls off the queue and fetch
        them until two consecutive pops find the queue empty.
        '''
        downloader = Downloader(delay_logger=self._delay_logger,
                                error_logger=self._error_logger, domain=DOMAIN)
        # NOTE(review): neither `urlextractor` nor `data` is used in the part
        # of the method visible here -- verify the loop body is complete.
        urlextractor = UrlExtractor(domain=DOMAIN)

        while True:
            try:
                url = self.todownload_url_queue.popleft()
            except IndexError:
                # Empty queue: pause, try one more time, otherwise finish.
                gevent.sleep(5)
                try:
                    url = self.todownload_url_queue.popleft()
                except IndexError:
                    break

            self.todownload_url_set.remove(url)
            try:
                data = downloader.get(url)
            except Exception as e:
                # Implicit literal concatenation avoids the run of spaces the
                # old backslash continuation put into the printed message.
                print('Uncaptured exception: (%s) arise when getting url: (%s), '
                      'please check it out' % (e, url))
                continue
 def setUp(self):
     """Build the Queue under test from the fixed key and connection arguments."""
     self.key = 'test_redis_queue'
     self.redis_args = {'host': '127.0.0.1', 'port': 6379}
     self.queue = Queue(self.key, **self.redis_args)
class TestRedisQueue(unittest.TestCase):
    """Exercise the deque-like interface of Queue against a redis server.

    Every test works on the key 'test_redis_queue' at 127.0.0.1:6379; the key
    is deleted again after each test.
    """

    def setUp(self):
        """Create the Queue under test."""
        self.redis_args = dict(host='127.0.0.1', port=6379)
        self.key = 'test_redis_queue'
        self.queue = Queue(self.key, **self.redis_args)

    def tearDown(self):
        """Delete the test key so the tests do not affect each other."""
        # Named `client` so it cannot shadow a module called `redis`.
        client = Redis(**self.redis_args)
        client.delete(self.key)

    def test_right(self):
        """append + pop behaves as a stack (last in, first out)."""
        self.queue.append('one')
        self.queue.append('two')
        self.queue.append('three')
        self.assertEqual(self.queue.pop(), 'three')
        self.assertEqual(self.queue.pop(), 'two')
        self.assertEqual(self.queue.pop(), 'one')
        self.assertRaises(IndexError, self.queue.pop)

    def test_left(self):
        """appendleft + popleft behaves as a stack on the left end."""
        self.queue.appendleft('one')
        self.queue.appendleft('two')
        self.queue.appendleft('three')
        self.assertEqual(self.queue.popleft(), 'three')
        self.assertEqual(self.queue.popleft(), 'two')
        self.assertEqual(self.queue.popleft(), 'one')
        self.assertRaises(IndexError, self.queue.pop)

    def test_right_left(self):
        """append + popleft behaves as a FIFO queue."""
        self.queue.append('one')
        self.queue.append('two')
        self.queue.append('three')
        self.assertEqual(self.queue.popleft(), 'one')
        self.assertEqual(self.queue.popleft(), 'two')
        self.assertEqual(self.queue.popleft(), 'three')
        self.assertRaises(IndexError, self.queue.pop)

    def test_left_right(self):
        """appendleft + pop behaves as a FIFO queue."""
        self.queue.appendleft('one')
        self.queue.appendleft('two')
        self.queue.appendleft('three')
        self.assertEqual(self.queue.pop(), 'one')
        self.assertEqual(self.queue.pop(), 'two')
        self.assertEqual(self.queue.pop(), 'three')
        self.assertRaises(IndexError, self.queue.pop)

    def test_clear(self):
        """clear() empties the queue from both ends."""
        self.queue.appendleft('one')
        self.queue.appendleft('two')
        self.queue.appendleft('three')
        self.queue.clear()
        self.assertRaises(IndexError, self.queue.pop)
        self.assertRaises(IndexError, self.queue.popleft)

    def test_remove(self):
        """remove() drops every occurrence of the given value."""
        self.queue.append('one')
        self.queue.append('two')
        self.queue.append('two')
        self.queue.append('three')
        self.queue.append('two')
        self.queue.remove('two')
        self.assertEqual(self.queue.pop(), 'three')
        self.assertEqual(self.queue.pop(), 'one')
        self.assertRaises(IndexError, self.queue.pop)

    def test_contains(self):
        """The `in` operator reports membership."""
        self.queue.append('one')
        self.queue.append('two')
        self.queue.append('three')
        # assertTrue replaces the deprecated assert_ alias.
        self.assertTrue('one' in self.queue)
        self.assertTrue('two' in self.queue)
        self.assertTrue('three' in self.queue)
        self.assertTrue('four' not in self.queue)

    def test_extend_right(self):
        """extend() appends the items in order on the right."""
        self.queue.extend(['one', 'two', 'three'])
        self.assertEqual(self.queue.pop(), 'three')
        self.assertEqual(self.queue.pop(), 'two')
        self.assertEqual(self.queue.pop(), 'one')
        self.assertRaises(IndexError, self.queue.pop)

    def test_extend_left(self):
        """extendleft() pushes the items one by one on the left."""
        self.queue.extendleft(['one', 'two', 'three'])
        self.assertEqual(self.queue.popleft(), 'three')
        self.assertEqual(self.queue.popleft(), 'two')
        self.assertEqual(self.queue.popleft(), 'one')
        self.assertRaises(IndexError, self.queue.pop)

    def test_iteration(self):
        """Iterating yields the items from left to right."""
        self.queue.append('one')
        self.queue.append('two')
        self.queue.append('three')
        self.assertEqual(list(iter(self.queue)), ['one', 'two', 'three'])

    def test_length(self):
        """len() reports the number of stored items."""
        self.queue.append('one')
        self.queue.append('two')
        self.queue.append('three')
        self.assertEqual(len(self.queue), 3)
Exemple #8
0
def init_queue(host, port, password):
    """Rebind the module-level ``_queue`` to a new Queue named ``_queue_name``.

    ``host``, ``port`` and ``password`` are forwarded to Queue as keyword
    arguments.
    """
    global _queue
    connection = dict(host=host, port=port, password=password)
    _queue = Queue(_queue_name, **connection)