def run(self, queue_name='default', sleep_time=1, verbose=False):
    """Worker loop: bind to the given queue and process incoming tasks.

    @queue_name - name of the queue to bind to
    @sleep_time - pause (in seconds) between two pop attempts
    @verbose    - when True, print the queue length and each popped result
    """
    queue = Queue()
    # Keep serving until an external shutdown flag is raised on the instance.
    while not self.SHUTDOWN_IN_PROGRESS:
        try:
            if verbose:
                # Trailing comma: Py2 print stays on the same console line,
                # so the length prefix and the result print together.
                print '[remaining tasks number: %s] ' % queue.length(queue_name),
            # Pop with a 10s timeout; presumably returns None when the
            # timeout expires -- TODO confirm against Queue.pop's contract.
            result = queue.pop(queue_name=queue_name, timeout=10)
        except Exception, e:
            # Best-effort worker: report the failure to stderr and keep going.
            result = None
            print >> sys.stderr, 'An exception occurs during the proc processing: %s' % e
        if verbose:
            print result
        # Throttle between tasks regardless of success or failure.
        time.sleep(sleep_time)
def __init__(self):
    """Open the log files and wire up all redis-backed crawl state.

    Raises LogFileException when either log file cannot be opened.
    """
    try:
        # Append-mode log files under ROOT_LOG: one for download delays,
        # one for errors.
        self._delay_logger = open(
            os.path.join(ROOT_LOG, DELAY_LOG + '.log'), 'a')
        self._error_logger = open(
            os.path.join(ROOT_LOG, ERROR_LOG + '.log'), 'a')
    except IOError:
        raise LogFileException, 'Failed to open log file'
    self._redis_client = redis.Redis(host=REDIS_HOST, port=int(REDIS_PORT))
    # Redis-backed set of URLs that have already been fetched.
    self.downloaded_url = Rediset(
        hash_generated_keys=True,
        redis_client=self._redis_client).Set('downloaded_url_set')
    # Queue of URLs waiting to be fetched, plus a set mirror of it
    # (presumably for O(1) membership tests -- TODO confirm with callers).
    self.todownload_url_queue = Queue('todownload_url_queue',
                                      host=REDIS_HOST, port=REDIS_PORT)
    self.todownload_url_set = Rediset(
        hash_generated_keys=True,
        redis_client=self._redis_client).Set('todownload_url_set')
    self._rlock = gevent.coros.RLock()
def __init__(self):
    """Open the delay/error log files and set up redis-backed crawl state.

    Raises LogFileException if either log file cannot be opened.
    """
    delay_path = os.path.join(ROOT_LOG, DELAY_LOG + '.log')
    error_path = os.path.join(ROOT_LOG, ERROR_LOG + '.log')
    try:
        self._delay_logger = open(delay_path, 'a')
        self._error_logger = open(error_path, 'a')
    except IOError:
        raise LogFileException('Failed to open log file')
    self._redis_client = redis.Redis(host=REDIS_HOST, port=int(REDIS_PORT))
    # Already-fetched URLs live in one redis set; pending URLs are kept
    # both in a queue (ordering) and in a companion set.
    self.downloaded_url = Rediset(
        hash_generated_keys=True,
        redis_client=self._redis_client).Set('downloaded_url_set')
    self.todownload_url_queue = Queue(
        'todownload_url_queue', host=REDIS_HOST, port=REDIS_PORT)
    self.todownload_url_set = Rediset(
        hash_generated_keys=True,
        redis_client=self._redis_client).Set('todownload_url_set')
    self._rlock = gevent.coros.RLock()
class CrawlerEngine:
    ''' Core of the Crawler '''
    def __init__(self):
        # Open append-mode log files; redis holds all crawl bookkeeping.
        try:
            self._delay_logger = open(
                os.path.join(ROOT_LOG, DELAY_LOG + '.log'), 'a')
            self._error_logger = open(
                os.path.join(ROOT_LOG, ERROR_LOG + '.log'), 'a')
        except IOError:
            raise LogFileException, 'Failed to open log file'
        self._redis_client = redis.Redis(host=REDIS_HOST, port=int(REDIS_PORT))
        # URLs already fetched.
        self.downloaded_url = Rediset(
            hash_generated_keys=True,
            redis_client=self._redis_client).Set('downloaded_url_set')
        # URLs still to fetch: a queue for ordering plus a set mirror
        # (presumably for O(1) membership tests -- TODO confirm).
        self.todownload_url_queue = Queue('todownload_url_queue',
                                          host=REDIS_HOST, port=REDIS_PORT)
        self.todownload_url_set = Rediset(
            hash_generated_keys=True,
            redis_client=self._redis_client).Set('todownload_url_set')
        self._rlock = gevent.coros.RLock()

    def __del__(self):
        # NOTE(review): the log files opened in __init__ are never closed.
        pass

    def _push_to_queue(self, urls):
        # Enqueue every URL and record it in the to-download mirror set.
        try:
            for url in urls:
                self.todownload_url_queue.append(url)
                self.todownload_url_set.add(url)
        except redis.exceptions.ConnectionError:
            raise RedisQueueException('Failed to connect to redis server')

    def clear_data(self):
        # Drop all crawl bookkeeping keys from redis.
        try:
            self._redis_client.delete('downloaded_url_set',
                                      'todownload_url_set',
                                      'todownload_url_queue')
        except redis.exceptions.ConnectionError:
            raise RedisQueueException('Failed to connect to redis server')

    def set_delay_logger(self, directory):
        # Redirect the delay log to a new file path.
        self._delay_logger.close()
        self._delay_logger = open(directory, 'a')

    def set_error_logger(self, directory):
        # Redirect the error log to a new file path.
        self._error_logger.close()
        self._error_logger = open(directory, 'a')

    def clear_delay_logger(self, directory):
        # Placeholder: not implemented.
        pass

    def clear_error_logger(self, directory):
        # Placeholder: not implemented.
        pass

    def start(self, start_urls=None, contin=False):
        """Kick off the crawl with CRAWLER_NUMBER greenlets.

        @start_urls - seed URL list (required unless contin is True)
        @contin     - when True, resume from whatever redis already holds
        """
        if start_urls is None:
            start_urls = []
        if not isinstance(start_urls, list):
            raise TypeError("Parameter 'start_urls' should be a list")
        if not contin:
            if len(start_urls) == 0:
                raise Exception('You should specify at lease one start url')
            # Fresh run: wipe old state before seeding the queue.
            self.clear_data()
            self._push_to_queue(start_urls)
        greenlets = []
        for i in xrange(CRAWLER_NUMBER):
            greenlets.append(gevent.spawn(self._run))
        gevent.joinall(greenlets)
        print 'Hey buddy, I have finished my work.'

    def _run(self):
        # Worker greenlet: pop URLs and download until the queue stays empty.
        downloader = Downloader(delay_logger=self._delay_logger,
                                error_logger=self._error_logger,
                                domain=DOMAIN)
        urlextractor = UrlExtractor(domain=DOMAIN)
        while True:
            try:
                url = self.todownload_url_queue.popleft()
            except IndexError, e:
                # Queue looked empty; wait once and retry before giving up,
                # in case a sibling greenlet is about to push more URLs.
                gevent.sleep(5)
                try:
                    url = self.todownload_url_queue.popleft()
                except IndexError, e:
                    break
            self.todownload_url_set.remove(url)
            try:
                data = downloader.get(url)
            except Exception, e:
                print 'Uncaptured exception: (%s) arise when getting url: (%s), \
please check it out' % (e, url)
                continue
            # NOTE(review): the method appears truncated here in this view;
            # urlextractor and data are presumably used further down.
class CrawlerEngine:
    ''' Core of the Crawler '''
    def __init__(self):
        # Log files are opened in append mode; crawl state lives in redis.
        try:
            self._delay_logger = open(
                os.path.join(ROOT_LOG, DELAY_LOG + '.log'), 'a')
            self._error_logger = open(
                os.path.join(ROOT_LOG, ERROR_LOG + '.log'), 'a')
        except IOError:
            raise LogFileException, 'Failed to open log file'
        self._redis_client = redis.Redis(host=REDIS_HOST, port=int(REDIS_PORT))
        # Set of URLs that have already been downloaded.
        self.downloaded_url = Rediset(
            hash_generated_keys=True,
            redis_client=self._redis_client).Set('downloaded_url_set')
        # Pending URLs: queue for ordering, set as a companion index
        # (likely for fast membership checks -- TODO confirm).
        self.todownload_url_queue = Queue('todownload_url_queue',
                                          host=REDIS_HOST, port=REDIS_PORT)
        self.todownload_url_set = Rediset(
            hash_generated_keys=True,
            redis_client=self._redis_client).Set('todownload_url_set')
        self._rlock = gevent.coros.RLock()

    def __del__(self):
        # NOTE(review): no explicit close of the two log file handles.
        pass

    def _push_to_queue(self, urls):
        # Push each URL onto the work queue and into the mirror set.
        try:
            for url in urls:
                self.todownload_url_queue.append(url)
                self.todownload_url_set.add(url)
        except redis.exceptions.ConnectionError:
            raise RedisQueueException('Failed to connect to redis server')

    def clear_data(self):
        # Delete every crawl-related redis key.
        try:
            self._redis_client.delete('downloaded_url_set',
                                      'todownload_url_set',
                                      'todownload_url_queue')
        except redis.exceptions.ConnectionError:
            raise RedisQueueException('Failed to connect to redis server')

    def set_delay_logger(self, directory):
        # Close the current delay log and reopen at the given path.
        self._delay_logger.close()
        self._delay_logger = open(directory, 'a')

    def set_error_logger(self, directory):
        # Close the current error log and reopen at the given path.
        self._error_logger.close()
        self._error_logger = open(directory, 'a')

    def clear_delay_logger(self, directory):
        # Stub: intentionally does nothing yet.
        pass

    def clear_error_logger(self, directory):
        # Stub: intentionally does nothing yet.
        pass

    def start(self, start_urls=None, contin=False):
        """Start crawling with CRAWLER_NUMBER concurrent greenlets.

        @start_urls - list of seed URLs (mandatory unless contin is True)
        @contin     - resume from existing redis state instead of reseeding
        """
        if start_urls is None:
            start_urls = []
        if not isinstance(start_urls, list):
            raise TypeError("Parameter 'start_urls' should be a list")
        if not contin:
            if len(start_urls) == 0:
                raise Exception('You should specify at lease one start url')
            # A fresh crawl wipes previous state before seeding.
            self.clear_data()
            self._push_to_queue(start_urls)
        greenlets = []
        for i in xrange(CRAWLER_NUMBER):
            greenlets.append(gevent.spawn(self._run))
        gevent.joinall(greenlets)
        print 'Hey buddy, I have finished my work.'

    def _run(self):
        # One crawler greenlet: drain the URL queue via the Downloader.
        downloader = Downloader(delay_logger=self._delay_logger,
                                error_logger=self._error_logger,
                                domain=DOMAIN)
        urlextractor = UrlExtractor(domain=DOMAIN)
        while True:
            try:
                url = self.todownload_url_queue.popleft()
            except IndexError, e:
                # Empty queue: back off once, retry, then exit the loop.
                gevent.sleep(5)
                try:
                    url = self.todownload_url_queue.popleft()
                except IndexError, e:
                    break
            self.todownload_url_set.remove(url)
            try:
                data = downloader.get(url)
            except Exception, e:
                print 'Uncaptured exception: (%s) arise when getting url: (%s), \
please check it out' % (e, url)
                continue
            # NOTE(review): truncated in this view -- urlextractor and data
            # are presumably consumed by code below this point.
def setUp(self):
    """Create a fresh Queue bound to a dedicated redis key for each test."""
    self.redis_args = {'host': '127.0.0.1', 'port': 6379}
    self.key = 'test_redis_queue'
    self.queue = Queue(self.key, **self.redis_args)
class TestRedisQueue(unittest.TestCase):
    """Exercise the redis-backed Queue: both ends, clear/remove/contains,
    extend, iteration and length.

    Requires a redis server listening on 127.0.0.1:6379.
    """

    def setUp(self):
        # Each test gets a fresh Queue bound to a dedicated redis key.
        self.redis_args = dict(host='127.0.0.1', port=6379)
        self.key = 'test_redis_queue'
        self.queue = Queue(self.key, **self.redis_args)

    def tearDown(self):
        # Drop the backing key so state cannot leak between tests.
        # (Local renamed from 'redis' to stop shadowing the redis module.)
        client = Redis(**self.redis_args)
        client.delete(self.key)

    def test_right(self):
        """append/pop act as a LIFO stack on the right end."""
        self.queue.append('one')
        self.queue.append('two')
        self.queue.append('three')
        self.assertEqual(self.queue.pop(), 'three')
        self.assertEqual(self.queue.pop(), 'two')
        self.assertEqual(self.queue.pop(), 'one')
        self.assertRaises(IndexError, self.queue.pop)

    def test_left(self):
        """appendleft/popleft act as a LIFO stack on the left end."""
        self.queue.appendleft('one')
        self.queue.appendleft('two')
        self.queue.appendleft('three')
        self.assertEqual(self.queue.popleft(), 'three')
        self.assertEqual(self.queue.popleft(), 'two')
        self.assertEqual(self.queue.popleft(), 'one')
        self.assertRaises(IndexError, self.queue.pop)

    def test_right_left(self):
        """append + popleft gives FIFO order."""
        self.queue.append('one')
        self.queue.append('two')
        self.queue.append('three')
        self.assertEqual(self.queue.popleft(), 'one')
        self.assertEqual(self.queue.popleft(), 'two')
        self.assertEqual(self.queue.popleft(), 'three')
        self.assertRaises(IndexError, self.queue.pop)

    def test_left_right(self):
        """appendleft + pop gives FIFO order."""
        self.queue.appendleft('one')
        self.queue.appendleft('two')
        self.queue.appendleft('three')
        self.assertEqual(self.queue.pop(), 'one')
        self.assertEqual(self.queue.pop(), 'two')
        self.assertEqual(self.queue.pop(), 'three')
        self.assertRaises(IndexError, self.queue.pop)

    def test_clear(self):
        """clear empties the queue; both pops then raise IndexError."""
        self.queue.appendleft('one')
        self.queue.appendleft('two')
        self.queue.appendleft('three')
        self.queue.clear()
        self.assertRaises(IndexError, self.queue.pop)
        self.assertRaises(IndexError, self.queue.popleft)

    def test_remove(self):
        """remove drops every occurrence of the value."""
        self.queue.append('one')
        self.queue.append('two')
        self.queue.append('two')
        self.queue.append('three')
        self.queue.append('two')
        self.queue.remove('two')
        self.assertEqual(self.queue.pop(), 'three')
        self.assertEqual(self.queue.pop(), 'one')
        self.assertRaises(IndexError, self.queue.pop)

    def test_contains(self):
        """Membership tests via the 'in' operator."""
        self.queue.append('one')
        self.queue.append('two')
        self.queue.append('three')
        # assertIn/assertNotIn replace the deprecated assert_ alias.
        self.assertIn('one', self.queue)
        self.assertIn('two', self.queue)
        self.assertIn('three', self.queue)
        self.assertNotIn('four', self.queue)

    def test_extend_right(self):
        """extend pushes items in order onto the right end."""
        self.queue.extend(['one', 'two', 'three'])
        self.assertEqual(self.queue.pop(), 'three')
        self.assertEqual(self.queue.pop(), 'two')
        self.assertEqual(self.queue.pop(), 'one')
        self.assertRaises(IndexError, self.queue.pop)

    def test_extend_left(self):
        """extendleft pushes items in order onto the left end."""
        self.queue.extendleft(['one', 'two', 'three'])
        self.assertEqual(self.queue.popleft(), 'three')
        self.assertEqual(self.queue.popleft(), 'two')
        self.assertEqual(self.queue.popleft(), 'one')
        self.assertRaises(IndexError, self.queue.pop)

    def test_iteration(self):
        """Iterating yields items left-to-right without consuming them."""
        self.queue.append('one')
        self.queue.append('two')
        self.queue.append('three')
        self.assertEqual(list(iter(self.queue)), ['one', 'two', 'three'])

    def test_length(self):
        """len reports the number of queued items."""
        self.queue.append('one')
        self.queue.append('two')
        self.queue.append('three')
        self.assertEqual(len(self.queue), 3)
def init_queue(host, port, password):
    """Bind the module-level queue to the redis server at host:port."""
    global _queue
    conn_kwargs = dict(host=host, port=port, password=password)
    _queue = Queue(_queue_name, **conn_kwargs)