def test_key_hashing(self):
    # hashing disabled
    rs = Rediset(hash_generated_keys=False)

    key = rs.create_key('foo')
    self.assertEqual(key, 'foo')

    key = rs.create_key('foo', generated=True)
    self.assertEqual(key, 'rediset:foo')

    # hashing enabled
    rs = Rediset(hash_generated_keys=True)

    key = rs.create_key('foo')
    self.assertEqual(key, 'foo')

    key = rs.create_key('foo', generated=True)
    self.assertEqual(key, 'rediset:acbd18db4cc2f85cedef654fccc4a4d8')
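# The expected digest above is not arbitrary: it is the MD5 hex digest of
# the key name 'foo', which suggests Rediset hashes generated keys with
# MD5. A one-line check of that assumption, using only the standard library:
import hashlib

assert hashlib.md5(b'foo').hexdigest() == 'acbd18db4cc2f85cedef654fccc4a4d8'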
class CachingTestCase(RedisTestCase):

    def test_default_caching_and_override(self):
        self.rediset = Rediset(key_prefix=self.PREFIX, default_cache_seconds=10)
        s1 = self.rediset.Set('key1')
        s2 = self.rediset.Set('key2')

        intersection = self.rediset.Intersection(s1, s2)
        self.assertEqual(intersection.cache_seconds, 10)

        intersection = self.rediset.Intersection(s1, s2, cache_seconds=5)
        self.assertEqual(intersection.cache_seconds, 5)

    def test_caching(self):
        s1 = self.rediset.Set('key1')
        s2 = self.rediset.Set('key2')
        s1.add('a', 'b')
        s2.add('b', 'c')

        intersection = self.rediset.Intersection(s1, s2, cache_seconds=1)
        len(intersection)
        len(intersection)
        self.assertEqual(intersection.rs.redis.sinterstore.call_count, 1)

        sleep(2)
        len(intersection)
        self.assertEqual(intersection.rs.redis.sinterstore.call_count, 2)

    def test_caching_empty_sets(self):
        s1 = self.rediset.Set('key1')
        s2 = self.rediset.Set('key2')
        s1.add('a', 'b')
        s2.add('c', 'd')

        intersection = self.rediset.Intersection(s1, s2, cache_seconds=1)
        len(intersection)
        len(intersection)
        self.assertEqual(intersection.rs.redis.sinterstore.call_count, 1)
def test_key_generation(self):
    rs = Rediset(key_prefix='some-prefix')
    key = rs.create_key('foo')
    self.assertEqual(key, 'some-prefix:foo')
def setUp(self):
    self.rediset = Rediset(key_prefix=self.PREFIX)
    self.rediset.redis = Mock(wraps=self.rediset.redis)
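# setUp wraps the real redis client in Mock(wraps=...), so every command
# still reaches Redis but the mock records it; this is what lets the
# caching tests in CachingTestCase assert on redis.sinterstore.call_count.
# A minimal, standalone sketch of the pattern (the Counter class here is
# hypothetical):
from mock import Mock  # unittest.mock.Mock on Python 3

class Counter(object):
    def increment(self, n):
        return n + 1

wrapped = Mock(wraps=Counter())
assert wrapped.increment(1) == 2          # the call passes through
assert wrapped.increment.call_count == 1  # and is recorded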
class CrawlerEngine:
    '''
    Core of the Crawler
    '''

    def __init__(self):
        try:
            self._delay_logger = open(
                os.path.join(ROOT_LOG, DELAY_LOG + '.log'), 'a')
            self._error_logger = open(
                os.path.join(ROOT_LOG, ERROR_LOG + '.log'), 'a')
        except IOError:
            raise LogFileException('Failed to open log file')

        self._redis_client = redis.Redis(host=REDIS_HOST, port=int(REDIS_PORT))

        # URLs that have already been fetched
        self.downloaded_url = Rediset(
            hash_generated_keys=True,
            redis_client=self._redis_client).Set('downloaded_url_set')

        # URLs waiting to be fetched: a queue for ordering, mirrored by a
        # set for cheap membership tests
        self.todownload_url_queue = Queue('todownload_url_queue',
                                          host=REDIS_HOST, port=REDIS_PORT)
        self.todownload_url_set = Rediset(
            hash_generated_keys=True,
            redis_client=self._redis_client).Set('todownload_url_set')

        self._rlock = gevent.coros.RLock()

    def __del__(self):
        pass

    def _push_to_queue(self, urls):
        try:
            for url in urls:
                self.todownload_url_queue.append(url)
                self.todownload_url_set.add(url)
        except redis.exceptions.ConnectionError:
            raise RedisQueueException('Failed to connect to redis server')

    def clear_data(self):
        try:
            self._redis_client.delete('downloaded_url_set',
                                      'todownload_url_set',
                                      'todownload_url_queue')
        except redis.exceptions.ConnectionError:
            raise RedisQueueException('Failed to connect to redis server')

    def set_delay_logger(self, path):
        self._delay_logger.close()
        self._delay_logger = open(path, 'a')

    def set_error_logger(self, path):
        self._error_logger.close()
        self._error_logger = open(path, 'a')

    def clear_delay_logger(self, path):
        pass

    def clear_error_logger(self, path):
        pass

    def start(self, start_urls=None, contin=False):
        if start_urls is None:
            start_urls = []
        if not isinstance(start_urls, list):
            raise TypeError("Parameter 'start_urls' should be a list")

        if not contin:
            # Fresh crawl: wipe previous state and seed the queue
            if len(start_urls) == 0:
                raise Exception('You should specify at least one start url')
            self.clear_data()
            self._push_to_queue(start_urls)

        greenlets = []
        for i in xrange(CRAWLER_NUMBER):
            greenlets.append(gevent.spawn(self._run))
        gevent.joinall(greenlets)
        print 'Hey buddy, I have finished my work.'

    def _run(self):
        downloader = Downloader(delay_logger=self._delay_logger,
                                error_logger=self._error_logger,
                                domain=DOMAIN)
        urlextractor = UrlExtractor(domain=DOMAIN)

        while True:
            try:
                url = self.todownload_url_queue.popleft()
            except IndexError:
                # Queue looks empty; give other workers a chance to push
                # more URLs, then retry once before shutting this worker down
                gevent.sleep(5)
                try:
                    url = self.todownload_url_queue.popleft()
                except IndexError:
                    break
            self.todownload_url_set.remove(url)

            try:
                data = downloader.get(url)
            except Exception as e:
                print 'Uncaught exception (%s) while getting url (%s), ' \
                    'please check it out' % (e, url)
                continue
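# A minimal sketch of how CrawlerEngine might be driven, assuming the config
# constants (REDIS_HOST, REDIS_PORT, CRAWLER_NUMBER, DOMAIN, ...) are defined,
# a redis server is reachable, and gevent is set up as the module expects;
# the seed URL is a placeholder:
if __name__ == '__main__':
    engine = CrawlerEngine()
    try:
        # Fresh crawl: clears leftover state and seeds the queue
        engine.start(start_urls=['http://example.com/'])
    except RedisQueueException:
        print 'Could not reach the redis server'

    # To resume an interrupted crawl instead, skip the seeding and reuse
    # whatever is left in todownload_url_queue:
    # engine.start(contin=True)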