Example #1
    def test_key_hashing(self):

        # hashing disabled
        rs = Rediset(hash_generated_keys=False)
        key = rs.create_key('foo')
        self.assertEqual(key, 'foo')
        key = rs.create_key('foo', generated=True)
        self.assertEqual(key, 'rediset:foo')

        # hashing enabled
        rs = Rediset(hash_generated_keys=True)
        key = rs.create_key('foo')
        self.assertEqual(key, 'foo')
        key = rs.create_key('foo', generated=True)
        self.assertEqual(key, 'rediset:acbd18db4cc2f85cedef654fccc4a4d8')
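
The expected digest above is simply the MD5 of the key name: hashing keeps generated keys (for example, stored intersection results) at a fixed length no matter how long the computed name gets. A minimal sketch reproducing it:

    import hashlib

    # md5('foo') matches the generated key asserted in the test above
    print('rediset:' + hashlib.md5(b'foo').hexdigest())
    # rediset:acbd18db4cc2f85cedef654fccc4a4d8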
Example #2
class CachingTestCase(RedisTestCase):
    def test_default_caching_and_override(self):
        self.rediset = Rediset(key_prefix=self.PREFIX,
                               default_cache_seconds=10)
        s1 = self.rediset.Set('key1')
        s2 = self.rediset.Set('key2')

        intersection = self.rediset.Intersection(s1, s2)
        self.assertEqual(intersection.cache_seconds, 10)

        intersection = self.rediset.Intersection(s1, s2, cache_seconds=5)
        self.assertEqual(intersection.cache_seconds, 5)

    def test_caching(self):
        s1 = self.rediset.Set('key1')
        s2 = self.rediset.Set('key2')

        s1.add('a', 'b')
        s2.add('b', 'c')

        intersection = self.rediset.Intersection(s1, s2, cache_seconds=1)

        len(intersection)
        len(intersection)

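        # only the first len() should hit SINTERSTORE; the second is cached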
        self.assertEqual(intersection.rs.redis.sinterstore.call_count, 1)

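        # wait for the 1-second cache entry to expire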
        sleep(2)

        len(intersection)

        self.assertEqual(intersection.rs.redis.sinterstore.call_count, 2)

    def test_caching_empty_sets(self):
        s1 = self.rediset.Set('key1')
        s2 = self.rediset.Set('key2')

        s1.add('a', 'b')
        s2.add('c', 'd')

        intersection = self.rediset.Intersection(s1, s2, cache_seconds=1)

        len(intersection)
        len(intersection)

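        # even an empty result is cached, so SINTERSTORE runs only once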
        self.assertEqual(intersection.rs.redis.sinterstore.call_count, 1)
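
A minimal usage sketch of the caching behaviour exercised above, assuming a Redis server is reachable with the library defaults (names are illustrative):

    from rediset import Rediset

    rs = Rediset(key_prefix='demo', default_cache_seconds=10)
    s1 = rs.Set('key1')
    s2 = rs.Set('key2')
    s1.add('a', 'b')
    s2.add('b', 'c')

    inter = rs.Intersection(s1, s2)  # inherits the 10-second default
    len(inter)  # first call runs SINTERSTORE and caches the result
    len(inter)  # served from the cached key until the TTL expires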
Example #3
    def test_key_generation(self):
        rs = Rediset(key_prefix='some-prefix')
        key = rs.create_key('foo')
        self.assertEqual(key, 'some-prefix:foo')
Example #4
    def setUp(self):
        self.rediset = Rediset(key_prefix=self.PREFIX)
        # wrap the real client in a Mock so tests can assert call counts
        self.rediset.redis = Mock(wraps=self.rediset.redis)
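
Mock(wraps=...) is what makes the call-count assertions in the caching tests possible: the wrapper records every call while still delegating to the wrapped object. A standalone sketch with a hypothetical stand-in object:

    from mock import Mock  # unittest.mock on Python 3

    class FakeRedis(object):
        def sinterstore(self, dest, keys):
            return 0

    client = Mock(wraps=FakeRedis())
    client.sinterstore('dest', ['key1', 'key2'])  # delegates to FakeRedis
    print(client.sinterstore.call_count)  # 1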
Example #5
class CrawlerEngine:
    '''
    Core of the Crawler
    '''
    def __init__(self):
        try:
            self._delay_logger = open(
                os.path.join(ROOT_LOG, DELAY_LOG + '.log'), 'a')
            self._error_logger = open(
                os.path.join(ROOT_LOG, ERROR_LOG + '.log'), 'a')
        except IOError:
            raise LogFileException('Failed to open log file')

        self._redis_client = redis.Redis(host=REDIS_HOST, port=int(REDIS_PORT))
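        # the Rediset sets deduplicate URLs; todownload_url_set mirrors the
        # queue contents (added in _push_to_queue, removed in _run)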
        self.downloaded_url = Rediset(
            hash_generated_keys=True,
            redis_client=self._redis_client).Set('downloaded_url_set')
        self.todownload_url_queue = Queue('todownload_url_queue',
                                          host=REDIS_HOST,
                                          port=REDIS_PORT)
        self.todownload_url_set = Rediset(
            hash_generated_keys=True,
            redis_client=self._redis_client).Set('todownload_url_set')

        self._rlock = gevent.coros.RLock()

    def __del__(self):
        pass

    def _push_to_queue(self, urls):
        try:
            for url in urls:
                self.todownload_url_queue.append(url)
                self.todownload_url_set.add(url)
        except redis.exceptions.ConnectionError:
            raise RedisQueueException('Failed to connect to redis server')

    def clear_data(self):
        try:
            self._redis_client.delete('downloaded_url_set',
                                      'todownload_url_set',
                                      'todownload_url_queue')
        except redis.exceptions.ConnectionError:
            raise RedisQueueException('Failed to connect to redis server')

    def set_delay_logger(self, directory):
        self._delay_logger.close()
        self._delay_logger = open(directory, 'a')

    def set_error_logger(self, directory):
        self._error_logger.close()
        self._error_logger = open(directory, 'a')

    def clear_delay_logger(self, directory):
        pass

    def clear_error_logger(self, directory):
        pass

    def start(self, start_urls=None, contin=False):
        if start_urls is None:
            start_urls = []
        if not isinstance(start_urls, list):
            raise TypeError("Parameter 'start_urls' should be a list")
        if not contin:
            if len(start_urls) == 0:
                raise Exception('You should specify at least one start URL')
            self.clear_data()
            self._push_to_queue(start_urls)
        greenlets = []
        for i in xrange(CRAWLER_NUMBER):
            greenlets.append(gevent.spawn(self._run))

        gevent.joinall(greenlets)

        print 'Hey buddy, I have finished my work.'

    def _run(self):
        downloader = Downloader(delay_logger=self._delay_logger,
                                error_logger=self._error_logger,
                                domain=DOMAIN)
        urlextractor = UrlExtractor(domain=DOMAIN)

        while True:
            try:
                url = self.todownload_url_queue.popleft()
            except IndexError:
                # the queue looked empty; other greenlets may still be
                # pushing URLs, so wait briefly and try once more
                gevent.sleep(5)
                try:
                    url = self.todownload_url_queue.popleft()
                except IndexError:
                    break

            self.todownload_url_set.remove(url)
            try:
                data = downloader.get(url)
            except Exception as e:
                print ('Uncaptured exception (%s) raised when getting '
                       'url (%s), please check it out' % (e, url))
                continue
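
A hypothetical driver for the engine, assuming the module-level constants (REDIS_HOST, REDIS_PORT, ROOT_LOG, CRAWLER_NUMBER, DOMAIN, ...) are configured for your environment:

    engine = CrawlerEngine()

    # fresh crawl: clears previous state and seeds the queue
    engine.start(start_urls=['http://example.com/'])

    # or resume a previous crawl without reseeding
    # engine.start(contin=True)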