Ejemplo n.º 1
0
def main(max_threads):
    """Start ``process_crawler`` at the Alexa callback's seed URL.

    A new ``MongoCache`` is created and ``clear()`` is called on it before
    the crawl begins.

    Args:
        max_threads: forwarded unchanged to ``process_crawler``.
    """
    alexa = AlexaCallback()
    store = MongoCache()
    store.clear()

    seed = alexa.seed_url
    crawl_options = dict(
        scrape_callback=alexa,
        cache=store,
        max_threads=max_threads,
        timeout=10,
    )
    process_crawler(seed, **crawl_options)
Ejemplo n.º 2
0
def main(max_threads):
    """Run ``process_crawler`` from the Alexa callback's seed URL.

    ``MongoCache`` and ``AlexaCallback`` are imported at function scope.
    ``clear()`` is called on the cache before it is handed to the crawler.

    Args:
        max_threads: forwarded unchanged to ``process_crawler``.
    """
    from mongo_cache import MongoCache
    from alexa_cb import AlexaCallback

    callback = AlexaCallback()
    mongo_cache = MongoCache()
    mongo_cache.clear()

    start_url = callback.seed_url
    process_crawler(
        start_url,
        scrape_callback=callback,
        cache=mongo_cache,
        max_threads=max_threads,
        timeout=10,
    )
Ejemplo n.º 3
0
class TestCache(unittest.TestCase):
    """Tests for the MongoCache API.

    Covers get/set/add/delete/clear/has, the inc/dec counters and the
    ``*_many`` / ``get_dict`` bulk helpers. Every test runs against a new
    ``MongoCache(default_timeout=0)``; the backing collection is emptied
    after each test.
    """

    def setUp(self):
        self.cache = MongoCache(default_timeout=0)

    def tearDown(self):
        # Remove every document so keys never leak from one test to the next.
        self.cache.collection.delete_many({})

    def _count_ids(self, keys):
        """Return the number of stored documents whose ``_id`` is in *keys*.

        ``$in`` needs an array. On Python 3 ``dict.keys()`` is a view object
        rather than a list, so the keys are materialised explicitly before
        they are sent to the driver.
        """
        # NOTE(review): Collection.count() was removed in PyMongo 4; switch to
        # count_documents() if the driver is upgraded -- confirm pinned version.
        return self.cache.collection.count({'_id': {'$in': list(keys)}})

    def test_get(self):
        """A value stored with ``set`` is returned equal by ``get``."""
        x = MockData(1)

        self.cache.set('key-1', x)

        xc = self.cache.get('key-1')
        self.assertEqual(x, xc)

    def test_delete_existing(self):
        """``delete`` is truthy for a key that was set."""
        x = MockData(1)

        self.cache.set('key-1', x)

        self.assertTrue(self.cache.delete('key-1'))

    def test_delete_not_existing(self):
        """``delete`` is falsy for a key that was never stored."""
        self.assertFalse(self.cache.delete('key-1'))

    def test_set(self):
        """``set`` followed by ``get`` round-trips the value."""
        x = MockData(1)
        self.cache.set('key-1', x)

        xc = self.cache.get('key-1')

        self.assertEqual(x, xc)

    def test_add_not_existing(self):
        """``add`` is truthy when the key is not present yet."""
        x = MockData(1)

        added = self.cache.add('key-1', x)

        self.assertTrue(added)

    def test_add_existing(self):
        """``add`` is falsy when the key already holds a value."""
        x = MockData(1)
        self.cache.set('key-1', x)

        y = MockData(2)
        added = self.cache.add('key-1', y)

        self.assertFalse(added)

    def test_clear(self):
        """``clear`` is truthy and a previously set key then reads as None."""
        x = MockData(1)
        self.cache.set('key-1', x)

        cleared = self.cache.clear()
        xc = self.cache.get('key-1')

        self.assertTrue(cleared)
        self.assertIsNone(xc)

    def test_set_overwrite(self):
        """Setting the same key twice leaves exactly one document for it."""
        x1 = MockData(1)
        key = 'key-set-overwrite'
        self.cache.set(key, x1)

        x2 = MockData(2)
        self.cache.set(key, x2)

        _filter = {'_id': key}
        count_keys = self.cache.collection.count(_filter)

        self.assertEqual(1, count_keys)

    def test_inc_with_exist_key(self):
        """``inc`` adds the delta to an existing integer and returns the sum."""
        value = 10
        key = 'key-inc-with-exist-key'
        self.cache.set(key, value)

        delta = 9
        new_value = self.cache.inc(key, delta)

        value_cache = self.cache.get(key)

        result = delta + value

        self.assertEqual(result, value_cache)
        self.assertEqual(result, new_value)

    def test_inc_witho_exist_key(self):
        """``inc`` on a missing key stores and returns the delta itself."""
        key = 'key-inc-without-exist-key'
        delta = 9
        new_value = self.cache.inc(key, delta)

        value_cache = self.cache.get(key)

        self.assertEqual(delta, value_cache)
        self.assertEqual(delta, new_value)

    def test_inc_with_error(self):
        """``inc`` on a key holding a MockData returns None, value unchanged."""
        value = MockData(1)
        key = 'key-inc-with-error'
        self.cache.add(key, value)

        delta = 9
        new_value = self.cache.inc(key, delta)

        value_cache = self.cache.get(key)

        self.assertEqual(value, value_cache)
        self.assertIsNone(new_value)

    def test_has_with_add_key(self):
        """``has`` is truthy for a key that was added."""
        key = 'key-has-with-add-key'
        value = MockData(1)

        self.cache.add(key, value)

        has_key = self.cache.has(key)

        self.assertTrue(has_key)

    def test_has_without_add_key(self):
        """``has`` is falsy for a key that was never added."""
        key = 'key-has-without-add-key'

        has_key = self.cache.has(key)

        self.assertFalse(has_key)

    def test_dec_with_exist_key(self):
        """``dec`` subtracts the delta from an existing integer."""
        value = 10
        key = 'key-dec-with-exist-key'
        self.cache.set(key, value)

        delta = 9
        new_value = self.cache.dec(key, delta)

        value_cache = self.cache.get(key)

        result = value - delta

        self.assertEqual(result, value_cache)
        self.assertEqual(result, new_value)

    def test_dec_witho_exist_key(self):
        """``dec`` on a missing key stores and returns the negated delta."""
        key = 'key-dec-without-exist-key'
        delta = 9
        new_value = self.cache.dec(key, delta)

        value_cache = self.cache.get(key)

        self.assertEqual(-delta, value_cache)
        self.assertEqual(-delta, new_value)

    def test_dec_with_error(self):
        """``dec`` on a key holding a MockData returns None, value unchanged."""
        value = MockData(1)
        key = 'key-dec-with-error'
        self.cache.add(key, value)

        delta = 9
        new_value = self.cache.dec(key, delta)

        value_cache = self.cache.get(key)

        self.assertEqual(value, value_cache)
        self.assertIsNone(new_value)

    def test_get_many(self):
        """``get_many`` returns one value per key, in the order requested."""
        key_x_value = {'key-%s' % i: MockData(i) for i in range(1, 11)}

        for key, value in key_x_value.items():
            self.cache.add(key, value)

        values = self.cache.get_many(*key_x_value.keys())

        self.assertEqual(10, len(values))
        for _return, _value in zip(values, key_x_value.values()):
            self.assertEqual(_value, _return)

    def test_get_dict(self):
        """``get_dict`` returns a dict mapping every requested key to its value."""
        key_x_value = {'key-%s' % i: MockData(i) for i in range(1, 6)}

        for key, value in key_x_value.items():
            self.cache.add(key, value)

        results = self.cache.get_dict(*key_x_value.keys())

        self.assertIsInstance(results, dict)
        for key, value in key_x_value.items():
            self.assertIn(key, results)
            self.assertEqual(value, results[key])

    def test_delete_many(self):
        """``delete_many`` is truthy and removes every listed key."""
        key_x_value = {'key-%s' % i: MockData(i) for i in range(1, 6)}

        for key, value in key_x_value.items():
            self.cache.add(key, value)

        self.assertEqual(5, self._count_ids(key_x_value.keys()))

        result = self.cache.delete_many(*key_x_value.keys())

        self.assertTrue(result)
        self.assertEqual(0, self._count_ids(key_x_value.keys()))

    def test_set_many(self):
        """``set_many`` is truthy and stores one document per mapping entry."""
        key_x_value = {'key-set-many-%s' % i: MockData(i) for i in range(1, 6)}

        result = self.cache.set_many(key_x_value)

        self.assertTrue(result)
        self.assertEqual(5, self._count_ids(key_x_value.keys()))
Ejemplo n.º 4
0
def main(max_threads):
    """Kick off ``process_crawler`` using an ``AlexaCallback`` and a ``MongoCache``.

    ``clear()`` is called on the cache before the crawl starts.

    Args:
        max_threads: forwarded unchanged to ``process_crawler``.
    """
    cb = AlexaCallback()
    page_cache = MongoCache()
    page_cache.clear()
    process_crawler(
        cb.seed_url,
        scrape_callback=cb,
        cache=page_cache,
        max_threads=max_threads,
        timeout=10,
    )
Ejemplo n.º 5
0
# -*- coding: utf-8 -*-
from datetime import timedelta

from pymongo import MongoClient

from mongo_cache import MongoCache

# Demo script: item-style round trip through MongoCache, then an expiry check.
cache = MongoCache()
cache.clear()
url = 'http://example.webscraping.comasdf'
result = {'html': '...'}
# Store the record under the URL and print whether the same HTML reads back.
cache[url] = result
print(cache[url]['html'] == result['html'])
# Re-create the cache with a zero-length expiry (timedelta() is 0 seconds)
# and store the same record again.
cache = MongoCache(expires=timedelta())
cache[url] = result
import time
# Wait a full minute before reading the key again -- presumably to give the
# expired record time to be purged; TODO confirm against MongoCache.
time.sleep(60)
print(cache[url])
Ejemplo n.º 6
0
def main():
    """Run ``link_crawler`` from the Alexa callback's seed URL.

    A new ``MongoCache`` is created and ``clear()`` is called on it first.
    The crawler is invoked with user agent ``'GoodCrawler'`` and
    ``ignore_robots=True``.
    """
    alexa_callback = AlexaCallback()
    mongo_store = MongoCache()
    mongo_store.clear()

    entry_url = alexa_callback.seed_url
    link_crawler(
        entry_url,
        scrape_callback=alexa_callback,
        cache=mongo_store,
        user_agent='GoodCrawler',
        ignore_robots=True,
    )
                        else:
                            stop = 1                                                      # 该链表页下的所有详情页为空, 不再增加链表页
                if 'top250' in url and stop == 0:
                    page_size += 25
                    next_link = form_url.format(page_size)
                    if next_link not in seen:
                        seen.add(next_link)
                        crawl_queue.append(next_link)
    # 等待所有的下载线程结束
    threads = []
    while threads or crawl_queue:
        for thread in threads:
            if not thread.is_alive():
                # 移除已经停止的进程
                threads.remove(thread)
        while len(threads) < max_threads and crawl_queue:
            # 开始更多的线程
            thread = threading.Thread(target=process_queue)
            thread.setDaemon(True)
            thread.start()
            threads.append(thread)
        time.sleep(np.random.randint(6, 12))



if __name__ == '__main__':
    # Script entry point: run threaded_crawler with a MongoCache on which
    # clear() has just been called.
    # NOTE(review): GetDetailInfo is passed as-is (not called) -- confirm that
    # threaded_crawler expects the class/callable itself, not an instance.
    Scrape_Back = GetDetailInfo
    Cache = MongoCache()
    Cache.clear()
    threaded_crawler(scrape_callback=Scrape_Back, cache=Cache)
Ejemplo n.º 8
0
 def test_result_is_not_none(self):
     """After clearing a new MongoCache, ``self.result['html']`` is not None."""
     MongoCache().clear()
     html = self.result['html']
     self.assertIsNotNone(html)
Ejemplo n.º 9
0
class TestCache(unittest.TestCase):
    """Tests for the MongoCache API.

    Covers get/set/add/delete/clear/has, the inc/dec counters and the
    ``*_many`` / ``get_dict`` bulk helpers. Every test runs against a new
    ``MongoCache(default_timeout=0)``; the backing collection is emptied
    after each test.
    """

    def setUp(self):
        self.cache = MongoCache(default_timeout=0)

    def tearDown(self):
        # Remove every document so keys never leak from one test to the next.
        self.cache.collection.delete_many({})

    def _count_ids(self, keys):
        """Return the number of stored documents whose ``_id`` is in *keys*.

        ``$in`` needs an array. On Python 3 ``dict.keys()`` is a view object
        rather than a list, so the keys are materialised explicitly before
        they are sent to the driver.
        """
        # NOTE(review): Collection.count() was removed in PyMongo 4; switch to
        # count_documents() if the driver is upgraded -- confirm pinned version.
        return self.cache.collection.count({'_id': {'$in': list(keys)}})

    def test_get(self):
        """A value stored with ``set`` is returned equal by ``get``."""
        x = MockData(1)

        self.cache.set('key-1', x)

        xc = self.cache.get('key-1')
        self.assertEqual(x, xc)

    def test_delete_existing(self):
        """``delete`` is truthy for a key that was set."""
        x = MockData(1)

        self.cache.set('key-1', x)

        self.assertTrue(self.cache.delete('key-1'))

    def test_delete_not_existing(self):
        """``delete`` is falsy for a key that was never stored."""
        self.assertFalse(self.cache.delete('key-1'))

    def test_set(self):
        """``set`` followed by ``get`` round-trips the value."""
        x = MockData(1)
        self.cache.set('key-1', x)

        xc = self.cache.get('key-1')

        self.assertEqual(x, xc)

    def test_add_not_existing(self):
        """``add`` is truthy when the key is not present yet."""
        x = MockData(1)

        added = self.cache.add('key-1', x)

        self.assertTrue(added)

    def test_add_existing(self):
        """``add`` is falsy when the key already holds a value."""
        x = MockData(1)
        self.cache.set('key-1', x)

        y = MockData(2)
        added = self.cache.add('key-1', y)

        self.assertFalse(added)

    def test_clear(self):
        """``clear`` is truthy and a previously set key then reads as None."""
        x = MockData(1)
        self.cache.set('key-1', x)

        cleared = self.cache.clear()
        xc = self.cache.get('key-1')

        self.assertTrue(cleared)
        self.assertIsNone(xc)

    def test_set_overwrite(self):
        """Setting the same key twice leaves exactly one document for it."""
        x1 = MockData(1)
        key = 'key-set-overwrite'
        self.cache.set(key, x1)

        x2 = MockData(2)
        self.cache.set(key, x2)

        _filter = {'_id': key}
        count_keys = self.cache.collection.count(_filter)

        self.assertEqual(1, count_keys)

    def test_inc_with_exist_key(self):
        """``inc`` adds the delta to an existing integer and returns the sum."""
        value = 10
        key = 'key-inc-with-exist-key'
        self.cache.set(key, value)

        delta = 9
        new_value = self.cache.inc(key, delta)

        value_cache = self.cache.get(key)

        result = delta + value

        self.assertEqual(result, value_cache)
        self.assertEqual(result, new_value)

    def test_inc_witho_exist_key(self):
        """``inc`` on a missing key stores and returns the delta itself."""
        key = 'key-inc-without-exist-key'
        delta = 9
        new_value = self.cache.inc(key, delta)

        value_cache = self.cache.get(key)

        self.assertEqual(delta, value_cache)
        self.assertEqual(delta, new_value)

    def test_inc_with_error(self):
        """``inc`` on a key holding a MockData returns None, value unchanged."""
        value = MockData(1)
        key = 'key-inc-with-error'
        self.cache.add(key, value)

        delta = 9
        new_value = self.cache.inc(key, delta)

        value_cache = self.cache.get(key)

        self.assertEqual(value, value_cache)
        self.assertIsNone(new_value)

    def test_has_with_add_key(self):
        """``has`` is truthy for a key that was added."""
        key = 'key-has-with-add-key'
        value = MockData(1)

        self.cache.add(key, value)

        has_key = self.cache.has(key)

        self.assertTrue(has_key)

    def test_has_without_add_key(self):
        """``has`` is falsy for a key that was never added."""
        key = 'key-has-without-add-key'

        has_key = self.cache.has(key)

        self.assertFalse(has_key)

    def test_dec_with_exist_key(self):
        """``dec`` subtracts the delta from an existing integer."""
        value = 10
        key = 'key-dec-with-exist-key'
        self.cache.set(key, value)

        delta = 9
        new_value = self.cache.dec(key, delta)

        value_cache = self.cache.get(key)

        result = value - delta

        self.assertEqual(result, value_cache)
        self.assertEqual(result, new_value)

    def test_dec_witho_exist_key(self):
        """``dec`` on a missing key stores and returns the negated delta."""
        key = 'key-dec-without-exist-key'
        delta = 9
        new_value = self.cache.dec(key, delta)

        value_cache = self.cache.get(key)

        self.assertEqual(-delta, value_cache)
        self.assertEqual(-delta, new_value)

    def test_dec_with_error(self):
        """``dec`` on a key holding a MockData returns None, value unchanged."""
        value = MockData(1)
        key = 'key-dec-with-error'
        self.cache.add(key, value)

        delta = 9
        new_value = self.cache.dec(key, delta)

        value_cache = self.cache.get(key)

        self.assertEqual(value, value_cache)
        self.assertIsNone(new_value)

    def test_get_many(self):
        """``get_many`` returns one value per key, in the order requested."""
        key_x_value = {'key-%s' % i: MockData(i) for i in range(1, 11)}

        for key, value in key_x_value.items():
            self.cache.add(key, value)

        values = self.cache.get_many(*key_x_value.keys())

        self.assertEqual(10, len(values))
        for _return, _value in zip(values, key_x_value.values()):
            self.assertEqual(_value, _return)

    def test_get_dict(self):
        """``get_dict`` returns a dict mapping every requested key to its value."""
        key_x_value = {'key-%s' % i: MockData(i) for i in range(1, 6)}

        for key, value in key_x_value.items():
            self.cache.add(key, value)

        results = self.cache.get_dict(*key_x_value.keys())

        self.assertIsInstance(results, dict)
        for key, value in key_x_value.items():
            self.assertIn(key, results)
            self.assertEqual(value, results[key])

    def test_delete_many(self):
        """``delete_many`` is truthy and removes every listed key."""
        key_x_value = {'key-%s' % i: MockData(i) for i in range(1, 6)}

        for key, value in key_x_value.items():
            self.cache.add(key, value)

        self.assertEqual(5, self._count_ids(key_x_value.keys()))

        result = self.cache.delete_many(*key_x_value.keys())

        self.assertTrue(result)
        self.assertEqual(0, self._count_ids(key_x_value.keys()))

    def test_set_many(self):
        """``set_many`` is truthy and stores one document per mapping entry."""
        key_x_value = {'key-set-many-%s' % i: MockData(i) for i in range(1, 6)}

        result = self.cache.set_many(key_x_value)

        self.assertTrue(result)
        self.assertEqual(5, self._count_ids(key_x_value.keys()))