def main(max_threads):
    """Run ``process_crawler`` from the AlexaCallback seed URL with an emptied MongoCache.

    :param max_threads: forwarded unchanged to ``process_crawler`` as
        its ``max_threads`` keyword argument.
    """
    callback = AlexaCallback()

    # Wipe whatever the cache already holds before the crawl starts.
    mongo_cache = MongoCache()
    mongo_cache.clear()

    seed = callback.seed_url
    process_crawler(
        seed,
        scrape_callback=callback,
        cache=mongo_cache,
        max_threads=max_threads,
        timeout=10,
    )
def main(max_threads):
    """Start ``process_crawler`` with an AlexaCallback and a cleared MongoCache.

    The two project modules are imported inside the function, so they are
    only loaded when ``main`` is actually called.

    :param max_threads: passed through to ``process_crawler`` as ``max_threads``.
    """
    from mongo_cache import MongoCache
    from alexa_cb import AlexaCallback

    callback = AlexaCallback()

    # Empty the cache before crawling.
    mongo_cache = MongoCache()
    mongo_cache.clear()

    seed = callback.seed_url
    process_crawler(
        seed,
        scrape_callback=callback,
        cache=mongo_cache,
        max_threads=max_threads,
        timeout=10,
    )
class TestCache(unittest.TestCase):
    """Exercise the ``MongoCache`` API: get/set/add/delete, inc/dec, has and the bulk helpers.

    Every test gets its own ``MongoCache`` instance and the backing
    collection is emptied afterwards, so keys never leak between tests.
    NOTE(review): these tests talk to ``self.cache.collection`` directly and
    presumably need a reachable MongoDB instance - confirm.
    """

    def setUp(self):
        # NOTE(review): default_timeout=0 presumably means "no expiry" - confirm
        # against MongoCache.
        self.cache = MongoCache(default_timeout=0)

    def tearDown(self):
        # Remove every document so the next test starts from an empty collection.
        self.cache.collection.delete_many({})

    def _count(self, query):
        """Return the number of documents in the cache collection matching *query*.

        ``Collection.count`` was deprecated in PyMongo 3.7 and removed in 4.0,
        so ``count_documents`` is preferred when the collection class has it.
        The check is made on the class because (assuming a PyMongo collection)
        unknown instance attributes resolve to sub-collections, which would
        make a plain ``hasattr`` on the instance always succeed.
        """
        collection = self.cache.collection
        if hasattr(type(collection), 'count_documents'):
            return collection.count_documents(query)
        return collection.count(query)

    def test_get(self):
        x = MockData(1)
        self.cache.set('key-1', x)
        xc = self.cache.get('key-1')
        self.assertEqual(x, xc)

    def test_delete_existing(self):
        x = MockData(1)
        self.cache.set('key-1', x)
        self.assertTrue(self.cache.delete('key-1'))

    def test_delete_not_existing(self):
        self.assertFalse(self.cache.delete('key-1'))

    def test_set(self):
        x = MockData(1)
        self.cache.set('key-1', x)
        xc = self.cache.get('key-1')
        self.assertEqual(x, xc)

    def test_add_not_existing(self):
        x = MockData(1)
        added = self.cache.add('key-1', x)
        self.assertTrue(added)

    def test_add_existing(self):
        # add() must refuse to overwrite a key that set() already stored.
        x = MockData(1)
        self.cache.set('key-1', x)
        y = MockData(2)
        added = self.cache.add('key-1', y)
        self.assertFalse(added)

    def test_clear(self):
        x = MockData(1)
        self.cache.set('key-1', x)
        cleared = self.cache.clear()
        xc = self.cache.get('key-1')
        self.assertTrue(cleared)
        self.assertIsNone(xc)

    def test_set_overwrite(self):
        # Setting the same key twice must leave exactly one document behind.
        x1 = MockData(1)
        key = 'key-set-overwrite'
        self.cache.set(key, x1)
        x2 = MockData(2)
        self.cache.set(key, x2)
        _filter = {'_id': key}
        count_keys = self._count(_filter)
        self.assertEqual(1, count_keys)

    def test_inc_with_exist_key(self):
        value = 10
        key = 'key-inc-with-exist-key'
        self.cache.set(key, value)
        delta = 9
        new_value = self.cache.inc(key, delta)
        value_cache = self.cache.get(key)
        result = delta + value
        self.assertEqual(result, value_cache)
        self.assertEqual(result, new_value)

    def test_inc_witho_exist_key(self):
        # Incrementing a missing key is expected to yield the delta itself.
        key = 'key-inc-without-exist-key'
        delta = 9
        new_value = self.cache.inc(key, delta)
        value_cache = self.cache.get(key)
        self.assertEqual(delta, value_cache)
        self.assertEqual(delta, new_value)

    def test_inc_with_error(self):
        # A non-numeric value must be left untouched and inc() must return None.
        value = MockData(1)
        key = 'key-inc-with-error'
        self.cache.add(key, value)
        delta = 9
        new_value = self.cache.inc(key, delta)
        value_cache = self.cache.get(key)
        self.assertEqual(value, value_cache)
        self.assertIsNone(new_value)

    def test_has_with_add_key(self):
        key = 'key-has-with-add-key'
        value = MockData(1)
        self.cache.add(key, value)
        has_key = self.cache.has(key)
        self.assertTrue(has_key)

    def test_has_without_add_key(self):
        key = 'key-has-without-add-key'
        has_key = self.cache.has(key)
        self.assertFalse(has_key)

    def test_dec_with_exist_key(self):
        value = 10
        key = 'key-dec-with-exist-key'
        self.cache.set(key, value)
        delta = 9
        new_value = self.cache.dec(key, delta)
        value_cache = self.cache.get(key)
        result = value - delta
        self.assertEqual(result, value_cache)
        self.assertEqual(result, new_value)

    def test_dec_witho_exist_key(self):
        # Decrementing a missing key is expected to yield the negated delta.
        key = 'key-dec-without-exist-key'
        delta = 9
        new_value = self.cache.dec(key, delta)
        value_cache = self.cache.get(key)
        self.assertEqual(-delta, value_cache)
        self.assertEqual(-delta, new_value)

    def test_dec_with_error(self):
        # A non-numeric value must be left untouched and dec() must return None.
        value = MockData(1)
        key = 'key-dec-with-error'
        self.cache.add(key, value)
        delta = 9
        new_value = self.cache.dec(key, delta)
        value_cache = self.cache.get(key)
        self.assertEqual(value, value_cache)
        self.assertIsNone(new_value)

    def test_get_many(self):
        key_x_value = {'key-%s' % i: MockData(i) for i in range(1, 11)}
        for key, value in key_x_value.items():
            self.cache.add(key, value)
        values = self.cache.get_many(*key_x_value.keys())
        self.assertEqual(10, len(values))
        # Results are compared positionally with the order the keys were passed in.
        for _return, _value in zip(values, key_x_value.values()):
            self.assertEqual(_value, _return)

    def test_get_dict(self):
        key_x_value = {'key-%s' % i: MockData(i) for i in range(1, 6)}
        for key, value in key_x_value.items():
            self.cache.add(key, value)
        results = self.cache.get_dict(*key_x_value.keys())
        self.assertIsInstance(results, dict)
        for key, value in key_x_value.items():
            self.assertIn(key, results)
            self.assertEqual(value, results[key])

    def test_delete_many(self):
        key_x_value = {'key-%s' % i: MockData(i) for i in range(1, 6)}
        for key, value in key_x_value.items():
            self.cache.add(key, value)
        # A dict view cannot be BSON-encoded on Python 3; hand the driver a list.
        keys = list(key_x_value)
        self.assertEqual(5, self._count({'_id': {'$in': keys}}))
        result = self.cache.delete_many(*keys)
        self.assertTrue(result)
        self.assertEqual(0, self._count({'_id': {'$in': keys}}))

    def test_set_many(self):
        key_x_value = {'key-set-many-%s' % i: MockData(i) for i in range(1, 6)}
        result = self.cache.set_many(key_x_value)
        self.assertTrue(result)
        # A dict view cannot be BSON-encoded on Python 3; hand the driver a list.
        keys = list(key_x_value)
        self.assertEqual(5, self._count({'_id': {'$in': keys}}))
def main(max_threads):
    """Clear the Mongo cache, then hand the AlexaCallback seed URL to ``process_crawler``.

    :param max_threads: given to ``process_crawler`` as its ``max_threads``
        keyword argument.
    """
    alexa = AlexaCallback()

    # The crawl always begins with an empty cache.
    store = MongoCache()
    store.clear()

    start_url = alexa.seed_url
    process_crawler(
        start_url,
        scrape_callback=alexa,
        cache=store,
        max_threads=max_threads,
        timeout=10,
    )
# -*- coding: utf-8 -*-
from datetime import timedelta
import time

from pymongo import MongoClient

from mongo_cache import MongoCache

# Part 1: store a record in an emptied cache and read it straight back.
cache = MongoCache()
cache.clear()

url = 'http://example.webscraping.comasdf'
result = {'html': '...'}
cache[url] = result

stored = cache[url]
print(stored['html'] == result['html'])

# Part 2: store the same record in a cache built with a zero-length
# ``expires`` value, wait a minute, then read it again.
# NOTE(review): the pause is presumably there to give the record time to
# expire before the final lookup - confirm against MongoCache.
cache = MongoCache(expires=timedelta())
cache[url] = result
time.sleep(60)
print(cache[url])
def main():
    """Run ``link_crawler`` from the AlexaCallback seed URL with an emptied MongoCache.

    The crawler is invoked with the ``'GoodCrawler'`` user agent and with
    ``ignore_robots`` switched on.
    """
    callback = AlexaCallback()

    # Discard any previously cached content first.
    mongo_cache = MongoCache()
    mongo_cache.clear()

    seed = callback.seed_url
    link_crawler(
        seed,
        scrape_callback=callback,
        cache=mongo_cache,
        user_agent='GoodCrawler',
        ignore_robots=True,
    )
else: stop = 1 # 该链表页下的所有详情页为空, 不再增加链表页 if 'top250' in url and stop == 0: page_size += 25 next_link = form_url.format(page_size) if next_link not in seen: seen.add(next_link) crawl_queue.append(next_link) # 等待所有的下载线程结束 threads = [] while threads or crawl_queue: for thread in threads: if not thread.is_alive(): # 移除已经停止的进程 threads.remove(thread) while len(threads) < max_threads and crawl_queue: # 开始更多的线程 thread = threading.Thread(target=process_queue) thread.setDaemon(True) thread.start() threads.append(thread) time.sleep(np.random.randint(6, 12)) if __name__ == '__main__': Scrape_Back = GetDetailInfo Cache = MongoCache() Cache.clear() threaded_crawler(scrape_callback=Scrape_Back, cache=Cache)
def test_result_is_not_none(self):
    """Empty a new MongoCache, then assert that ``self.result['html']`` is not None."""
    # The cache instance is only needed long enough to clear it.
    MongoCache().clear()

    html = self.result['html']
    self.assertIsNotNone(html)
class TestCache(unittest.TestCase):
    """Exercise the ``MongoCache`` API: get/set/add/delete, inc/dec, has and the bulk helpers.

    Every test gets its own ``MongoCache`` instance and the backing
    collection is emptied afterwards, so keys never leak between tests.
    NOTE(review): these tests talk to ``self.cache.collection`` directly and
    presumably need a reachable MongoDB instance - confirm.
    """

    def setUp(self):
        # NOTE(review): default_timeout=0 presumably means "no expiry" - confirm
        # against MongoCache.
        self.cache = MongoCache(default_timeout=0)

    def tearDown(self):
        # Remove every document so the next test starts from an empty collection.
        self.cache.collection.delete_many({})

    def _count(self, query):
        """Return the number of documents in the cache collection matching *query*.

        ``Collection.count`` was deprecated in PyMongo 3.7 and removed in 4.0,
        so ``count_documents`` is preferred when the collection class has it.
        The check is made on the class because (assuming a PyMongo collection)
        unknown instance attributes resolve to sub-collections, which would
        make a plain ``hasattr`` on the instance always succeed.
        """
        collection = self.cache.collection
        if hasattr(type(collection), 'count_documents'):
            return collection.count_documents(query)
        return collection.count(query)

    def test_get(self):
        x = MockData(1)
        self.cache.set('key-1', x)
        xc = self.cache.get('key-1')
        self.assertEqual(x, xc)

    def test_delete_existing(self):
        x = MockData(1)
        self.cache.set('key-1', x)
        self.assertTrue(self.cache.delete('key-1'))

    def test_delete_not_existing(self):
        self.assertFalse(self.cache.delete('key-1'))

    def test_set(self):
        x = MockData(1)
        self.cache.set('key-1', x)
        xc = self.cache.get('key-1')
        self.assertEqual(x, xc)

    def test_add_not_existing(self):
        x = MockData(1)
        added = self.cache.add('key-1', x)
        self.assertTrue(added)

    def test_add_existing(self):
        # add() must refuse to overwrite a key that set() already stored.
        x = MockData(1)
        self.cache.set('key-1', x)
        y = MockData(2)
        added = self.cache.add('key-1', y)
        self.assertFalse(added)

    def test_clear(self):
        x = MockData(1)
        self.cache.set('key-1', x)
        cleared = self.cache.clear()
        xc = self.cache.get('key-1')
        self.assertTrue(cleared)
        self.assertIsNone(xc)

    def test_set_overwrite(self):
        # Setting the same key twice must leave exactly one document behind.
        x1 = MockData(1)
        key = 'key-set-overwrite'
        self.cache.set(key, x1)
        x2 = MockData(2)
        self.cache.set(key, x2)
        _filter = {'_id': key}
        count_keys = self._count(_filter)
        self.assertEqual(1, count_keys)

    def test_inc_with_exist_key(self):
        value = 10
        key = 'key-inc-with-exist-key'
        self.cache.set(key, value)
        delta = 9
        new_value = self.cache.inc(key, delta)
        value_cache = self.cache.get(key)
        result = delta + value
        self.assertEqual(result, value_cache)
        self.assertEqual(result, new_value)

    def test_inc_witho_exist_key(self):
        # Incrementing a missing key is expected to yield the delta itself.
        key = 'key-inc-without-exist-key'
        delta = 9
        new_value = self.cache.inc(key, delta)
        value_cache = self.cache.get(key)
        self.assertEqual(delta, value_cache)
        self.assertEqual(delta, new_value)

    def test_inc_with_error(self):
        # A non-numeric value must be left untouched and inc() must return None.
        value = MockData(1)
        key = 'key-inc-with-error'
        self.cache.add(key, value)
        delta = 9
        new_value = self.cache.inc(key, delta)
        value_cache = self.cache.get(key)
        self.assertEqual(value, value_cache)
        self.assertIsNone(new_value)

    def test_has_with_add_key(self):
        key = 'key-has-with-add-key'
        value = MockData(1)
        self.cache.add(key, value)
        has_key = self.cache.has(key)
        self.assertTrue(has_key)

    def test_has_without_add_key(self):
        key = 'key-has-without-add-key'
        has_key = self.cache.has(key)
        self.assertFalse(has_key)

    def test_dec_with_exist_key(self):
        value = 10
        key = 'key-dec-with-exist-key'
        self.cache.set(key, value)
        delta = 9
        new_value = self.cache.dec(key, delta)
        value_cache = self.cache.get(key)
        result = value - delta
        self.assertEqual(result, value_cache)
        self.assertEqual(result, new_value)

    def test_dec_witho_exist_key(self):
        # Decrementing a missing key is expected to yield the negated delta.
        key = 'key-dec-without-exist-key'
        delta = 9
        new_value = self.cache.dec(key, delta)
        value_cache = self.cache.get(key)
        self.assertEqual(-delta, value_cache)
        self.assertEqual(-delta, new_value)

    def test_dec_with_error(self):
        # A non-numeric value must be left untouched and dec() must return None.
        value = MockData(1)
        key = 'key-dec-with-error'
        self.cache.add(key, value)
        delta = 9
        new_value = self.cache.dec(key, delta)
        value_cache = self.cache.get(key)
        self.assertEqual(value, value_cache)
        self.assertIsNone(new_value)

    def test_get_many(self):
        key_x_value = {'key-%s' % i: MockData(i) for i in range(1, 11)}
        for key, value in key_x_value.items():
            self.cache.add(key, value)
        values = self.cache.get_many(*key_x_value.keys())
        self.assertEqual(10, len(values))
        # Results are compared positionally with the order the keys were passed in.
        for _return, _value in zip(values, key_x_value.values()):
            self.assertEqual(_value, _return)

    def test_get_dict(self):
        key_x_value = {'key-%s' % i: MockData(i) for i in range(1, 6)}
        for key, value in key_x_value.items():
            self.cache.add(key, value)
        results = self.cache.get_dict(*key_x_value.keys())
        self.assertIsInstance(results, dict)
        for key, value in key_x_value.items():
            self.assertIn(key, results)
            self.assertEqual(value, results[key])

    def test_delete_many(self):
        key_x_value = {'key-%s' % i: MockData(i) for i in range(1, 6)}
        for key, value in key_x_value.items():
            self.cache.add(key, value)
        # A dict view cannot be BSON-encoded on Python 3; hand the driver a list.
        keys = list(key_x_value)
        self.assertEqual(5, self._count({'_id': {'$in': keys}}))
        result = self.cache.delete_many(*keys)
        self.assertTrue(result)
        self.assertEqual(0, self._count({'_id': {'$in': keys}}))

    def test_set_many(self):
        key_x_value = {'key-set-many-%s' % i: MockData(i) for i in range(1, 6)}
        result = self.cache.set_many(key_x_value)
        self.assertTrue(result)
        # A dict view cannot be BSON-encoded on Python 3; hand the driver a list.
        keys = list(key_x_value)
        self.assertEqual(5, self._count({'_id': {'$in': keys}}))