def crawler(self):
    # Read the CAS credentials from the two line edits, run the grades crawler, then quit
    login_url = 'https://cas.gzhu.edu.cn/cas_server/login'
    username = self.edit1.text()
    password = self.edit2.text()
    grades_crawler(login_url, username=username, password=password,
                   scrape_callback=ScrapeCallback(), cache=DiskCache())
    sys.exit()
class TestDiskCache(unittest.TestCase):
    """Test the DiskCache class."""

    def setUp(self):
        self.obj = "I am the object to be saved"
        self.objid = "objid"
        self.cache = DiskCache(".")

    def tearDown(self):
        os.remove(self.objid)
        os.remove(self.objid + ".lock")

    def test_normal(self):
        """Exercise the basic cache lifecycle."""
        # No cache file exists
        self.assertFalse(os.path.isfile(self.objid))

        # The cache is empty at the beginning
        cached_obj = self.cache.get(self.objid)
        self.assertIsNone(cached_obj)

        # Save the object and check that the cache file now exists
        self.cache.save(self.objid, self.obj)
        self.assertTrue(os.path.isfile(self.objid))

        # Check we get the object back from the cache
        cached_obj = self.cache.get(self.objid)
        self.assertEqual(self.obj, cached_obj)

        # Create a new cache (e.g., from another process) and verify we
        # still get the object, this time from the file
        cache = DiskCache(".")
        cached_obj = cache.get(self.objid)
        self.assertEqual(self.obj, cached_obj)

        # But from another directory we do not hit the cache
        new_location = tempfile.mkdtemp()
        cache = DiskCache(new_location)
        cached_obj = cache.get(self.objid)
        self.assertIsNone(cached_obj)
        os.rmdir(new_location)

        # And what happens if we let the cache expire? We do not get the object!
        self.cache.cache_duration = 2  # 2 seconds
        time.sleep(2)
        cached_obj = self.cache.get(self.objid)
        self.assertIsNone(cached_obj)

        # But if we save it again it works (assuming this takes less than
        # 2 seconds to execute: we are not running on potatoes)
        self.cache.save(self.objid, self.obj)
        cached_obj = self.cache.get(self.objid)
        self.assertEqual(self.obj, cached_obj)

        # It also works on another cache instance with an empty memory cache
        cache = DiskCache(".")
        cached_obj = cache.get(self.objid)
        self.assertEqual(self.obj, cached_obj)
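# The snippets in this section use a DiskCache that is not defined here. Below
# is a minimal, hypothetical sketch consistent with the behaviour exercised by
# TestDiskCache above; the pickle serialization, mtime-based expiry, and the
# dict used as the in-memory layer are assumptions, not the original code.
import os
import pickle
import time


class DiskCache:

    def __init__(self, location='.', cache_duration=30 * 24 * 60 * 60):
        self.location = location
        self.cache_duration = cache_duration  # seconds before an entry expires
        self.memory_cache = {}  # per-process fast path

    def _path(self, objid):
        return os.path.join(self.location, objid)

    def get(self, objid):
        path = self._path(objid)
        if not os.path.isfile(path):
            return None  # never saved in this directory
        if time.time() - os.path.getmtime(path) > self.cache_duration:
            return None  # expired entries behave like misses
        if objid in self.memory_cache:
            return self.memory_cache[objid]
        with open(path, 'rb') as f:
            obj = pickle.load(f)
        self.memory_cache[objid] = obj
        return obj

    def save(self, objid, obj):
        path = self._path(objid)
        # The .lock file mirrors the one removed in tearDown; a real
        # implementation would use it for cross-process locking.
        with open(path + '.lock', 'w'):
            pass
        with open(path, 'wb') as f:
            pickle.dump(obj, f)
        self.memory_cache[objid] = obj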
def main(max_threads):
    scrape_callback = AlexaCallback()
    cache = DiskCache()
    # cache.clear()
    threaded_crawler(scrape_callback.seed_url, scrape_callback=scrape_callback,
                     cache=cache, max_threads=max_threads, timeout=10)
            # ...inside process_queue: enqueue links we have not seen before
            if link not in seen:
                seen.add(link)
                crawl_queue.append(link)

    threads = []
    while threads or crawl_queue:
        # Drop finished threads; rebuilding the list avoids mutating it
        # while iterating, which would skip elements
        threads = [thread for thread in threads if thread.is_alive()]
        # Top the pool back up while there is still work queued
        while len(threads) < max_threads and crawl_queue:
            thread = threading.Thread(target=process_queue)
            thread.daemon = True  # do not let workers block interpreter exit
            thread.start()
            threads.append(thread)
        time.sleep(SLEEP_TIME)  # SLEEP_TIME: module-level poll interval in seconds


if __name__ == "__main__":
    # link_crawler('http://example.webscraping.com', '/(index|view)', delay=0,
    #              num_retries=1, user_agent='BadCrawler')
    start = time.time()
    link_crawler('http://example.webscraping.com', '/(index|view)', delay=1,
                 num_retries=5, max_depth=3, user_agent='Baidu',
                 cache=DiskCache(), max_urls=10)
    end = time.time()
    print("* Elapsed time: %3.2f seconds" % (end - start))
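# The loop above manages the worker pool, but the process_queue target it
# spawns is not shown in this section. A minimal sketch of such a worker,
# assuming a download() helper, a normalize() URL resolver, and the seed_url,
# crawl_queue, seen, and scrape_callback names from the enclosing
# threaded_crawler scope (all assumptions here):
def process_queue():
    while True:
        try:
            # list.pop() is atomic under the GIL, so no explicit lock is needed
            url = crawl_queue.pop()
        except IndexError:
            break  # queue drained: let this worker exit
        html = download(url)
        if scrape_callback:
            for link in scrape_callback(url, html) or []:
                link = normalize(seed_url, link)
                # same dedup step as in the fragment above
                if link not in seen:
                    seen.add(link)
                    crawl_queue.append(link)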
def main():
    crawler = NoCrawler(cache=MongoCache(disk_cache=DiskCache(), db_name='video'))
    crawler.main()
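# MongoCache is not defined in this section either. A hypothetical sketch of a
# two-tier cache that checks MongoDB first and falls back to the wrapped
# DiskCache, mirroring the get/save interface exercised by TestDiskCache; the
# collection layout and connection defaults are assumptions.
from pymongo import MongoClient


class MongoCache:

    def __init__(self, disk_cache=None, db_name='cache', client=None):
        self.client = client or MongoClient('localhost', 27017)
        self.db = self.client[db_name]
        self.disk_cache = disk_cache

    def get(self, key):
        record = self.db.cache.find_one({'_id': key})
        if record is not None:
            return record['value']
        if self.disk_cache is not None:
            return self.disk_cache.get(key)  # fall back to the disk tier
        return None

    def save(self, key, value):
        self.db.cache.replace_one({'_id': key}, {'_id': key, 'value': value},
                                  upsert=True)
        if self.disk_cache is not None:
            self.disk_cache.save(key, value)  # keep the disk tier in sync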