def crawler(self):
    # Read the CAS login credentials from the two GUI text fields
    login_url = 'https://cas.gzhu.edu.cn/cas_server/login'
    username = self.edit1.text()
    password = self.edit2.text()
    # Crawl the grades site, caching pages on disk, then quit the app
    grades_crawler(login_url, username=username, password=password,
                   scrape_callback=ScrapeCallback(), cache=DiskCache())
    sys.exit()
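grades_crawler and ScrapeCallback are defined elsewhere in the project and are not shown here. A minimal sketch of what such a callback could look like, assuming the crawler invokes it as callback(url, html) and that it writes rows to a CSV file (both are assumptions, not the project's actual code):

import csv

import lxml.html


class ScrapeCallback(object):
    """Hypothetical callback: assumes the crawler calls it with each
    page's URL and raw HTML, and appends extracted rows to a CSV file."""

    def __init__(self, filename='grades.csv'):
        self.outfile = open(filename, 'w')
        self.writer = csv.writer(self.outfile)

    def __call__(self, url, html):
        tree = lxml.html.fromstring(html)
        # Grab every table cell; the real callback presumably targets
        # the grade table specifically.
        row = [cell.text_content().strip() for cell in tree.xpath('//table//td')]
        if row:
            self.writer.writerow(row)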
class TestDiskCache(unittest.TestCase):
    """Test the DiskCache class
    """
    def setUp(self):
        self.obj = "I am the object to be saved"
        self.objid = "objid"
        self.cache = DiskCache(".")

    def tearDown(self):
        os.remove(self.objid)
        os.remove(self.objid + ".lock")

    def test_normal(self):
        """Exercise the full save/get/expiry cycle of DiskCache
        """
        # No cache file exists
        self.assertFalse(os.path.isfile(self.objid))

        # The cache is empty at the beginning
        cached_obj = self.cache.get(self.objid)
        self.assertIsNone(cached_obj)

        # Save the object and check the cache file exists now
        self.cache.save(self.objid, self.obj)
        self.assertTrue(os.path.isfile(self.objid))

        # Check we get the object from the cache
        cached_obj = self.cache.get(self.objid)
        self.assertEqual(self.obj, cached_obj)

        # Create a new cache (e.g., from another process) and verify we get the object from file
        cache = DiskCache(".")
        cached_obj = cache.get(self.objid)
        self.assertEqual(self.obj, cached_obj)

        # But from another directory we do not hit the cache
        new_location = tempfile.mkdtemp()
        cache = DiskCache(new_location)
        cached_obj = cache.get(self.objid)
        self.assertIsNone(cached_obj)
        os.rmdir(new_location)

        # And now what happens if we let the cache expire? We do not get the object!
        self.cache.cache_duration = 2  # 2 seconds
        time.sleep(2)
        cached_obj = self.cache.get(self.objid)
        self.assertIsNone(cached_obj)

        # But if we save it again it works (assuming this takes less than
        # 2 seconds to execute: we are not running on potatoes)
        self.cache.save(self.objid, self.obj)
        cached_obj = self.cache.get(self.objid)
        self.assertEqual(self.obj, cached_obj)
        # It also works with another cache instance whose memory cache is empty
        cache = DiskCache(".")
        cached_obj = cache.get(self.objid)
        self.assertEqual(self.obj, cached_obj)
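
The DiskCache class itself is not shown in any of these examples. Below is a minimal sketch that would satisfy the behaviour the test above exercises (a get/save API, an in-memory layer, an adjustable cache_duration, and a companion .lock file, which tearDown removes); treat it as an illustration, not the project's implementation:

import os
import pickle
import time


class DiskCache(object):
    """Sketch only: pickle objects to files under `directory`,
    with an in-memory layer and optional expiry."""

    def __init__(self, directory='.', cache_duration=None):
        self.directory = directory
        self.cache_duration = cache_duration  # seconds; None means never expire
        self.memory_cache = {}  # objid -> (timestamp, obj)

    def _path(self, objid):
        return os.path.join(self.directory, objid)

    def save(self, objid, obj):
        entry = (time.time(), obj)
        self.memory_cache[objid] = entry
        # Empty companion .lock file, inferred from tearDown() above;
        # a real implementation would use it for inter-process locking.
        open(self._path(objid) + '.lock', 'w').close()
        with open(self._path(objid), 'wb') as f:
            pickle.dump(entry, f)

    def get(self, objid):
        entry = self.memory_cache.get(objid)
        if entry is None and os.path.isfile(self._path(objid)):
            with open(self._path(objid), 'rb') as f:
                entry = pickle.load(f)
            self.memory_cache[objid] = entry
        if entry is None:
            return None
        timestamp, obj = entry
        if self.cache_duration is not None and time.time() - timestamp > self.cache_duration:
            return None  # expired
        return obj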
Example #3
def main(max_threads):
    scrape_callback = AlexaCallback()
    cache = DiskCache()
    # cache.clear()
    threaded_crawler(scrape_callback.seed_url,
                     scrape_callback=scrape_callback,
                     cache=cache,
                     max_threads=max_threads,
                     timeout=10)
    # --- from inside threaded_crawler itself (partially reconstructed) ---
    # Each worker pops a URL, fetches it, and queues unseen links returned
    # by the scrape callback. `download` stands in for the project's
    # page-fetching helper, which the extracted fragment did not include.
    def process_queue():
        while True:
            try:
                url = crawl_queue.pop()
            except IndexError:
                break  # queue emptied by another worker
            html = download(url)
            for link in scrape_callback(url, html) or []:
                if link not in seen:
                    seen.add(link)
                    crawl_queue.append(link)

    threads = []
    while threads or crawl_queue:
        # Reap workers that have finished
        for thread in threads:
            if not thread.is_alive():
                threads.remove(thread)
        # Top the pool back up while work remains
        while len(threads) < max_threads and crawl_queue:
            thread = threading.Thread(target=process_queue)
            thread.setDaemon(True)  # workers must not block interpreter exit
            thread.start()
            threads.append(thread)
        time.sleep(SLEEP_TIME)


if __name__ == "__main__":
    # link_crawler('http://example.webscraping.com', '/(index|view)', delay=0, num_retries=1, user_agent='BadCrawler')
    import time
    start = time.time()
    link_crawler('http://example.webscraping.com',
                 '/(index|view)',
                 delay=1,
                 num_retries=5,
                 max_depth=3,
                 user_agent='Baidu',
                 cache=DiskCache(),
                 max_urls=10)
    end = time.time()
    print "* Elapsed time: %3.2f seconds" % (end - start)
Example #6
def main():
    crawler = NoCrawler(cache=MongoCache(disk_cache=DiskCache(), db_name='video'))
    crawler.main()
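
NoCrawler and MongoCache are project-specific and not shown. A rough sketch of a MongoDB-backed cache with a DiskCache fallback, assuming the same get/save interface as above (only the constructor arguments disk_cache and db_name come from the example; the items collection and the method names are assumptions):

import pickle

from pymongo import MongoClient


class MongoCache(object):
    """Sketch: cache pickled records in MongoDB, falling back to a
    DiskCache. Only the constructor signature is taken from Example #6."""

    def __init__(self, disk_cache=None, db_name='cache', client=None):
        self.disk_cache = disk_cache
        self.client = client or MongoClient()
        self.db = self.client[db_name]

    def get(self, objid):
        record = self.db.items.find_one({'_id': objid})
        if record is not None:
            return pickle.loads(record['result'])
        if self.disk_cache is not None:
            return self.disk_cache.get(objid)  # fall back to disk
        return None

    def save(self, objid, obj):
        record = {'_id': objid, 'result': pickle.dumps(obj)}
        self.db.items.replace_one({'_id': objid}, record, upsert=True)
        if self.disk_cache is not None:
            self.disk_cache.save(objid, obj)  # keep the fallback in sync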