def get(self, url):
    ip = self.request.remote_addr
    # To prevent abuse, only one request per minute is served for each IP address
    request = Request.gql("WHERE ip = :1", ip).get()  # Look for a previous request from the same IP address
    if request is not None:
        delta = request.is_allowed()
        if delta > 0:
            # Too little time has passed since the previous request
            self.response.set_status(408, "Your IP address has issued a request less than 1 min ago. Please wait %d seconds" % delta)
            return
    else:
        request = Request(ip=ip, page_crawled=url)
        request.save()

    self.response.headers['Content-Type'] = 'application/json'
    handler = CrawlerHandler()
    site_image = memcache.get(url)
    if site_image is None:
        # The last argument introduces a small delay between requests (one hundredth of a second)
        home_page = handler.start_crawling(url, MAX_PAGE_DEPTH, MAX_PAGES_TO_CRAWL, 0.01)
        if home_page is None:
            self.error(400)  # Bad Request
            return
        else:
            site_image = handler.page_graph(home_page)
            memcache.set(url, site_image)
    self.__responde(site_image)
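# A minimal sketch of the Request model the handler above relies on; its real
# definition is not shown in this listing, so the fields, the is_allowed()
# details and the 60-second window below are assumptions made for illustration.
from datetime import datetime
from google.appengine.ext import db

MIN_INTERVAL = 60  # hypothetical constant: the "one request per minute" window, in seconds

class Request(db.Model):
    ip = db.StringProperty(required=True)
    page_crawled = db.StringProperty()
    timestamp = db.DateTimeProperty(auto_now_add=True)

    def is_allowed(self):
        # Returns how many seconds the caller still has to wait:
        # a value <= 0 means enough time has passed and the request can be served.
        elapsed = (datetime.now() - self.timestamp).total_seconds()
        return int(MIN_INTERVAL - elapsed)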
def test_list_resources(url, threads=1, max_page_depth=None, max_pages_to_crawl=None):
    handler = CrawlerHandler()
    home_page = handler.start_crawling(url, threads, max_page_depth, max_pages_to_crawl, 0)
    # Look for a page that doesn't exist
    resources = handler.list_resources(home_page + str(random()))
    for s in resources.values():
        assert len(s) == 0
    # Look for a page that DOES exist
    resources = handler.list_resources(home_page)
    # At least some resource should be found
    assert len(reduce(lambda s1, s2: s1 | s2, resources.values())) > 0
def test_page_graph(url, threads=1, max_page_depth=None, max_pages_to_crawl=None):
    handler = CrawlerHandler()
    home_page = handler.start_crawling(url, threads, max_page_depth, max_pages_to_crawl, 0)
    # Look for a page that doesn't exist
    pages_set = handler.page_graph(home_page + str(random()))
    assert len(pages_set) == 0
    # Look for a page that DOES exist
    pages_set_1 = handler.page_graph(home_page)
    pages_set_2 = handler.page_graph()
    assert pages_set_1 == pages_set_2
    return pages_set_2
def test():
    handler = CrawlerHandler()
    # start_crawling should return None for a URL it cannot fetch
    assert handler.start_crawling("www.news.ycombinator.com", 30, None, 20, 0) is None
    print test_crawler("http://repubblica.it", 30, None, 20)

    path = "/%s/tests" % os.getcwd().replace("\\", "/")

    # Test cyclic reference between 2 documents
    site_resources = test_crawler(urlunsplit(("file", path, "test_1.html", '', '')), 3)
    # Asserts on content
    assert "img_2.jpg" in site_resources["images"]
    assert "http://mysite.me/img_1.jpg" in site_resources["images"]
    assert "/test.js" in site_resources["scripts"]

    # Test cyclic reference between 3 or more documents
    site_resources = test_crawler(urlunsplit(("file", path, "test_B.html", '', '')), 2)
    # Asserts on content
    assert "img_2.jpg" in site_resources["images"]
    assert "http://mysite.me/img_1.jpg" in site_resources["images"]
    assert "/test.js" in site_resources["scripts"]
    assert "img_3.jpg" in site_resources["images"]

    url_B = urlunsplit(("file", path, "test_B.html", '', ''))
    test_list_resources(url_B, 2)
    print test_page_graph(url_B, 2, 1)
    print test_page_graph(url_B, 2)

    graph = test_page_graph(url_B, 5)
    url1 = urlunsplit(("file", path, "test_1.html", '', ''))
    url2 = urlunsplit(("file", path, "test_1_copy.html", '', ''))
    # Two copies of the same page should produce identical graph entries
    assert graph[url1] == graph[url2]
    assert len(graph[url_B]["resources"]["videos"]) == 2
    print graph[url_B]["resources"]["audios"]
def test_crawler(url, threads=1, max_page_depth=None, max_pages_to_crawl=None):
    handler = CrawlerHandler()
    handler.start_crawling(url, threads, max_page_depth, max_pages_to_crawl, 0)
    return handler.list_resources()
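# Hypothetical entry point, not part of the original listing: it simply runs
# the full suite in test() when the test module is executed directly.
if __name__ == '__main__':
    test()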