def crawl(self):
    while len(self.url_queue) > 0 and len(self.discovered) <= self.MAX_LINKS_TO_VISIT:
        url = self.url_queue.popleft()
        if 'DEBUG' in os.environ:
            print("Queue Size: %d" % len(self.url_queue))
            print("Fetching: %s" % url)
        webpage = WebPage(url)
        self.unvisited[url] = False  # mark this URL as visited
        all_links = webpage.get_anchors(False)  # False: don't keep fragments
        all_assets = webpage.get_assets()
        self.assets.append({'url': url, 'assets': all_assets})
        for link in all_links:
            # follow the link only if it belongs to the same domain and has not been discovered yet
            if self.same_domain_rule.matches(link) and self.discovered.get(link.geturl()) is None:
                self.discovered[link.geturl()] = True
                # enqueue it only if it is not already waiting in the queue
                if self.unvisited.get(link.geturl()) is None:
                    self.url_queue.append(link.geturl())
                    self.unvisited[link.geturl()] = True
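For context, crawl() leans on state that is initialized elsewhere in the class: a deque-backed URL queue, the discovered and unvisited bookkeeping dicts, the assets list, and a same-domain rule object. The constructor is not shown here, so the following is only a minimal sketch of that setup under those assumptions; the SameDomainRule class name and the MAX_LINKS_TO_VISIT value are hypothetical.

import os
from collections import deque
from urllib.parse import urlparse


class SameDomainRule(object):
    """Hypothetical stand-in for the rule object used by crawl() above."""

    def __init__(self, start_url):
        self.netloc = urlparse(start_url).netloc

    def matches(self, link):
        # link is assumed to be a parsed URL, consistent with link.geturl() above
        return link.netloc == self.netloc


class Crawler(object):
    MAX_LINKS_TO_VISIT = 1000  # assumed cap on discovered links

    def __init__(self, start_url):
        self.url_queue = deque([start_url])   # BFS frontier of URLs still to fetch
        self.discovered = {start_url: True}   # URL -> True once seen; .get() is None otherwise
        self.unvisited = {start_url: True}    # URL -> True while queued, False once fetched
        self.assets = []                      # per-page asset listing built by crawl()
        self.same_domain_rule = SameDomainRule(start_url)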
def test_file_links(self):
    self.start_server(TestWebPage.FILE_LINKS_HTML)
    webpage = WebPage(TestWebPage.SERVER)
    self.assertEqual(0, len(webpage.get_js()))
    self.assertEqual(0, len(webpage.get_stylesheets()))
    self.assertEqual(0, len(webpage.get_links()))
    self.assertEqual(2, len(webpage.get_anchors()))
    self.assertEqual(0, len(webpage.get_images()))
    self.assertEqual(2, len(webpage.get_files()))
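The TestWebPage.FILE_LINKS_HTML fixture itself is not shown here. A hypothetical fixture that would satisfy these assertions, assuming get_files() reports anchors whose href points at a downloadable file, needs exactly two anchors to file URLs and no scripts, stylesheets, link tags, or images; the file names below are invented for illustration.

# Hypothetical fixture: two anchors pointing at files, nothing else on the page.
FILE_LINKS_HTML = """
<html>
  <body>
    <a href="/reports/summary.pdf">Summary (PDF)</a>
    <a href="/downloads/archive.zip">Archive (ZIP)</a>
  </body>
</html>
"""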