Example #1
    def crawl(self):

        while (self.url_queue
               and len(self.discovered) <= self.MAX_LINKS_TO_VISIT):
            url = self.url_queue.popleft()

            if 'DEBUG' in os.environ:
                print "Queue Size:", len(self.url_queue)
                print "Fetching: ", url

            webpage = WebPage(url)
            self.unvisited[url] = False

            all_links = webpage.get_anchors(False)  # False: don't keep fragments
            all_assets = webpage.get_assets()
            self.assets.append({'url': url, 'assets': all_assets})

            for link in all_links:

                # if belongs to same domain & is not already discovered
                if (self.same_domain_rule.matches(link)
                        and self.discovered[link.geturl()] is None):
                    self.discovered[link.geturl()] = True
                    # process if not already in the queue
                    if self.unvisited[link.geturl()] is None:
                        self.url_queue.append(link.geturl())
                        self.unvisited[link.geturl()] = True
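
Note that lookups such as self.discovered[link.geturl()] is None only succeed if the bookkeeping maps return None for unseen keys; the method also relies on an os import, a WebPage helper, and a queue populated elsewhere. Below is a minimal setup sketch under those assumptions; the Crawler constructor, the MAX_LINKS_TO_VISIT value, and the SameDomainRule helper are hypothetical, inferred only from the attribute names used above.

# Minimal setup sketch for the crawl() method above (names are assumptions,
# not part of the original code).
from collections import defaultdict, deque
from urllib.parse import urlparse


class SameDomainRule:
    """Matches parsed links whose host equals the start URL's host (assumed)."""

    def __init__(self, start_url):
        self.netloc = urlparse(start_url).netloc

    def matches(self, link):
        # crawl() passes parsed link objects exposing geturl(), so a
        # urlparse() result with a .netloc attribute is assumed here.
        return link.netloc == self.netloc


class Crawler:
    MAX_LINKS_TO_VISIT = 100  # assumed cap; the real value is not shown

    def __init__(self, start_url):
        self.url_queue = deque([start_url])
        # defaultdict(lambda: None) lets crawl() probe membership with
        # "is None" without raising KeyError for unseen URLs.
        self.discovered = defaultdict(lambda: None)
        self.unvisited = defaultdict(lambda: None)
        self.assets = []
        self.same_domain_rule = SameDomainRule(start_url)

    # the crawl() method from Example #1 would be defined here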
Example #2
    def test_file_links(self):
        self.start_server(TestWebPage.FILE_LINKS_HTML)
        webpage = WebPage(TestWebPage.SERVER)
        self.assertEqual(0, len(webpage.get_js()))
        self.assertEqual(0, len(webpage.get_stylesheets()))
        self.assertEqual(0, len(webpage.get_links()))
        self.assertEqual(2, len(webpage.get_anchors()))
        self.assertEqual(0, len(webpage.get_images()))
        self.assertEqual(2, len(webpage.get_files()))
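
The TestWebPage.FILE_LINKS_HTML fixture itself is not shown. A plausible shape, consistent with the assertions (two anchors that both count as downloadable files, and no scripts, stylesheets, link tags, or images), could look like the following; the file names are invented:

# Hypothetical fixture consistent with the assertions in test_file_links;
# the real TestWebPage.FILE_LINKS_HTML is not shown in the source.
FILE_LINKS_HTML = """
<html>
  <body>
    <a href="report.pdf">Report</a>
    <a href="archive.zip">Archive</a>
  </body>
</html>
"""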