def crawl(self):
    # Breadth-first crawl: the queue holds (url, attempt, depth) tuples.
    while not self.queue.empty():
        (url, attempt, depth) = self.queue.get()
        print((url, attempt, depth))

        # Classify the URL before fetching it.
        if self.is_outer_url(url):
            self.outer_link_counter += 1
        elif self.is_subdomain_url(url):
            self.subdomain_set.add(self.get_subdomain_name(url))
        else:
            self.inner_link_counter += 1

        # Do not follow links found beyond the maximum depth.
        if depth >= self.max_depth:
            continue

        content = self.get_page(url)
        if not content:
            # Retry failed fetches up to max_attempts, then count the page as unavailable.
            if attempt >= self.max_attempts:
                self.error_counter += 1
                continue
            else:
                self.queue.put((url, attempt + 1, depth))
                continue

        # Extract links from the page and enqueue any URL not seen before.
        parser = HtmlParser(content)
        url_list = parser.get_links()
        for u in url_list:
            if len(u) < 1:
                continue
            u = self.make_full_link(u)
            if u not in self.reached_urls:
                self.reached_urls[u] = depth + 1
                self.queue.put((u, 0, depth + 1))

    # Report the results once the queue is exhausted.
    print(self.reached_urls)
    print(self.queue.qsize())
    print("Subdomains:", self.subdomain_set)
    print("Inner links count:", self.inner_link_counter)
    print("Outer links count:", self.outer_link_counter)
    print("Unavailable pages count:", self.error_counter)
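# A minimal sketch (an assumption, not the original code) of the state crawl()
# relies on: a queue.Queue of (url, attempt, depth) tuples seeded with the start
# page, a reached_urls dict mapping URL -> depth, the counters it increments,
# and the max_depth / max_attempts limits. The Crawler name and constructor
# signature are hypothetical; helpers such as get_page, is_outer_url,
# is_subdomain_url, get_subdomain_name and make_full_link are assumed to exist
# and are not sketched here.
import queue


class Crawler:
    def __init__(self, start_url, max_depth=2, max_attempts=3):
        self.start_url = start_url
        self.max_depth = max_depth
        self.max_attempts = max_attempts
        self.queue = queue.Queue()
        self.queue.put((start_url, 0, 0))   # (url, attempt, depth)
        self.reached_urls = {start_url: 0}  # url -> depth at which it was found
        self.subdomain_set = set()
        self.inner_link_counter = 0
        self.outer_link_counter = 0
        self.error_counter = 0

# Usage (hypothetical): Crawler("http://example.com/").crawl()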
def test_empty_page(self):
    parser = HtmlParser("")
    self.assertEqual(parser.get_links(), [])
    self.assertEqual(parser.get_text(), '')
def test_get_several_links(self):
    parser = HtmlParser(
        "<div><a href='http://abc.abc/'>text</a></div>"
        "<a href='http://def.def/'></a>"
        "<div><A HREF='q.html'>text</a></div>"
    )
    self.assertEqual(parser.get_links(), ['http://abc.abc/', 'http://def.def/', 'q.html'])
def test_one_link(self):
    parser = HtmlParser("<a href='http://abc.abc/'>text</a>")
    self.assertEqual(parser.get_links(), ['http://abc.abc/'])
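# A minimal sketch of an HtmlParser that would satisfy the tests above, built
# on the standard library's html.parser.HTMLParser. The method names
# (get_links, get_text) come from the tests; the internal structure here is an
# assumption, not the original implementation.
from html.parser import HTMLParser


class HtmlParser(HTMLParser):
    def __init__(self, content):
        super().__init__()
        self._links = []
        self._text_parts = []
        self.feed(content)

    def handle_starttag(self, tag, attrs):
        # Collect href values from <a> tags; HTMLParser lowercases tag and
        # attribute names, so '<A HREF=...>' is matched as well.
        if tag == "a":
            for name, value in attrs:
                if name == "href" and value:
                    self._links.append(value)

    def handle_data(self, data):
        self._text_parts.append(data)

    def get_links(self):
        return self._links

    def get_text(self):
        return "".join(self._text_parts)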