def analyze_url(self, page: Page):
    self.seen_urls.add(page)
    # Only follow pages whose domain is allowed.
    if not self.check_domains(str(page)):
        return
    html = self.get_html(page.url)
    if html is None:
        return
    # Respect robots.txt: skip pages the crawler may not visit.
    if self.analyze_robot(page.url):
        return
    # Stop expanding once the visit limit has been reached.
    if self.visited_urls_count >= self.max_count_urls:
        return
    self.visited_urls_count += 1
    parser = Parser(page.url)
    info = parser.get_info(html, str(page))
    # Record the page if it matches any requested term.
    if self.request.intersection(info) and page not in self.result_urls:
        self.result_urls.add(page)
        self.update_parents()
        if self.download:
            self.write_html(page, html)
    # Queue newly discovered links that have not been seen yet.
    found_links = set(parser.get_urls(html))
    for link in found_links.difference(self.seen_urls):
        if not link:
            continue
        # Normalize links ending in '/' by taking the parent URL,
        # so both forms map to the same Page.
        if str(link).endswith('/'):
            new_page = Page(link.parent)
        else:
            new_page = Page(link)
        self.urls.put(new_page)
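# For context, a minimal sketch of a loop that could drive analyze_url,
# assuming self.urls is a queue.Queue of Page objects seeded with a
# start URL. The method name `crawl` and the attribute `self.start_url`
# are assumptions, not taken from this code.
def crawl(self):
    self.urls.put(Page(self.start_url))
    while not self.urls.empty():
        page = self.urls.get()
        # analyze_url marks pages as seen, so skip duplicates here.
        if page not in self.seen_urls:
            self.analyze_url(page)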
def test_get_url_with_urls(self):
    # The test.html fixture is expected to contain exactly four links.
    with open('test.html', 'r') as test:
        text = test.read()
    parser = Parser(URL('https://t/'))
    self.assertEqual(len(parser.get_urls(text)), 4)
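# One way to keep the test self-contained is to generate the fixture in
# setUp. This is a sketch under the assumption that get_urls extracts
# hrefs from <a> tags; the exact markup Parser recognizes is not shown
# in this code.
def setUp(self):
    links = ''.join(
        f'<a href="https://t/page{i}">link {i}</a>' for i in range(4)
    )
    with open('test.html', 'w') as fixture:
        fixture.write(f'<html><body>{links}</body></html>')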