Example #1
0
 def analyze_url(self, page: Page):
     """Fetch, parse and classify a single page, queueing new links.

     Marks *page* as seen, then bails out early when the page is outside
     the allowed domains, unreachable, disallowed by robots.txt, or when
     the crawl budget (``max_count_urls``) is exhausted.  Otherwise the
     page counts against the budget; if its parsed info intersects the
     search ``request`` it is recorded in ``result_urls`` (and optionally
     written to disk), and every previously unseen outgoing link is put
     on the ``urls`` queue.

     :param page: the page to analyze (also used as the seen/result key)
     """
     self.seen_urls.add(page)
     if not self.check_domains(str(page)):
         return
     html = self.get_html(page.url)
     if html is None:
         return
     if self.analyze_robot(page.url):
         return
     # Guard clause: stop expanding the crawl once the budget is spent.
     if self.visited_urls_count >= self.max_count_urls:
         return
     self.visited_urls_count += 1
     parser = Parser(page.url)
     info = parser.get_info(html, str(page))
     # Record the page when it matches any requested term and is new.
     # (truthiness of the intersection replaces `len(...) != 0`)
     if self.request.intersection(info) and page not in self.result_urls:
         self.result_urls.add(page)
         self.update_parents()
         if self.download:
             self.write_html(page, html)
     # Queue outgoing links not seen before.  A URL with a trailing
     # slash is normalised to its parent before being queued.
     found_links = set(parser.get_urls(html))
     for link in found_links.difference(self.seen_urls):
         if not link:
             continue
         target = link.parent if str(link).endswith('/') else link
         self.urls.put(Page(target))
Example #2
0
 def test_get_url_with_urls(self):
     """Parser.get_urls should find all four links in the fixture page."""
     # Explicit encoding: the platform default (e.g. cp1252 on Windows)
     # could mis-decode the fixture.  Assumes test.html is UTF-8 —
     # TODO confirm against the fixture file.
     with open('test.html', 'r', encoding='utf-8') as fixture:
         text = fixture.read()
     parser = Parser(URL('https://t/'))
     self.assertEqual(len(parser.get_urls(text)), 4)