Beispiel #1
0
    def _get_links(self, elements, attribute, base_url_split,
                   original_url_split):
        """Build a Link for every element whose attribute holds a usable URL.

        Elements that do not carry ``attribute`` are skipped, as are values
        rejected by ``is_link`` and resolved URLs rejected by
        ``is_supported_scheme``. Unless the worker runs in strict mode, the
        raw attribute value is stripped before it is checked.

        :param elements: iterable of elements exposing ``attrs``, item
            access and ``name``.
        :param attribute: name of the attribute that carries the URL.
        :param base_url_split: base that the raw URL is resolved against.
        :param original_url_split: stored unchanged on every Link created.
        :return: list of Link objects, in the order the elements were seen.
        """
        collected = []
        for node in elements:
            if attribute not in node.attrs:
                continue

            raw_url = node[attribute]
            if not self.worker_config.strict_mode:
                raw_url = raw_url.strip()

            if not is_link(raw_url):
                continue

            resolved = get_absolute_url_split(raw_url, base_url_split)
            if is_supported_scheme(
                    resolved, self.worker_config.ignore_bad_tel_urls):
                collected.append(Link(
                    type=unicode(node.name),
                    url_split=resolved,
                    original_url_split=original_url_split,
                    source_str=unicode(node)))

        return collected
Beispiel #2
0
 def test_get_absolute_url(self):
     """Check how several kinds of references resolve against a page URL."""
     page_split = get_clean_url_split(
         "https://www.example.com/hello/index.html")

     # Each pair is (expected absolute URL, reference to resolve).
     cases = (
         ("https://www.example2.com/test.js", "//www.example2.com/test.js"),
         ("https://www.example.com/hello2/test.html", "/hello2/test.html"),
         ("https://www.example.com/hello/test.html", "test.html"),
         ("https://www.example.com/test.html", "../test.html"),
     )
     for expected, reference in cases:
         resolved = get_absolute_url_split(reference, page_split)
         self.assertEqual(expected, resolved.geturl())
Beispiel #3
0
 def test_get_absolute_url(self):
     """Resolve four styles of reference against one base URL."""
     base = get_clean_url_split("https://www.example.com/hello/index.html")

     def absolute(reference):
         # Resolve against the shared base and return the URL as a string.
         return get_absolute_url_split(reference, base).geturl()

     # Scheme-relative reference: the host changes, the scheme is kept.
     self.assertEqual(
         "https://www.example2.com/test.js",
         absolute("//www.example2.com/test.js"))

     # Reference that starts at the site root.
     self.assertEqual(
         "https://www.example.com/hello2/test.html",
         absolute("/hello2/test.html"))

     # Bare file name: resolved next to the base document.
     self.assertEqual(
         "https://www.example.com/hello/test.html",
         absolute("test.html"))

     # Parent-directory reference.
     self.assertEqual(
         "https://www.example.com/test.html",
         absolute("../test.html"))
Beispiel #4
0
 def _add_urls_from_single_content_check(self, start_urls,
                                         single_content_check):
     """Extend ``start_urls`` in place with the URLs keyed in a content check.

     Keys equal to ``PREFIX_ALL`` are ignored. A key that has a network
     location is appended as-is when it is not already listed. Any other
     key is resolved against every start URL known at that point, and each
     result that is not already listed is appended.

     :param start_urls: list of URL splits; mutated in place.
     :param single_content_check: mapping whose keys are ``PREFIX_ALL`` or
         URL splits exposing ``netloc`` and ``geturl()``.
     :return: None.
     """
     for key in single_content_check:
         if key == PREFIX_ALL:
             continue

         if key.netloc:
             # Absolute key: add it once. An already-listed absolute key
             # must not fall through to the relative-resolution branch.
             if key not in start_urls:
                 start_urls.append(key)
             continue

         relative_url = key.geturl()
         # Iterate over a snapshot. Appending to the list being iterated
         # would feed every freshly derived URL back in as a new base for
         # the same key, which can keep growing the list without end for
         # multi-segment relative keys.
         for url_split in list(start_urls):
             new_url = get_absolute_url_split(relative_url, url_split)
             if new_url not in start_urls:
                 start_urls.append(new_url)
Beispiel #5
0
    def _get_links(self, elements, attribute, base_url_split,
                   original_url_split):
        """Collect Link objects for elements whose attribute is a usable URL.

        An element contributes a Link only when it carries ``attribute``,
        the value passes ``is_link``, and the scheme of the resolved URL is
        in ``SUPPORTED_SCHEMES``. Outside strict mode the raw value is
        stripped before being checked.

        :param elements: iterable of elements exposing ``attrs``, item
            access and ``name``.
        :param attribute: name of the attribute that carries the URL.
        :param base_url_split: base that the raw URL is resolved against.
        :param original_url_split: stored unchanged on every Link created.
        :return: list of Link objects, in the order the elements were seen.
        """
        results = []
        for tag in elements:
            if attribute not in tag.attrs:
                continue

            candidate = tag[attribute]
            if not self.worker_config.strict_mode:
                candidate = candidate.strip()

            if not is_link(candidate):
                continue

            target_split = get_absolute_url_split(candidate, base_url_split)
            if target_split.scheme in SUPPORTED_SCHEMES:
                results.append(Link(
                    type=unicode(tag.name),
                    url_split=target_split,
                    original_url_split=original_url_split,
                    source_str=unicode(tag)))

        return results