Exemple #1
0
 def test_get_absolute_url(self):
     """Check that URL references resolve correctly against a base page URL.

     The base is ``https://www.example.com/hello/index.html``. The cases
     cover, in order: a scheme-relative reference, a root-relative path,
     a sibling-relative path and a parent-relative path.
     """
     base = get_clean_url_split("https://www.example.com/hello/index.html")

     # (expected absolute URL, reference to resolve against the base)
     cases = (
         ("https://www.example2.com/test.js", "//www.example2.com/test.js"),
         ("https://www.example.com/hello2/test.html", "/hello2/test.html"),
         ("https://www.example.com/hello/test.html", "test.html"),
         ("https://www.example.com/test.html", "../test.html"),
     )

     for expected, reference in cases:
         resolved = get_absolute_url_split(reference, base)
         self.assertEqual(expected, resolved.geturl())
Exemple #2
0
 def test_get_absolute_url(self):
     """Check that URL references resolve correctly against a base page URL.

     The base is ``https://www.example.com/hello/index.html``. The cases
     cover, in order: a scheme-relative reference, a root-relative path,
     a sibling-relative path and a parent-relative path.
     """
     base = get_clean_url_split("https://www.example.com/hello/index.html")

     # (expected absolute URL, reference to resolve against the base)
     cases = (
         ("https://www.example2.com/test.js", "//www.example2.com/test.js"),
         ("https://www.example.com/hello2/test.html", "/hello2/test.html"),
         ("https://www.example.com/hello/test.html", "test.html"),
         ("https://www.example.com/test.html", "../test.html"),
     )

     for expected, reference in cases:
         resolved = get_absolute_url_split(reference, base)
         self.assertEqual(expected, resolved.geturl())
    def _get_links(self, elements, attribute, base_url_split,
                   original_url_split):
        """Build ``Link`` objects from the elements carrying ``attribute``.

        For each element that has ``attribute``, the attribute value is
        checked with ``is_link``, resolved against ``base_url_split`` and
        kept only when the resulting scheme is in ``SUPPORTED_SCHEMES``.

        :param elements: iterable of elements exposing ``attrs``, ``name``
            and item access (presumably BeautifulSoup tags -- confirm
            against the caller).
        :param attribute: name of the attribute holding the URL.
        :param base_url_split: split URL used to resolve relative values.
        :param original_url_split: passed through unchanged to each
            ``Link`` as ``original_url_split``.
        :returns: list of ``Link`` objects, in element order.
        """
        collected = []

        for tag in elements:
            # Elements lacking the attribute contribute nothing.
            if attribute not in tag.attrs:
                continue

            raw_url = tag[attribute]
            if not is_link(raw_url):
                continue

            resolved = get_absolute_url_split(raw_url, base_url_split)
            if resolved.scheme not in SUPPORTED_SCHEMES:
                continue

            collected.append(Link(
                type=unicode(tag.name),
                url_split=resolved,
                original_url_split=original_url_split,
                source_str=unicode(tag)))

        return collected
    def _get_links(self, elements, attribute, base_url_split,
                   original_url_split):
        """Build ``Link`` objects from the elements carrying ``attribute``.

        For each element that has ``attribute``, the attribute value is
        optionally stripped of surrounding whitespace (only when
        ``self.worker_config.strict_mode`` is falsy), checked with
        ``is_link``, resolved against ``base_url_split`` and kept only when
        the resulting scheme is in ``SUPPORTED_SCHEMES``.

        :param elements: iterable of elements exposing ``attrs``, ``name``
            and item access (presumably BeautifulSoup tags -- confirm
            against the caller).
        :param attribute: name of the attribute holding the URL.
        :param base_url_split: split URL used to resolve relative values.
        :param original_url_split: passed through unchanged to each
            ``Link`` as ``original_url_split``.
        :returns: list of ``Link`` objects, in element order.
        """
        collected = []

        for tag in elements:
            # Elements lacking the attribute contribute nothing.
            if attribute not in tag.attrs:
                continue

            raw_url = tag[attribute]

            # Outside strict mode, tolerate stray whitespace around the URL.
            if not self.worker_config.strict_mode:
                raw_url = raw_url.strip()

            if not is_link(raw_url):
                continue

            resolved = get_absolute_url_split(raw_url, base_url_split)
            if resolved.scheme not in SUPPORTED_SCHEMES:
                continue

            collected.append(Link(
                type=unicode(tag.name),
                url_split=resolved,
                original_url_split=original_url_split,
                source_str=unicode(tag)))

        return collected