Example #1
    def extra_urls(self):
        # Extract URLs from tag attributes that may reference any kind of resource.
        # See http://htmlreference.io/
        for tag in self.soup.find_all(["area", "base", "link"], href=True):
            yield self.make_absolute(tag["href"])
        for tag in self.soup.find_all(
                ["audio", "embed", "img", "script", "source", "track", "video"],
                src=True):
            yield self.make_absolute(tag["src"])
        for tag in self.soup.find_all(["blockquote", "del", "ins", "q"],
                                      cite=True):
            yield self.make_absolute(tag["cite"])
        for tag in self.soup.find_all("object", data=True):
            yield self.make_absolute(tag["data"])
        for tag in self.soup.find_all("param",
                                      attrs={
                                          "name": "movie",
                                          "value": True
                                      }):
            yield self.make_absolute(tag["value"])
        for tag in self.soup.find_all(["img", "source"], srcset=True):
            for source_desc in tag["srcset"].split(","):
                url = source_desc.strip().split(" ")[0]
                if url:
                    yield self.make_absolute(url)

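        # Inline event handlers (onclick, onload, ...) may build URLs in JS code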
        for attribute in JS_EVENTS:
            for tag in self.soup.find_all(None, attrs={attribute: True}):
                for url in lamejs.LameJs(tag[attribute]).get_links():
                    yield self.make_absolute(url)

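        # Inline <script> blocks: let the lamejs parser pick out URL-looking strings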
        for script in self.soup.find_all("script", string=True):
            urls = lamejs.LameJs(script.string).get_links()

            # too many annoying false positives
            # candidates = re.findall(r'"([A-Za-z0-9_=#&%.+?/-]*)"', script.string)
            # candidates += re.findall(r"'([A-Za-z0-9_=#&%.+?/-]*)'", script.string)
            #
            # allowed_ext = [".php", ".asp", ".xml", ".js", ".json", ".jsp"]
            # for jstr in candidates:
            #     if "." in jstr and jstr not in COMMON_JS_STRINGS:
            #         for ext in allowed_ext:
            #             if ext in jstr:
            #                 urls.append(jstr)
            #                 break
            for url in urls:
                yield self.make_absolute(url)

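        # href/action attributes using the javascript: scheme also carry JS code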
        for tag in self.soup.find_all("a", href=JS_SCHEME_REGEX):
            for url in lamejs.LameJs(tag["href"].split(':', 1)[1]).get_links():
                yield self.make_absolute(url)

        for tag in self.soup.find_all("form", action=JS_SCHEME_REGEX):
            for url in lamejs.LameJs(tag["action"].split(':',
                                                         1)[1]).get_links():
                yield self.make_absolute(url)
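
A minimal, self-contained sketch of the attribute extraction above, runnable on its own. It keeps only the plain-attribute and srcset loops; the sample HTML, the base_url parameter and the standalone extra_urls signature are illustrative assumptions, and the JS-related parts (lamejs, JS_EVENTS, JS_SCHEME_REGEX) are left out:

    from urllib.parse import urljoin

    from bs4 import BeautifulSoup

    SAMPLE = """
    <link rel="stylesheet" href="style.css">
    <img src="logo.png" srcset="logo-2x.png 2x, logo-3x.png 3x">
    <blockquote cite="/quotes/1">...</blockquote>
    <object data="movie.swf"></object>
    """

    def extra_urls(html, base_url):
        soup = BeautifulSoup(html, "html.parser")
        for tag in soup.find_all(["area", "base", "link"], href=True):
            yield urljoin(base_url, tag["href"])
        for tag in soup.find_all(
                ["audio", "embed", "img", "script", "source", "track", "video"],
                src=True):
            yield urljoin(base_url, tag["src"])
        for tag in soup.find_all(["blockquote", "del", "ins", "q"], cite=True):
            yield urljoin(base_url, tag["cite"])
        for tag in soup.find_all("object", data=True):
            yield urljoin(base_url, tag["data"])
        for tag in soup.find_all(["img", "source"], srcset=True):
            for candidate in tag["srcset"].split(","):
                url = candidate.strip().split(" ")[0]
                if url:
                    yield urljoin(base_url, url)

    print(sorted(set(extra_urls(SAMPLE, "http://example.com/page"))))
    # ['http://example.com/logo-2x.png', 'http://example.com/logo-3x.png',
    #  'http://example.com/logo.png', 'http://example.com/movie.swf',
    #  'http://example.com/quotes/1', 'http://example.com/style.css']
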
Example #2
    def extract_links(self, page, request) -> List:
        swf_links = []
        js_links = []
        allowed_links = []

        new_requests = []

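        # Dispatch on the response MIME type: Flash, raw JavaScript, or text content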
        if "application/x-shockwave-flash" in page.type or request.file_ext == "swf":
            try:
                swf_links = swf.extract_links_from_swf(page.bytes)
            except Exception:
                pass
        elif "/x-javascript" in page.type or "/x-js" in page.type or "/javascript" in page.type:
            js_links = lamejs.LameJs(page.content).get_links()
            js_links += jsparser_angular.JsParserAngular(page.url, page.content).get_links()

        elif page.type.startswith(MIME_TEXT_TYPES):
            allowed_links.extend(filter(self._crawler.is_in_scope, page.links))
            allowed_links.extend(
                filter(self._crawler.is_in_scope,
                       page.js_redirections + page.html_redirections))

            for extra_url in filter(self._crawler.is_in_scope, page.extra_urls):
                parts = urlparse(extra_url)
                # CSS and JS URLs often carry useless parameters, like a version or a random
                # number meant to prevent browser caching, so exclude those extensions
                if parts.path.endswith(".css"):
                    continue

                if parts.path.endswith(".js") and parts.query:
                    # JS scripts are still worth processing, but strip their parameters
                    allowed_links.append(extra_url.split("?")[0])
                    continue

                allowed_links.append(extra_url)

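            # Forms found on the page become new requests; link depth resets for hostnames not crawled yet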
            for form in page.iter_forms():
                # TODO: apply bad_params filtering in form URLs
                if self._crawler.is_in_scope(form):
                    if form.hostname not in self._hostnames:
                        form.link_depth = 0
                    else:
                        form.link_depth = request.link_depth + 1

                    new_requests.append(form)

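        # URLs extracted from SWF or JS payloads must be made absolute before the scope check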
        for url in swf_links + js_links:
            if url:
                url = page.make_absolute(url)
                if url and self._crawler.is_in_scope(url):
                    allowed_links.append(url)

        # Iterate over a snapshot: the loop appends parameter-less variants to the list
        for new_url in list(allowed_links):
            if "?" in new_url:
                path_only = new_url.split("?")[0]
                if path_only not in allowed_links and self._crawler.is_in_scope(path_only):
                    allowed_links.append(path_only)

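        # Turn each deduplicated URL into a Request with filtered GET parameters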
        for new_url in set(allowed_links):
            if new_url == "":
                continue

            if self.is_forbidden(new_url):
                continue

            if "?" in new_url:
                path, query_string = new_url.split("?", 1)
                # TODO: encoding parameter ?
                get_params = [
                    list(param) for param in web.parse_qsl(query_string)
                    if param[0] not in self._bad_params
                ]
            elif new_url.endswith(EXCLUDED_MEDIA_EXTENSIONS):
                # exclude static media files
                continue
            else:
                path = new_url
                get_params = []

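            # A directory redirection (e.g. /dir -> /dir/) should not cost an extra depth level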
            if page.is_directory_redirection and new_url == page.redirection_url:
                depth = request.link_depth
            else:
                depth = request.link_depth + 1

            new_requests.append(web.Request(path, get_params=get_params, link_depth=depth))

        return new_requests
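
The parameter handling above reduces to: split off the query string, drop blacklisted parameter names, and keep the remainder as get_params pairs. A standalone sketch of that step, assuming the standard-library parse_qsl behaves like the web.parse_qsl helper used here, with BAD_PARAMS as a hypothetical stand-in for self._bad_params:

    from urllib.parse import parse_qsl

    # Hypothetical stand-in for self._bad_params
    BAD_PARAMS = {"utm_source", "utm_medium", "PHPSESSID"}

    def split_request(new_url):
        """Return (path, get_params) with blacklisted parameters removed."""
        if "?" not in new_url:
            return new_url, []
        path, query_string = new_url.split("?", 1)
        get_params = [
            list(param) for param in parse_qsl(query_string)
            if param[0] not in BAD_PARAMS
        ]
        return path, get_params

    print(split_request("http://example.com/page.php?id=3&utm_source=feed"))
    # ('http://example.com/page.php', [['id', '3']])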