@property
def extra_urls(self):
    # Extract URLs from tag attributes that may reference any kind of resource.
    # Exposed as a property so callers can iterate page.extra_urls directly.
    # See http://htmlreference.io/
    for tag in self.soup.find_all(["area", "base", "link"], href=True):
        yield self.make_absolute(tag["href"])

    for tag in self.soup.find_all(
            ["audio", "embed", "img", "script", "source", "track", "video"], src=True):
        yield self.make_absolute(tag["src"])

    for tag in self.soup.find_all(["blockquote", "del", "ins", "q"], cite=True):
        yield self.make_absolute(tag["cite"])

    for tag in self.soup.find_all("object", data=True):
        yield self.make_absolute(tag["data"])

    for tag in self.soup.find_all("param", attrs={"name": "movie", "value": True}):
        yield self.make_absolute(tag["value"])

    # srcset holds comma-separated "URL [descriptor]" entries; the URL is the
    # first whitespace-separated token of each entry
    for tag in self.soup.find_all(["img", "source"], srcset=True):
        for source_desc in tag["srcset"].split(","):
            url = source_desc.strip().split(" ")[0]
            if url:
                yield self.make_absolute(url)

    for attribute in JS_EVENTS:
        for tag in self.soup.find_all(None, attrs={attribute: True}):
            for url in lamejs.LameJs(tag[attribute]).get_links():
                yield self.make_absolute(url)

    for script in self.soup.find_all("script", string=True):
        urls = lamejs.LameJs(script.string).get_links()

        # too many annoying false positives
        # candidates = re.findall(r'"([A-Za-z0-9_=#&%.+?/-]*)"', script.string)
        # candidates += re.findall(r"'([A-Za-z0-9_=#&%.+?/-]*)'", script.string)
        #
        # allowed_ext = [".php", ".asp", ".xml", ".js", ".json", ".jsp"]
        # for jstr in candidates:
        #     if "." in jstr and jstr not in COMMON_JS_STRINGS:
        #         for ext in allowed_ext:
        #             if ext in jstr:
        #                 urls.append(jstr)
        #                 break

        for url in urls:
            yield self.make_absolute(url)

    # javascript: scheme URLs in links and forms: parse the code after the colon
    for tag in self.soup.find_all("a", href=JS_SCHEME_REGEX):
        for url in lamejs.LameJs(tag["href"].split(":", 1)[1]).get_links():
            yield self.make_absolute(url)

    for tag in self.soup.find_all("form", action=JS_SCHEME_REGEX):
        for url in lamejs.LameJs(tag["action"].split(":", 1)[1]).get_links():
            yield self.make_absolute(url)
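
# A minimal, self-contained sketch of the srcset parsing rule used above, assuming
# only BeautifulSoup. The helper name and sample HTML below are illustrative, not
# part of the crawler: each comma-separated srcset entry is "URL [descriptor]",
# so the URL is the first whitespace-separated token.

def _demo_srcset_urls(html):
    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html, "html.parser")
    for tag in soup.find_all(["img", "source"], srcset=True):
        for source_desc in tag["srcset"].split(","):
            # keep only the URL token, drop the width/density descriptor
            url = source_desc.strip().split(" ")[0]
            if url:
                yield url

# list(_demo_srcset_urls('<img srcset="small.png 480w, large.png 1080w">'))
# -> ["small.png", "large.png"]
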
def extract_links(self, page, request) -> List:
    swf_links = []
    js_links = []
    allowed_links = []
    new_requests = []

    if "application/x-shockwave-flash" in page.type or request.file_ext == "swf":
        try:
            swf_links = swf.extract_links_from_swf(page.bytes)
        except Exception:
            pass
    elif "/x-javascript" in page.type or "/x-js" in page.type or "/javascript" in page.type:
        js_links = lamejs.LameJs(page.content).get_links()
        js_links += jsparser_angular.JsParserAngular(page.url, page.content).get_links()
    elif page.type.startswith(MIME_TEXT_TYPES):
        allowed_links.extend(filter(self._crawler.is_in_scope, page.links))
        allowed_links.extend(
            filter(self._crawler.is_in_scope, page.js_redirections + page.html_redirections))

        for extra_url in filter(self._crawler.is_in_scope, page.extra_urls):
            parts = urlparse(extra_url)
            # CSS and JS URLs often carry useless parameters (version numbers or
            # random values meant to defeat browser caching), so exclude or clean those
            if parts.path.endswith(".css"):
                continue
            if parts.path.endswith(".js") and parts.query:
                # For JS scripts, keep the URL but strip the query string
                allowed_links.append(extra_url.split("?")[0])
                continue
            allowed_links.append(extra_url)

        for form in page.iter_forms():
            # TODO: apply bad_params filtering in form URLs
            if self._crawler.is_in_scope(form):
                if form.hostname not in self._hostnames:
                    form.link_depth = 0
                else:
                    form.link_depth = request.link_depth + 1
                new_requests.append(form)

    for url in swf_links + js_links:
        if url:
            url = page.make_absolute(url)
            if url and self._crawler.is_in_scope(url):
                allowed_links.append(url)

    # For each URL with a query string, also consider its bare path if in scope
    for new_url in allowed_links:
        if "?" in new_url:
            path_only = new_url.split("?")[0]
            if path_only not in allowed_links and self._crawler.is_in_scope(path_only):
                allowed_links.append(path_only)

    for new_url in set(allowed_links):
        if new_url == "":
            continue

        if self.is_forbidden(new_url):
            continue

        if "?" in new_url:
            path, query_string = new_url.split("?", 1)
            # TODO: encoding parameter ?
            get_params = [
                list(t) for t in web.parse_qsl(query_string)
                if t[0] not in self._bad_params
            ]
        elif new_url.endswith(EXCLUDED_MEDIA_EXTENSIONS):
            # exclude static media files
            continue
        else:
            path = new_url
            get_params = []

        if page.is_directory_redirection and new_url == page.redirection_url:
            depth = request.link_depth
        else:
            depth = request.link_depth + 1

        new_requests.append(web.Request(path, get_params=get_params, link_depth=depth))

    return new_requests
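
# A minimal standalone sketch of the bad-parameter filtering above, assuming
# urllib.parse.parse_qsl behaves like web.parse_qsl (an assumption: web.parse_qsl
# is treated here as a parse_qsl-compatible helper). The helper name, sample query
# string and blacklist are illustrative only.

from urllib.parse import parse_qsl

def _demo_filter_params(query_string, bad_params):
    # keep only parameters whose name is not blacklisted, as [name, value] pairs
    return [list(t) for t in parse_qsl(query_string) if t[0] not in bad_params]

# _demo_filter_params("page=2&PHPSESSID=abc&q=test", {"PHPSESSID"})
# -> [["page", "2"], ["q", "test"]]
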