def js_extractor(response):
    """Collect script references found in a response body.

    Each hit of the module-level ``rscript`` pattern is reduced to the item
    at index 2 (presumably the capture group holding the script location;
    confirm against the pattern), stripped of single and double quotes,
    reported through ``verb`` and stored in ``bad_scripts``.

    Args:
        response: Text that ``rscript`` is run against.
    """
    for groups in rscript.findall(response):
        script = groups[2]
        # Attribute values may be quoted either way; drop both kinds
        for quote in ('\'', '"'):
            script = script.replace(quote, '')
        verb('JS file', script)
        bad_scripts.add(script)
def intel_extractor(url, response):
    """Extract intel from the response body.

    The response is first reduced to plain text: ``<script>`` elements are
    removed together with their content, then every remaining tag is
    dropped. Each pattern in the module-level ``rintels`` is then run
    against that text and every hit is stored in ``bad_intel``.

    Args:
        url: Address the response came from; stored alongside each finding.
        response: Markup to search.
    """
    # The text does not depend on the pattern being tried, so it is built
    # once. DOTALL is passed as a flag because an inline "(?s)" that is not
    # at the very start of the pattern is rejected by Python 3.11 and later.
    text = re.sub(r'<(script).*?</\1>', '', response, flags=re.DOTALL)
    text = re.sub(r'<[^<]+?>', '', text)
    for rintel in rintels:
        # rintel[0] is the compiled pattern, rintel[1] is stored with the hit
        for match in rintel[0].findall(text):
            verb('Intel', match)
            bad_intel.add((match, rintel[1], url))
def jscanner(url):
    """Fetch a JavaScript resource and record the endpoints it mentions.

    The body returned by ``requester`` is searched with the module-level
    ``rendpoint`` pattern; candidates that survive the filters below are
    reported through ``verb`` and stored in ``endpoints``.

    Args:
        url: Address of the script to download.
    """
    body = requester(url, main_url, delay, cook, headers, timeout, host,
                     proxies, user_agents, failed, processed)
    for groups in rendpoint.findall(body):
        # Each hit is a tuple; its first two items are glued together (the
        # original author notes that one of them is always empty)
        endpoint = groups[0] + groups[1]
        # Braces, angle brackets or quotes mean this is a piece of code
        # rather than a path
        if re.search(r'[}{><"\']', endpoint):
            continue
        # A lone slash carries no information
        if endpoint == '/':
            continue
        verb('JS endpoint', endpoint)
        endpoints.add(endpoint)
def zap(input_url, archive, domain, host, internal, robots, proxies):
    """Extract links from robots.txt and sitemap.xml.

    Optionally seeds the crawl with archived URLs first, then reads
    ``robots.txt`` and ``sitemap.xml`` below ``input_url``.

    Args:
        input_url: Base URL that '/robots.txt' and '/sitemap.xml' are
            appended to.
        archive: When truthy, URLs returned by
            ``time_machine(host, 'host')`` are added to ``internal``.
        domain: Not used; kept so that existing callers keep working.
        host: Passed to ``time_machine`` for the archive lookup.
        internal: Collection (needs ``add``) receiving every URL found.
        robots: Collection (needs ``add`` and ``len``) receiving the URLs
            taken from robots.txt.
        proxies: Sequence; one entry is picked at random for each request.
    """
    if archive:
        print('%s Fetching URLs from archive.org' % run)
        # The lookup is always made by host (the by-domain alternative was
        # behind a constant-false condition and could never run)
        archived_urls = time_machine(host, 'host')
        # NOTE(review): the reported count is one less than the number of
        # items returned; presumably one item is not a real URL - confirm
        print('%s Retrieved %i URLs from archive.org' % (
            good, len(archived_urls) - 1))
        for url in archived_urls:
            verb('Internal page', url)
            internal.add(url)
    # Makes request to robots.txt (certificate verification is switched off)
    response = requests.get(input_url + '/robots.txt',
                            verify=False, proxies=random.choice(proxies)).text
    # Making sure robots.txt isn't some fancy 404 page
    if '<body' not in response:
        matches = re.findall(r'Allow: (.*)|Disallow: (.*)', response)
        if matches:
            for match in matches:
                # Only one of the two groups is filled, so joining them
                # yields the path. "(.*)" also captures the "\r" of CRLF
                # line endings and any padding, hence the strip.
                match = ''.join(match).strip()
                # If the URL doesn't use a wildcard
                if '*' not in match:
                    url = input_url + match
                    # Queue the URL for crawling and remember its origin
                    internal.add(url)
                    robots.add(url)
            print('%s URLs retrieved from robots.txt: %s' % (good, len(robots)))
    # Makes request to sitemap.xml (certificate verification is switched off)
    response = requests.get(input_url + '/sitemap.xml',
                            verify=False, proxies=random.choice(proxies)).text
    # Making sure sitemap.xml isn't some fancy 404 page
    if '<body' not in response:
        matches = xml_parser(response)
        if matches:  # if there are any matches
            print('%s URLs retrieved from sitemap.xml: %s' % (
                good, len(matches)))
            for match in matches:
                verb('Internal page', match)
                # The entry is queued for crawling exactly as parsed
                internal.add(match)
def zap(input_url, archive, domain, host, internal, robots, proxies):
    """Extract links from robots.txt and sitemap.xml.

    When ``archive`` is truthy the crawl is first seeded with the URLs that
    ``time_machine`` returns for ``host``. After that ``robots.txt`` and
    ``sitemap.xml`` are downloaded from below ``input_url`` and the URLs
    they list are queued.

    Args:
        input_url: Base URL that '/robots.txt' and '/sitemap.xml' are
            appended to.
        archive: Enables the archive lookup when truthy.
        domain: Not used; kept so that existing callers keep working.
        host: Passed to ``time_machine`` for the archive lookup.
        internal: Collection (needs ``add``) receiving every URL found.
        robots: Collection (needs ``add`` and ``len``) receiving the URLs
            taken from robots.txt.
        proxies: Sequence; one entry is picked at random for each request.
    """
    if archive:
        print('%s Fetching URLs from archive.org' % run)
        # Lookup is by host only; the by-domain call sat behind a condition
        # that was always false and has been removed
        archived_urls = time_machine(host, 'host')
        # NOTE(review): one is subtracted from the count before printing;
        # presumably one returned item is not a URL - confirm
        print('%s Retrieved %i URLs from archive.org' % (
            good, len(archived_urls) - 1))
        for url in archived_urls:
            verb('Internal page', url)
            internal.add(url)

    # Makes request to robots.txt
    response = requests.get(input_url + '/robots.txt',
                            proxies=random.choice(proxies)).text
    # A "<body" tag means an HTML error page was served instead of the file
    if '<body' not in response:
        matches = re.findall(r'Allow: (.*)|Disallow: (.*)', response)
        if matches:
            for groups in matches:
                # One group is always empty, so the join yields the path;
                # strip the "\r" of CRLF files and any padding that "(.*)"
                # picked up
                path = ''.join(groups).strip()
                # Wildcard rules cannot be turned into a concrete URL
                if '*' in path:
                    continue
                url = input_url + path
                internal.add(url)
                robots.add(url)
            print('%s URLs retrieved from robots.txt: %s' % (good, len(robots)))

    # Makes request to sitemap.xml
    response = requests.get(input_url + '/sitemap.xml',
                            proxies=random.choice(proxies)).text
    # Same check as above, this time for sitemap.xml
    if '<body' not in response:
        matches = xml_parser(response)
        if matches:
            print('%s URLs retrieved from sitemap.xml: %s' % (
                good, len(matches)))
            for match in matches:
                verb('Internal page', match)
                # Queued for crawling exactly as returned by xml_parser
                internal.add(match)
def extractor(url):
    """Extract details from the response body.

    Fetches ``url`` through ``requester``, sorts every ``href`` value found
    into the module-level ``internal`` / ``external`` collections and then
    runs the optional intel, script, custom-regex and key scans on the same
    response.

    Args:
        url: Address of the page to fetch and scan.
    """
    response = requester(url, main_url, delay, cook, headers, timeout, host,
                         ninja, user_agents, failed, processed)
    if clone:
        mirror(url, response)
    # NOTE(review): ".*" is greedy, so when several anchors share one line
    # only the last href on that line is captured.
    matches = re.findall(r'<[aA].*(href|HREF)=([^\s>]+)', response)
    for link in matches:
        # Item 1 is the href value. Quotes are dropped and everything after
        # a "#" is removed to deal with in-page anchors
        link = link[1].replace('\'', '').replace('"', '').split('#')[0]
        # Checks if the URLs should be crawled
        if is_link(link, processed, files):
            if link[:4] == 'http':
                # Absolute URL: internal only when it lives under main_url
                if link.startswith(main_url):
                    verb('Internal page', link)
                    internal.add(link)
                else:
                    verb('External page', link)
                    external.add(link)
            elif link[:2] == '//':
                # Protocol-relative URL: item 2 of the split is the host part
                if link.split('/')[2].startswith(host):
                    verb('Internal page', link)
                    # NOTE(review): yields a valid URL only if schema ends
                    # with ":" (e.g. "https:") - confirm where it is set
                    internal.add(schema + link)
                else:
                    verb('External page', link)
                    external.add(link)
            elif link[:1] == '/':
                # Root-relative path
                verb('Internal page', link)
                internal.add(main_url + link)
            else:
                # Anything else is treated as a path below main_url
                verb('Internal page', link)
                internal.add(main_url + '/' + link)

    if not only_urls:
        # intel_extractor is defined in this file as (url, response); the
        # one-argument call used here before would raise TypeError
        intel_extractor(url, response)
        js_extractor(response)
    if args.regex and not supress_regex:
        regxy(args.regex, response, supress_regex, custom)
    if api:
        # Runs of 16-45 word characters or dashes are key candidates; only
        # those whose entropy() score is at least 4 are kept
        matches = re.findall(r'[\w-]{16,45}', response)
        for match in matches:
            if entropy(match) >= 4:
                verb('Key', match)
                keys.add(url + ': ' + match)
def extractor(url):
    """Extract details from the response body.

    URLs that contain more than one '//' or more than one 'https:', or that
    do not start with ``main_url``, are skipped without being requested.
    Otherwise the page is fetched through ``requester``, every ``href``
    found by ``rhref`` is sorted into the module-level ``internal`` /
    ``external`` collections, and the optional intel, script, custom-regex
    and key scans are run on the same response.

    Args:
        url: Address of the page to fetch and scan.
    """
    # A second "//" or "https:" is taken as a sign of a mangled URL.
    # str.count counts non-overlapping hits just like the former
    # re.findall(...)[1] probes, without using a bare except as control flow.
    if url.count('//') > 1 or url.count('https:') > 1:
        return
    # Literal prefix test. main_url is a URL, not a pattern, so feeding it
    # to re.match let "." match any character and could fail on "?" or "+".
    if not url.startswith(main_url):
        return
    print(url)
    response = requester(url, main_url, delay, cook, headers, timeout, host,
                         proxies, user_agents, failed, processed)
    if clone:
        mirror(url, response)
    matches = rhref.findall(response)
    for link in matches:
        # Item 1 is the href value. Quotes are dropped and everything after
        # a "#" is removed to deal with in-page anchors
        link = link[1].replace('\'', '').replace('"', '').split('#')[0]
        # Checks if the URLs should be crawled
        if is_link(link, processed, files):
            if link[:4] == 'http':
                if link.startswith(main_url):
                    verb('Internal page', link)
                    internal.add(link)
                else:
                    verb('External page', link)
                    external.add(link)
            elif link[:2] == '//':
                # Protocol-relative URL: item 2 of the split is the host part
                if link.split('/')[2].startswith(host):
                    verb('Internal page', link)
                    # link already starts with "//", so only the scheme and
                    # a colon go in front (adding "://" gave four slashes).
                    # Trailing ":" or "/" on schema is dropped first so both
                    # "https" and "https:" work.
                    internal.add(schema.rstrip(':/') + ':' + link)
                else:
                    verb('External page', link)
                    external.add(link)
            elif link[:1] == '/':
                verb('Internal page', link)
                internal.add(remove_file(url) + link)
            else:
                verb('Internal page', link)
                usable_url = remove_file(url)
                # link cannot start with "/" here (handled above), so the
                # only question is whether the base already ends with one
                if usable_url.endswith('/'):
                    internal.add(usable_url + link)
                else:
                    internal.add(usable_url + '/' + link)

    if not only_urls:
        intel_extractor(url, response)
        js_extractor(response)
    if args.regex and not supress_regex:
        regxy(args.regex, response, supress_regex, custom)
    if api:
        # Candidates from rentropy are kept when entropy() scores them >= 4
        matches = rentropy.findall(response)
        for match in matches:
            if entropy(match) >= 4:
                verb('Key', match)
                keys.add(url + ': ' + match)
def _resolve_href(link, url):
    """Classify *link* found on page *url*; return ``(is_internal, target)``.

    ``target`` is the value to store: the link itself for absolute and
    external links, otherwise an absolute URL built from the link.
    """
    if link[:4] == "http":
        # Absolute URL: internal only when it lives under main_url
        return link.startswith(main_url), link
    if link[:2] == "//":
        # Protocol-relative URL: item 2 of the split is the host part
        if not link.split("/")[2].startswith(host):
            return False, link
        # link already starts with "//", so only the scheme and a colon go
        # in front (adding "://" gave four slashes). Trailing ":" or "/" on
        # schema is dropped first so both "https" and "https:" work.
        return True, schema.rstrip(":/") + ":" + link
    base = remove_file(url)
    # Exactly one "/" must separate base and link; add it only when neither
    # side provides it
    if link[:1] == "/" or base.endswith("/"):
        return True, base + link
    return True, base + "/" + link


def extractor(url):
    """Extract details from the response body.

    Fetches ``url`` through ``requester``, sorts every ``href`` found by
    ``rhref`` into the module-level ``internal`` / ``external`` collections
    and then runs the optional intel, script, custom-regex and key scans on
    the same response.

    Args:
        url: Address of the page to fetch and scan.
    """
    response = requester(
        url,
        main_url,
        delay,
        cook,
        headers,
        timeout,
        host,
        proxies,
        user_agents,
        failed,
        processed,
    )
    if clone:
        mirror(url, response)
    for groups in rhref.findall(response):
        # Item 1 is the href value. Quotes are dropped and everything after
        # a "#" is removed to deal with in-page anchors
        link = groups[1].replace("'", "").replace('"', "").split("#")[0]
        # Checks if the URLs should be crawled
        if not is_link(link, processed, files):
            continue
        is_internal, target = _resolve_href(link, url)
        if is_internal:
            verb("Internal page", link)
            internal.add(target)
        else:
            verb("External page", link)
            external.add(target)

    if not only_urls:
        intel_extractor(url, response)
        js_extractor(response)
    if args.regex and not supress_regex:
        regxy(args.regex, response, supress_regex, custom)
    if api:
        # Candidates from rentropy are kept when entropy() scores them >= 4
        for match in rentropy.findall(response):
            if entropy(match) >= 4:
                verb("Key", match)
                keys.add(url + ": " + match)
def extractor(url):
    """Extract details from the response body.

    Fetches ``url`` through ``requester``, sorts every ``href`` found by
    ``rhref`` into the module-level ``internal`` / ``external`` collections
    and then runs the optional intel, script, custom-regex and key scans on
    the same response.

    Args:
        url: Address of the page to fetch and scan.
    """
    response = requester(url, main_url, delay, cook, headers, timeout, host,
                         proxies, user_agents, failed, processed)
    if clone:
        mirror(url, response)
    matches = rhref.findall(response)
    for link in matches:
        # Item 1 is the href value. Quotes are dropped and everything after
        # a "#" is removed to deal with in-page anchors
        link = link[1].replace('\'', '').replace('"', '').split('#')[0]
        # Checks if the URLs should be crawled
        if not is_link(link, processed, files):
            continue
        if link[:4] == 'http':
            # Absolute URL: internal only when it lives under main_url
            if link.startswith(main_url):
                verb('Internal page', link)
                internal.add(link)
            else:
                verb('External page', link)
                external.add(link)
        elif link[:2] == '//':
            # Protocol-relative URL: item 2 of the split is the host part
            if link.split('/')[2].startswith(host):
                verb('Internal page', link)
                # link already starts with "//", so only the scheme and a
                # colon go in front (adding "://" gave four slashes).
                # Trailing ":" or "/" on schema is dropped first so both
                # "https" and "https:" work.
                internal.add(schema.rstrip(':/') + ':' + link)
            else:
                verb('External page', link)
                external.add(link)
        elif link[:1] == '/':
            verb('Internal page', link)
            internal.add(remove_file(url) + link)
        else:
            verb('Internal page', link)
            usable_url = remove_file(url)
            # link cannot start with "/" in this branch, so a separator is
            # added only when the base does not already end with one
            if usable_url.endswith('/'):
                internal.add(usable_url + link)
            else:
                internal.add(usable_url + '/' + link)

    if not only_urls:
        intel_extractor(url, response)
        js_extractor(response)
    if args.regex and not supress_regex:
        regxy(args.regex, response, supress_regex, custom)
    if api:
        # Candidates from rentropy are kept when entropy() scores them >= 4
        matches = rentropy.findall(response)
        for match in matches:
            if entropy(match) >= 4:
                verb('Key', match)
                keys.add(url + ': ' + match)