def crawler():
    for level in range(var['level']):
        # Links to crawl = (all links - already crawled links) - links not to crawl
        internal = result['urls']['internal']
        processed = var['processed']
        urls = remove_regex(internal - processed, var['exclude'])
        # If urls to crawl are 0 i.e. all urls have been crawled
        if not urls:
            break
        # If crawled urls are somehow more than all urls. Possible? :/
        elif len(internal) <= len(processed):
            if len(internal) > 2 + len(var['seeds']):
                break
        print('%s Level %i: %i URLs' % (run, level + 1, len(urls)))
        threadpool = concurrent.futures.ThreadPoolExecutor(
            max_workers=var['threads'])
        futures = (threadpool.submit(parser, url) for url in urls)
        for i, _ in enumerate(concurrent.futures.as_completed(futures)):
            if i + 1 == len(urls) or (i + 1) % var['threads'] == 0:
                print('%s Progress: %i/%i' % (info, i + 1, len(urls)),
                      end='\r')
        print('')
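# The crawler() above reads everything from two module-level stores, `var`
# (configuration) and `result` (crawl results), plus helpers such as
# remove_regex() and parser() defined elsewhere. Neither store is defined in
# this snippet, so the layout below is an assumption inferred from the keys
# crawler() accesses, shown only to make the function readable in isolation.
var = {
    'level': 2,           # how many levels deep to crawl
    'threads': 2,         # worker threads used per level
    'exclude': None,      # regex of URLs that must not be crawled
    'seeds': [],          # extra seed URLs supplied by the user
    'processed': set(),   # URLs that have already been crawled
}

result = {
    'urls': {
        'internal': set(),   # in-scope URLs discovered so far
    },
}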
        # Combining the items because one of them is always empty
        match = match[0] + match[1]
        # Making sure it's not some JavaScript code
        if not re.search(r'[}{><"\']', match) and not match == '/':
            verb('JS endpoint', match)
            endpoints.add(match)

# Records the time at which crawling started
then = time.time()

# Step 1. Extract urls from robots.txt & sitemap.xml
zap(main_url, args.archive, domain, host, internal, robots)

# This is so the level 1 emails are parsed as well
internal = set(remove_regex(internal, args.exclude))

# Step 2. Crawl recursively to the limit specified in "crawl_level"
for level in range(crawl_level):
    # Links to crawl = (all links - already crawled links) - links not to crawl
    links = remove_regex(internal - processed, args.exclude)
    # If links to crawl are 0 i.e. all links have been crawled
    if not links:
        break
    # If crawled links are somehow more than all links. Possible? ;/
    elif len(internal) <= len(processed):
        if len(internal) > 2 + len(args.seeds):
            break
    print('%s Level %i: %i URLs' % (run, level + 1, len(links)))
    try:
        flash(extractor, links, thread_count)
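# The `match` handled above is a tuple because the endpoint regex has two
# alternated capture groups (relative path OR absolute URL), so exactly one
# group is empty for every hit; concatenating them yields the endpoint. The
# pattern below is an illustration of that shape, not necessarily the exact
# regex the source uses.
import re

endpoint_pattern = re.compile(
    r'["\'](/[^"\'\s]*)["\']|["\'](https?://[^"\'\s]+)["\']')

sample_js = 'fetch("/api/users"); const u = "https://example.com/login";'
for match in endpoint_pattern.findall(sample_js):
    # Combining the items because one of them is always empty
    endpoint = match[0] + match[1]
    print(endpoint)  # -> /api/users, then https://example.com/login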
        # Combining them, because one of the items is always empty
        match = match[0] + match[1]
        # Making sure there is no JavaScript code mixed in
        if not re.search(r'[}{><"\']', match) and not match == '/':
            endpoints.add(match)

# Record the time at which crawling started
then = time.time()

# Step 1: extract URLs from robots.txt and sitemap.xml
# (this calls the zap() function in core/zap.py)
zap(main_url, domain, host, internal, robots)

# This is so the level-1 emails get parsed as well
# (remove_regex() comes from core/utils.py)
internal = set(remove_regex(internal, args.exclude))

# Step 2: crawl recursively up to the limit specified by crawl_level
for level in range(crawl_level):
    # Links to crawl = (all links - already crawled links) - links not to crawl
    links = remove_regex(internal - processed, args.exclude)
    # If there are no links to crawl i.e. all links have been crawled
    if not links:
        break
    # If crawled links are somehow more than all links
    elif len(internal) <= len(processed):
        if len(internal) > 2 + len(args.seeds):
            break
    print('%s Level %i: %i URLs' % (run, level + 1, len(links)))
    try:
        flash(extractor, links, thread_count)
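# The snippets above filter URL sets through remove_regex() from
# core/utils.py before each crawl level. A minimal sketch of that kind of
# exclusion filter follows; it is a close approximation, not guaranteed to be
# the exact upstream implementation.
import re


def remove_regex(urls, regex):
    """Return only the URLs that do NOT match the exclusion regex."""
    if not regex:
        return urls
    # Accept a single URL as well as any iterable of URLs
    if not isinstance(urls, (list, set, tuple)):
        urls = [urls]
    try:
        return [url for url in urls if not re.search(regex, url)]
    except TypeError:
        return []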
def post(self, target, level_):
    global keys
    global files
    global intel
    global robots
    global custom
    global failed
    global scripts
    global external
    global fuzzable
    global endpoints
    global processed
    global internal
    global main_url
    global delay
    global cook
    global headers
    global timeout
    global host
    global proxies
    global user_agents
    global only_urls
    global bad_intel
    global bad_scripts
    global clone
    global schema
    global args
    global supress_regex
    global results

    results = {}

    # Disable SSL related warnings
    warnings.filterwarnings('ignore')

    # Processing command line arguments
    parser = argparse.ArgumentParser()
    # Options
    parser.add_argument('-u', '--url', help='root url', dest='root')
    parser.add_argument('-c', '--cookie', help='cookie', dest='cook')
    parser.add_argument('-r', '--regex', help='regex pattern', dest='regex')
    parser.add_argument('-e', '--export', help='export format',
                        dest='export', choices=['csv', 'json'])
    parser.add_argument('-o', '--output', help='output directory',
                        dest='output')
    parser.add_argument('-l', '--level', help='levels to crawl',
                        dest='level', type=int)
    parser.add_argument('-t', '--threads', help='number of threads',
                        dest='threads', type=int)
    parser.add_argument('-d', '--delay', help='delay between requests',
                        dest='delay', type=float)
    parser.add_argument('-v', '--verbose', help='verbose output',
                        dest='verbose', action='store_true')
    parser.add_argument('-s', '--seeds', help='additional seed URLs',
                        dest='seeds', nargs="+", default=[])
    parser.add_argument('--stdout', help='send variables to stdout',
                        dest='std')
    parser.add_argument('--user-agent', help='custom user agent(s)',
                        dest='user_agent')
    parser.add_argument('--exclude', help='exclude URLs matching this regex',
                        dest='exclude')
    parser.add_argument('--timeout', help='http request timeout',
                        dest='timeout', type=float)
    parser.add_argument('-p', '--proxy', help='Proxy server IP:PORT or DOMAIN:PORT',
                        dest='proxies', type=proxy_type)

    # Switches
    parser.add_argument('--clone', help='clone the website locally',
                        dest='clone', action='store_true')
    parser.add_argument('--headers', help='add headers',
                        dest='headers', action='store_true')
    parser.add_argument('--dns', help='enumerate subdomains and DNS data',
                        dest='dns', action='store_true')
    parser.add_argument('--keys', help='find secret keys',
                        dest='api', action='store_true')
    parser.add_argument('--update', help='update photon',
                        dest='update', action='store_true')
    parser.add_argument('--only-urls', help='only extract URLs',
                        dest='only_urls', action='store_true')
    parser.add_argument('--wayback', help='fetch URLs from archive.org as seeds',
                        dest='archive', action='store_true')
    args = parser.parse_args()

    # Debug output of the parsed (default) command line values
    print("------------------------------------------------")
    print(args.root)
    print(type(args.level))
    print(type(args.threads))
    print(args.api)
    print(args.archive)
    print(args.export)

    # Override the parsed arguments with the values received by the API call
    args.root = "http://" + target
    args.level = int(level_)
    args.threads = 30
    args.api = True
    args.archive = True
    args.export = "json"

    # If the user has supplied --update argument
    if args.update:
        updater()
        quit()

    # If the user has supplied a URL
    if args.root:
        main_inp = args.root
        if main_inp.endswith('/'):
            # We will remove it as it can cause problems later in the code
            main_inp = main_inp[:-1]
    # If the user hasn't supplied a URL
    else:
        print('\n' + parser.format_help().lower())
        quit()

    clone = args.clone
    headers = args.headers  # prompt for headers
    verbose = args.verbose  # verbose output
    delay = args.delay or 0  # Delay between requests
    timeout = args.timeout or 6  # HTTP request timeout
    cook = args.cook or None  # Cookie
    api = bool(args.api)  # Extract high entropy strings i.e. API keys and stuff

    proxies = []
    if args.proxies:
        print("%s Testing proxies, can take a while..." % info)
        for proxy in args.proxies:
            if is_good_proxy(proxy):
                proxies.append(proxy)
            else:
                print("%s Proxy %s doesn't seem to work or timed out" %
                      (bad, proxy['http']))
        print("%s Done" % info)
        if not proxies:
            print("%s no working proxies, quitting!" % bad)
            exit()
    else:
        proxies.append(None)

    crawl_level = args.level or 2  # Crawling level
    thread_count = args.threads or 2  # Number of threads
    only_urls = bool(args.only_urls)  # Only URLs mode is off by default

    # Variables we are gonna use later to store stuff
    keys = set()  # High entropy strings, prolly secret keys
    files = set()  # The pdf, css, png, etc. files
    intel = set()  # The email addresses, website accounts, AWS buckets etc.
    robots = set()  # The entries of robots.txt
    custom = set()  # Strings extracted by custom regex pattern
    failed = set()  # URLs that photon failed to crawl
    scripts = set()  # The JavaScript files
    external = set()  # URLs that don't belong to the target i.e. out-of-scope
    # URLs that have get params in them e.g. example.com/page.php?id=2
    fuzzable = set()
    endpoints = set()  # URLs found from javascript files
    processed = set(['dummy'])  # URLs that have been crawled
    # URLs that belong to the target i.e. in-scope
    internal = set(args.seeds)

    everything = []
    bad_scripts = set()  # Unclean javascript file urls
    bad_intel = set()  # needed for intel filtering

    core.config.verbose = verbose

    if headers:
        try:
            prompt = prompt()
        except FileNotFoundError as e:
            print('Could not load headers prompt: {}'.format(e))
            quit()
        headers = extract_headers(prompt)

    # If the user hasn't supplied the root URL with http(s), we will handle it
    if main_inp.startswith('http'):
        main_url = main_inp
    else:
        try:
            requests.get('https://' + main_inp, proxies=random.choice(proxies))
            main_url = 'https://' + main_inp
        except:
            main_url = 'http://' + main_inp

    schema = main_url.split('//')[0]  # https: or http:?

    # Adding the root URL to internal for crawling
    internal.add(main_url)
    # Extracts host out of the URL
    host = urlparse(main_url).netloc

    output_dir = args.output or host
    output_dir = "results"  # hard-coded override of the output directory

    try:
        domain = top_level(main_url)
    except:
        domain = host

    if args.user_agent:
        user_agents = args.user_agent.split(',')
    else:
        with open(sys.path[0] + '/core/user-agents.txt', 'r') as uas:
            user_agents = [agent.strip('\n') for agent in uas]

    supress_regex = False

    # Records the time at which crawling started
    then = time.time()

    # Step 1. Extract urls from robots.txt & sitemap.xml
    zap(main_url, args.archive, domain, host, internal, robots, proxies)

    # This is so the level 1 emails are parsed as well
    internal = set(remove_regex(internal, args.exclude))

    # Step 2. Crawl recursively to the limit specified in "crawl_level"
    for level in range(crawl_level):
        # Links to crawl = (all links - already crawled links) - links not to crawl
        links = remove_regex(internal - processed, args.exclude)
        # If links to crawl are 0 i.e. all links have been crawled
        if not links:
            break
        # If crawled links are somehow more than all links. Possible? ;/
        elif len(internal) <= len(processed):
            if len(internal) > 2 + len(args.seeds):
                break
        print('%s Level %i: %i URLs' % (run, level + 1, len(links)))
        try:
            flash(self.extractor, links, thread_count)
        except KeyboardInterrupt:
            print('')
            break

    if not only_urls:
        for match in bad_scripts:
            if match.startswith(main_url):
                scripts.add(match)
            elif match.startswith('/') and not match.startswith('//'):
                scripts.add(main_url + match)
            elif not match.startswith('http') and not match.startswith('//'):
                scripts.add(main_url + '/' + match)
        # Step 3. Scan the JavaScript files for endpoints
        print('%s Crawling %i JavaScript files' % (run, len(scripts)))
        flash(self.jscanner, scripts, thread_count)

    for url in internal:
        if '=' in url:
            fuzzable.add(url)

    for match, intel_name, url in bad_intel:
        if isinstance(match, tuple):
            for x in match:  # Because "match" is a tuple
                if x != '':  # If the value isn't empty
                    if intel_name == "CREDIT_CARD":
                        if not luhn(x):  # garbage number
                            continue
                    intel.add("%s:%s" % (intel_name, x))
        else:
            if intel_name == "CREDIT_CARD":
                if not luhn(match):  # garbage number
                    continue
            intel.add("%s:%s:%s" % (url, intel_name, match))

    for url in external:
        try:
            if top_level(url, fix_protocol=True) in INTELS:
                intel.add(url)
        except:
            pass

    # Records the time at which crawling stopped
    now = time.time()
    # Finds total time taken
    diff = (now - then)
    minutes, seconds, time_per_request = timer(diff, processed)

    # Step 4. Save the results
    if not os.path.exists(output_dir):  # if the directory doesn't exist
        os.mkdir(output_dir)  # create a new directory

    datasets = [
        files, intel, robots, custom, failed, internal, scripts,
        external, fuzzable, endpoints, keys
    ]
    dataset_names = [
        'files', 'intel', 'robots', 'custom', 'failed', 'internal',
        'scripts', 'external', 'fuzzable', 'endpoints', 'keys'
    ]

    writer(datasets, dataset_names, output_dir)

    # Printing out results
    print(('%s-%s' % (red, end)) * 50)
    for dataset, dataset_name in zip(datasets, dataset_names):
        if dataset:
            print('%s %s: %s' % (good, dataset_name.capitalize(), len(dataset)))
    print(('%s-%s' % (red, end)) * 50)

    print('%s Total requests made: %i' % (info, len(processed)))
    print('%s Total time taken: %i minutes %i seconds' % (info, minutes, seconds))
    print('%s Requests per second: %i' % (info, int(len(processed) / diff)))

    datasets = {
        'files': list(files), 'intel': list(intel), 'robots': list(robots),
        'custom': list(custom), 'failed': list(failed),
        'internal': list(internal), 'scripts': list(scripts),
        'external': list(external), 'fuzzable': list(fuzzable),
        'endpoints': list(endpoints), 'keys': list(keys)
    }

    if args.dns:
        print('%s Enumerating subdomains' % run)
        from plugins.find_subdomains import find_subdomains
        subdomains = find_subdomains(domain)
        print('%s %i subdomains found' % (info, len(subdomains)))
        writer([subdomains], ['subdomains'], output_dir)
        datasets['subdomains'] = subdomains
        from plugins.dnsdumpster import dnsdumpster
        print('%s Generating DNS map' % run)
        dnsdumpster(domain, output_dir)

    if args.export:
        from plugins.exporter import exporter
        # exporter(directory, format, datasets)
        results = datasets
        exporter(output_dir, args.export, datasets)

    print('%s Results saved in %s%s%s directory' % (good, green, output_dir, end))

    if args.std:
        for string in datasets[args.std]:
            sys.stdout.write(string + '\n')

    return results, 200
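# The CREDIT_CARD branch above drops matches that fail luhn(); the helper
# itself is not shown in this excerpt. It presumably performs the standard
# Luhn checksum, sketched below for reference.
def luhn(number):
    """Standard Luhn check: double every second digit counting from the
    right (starting with the second-to-last), subtract 9 from doubles above
    9, and require the total to be divisible by 10."""
    digits = [int(d) for d in str(number) if d.isdigit()]
    if not digits:
        return False
    parity = len(digits) % 2
    checksum = 0
    for i, digit in enumerate(digits):
        if i % 2 == parity:  # the digits an even distance from the last one
            digit *= 2
            if digit > 9:
                digit -= 9
        checksum += digit
    return checksum % 10 == 0


# luhn('4111111111111111') -> True, luhn('4111111111111112') -> False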
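# post() takes self, receives `target` and `level_` from its caller and
# returns a (payload, 200) pair, which matches the calling convention of a
# Flask-RESTful resource method. A minimal sketch of how such a resource
# could be wired up; the class name and route below are illustrative
# assumptions, not taken from the source.
from flask import Flask
from flask_restful import Api, Resource


class PhotonScan(Resource):
    def post(self, target, level_):
        # The real method is the one shown above; this stub only demonstrates
        # the (body, status-code) return shape that Flask-RESTful serialises.
        return {'target': target, 'level': int(level_)}, 200


app = Flask(__name__)
api = Api(app)
api.add_resource(PhotonScan, '/scan/<string:target>/<int:level_>')

if __name__ == '__main__':
    app.run()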