Example #1
0
def crawler():
    """Breadth-first crawl up to the configured depth.

    On each level: take every internal URL discovered so far, drop the
    ones already processed or matching the exclude pattern, and fan the
    remainder out to a thread pool running ``parser`` on each URL.
    Progress is reported after every batch of ``var['threads']`` URLs.
    """
    for depth in range(var['level']):
        discovered = result['urls']['internal']
        done = var['processed']
        # Pending = (everything discovered - already crawled) - excluded
        pending = remove_regex(discovered - done, var['exclude'])
        if not pending:
            # Frontier exhausted: every URL has been crawled.
            break
        if len(discovered) <= len(done) and len(discovered) > 2 + len(var['seeds']):
            # Processed count somehow caught up with the total — stop
            # rather than spin forever.
            break
        total = len(pending)
        print('%s Level %i: %i URLs' % (run, depth + 1, total))
        pool = concurrent.futures.ThreadPoolExecutor(
                max_workers=var['threads'])
        jobs = [pool.submit(parser, url) for url in pending]
        completed = 0
        for _ in concurrent.futures.as_completed(jobs):
            completed += 1
            if completed == total or completed % var['threads'] == 0:
                print('%s Progress: %i/%i' % (info, completed, total),
                        end='\r')
        print('')
Example #2
0
        # Combining the items because one of them is always empty
        match = match[0] + match[1]
        # Making sure it's not some JavaScript code
        if not re.search(r'[}{><"\']', match) and not match == '/':
            verb('JS endpoint', match)
            endpoints.add(match)


# Records the time at which crawling started
then = time.time()

# Step 1. Extract urls from robots.txt & sitemap.xml
zap(main_url, args.archive, domain, host, internal, robots)

# This is so the level 1 emails are parsed as well
internal = set(remove_regex(internal, args.exclude))

# Step 2. Crawl recursively to the limit specified in "crawl_level"
for level in range(crawl_level):
    # Links to crawl = (all links - already crawled links) - links not to crawl
    links = remove_regex(internal - processed, args.exclude)
    # If links to crawl are 0 i.e. all links have been crawled
    if not links:
        break
    # if crawled links are somehow more than all links. Possible? ;/
    elif len(internal) <= len(processed):
        if len(internal) > 2 + len(args.seeds):
            break
    print('%s Level %i: %i URLs' % (run, level + 1, len(links)))
    try:
        flash(extractor, links, thread_count)
        # 进行组合,因为其中一项总是空的
        match = match[0] + match[1]
        # 确保其中没有一些JS代码
        if not re.search(r'[}{><"\']', match) and not match == '/':
            endpoints.add(match)


# 在爬取开始的时候记录一个时间值
then = time.time()

# 第一步:从 robots.txt 和 sitemap.xml 文件中提取 urls
zap(main_url, domain, host, internal,
    robots)  # 这里涉及到 core/zap.py 文件中的 zap() 函数

# 这是第一层,emails 也可以被解析
internal = set(remove_regex(
    internal, args.exclude))  # 这里涉及到 core/utils.py 文件中的 remove_regex() 函数

# 第二步:递归爬取在 crawl_level 中指定的限制层数
for level in range(crawl_level):
    # 需要爬取的链接数 =(所有的链接数 - 已经爬取过的链接数)- 不需要爬取的链接数
    links = remove_regex(internal - processed, args.exclude)
    # 如果需要爬取的链接数是 0,即:所有的链接都爬取过了
    if not links:
        break
    # 如果爬取过的链接数大于所有的链接数
    elif len(internal) <= len(processed):
        if len(internal) > 2 + len(args.seeds):
            break
    print('%s Level %i: %i URLs' % (run, level + 1, len(links)))
    try:
        flash(extractor, links,
Example #4
0
    def post(self, target, level_):
        """Run a full Photon-style crawl against ``target``.

        Parses command-line arguments (then overrides several of them
        with hard-coded values — see the NOTE below), crawls the site
        up to ``level_`` levels deep, scans discovered JavaScript files
        for endpoints, writes every dataset into the output directory
        and returns ``(results, 200)``.

        NOTE(review): the values parsed from the CLI are overwritten
        below (root, level, threads=30, api=True, archive=True,
        export='json') — confirm this is intentional for the HTTP-API
        wrapper use case.
        """

        # Crawl state is kept at module level — presumably so the
        # extractor/jscanner callbacks (defined elsewhere) can mutate
        # it; confirm against the rest of the module.
        global keys
        global files
        global intel
        global robots
        global custom
        global failed
        global scripts
        global external
        global fuzzable
        global endpoints
        global processed
        global internal
        global main_url
        global delay
        global cook
        global headers
        global timeout
        global host
        global proxies
        global user_agents
        global only_urls
        global bad_intel
        global bad_scripts
        global clone
        global schema
        global args
        global supress_regex
        global results

        results = {}
        # Disable SSL related warnings
        warnings.filterwarnings('ignore')

        # Processing command line arguments
        parser = argparse.ArgumentParser()
        # Options
        parser.add_argument('-u', '--url', help='root url', dest='root')
        parser.add_argument('-c', '--cookie', help='cookie', dest='cook')
        parser.add_argument('-r',
                            '--regex',
                            help='regex pattern',
                            dest='regex')
        parser.add_argument('-e',
                            '--export',
                            help='export format',
                            dest='export',
                            choices=['csv', 'json'])
        parser.add_argument('-o',
                            '--output',
                            help='output directory',
                            dest='output')
        parser.add_argument('-l',
                            '--level',
                            help='levels to crawl',
                            dest='level',
                            type=int)
        parser.add_argument('-t',
                            '--threads',
                            help='number of threads',
                            dest='threads',
                            type=int)
        parser.add_argument('-d',
                            '--delay',
                            help='delay between requests',
                            dest='delay',
                            type=float)
        parser.add_argument('-v',
                            '--verbose',
                            help='verbose output',
                            dest='verbose',
                            action='store_true')
        parser.add_argument('-s',
                            '--seeds',
                            help='additional seed URLs',
                            dest='seeds',
                            nargs="+",
                            default=[])
        parser.add_argument('--stdout',
                            help='send variables to stdout',
                            dest='std')
        parser.add_argument('--user-agent',
                            help='custom user agent(s)',
                            dest='user_agent')
        parser.add_argument('--exclude',
                            help='exclude URLs matching this regex',
                            dest='exclude')
        parser.add_argument('--timeout',
                            help='http request timeout',
                            dest='timeout',
                            type=float)
        parser.add_argument('-p',
                            '--proxy',
                            help='Proxy server IP:PORT or DOMAIN:PORT',
                            dest='proxies',
                            type=proxy_type)

        # Switches
        parser.add_argument('--clone',
                            help='clone the website locally',
                            dest='clone',
                            action='store_true')
        parser.add_argument('--headers',
                            help='add headers',
                            dest='headers',
                            action='store_true')
        parser.add_argument('--dns',
                            help='enumerate subdomains and DNS data',
                            dest='dns',
                            action='store_true')
        parser.add_argument('--keys',
                            help='find secret keys',
                            dest='api',
                            action='store_true')
        parser.add_argument('--update',
                            help='update photon',
                            dest='update',
                            action='store_true')
        parser.add_argument('--only-urls',
                            help='only extract URLs',
                            dest='only_urls',
                            action='store_true')
        parser.add_argument('--wayback',
                            help='fetch URLs from archive.org as seeds',
                            dest='archive',
                            action='store_true')
        args = parser.parse_args()

        # NOTE(review): leftover debug output — these echo the CLI
        # values that are immediately overridden just below.
        print("------------------------------------------------")
        print(args.root)
        print(type(args.level))
        print(type(args.threads))
        print(args.api)
        print(args.archive)
        print(args.export)
        # Hard-coded overrides: the method's own parameters win over
        # any CLI flags (target always crawled over plain http here).
        args.root = "http://" + target
        args.level = int(level_)
        args.threads = 30
        args.api = True
        args.archive = True
        args.export = "json"

        # If the user has supplied --update argument
        if args.update:
            updater()
            quit()

        # If the user has supplied a URL
        if args.root:
            main_inp = args.root
            if main_inp.endswith('/'):
                # We will remove it as it can cause problems later in the code
                main_inp = main_inp[:-1]
        # If the user hasn't supplied an URL
        else:
            print('\n' + parser.format_help().lower())
            quit()

        clone = args.clone
        headers = args.headers  # prompt for headers
        verbose = args.verbose  # verbose output
        delay = args.delay or 0  # Delay between requests
        timeout = args.timeout or 6  # HTTP request timeout
        cook = args.cook or None  # Cookie
        api = bool(
            args.api)  # Extract high entropy strings i.e. API keys and stuff

        # Keep only the proxies that answer; fall back to [None]
        # (direct connection) when none were supplied.
        proxies = []
        if args.proxies:
            print("%s Testing proxies, can take a while..." % info)
            for proxy in args.proxies:
                if is_good_proxy(proxy):
                    proxies.append(proxy)
                else:
                    print("%s Proxy %s doesn't seem to work or timedout" %
                          (bad, proxy['http']))
            print("%s Done" % info)
            if not proxies:
                print("%s no working proxies, quitting!" % bad)
                exit()
        else:
            proxies.append(None)

        crawl_level = args.level or 2  # Crawling level
        thread_count = args.threads or 2  # Number of threads
        only_urls = bool(args.only_urls)  # Only URLs mode is off by default

        # Variables we are gonna use later to store stuff
        keys = set()  # High entropy strings, prolly secret keys
        files = set()  # The pdf, css, png, etc files.
        intel = set(
        )  # The email addresses, website accounts, AWS buckets etc.
        robots = set()  # The entries of robots.txt
        custom = set()  # Strings extracted by custom regex pattern
        failed = set()  # URLs that photon failed to crawl
        scripts = set()  # THe Javascript files
        external = set(
        )  # URLs that don't belong to the target i.e. out-of-scope
        # URLs that have get params in them e.g. example.com/page.php?id=2
        fuzzable = set()
        endpoints = set()  # URLs found from javascript files
        # URLs that have been crawled (pre-seeded with 'dummy' —
        # presumably to keep the request count non-zero; confirm)
        processed = set(['dummy'])
        # URLs that belong to the target i.e. in-scope
        internal = set(args.seeds)

        everything = []
        bad_scripts = set()  # Unclean javascript file urls
        bad_intel = set()  # needed for intel filtering

        core.config.verbose = verbose

        if headers:
            # NOTE(review): rebinds the local name 'prompt' from the
            # helper function to its result — the helper can't be
            # called again in this scope afterwards.
            try:
                prompt = prompt()
            except FileNotFoundError as e:
                print('Could not load headers prompt: {}'.format(e))
                quit()
            headers = extract_headers(prompt)

        # If the user hasn't supplied the root URL with http(s), we will handle it
        if main_inp.startswith('http'):
            main_url = main_inp
        else:
            # Probe https first; any failure (bare except) falls back
            # to plain http.
            try:
                requests.get('https://' + main_inp,
                             proxies=random.choice(proxies))
                main_url = 'https://' + main_inp
            except:
                main_url = 'http://' + main_inp

        schema = main_url.split('//')[0]  # https: or http:?
        # Adding the root URL to internal for crawling
        internal.add(main_url)
        # Extracts host out of the URL
        host = urlparse(main_url).netloc

        output_dir = args.output or host
        # NOTE(review): unconditionally overrides the line above —
        # args.output and the host-based default are effectively dead.
        output_dir = "results"

        # top_level() may fail on unusual hosts (e.g. IP addresses);
        # fall back to the bare netloc.
        try:
            domain = top_level(main_url)
        except:
            domain = host

        if args.user_agent:
            user_agents = args.user_agent.split(',')
        else:
            with open(sys.path[0] + '/core/user-agents.txt', 'r') as uas:
                user_agents = [agent.strip('\n') for agent in uas]

        supress_regex = False

        # Records the time at which crawling started
        then = time.time()

        # Step 1. Extract urls from robots.txt & sitemap.xml
        zap(main_url, args.archive, domain, host, internal, robots, proxies)

        # This is so the level 1 emails are parsed as well
        internal = set(remove_regex(internal, args.exclude))

        # Step 2. Crawl recursively to the limit specified in "crawl_level"
        for level in range(crawl_level):
            # Links to crawl = (all links - already crawled links) - links not to crawl
            links = remove_regex(internal - processed, args.exclude)
            # If links to crawl are 0 i.e. all links have been crawled
            if not links:
                break
            # if crawled links are somehow more than all links. Possible? ;/
            elif len(internal) <= len(processed):
                if len(internal) > 2 + len(args.seeds):
                    break
            print('%s Level %i: %i URLs' % (run, level + 1, len(links)))
            try:
                flash(self.extractor, links, thread_count)
            except KeyboardInterrupt:
                print('')
                break

        if not only_urls:
            # Normalise relative/unqualified script URLs against the root.
            for match in bad_scripts:
                if match.startswith(main_url):
                    scripts.add(match)
                elif match.startswith('/') and not match.startswith('//'):
                    scripts.add(main_url + match)
                elif not match.startswith('http') and not match.startswith(
                        '//'):
                    scripts.add(main_url + '/' + match)
            # Step 3. Scan the JavaScript files for endpoints
            print('%s Crawling %i JavaScript files' % (run, len(scripts)))
            flash(self.jscanner, scripts, thread_count)

            for url in internal:
                if '=' in url:
                    fuzzable.add(url)

            for match, intel_name, url in bad_intel:
                if isinstance(match, tuple):
                    for x in match:  # Because "match" is a tuple
                        if x != '':  # If the value isn't empty
                            if intel_name == "CREDIT_CARD":
                                if not luhn(match):
                                    # garbage number
                                    continue
                            intel.add("%s:%s" % (intel_name, x))
                else:
                    if intel_name == "CREDIT_CARD":
                        if not luhn(match):
                            # garbage number
                            continue
                    intel.add("%s:%s:%s" % (url, intel_name, match))
                # NOTE(review): this loop is nested inside the bad_intel
                # loop — it re-scans 'external' once per intel item and
                # shadows the 'url' loop variable; it likely belongs one
                # indentation level out. Confirm against upstream Photon.
                for url in external:
                    try:
                        if top_level(url, fix_protocol=True) in INTELS:
                            intel.add(url)
                    except:
                        pass

        # Records the time at which crawling stopped
        now = time.time()
        # Finds total time taken
        diff = (now - then)
        minutes, seconds, time_per_request = timer(diff, processed)

        # Step 4. Save the results
        if not os.path.exists(output_dir):  # if the directory doesn't exist
            os.mkdir(output_dir)  # create a new directory

        datasets = [
            files, intel, robots, custom, failed, internal, scripts, external,
            fuzzable, endpoints, keys
        ]
        dataset_names = [
            'files', 'intel', 'robots', 'custom', 'failed', 'internal',
            'scripts', 'external', 'fuzzable', 'endpoints', 'keys'
        ]

        writer(datasets, dataset_names, output_dir)
        # Printing out results
        print(('%s-%s' % (red, end)) * 50)
        for dataset, dataset_name in zip(datasets, dataset_names):
            if dataset:
                print('%s %s: %s' %
                      (good, dataset_name.capitalize(), len(dataset)))
        print(('%s-%s' % (red, end)) * 50)

        print('%s Total requests made: %i' % (info, len(processed)))
        print('%s Total time taken: %i minutes %i seconds' %
              (info, minutes, seconds))
        print('%s Requests per second: %i' %
              (info, int(len(processed) / diff)))

        # Same data re-packaged as JSON-serialisable lists for export
        # and for the returned 'results' payload.
        datasets = {
            'files': list(files),
            'intel': list(intel),
            'robots': list(robots),
            'custom': list(custom),
            'failed': list(failed),
            'internal': list(internal),
            'scripts': list(scripts),
            'external': list(external),
            'fuzzable': list(fuzzable),
            'endpoints': list(endpoints),
            'keys': list(keys)
        }

        if args.dns:
            print('%s Enumerating subdomains' % run)
            from plugins.find_subdomains import find_subdomains
            subdomains = find_subdomains(domain)
            print('%s %i subdomains found' % (info, len(subdomains)))
            writer([subdomains], ['subdomains'], output_dir)
            datasets['subdomains'] = subdomains
            from plugins.dnsdumpster import dnsdumpster
            print('%s Generating DNS map' % run)
            dnsdumpster(domain, output_dir)

        if args.export:
            from plugins.exporter import exporter
            # exporter(directory, format, datasets)
            results = datasets
            exporter(output_dir, args.export, datasets)

        print('%s Results saved in %s%s%s directory' %
              (good, green, output_dir, end))

        if args.std:
            for string in datasets[args.std]:
                sys.stdout.write(string + '\n')

        return results, 200
Example #5
0
        # Combining the items because one of them is always empty
        match = match[0] + match[1]
        # Making sure it's not some JavaScript code
        if not re.search(r'[}{><"\']', match) and not match == '/':
            verb('JS endpoint', match)
            endpoints.add(match)


# Records the time at which crawling started
then = time.time()

# Step 1. Extract urls from robots.txt & sitemap.xml
zap(main_url, args.archive, domain, host, internal, robots, proxies)

# This is so the level 1 emails are parsed as well
internal = set(remove_regex(internal, args.exclude))

# Step 2. Crawl recursively to the limit specified in "crawl_level"
for level in range(crawl_level):
    # Links to crawl = (all links - already crawled links) - links not to crawl
    links = remove_regex(internal - processed, args.exclude)
    # If links to crawl are 0 i.e. all links have been crawled
    if not links:
        break
    # if crawled links are somehow more than all links. Possible? ;/
    elif len(internal) <= len(processed):
        if len(internal) > 2 + len(args.seeds):
            break
    print('%s Level %i: %i URLs' % (run, level + 1, len(links)))
    try:
        flash(extractor, links, thread_count)