Example no. 1
def js_extractor(response):
    """Extract js files from the response body"""
    # Extract .js files
    matches = rscript.findall(response)
    for match in matches:
        match = match[2].replace('\'', '').replace('"', '')
        verb('JS file', match)
        bad_scripts.add(match)
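
Note: the snippet above relies on module-level names defined elsewhere in the crawler: rscript (a pre-compiled regex whose third group, match[2], captures the script src value), verb (verbose logging) and bad_scripts (the result set). A minimal sketch of what those assumed definitions might look like, purely for illustration:

import re

# Hypothetical stand-ins for the globals the snippet assumes; the real crawler
# defines its own pattern and logging helper, these are illustrative only.
verbose = True
bad_scripts = set()   # collected .js URLs, de-duplicated across pages

# Three capture groups so that match[2] (as used above) holds the src value
rscript = re.compile(r'<(script)[^>]*?(src)=([^\s>]+)', re.IGNORECASE)

def verb(kind, string):
    """Print a finding only when verbose output is enabled."""
    if verbose:
        print('[%s] %s' % (kind, string))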
Example no. 2
def intel_extractor(url, response):
    """Extract intel from the response body."""
    # Strip <script> blocks and remaining tags once, before matching
    res = re.sub(r'(?s)<(script).*?</\1>', '', response)
    res = re.sub(r'<[^<]+?>', '', res)
    for rintel in rintels:
        matches = rintel[0].findall(res)
        if matches:
            for match in matches:
                verb('Intel', match)
                bad_intel.add((match, rintel[1], url))
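
Note: rintels is assumed to be an iterable of (compiled_pattern, label) pairs and bad_intel a set collecting (match, label, url) tuples. A minimal sketch under those assumptions (the actual pattern list is defined elsewhere):

import re

# Illustrative (pattern, label) pairs; not the crawler's real intel patterns.
rintels = [
    (re.compile(r'[\w.+-]+@[\w-]+\.[\w.-]+'), 'EMAIL'),
    (re.compile(r'\b\d{3}-\d{2}-\d{4}\b'), 'US_SSN'),
]
bad_intel = set()  # (match, label, source_url) tuples collected by intel_extractor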
Example no. 3
def jscanner(url):
    """Extract endpoints from JavaScript code."""
    response = requester(url, main_url, delay, cook, headers, timeout, host, proxies, user_agents, failed, processed)
    # Extract URLs/endpoints
    matches = rendpoint.findall(response)
    # Iterate over the matches, match is a tuple
    for match in matches:
        # Combining the items because one of them is always empty
        match = match[0] + match[1]
        # Making sure it's not some JavaScript code
        if not re.search(r'[}{><"\']', match) and not match == '/':
            verb('JS endpoint', match)
            endpoints.add(match)
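
Note: rendpoint is assumed to be a pre-compiled pattern with two alternative capture groups (a quoted relative path or a quoted absolute URL), which is why the loop concatenates match[0] + match[1]: exactly one of the two groups is empty for any given match. A sketch of such a pattern:

import re

# Either a quoted path ("/api/v1/...") or a quoted absolute URL ("http...");
# the group that did not participate in the match is an empty string.
rendpoint = re.compile(r'''['"](/[^'"]+)['"]|['"](https?://[^'"]+)['"]''')
endpoints = set()  # endpoints harvested from JavaScript files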
Example no. 4
def jscanner(url):
    """Extract endpoints from JavaScript code."""
    response = requester(url, main_url, delay, cook, headers, timeout,
                         host, proxies, user_agents, failed, processed)
    # Extract URLs/endpoints
    matches = rendpoint.findall(response)
    # Iterate over the matches, match is a tuple
    for match in matches:
        # Combining the items because one of them is always empty
        match = match[0] + match[1]
        # Making sure it's not some JavaScript code
        if not re.search(r'[}{><"\']', match) and not match == '/':
            verb('JS endpoint', match)
            endpoints.add(match)
Example no. 5
def zap(input_url, archive, domain, host, internal, robots, proxies):
    """Extract links from robots.txt and sitemap.xml."""
    if archive:
        print('%s Fetching URLs from archive.org' % run)
        # Domain-wide archive lookup is disabled here, so only the host is queried
        if False:
            archived_urls = time_machine(domain, 'domain')
        else:
            archived_urls = time_machine(host, 'host')
        print('%s Retrieved %i URLs from archive.org' % (
            good, len(archived_urls) - 1))
        for url in archived_urls:
            verb('Internal page', url)
            internal.add(url)
    # Makes request to robots.txt
    response = requests.get(input_url + '/robots.txt',
                            verify=False,
                            proxies=random.choice(proxies)).text
    # Making sure robots.txt isn't some fancy 404 page
    if '<body' not in response:
        # If you know it, you know it
        matches = re.findall(r'Allow: (.*)|Disallow: (.*)', response)
        if matches:
            # Iterating over the matches, match is a tuple here
            for match in matches:
                # One item in match will always be empty so will combine both
                # items
                match = ''.join(match)
                # If the URL doesn't use a wildcard
                if '*' not in match:
                    url = input_url + match
                    # Add the URL to internal list for crawling
                    internal.add(url)
                    # Add the URL to robots list
                    robots.add(url)
            print('%s URLs retrieved from robots.txt: %s' % (good, len(robots)))
    # Makes request to sitemap.xml
    response = requests.get(input_url + '/sitemap.xml',
                            verify=False,
                            proxies=random.choice(proxies)).text
    # Making sure sitemap.xml isn't some fancy 404 page
    if '<body' not in response:
        matches = xml_parser(response)
        if matches: # if there are any matches
            print('%s URLs retrieved from sitemap.xml: %s' % (
                good, len(matches)))
            for match in matches:
                verb('Internal page', match)
                # Cleaning up the URL and adding it to the internal list for
                # crawling
                internal.add(match)
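
Note: xml_parser is assumed to return the URLs listed in the sitemap's <loc> elements; a minimal sketch of that helper:

import re

def xml_parser(response):
    """Return every URL wrapped in <loc>...</loc> in a sitemap.xml body."""
    return re.findall(r'<loc>(.*?)</loc>', response)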
Example no. 6
def zap(input_url, archive, domain, host, internal, robots, proxies):
    """Extract links from robots.txt and sitemap.xml."""
    if archive:
        print('%s Fetching URLs from archive.org' % run)
        if False:
            archived_urls = time_machine(domain, 'domain')
        else:
            archived_urls = time_machine(host, 'host')
        print('%s Retrieved %i URLs from archive.org' % (
            good, len(archived_urls) - 1))
        for url in archived_urls:
            verb('Internal page', url)
            internal.add(url)
    # Makes request to robots.txt
    response = requests.get(input_url + '/robots.txt',
                            proxies=random.choice(proxies)).text
    # Making sure robots.txt isn't some fancy 404 page
    if '<body' not in response:
        # If you know it, you know it
        matches = re.findall(r'Allow: (.*)|Disallow: (.*)', response)
        if matches:
            # Iterating over the matches, match is a tuple here
            for match in matches:
                # One item in match will always be empty so will combine both
                # items
                match = ''.join(match)
                # If the URL doesn't use a wildcard
                if '*' not in match:
                    url = input_url + match
                    # Add the URL to internal list for crawling
                    internal.add(url)
                    # Add the URL to robots list
                    robots.add(url)
            print('%s URLs retrieved from robots.txt: %s' % (good, len(robots)))
    # Makes request to sitemap.xml
    response = requests.get(input_url + '/sitemap.xml',
                            proxies=random.choice(proxies)).text
    # Making sure sitemap.xml isn't some fancy 404 page
    if '<body' not in response:
        matches = xml_parser(response)
        if matches: # if there are any matches
            print('%s URLs retrieved from sitemap.xml: %s' % (
                good, len(matches)))
            for match in matches:
                verb('Internal page', match)
                # Cleaning up the URL and adding it to the internal list for
                # crawling
                internal.add(match)
Example no. 7
def extractor(url):
    """Extract details from the response body."""
    response = requester(url, main_url, delay, cook, headers, timeout, host,
                         ninja, user_agents, failed, processed)
    if clone:
        mirror(url, response)
    matches = re.findall(r'<[aA].*(href|HREF)=([^\s>]+)', response)
    for link in matches:
        # Remove everything after a "#" to deal with in-page anchors
        link = link[1].replace('\'', '').replace('"', '').split('#')[0]
        # Checks if the URLs should be crawled
        if is_link(link, processed, files):
            if link[:4] == 'http':
                if link.startswith(main_url):
                    verb('Internal page', link)
                    internal.add(link)
                else:
                    verb('External page', link)
                    external.add(link)
            elif link[:2] == '//':
                if link.split('/')[2].startswith(host):
                    verb('Internal page', link)
                    internal.add(schema + link)
                else:
                    verb('External page', link)
                    external.add(link)
            elif link[:1] == '/':
                verb('Internal page', link)
                internal.add(main_url + link)
            else:
                verb('Internal page', link)
                internal.add(main_url + '/' + link)

    if not only_urls:
        intel_extractor(response)
        js_extractor(response)
    if args.regex and not supress_regex:
        regxy(args.regex, response, supress_regex, custom)
    if api:
        matches = re.findall(r'[\w-]{16,45}', response)
        for match in matches:
            if entropy(match) >= 4:
                verb('Key', match)
                keys.add(url + ': ' + match)
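
Note: the entropy helper is not shown in any of the examples. A common choice for this kind of key detection is per-character Shannon entropy, which is presumably what the >= 4 threshold is tuned for; a sketch of that assumption:

import math
from collections import Counter

def entropy(string):
    """Shannon entropy in bits per character; random API keys score markedly
    higher than ordinary identifiers of the same length."""
    length = len(string)
    return -sum((count / length) * math.log2(count / length)
                for count in Counter(string).values())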
Example no. 8
def extractor(url):
    """Extract details from the response body."""
    # Skip URLs that contain "//" more than once (likely malformed or a doubled scheme)
    try:
        re.findall(r"//", url)[1]
        return
    except IndexError:
        pass

    # Skip URLs that contain "https:" more than once
    try:
        re.findall(r"https:", url)[1]
        return
    except IndexError:
        pass

    if not re.match(main_url, url):
        return
    print(url)
    response = requester(url, main_url, delay, cook, headers, timeout, host, proxies, user_agents, failed, processed)
    if clone:
        mirror(url, response)
    matches = rhref.findall(response)
    for link in matches:
        # Remove everything after a "#" to deal with in-page anchors
        link = link[1].replace('\'', '').replace('"', '').split('#')[0]
        # Checks if the URLs should be crawled
        if is_link(link, processed, files):
            if link[:4] == 'http':
                if link.startswith(main_url):
                    verb('Internal page', link)
                    internal.add(link)
                else:
                    verb('External page', link)
                    external.add(link)
            elif link[:2] == '//':
                if link.split('/')[2].startswith(host):
                    verb('Internal page', link)
                    internal.add(schema + '://' + link)
                else:
                    verb('External page', link)
                    external.add(link)
            elif link[:1] == '/':
                verb('Internal page', link)
                internal.add(remove_file(url) + link)
            else:
                verb('Internal page', link)
                usable_url = remove_file(url)
                if usable_url.endswith('/'):
                    internal.add(usable_url + link)
                elif link.startswith('/'):
                    internal.add(usable_url + link)
                else:
                    internal.add(usable_url + '/' + link)

    if not only_urls:
        intel_extractor(url, response)
        js_extractor(response)
    if args.regex and not supress_regex:
        regxy(args.regex, response, supress_regex, custom)
    if api:
        matches = rentropy.findall(response)
        for match in matches:
            if entropy(match) >= 4:
                verb('Key', match)
                keys.add(url + ': ' + match)
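
Note: remove_file is assumed to strip the trailing file component of a URL (everything after the last slash in the path) so that relative links resolve against the containing directory. A sketch of that behaviour:

def remove_file(url):
    """Drop the last path segment: 'https://x.y/a/b.html' becomes 'https://x.y/a'."""
    # Only trim when there is a path beyond 'scheme://host'
    if url.count('/') > 2:
        return url.rsplit('/', 1)[0]
    return url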
Example no. 9
def extractor(url):
    """Extract details from the response body."""
    response = requester(
        url,
        main_url,
        delay,
        cook,
        headers,
        timeout,
        host,
        proxies,
        user_agents,
        failed,
        processed,
    )
    if clone:
        mirror(url, response)
    matches = rhref.findall(response)

    for link in matches:
        # Remove everything after a "#" to deal with in-page anchors
        link = link[1].replace("'", "").replace('"', "").split("#")[0]
        # Checks if the URLs should be crawled
        if is_link(link, processed, files):
            if link[:4] == "http":
                if link.startswith(main_url):
                    verb("Internal page", link)
                    internal.add(link)
                else:
                    verb("External page", link)
                    external.add(link)
            elif link[:2] == "//":
                if link.split("/")[2].startswith(host):
                    verb("Internal page", link)
                    internal.add(schema + "://" + link)
                else:
                    verb("External page", link)
                    external.add(link)
            elif link[:1] == "/":
                verb("Internal page", link)
                internal.add(remove_file(url) + link)
            else:
                verb("Internal page", link)
                usable_url = remove_file(url)
                if usable_url.endswith("/"):
                    internal.add(usable_url + link)
                elif link.startswith("/"):
                    internal.add(usable_url + link)
                else:
                    internal.add(usable_url + "/" + link)

    if not only_urls:
        intel_extractor(url, response)
        js_extractor(response)
    if args.regex and not supress_regex:
        regxy(args.regex, response, supress_regex, custom)
    if api:
        matches = rentropy.findall(response)
        for match in matches:
            if entropy(match) >= 4:
                verb("Key", match)
                keys.add(url + ": " + match)
Example no. 10
def extractor(url):
    """Extract details from the response body."""
    response = requester(url, main_url, delay, cook, headers, timeout, host, proxies, user_agents, failed, processed)
    if clone:
        mirror(url, response)
    matches = rhref.findall(response)
    for link in matches:
        # Remove everything after a "#" to deal with in-page anchors
        link = link[1].replace('\'', '').replace('"', '').split('#')[0]
        # Checks if the URLs should be crawled
        if is_link(link, processed, files):
            if link[:4] == 'http':
                if link.startswith(main_url):
                    verb('Internal page', link)
                    internal.add(link)
                else:
                    verb('External page', link)
                    external.add(link)
            elif link[:2] == '//':
                if link.split('/')[2].startswith(host):
                    verb('Internal page', link)
                    internal.add(schema + '://' + link)
                else:
                    verb('External page', link)
                    external.add(link)
            elif link[:1] == '/':
                verb('Internal page', link)
                internal.add(remove_file(url) + link)
            else:
                verb('Internal page', link)
                usable_url = remove_file(url)
                if usable_url.endswith('/'):
                    internal.add(usable_url + link)
                elif link.startswith('/'):
                    internal.add(usable_url + link)
                else:
                    internal.add(usable_url + '/' + link)

    if not only_urls:
        intel_extractor(url, response)
        js_extractor(response)
    if args.regex and not supress_regex:
        regxy(args.regex, response, supress_regex, custom)
    if api:
        matches = rentropy.findall(response)
        for match in matches:
            if entropy(match) >= 4:
                verb('Key', match)
                keys.add(url + ': ' + match)
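
Note: none of the examples show the loop that drives extractor. A hypothetical single-threaded, breadth-first driver, assuming internal is seeded with the start URL and processed tracks visited pages, could look like this:

# Hypothetical crawl driver (not part of the snippets above): one level per
# iteration, stopping after a fixed depth.
crawl_level = 2
for _ in range(crawl_level):
    pending = internal - processed   # pages discovered but not yet fetched
    for page in pending:
        processed.add(page)
        extractor(page)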