def main():
    """Start the scanner."""

    print_banner()
    setup_logger()

    args = require_arguments()

    options = Options()

    options.scope.protocol_must_match = args.protocol_must_match
    options.scope.subdomain_must_match = not args.scan_other_subdomains
    options.scope.hostname_must_match = not args.scan_other_hostnames
    options.scope.tld_must_match = not args.scan_other_tlds
    options.scope.max_depth = args.max_depth if args.crawl else 0
    options.performance.max_threads = args.max_threads
    options.misc.verify_ssl_certificates = not args.ignore_invalid_certificates
    options.misc.trusted_certificates = args.trusted_certificates

    """ ########################################################## """
    """                                                            """
    """  Please add your custom options like e.g. authentication,  """
    """  cookies, headers, proxies or scope options below.         """
    """                                                            """
    """ ########################################################## """



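    # A minimal sketch of what such custom options could look like, assuming
    # nyawc's identity options (cookies, headers, proxies and HTTP auth). The
    # values below are placeholders, not part of the original scanner, so they
    # are left commented out:
    #
    # from requests.auth import HTTPBasicAuth
    # options.identity.auth = HTTPBasicAuth("user", "pass")
    # options.identity.cookies.set(name="session", value="placeholder", domain="example.ltd", path="/")
    # options.identity.headers.update({"User-Agent": "CustomUserAgent/1.0"})
    # options.identity.proxies = {"http": "http://127.0.0.1:8080", "https": "http://127.0.0.1:8080"}
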
    driver = Driver(args, options)
    driver.start()
Example 2
    def test_hash_different_encoded_and_decoded_values(self):
        """Ensure encoded and decoded values have a different hash."""

        queue = Queue(Options())

        queue.add_request(Request("http://example.ltd?val={{aaaa}}"))
        queue.add_request(Request("http://example.ltd?val=%7B%7Baaaa%7D%7D"))

        self.assertEqual(queue.count_total, 2)
Example 3
    def test_hash_different_query_order(self):
        """Ensure query parameters in different orders are treated as one queue item."""

        queue = Queue(Options())

        queue.add_request(Request("https://www.example.ltd?b=b&c=c&a=a"))
        queue.add_request(Request("https://www.example.ltd?b=b&a=a&c=c"))
        queue.add_request(Request("https://www.example.ltd?a=a&b=b&c=c"))

        self.assertEqual(queue.count_total, 1)
Example 4
    def setoptions(self, depth=1):
        """Define how far user want to crawl"""

        options = Options()
        options.scope.max_depth = depth
        options.callbacks.crawler_before_start = self.crawlerstart
        options.callbacks.crawler_after_finish = self.crawlerfinish
        options.callbacks.request_before_start = self.requeststart
        options.callbacks.request_after_finish = self.requestfinish

        self.crawler = nyawcCrawler(options)
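
    # The four callbacks wired up above are not shown in this snippet; below is
    # a minimal sketch of what they could look like (the bodies are assumptions,
    # following the callback signatures used elsewhere in these examples and
    # assuming CrawlerActions is imported from nyawc.CrawlerActions).
    def crawlerstart(self):
        """Called before the crawler starts."""
        print("Crawler started.")

    def crawlerfinish(self, queue):
        """Called after the crawler finishes."""
        print("Crawler finished, {} item(s) in queue.".format(queue.count_total))

    def requeststart(self, queue, queue_item):
        """Called before each request; keep crawling."""
        return CrawlerActions.DO_CONTINUE_CRAWLING

    def requestfinish(self, queue, queue_item):
        """Called after each request; keep crawling."""
        return CrawlerActions.DO_CONTINUE_CRAWLING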
    def test_regex_url_count(self):
        """Test if the amount of URLs found complies with the expected amount."""

        html = ""
        for url in self.__urls:
            html += "\n" + url["test"]

        finder = HTMLRegexLinkScraper(Options(), QueueItem(Request(""), Response()))
        matches = finder.get_requests_from_content(self.__host, html)

        self.assertEqual(len(matches), 30)
Example 6
    def test_hash_option_subdomain_must_not_match(self):
        """Ensure different subdomains are treated as one queue item if subdomains must match is False."""

        options = Options()
        options.scope.subdomain_must_match = False
        queue = Queue(options)

        queue.add_request(Request("https://www.example.ltd"))
        queue.add_request(Request("https://webmail.example.ltd"))
        queue.add_request(Request("https://subdomain.example.ltd"))

        self.assertEqual(queue.count_total, 1)
Example 7
    def test_hash_option_protocol_must_not_match(self):
        """Ensure different protocols are treated as one queue item if protocols must match is False."""

        options = Options()
        options.scope.protocol_must_match = False
        queue = Queue(options)

        queue.add_request(Request("https://example.ltd"))
        queue.add_request(Request("http://example.ltd"))
        queue.add_request(Request("ftp://example.ltd"))

        self.assertEqual(queue.count_total, 1)
    def test_regex_url_matches(self):
        """Test if all the URLs match the found URLs."""
        
        for url in self.__urls:
            finder = HTMLRegexLinkScraper(Options(), QueueItem(Request(""), Response()))
            requests = finder.get_requests_from_content(self.__host, url["test"])

            if url["must_pass"]:
                self.assertEqual(len(requests), 1)
                self.assertEqual(requests[0].url, url["url"])
            else:
                self.assertEqual(len(requests), 0)
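
    # The self.__host and self.__urls fixtures used by these tests are not shown
    # here; below is a minimal sketch of their shape, inferred from how they are
    # accessed above. The concrete values are assumptions, not the original test
    # data, and the real fixture contains many more entries (the count test
    # above expects 30 matches).
    __host = "https://example.ltd/"
    __urls = [
        # A snippet in which the scraper must find exactly one URL.
        {"must_pass": True, "test": '<a href="https://example.ltd/page">link</a>', "url": "https://example.ltd/page"},
        # A snippet that must not yield any request.
        {"must_pass": False, "test": "<p>no link here</p>", "url": None},
    ]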
Example 9
    def test_hash_is_always_the_same(self):
        """Ensure the hashes are calculated correctly by checking for duplicates in the queue."""

        options = Options()
        queue = Queue(options)

        for index in range(0, 100):
            request = Request("https://example.ltd?1=1#2=2")
            HTTPRequestHelper.patch_with_options(request, options)
            request.cookies.set(name='tasty_cookie{}'.format(index), value='yum', domain='example.ltd')
            queue.add_request(request)

        self.assertEqual(queue.count_total, 1)
    def test_crawl_website(self):
        """Crawl the website in `test/` and check if the count is correct."""

        if not self.travis:
            print("\n\nPlease note that the 'TestSite' unit test did not run.")
            print("It will only run in Travis since it needs a webserver.")
            return

        options = Options()
        crawler = Crawler(options)
        crawler.start_with(Request("http://localhost/"))

        self.assertEqual(crawler.queue.count_total, 16)
Example 11
    def test_soup_url_count(self):
        """Test if the amount of URLs found complies with the expected amount."""

        html = ""
        for url in self.__urls:
            html += "\n" + url["test"]

        request = Request(self.__host)
        response = Response()
        response.text = html

        finder = HTMLSoupFormScraper(Options(), QueueItem(request, response))
        matches = finder.get_requests()

        self.assertEqual(len(matches), 4)
    def test_soup_url_matches(self):
        """Test if all the URLs match the found URLs."""

        for url in self.__urls:
            request = Request(self.__host)
            response = Response()
            response.text = url["test"]

            finder = SoupFormScraper(Options(), QueueItem(request, response))
            requests = finder.get_requests()

            if url["must_pass"]:
                self.assertEqual(requests[0].url, url["url"])
                self.assertEqual(len(requests), 1)
            else:
                self.assertEqual(len(requests), 0)
Example 13
def main():
    """Start the scanner."""

    print_banner()
    setup_logger()

    args = require_arguments()

    options = Options()

    options.scope.protocol_must_match = args.protocol_must_match
    options.scope.subdomain_must_match = not args.crawl_other_subdomains
    options.scope.hostname_must_match = not args.crawl_other_hostnames
    options.scope.tld_must_match = not args.crawl_other_tlds
    options.scope.max_depth = args.max_depth
    options.performance.max_threads = args.max_threads

    driver = Driver(args, options)
    driver.start()
Example 15
    def __init__(self):
        """Runs when the object is initialized."""

        # List of crawled URLs.
        self.crawled = []
        # Define the crawling options as described above.
        options = Options()
        # Crawl depth is the depth in the URL tree: for example, starting at
        # apple.com, apple.com/buy is at depth 1, while apple.com/buy/iphone11
        # is at depth 2 (assuming it can only be reached via apple.com/buy).
        options.scope.max_depth = 1
        # Nothing to do before the crawler starts; lambda: None is an empty
        # action, equivalent to def my_def(): pass.
        options.callbacks.crawler_before_start = lambda: None
        # Nothing to do after the crawler finishes either.
        options.callbacks.crawler_after_finish = lambda queue: None
        # Tell nyawc to continue crawling before each request.
        options.callbacks.request_before_start = lambda queue, queue_item: CrawlerActions.DO_CONTINUE_CRAWLING
        # Call the request_after_finish method after each request.
        options.callbacks.request_after_finish = self.request_after_finish
        self.crawler = nyawcCrawler(options)
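
    # The request_after_finish callback registered above is not shown in this
    # example; below is a minimal sketch of what it could look like (the body is
    # an assumption, not the original implementation): record the crawled URL
    # and tell nyawc to keep crawling.
    def request_after_finish(self, queue, queue_item):
        """Store the finished request and continue crawling."""

        self.crawled.append(queue_item.request.url)
        return CrawlerActions.DO_CONTINUE_CRAWLING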
def cb_request_on_error(queue_item, message):
    print("[error] " + message)


def cb_form_before_autofill(queue_item, elements, form_data):
    # return CrawlerActions.DO_NOT_AUTOFILL_FORM

    return CrawlerActions.DO_AUTOFILL_FORM


def cb_form_after_autofill(queue_item, elements, form_data):
    pass
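    # form_data holds the form fields that nyawc just autofilled (name -> value).
    # This callback could inspect or adjust them, e.g.
    # form_data["email"] = "crawler@example.ltd" (the field name, and whether
    # such changes propagate to the submitted request, are assumptions here).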


# Declare the options
options = Options()

# Callback options (https://tijme.github.io/not-your-average-web-crawler/latest/options_callbacks.html)
options.callbacks.crawler_before_start = cb_crawler_before_start  # Called before the crawler starts crawling. Default is a null route.
options.callbacks.crawler_after_finish = cb_crawler_after_finish  # Called after the crawler finished crawling. Default is a null route.
options.callbacks.request_before_start = cb_request_before_start  # Called before the crawler starts a new request. Default is a null route.
options.callbacks.request_after_finish = cb_request_after_finish  # Called after the crawler finishes a request. Default is a null route.
options.callbacks.request_in_thread_before_start = cb_request_in_thread_before_start  # Called in the crawling thread (when it started). Default is a null route.
options.callbacks.request_in_thread_after_finish = cb_request_in_thread_after_finish  # Called in the crawling thread (when it finished). Default is a null route.
options.callbacks.request_on_error = cb_request_on_error  # Called if a request failed. Default is a null route.
options.callbacks.form_before_autofill = cb_form_before_autofill  # Called before the crawler autofills a form. Default is a null route.
options.callbacks.form_after_autofill = cb_form_after_autofill  # Called after the crawler autofills a form. Default is a null route.

# Scope options (https://tijme.github.io/not-your-average-web-crawler/latest/options_crawling_scope.html)
options.scope.protocol_must_match = False  # Only crawl pages with the same protocol as the startpoint (e.g. only https). Default is False.
options.scope.subdomain_must_match = True  # Only crawl pages with the same subdomain as the startpoint. If the startpoint is not a subdomain, no subdomains will be crawled. Default is True.