Example #1
    def __init__(self, options):
        """Constructs a Crawler instance.

        Args:
            options (obj): The options to use for the current crawling runtime.

        """

        self.__options = options
        self.__queue = Queue(self.__options)
Example #2
    def test_hash_is_always_the_same(self):
        """Ensure the hashes are calculated correctly by checking for duplicates in the queue."""

        options = Options()
        queue = Queue(options)

        for index in range(0, 100):
            request = Request("https://example.ltd?1=1#2=2")
            HTTPRequestHelper.patch_with_options(request, options)
            request.cookies.set(name='tasty_cookie{}'.format(index), value='yum', domain='example.ltd')
            queue.add_request(request)

        self.assertEqual(queue.count_total, 1)
Example #3
    def __init__(self, options):
        """Constructs a Crawler instance.

        Args:
            options (obj): The options to use for the current crawling runtime.

        """

        self.__options = options
        self.queue = Queue(self.__options)
        self.__stopping = False
        self.__stopped = False
        self.__lock = threading.Lock()
Example #4
    def __init__(self, options):
        """Constructs a Crawler instance.

        Args:
            options (:class:`nyawc.Options`): The options to use for the current crawling runtime.

        """

        signal.signal(signal.SIGINT, self.__signal_handler)

        self.queue = Queue(options)
        self.__options = options
        self.__should_stop = False
        self.__stopping = False
        self.__stopped = False
        self.__threads = {}
        self.__lock = threading.Lock()
Example #5
    def test_hash_option_subdomain_must_not_match(self):
        """Ensure different subdomains are treated as one queue item if subdomains must match is False."""

        options = Options()
        options.scope.subdomain_must_match = False
        queue = Queue(options)

        queue.add_request(Request("https://www.example.ltd"))
        queue.add_request(Request("https://webmail.example.ltd"))
        queue.add_request(Request("https://subdomain.example.ltd"))

        self.assertEqual(queue.count_total, 1)
Example #6
    def test_hash_option_protocol_must_not_match(self):
        """Ensure different protocols are treated as one queue item if protocols must match is False."""

        options = Options()
        options.scope.protocol_must_match = False
        queue = Queue(options)

        queue.add_request(Request("https://example.ltd"))
        queue.add_request(Request("http://example.ltd"))
        queue.add_request(Request("ftp://example.ltd"))

        self.assertEqual(queue.count_total, 1)
Example #7
    def test_hash_different_encoded_and_decoded_values(self):
        """Ensure encoded and decoded values have a different hash."""

        queue = Queue(Options())

        queue.add_request(Request("http://example.ltd?val={{aaaa}}"))
        queue.add_request(Request("http://example.ltd?val=%7B%7Baaaa%7D%7D"))

        self.assertEqual(queue.count_total, 2)
Example #8
    def test_hash_different_query_order(self):
        """Ensure query parameters in different orders are treated as one queue item."""

        queue = Queue(Options())

        queue.add_request(Request("https://www.example.ltd?b=b&c=c&a=a"))
        queue.add_request(Request("https://www.example.ltd?b=b&a=a&c=c"))
        queue.add_request(Request("https://www.example.ltd?a=a&b=b&c=c"))

        self.assertEqual(queue.count_total, 1)
Example #9
class Crawler(object):
    """The main Crawler class which handles the crawling recursion, queue and processes.

    Attributes:
        queue (:class:`nyawc.Queue`): The request/response pair queue containing everything to crawl.
        __options (:class:`nyawc.Options`): The options to use for the current crawling runtime.
        __should_stop (bool): If the crawler should stop the crawling process.
        __stopping (bool): If the crawler is stopping the crawling process.
        __stopped (bool): If the crawler finished stopping the crawling process.
        __threads (dict): All currently running threads, as queue item hash => :class:`nyawc.CrawlerThread`.
        __lock (obj): The callback lock to prevent race conditions.

    """
    def __init__(self, options):
        """Constructs a Crawler instance.

        Args:
            options (:class:`nyawc.Options`): The options to use for the current crawling runtime.

        """

        signal.signal(signal.SIGINT, self.__signal_handler)

        self.queue = Queue(options)
        self.__options = options
        self.__should_stop = False
        self.__stopping = False
        self.__stopped = False
        self.__threads = {}
        self.__lock = threading.Lock()

    def __signal_handler(self, signum, frame):
        """On sigint (e.g. CTRL+C) stop the crawler.

        Args:
            signum (int): The signal number.
            frame (obj): The current stack frame.

        """

        self.__crawler_stop()

    def start_with(self, request):
        """Start the crawler using the given request.

        Args:
            request (:class:`nyawc.http.Request`): The startpoint for the crawler.

        """

        HTTPRequestHelper.patch_with_options(request, self.__options)
        self.queue.add_request(request)

        self.__crawler_start()

    def __spawn_new_requests(self):
        """Spawn new requests until the max threads option value is reached.

        Note:
            If no new requests were spawned and there are no requests in progress,
            the crawler will stop crawling.

        """

        in_progress_count = len(
            self.queue.get_all(QueueItem.STATUS_IN_PROGRESS))

        while in_progress_count < self.__options.performance.max_threads:
            if self.__spawn_new_request():
                in_progress_count += 1
            else:
                break

        if in_progress_count == 0:
            self.__crawler_stop()

    def __spawn_new_request(self):
        """Spawn the first queued request if there is one available.

        Returns:
            bool: True if a new request was spawned, False otherwise.

        """

        first_in_line = self.queue.get_first(QueueItem.STATUS_QUEUED)
        if first_in_line is None:
            return False

        self.__request_start(first_in_line)
        return True

    def __wait_for_current_threads(self):
        """Wait until all the current threads are finished."""

        for thread in list(self.__threads.values()):
            thread.join()

    def __crawler_start(self):
        """Spawn the first X queued request, where X is the max threads option.

        Note:
            The main thread will sleep until the crawler is finished. This enables
            quitting the application using SIGINT (see http://stackoverflow.com/a/11816038/2491049).

        """

        try:
            self.__options.callbacks.crawler_before_start()
        except Exception as e:
            print(e)
            print(traceback.format_exc())

        self.__spawn_new_requests()

        while not self.__stopped:
            if self.__should_stop:
                self.__crawler_stop()

            time.sleep(1)

    def __crawler_stop(self):
        """Mark the crawler as stopped.

        Note:
            If :attr:`__stopped` is True, the main thread will be stopped. Every piece of code that gets
            executed after :attr:`__stopped` is True could cause Thread exceptions and/or race conditions.

        """

        if self.__stopping:
            return

        self.__stopping = True
        self.__wait_for_current_threads()

        self.queue.move_bulk(
            [QueueItem.STATUS_QUEUED, QueueItem.STATUS_IN_PROGRESS],
            QueueItem.STATUS_CANCELLED)

        self.__crawler_finish()
        self.__stopped = True

    def __crawler_finish(self):
        """Called when the crawler is finished because there are no queued requests left or it was stopped."""

        try:
            self.__options.callbacks.crawler_after_finish(self.queue)
        except Exception as e:
            print(e)
            print(traceback.format_exc())

    def __request_start(self, queue_item):
        """Execute the request in given queue item.

        Args:
            queue_item (:class:`nyawc.QueueItem`): The request/response pair to scrape.

        """

        try:
            action = self.__options.callbacks.request_before_start(
                self.queue, queue_item)
        except Exception as e:
            action = None
            print(e)
            print(traceback.format_exc())

        if action == CrawlerActions.DO_STOP_CRAWLING:
            self.__should_stop = True

        if action == CrawlerActions.DO_SKIP_TO_NEXT:
            self.queue.move(queue_item, QueueItem.STATUS_FINISHED)
            self.__spawn_new_requests()

        if action == CrawlerActions.DO_CONTINUE_CRAWLING or action is None:
            self.queue.move(queue_item, QueueItem.STATUS_IN_PROGRESS)

            thread = CrawlerThread(self.__request_finish, self.__lock,
                                   self.__options, queue_item)
            self.__threads[queue_item.get_hash()] = thread
            thread.daemon = True
            thread.start()

    def __request_finish(self, queue_item, new_requests, request_failed=False):
        """Called when the crawler finished the given queue item.

        Args:
            queue_item (:class:`nyawc.QueueItem`): The request/response pair that finished.
            new_requests (list(:class:`nyawc.http.Request`)): All the requests that were found during this request.
            request_failed (bool): True if the request failed (and the queue item needs to be moved to the errored status).

        """

        if self.__stopping:
            return

        del self.__threads[queue_item.get_hash()]

        if request_failed:
            new_queue_items = []
            self.queue.move(queue_item, QueueItem.STATUS_ERRORED)
        else:
            new_queue_items = self.__add_scraped_requests_to_queue(
                queue_item, new_requests)
            self.queue.move(queue_item, QueueItem.STATUS_FINISHED)

        try:
            action = self.__options.callbacks.request_after_finish(
                self.queue, queue_item, new_queue_items)
        except Exception as e:
            action = None
            print(e)
            print(traceback.format_exc())

        if action == CrawlerActions.DO_STOP_CRAWLING:
            self.__should_stop = True

        if action == CrawlerActions.DO_CONTINUE_CRAWLING or action is None:
            self.__spawn_new_requests()

    def __add_scraped_requests_to_queue(self, queue_item, scraped_requests):
        """Convert the scraped requests to queue items, return them and also add them to the queue.

        Args:
            queue_item (:class:`nyawc.QueueItem`): The request/response pair that finished.
            scraped_requests (list(:class:`nyawc.http.Request`)): All the requests that were found during this request.

        Returns:
            list(:class:`nyawc.QueueItem`): The new queue items.

        """

        new_queue_items = []

        for scraped_request in scraped_requests:
            HTTPRequestHelper.patch_with_options(scraped_request,
                                                 self.__options, queue_item)

            if not HTTPRequestHelper.complies_with_scope(
                    queue_item, scraped_request, self.__options.scope):
                continue

            if self.queue.has_request(scraped_request):
                continue

            scraped_request.depth = queue_item.request.depth + 1
            if self.__options.scope.max_depth is not None:
                if scraped_request.depth > self.__options.scope.max_depth:
                    continue

            new_queue_item = self.queue.add_request(scraped_request)
            new_queue_items.append(new_queue_item)

        return new_queue_items
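
The Crawler class above is driven entirely through the Options callbacks and start_with(). A minimal usage sketch follows; the import paths (nyawc.Options, nyawc.Crawler, nyawc.CrawlerActions, nyawc.http.Request) are assumptions based on the identifiers used in the examples, not verbatim library documentation.

# Hedged usage sketch of the Crawler shown above (import paths assumed).
from nyawc.Options import Options
from nyawc.Crawler import Crawler
from nyawc.CrawlerActions import CrawlerActions
from nyawc.http.Request import Request

def cb_request_after_finish(queue, queue_item, new_queue_items):
    # Stop once the queue holds more than 100 items in total; otherwise keep crawling.
    # The (queue, queue_item, new_queue_items) signature and queue.count_total come from the examples above.
    if queue.count_total > 100:
        return CrawlerActions.DO_STOP_CRAWLING
    return CrawlerActions.DO_CONTINUE_CRAWLING

options = Options()
options.callbacks.request_after_finish = cb_request_after_finish

crawler = Crawler(options)
crawler.start_with(Request("https://example.ltd/"))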
Example #10
class Crawler:
    """The main Crawler class which handles the crawling recursion, queue and processes.

    Attributes:
        __options (obj): The options to use for the current crawling runtime.
        __queue (obj): The request/response pair queue containing everything to crawl.
        __stopping (bool): If the crawler is stopping the crawling process.
        __stopped (bool): If the crawler finished stopping the crawling process.
        __lock (obj): The callback lock to prevent race conditions.

    """

    __options = None

    __queue = None

    __stopping = False

    __stopped = False

    __lock = threading.Lock()

    def __init__(self, options):
        """Constructs a Crawler instance.

        Args:
            options (obj): The options to use for the current crawling runtime.

        """

        self.__options = options
        self.__queue = Queue(self.__options)

    def start_with(self, request):
        """Start the crawler using the given request.

        Args:
            request (obj): The startpoint for the crawler.

        """

        HTTPRequestHelper.patch_with_options(request, self.__options)
        self.__queue.add_request(request)

        self.__crawler_start()

    def __spawn_new_requests(self):
        """Spawn new requests until the max processes option value is reached.

        Note: If no new requests were spawned and there are no requests in progress,
        the crawler will stop crawling.

        """

        concurrent_requests_count = self.__queue.count_in_progress
        new_requests_spawned = False

        while concurrent_requests_count < self.__options.performance.max_threads:
            if self.__spawn_new_request():
                new_requests_spawned = True
                concurrent_requests_count += 1
            else:
                break

        if concurrent_requests_count == 0 and not new_requests_spawned and not self.__stopping:
            self.__crawler_stop()

    def __spawn_new_request(self):
        """Spawn the first queued request if there is one available.

        Returns:
            bool: If a new request was spawned.

        """

        first_in_line = self.__queue.get_first(QueueItem.STATUS_QUEUED)
        if first_in_line is None:
            return False

        self.__request_start(first_in_line)
        return True

    def __crawler_start(self):
        """Spawn the first X queued request, where X is the max threads option.

        Note: The main thread will sleep until the crawler has stopped or a keyboard
        interrupt is received. This prevents race conditions where sub threads call back
        to the main thread while the main thread has already finished.

        """

        self.__options.callbacks.crawler_before_start()

        try:
            self.__spawn_new_requests()

            while not self.__stopped:
                time.sleep(1)
        except (KeyboardInterrupt, SystemExit):
            pass

    def __crawler_stop(self, force_quit=False):
        """Mark the crawler as stopped.

        Note:
            If self.__stopped is True, the main thread will be stopped. Every piece of code that gets
            executed after self.__stopped is True could cause Thread exceptions and/or race conditions.

        Args:
            force_quit (bool): Also cancel any ongoing requests.

        """

        self.__stopping = True

        for status in [QueueItem.STATUS_QUEUED, QueueItem.STATUS_IN_PROGRESS]:
            for queue_item in self.__queue.get_all(status).values():
                self.__queue.move(queue_item, QueueItem.STATUS_CANCELLED)

        self.__crawler_finish()

        self.__stopped = True

    def __crawler_finish(self):
        """Called when the crawler is finished because there are no queued requests left or it was stopped."""

        self.__options.callbacks.crawler_after_finish(self.__queue)

    def __request_start(self, queue_item):
        """Execute the request in given queue item.

        Args:
            queue_item (obj): The request/response pair to scrape.

        """

        action = self.__options.callbacks.request_before_start(
            self.__queue, queue_item)

        if action == CrawlerActions.DO_STOP_CRAWLING:
            self.__crawler_stop(True)
            return

        if action == CrawlerActions.DO_SKIP_TO_NEXT:
            self.__queue.move(queue_item, QueueItem.STATUS_FINISHED)
            return

        if action == CrawlerActions.DO_CONTINUE_CRAWLING or action is None:
            self.__queue.move(queue_item, QueueItem.STATUS_IN_PROGRESS)

            thread = CrawlerThread(self.__request_finish, self.__lock,
                                   self.__options, queue_item)
            thread.daemon = True
            thread.start()

    def __request_finish(self, queue_item, new_requests):
        """Called when the crawler finished the given queued item.

        Args:
            queue_item (obj): The request/response pair that finished.
            new_requests (list(obj)): All the requests that were found during this request.

        """

        new_queue_items = []
        action = None

        if queue_item.status not in [
                QueueItem.STATUS_ERRORED, QueueItem.STATUS_CANCELLED
        ]:
            for new_request in new_requests:
                HTTPRequestHelper.patch_with_options(new_request,
                                                     self.__options,
                                                     queue_item)

                if not HTTPRequestHelper.complies_with_scope(
                        queue_item, new_request, self.__options.scope):
                    continue

                if self.__queue.has_request(new_request):
                    continue

                new_request.depth = queue_item.request.depth + 1
                if self.__options.scope.max_depth is not None:
                    if new_request.depth > self.__options.scope.max_depth:
                        continue

                new_queue_item = self.__queue.add_request(new_request)
                new_queue_items.append(new_queue_item)

            self.__queue.move(queue_item, QueueItem.STATUS_FINISHED)
            action = self.__options.callbacks.request_after_finish(
                self.__queue, queue_item, new_queue_items)

        if self.__stopping:
            return

        if action == CrawlerActions.DO_STOP_CRAWLING:
            self.__crawler_stop()
            return

        if action == CrawlerActions.DO_CONTINUE_CRAWLING or action is None:
            self.__spawn_new_requests()
            return
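
The action handling in __request_start() above shows how a request_before_start callback can steer the crawl before each request is executed. A short hedged sketch follows, using only attributes that appear in the examples (options.callbacks.request_before_start, queue_item.request.depth, the CrawlerActions constants); the import paths are assumptions.

# Hedged sketch of a request_before_start callback; the (queue, queue_item) signature
# and the CrawlerActions return values are taken from the examples above.
from nyawc.Options import Options
from nyawc.CrawlerActions import CrawlerActions

def cb_request_before_start(queue, queue_item):
    # Skip anything deeper than two levels; queue_item.request.depth is used the same way above.
    if queue_item.request.depth > 2:
        return CrawlerActions.DO_SKIP_TO_NEXT
    return CrawlerActions.DO_CONTINUE_CRAWLING

options = Options()
options.callbacks.request_before_start = cb_request_before_start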