Esempio n. 1
0
    def _treewalk_stop(self, data: Any) -> communication.Response:
        """Abort the running TreeWalk and shut down all of its workers.

        If the TreeWalk is in the ready state nothing is stopped; the
        response is still flagged as successful but carries an explanatory
        message. Otherwise every worker receives a stop command, the
        workers are joined, the crawl is marked as aborted in the database
        and the manager is reset.

        Args:
            data (Any): ignored, but required due to callback signature

        Returns:
            communication.Response: response object

        """
        if self._state.is_ready():
            return communication.Response(
                success=True,
                message='Attempted to stop when TreeWalk was ready.',
                command=communication.MANAGER_STOP)

        stop_command = communication.Command(
            command=communication.WORKER_STOP, data=None)
        for control in self._workers:
            control.queue_input.put(stop_command)
            # Block until this worker has answered; the answer is discarded.
            control.queue_output.get()

        # Set the exit event, wait for every worker, then clear the event.
        self._workers_can_exit.set()
        for control in self._workers:
            control.me.join()
        self._workers_can_exit.clear()

        self._db_connection.set_crawl_state(
            tree_walk_id=self._tree_walk_id,
            status=communication.CRAWL_STATUS_ABORTED)
        self._reset()
        return communication.Response(
            success=True,
            message=communication.MANAGER_OK,
            command=communication.MANAGER_STOP)
Esempio n. 2
0
    def _treewalk_pause(self, data: Any) -> communication.Response:
        """Put the running TreeWalk into the paused state.

        The manager state is switched to paused, every worker is sent a
        pause command and the crawl is marked as paused in the database.
        If a ``treewalk.StateException`` is raised along the way, an
        unsuccessful response containing the exception text is returned.

        Args:
            data (Any): ignored, but required due to callback signature

        Returns:
            communication.Response: response object

        """
        try:
            self._state.set_paused()
            pause_command = communication.Command(
                command=communication.WORKER_PAUSE, data=None)
            for control in self._workers:
                control.queue_input.put(pause_command)
            self._db_connection.set_crawl_state(
                tree_walk_id=self._tree_walk_id,
                status=communication.CRAWL_STATUS_PAUSED)
        except treewalk.StateException as err:
            return communication.Response(
                success=False,
                message=f'Attempted to pause. {str(err)}',
                command=communication.MANAGER_PAUSE)
        return communication.Response(
            success=True,
            message=communication.MANAGER_OK,
            command=communication.MANAGER_PAUSE)
Esempio n. 3
0
 def done() -> None:
     """Tell every worker to finish and mark the crawl as finished.

     The same finish command object is put on each worker's input queue;
     afterwards the crawl state in the database is set to finished.
     """
     finish_command = communication.Command(
         command=communication.WORKER_FINISH,
         data=None,
     )
     for control in self._workers:
         control.queue_input.put(finish_command)

     self._db_connection.set_crawl_state(
         tree_walk_id=self._tree_walk_id,
         status=communication.CRAWL_STATUS_FINISHED,
     )
Esempio n. 4
0
    def _update_workers(self, num_workers: int, reduce: bool) -> None:
        """Shrink or grow the worker pool to ``num_workers`` workers.

        When reducing, the surplus workers are taken from the front of the
        worker list, sent a stop command and joined. When increasing, new
        workers are created with their own input and output queues,
        registered and started. In both cases the work packages are resized
        and the worker bookkeeping is updated afterwards.

        Args:
            num_workers (int): new number of workers
            reduce (bool): if True reduce, otherwise increase

        """
        current = self._num_workers.value
        if reduce:
            diff = current - num_workers
            stop_command = communication.Command(
                command=communication.WORKER_STOP, data=None)
            # Detach the surplus workers from the pool before stopping them.
            surplus = self._workers[:diff]
            del self._workers[:diff]
            for control in surplus:
                control.queue_input.put(stop_command)
                # Block until this worker has answered the stop command.
                control.queue_output.get()
            # Set the exit event, wait for the workers, then clear the event.
            self._workers_can_exit.set()
            for control in surplus:
                control.me.join()
            self._workers_can_exit.clear()
            logging.info(
                f'TWManager: reduced the number of workers by {diff}.')
        else:
            diff = num_workers - current
            for _ in range(diff):
                # Every worker gets its own pair of queues.
                worker_in = multiprocessing.Queue()
                worker_out = multiprocessing.Queue()
                new_worker = Worker(
                    queue_input=worker_in,
                    queue_output=worker_out,
                    config=self._config,
                    connection_data=self._connection_data,
                    tree_walk_id=self._tree_walk_id,
                    lock=self._worker_lock,
                    counter=self._worker_counter,
                    finished=self._workers_finished,
                    num_workers=self._num_workers,
                    measure_time=self._measure_time,
                    event_can_exit=self._workers_can_exit,
                    debug=environment.env.CRAWLER_LOGGING_LEVEL == 'DEBUG')
                self._workers.append(
                    WorkerControl(
                        worker=new_worker,
                        queue_input=worker_in,
                        queue_output=worker_out,
                        event_finished=self._workers_can_exit))
                new_worker.start()
            logging.info(
                f'TWManager: increased the number of workers by {diff}.')

        # Bring the package layout and the counters in line with the new size.
        self._work_packages = treewalk.resize_work_packages(
            work_packages=self._work_packages, num_workers=num_workers)
        self._state.set_running_workers(num_workers)
        self._num_workers.value = num_workers
def shutdown() -> communication.Response:
    """Request a shutdown of the TreeWalk manager.

    A ``MANAGER_SHUTDOWN`` command is put on the manager's input queue and
    the call then blocks until a response is available on the manager's
    output queue.

    Returns:
        communication.Response: response object

    """
    command = communication.Command(command=communication.MANAGER_SHUTDOWN,
                                    data=None)
    communication.manager_queue_input.put(command)
    return communication.manager_queue_output.get()
def stop() -> communication.Response:
    """Ask the TreeWalk manager to stop the current TreeWalk.

    Blocks until the manager has put a response on its output queue.

    Returns:
        communication.Response: response object

    """
    communication.manager_queue_input.put(
        communication.Command(command=communication.MANAGER_STOP, data=None))
    response = communication.manager_queue_output.get()
    return response
def unpause() -> communication.Response:
    """Ask the TreeWalk manager to resume a paused TreeWalk.

    Blocks until the manager has put a response on its output queue.

    Returns:
        communication.Response: response object

    """
    communication.manager_queue_input.put(
        communication.Command(command=communication.MANAGER_UNPAUSE,
                              data=None))
    response = communication.manager_queue_output.get()
    return response
def start(config: Config) -> communication.Response:
    """Ask the TreeWalk manager to start a TreeWalk with ``config``.

    If the configuration has force-update set, a stop command is sent
    first and its response is read and discarded before the start command
    is issued.

    Args:
        config (Config): new configuration

    Returns:
        communication.Response: response to the start command

    """
    if config.get_force_update():
        logging.info('TWManagerInterface: force-update was set. stopping.')
        stop_command = communication.Command(
            command=communication.MANAGER_STOP, data=None)
        communication.manager_queue_input.put(stop_command)
        # The response to the stop command is read and thrown away.
        communication.manager_queue_output.get()

    start_command = communication.Command(
        command=communication.MANAGER_START, data=config)
    communication.manager_queue_input.put(start_command)
    return communication.manager_queue_output.get()
def _do_command(command: str, data: Any = None) -> communication.Response:
    """Send a command to the scheduler and wait for its response.

    Args:
        command (str): command type
        data (Any, optional): the data required for the command. Defaults to None.

    Returns:
        communication.Response: response

    """
    request = communication.Command(command=command, data=data)
    communication.scheduler_queue_input.put(request)
    # Blocks until the scheduler has put its answer on the output queue.
    return communication.scheduler_queue_output.get()
Esempio n. 10
0
 def work_single() -> None:
     """Hand one small work package to every worker and wait for completion.

     For each worker index one package is popped from the package list at
     that index. If nothing can be popped, an empty package is used so that
     every worker still receives a command. Exhausted package lists are
     dropped afterwards.
     """
     batch = []
     for slot in range(self._num_workers.value):
         try:
             package = self._work_packages[slot].pop()
         except IndexError:
             # No list at this index, or the list is already empty.
             package = []
         batch.append(package)

     # Keep only the package lists that still contain work.
     self._work_packages = [
         remaining for remaining in self._work_packages if remaining
     ]

     for slot, package in enumerate(batch):
         self._workers[slot].queue_input.put(
             communication.Command(
                 command=communication.WORKER_PACKAGE, data=package))

     # Wait for the finished event, then clear it again.
     self._workers_finished.wait()
     self._workers_finished.clear()
Esempio n. 11
0
 def work_split() -> None:
     """Distribute the files of one split directory across all workers.

     One directory is popped from the split work packages, its regular
     files are chunked into packages of the configured size, and the
     packages are handed out to the workers round by round.
     """
     directory = self._work_packages_split.pop()
     files = [
         path
         for path in (
             os.path.join(directory, name) for name in os.listdir(directory)
         )
         if os.path.isfile(path)
     ]
     pending = treewalk.chunkify_files(
         files=files, size=self._config.get_package_size())

     # In each iteration, all workers must retrieve a work package.
     # Otherwise, the finish mechanism won't work. Pad with empty packages
     # until the count is a multiple of the number of workers.
     while len(pending) % self._num_workers.value != 0:
         pending.append([])

     while pending:
         for control in self._workers:
             control.queue_input.put(
                 communication.Command(
                     command=communication.WORKER_PACKAGE,
                     data=pending.pop()))
         # Wait for the finished event, then clear it for the next round.
         self._workers_finished.wait()
         self._workers_finished.clear()
Esempio n. 12
0
def shutdown() -> None:
    """Send the shutdown command to the database updater.

    The command is only put on the updater's input queue; no response is
    read.
    """
    communication.database_updater_input.put(
        communication.Command(
            command=communication.DATABASE_UPDATER_SHUTDOWN, data=None))
Esempio n. 13
0
def shutdown() -> None:
    """Send the shutdown command to the TreeWalk scheduler.

    The command is only put on the scheduler's input queue; no response is
    read.
    """
    communication.scheduler_queue_input.put(
        communication.Command(
            command=communication.SCHEDULER_SHUTDOWN, data=None))