def start() -> flask.Response:
    """API endpoint that starts the crawler with a supplied configuration.

    The configuration is taken from the ``config`` query parameter and
    parsed with ``config_service.ConfigParser``. A failure response is
    returned when the parameter is missing or when parsing raises
    ``config_service.ConfigParsingException``; otherwise the parsed
    configuration is handed to the scheduler.

    Returns:
        flask.Response: REST response
    """
    raw_config = flask.request.args.get('config')
    if raw_config is None:
        return _get_response(
            communication.Response(
                success=False,
                message='Provide config or filepath via ?config=<your-config>',
                command=communication.MANAGER_START,
            ))

    # The parser is built outside the try block: only parse() is guarded.
    config_parser = config_service.ConfigParser(raw_config)
    try:
        parsed_config = config_parser.parse()
    except config_service.ConfigParsingException as error:
        return _get_response(
            communication.Response(success=False,
                                   message=str(error),
                                   command=communication.MANAGER_START))

    return _get_response(scheduler.add_config(parsed_config))
def respond_config_deleted(identifier: str, success: bool) -> None:
    """Report the outcome of deleting a configuration from the schedule.

    A response object describing the outcome is created and put into the
    scheduler output queue.

    Args:
        identifier (str): identifier of the configuration
        success (bool): deletion succeeded/failed
    """
    subject = f'Configuration with identifier {identifier} '
    if not success:
        failure = communication.Response(
            success=False,
            message=subject + "wasn't deleted from the schedule (not present).",
            command=communication.SCHEDULER_REMOVE_CONFIG)
        communication.scheduler_queue_output.put(failure)
        return

    confirmation = communication.Response(
        success=True,
        message=subject + 'was successfully deleted from the schedule.',
        command=communication.SCHEDULER_REMOVE_CONFIG)
    communication.scheduler_queue_output.put(confirmation)
def respond_interval_deleted(identifier: str, success: bool) -> None:
    """Respond that the interval was deleted from the database.

    This function creates a corresponding response object and inserts it in
    the scheduler output queue. The response depends on whether the deletion
    succeeded or failed.

    Args:
        identifier (str): identifier of the interval
        success (bool): deletion succeeded/failed
    """
    if success:
        response = communication.Response(
            success=True,
            message=(f'Interval with identifier {identifier} '
                     f'was successfully deleted from the database.'),
            command=communication.SCHEDULER_REMOVE_INTERVAL)
    else:
        # The message previously ended in the broken word "wasn' present".
        response = communication.Response(
            success=False,
            message=(
                f'Interval with identifier {identifier} '
                f'wasn\'t deleted from the database because it wasn\'t present.'
            ),
            command=communication.SCHEDULER_REMOVE_INTERVAL)
    communication.scheduler_queue_output.put(response)
def respond_interval_inserted(identifier: str, success: bool) -> None:
    """Report the outcome of inserting an interval into the database.

    A response object describing the outcome is created and put into the
    scheduler output queue.

    Args:
        identifier (str): identifier of the interval
        success (bool): insertion succeeded/failed
    """
    subject = f'Interval with identifier {identifier} '
    if success:
        outcome = True
        text = subject + 'was successfully added.'
    else:
        outcome = False
        text = subject + "wasn't added to the database due to an internal error."

    communication.scheduler_queue_output.put(
        communication.Response(success=outcome,
                               message=text,
                               command=communication.SCHEDULER_ADD_INTERVAL))
def _treewalk_pause(self, data: Any) -> communication.Response:
    """Pause the running TreeWalk.

    The state is switched to paused, a pause command is put on every
    worker's input queue and the crawl state in the database is set to
    paused. If a ``treewalk.StateException`` is raised, a failure response
    carrying the exception text is returned instead.

    Args:
        data (Any): ignored, but required due to callback signature

    Returns:
        communication.Response: response object
    """
    try:
        self._state.set_paused()
        pause_command = communication.Command(
            command=communication.WORKER_PAUSE, data=None)
        for control in self._workers:
            control.queue_input.put(pause_command)
        self._db_connection.set_crawl_state(
            tree_walk_id=self._tree_walk_id,
            status=communication.CRAWL_STATUS_PAUSED)
        outcome = (True, communication.MANAGER_OK)
    except treewalk.StateException as err:
        outcome = (False, f'Attempted to pause. {err!s}')

    succeeded, text = outcome
    return communication.Response(success=succeeded,
                                  message=text,
                                  command=communication.MANAGER_PAUSE)
def _treewalk_stop(self, data: Any) -> communication.Response:
    """Stop the running TreeWalk.

    When the state is ready, nothing is stopped and only a message saying so
    is returned. Otherwise every worker receives a stop command, the workers
    are joined, the crawl is marked as aborted in the database and the
    manager is reset. The returned response has ``success=True`` in both
    cases.

    Args:
        data (Any): ignored, but required due to callback signature

    Returns:
        communication.Response: response object
    """
    if self._state.is_ready():
        return communication.Response(
            success=True,
            message='Attempted to stop when TreeWalk was ready.',
            command=communication.MANAGER_STOP)

    stop_command = communication.Command(command=communication.WORKER_STOP,
                                         data=None)
    for control in self._workers:
        control.queue_input.put(stop_command)
        # Read this worker's reply before moving on; its content is unused.
        control.queue_output.get()

    # The exit event is set only while the workers are being joined.
    self._workers_can_exit.set()
    for control in self._workers:
        control.me.join()
    self._workers_can_exit.clear()

    self._db_connection.set_crawl_state(
        tree_walk_id=self._tree_walk_id,
        status=communication.CRAWL_STATUS_ABORTED)
    self._reset()
    return communication.Response(success=True,
                                  message=communication.MANAGER_OK,
                                  command=communication.MANAGER_STOP)
def intervals_add() -> flask.Response:
    """API endpoint to add an interval of maximum resource consumption.

    Reads the ``start``, ``end`` and ``cpu`` query parameters, validates
    them and passes the resulting interval to the scheduler. Each failed
    validation step returns its own failure response.

    Returns:
        flask.Response: REST response
    """

    def _rejected(reason: str) -> flask.Response:
        """Build the failure response for an invalid request."""
        return _get_response(
            communication.Response(
                success=False,
                message=reason,
                command=communication.SCHEDULER_ADD_INTERVAL,
            ))

    query = flask.request.args
    start = query.get('start')
    end = query.get('end')
    cpu = query.get('cpu')

    if any(value is None for value in (start, end, cpu)):
        return _rejected('Please provide a start/end time and a cpu level.')

    if not interval_pkg.TimeInterval.assert_valid(start_str=start,
                                                  end_str=end):
        return _rejected('Invalid start/end times.')

    # A non-numeric value and a number outside 1..4 share one error message.
    try:
        cpu_level = int(cpu)
    except ValueError:
        return _rejected('The CPU level must be 1, 2, 3 or 4.')
    if not 1 <= cpu_level <= 4:
        return _rejected('The CPU level must be 1, 2, 3 or 4.')

    new_interval = interval_pkg.TimeInterval(start_str=start,
                                             end_str=end,
                                             cpu_level=cpu_level)
    return _get_response(scheduler.add_interval(new_interval))
def _clean_up(self) -> None:
    """Release the database connections and report the collected timings.

    Both database connections are closed, a ``WORKER_FINISH`` response whose
    message is the tuple (exiftool time, hashing time, summed database time)
    is put on the output queue, and the method then blocks until the exit
    event is set.
    """
    self.message('cleaning up before exiting.')

    files_connection = self._db_connection_files
    files_connection.close()
    metadata_connection = self._db_connection_metadata
    metadata_connection.close()

    timings = (
        self._exiftool_time,
        self._hashing_time,
        files_connection.get_time() + metadata_connection.get_time(),
    )
    self._queue_output.put(
        communication.Response(success=True,
                               message=timings,
                               command=communication.WORKER_FINISH))
    self._event_can_exit.wait()
def shutdown() -> flask.Response:
    """Shut down the manager, the database updater, the scheduler and Flask.

    The three components are shut down first. Afterwards the server's
    shutdown hook is looked up in the request environment under
    ``werkzeug.server.shutdown`` and called.

    Returns:
        flask.Response: REST response; a failure response if the server
            shutdown hook is not available.
    """
    manager.shutdown()
    db_updater.shutdown()
    scheduler.shutdown()

    stop_server = flask.request.environ.get('werkzeug.server.shutdown')
    if stop_server is None:
        logging.critical('TWApi: Unable to shutdown Flask server!')
        # Returning None here would make Flask reject the result if this
        # function is registered as a view, so report the failure explicitly.
        response = communication.Response(
            success=False,
            message='Unable to shutdown Flask server!',
            command=communication.MANAGER_SHUTDOWN)
        return _get_response(response)

    stop_server()
    response = communication.Response(success=True,
                                      message='Shutting down. Bye!',
                                      command=communication.MANAGER_SHUTDOWN)
    return _get_response(response)
def respond_schedule(schedule: dict) -> None:
    """Respond with the TreeWalk schedule.

    A response object is created and put into the scheduler output queue.
    If ``schedule`` is ``None`` the response reports a failure; otherwise
    the schedule itself is used as the message.

    Args:
        schedule (dict): schedule
    """
    available = schedule is not None
    payload = schedule if available else 'Unable to read schedule.'
    communication.scheduler_queue_output.put(
        communication.Response(
            success=available,
            message=payload,
            command=communication.SCHEDULER_GET_SCHEDULE))
def respond_intervals(intervals: dict) -> None:
    """Respond with the intervals.

    A response object is created and put into the scheduler output queue.
    If ``intervals`` is ``None`` the response reports a failure; otherwise
    the intervals themselves are used as the message.

    Args:
        intervals (dict): intervals
    """
    available = intervals is not None
    payload = intervals if available else 'Unable to read intervals.'
    communication.scheduler_queue_output.put(
        communication.Response(
            success=available,
            message=payload,
            command=communication.SCHEDULER_GET_INTERVALS))
def respond_interval_overlaps(identifier: str) -> None:
    """Report that the interval overlaps with an already existing one.

    A failure response is created and put into the scheduler output queue.

    Args:
        identifier (str): identifier of the interval
    """
    text = f'Interval with identifier {identifier} is overlapping.'
    communication.scheduler_queue_output.put(
        communication.Response(
            success=False,
            message=text,
            command=communication.SCHEDULER_ADD_INTERVAL))
def respond_config_already_present(identifier: str) -> None:
    """Report that the config is already present in the schedule.

    A failure response is created and put into the scheduler output queue.

    Args:
        identifier (str): identifier of the configuration
    """
    text = (f'Configuration with identifier {identifier} '
            'is already present in the schedule, thus it was not added.')
    communication.scheduler_queue_output.put(
        communication.Response(
            success=False,
            message=text,
            command=communication.SCHEDULER_ADD_CONFIG))
def info(self) -> communication.Response:
    """Return the current status.

    The message is a dict with the keys ``status``, ``config`` and
    ``processes``. While not ready, ``config`` holds the result of
    ``get_data(as_json=False)`` and a ``progress`` entry formatted with two
    decimals is added.

    Returns:
        communication.Response: current info
    """
    ready = self._is_ready()
    payload = {
        'status': self._status,
        'config': (self._config
                   if ready else self._config.get_data(as_json=False)),
        'processes': self._running_workers,
    }
    if not ready:
        payload['progress'] = f'{self._progress:.2f}'

    return communication.Response(success=True,
                                  message=payload,
                                  command=communication.MANAGER_INFO)
def _treewalk_start(self, config: Config) -> communication.Response:
    """Start the TreeWalk with given configuration.

    The start is refused with a failure response while the state is paused
    or running. Otherwise the state is set to preparing, the database record
    and work packages are created, one worker process per worker is created
    and started, and the manager's bookkeeping is updated.

    Args:
        config (Config): configuration of new TreeWalk

    Returns:
        communication.Response: response object
    """

    def prepare(config: Config) -> Tuple[int, int, list, list]:
        """Prepare the start of the TreeWalk.

        Initialize required data such as work packages or number of workers.

        Args:
            config (Config): config of the execution

        Returns:
            Tuple[int, int, list, list]: (db_id, #workers,
                work_packages_single, work_packages_split)
        """
        # Prepare database
        self._db_connection = database.DatabaseConnection(
            db_info=self._connection_data, measure_time=self._measure_time)
        db_id = self._db_connection.insert_new_record_crawls(config)

        # Prepare number of workers
        number_of_workers = treewalk.get_number_of_workers(
            config.get_cpu_level())
        max_cpu_level, max_num_workers = self._state.get_cpu_level(True)
        if max_cpu_level > 0:
            requested = number_of_workers
            number_of_workers = min(max_num_workers, requested)
            # Log only a real reduction, and report the actual difference.
            if number_of_workers < requested:
                logging.info(f'Reduced the number of workers by '
                             f'{requested - number_of_workers} '
                             f'due to interval restriction.')

        # Prepare work packages
        work_packages, split = treewalk.create_work_packages(
            inputs=config.get_directories(),
            work_package_size=config.get_package_size(),
            number_of_workers=number_of_workers,
            already_processed=[])
        return (db_id, number_of_workers, work_packages, split)

    # Check if it is ok to run (preparing doesn't have to be checked
    # since it cannot be interrupted)
    if self._state.is_paused():
        return communication.Response(
            success=False,
            message='Attempted to start when TreeWalk was paused.',
            command=communication.MANAGER_START)
    if self._state.is_running():
        logging.info('TWManager: attempted to start when TW is running.')
        return communication.Response(
            success=False,
            message='Attempted to start when TreeWalk is running.',
            command=communication.MANAGER_START)
    self._state.set_preparing(config)

    # Prepare the data
    data = prepare(config)
    tree_walk_id, num_workers, work_packages, work_packages_split = data

    # Create the worker processes and start them
    for _ in range(num_workers):
        queue_input = multiprocessing.Queue()
        queue_output = multiprocessing.Queue()
        worker = Worker(
            queue_input=queue_input,
            queue_output=queue_output,
            config=config,
            connection_data=self._connection_data,
            tree_walk_id=tree_walk_id,
            lock=self._worker_lock,
            counter=self._worker_counter,
            finished=self._workers_finished,
            num_workers=self._num_workers,
            measure_time=self._measure_time,
            event_can_exit=self._workers_can_exit,
            debug=environment.env.CRAWLER_LOGGING_LEVEL == 'DEBUG')
        worker_control = WorkerControl(
            worker=worker,
            queue_input=queue_input,
            queue_output=queue_output,
            event_finished=self._workers_can_exit)
        self._workers.append(worker_control)
    for worker_control in self._workers:
        worker_control.me.start()

    # Update the manager
    self._config = config
    self._roots = config.get_directories()
    self._num_workers.value = num_workers
    self._work_packages = work_packages
    self._work_packages_split = work_packages_split
    self._total = self._get_number_of_work_packages()
    self._tree_walk_id = tree_walk_id
    self._state.set_running(config)
    self._state.set_running_workers(self._num_workers.value)
    self._time_start = datetime.now()
    return communication.Response(success=True,
                                  message=communication.MANAGER_OK,
                                  command=communication.MANAGER_START)