class CoreVerificator(object):
    """Verifies subtask results by checking that reported result files exist.

    Verification outcomes are cached per subtask id in ``ver_states``
    (values are ``SubtaskVerificationState`` members).
    """
    # Translates a missing-subtask KeyError into the project's logged handling.
    handle_key_error_for_state = HandleKeyError(state_check_log_key_error)

    def __init__(self, verification_options=None, advanced_verification=False):
        self.ver_states = {}  # subtask_id -> SubtaskVerificationState
        self.advanced_verification = advanced_verification
        self.verification_options = verification_options

    def set_verification_options(self, verification_options):
        """Set options; advanced verification is on iff options are given."""
        self.verification_options = verification_options
        # Idiom fix: the flag is exactly "options were provided" — the original
        # if/else pair assigning True/False collapses to one boolean expression.
        self.advanced_verification = verification_options is not None

    def is_verified(self, subtask_id):
        """Return True when the cached state for subtask_id is VERIFIED."""
        return self.ver_states.get(subtask_id) == \
            SubtaskVerificationState.VERIFIED

    @handle_key_error_for_state
    def get_verification_state(self, subtask_id):
        """Return the cached verification state (KeyError handled by decorator)."""
        return self.ver_states[subtask_id]

    @handle_key_error_for_state
    def verify(self, subtask_id, subtask_info, tr_files, task):
        """Run verification for subtask_id and return the resulting state."""
        self._check_files(subtask_id, subtask_info, tr_files, task)
        return self.ver_states[subtask_id]

    def _check_files(self, subtask_id, subtask_info, tr_files, task):
        # Marks VERIFIED as soon as ANY reported file exists on disk.
        # NOTE(review): subtask_info and task are unused here — presumably
        # hooks for advanced verification; confirm before removing.
        for tr_file in tr_files:
            if os.path.isfile(tr_file):
                self.ver_states[subtask_id] = \
                    SubtaskVerificationState.VERIFIED
                return
        self.ver_states[subtask_id] = SubtaskVerificationState.WRONG_ANSWER
class TestHandleKeyError(TestCase):
    # Decorator under test: wraps a callable so that a raised KeyError is
    # routed to handle_error instead of propagating to the caller.
    h = HandleKeyError(handle_error)

    @staticmethod
    def add_to_el_in_dict(dict_, el, num):
        # Raises KeyError when `el` is not already a key of `dict_`
        # (augmented assignment reads the current value first); returns None.
        dict_[el] += num

    @h
    def test_call_with_dec(self):
        # 'kwa' is absent from d, so add_to_el_in_dict raises KeyError, which
        # the @h decorator intercepts.  NOTE(review): the assert below can
        # therefore never execute — the test passes only because handle_error
        # swallows the KeyError; confirm against handle_error's definition.
        d = {'bla': 3}
        assert self.add_to_el_in_dict(d, 'kwa', 3) == 6
class CompTaskKeeper(object):
    """Keeps information about subtasks that should be computed by this node.
    """
    handle_key_error = HandleKeyError(log_key_error)

    def __init__(self, tasks_path, persist=True):
        """Create a new keeper of computational task definitions.

        tasks_path: pathlib.Path to tasks directory
        """
        # Tasks this node wants to compute, keyed by task id.
        self.active_tasks = {}
        # Reverse index: subtask id -> owning task id.
        self.subtask_to_task = {}
        self.dump_path = tasks_path / "comp_task_keeper.pickle"
        self.persist = persist
        self.restore()

    def dump(self):
        """Persist the in-memory state to the pickle dump file (if enabled)."""
        if not self.persist:
            return
        logger.debug('COMPTASK DUMP: %s', self.dump_path)
        with self.dump_path.open('wb') as dump_file:
            snapshot = self.active_tasks, self.subtask_to_task
            from pprint import pformat
            for task_info in self.active_tasks.values():
                logger.debug('dump_data: %s', pformat(task_info))
            pickle.dump(snapshot, dump_file)

    def restore(self):
        """Merge previously dumped state (if any) into the current state."""
        if not self.persist:
            return
        logger.debug('COMPTASK RESTORE: %s', self.dump_path)
        if not self.dump_path.exists():
            logger.debug('No previous comptask dump found.')
            return
        with self.dump_path.open('rb') as dump_file:
            try:
                # pickle.load is acceptable here because the dump file is
                # produced locally by dump() above, not by untrusted input.
                restored_tasks, restored_mapping = pickle.load(dump_file)
            except (pickle.UnpicklingError, EOFError):
                logger.exception(
                    'Problem restoring dumpfile: %s', self.dump_path
                )
                return
        self.active_tasks.update(restored_tasks)
        self.subtask_to_task.update(restored_mapping)

    def add_request(self, theader, price):
        """Record one more request for the task described by `theader`."""
        logger.debug('CT.add_request()')
        if type(price) not in (int, long):
            raise TypeError(
                "Incorrect 'price' type: {}."
                " Should be int or long".format(type(price))
            )
        if price < 0:
            raise ValueError("Price should be greater or equal zero")
        task_id = theader.task_id
        if task_id in self.active_tasks:
            self.active_tasks[task_id].requests += 1
        else:
            self.active_tasks[task_id] = CompTaskInfo(theader, price)
        self.dump()

    @handle_key_error
    def get_subtask_ttl(self, task_id):
        """Return the subtask timeout configured in the task header."""
        return self.active_tasks[task_id].header.subtask_timeout

    @handle_key_error
    def get_task_env(self, task_id):
        """Return the environment name from the task header."""
        return self.active_tasks[task_id].header.environment

    @handle_key_error
    def receive_subtask(self, comp_task_def):
        """Accept a subtask assignment; True on success, None when rejected."""
        logger.debug('CT.receive_subtask()')
        subtask_id = comp_task_def.subtask_id
        task_info = self.active_tasks[comp_task_def.task_id]
        if not task_info.requests > 0:
            return
        if subtask_id in task_info.subtasks:
            return
        task_info.requests -= 1
        task_info.subtasks[subtask_id] = comp_task_def
        self.subtask_to_task[subtask_id] = comp_task_def.task_id
        self.dump()
        return True

    def get_task_id_for_subtask(self, subtask_id):
        """Return the task id owning `subtask_id`, or None if unknown."""
        return self.subtask_to_task.get(subtask_id)

    @handle_key_error
    def get_node_for_task_id(self, task_id):
        """Return the task owner's key id from the task header."""
        return self.active_tasks[task_id].header.task_owner_key_id

    @handle_key_error
    def get_value(self, task_id, computing_time):
        """Compute payment for `computing_time` at the task's stored price."""
        price = self.active_tasks[task_id].price
        if type(price) not in (int, long):
            raise TypeError(
                "Incorrect 'price' type: {}."
                " Should be int or long".format(type(price))
            )
        return compute_subtask_value(price, computing_time)

    @handle_key_error
    def request_failure(self, task_id):
        """Roll back one pending request for `task_id` and persist state."""
        logger.debug('CT.request_failure(%r)', task_id)
        self.active_tasks[task_id].requests -= 1
        self.dump()
class TaskManager(TaskEventListener):
    """ Keeps and manages information about requested tasks
    """
    handle_task_key_error = HandleKeyError(log_task_key_error)
    handle_subtask_key_error = HandleKeyError(log_subtask_key_error)

    def __init__(self, node_name, node, keys_auth, listen_address="",
                 listen_port=0, root_path="res", use_distributed_resources=True,
                 tasks_dir="tasks", task_persistence=False):
        super(TaskManager, self).__init__()

        self.apps_manager = AppsManager()
        self.apps_manager.load_apps()

        apps = self.apps_manager.apps.values()
        task_types = [app.task_type_info(None, app.controller) for app in apps]
        self.task_types = {t.name.lower(): t for t in task_types}

        self.node_name = node_name
        self.node = node
        self.keys_auth = keys_auth
        self.key_id = keys_auth.get_key_id()

        self.tasks = {}  # task_id -> Task
        self.tasks_states = {}  # task_id -> TaskState
        self.subtask2task_mapping = {}  # subtask_id -> owning task_id

        self.listen_address = listen_address
        self.listen_port = listen_port

        # FIXME Remove this variable and make task persistance obligatory after it is more tested
        # Remember to also remove it from init params
        self.task_persistence = task_persistence

        self.tasks_dir = Path(tasks_dir)
        if not self.tasks_dir.is_dir():
            self.tasks_dir.mkdir(parents=True)
        self.root_path = root_path
        self.dir_manager = DirManager(self.get_task_manager_root())

        # resource_manager = OpenStackSwiftResourceManager(self.dir_manager,
        #                     resource_dir_method=self.dir_manager.get_task_temporary_dir)
        resource_manager = HyperdriveResourceManager(
            self.dir_manager,
            resource_dir_method=self.dir_manager.get_task_temporary_dir)
        self.task_result_manager = EncryptedResultPackageManager(
            resource_manager)

        # Statuses in which a task may still hand out / accept subtasks.
        self.activeStatus = [
            TaskStatus.computing, TaskStatus.starting, TaskStatus.waiting
        ]
        self.use_distributed_resources = use_distributed_resources

        self.comp_task_keeper = CompTaskKeeper(self.tasks_dir,
                                               persist=self.task_persistence)

        if self.task_persistence:
            self.restore_tasks()

    def get_task_manager_root(self):
        return self.root_path

    def get_external_address(self):
        # Unqualified name resolves to the module-level get_external_address
        # helper (not this method); it runs asynchronously.
        request = AsyncRequest(get_external_address, self.listen_port)
        return async_run(request)

    def create_task(self, dictionary):
        """Build a Task from a type-tagged dictionary (pass-through for Task)."""
        # FIXME: remove after the new interface has been integrated with
        if not isinstance(dictionary, dict):
            return dictionary

        type_name = dictionary['type'].lower()
        task_type = self.task_types[type_name]
        builder_type = task_type.task_builder_type

        definition = builder_type.build_definition(task_type, dictionary)
        builder = builder_type(self.node_name, definition, self.root_path,
                               self.dir_manager)

        return Task.build_task(builder)

    def get_task_definition_dict(self, task):
        """Serialize a Task's definition to a dict (pass-through for dicts)."""
        if isinstance(task, dict):
            return task
        definition = task.task_definition
        task_type = self.task_types[definition.task_type.lower()]
        return task_type.task_builder_type.build_dictionary(definition)

    @inlineCallbacks
    def add_new_task(self, task):
        """Register a new task: sign its header, set up dirs and state.

        Raises RuntimeError on duplicate id, ValueError on missing key_id,
        IOError on a bad listen address.  Twisted coroutine (inlineCallbacks).
        """
        if task.header.task_id in self.tasks:
            raise RuntimeError("Task has been already added")
        if not self.key_id:
            raise ValueError("'key_id' is not set")
        if not SocketAddress.is_proper_address(self.listen_address,
                                               self.listen_port):
            raise IOError("Incorrect socket address")

        prev_pub_addr, prev_pub_port, prev_nat_type = \
            self.node.pub_addr, self.node.pub_port, self.node.nat_type
        self.node.pub_addr, self.node.pub_port, self.node.nat_type = \
            yield self.get_external_address()

        # Public address changed since the last check: re-sign all headers.
        if prev_pub_addr != self.node.pub_addr or \
           prev_pub_port != self.node.pub_port or \
           prev_nat_type != self.node.nat_type:
            self.update_task_signatures()

        task.header.task_owner_address = self.listen_address
        task.header.task_owner_port = self.listen_port
        task.header.task_owner_key_id = self.key_id
        task.header.task_owner = self.node
        task.header.signature = self.sign_task_header(task.header)

        self.dir_manager.clear_temporary(task.header.task_id,
                                         undeletable=task.undeletable)
        self.dir_manager.get_task_temporary_dir(task.header.task_id,
                                                create=True)

        task.register_listener(self)
        task.task_status = TaskStatus.waiting

        self.tasks[task.header.task_id] = task

        ts = TaskState()
        ts.status = TaskStatus.waiting
        ts.outputs = task.get_output_names()
        ts.total_subtasks = task.get_total_tasks()
        ts.time_started = time.time()
        self.tasks_states[task.header.task_id] = ts

        if self.task_persistence:
            self.dump_task(task.header.task_id)
        logger.info("Task {} added".format(task.header.task_id))

        self.notice_task_updated(task.header.task_id)

    def dump_task(self, task_id):
        """Pickle one task and its state to <tasks_dir>/<task_id>.pickle."""
        logger.debug('DUMP TASK')
        # BUGFIX: pre-bind filepath; previously, if the self.tasks[task_id]
        # lookup raised before filepath was assigned, the except block below
        # crashed with NameError and masked the original error.
        filepath = None
        try:
            data = self.tasks[task_id], self.tasks_states[task_id]
            filepath = self.tasks_dir / ('%s.pickle' % (task_id, ))
            logger.debug('DUMP TASK %r', filepath)
            with filepath.open('wb') as f:
                pickle.dump(data, f, protocol=2)
        except Exception:  # BUGFIX: was a bare except; error is still re-raised
            logger.exception('DUMP ERROR task_id: %r task: %r state: %r',
                             task_id, self.tasks.get(task_id, '<not found>'),
                             self.tasks_states.get(task_id, '<not found>'))
            # Remove a partially-written dump so a broken file is never restored.
            if filepath is not None and filepath.exists():
                filepath.unlink()
            raise

    def restore_tasks(self):
        """Load all *.pickle task dumps from tasks_dir; delete broken ones."""
        logger.debug('RESTORE TASKS')
        for path in self.tasks_dir.iterdir():
            logger.debug('RESTORE TASKS %r', path)
            if path.suffix != '.pickle':
                continue
            logger.debug('RESTORE TASKS really %r', path)
            with path.open('rb') as f:
                try:
                    task, state = pickle.load(f)
                    self.tasks[task.header.task_id] = task
                    self.tasks_states[task.header.task_id] = state
                except (pickle.UnpicklingError, EOFError, ImportError):
                    logger.exception('Problem restoring task from: %s', path)
                    path.unlink()
                    continue
            dispatcher.send(signal='golem.taskmanager', event='task_restored',
                            task=task, state=state)

    @handle_task_key_error
    def resources_send(self, task_id):
        """Mark a task as waiting once its resources have been sent."""
        self.tasks_states[task_id].status = TaskStatus.waiting
        self.tasks[task_id].task_status = TaskStatus.waiting
        self.notice_task_updated(task_id)
        logger.info("Resources for task {} sent".format(task_id))

    def get_next_subtask(self, node_id, node_name, task_id,
                         estimated_performance, price, max_resource_size,
                         max_memory_size, num_cores=0, address=""):
        """ Assign next subtask from task <task_id> to node with given
        id <node_id> and name.
        :param node_id:
        :param node_name:
        :param task_id:
        :param estimated_performance:
        :param price:
        :param max_resource_size:
        :param max_memory_size:
        :param num_cores:
        :param address:
        :return (ComputeTaskDef|None, bool, bool): Function returns a triplet.
        First element is either ComputeTaskDef that describe assigned subtask
        or None. The second element describes whether the task_id is a wrong
        task that isn't in task manager register. If task with <task_id> it's
        a known task then second element of a pair is always False (regardless
        new subtask was assigned or not). The third element describes whether
        we're waiting for client's other task results.
        """
        logger.debug('get_next_subtask(%r, %r, %r, %r, %r, %r, %r, %r, %r)',
                     node_id, node_name, task_id, estimated_performance, price,
                     max_resource_size, max_memory_size, num_cores, address)
        if task_id not in self.tasks:
            logger.info("Cannot find task {} in my tasks".format(task_id))
            return None, True, False

        task = self.tasks[task_id]

        # Provider asks for more than the task offers.
        if task.header.max_price < price:
            return None, False, False

        def has_subtasks():
            # Task must be active, still need work, and fit within the
            # provider's declared resource and memory limits (KiB -> bytes).
            if self.tasks_states[task_id].status not in self.activeStatus:
                logger.debug('state no in activestatus')
                return False
            if not task.needs_computation():
                logger.debug('not task.needs_computation')
                return False
            if task.header.resource_size > (long(max_resource_size) * 1024):
                logger.debug('resources size >')
                return False
            if task.header.estimated_memory > (long(max_memory_size) * 1024):
                logger.debug('estimated memory >')
                return False
            return True

        if not has_subtasks():
            logger.info(
                "Cannot get next task for estimated performance {}".format(
                    estimated_performance))
            return None, False, False

        extra_data = task.query_extra_data(estimated_performance, num_cores,
                                           node_id, node_name)
        if extra_data.should_wait:
            return None, False, True

        ctd = extra_data.ctd

        def check_compute_task_def():
            # Reject malformed definitions and duplicate subtask ids.
            if not isinstance(ctd, ComputeTaskDef) or not ctd.subtask_id:
                logger.debug('check ctd: ctd not instance or not subtask_id')
                return False
            if task_id != ctd.task_id \
                    or ctd.subtask_id in self.subtask2task_mapping:
                logger.debug(
                    'check ctd: %r != %r or %r in self.subtask2task_maping',
                    task_id, ctd.task_id, ctd.subtask_id)
                return False
            if ctd.subtask_id in \
                    self.tasks_states[ctd.task_id].subtask_states:
                logger.debug('check ctd: subtask_states')
                return False
            return True

        if not check_compute_task_def():
            return None, False, False

        ctd.key_id = task.header.task_owner_key_id
        ctd.return_address = task.header.task_owner_address
        ctd.return_port = task.header.task_owner_port
        ctd.task_owner = task.header.task_owner

        self.subtask2task_mapping[ctd.subtask_id] = task_id
        self.__add_subtask_to_tasks_states(node_name, node_id, price, ctd,
                                           address)
        self.notice_task_updated(task_id)
        return ctd, False, extra_data.should_wait

    def get_tasks_headers(self):
        """Return headers of all active tasks that still need computation."""
        ret = []
        for t in self.tasks.values():
            if t.needs_computation() and t.task_status in self.activeStatus:
                ret.append(t.header)
        return ret

    def get_trust_mod(self, subtask_id):
        """Return the task's trust modifier for a subtask, 0 if unknown."""
        if subtask_id in self.subtask2task_mapping:
            task_id = self.subtask2task_mapping[subtask_id]
            return self.tasks[task_id].get_trust_mod(subtask_id)
        else:
            logger.error("This is not my subtask {}".format(subtask_id))
            return 0

    def update_task_signatures(self):
        """Re-sign every known task header (e.g. after an address change)."""
        for task in self.tasks.values():
            task.header.signature = self.sign_task_header(task.header)

    def sign_task_header(self, task_header):
        return self.keys_auth.sign(task_header.to_binary())

    def verify_subtask(self, subtask_id):
        """Delegate verification to the owning task; False if unknown."""
        if subtask_id in self.subtask2task_mapping:
            task_id = self.subtask2task_mapping[subtask_id]
            return self.tasks[task_id].verify_subtask(subtask_id)
        else:
            return False

    def get_node_id_for_subtask(self, subtask_id):
        """Return the id of the node computing `subtask_id`, None if unknown."""
        if subtask_id in self.subtask2task_mapping:
            subtask_state = self.tasks_states[self.subtask2task_mapping[
                subtask_id]].subtask_states[subtask_id]
            return subtask_state.computer.node_id
        else:
            return None

    def set_value(self, task_id, subtask_id, value):
        """Store the payment value for a subtask; warn on unknown ids."""
        if type(value) not in (int, long):
            raise TypeError(
                "Incorrect 'value' type: {}. Should be int or long".format(
                    type(value)))
        task_state = self.tasks_states.get(task_id)
        if task_state is None:
            logger.warning("This is not my task {}".format(task_id))
            return
        subtask_state = task_state.subtask_states.get(subtask_id)
        if subtask_state is None:
            logger.warning("This is not my subtask {}".format(subtask_id))
            return
        subtask_state.value = value

    @handle_subtask_key_error
    def get_value(self, subtask_id):
        """ Return value of a given subtask
        :param subtask_id: id of a computed subtask
        :return long: price that should be paid for given subtask
        """
        task_id = self.subtask2task_mapping[subtask_id]
        return self.tasks_states[task_id].subtask_states[subtask_id].value

    @handle_subtask_key_error
    def computed_task_received(self, subtask_id, result, result_type):
        """Process an incoming result; returns True when it is accepted."""
        task_id = self.subtask2task_mapping[subtask_id]

        subtask_state = self.tasks_states[task_id].subtask_states[subtask_id]
        subtask_status = subtask_state.subtask_status

        # Ignore results arriving for subtasks that aren't being computed
        # (e.g. already finished, restarted or failed).
        if not SubtaskStatus.is_computed(subtask_status):
            logger.warning(
                "Result for subtask {} when subtask state is {}".format(
                    subtask_id, subtask_status))
            self.notice_task_updated(task_id)
            return False

        self.tasks[task_id].computation_finished(subtask_id, result,
                                                 result_type)
        ss = self.tasks_states[task_id].subtask_states[subtask_id]
        ss.subtask_progress = 1.0
        ss.subtask_rem_time = 0.0
        ss.subtask_status = SubtaskStatus.finished
        ss.stdout = self.tasks[task_id].get_stdout(subtask_id)
        ss.stderr = self.tasks[task_id].get_stderr(subtask_id)
        ss.results = self.tasks[task_id].get_results(subtask_id)

        if not self.tasks[task_id].verify_subtask(subtask_id):
            logger.debug("Subtask {} not accepted\n".format(subtask_id))
            ss.subtask_status = SubtaskStatus.failure
            self.notice_task_updated(task_id)
            return False

        if self.tasks_states[task_id].status in self.activeStatus:
            if not self.tasks[task_id].finished_computation():
                self.tasks_states[task_id].status = TaskStatus.computing
            else:
                if self.tasks[task_id].verify_task():
                    logger.debug("Task {} accepted".format(task_id))
                    self.tasks_states[task_id].status = TaskStatus.finished
                else:
                    logger.debug("Task {} not accepted".format(task_id))
        self.notice_task_updated(task_id)
        return True

    @handle_subtask_key_error
    def task_computation_failure(self, subtask_id, err):
        """Record a computation failure reported for a subtask."""
        task_id = self.subtask2task_mapping[subtask_id]

        subtask_state = self.tasks_states[task_id].subtask_states[subtask_id]
        subtask_status = subtask_state.subtask_status

        if not SubtaskStatus.is_computed(subtask_status):
            logger.warning(
                "Result for subtask {} when subtask state is {}".format(
                    subtask_id, subtask_status))
            self.notice_task_updated(task_id)
            return False

        self.tasks[task_id].computation_failed(subtask_id)
        ss = self.tasks_states[task_id].subtask_states[subtask_id]
        ss.subtask_progress = 1.0
        ss.subtask_rem_time = 0.0
        ss.subtask_status = SubtaskStatus.failure
        ss.stderr = str(err)

        self.notice_task_updated(task_id)
        return True

    def task_result_incoming(self, subtask_id):
        """Mark a subtask as downloading when its result starts arriving."""
        node_id = self.get_node_id_for_subtask(subtask_id)

        if node_id and subtask_id in self.subtask2task_mapping:
            task_id = self.subtask2task_mapping[subtask_id]
            if task_id in self.tasks:
                task = self.tasks[task_id]
                states = self.tasks_states[task_id].subtask_states[subtask_id]

                task.result_incoming(subtask_id)
                states.subtask_status = SubtaskStatus.downloading

                self.notify_update_task(task_id)
            else:
                logger.error("Unknown task id: {}".format(task_id))
        else:
            logger.error("Node_id {} or subtask_id {} does not exist".format(
                node_id, subtask_id))

    # CHANGE TO RETURN KEY_ID (check IF SUBTASK COMPUTER HAS KEY_ID
    def check_timeouts(self):
        """Time out overdue tasks/subtasks; return node ids with timeouts."""
        nodes_with_timeouts = []
        for t in self.tasks.values():
            th = t.header
            if self.tasks_states[th.task_id].status not in self.activeStatus:
                continue
            cur_time = get_timestamp_utc()
            if cur_time > th.deadline:
                logger.info("Task {} dies".format(th.task_id))
                # BUGFIX: was "t.task_stats", a typo that created an unused
                # attribute instead of updating the task's status (every other
                # site in this class writes task_status).
                t.task_status = TaskStatus.timeout
                self.tasks_states[th.task_id].status = TaskStatus.timeout
                self.notice_task_updated(th.task_id)
            ts = self.tasks_states[th.task_id]
            for s in ts.subtask_states.values():
                if SubtaskStatus.is_computed(s.subtask_status):
                    if cur_time > s.deadline:
                        logger.info("Subtask {} dies".format(s.subtask_id))
                        s.subtask_status = SubtaskStatus.failure
                        nodes_with_timeouts.append(s.computer.node_id)
                        t.computation_failed(s.subtask_id)
                        s.stderr = "[GOLEM] Timeout"
                        self.notice_task_updated(th.task_id)
        return nodes_with_timeouts

    def get_progresses(self):
        """Return snapshots of all tasks that are not yet fully computed."""
        tasks_progresses = {}

        for t in self.tasks.values():
            if t.get_progress() < 1.0:
                ltss = LocalTaskStateSnapshot(t.header.task_id,
                                              t.get_total_tasks(),
                                              t.get_active_tasks(),
                                              t.get_progress(),
                                              t.short_extra_data_repr(2200.0))
                tasks_progresses[t.header.task_id] = ltss

        return tasks_progresses

    @handle_task_key_error
    def get_resources(self, task_id, resource_header, resource_type=0):
        task = self.tasks[task_id]
        return task.get_resources(resource_header, resource_type)

    @handle_task_key_error
    def restart_task(self, task_id):
        """Restart a whole task, resetting its non-failed subtasks."""
        logger.info("restarting task")
        self.dir_manager.clear_temporary(
            task_id, undeletable=self.tasks[task_id].undeletable)

        self.tasks[task_id].restart()
        self.tasks[task_id].task_status = TaskStatus.waiting
        self.tasks_states[task_id].status = TaskStatus.waiting
        self.tasks_states[task_id].time_started = time.time()

        for ss in self.tasks_states[task_id].subtask_states.values():
            if ss.subtask_status != SubtaskStatus.failure:
                ss.subtask_status = SubtaskStatus.restarted

        self.notice_task_updated(task_id)

    @handle_subtask_key_error
    def restart_subtask(self, subtask_id):
        """Restart a single subtask and mark the task computing again."""
        task_id = self.subtask2task_mapping[subtask_id]
        self.tasks[task_id].restart_subtask(subtask_id)
        self.tasks_states[task_id].status = TaskStatus.computing
        self.tasks_states[task_id].subtask_states[
            subtask_id].subtask_status = SubtaskStatus.restarted
        self.tasks_states[task_id].subtask_states[
            subtask_id].stderr = "[GOLEM] Restarted"

        self.notice_task_updated(task_id)

    @handle_task_key_error
    def abort_task(self, task_id):
        """Abort a task and drop all of its subtask bookkeeping."""
        self.tasks[task_id].abort()
        self.tasks[task_id].task_status = TaskStatus.aborted
        self.tasks_states[task_id].status = TaskStatus.aborted
        for sub in self.tasks_states[task_id].subtask_states.values():
            del self.subtask2task_mapping[sub.subtask_id]
        self.tasks_states[task_id].subtask_states.clear()

        self.notice_task_updated(task_id)

    @handle_task_key_error
    def pause_task(self, task_id):
        self.tasks[task_id].task_status = TaskStatus.paused
        self.tasks_states[task_id].status = TaskStatus.paused
        self.notice_task_updated(task_id)

    @handle_task_key_error
    def resume_task(self, task_id):
        self.tasks[task_id].task_status = TaskStatus.starting
        self.tasks_states[task_id].status = TaskStatus.starting
        self.notice_task_updated(task_id)

    @handle_task_key_error
    def delete_task(self, task_id):
        """Remove a task, its state, subtask mappings and temporary files."""
        for sub in self.tasks_states[task_id].subtask_states.values():
            del self.subtask2task_mapping[sub.subtask_id]
        self.tasks_states[task_id].subtask_states.clear()

        self.tasks[task_id].unregister_listener(self)
        del self.tasks[task_id]
        del self.tasks_states[task_id]

        self.dir_manager.clear_temporary(task_id)

    @handle_task_key_error
    def query_task_state(self, task_id):
        """Refresh and return the TaskState (progress, times) for a task."""
        ts = self.tasks_states[task_id]
        t = self.tasks[task_id]

        ts.progress = t.get_progress()
        ts.elapsed_time = time.time() - ts.time_started

        if ts.progress > 0.0:
            ts.remaining_time = (ts.elapsed_time / ts.progress) - \
                ts.elapsed_time
        else:
            # Negative zero is kept as a sentinel for "no estimate yet".
            ts.remaining_time = -0.0

        t.update_task_state(ts)

        return ts

    def get_subtasks(self, task_id):
        """ Get all subtasks related to given task id
        :param task_id: Task ID
        :return: list of all subtasks related with @task_id or None
                 if @task_id is not known
        """
        if task_id not in self.tasks_states:
            return None
        return [
            sub.subtask_id
            for sub in self.tasks_states[task_id].subtask_states.values()
        ]

    def change_config(self, root_path, use_distributed_resource_management):
        self.dir_manager = DirManager(root_path)
        self.use_distributed_resources = use_distributed_resource_management

    @handle_task_key_error
    def change_timeouts(self, task_id, full_task_timeout, subtask_timeout):
        """Update task and subtask timeouts on a known task."""
        task = self.tasks[task_id]
        task.header.deadline = timeout_to_deadline(full_task_timeout)
        task.header.subtask_timeout = subtask_timeout
        task.full_task_timeout = full_task_timeout
        task.header.last_checking = time.time()

    def get_task_id(self, subtask_id):
        return self.subtask2task_mapping[subtask_id]

    def get_task_dict(self, task_id):
        """Return a full dictionary view of a task (preview + definition)."""
        task = self.tasks[task_id]

        # single=True retrieves one preview file. If rendering frames,
        # it's the preview of the most recently computed frame.
        dictionary = {u'preview': task.get_preview(single=True)}
        dictionary.update(self.get_simple_task_dict(task))
        dictionary.update(self.get_task_definition_dict(task))
        return dictionary

    def get_tasks_dict(self):
        return [self.get_simple_task_dict(t) for t in
                self.tasks.itervalues()]

    def get_subtask_dict(self, subtask_id):
        task_id = self.subtask2task_mapping[subtask_id]
        task_state = self.tasks_states[task_id]
        subtask = task_state.subtask_states[subtask_id]
        return subtask.to_dictionary()

    def get_subtasks_dict(self, task_id):
        task_state = self.tasks_states[task_id]
        subtasks = task_state.subtask_states
        return [subtask.to_dictionary() for subtask in subtasks.itervalues()]

    def get_subtasks_borders(self, task_id):
        """Map each subtask id to its rendered border for the task preview."""
        task = self.tasks[task_id]
        task_type_name = task.task_definition.task_type.lower()
        task_type = self.task_types[task_type_name]
        task_state = self.tasks_states[task_id]
        total_subtasks = task.get_total_tasks()

        return {
            to_unicode(subtask.subtask_id): task_type.get_task_border(
                subtask, task.task_definition, total_subtasks, as_path=True
            ) for subtask in task_state.subtask_states.values()
        }

    def get_simple_task_dict(self, task):
        # NOTE(review): tasks_states.get may return None for an unknown task,
        # which would crash on state.remaining_time below — confirm callers
        # only pass known tasks.
        state = self.tasks_states.get(task.header.task_id)
        timeout = task.task_definition.full_task_timeout

        dictionary = {u'duration': max(timeout - state.remaining_time, 0)}
        dictionary.update(task.to_dictionary())
        dictionary.update(state.to_dictionary())
        return dictionary

    def get_task_preview(self, task_id, single=False):
        return self.tasks[task_id].get_preview(single=single)

    @handle_subtask_key_error
    def set_computation_time(self, subtask_id, computation_time):
        """
        Set computation time for subtask and also compute and set new value
        based on saved price for this subtask
        :param str subtask_id: subtask which was computed in given
        computation_time
        :param float computation_time: how long does it take to compute
        this task
        :return:
        """
        task_id = self.subtask2task_mapping[subtask_id]
        ss = self.tasks_states[task_id].subtask_states[subtask_id]
        ss.computation_time = computation_time
        ss.value = compute_subtask_value(ss.computer.price, computation_time)

    def add_comp_task_request(self, theader, price):
        """ Add a header of a task which this node may try to compute """
        self.comp_task_keeper.add_request(theader, price)

    @handle_task_key_error
    def get_payment_for_task_id(self, task_id):
        """Sum the values of all subtasks of a task."""
        val = 0.0
        t = self.tasks_states[task_id]
        for ss in t.subtask_states.values():
            val += ss.value
        return val

    def __add_subtask_to_tasks_states(self, node_name, node_id, price, ctd,
                                      address):
        if ctd.task_id not in self.tasks_states:
            raise RuntimeError("Should never be here!")

        logger.debug('add_subtask_to_tasks_states(%r, %r, %r, %r, %r)',
                     node_name, node_id, price, ctd, address)

        ss = SubtaskState()
        ss.computer.node_id = node_id
        ss.computer.node_name = node_name
        ss.computer.performance = ctd.performance
        ss.computer.ip_address = address
        ss.computer.price = price
        ss.time_started = time.time()
        ss.deadline = ctd.deadline
        # TODO: read node ip address
        ss.subtask_definition = ctd.short_description
        ss.subtask_id = ctd.subtask_id
        ss.extra_data = ctd.extra_data
        # NOTE(review): this assigns a TaskStatus member to a subtask_status
        # field; SubtaskStatus.starting may be intended — confirm the enums.
        ss.subtask_status = TaskStatus.starting
        ss.value = 0

        self.tasks_states[ctd.task_id].subtask_states[ctd.subtask_id] = ss

    def notify_update_task(self, task_id):
        self.notice_task_updated(task_id)

    @handle_task_key_error
    def notice_task_updated(self, task_id):
        """Persist (if enabled) and broadcast that a task's state changed."""
        # self.save_state()
        if self.task_persistence:
            self.dump_task(task_id)
        dispatcher.send(signal='golem.taskmanager',
                        event='task_status_updated', task_id=task_id)
class TaskManager(TaskEventListener):
    """ Keeps and manages information about requested tasks
    Requestor uses TaskManager to assign task to providers
    """
    # Decorators that translate KeyError from unknown task/subtask ids
    # into log messages instead of propagating the exception.
    handle_task_key_error = HandleKeyError(log_task_key_error)
    handle_subtask_key_error = HandleKeyError(log_subtask_key_error)
    handle_generic_key_error = HandleForwardedError(KeyError,
                                                    log_generic_key_error)

    class Error(Exception):
        """Base class for TaskManager-specific errors."""
        pass

    class AlreadyRestartedError(Error):
        """Raised when restarting a task that is already restarted."""
        pass

    def __init__(
            self, node_name, node, keys_auth, root_path,
            use_distributed_resources=True,
            tasks_dir="tasks", task_persistence=True,
            apps_manager=AppsManager(), finished_cb=None):
        # NOTE(review): mutable/stateful default `apps_manager=AppsManager()`
        # is shared between instances — presumably intended as a singleton-ish
        # default; confirm before changing.
        super().__init__()

        self.apps_manager = apps_manager
        apps = list(apps_manager.apps.values())
        task_types = [app.task_type_info() for app in apps]
        # Task-type lookup keyed by lowercased type name.
        self.task_types = {t.name.lower(): t for t in task_types}

        self.node_name = node_name
        self.node = node
        self.keys_auth = keys_auth
        self.key_id = keys_auth.key_id

        # Core state: tasks by id, their states, and subtask-id -> task-id map.
        self.tasks: Dict[str, Task] = {}
        self.tasks_states: Dict[str, TaskState] = {}
        self.subtask2task_mapping: Dict[str, str] = {}

        self.task_persistence = task_persistence

        tasks_dir = Path(tasks_dir)
        self.tasks_dir = tasks_dir / "tmanager"
        if not self.tasks_dir.is_dir():
            self.tasks_dir.mkdir(parents=True)
        self.root_path = root_path
        self.dir_manager = DirManager(self.get_task_manager_root())

        resource_manager = HyperdriveResourceManager(
            self.dir_manager,
            resource_dir_method=self.dir_manager.get_task_temporary_dir,
        )
        self.task_result_manager = EncryptedResultPackageManager(
            resource_manager)

        # Statuses in which a task still accepts computation.
        self.activeStatus = [
            TaskStatus.computing,
            TaskStatus.starting,
            TaskStatus.waiting
        ]
        self.use_distributed_resources = use_distributed_resources

        self.comp_task_keeper = CompTaskKeeper(
            tasks_dir,
            persist=self.task_persistence,
        )

        self.requestor_stats_manager = RequestorTaskStatsManager()

        # Optional callback invoked when a task reaches a terminal state.
        self.finished_cb = finished_cb

        if self.task_persistence:
            self.restore_tasks()

    def get_task_manager_root(self):
        """Return the root path used by the directory manager."""
        return self.root_path

    def create_task(self, dictionary, minimal=False):
        """Build a Task object from a definition dictionary.

        :param dict dictionary: task definition data (must contain 'type')
        :param bool minimal: build for testing (True) or requesting (False)
        :return: the built Task
        """
        purpose = TaskPurpose.TESTING if minimal else TaskPurpose.REQUESTING
        type_name = dictionary['type'].lower()
        compute_on = dictionary.get('compute_on', 'cpu').lower()
        is_requesting = purpose == TaskPurpose.REQUESTING

        # GPU Blender tasks are dispatched to a dedicated task type.
        if type_name == "blender" and is_requesting and compute_on == "gpu":
            type_name = type_name + "_nvgpu"

        task_type = self.task_types[type_name].for_purpose(purpose)
        builder_type = task_type.task_builder_type
        definition = builder_type.build_definition(task_type, dictionary,
                                                   minimal)
        definition.task_id = CoreTask.create_task_id(self.keys_auth.public_key)
        definition.concent_enabled = dictionary.get('concent_enabled', False)
        builder = builder_type(self.node, definition, self.dir_manager)

        return builder.build()

    def get_task_definition_dict(self, task: Task):
        """Serialize a task's definition to a dictionary (pass-through
        if already a dict)."""
        if isinstance(task, dict):
            return task
        definition = task.task_definition
        task_type = self.task_types[definition.task_type.lower()]
        return task_type.task_builder_type.build_dictionary(definition)

    def add_new_task(self, task: Task, estimated_fee: int = 0) -> None:
        """Register a freshly created task and initialize its state.

        :raises RuntimeError: if a task with the same id already exists
        :raises ValueError: if this node's key id is not set
        """
        task_id = task.header.task_id
        if task_id in self.tasks:
            raise RuntimeError("Task {} has been already added".format(
                task.header.task_id))
        if not self.key_id:
            raise ValueError("'key_id' is not set")

        task.header.fixed_header.task_owner = self.node
        task.header.signature = self.sign_task_header(task.header)

        task.create_reference_data_for_task_validation()
        task.register_listener(self)

        ts = TaskState()
        ts.status = TaskStatus.notStarted
        ts.outputs = task.get_output_names()
        ts.subtasks_count = task.get_total_tasks()
        ts.time_started = time.time()
        ts.estimated_cost = task.price
        ts.estimated_fee = estimated_fee

        self.tasks[task_id] = task
        self.tasks_states[task_id] = ts

        logger.info("Task %s added", task_id)

        # persist=False: the task will be dumped on the next save-worthy update
        self.notice_task_updated(task_id,
                                 op=TaskOp.CREATED,
                                 persist=False)

    @handle_task_key_error
    def increase_task_mask(self, task_id: str, num_bits: int = 1) -> None:
        """ Increase mask for given task i.e. make it more restrictive """
        task = self.tasks[task_id]
        try:
            task.header.mask.increase(num_bits)
        except ValueError:
            logger.exception('Wrong number of bits for mask increase')
        else:
            # Mask is part of the signed header, so re-sign after changing it.
            task.header.signature = self.sign_task_header(task.header)

    @handle_task_key_error
    def decrease_task_mask(self, task_id: str, num_bits: int = 1) -> None:
        """ Decrease mask for given task i.e. make it less restrictive """
        task = self.tasks[task_id]
        try:
            task.header.mask.decrease(num_bits)
        except ValueError:
            logger.exception('Wrong number of bits for mask decrease')
        else:
            task.header.signature = self.sign_task_header(task.header)

    @handle_task_key_error
    def start_task(self, task_id):
        """Move a prepared task into the waiting state.

        :raises RuntimeError: if the task was already started
        """
        task_state = self.tasks_states[task_id]

        if not task_state.status.is_preparing():
            raise RuntimeError(
                "Task {} has already been started".format(task_id))

        task_state.status = TaskStatus.waiting
        self.notice_task_updated(task_id, op=TaskOp.STARTED)
        logger.info("Task %s started", task_id)

    def _dump_filepath(self, task_id):
        """Path of the pickle file used to persist the given task."""
        return self.tasks_dir / ('%s.pickle' % (task_id, ))

    def dump_task(self, task_id: str) -> None:
        """Persist (task, task_state) to disk as a pickle.

        On failure the partially-written dump file is removed and the
        exception is re-raised.
        """
        logger.debug('DUMP TASK %r', task_id)
        filepath = self._dump_filepath(task_id)
        try:
            data = self.tasks[task_id], self.tasks_states[task_id]
            logger.debug('DUMPING TASK %r', filepath)
            with filepath.open('wb') as f:
                # protocol=2 keeps dumps readable by older unpicklers
                pickle.dump(data, f, protocol=2)
            logger.debug('TASK %s DUMPED in %r', task_id, filepath)
        except Exception as e:
            logger.exception(
                'DUMP ERROR task_id: %r task: %r state: %r',
                task_id, self.tasks.get(task_id, '<not found>'),
                self.tasks_states.get(task_id, '<not found>'),
            )
            if filepath.exists():
                filepath.unlink()
            raise

    def remove_dump(self, task_id: str):
        """Best-effort removal of a task's pickle dump (logs on failure)."""
        filepath = self._dump_filepath(task_id)
        try:
            filepath.unlink()
            logger.debug('TASK DUMP with id %s REMOVED from %r',
                         task_id, filepath)
        except (FileNotFoundError, OSError) as e:
            logger.warning("Couldn't remove dump file: %s - %s", filepath, e)

    @staticmethod
    def _migrate_status_to_enum(state: TaskState) -> None:
        """ This is a migration for data stored in pickles. See #2768 """
        # Older dumps stored statuses as plain strings; convert to enums.
        if isinstance(state.status, str):
            state.status = TaskStatus(state.status)

        subtask_state: SubtaskState
        for subtask_state in state.subtask_states.values():
            if isinstance(subtask_state.subtask_status, str):
                subtask_state.subtask_status = \
                    SubtaskStatus(subtask_state.subtask_status)

    def restore_tasks(self) -> None:
        """Load all *.pickle task dumps from the tasks directory.

        Unreadable dumps are collected and deleted after the scan
        (deleting during iteration breaks on Windows when a file is in use).
        """
        logger.debug('SEARCHING FOR TASKS TO RESTORE')
        broken_paths = set()
        for path in self.tasks_dir.iterdir():
            if not path.suffix == '.pickle':
                continue
            logger.debug('RESTORE TASKS %r', path)

            task_id = None
            with path.open('rb') as f:
                try:
                    task: Task
                    state: TaskState
                    task, state = pickle.load(f)
                    TaskManager._migrate_status_to_enum(state)

                    task.register_listener(self)

                    task_id = task.header.task_id
                    self.tasks[task_id] = task
                    self.tasks_states[task_id] = state

                    # Rebuild the subtask-id -> task-id index.
                    for sub in state.subtask_states.values():
                        self.subtask2task_mapping[sub.subtask_id] = task_id

                    logger.debug('TASK %s RESTORED from %r', task_id, path)
                except (pickle.UnpicklingError, EOFError, ImportError,
                        KeyError, AttributeError):
                    logger.exception('Problem restoring task from: %s', path)
                    # On Windows, attempting to remove a file that is in use
                    # causes an exception to be raised, therefore
                    # we'll remove broken files later
                    broken_paths.add(path)

            if task_id is not None:
                self.notice_task_updated(task_id, op=TaskOp.RESTORED,
                                         persist=False)

        for path in broken_paths:
            path.unlink()

    @handle_task_key_error
    def resources_send(self, task_id):
        """Mark a task as waiting after its resources have been sent."""
        self.tasks_states[task_id].status = TaskStatus.waiting
        self.notice_task_updated(task_id)
        logger.info("Resources for task {} sent".format(task_id))

    def got_wants_to_compute(self,
                             task_id: str,
                             key_id: str,       # pylint: disable=unused-argument
                             node_name: str):   # pylint: disable=unused-argument
        """ Updates number of offers to compute task.

        For statistical purposes only, real processing of the offer is done
        elsewhere. Silently ignores wrong task ids.

        :param str task_id: id of the task in the offer
        :param key_id: id of the node offering computations
        :param node_name: name of the node offering computations
        :return: Nothing
        :rtype: None
        """
        if task_id in self.tasks:
            self.notice_task_updated(task_id,
                                     op=TaskOp.WORK_OFFER_RECEIVED,
                                     persist=False)

    def task_needs_computation(self, task_id: str) -> bool:
        """Return True if the task is active and still has work to assign."""
        task_status = self.tasks_states[task_id].status
        if task_status not in self.activeStatus:
            logger.info(
                f'task is not active: {task_id}, status: {task_status}')
            return False
        task = self.tasks[task_id]
        if not task.needs_computation():
            logger.info(f'no more computation needed: {task_id}')
            return False
        return True

    def get_next_subtask(
            self, node_id, node_name, task_id, estimated_performance, price,
            max_resource_size, max_memory_size, num_cores=0, address=""):
        """ Assign next subtask from task <task_id> to node with given
        id <node_id> and name. If subtask is assigned the function
        is returning a tuple
        :param node_id:
        :param node_name:
        :param task_id:
        :param estimated_performance:
        :param price:
        :param max_resource_size:
        :param max_memory_size:
        :param num_cores:
        :param address:
        :return (ComputeTaskDef|None: Function returns a ComputeTaskDef.
        First element is either ComputeTaskDef that describe assigned subtask
        or None. It is recommended to call is_my_task and should_wait_for_node
        before this to find the reason why the task is not able to be picked up
        """
        logger.debug(
            'get_next_subtask(%r, %r, %r, %r, %r, %r, %r, %r, %r)',
            node_id, node_name, task_id, estimated_performance, price,
            max_resource_size, max_memory_size, num_cores, address,
        )
        if not self.is_my_task(task_id):
            return None

        if not self.check_next_subtask(node_id, node_name, task_id, price):
            return None

        if self.should_wait_for_node(task_id, node_id):
            return None

        task = self.tasks[task_id]

        if task.get_progress() == 1.0:
            logger.error("Task already computed. "
                         "task_id=%r, node_name=%r, node_id=%r",
                         task_id, node_name, node_id)
            return None

        extra_data = task.query_extra_data(
            estimated_performance, num_cores, node_id, node_name)
        ctd = extra_data.ctd

        def check_compute_task_def():
            # Sanity-check the generated ComputeTaskDef: well-formed,
            # belongs to this task, and its subtask id is not already known.
            if not isinstance(ctd, ComputeTaskDef) or not ctd['subtask_id']:
                logger.debug('check ctd: ctd not instance or not subtask_id')
                return False
            if task_id != ctd['task_id'] \
                    or ctd['subtask_id'] in self.subtask2task_mapping:
                logger.debug(
                    'check ctd: %r != %r or %r in self.subtask2task_maping',
                    task_id, ctd['task_id'], ctd['subtask_id'],
                )
                return False
            if (ctd['subtask_id'] in
                    self.tasks_states[ctd['task_id']].subtask_states):
                logger.debug('check ctd: subtask_states')
                return False
            return True

        if not check_compute_task_def():
            return None

        task.accept_client(node_id)

        self.subtask2task_mapping[ctd['subtask_id']] = task_id
        self.__add_subtask_to_tasks_states(
            node_name, node_id, ctd, address,
        )
        self.notice_task_updated(task_id,
                                 subtask_id=ctd['subtask_id'],
                                 op=SubtaskOp.ASSIGNED)
        logger.debug(
            "Subtask generated. task=%s, node=%s, ctd=%s",
            task_id,
            node_info_str(node_name, node_id),
            ctd,
        )

        return ctd

    def is_my_task(self, task_id) -> bool:
        """ Check if the task_id is known by this node """
        return task_id in self.tasks

    def should_wait_for_node(self, task_id, node_id) -> bool:
        """ Check if the node has too many tasks assigned already """
        if not self.is_my_task(task_id):
            logger.debug(
                "Not my task. task_id=%s, node=%s",
                task_id,
                short_node_id(node_id),
            )
            return False

        task = self.tasks[task_id]

        verdict = task.should_accept_client(node_id)
        logger.debug(
            "Should accept client verdict. verdict=%s, task=%s, node=%s",
            verdict,
            task_id,
            short_node_id(node_id),
        )
        if verdict == AcceptClientVerdict.SHOULD_WAIT:
            logger.warning("Waiting for results from %s on %s",
                           short_node_id(node_id), task_id)
            return True
        elif verdict == AcceptClientVerdict.REJECTED:
            # Rejected nodes are not waited for; assignment is refused
            # elsewhere (check returns False here as well).
            logger.warning("Client has failed on subtask within this task"
                           " and is banned from it. node_id=%s, task_id=%s",
                           short_node_id(node_id), task_id)
        return False

    def check_next_subtask(  # noqa pylint: disable=too-many-arguments
            self, node_id, node_name, task_id, price):
        """ Check next subtask from task <task_id> to give to node with
        id <node_id> and name. The returned tuple can be used to find the
        reason and handle accordingly.
        :param node_id:
        :param node_name:
        :param task_id:
        :param price:
        :return bool: Function returns a boolean.
        The return value describes if the task is able to be assigned
        """
        logger.debug(
            'check_next_subtask(%r, %r, %r, %r)',
            node_id, node_name, task_id, price,
        )
        if not self.is_my_task(task_id):
            logger.info(
                "Cannot find task in my tasks. task_id=%s, provider=%s",
                task_id,
                node_info_str(node_name, node_id))
            return False

        task = self.tasks[task_id]

        # Refuse providers asking for more than this task's max price.
        if task.header.max_price < price:
            return False

        if not self.task_needs_computation(task_id):
            logger.info(
                'Task does not need computation. task_id=%s, provider=%s',
                task_id,
                node_info_str(node_name, node_id))
            return False

        return True

    def copy_results(
            self,
            old_task_id: str,
            new_task_id: str,
            subtask_ids_to_copy: Iterable[str]) -> None:
        """Seed a restarted task with results from its predecessor.

        Every subtask of the new task is first generated and marked as
        failed; those whose 'start_task' matches a copied old subtask get
        their results copied asynchronously, the rest are restarted for
        computation.
        """
        try:
            old_task = self.tasks[old_task_id]
            new_task = self.tasks[new_task_id]
            assert isinstance(old_task, CoreTask)
            assert isinstance(new_task, CoreTask)
        except (KeyError, AssertionError):
            logger.exception('Cannot copy results from task %r to %r',
                             old_task_id, new_task_id)
            return

        # Map new subtasks to old by 'start_task'
        subtasks_to_copy = {
            subtask['start_task']: subtask for subtask in
            map(lambda id_: old_task.subtasks_given[id_], subtask_ids_to_copy)
        }

        # Generate all subtasks for the new task
        new_subtasks_ids = []
        while new_task.needs_computation():
            extra_data = new_task.query_extra_data(
                0, node_id=str(uuid.uuid4()))
            new_subtask_id = extra_data.ctd['subtask_id']
            self.subtask2task_mapping[new_subtask_id] = \
                new_task_id
            self.__add_subtask_to_tasks_states(node_name=None,
                                               node_id=None,
                                               address=None,
                                               ctd=extra_data.ctd)
            new_subtasks_ids.append(new_subtask_id)

        # it's important to do this step separately, to not disturb
        # 'needs_computation' condition above
        for new_subtask_id in new_subtasks_ids:
            self.tasks_states[new_task_id].subtask_states[new_subtask_id]\
                .subtask_status = SubtaskStatus.failure
            new_task.subtasks_given[new_subtask_id]['status'] \
                = SubtaskStatus.failure
            new_task.num_failed_subtasks += 1

        def handle_copy_error(subtask_id, error):
            logger.error('Cannot copy result of subtask %r: %r',
                         subtask_id, error)
            self.restart_subtask(subtask_id)

        for new_subtask_id, new_subtask in new_task.subtasks_given.items():
            old_subtask = subtasks_to_copy.get(new_subtask['start_task'])

            if old_subtask:  # Copy results from old subtask
                deferred = self._copy_subtask_results(
                    old_task=old_task,
                    new_task=new_task,
                    old_subtask=old_subtask,
                    new_subtask=new_subtask)
                deferred.addErrback(partial(handle_copy_error, new_subtask_id))
            else:  # Restart subtask to get it computed
                self.restart_subtask(new_subtask_id)

    def _copy_subtask_results(
            self,
            old_task: CoreTask,
            new_task: CoreTask,
            old_subtask: dict,
            new_subtask: dict) -> Deferred:
        """Copy one old subtask's zipped results into the new subtask.

        The zip copy/extract runs in a thread; the returned Deferred fires
        after the new subtask's state has been marked finished.
        """
        old_task_id = old_task.header.task_id
        new_task_id = new_task.header.task_id
        assert isinstance(old_task.tmp_dir, str)
        assert isinstance(new_task.tmp_dir, str)
        old_tmp_dir = Path(old_task.tmp_dir)
        new_tmp_dir = Path(new_task.tmp_dir)
        old_subtask_id = old_subtask['subtask_id']
        new_subtask_id = new_subtask['subtask_id']

        def copy_and_extract_zips():
            # TODO: Refactor this using package manager (?)
            old_result_path = old_tmp_dir / '{}.{}.zip'.format(
                old_task_id, old_subtask_id)
            new_result_path = new_tmp_dir / '{}.{}.zip'.format(
                new_task_id, new_subtask_id)
            shutil.copy(old_result_path, new_result_path)

            subtask_result_dir = new_tmp_dir / new_subtask_id
            os.makedirs(subtask_result_dir)
            with ZipFile(new_result_path, 'r') as zf:
                zf.extractall(subtask_result_dir)
                # '.package_desc' is packaging metadata, not a result file
                return [
                    str(subtask_result_dir / name) for name in zf.namelist()
                    if name != '.package_desc'
                ]

        def after_results_extracted(results):
            new_task.copy_subtask_results(
                new_subtask_id, old_subtask, results)

            # NOTE(review): both local states below are currently unused
            # after assignment — presumably kept for a follow-up copy of
            # computer/timing info; confirm before removing.
            new_subtask_state = \
                self.__set_subtask_state_finished(new_subtask_id)
            old_subtask_state = self.tasks_states[old_task_id] \
                .subtask_states[old_subtask_id]

            self.notice_task_updated(
                task_id=new_task_id,
                subtask_id=new_subtask_id,
                op=SubtaskOp.FINISHED)

        deferred = deferToThread(copy_and_extract_zips)
        deferred.addCallback(after_results_extracted)
        return deferred

    def get_tasks_headers(self):
        """Return headers of active tasks that still need computation."""
        ret = []
        for tid, task in self.tasks.items():
            status = self.tasks_states[tid].status
            if task.needs_computation() and status in self.activeStatus:
                ret.append(task.header)
        return ret

    def get_trust_mod(self, subtask_id):
        """Return the trust modifier for a subtask (0 if unknown)."""
        if subtask_id in self.subtask2task_mapping:
            task_id = self.subtask2task_mapping[subtask_id]
            return self.tasks[task_id].get_trust_mod(subtask_id)
        else:
            logger.error("This is not my subtask {}".format(subtask_id))
            return 0

    def update_task_signatures(self):
        """Re-sign all task headers (e.g. after a key change)."""
        for task in list(self.tasks.values()):
            task.header.signature = self.sign_task_header(task.header)

    def sign_task_header(self, task_header):
        """Sign a task header's binary representation with this node's key."""
        return self.keys_auth.sign(task_header.to_binary())

    def verify_subtask(self, subtask_id):
        """Verify a subtask via its owning task; False if unknown."""
        if subtask_id in self.subtask2task_mapping:
            task_id = self.subtask2task_mapping[subtask_id]
            return self.tasks[task_id].verify_subtask(subtask_id)
        else:
            return False

    def is_this_my_task(self, header: TaskHeader) -> bool:
        """True if this node requested the task or owns it."""
        return header.task_id in self.tasks or \
            header.task_owner.key == self.node.key

    def get_node_id_for_subtask(self, subtask_id):
        """Return the id of the node computing the subtask, or None."""
        if subtask_id not in self.subtask2task_mapping:
            return None
        task = self.subtask2task_mapping[subtask_id]
        subtask_state = self.tasks_states[task].subtask_states[subtask_id]
        return subtask_state.node_id

    @handle_subtask_key_error
    def computed_task_received(self, subtask_id, result,
                               verification_finished):
        """Handle an incoming subtask result and trigger verification.

        `verification_finished` is always invoked exactly once — either
        immediately (unexpected result) or after the task's asynchronous
        verification completes.
        """
        task_id = self.subtask2task_mapping[subtask_id]

        subtask_state = self.tasks_states[task_id].subtask_states[subtask_id]
        subtask_status = subtask_state.subtask_status

        if not subtask_status.is_computed():
            logger.warning(
                "Result for subtask {} when subtask state is {}".format(
                    subtask_id, subtask_status.value))
            self.notice_task_updated(task_id,
                                     subtask_id=subtask_id,
                                     op=OtherOp.UNEXPECTED)
            verification_finished()
            return
        subtask_state.subtask_status = SubtaskStatus.verifying

        @TaskManager.handle_generic_key_error
        def verification_finished_():
            # Runs once the task has verified this subtask's result.
            ss = self.__set_subtask_state_finished(subtask_id)
            if not self.tasks[task_id].verify_subtask(subtask_id):
                logger.debug("Subtask %r not accepted\n", subtask_id)
                ss.subtask_status = SubtaskStatus.failure
                self.notice_task_updated(
                    task_id,
                    subtask_id=subtask_id,
                    op=SubtaskOp.NOT_ACCEPTED)
                verification_finished()
                return

            self.notice_task_updated(task_id,
                                     subtask_id=subtask_id,
                                     op=SubtaskOp.FINISHED)

            if self.tasks_states[task_id].status in self.activeStatus:
                if not self.tasks[task_id].finished_computation():
                    self.tasks_states[task_id].status = TaskStatus.computing
                else:
                    if self.tasks[task_id].verify_task():
                        logger.info("Task finished! task_id=%r", task_id)
                        self.tasks_states[task_id].status =\
                            TaskStatus.finished
                        self.notice_task_updated(task_id,
                                                 op=TaskOp.FINISHED)
                    else:
                        logger.warning("Task finished but was not accepted. "
                                       "task_id=%r", task_id)
                        self.notice_task_updated(task_id,
                                                 op=TaskOp.NOT_ACCEPTED)
            verification_finished()

        self.tasks[task_id].computation_finished(
            subtask_id, result, verification_finished_)

    @handle_subtask_key_error
    def __set_subtask_state_finished(self, subtask_id: str) -> SubtaskState:
        """Mark a subtask finished and capture its stdout/stderr/results."""
        task_id = self.subtask2task_mapping[subtask_id]
        ss = self.tasks_states[task_id].subtask_states[subtask_id]
        ss.subtask_progress = 1.0
        ss.subtask_rem_time = 0.0
        ss.subtask_status = SubtaskStatus.finished
        ss.stdout = self.tasks[task_id].get_stdout(subtask_id)
        ss.stderr = self.tasks[task_id].get_stderr(subtask_id)
        ss.results = self.tasks[task_id].get_results(subtask_id)
        return ss

    @handle_subtask_key_error
    def task_computation_failure(self, subtask_id, err):
        """Record a provider-side computation failure for a subtask.

        :return bool: True if the failure was recorded, False if the
            subtask was not in a computed state
        """
        task_id = self.subtask2task_mapping[subtask_id]

        subtask_state = self.tasks_states[task_id].subtask_states[subtask_id]
        subtask_status = subtask_state.subtask_status

        if not subtask_status.is_computed():
            logger.warning(
                "Result for subtask {} when subtask state is {}".format(
                    subtask_id, subtask_status.value))
            self.notice_task_updated(task_id,
                                     subtask_id=subtask_id,
                                     op=OtherOp.UNEXPECTED)
            return False

        self.tasks[task_id].computation_failed(subtask_id)
        ss = self.tasks_states[task_id].subtask_states[subtask_id]
        ss.subtask_progress = 1.0
        ss.subtask_rem_time = 0.0
        ss.subtask_status = SubtaskStatus.failure
        ss.stderr = str(err)

        self.notice_task_updated(task_id,
                                 subtask_id=subtask_id,
                                 op=SubtaskOp.FAILED)
        return True

    def task_result_incoming(self, subtask_id):
        """Mark a subtask's result as downloading when a transfer starts."""
        node_id = self.get_node_id_for_subtask(subtask_id)

        if node_id and subtask_id in self.subtask2task_mapping:
            task_id = self.subtask2task_mapping[subtask_id]
            if task_id in self.tasks:
                task = self.tasks[task_id]
                states = self.tasks_states[task_id].subtask_states[subtask_id]

                task.result_incoming(subtask_id)
                states.subtask_status = SubtaskStatus.downloading

                self.notice_task_updated(
                    task_id,
                    subtask_id=subtask_id,
                    op=SubtaskOp.RESULT_DOWNLOADING)
            else:
                logger.error("Unknown task id: {}".format(task_id))
        else:
            logger.error("Node_id {} or subtask_id {} does not exist".format(
                node_id, subtask_id))

    # CHANGE TO RETURN KEY_ID (check IF SUBTASK COMPUTER HAS KEY_ID
    def check_timeouts(self):
        """Fail timed-out subtasks and tasks.

        :return: ids of nodes whose subtasks timed out
        """
        nodes_with_timeouts = []
        for t in list(self.tasks.values()):
            th = t.header
            if self.tasks_states[th.task_id].status not in self.activeStatus:
                continue
            cur_time = get_timestamp_utc()
            # Check subtask timeout
            ts = self.tasks_states[th.task_id]
            for s in list(ts.subtask_states.values()):
                if s.subtask_status.is_computed():
                    if cur_time > s.deadline:
                        logger.info("Subtask %r dies with status %r",
                                    s.subtask_id,
                                    s.subtask_status.value)
                        s.subtask_status = SubtaskStatus.failure
                        nodes_with_timeouts.append(s.node_id)
                        t.computation_failed(s.subtask_id)
                        s.stderr = "[GOLEM] Timeout"
                        self.notice_task_updated(th.task_id,
                                                 subtask_id=s.subtask_id,
                                                 op=SubtaskOp.TIMEOUT)
            # Check task timeout
            if cur_time > th.deadline:
                logger.info("Task %r dies", th.task_id)
                self.tasks_states[th.task_id].status = TaskStatus.timeout
                # TODO: t.tell_it_has_timeout()?
                self.notice_task_updated(th.task_id, op=TaskOp.TIMEOUT)
        return nodes_with_timeouts

    def get_progresses(self):
        """Return LocalTaskStateSnapshot per task that is still in progress."""
        tasks_progresses = {}

        for t in list(self.tasks.values()):
            task_id = t.header.task_id
            task_state = self.tasks_states[task_id]
            task_status = task_state.status
            in_progress = not TaskStatus.is_completed(task_status)
            logger.info('Collecting progress %r %r %r',
                        task_id, task_status, in_progress)
            if in_progress:
                ltss = LocalTaskStateSnapshot(
                    task_id,
                    t.get_total_tasks(),
                    t.get_active_tasks(),
                    t.get_progress(),
                    t.short_extra_data_repr(task_state.extra_data)
                )  # FIXME in short_extra_data_repr should there be extra data
                # Issue #2460
                tasks_progresses[task_id] = ltss

        return tasks_progresses

    @handle_task_key_error
    def assert_task_can_be_restarted(self, task_id: str) -> None:
        """:raises AlreadyRestartedError: if the task was restarted before"""
        task_state = self.tasks_states[task_id]
        if task_state.status == TaskStatus.restarted:
            raise self.AlreadyRestartedError()

    @handle_task_key_error
    def put_task_in_restarted_state(self, task_id, clear_tmp=True):
        """
        When restarting task, it's put in a final state 'restarted' and
        a new one is created.
        """
        self.assert_task_can_be_restarted(task_id)
        if clear_tmp:
            self.dir_manager.clear_temporary(task_id)

        task_state = self.tasks_states[task_id]
        task_state.status = TaskStatus.restarted
        for ss in self.tasks_states[task_id].subtask_states.values():
            # Failed subtasks keep their status for bookkeeping.
            if ss.subtask_status != SubtaskStatus.failure:
                ss.subtask_status = SubtaskStatus.restarted

        logger.info("Task %s put into restarted state", task_id)
        self.notice_task_updated(task_id, op=TaskOp.RESTARTED)

    @handle_subtask_key_error
    def restart_subtask(self, subtask_id):
        """Restart a single subtask and put its task back into computing."""
        task_id = self.subtask2task_mapping[subtask_id]
        self.tasks[task_id].restart_subtask(subtask_id)
        task_state = self.tasks_states[task_id]
        task_state.status = TaskStatus.computing

        subtask_state = task_state.subtask_states[subtask_id]
        subtask_state.subtask_status = SubtaskStatus.restarted
        subtask_state.stderr = "[GOLEM] Restarted"

        self.notice_task_updated(task_id,
                                 subtask_id=subtask_id,
                                 op=SubtaskOp.RESTARTED)

    @handle_task_key_error
    def restart_frame_subtasks(self, task_id, frame):
        """Restart all subtasks belonging to a single frame of a task."""
        task = self.tasks[task_id]
        task_state = self.tasks_states[task_id]
        subtasks = task.get_subtasks(frame)

        if not subtasks:
            return

        for subtask_id in list(subtasks.keys()):
            task.restart_subtask(subtask_id)
            subtask_state = task_state.subtask_states[subtask_id]
            subtask_state.subtask_status = SubtaskStatus.restarted
            subtask_state.stderr = "[GOLEM] Restarted"
            # persist=False: only the final FRAME_RESTARTED update is dumped
            self.notice_task_updated(task_id,
                                     subtask_id=subtask_id,
                                     op=SubtaskOp.RESTARTED,
                                     persist=False)

        task_state.status = TaskStatus.computing
        self.notice_task_updated(task_id, op=OtherOp.FRAME_RESTARTED)

    @handle_task_key_error
    def abort_task(self, task_id):
        """Abort a task and drop all of its subtask state."""
        self.tasks[task_id].abort()
        self.tasks_states[task_id].status = TaskStatus.aborted
        for sub in list(self.tasks_states[task_id].subtask_states.values()):
            del self.subtask2task_mapping[sub.subtask_id]
        self.tasks_states[task_id].subtask_states.clear()

        self.notice_task_updated(task_id, op=TaskOp.ABORTED)

    @rpc_utils.expose('comp.task.subtasks.frames')
    @handle_task_key_error
    def get_output_states(self, task_id):
        """RPC: return the task's output (frame) states."""
        return self.tasks[task_id].get_output_states()

    @handle_task_key_error
    def delete_task(self, task_id):
        """Remove a task, its state, temporary files and on-disk dump."""
        for sub in list(self.tasks_states[task_id].subtask_states.values()):
            del self.subtask2task_mapping[sub.subtask_id]
        self.tasks_states[task_id].subtask_states.clear()

        self.tasks[task_id].unregister_listener(self)
        del self.tasks[task_id]
        del self.tasks_states[task_id]

        self.dir_manager.clear_temporary(task_id)
        self.remove_dump(task_id)
        if self.finished_cb:
            self.finished_cb()

    @handle_task_key_error
    def query_task_state(self, task_id):
        """Refresh and return the task's state (progress, timing)."""
        ts = self.tasks_states[task_id]
        t = self.tasks[task_id]

        ts.progress = t.get_progress()
        ts.elapsed_time = time.time() - ts.time_started

        if ts.progress > 0.0:
            # Linear extrapolation of remaining time from current progress.
            proportion = (ts.elapsed_time / ts.progress)
            ts.remaining_time = proportion - ts.elapsed_time
        else:
            ts.remaining_time = None

        t.update_task_state(ts)

        return ts

    def get_subtasks(self, task_id) -> Optional[List[str]]:
        """
        Get all subtasks related to given task id
        :param task_id: Task ID
        :return: list of all subtasks related with @task_id or None
                 if @task_id is not known
        """
        task_state = self.tasks_states.get(task_id)
        if not task_state:
            return None

        subtask_states = list(task_state.subtask_states.values())
        return [subtask_state.subtask_id for subtask_state in subtask_states]

    def get_frame_subtasks(self, task_id: str, frame) \
            -> Optional[Dict[str, SubtaskState]]:
        """Return the subtasks of one frame, or None for unknown/non-core
        tasks."""
        task: Optional[Task] = self.tasks.get(task_id)
        if not task:
            return None
        if not isinstance(task, CoreTask):
            return None
        return task.get_subtasks(frame)

    def change_config(self, root_path, use_distributed_resource_management):
        """Apply a configuration change (paths and resource management)."""
        self.dir_manager = DirManager(root_path)
        self.use_distributed_resources = use_distributed_resource_management

    def get_task_id(self, subtask_id):
        """Return the id of the task owning the given subtask."""
        return self.subtask2task_mapping[subtask_id]

    def get_task_dict(self, task_id) -> Optional[Dict]:
        """Return a merged dictionary view of a task, or None if deleted."""
        task = self.tasks.get(task_id)
        if not task:  # task might have been deleted after the request was made
            return None

        task_type_name = task.task_definition.task_type.lower()
        task_type = self.task_types[task_type_name]
        state = self.query_task_state(task.header.task_id)

        dictionary = {
            'duration': state.elapsed_time,
            # single=True retrieves one preview file. If rendering frames,
            # it's the preview of the most recently computed frame.
            'preview': task_type.get_preview(task, single=True)
        }

        return update_dict(dictionary,
                           task.to_dictionary(),
                           state.to_dictionary(),
                           self.get_task_definition_dict(task))

    def get_tasks_dict(self) -> List[Dict]:
        """Return dictionary views of all known tasks (skipping deleted)."""
        task_ids = list(self.tasks.keys())
        mapped = map(self.get_task_dict, task_ids)
        filtered = filter(None, mapped)
        return list(filtered)

    def get_subtask_dict(self, subtask_id):
        """Return a dictionary view of a single subtask's state."""
        task_id = self.subtask2task_mapping[subtask_id]
        task_state = self.tasks_states[task_id]
        subtask = task_state.subtask_states[subtask_id]
        return subtask.to_dictionary()

    def get_subtasks_dict(self, task_id):
        """Return dictionary views of a task's subtasks (None when empty)."""
        task_state = self.tasks_states[task_id]
        subtasks = task_state.subtask_states
        if subtasks:
            return [subtask.to_dictionary() for subtask in subtasks.values()]

    @rpc_utils.expose('comp.task.subtasks.borders')
    def get_subtasks_borders(self, task_id, part=1):
        """RPC: return border paths for the subtasks of one task part."""
        task = self.tasks[task_id]
        task_type_name = task.task_definition.task_type.lower()
        task_type = self.task_types[task_type_name]
        subtasks_count = task.get_total_tasks()

        return {
            to_unicode(subtask_id): task_type.get_task_border(
                subtask, task.task_definition, subtasks_count, as_path=True
            ) for subtask_id, subtask in task.get_subtasks(part).items()
        }

    def get_task_preview(self, task_id, single=False):
        """Return preview file(s) for the task via its task type."""
        task = self.tasks[task_id]
        task_type_name = task.task_definition.task_type.lower()
        task_type = self.task_types[task_type_name]
        return task_type.get_preview(task, single=single)

    def add_comp_task_request(self, theader, price):
        """ Add a header of a task which this node may try to compute """
        self.comp_task_keeper.add_request(theader, price)

    def get_estimated_cost(self, task_type, options):
        """Estimate total task cost from price, subtask time and count.

        :return: estimated cost, or None when options are missing/invalid
        """
        try:
            subtask_value = options['price'] * options['subtask_time']
            return options['num_subtasks'] * subtask_value
        except (KeyError, ValueError):
            logger.exception("Cannot estimate price, wrong params")
            return None

    def __add_subtask_to_tasks_states(self, node_name, node_id,
                                      ctd, address):
        """Create and register a SubtaskState from a ComputeTaskDef."""
        logger.debug('add_subtask_to_tasks_states(%r, %r, %r, %r)',
                     node_name, node_id, ctd, address)

        ss = SubtaskState()
        ss.time_started = time.time()
        ss.node_id = node_id
        ss.node_name = node_name
        ss.deadline = ctd['deadline']
        ss.subtask_definition = ctd['short_description']
        ss.subtask_id = ctd['subtask_id']
        ss.extra_data = ctd['extra_data']
        ss.subtask_status = SubtaskStatus.starting

        (self.tasks_states[ctd['task_id']].
            subtask_states[ctd['subtask_id']]) = ss

    def notify_update_task(self, task_id):
        """Public alias for notice_task_updated with default arguments."""
        self.notice_task_updated(task_id)

    @handle_task_key_error
    def notice_task_updated(self, task_id: str, subtask_id: str = None,
                            op: Operation = None, persist: bool = True):
        """Called when a task is modified, saves the task and
        propagates information

        Whenever task is changed `notice_task_updated` should be called
        to save the task - if the change is save-worthy, as specified
        by the `persist` parameter - and propagate information
        about changed task to other parts of the system.

        Most of the calls are save-worthy, but a minority is not: for
        instance when the work offer is received, the task does not
        change so saving it does not make sense, but it still makes
        sense to let other parts of the system know about the change.
        Also, when a number of minor changes are always followed by a
        major one, as it is with restarting a frame task, it does not
        make sense to store all the partial changes, so only the
        final one is considered save-worthy.

        :param str task_id: id of the updated task
        :param str subtask_id: if the operation done on the
          task is related to a subtask, id of that subtask
        :param Operation op: performed operation
        :param bool persist: should the task be persisted now
        """
        # self.save_state()
        logger.debug(
            "Notice task updated. task_id=%s, subtask_id=%s,"
            "op=%s, persist=%s",
            task_id, subtask_id, op, persist,
        )

        if persist and self.task_persistence:
            self.dump_task(task_id)
        task_state = self.tasks_states.get(task_id)
        dispatcher.send(
            signal='golem.taskmanager',
            event='task_status_updated',
            task_id=task_id,
            task_state=task_state,
            subtask_id=subtask_id,
            op=op,
        )
        if self.finished_cb and persist and op \
                and op.task_related() and op.is_completed():
            self.finished_cb()
class CoreTask(Task):
    """Base implementation of a Golem task split into subtasks.

    Tracks subtask bookkeeping (given / received / failed counts),
    per-subtask stdout/stderr/results, and delegates result checking to
    an instance of ``VERIFICATOR_CLASS``.
    """

    # Verificator used for checking subtask results; subclasses may override.
    VERIFICATOR_CLASS = CoreVerificator
    handle_key_error = HandleKeyError(log_key_error)

    ################
    # Task methods #
    ################

    def __init__(self, src_code, task_definition, node_name, environment,
                 resource_size=0, owner_address="", owner_port=0,
                 owner_key_id="",
                 max_pending_client_results=MAX_PENDING_CLIENT_RESULTS):
        """Create more specific task implementation

        :param src_code: source code executed for each subtask
        :param task_definition: definition with timeouts, price, images etc.
        :param str node_name: name of the owning node
        :param environment: execution environment identifier
        :param int resource_size: total size of task resources
        :param str owner_address: owner's network address
        :param int owner_port: owner's network port
        :param str owner_key_id: owner's public key id
        :param int max_pending_client_results: per-client limit used by
            _accept_client to throttle pending results
        """
        self.task_definition = task_definition
        task_timeout = task_definition.full_task_timeout
        deadline = timeout_to_deadline(task_timeout)
        th = TaskHeader(
            node_name=node_name,
            task_id=task_definition.task_id,
            task_owner_address=owner_address,
            task_owner_port=owner_port,
            task_owner_key_id=owner_key_id,
            environment=environment,
            task_owner=Node(),
            deadline=deadline,
            subtask_timeout=task_definition.subtask_timeout,
            resource_size=resource_size,
            estimated_memory=task_definition.estimated_memory,
            max_price=task_definition.max_price,
            docker_images=task_definition.docker_images,
        )
        Task.__init__(self, th, src_code)

        self.task_resources = list()

        # Subtask counters: total to compute, last handed out, received back,
        # and failed (failed ones are re-queued via get_tasks_left).
        self.total_tasks = 0
        self.last_task = 0

        self.num_tasks_received = 0
        self.subtasks_given = {}
        self.num_failed_subtasks = 0

        self.full_task_timeout = task_timeout
        self.counting_nodes = {}

        self.root_path = None

        self.stdout = {}  # for each subtask keep info about stdout received from computing node
        self.stderr = {}  # for each subtask keep info about stderr received from computing node
        self.results = {}  # for each subtask keep info about files containing results

        self.res_files = {}
        self.tmp_dir = None
        self.verificator = self.VERIFICATOR_CLASS()
        self.max_pending_client_results = max_pending_client_results

    def is_docker_task(self):
        # True when the task header declares at least one docker image.
        return hasattr(self.header, 'docker_images') and len(self.header.docker_images) > 0

    def initialize(self, dir_manager):
        # Create (if needed) the task's temporary dir and share it with
        # the verificator.
        self.tmp_dir = dir_manager.get_task_temporary_dir(self.header.task_id,
                                                          create=True)
        self.verificator.tmp_dir = self.tmp_dir

    def needs_computation(self):
        # More work remains if not all subtasks were handed out, or some
        # failed and must be recomputed.
        return (self.last_task != self.total_tasks) or \
            (self.num_failed_subtasks > 0)

    def finished_computation(self):
        return self.num_tasks_received == self.total_tasks

    def computation_failed(self, subtask_id):
        self._mark_subtask_failed(subtask_id)

    def computation_finished(self, subtask_id, task_result, result_type=0):
        """Interpret, verify and accept (or fail) results for a subtask."""
        if not self.should_accept(subtask_id):
            logger.info("Not accepting results for {}".format(subtask_id))
            return
        self.interpret_task_results(subtask_id, task_result, result_type)
        result_files = self.results.get(subtask_id)
        ver_state = self.verificator.verify(
            subtask_id, self.subtasks_given.get(subtask_id),
            result_files, self)
        if ver_state == SubtaskVerificationState.VERIFIED:
            self.accept_results(subtask_id, result_files)
        # TODO Add support for different verification states
        else:
            self.computation_failed(subtask_id)

    def accept_results(self, subtask_id, result_files):
        # Subclasses may extend; base version only flips the status.
        self.subtasks_given[subtask_id]['status'] = SubtaskStatus.finished

    @handle_key_error
    def verify_subtask(self, subtask_id):
        return self.subtasks_given[subtask_id][
            'status'] == SubtaskStatus.finished

    def verify_task(self):
        return self.finished_computation()

    def get_total_tasks(self):
        return self.total_tasks

    def get_active_tasks(self):
        return self.last_task

    def get_tasks_left(self):
        return (self.total_tasks - self.last_task) + self.num_failed_subtasks

    def restart(self):
        # Restart every known subtask; restart_subtask only mutates
        # per-subtask values, never the key set.
        for subtask_id in self.subtasks_given.keys():
            self.restart_subtask(subtask_id)

    @handle_key_error
    def restart_subtask(self, subtask_id):
        """Mark a subtask for recomputation, undoing received-count if it
        had already finished."""
        subtask_info = self.subtasks_given[subtask_id]
        was_failure_before = subtask_info['status'] in [
            SubtaskStatus.failure, SubtaskStatus.resent]

        if SubtaskStatus.is_computed(subtask_info['status']):
            self._mark_subtask_failed(subtask_id)
        elif subtask_info['status'] == SubtaskStatus.finished:
            self._mark_subtask_failed(subtask_id)
            # A finished subtask may cover a range of tasks; take them all
            # back from the received counter.
            tasks = subtask_info['end_task'] - subtask_info['start_task'] + 1
            self.num_tasks_received -= tasks

        if not was_failure_before:
            subtask_info['status'] = SubtaskStatus.restarted

    def abort(self):
        pass

    def get_progress(self):
        # NOTE(review): relies on true division; under Python 2 without
        # `from __future__ import division` this would truncate — confirm
        # the module's import block.
        if self.total_tasks == 0:
            return 0.0
        return self.num_tasks_received / self.total_tasks

    def get_resources(self, resource_header, resource_type=0, tmp_dir=None):
        """Return task resources in the format selected by resource_type
        (zip delta, parts header delta, or a plain hash list); None when
        the resources root does not exist or the type is unknown."""
        dir_name = self._get_resources_root_dir()
        if tmp_dir is None:
            tmp_dir = self.tmp_dir

        if os.path.exists(dir_name):
            if resource_type == resource_types["zip"]:
                return prepare_delta_zip(dir_name, resource_header, tmp_dir,
                                         self.task_resources)
            elif resource_type == resource_types["parts"]:
                return TaskResourceHeader.build_parts_header_delta_from_chosen(
                    resource_header, dir_name, self.res_files)
            elif resource_type == resource_types["hashes"]:
                return copy.copy(self.task_resources)

        return None

    def update_task_state(self, task_state):
        pass

    @handle_key_error
    def get_trust_mod(self, subtask_id):
        # Neutral trust modifier; decorated so unknown subtask ids are
        # handled by log_key_error.
        return 1.0

    def add_resources(self, res_files):
        self.res_files = res_files

    def get_stderr(self, subtask_id):
        return self.stderr.get(subtask_id, "")

    def get_stdout(self, subtask_id):
        return self.stdout.get(subtask_id, "")

    def get_results(self, subtask_id):
        return self.results.get(subtask_id, [])

    def to_dictionary(self):
        # Unicode-keyed summary used by external consumers (py2/py3 compat).
        return {
            u'id': to_unicode(self.header.task_id),
            u'name': to_unicode(self.task_definition.task_name),
            u'type': to_unicode(self.task_definition.task_type),
            u'subtasks': self.get_total_tasks(),
            u'progress': self.get_progress()
        }

    #########################
    # Specific task methods #
    #########################

    def interpret_task_results(self, subtask_id, task_results, result_type,
                               sort=True):
        """Filter out ".log" files from received results.
        Log files should represent stdout and stderr from computing machine.
        Other files should represent subtask results.

        :param subtask_id: id of a subtask for which results are received
        :param task_results: it may be a list of files, if result_type is
            equal to result_types["files"], or it may be a cbor serialized
            zip file containing all files, if result_type is equal to
            result_types["data"]
        :param result_type: a number from result_types; it represents
            data format or files format
        :param bool sort: *default: True* Sort results, if set to True
        """
        self.stdout[subtask_id] = ""
        self.stderr[subtask_id] = ""
        tr_files = self.load_task_results(task_results, result_type,
                                          subtask_id)
        self.results[subtask_id] = self.filter_task_results(
            tr_files, subtask_id)
        if sort:
            self.results[subtask_id].sort()

    @handle_key_error
    def result_incoming(self, subtask_id):
        # A result download has started: release the node's pending slot
        # and mark the subtask as downloading.
        self.counting_nodes[self.subtasks_given[subtask_id]
                            ['node_id']].finish()
        self.subtasks_given[subtask_id]['status'] = SubtaskStatus.downloading

    def query_extra_data_for_test_task(self):
        return None  # Implement in derived methods

    def load_task_results(self, task_result, result_type, subtask_id):
        """Change results to a list of files. If result_type is equal to
        result_types["files"] this function only returns task_result
        without making any changes. If result_type is equal to
        result_types["data"] then task_result is cbor-deserialized,
        decompressed, and the files are saved in tmp_dir.

        :param task_result: list of files or cbor serialized zipped file
            with files
        :param result_type: result_types element
        :param str subtask_id:
        :return: list of result file paths
        """
        if result_type == result_types['data']:
            output_dir = os.path.join(self.tmp_dir, subtask_id)
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)
            return [
                self._unpack_task_result(trp, output_dir)
                for trp in task_result
            ]
        elif result_type == result_types['files']:
            return task_result
        else:
            logger.error(
                "Task result type not supported {}".format(result_type))
            self.stderr[
                subtask_id] = "[GOLEM] Task result {} not supported".format(
                    result_type)
            return []

    def filter_task_results(self, task_results, subtask_id, log_ext=".log",
                            err_log_ext="err.log"):
        """From a list of files received in task_results, return only files
        that don't have extension <log_ext> or <err_log_ext>. File with
        log_ext is saved as stdout for this subtask (only one file is
        currently supported). File with err_log_ext is saved as stderr
        for this subtask (only one file is currently supported).

        :param list task_results: list of files
        :param str subtask_id: id of a given subtask
        :param str log_ext: extension that stdout files have
        :param str err_log_ext: extension that stderr files have
        :return: list of filtered result file paths
        """
        filtered_task_results = []
        for tr in task_results:
            # err.log must be tested first: it also ends with ".log".
            if tr.endswith(err_log_ext):
                self.stderr[subtask_id] = tr
            elif tr.endswith(log_ext):
                self.stdout[subtask_id] = tr
            else:
                try:
                    # Move the result one directory up, replacing any
                    # previous file of the same name.
                    new_tr = outer_dir_path(tr)
                    if os.path.isfile(new_tr):
                        os.remove(new_tr)
                    os.rename(tr, new_tr)
                    filtered_task_results.append(new_tr)
                except (IOError, OSError) as err:
                    logger.warning("Cannot move file {} to new location: "
                                   "{}".format(tr, err))

        return filtered_task_results

    def after_test(self, results, tmp_dir):
        return {}

    def notify_update_task(self):
        for l in self.listeners:
            l.notify_update_task(self.header.task_id)

    @handle_key_error
    def should_accept(self, subtask_id):
        # Accept results only for subtasks still in a "computed" state.
        status = self.subtasks_given[subtask_id]['status']
        return SubtaskStatus.is_computed(status)

    @staticmethod
    def _interpret_log(log):
        """Return log content: "" for None, file content if `log` is a
        readable file path, otherwise the value itself."""
        if log is None:
            return ""
        if not os.path.isfile(log):
            return log
        try:
            with open(log) as f:
                res = f.read()
            return res
        except IOError as err:
            logger.error("Can't read file {}: {}".format(log, err))
            return ""

    @handle_key_error
    def _mark_subtask_failed(self, subtask_id):
        # Record the failure against both the subtask and the computing
        # node's reputation counter.
        self.subtasks_given[subtask_id]['status'] = SubtaskStatus.failure
        self.counting_nodes[self.subtasks_given[subtask_id]
                            ['node_id']].reject()
        self.num_failed_subtasks += 1

    def _unpack_task_result(self, trp, output_dir):
        # trp is a CBOR-serialized (name, compressed-bytes) pair; write the
        # decompressed payload and return its path.
        tr = CBORSerializer.loads(trp)
        with open(os.path.join(output_dir, tr[0]), "wb") as fh:
            fh.write(decompress(tr[1]))
        return os.path.join(output_dir, tr[0])

    def _get_resources_root_dir(self):
        # Common directory of all task resources.
        prefix = os.path.commonprefix(self.task_resources)
        return os.path.dirname(prefix)

    def _accept_client(self, node_id):
        """Decide whether node_id may compute another subtask: rejected
        clients are refused; clients with too many pending (started or
        finishing) results must wait; otherwise accept and count a start."""
        client = TaskClient.assert_exists(node_id, self.counting_nodes)
        finishing = client.finishing()
        max_finishing = self.max_pending_client_results

        if client.rejected():
            return AcceptClientVerdict.REJECTED
        elif finishing >= max_finishing or \
                client.started() - finishing >= max_finishing:
            return AcceptClientVerdict.SHOULD_WAIT

        client.start()
        return AcceptClientVerdict.ACCEPTED