def _get_work_task_id(self, get_work_response): if get_work_response['task_id'] is not None: return get_work_response['task_id'] elif 'batch_id' in get_work_response: try: task = load_task( module=get_work_response.get('task_module'), task_name=get_work_response['task_family'], params_str=get_work_response['task_params'], ) except Exception as ex: self._handle_task_load_error(ex, get_work_response['batch_task_ids']) self.run_succeeded = False return None self._scheduler.add_task( worker=self._id, task_id=task.task_id, module=get_work_response.get('task_module'), family=get_work_response['task_family'], params=task.to_str_params(), status=RUNNING, batch_id=get_work_response['batch_id'], ) return task.task_id
def _get_work(self): if self._stop_requesting_work: return None, 0, 0, 0 logger.debug("Asking scheduler for work...") r = self._scheduler.get_work(worker=self._id, host=self.host, assistant=self._assistant) n_pending_tasks = r["n_pending_tasks"] task_id = r["task_id"] running_tasks = r["running_tasks"] n_unique_pending = r["n_unique_pending"] self._get_work_response_history.append(dict(task_id=task_id, running_tasks=running_tasks)) if task_id is not None and task_id not in self._scheduled_tasks: logger.info("Did not schedule %s, will load it dynamically", task_id) try: # TODO: we should obtain the module name from the server! self._scheduled_tasks[task_id] = load_task( module=r.get("task_module"), task_name=r["task_family"], params_str=r["task_params"] ) except TaskClassException as ex: msg = "Cannot find task for %s" % task_id logger.exception(msg) subject = "Luigi: %s" % msg error_message = notifications.wrap_traceback(ex) notifications.send_error_email(subject, error_message) self._add_task( worker=self._id, task_id=task_id, status=FAILED, runnable=False, assistant=self._assistant ) task_id = None self.run_succeeded = False return task_id, running_tasks, n_pending_tasks, n_unique_pending
def _get_work(self): logger.debug("Asking scheduler for work...") r = self._scheduler.get_work(worker=self._id, host=self.host, assistant=self._assistant) n_pending_tasks = r['n_pending_tasks'] task_id = r['task_id'] running_tasks = r['running_tasks'] n_unique_pending = r['n_unique_pending'] if task_id is not None and task_id not in self._scheduled_tasks: logger.info('Did not schedule %s, will load it dynamically', task_id) try: # TODO: we should obtain the module name from the server! self._scheduled_tasks[task_id] = \ load_task(module=r.get('task_module'), task_name=r['task_family'], params_str=r['task_params']) except TaskClassException as ex: msg = 'Cannot find task for %s' % task_id logger.exception(msg) subject = 'Luigi: %s' % msg error_message = notifications.wrap_traceback(ex) notifications.send_error_email(subject, error_message) self._scheduler.add_task(worker=self._id, task_id=task_id, status=FAILED, runnable=False, assistant=self._assistant) task_id = None self.run_succeeded = False return task_id, running_tasks, n_pending_tasks, n_unique_pending
def _get_work_task_id(self, get_work_response): if get_work_response.get('task_id') is not None: return get_work_response['task_id'] elif 'batch_id' in get_work_response: try: task = load_task( module=get_work_response.get('task_module'), task_name=get_work_response['task_family'], params_str=get_work_response['task_params'], ) except Exception as ex: self._handle_task_load_error(ex, get_work_response['batch_task_ids']) self.run_succeeded = False return None self._scheduler.add_task( worker=self._id, task_id=task.task_id, module=get_work_response.get('task_module'), family=get_work_response['task_family'], params=task.to_str_params(), status=RUNNING, batch_id=get_work_response['batch_id'], ) return task.task_id else: return None
def _get_work(self): if self._stop_requesting_work: return GetWorkResponse(None, 0, 0, 0, 0, WORKER_STATE_DISABLED) if self.worker_processes > 0: logger.debug("Asking scheduler for work...") r = self._scheduler.get_work( worker=self._id, host=self.host, assistant=self._assistant, current_tasks=list(self._running_tasks.keys()), ) else: logger.debug("Checking if tasks are still pending") r = self._scheduler.count_pending(worker=self._id) running_tasks = r['running_tasks'] task_id = self._get_work_task_id(r) self._get_work_response_history.append({ 'task_id': task_id, 'running_tasks': running_tasks, }) if task_id is not None and task_id not in self._scheduled_tasks: logger.info('Did not schedule %s, will load it dynamically', task_id) try: # TODO: we should obtain the module name from the server! self._scheduled_tasks[task_id] = \ load_task(module=r.get('task_module'), task_name=r['task_family'], params_str=r['task_params']) except TaskClassException as ex: self._handle_task_load_error(ex, [task_id]) task_id = None self.run_succeeded = False if task_id is not None and 'batch_task_ids' in r: batch_tasks = filter(None, [ self._scheduled_tasks.get(batch_id) for batch_id in r['batch_task_ids'] ]) self._batch_running_tasks[task_id] = batch_tasks return GetWorkResponse( task_id=task_id, running_tasks=running_tasks, n_pending_tasks=r['n_pending_tasks'], n_unique_pending=r['n_unique_pending'], # TODO: For a tiny amount of time (a month?) we'll keep forwards compatibility # That is you can user a newer client than server (Sep 2016) n_pending_last_scheduled=r.get('n_pending_last_scheduled', 0), worker_state=r.get('worker_state', WORKER_STATE_ACTIVE), )
def _get_work(self): if self._stop_requesting_work: return None, 0, 0, 0, WORKER_STATE_DISABLED logger.debug("Asking scheduler for work...") r = self._scheduler.get_work( worker=self._id, host=self.host, assistant=self._assistant, current_tasks=list(self._running_tasks.keys()), ) n_pending_tasks = r['n_pending_tasks'] running_tasks = r['running_tasks'] n_unique_pending = r['n_unique_pending'] # TODO: For a tiny amount of time (a month?) we'll keep forwards compatibility # That is you can user a newer client than server (Sep 2016) worker_state = r.get('worker_state', WORKER_STATE_ACTIVE) # state according to server! task_id = self._get_work_task_id(r) self._get_work_response_history.append({ 'task_id': task_id, 'running_tasks': running_tasks, }) if task_id is not None and task_id not in self._scheduled_tasks: logger.info('Did not schedule %s, will load it dynamically', task_id) try: # TODO: we should obtain the module name from the server! self._scheduled_tasks[task_id] = \ load_task(module=r.get('task_module'), task_name=r['task_family'], params_str=r['task_params']) except TaskClassException as ex: msg = 'Cannot find task for %s' % task_id logger.exception(msg) subject = 'Luigi: %s' % msg error_message = notifications.wrap_traceback(ex) notifications.send_error_email(subject, error_message) self._add_task(worker=self._id, task_id=task_id, status=FAILED, runnable=False, assistant=self._assistant) task_id = None self.run_succeeded = False if task_id is not None and 'batch_task_ids' in r: batch_tasks = filter(None, [ self._scheduled_tasks.get(batch_id) for batch_id in r['batch_task_ids'] ]) self._batch_running_tasks[task_id] = batch_tasks return task_id, running_tasks, n_pending_tasks, n_unique_pending, worker_state
def _get_work(self): if self._stop_requesting_work: return GetWorkResponse(None, 0, 0, 0, 0, WORKER_STATE_DISABLED) if self.worker_processes > 0: logger.debug("Asking scheduler for work...") r = self._scheduler.get_work( worker=self._id, host=self.host, assistant_groups=self._config.assistant_groups if self._assistant else None, current_tasks=list(self._running_tasks.keys()), ) else: logger.debug("Checking if tasks are still pending") r = self._scheduler.count_pending(worker=self._id) running_tasks = r['running_tasks'] task_id = self._get_work_task_id(r) if self.show_execution_summary: self._get_work_response_history.append({ 'task_id': task_id, 'running_tasks': running_tasks, }) if task_id is not None and task_id not in self._scheduled_tasks: logger.info('Did not schedule %s, will load it dynamically', task_id) try: # TODO: we should obtain the module name from the server! self._scheduled_tasks[task_id] = \ load_task(module=r.get('task_module'), task_name=r['task_family'], params_str=r['task_params']) except Exception as ex: self._handle_task_load_error(ex, [task_id]) task_id = None self.run_succeeded = False if task_id is not None and 'batch_task_ids' in r: batch_tasks = filter(None, [ self._scheduled_tasks.get(batch_id) for batch_id in r['batch_task_ids']]) self._batch_running_tasks[task_id] = batch_tasks return GetWorkResponse( task_id=task_id, running_tasks=running_tasks, n_pending_tasks=r['n_pending_tasks'], n_unique_pending=r['n_unique_pending'], # TODO: For a tiny amount of time (a month?) we'll keep forwards compatibility # That is you can user a newer client than server (Sep 2016) n_pending_last_scheduled=r.get('n_pending_last_scheduled', 0), worker_state=r.get('worker_state', WORKER_STATE_ACTIVE), )
def _get_work(self): if self._stop_requesting_work: return None, 0, 0, 0 logger.debug("Asking scheduler for work...") r = self._scheduler.get_work( worker=self._id, host=self.host, assistant=self._assistant, current_tasks=list(self._running_tasks.keys()), ) n_pending_tasks = r['n_pending_tasks'] running_tasks = r['running_tasks'] n_unique_pending = r['n_unique_pending'] task_id = self._get_work_task_id(r) self._get_work_response_history.append({ 'task_id': task_id, 'running_tasks': running_tasks, }) if task_id is not None and task_id not in self._scheduled_tasks: logger.info('Did not schedule %s, will load it dynamically', task_id) try: # TODO: we should obtain the module name from the server! self._scheduled_tasks[task_id] = \ load_task(module=r.get('task_module'), task_name=r['task_family'], params_str=r['task_params']) except TaskClassException as ex: msg = 'Cannot find task for %s' % task_id logger.exception(msg) subject = 'Luigi: %s' % msg error_message = notifications.wrap_traceback(ex) notifications.send_error_email(subject, error_message) self._add_task(worker=self._id, task_id=task_id, status=FAILED, runnable=False, assistant=self._assistant) task_id = None self.run_succeeded = False if task_id is not None and 'batch_task_ids' in r: batch_tasks = filter(None, [ self._scheduled_tasks.get(batch_id) for batch_id in r['batch_task_ids']]) self._batch_running_tasks[task_id] = batch_tasks return task_id, running_tasks, n_pending_tasks, n_unique_pending
def _get_work(self): if self._stop_requesting_work: return None, 0, 0, 0, WORKER_STATE_DISABLED logger.debug("Asking scheduler for work...") r = self._scheduler.get_work( worker=self._id, host=self.host, assistant=self._assistant, current_tasks=list(self._running_tasks.keys()), ) n_pending_tasks = r['n_pending_tasks'] running_tasks = r['running_tasks'] n_unique_pending = r['n_unique_pending'] # TODO: For a tiny amount of time (a month?) we'll keep forwards compatibility # That is you can user a newer client than server (Sep 2016) worker_state = r.get('worker_state', WORKER_STATE_ACTIVE) # state according to server! task_id = self._get_work_task_id(r) self._get_work_response_history.append({ 'task_id': task_id, 'running_tasks': running_tasks, }) if task_id is not None and task_id not in self._scheduled_tasks: logger.info('Did not schedule %s, will load it dynamically', task_id) try: # TODO: we should obtain the module name from the server! self._scheduled_tasks[task_id] = \ load_task(module=r.get('task_module'), task_name=r['task_family'], params_str=r['task_params']) except TaskClassException as ex: self._handle_task_load_error(ex, [task_id]) task_id = None self.run_succeeded = False if task_id is not None and 'batch_task_ids' in r: batch_tasks = filter(None, [ self._scheduled_tasks.get(batch_id) for batch_id in r['batch_task_ids']]) self._batch_running_tasks[task_id] = batch_tasks return task_id, running_tasks, n_pending_tasks, n_unique_pending, worker_state
def _get_work_task_id(self, get_work_response): if get_work_response['task_id'] is not None: return get_work_response['task_id'] elif 'batch_id' in get_work_response: task = load_task( module=get_work_response.get('task_module'), task_name=get_work_response['task_family'], params_str=get_work_response['task_params'], ) self._scheduler.add_task( worker=self._id, task_id=task.task_id, module=get_work_response.get('task_module'), family=get_work_response['task_family'], params=task.to_str_params(), status=RUNNING, batch_id=get_work_response['batch_id'], ) return task.task_id
def inflate_cls(mod_cls, params): """ Reuse Luigi's service that registers task types so that we can dynamically instantiate an instance of this class. Like Luigi, we assume that the mod_cls is 'module.class' and we assume that the user has put there pipe location on PYTHONPATH. :param mod_cls: '<module>.<class>' :param params: Dictionary of parameter to value :return: Instance of the task in question """ mod_path = mod_cls.split('.') mod = '.'.join(mod_path[:-1]) cls = mod_path[-1] if mod == '': mod = None task = load_task(mod, cls, params) return task
def _handle_next_task(self): """ We have to catch three ways a task can be "done": 1. normal execution: the task runs/fails and puts a result back on the queue, 2. new dependencies: the task yielded new deps that were not complete and will be rescheduled and dependencies added, 3. child process dies: we need to catch this separately. """ while True: self._purge_children() # Deal with subprocess failures try: task_id, status, expl, missing, new_requirements = ( self._task_result_queue.get( timeout=self._config.wait_interval)) except Queue.Empty: return task = self._scheduled_tasks[task_id] if not task or task_id not in self._running_tasks: continue # Not a running task. Probably already removed. # Maybe it yielded something? # external task if run not implemented, retry-able if config option is enabled. external_task_retryable = _is_external(task) and self._config.retry_external_tasks if status == FAILED and not external_task_retryable: self._email_task_failure(task, expl) new_deps = [] if new_requirements: new_req = [load_task(module, name, params) for module, name, params in new_requirements] for t in new_req: self.add(t) new_deps = [t.task_id for t in new_req] self._add_task(worker=self._id, task_id=task_id, status=status, expl=json.dumps(expl), resources=task.process_resources(), runnable=None, params=task.to_str_params(), family=task.task_family, module=task.task_module, new_deps=new_deps, assistant=self._assistant, retry_policy_dict=_get_retry_policy_dict(task)) self._running_tasks.pop(task_id) # re-add task to reschedule missing dependencies if missing: reschedule = True # keep out of infinite loops by not rescheduling too many times for task_id in missing: self.unfulfilled_counts[task_id] += 1 if (self.unfulfilled_counts[task_id] > self._config.max_reschedules): reschedule = False if reschedule: self.add(task) self.run_succeeded &= (status == DONE) or (len(new_deps) > 0) return
def test_external_tasks_loadable(self): task = load_task("luigi", "ExternalTask", {}) assert(isinstance(task, luigi.ExternalTask))
def _handle_next_task(self): """ We have to catch three ways a task can be "done": 1. normal execution: the task runs/fails and puts a result back on the queue, 2. new dependencies: the task yielded new deps that were not complete and will be rescheduled and dependencies added, 3. child process dies: we need to catch this separately. """ while True: self._purge_children() # Deal with subprocess failures try: task_id, status, expl, missing, new_requirements = ( self._task_result_queue.get( timeout=self._config.wait_interval)) except Queue.Empty: return task = self._scheduled_tasks[task_id] if not task or task_id not in self._running_tasks: continue # Not a running task. Probably already removed. # Maybe it yielded something? # external task if run not implemented, retry-able if config option is enabled. external_task_retryable = _is_external(task) and self._config.retry_external_tasks if status == FAILED and not external_task_retryable: self._email_task_failure(task, expl) new_deps = [] if new_requirements: new_req = [load_task(module, name, params) for module, name, params in new_requirements] for t in new_req: self.add(t) new_deps = [t.task_id for t in new_req] self._add_task(worker=self._id, task_id=task_id, status=status, expl=json.dumps(expl), resources=task.process_resources(), runnable=None, params=task.to_str_params(), family=task.task_family, module=task.task_module, new_deps=new_deps, assistant=self._assistant) self._running_tasks.pop(task_id) # re-add task to reschedule missing dependencies if missing: reschedule = True # keep out of infinite loops by not rescheduling too many times for task_id in missing: self.unfulfilled_counts[task_id] += 1 if (self.unfulfilled_counts[task_id] > self._config.max_reschedules): reschedule = False if reschedule: self.add(task) self.run_succeeded &= (status == DONE) or (len(new_deps) > 0) return
def make_task(self, task_module : str) -> luigi.Task: """ Reifies the luigi.Task object from its name and saved parameters """ return load_task( task_module, self.name, {name: param.value for name, param in self.parameters.items()})
def _handle_next_task(self): """ We have to catch three ways a task can be "done": 1. normal execution: the task runs/fails and puts a result back on the queue, 2. new dependencies: the task yielded new deps that were not complete and will be rescheduled and dependencies added, 3. child process dies: we need to catch this separately. """ while True: self._purge_children() # Deal with subprocess failures try: task_id, status, error_message, missing, new_requirements = ( self._task_result_queue.get( timeout=float(self._config.wait_interval))) except Queue.Empty: return task = self._scheduled_tasks[task_id] if not task or task_id not in self._running_tasks: continue # Not a running task. Probably already removed. # Maybe it yielded something? new_deps = [] if new_requirements: new_req = [ load_task(module, name, params) for module, name, params in new_requirements ] for t in new_req: self.add(t) new_deps = [t.task_id for t in new_req] self._scheduler.add_task(worker=self._id, task_id=task_id, status=status, expl=error_message, resources=task.process_resources(), runnable=None, params=task.to_str_params(), family=task.task_family, module=task.task_module, new_deps=new_deps, assistant=self._assistant) if status == RUNNING: continue self._running_tasks.pop(task_id) # re-add task to reschedule missing dependencies if missing: reschedule = True # keep out of infinite loops by not rescheduling too many times for task_id in missing: self.unfulfilled_counts[task_id] += 1 if (self.unfulfilled_counts[task_id] > self._config.max_reschedules): reschedule = False if reschedule: self.add(task) self.run_succeeded &= status in (DONE, SUSPENDED) return
def make_task(self, task_module : str) -> luigi.Task: """ Reifies the luigi.Task object from its name and saved parameters """ return load_task( task_module, self.task_family, self.params)
def test_external_tasks_loadable(self): task = load_task("luigi", "ExternalTask", {}) assert (isinstance(task, luigi.ExternalTask))
def make_task(self, task_module: str) -> luigi.Task: """ Reifies the luigi.Task object from its name and saved parameters """ return load_task( task_module, self.name, {name: param.value for name, param in self.parameters.items()})
def make_task(self, task_module: str) -> luigi.Task: """ Reifies the luigi.Task object from its name and saved parameters """ return load_task(task_module, self.task_family, self.params)