Example #1
    def _get_work_task_id(self, get_work_response):
        if get_work_response['task_id'] is not None:
            return get_work_response['task_id']
        elif 'batch_id' in get_work_response:
            try:
                task = load_task(
                    module=get_work_response.get('task_module'),
                    task_name=get_work_response['task_family'],
                    params_str=get_work_response['task_params'],
                )
            except Exception as ex:
                self._handle_task_load_error(ex, get_work_response['batch_task_ids'])
                self.run_succeeded = False
                return None

            self._scheduler.add_task(
                worker=self._id,
                task_id=task.task_id,
                module=get_work_response.get('task_module'),
                family=get_work_response['task_family'],
                params=task.to_str_params(),
                status=RUNNING,
                batch_id=get_work_response['batch_id'],
            )
            return task.task_id
Example #2
    def _get_work(self):
        if self._stop_requesting_work:
            return None, 0, 0, 0
        logger.debug("Asking scheduler for work...")
        r = self._scheduler.get_work(worker=self._id, host=self.host, assistant=self._assistant)
        n_pending_tasks = r["n_pending_tasks"]
        task_id = r["task_id"]
        running_tasks = r["running_tasks"]
        n_unique_pending = r["n_unique_pending"]

        self._get_work_response_history.append(dict(task_id=task_id, running_tasks=running_tasks))

        if task_id is not None and task_id not in self._scheduled_tasks:
            logger.info("Did not schedule %s, will load it dynamically", task_id)

            try:
                # TODO: we should obtain the module name from the server!
                self._scheduled_tasks[task_id] = load_task(
                    module=r.get("task_module"), task_name=r["task_family"], params_str=r["task_params"]
                )
            except TaskClassException as ex:
                msg = "Cannot find task for %s" % task_id
                logger.exception(msg)
                subject = "Luigi: %s" % msg
                error_message = notifications.wrap_traceback(ex)
                notifications.send_error_email(subject, error_message)
                self._add_task(
                    worker=self._id, task_id=task_id, status=FAILED, runnable=False, assistant=self._assistant
                )
                task_id = None
                self.run_succeeded = False

        return task_id, running_tasks, n_pending_tasks, n_unique_pending
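
For orientation, the scheduler reply consumed above is a plain dict. The keys below are exactly the ones the code reads; the values are illustrative placeholders, not output from a real scheduler:

    # Hypothetical get_work() reply; only the key names are taken from the code above.
    r = {
        "n_pending_tasks": 3,
        "n_unique_pending": 2,
        "running_tasks": [{"task_id": "OtherTask__abc123", "worker": "worker-2"}],
        "task_id": "MyTask__2024_01_01_deadbeef",  # None when nothing is runnable
        "task_module": "my_pipeline.tasks",        # may be absent, hence r.get(...)
        "task_family": "MyTask",
        "task_params": {"date": "2024-01-01"},
    }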
Example #3
    def _get_work(self):
        logger.debug("Asking scheduler for work...")
        r = self._scheduler.get_work(worker=self._id,
                                     host=self.host,
                                     assistant=self._assistant)
        n_pending_tasks = r['n_pending_tasks']
        task_id = r['task_id']
        running_tasks = r['running_tasks']
        n_unique_pending = r['n_unique_pending']

        if task_id is not None and task_id not in self._scheduled_tasks:
            logger.info('Did not schedule %s, will load it dynamically',
                        task_id)

            try:
                # TODO: we should obtain the module name from the server!
                self._scheduled_tasks[task_id] = \
                    load_task(module=r.get('task_module'),
                              task_name=r['task_family'],
                              params_str=r['task_params'])
            except TaskClassException as ex:
                msg = 'Cannot find task for %s' % task_id
                logger.exception(msg)
                subject = 'Luigi: %s' % msg
                error_message = notifications.wrap_traceback(ex)
                notifications.send_error_email(subject, error_message)
                self._scheduler.add_task(worker=self._id,
                                         task_id=task_id,
                                         status=FAILED,
                                         runnable=False,
                                         assistant=self._assistant)
                task_id = None
                self.run_succeeded = False

        return task_id, running_tasks, n_pending_tasks, n_unique_pending
Example #4
    def _get_work_task_id(self, get_work_response):
        if get_work_response.get('task_id') is not None:
            return get_work_response['task_id']
        elif 'batch_id' in get_work_response:
            try:
                task = load_task(
                    module=get_work_response.get('task_module'),
                    task_name=get_work_response['task_family'],
                    params_str=get_work_response['task_params'],
                )
            except Exception as ex:
                self._handle_task_load_error(ex, get_work_response['batch_task_ids'])
                self.run_succeeded = False
                return None

            self._scheduler.add_task(
                worker=self._id,
                task_id=task.task_id,
                module=get_work_response.get('task_module'),
                family=get_work_response['task_family'],
                params=task.to_str_params(),
                status=RUNNING,
                batch_id=get_work_response['batch_id'],
            )
            return task.task_id
        else:
            return None
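
When the scheduler hands out batched work instead of a single runnable task, the response carries a 'batch_id' plus the ids of the tasks being combined. A hypothetical response of that shape, with made-up values, which would take the elif branch above:

    get_work_response = {
        "task_id": None,                                     # no single task: batch mode
        "batch_id": "1572999720.1234",                       # echoed back via add_task(...)
        "batch_task_ids": ["MyTask__a1b2", "MyTask__c3d4"],  # member tasks of the batch
        "task_module": "my_pipeline.tasks",
        "task_family": "MyTask",
        "task_params": {"dates": '["2024-01-01", "2024-01-02"]'},
    }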
Example #5
    def _get_work(self):
        logger.debug("Asking scheduler for work...")
        r = self._scheduler.get_work(worker=self._id, host=self.host, assistant=self._assistant)
        n_pending_tasks = r['n_pending_tasks']
        task_id = r['task_id']
        running_tasks = r['running_tasks']
        n_unique_pending = r['n_unique_pending']

        if task_id is not None and task_id not in self._scheduled_tasks:
            logger.info('Did not schedule %s, will load it dynamically', task_id)

            try:
                # TODO: we should obtain the module name from the server!
                self._scheduled_tasks[task_id] = \
                    load_task(module=r.get('task_module'),
                              task_name=r['task_family'],
                              params_str=r['task_params'])
            except TaskClassException as ex:
                msg = 'Cannot find task for %s' % task_id
                logger.exception(msg)
                subject = 'Luigi: %s' % msg
                error_message = notifications.wrap_traceback(ex)
                notifications.send_error_email(subject, error_message)
                self._scheduler.add_task(worker=self._id, task_id=task_id, status=FAILED, runnable=False,
                                         assistant=self._assistant)
                task_id = None
                self.run_succeeded = False

        return task_id, running_tasks, n_pending_tasks, n_unique_pending
Example #6
    def _get_work(self):
        if self._stop_requesting_work:
            return GetWorkResponse(None, 0, 0, 0, 0, WORKER_STATE_DISABLED)

        if self.worker_processes > 0:
            logger.debug("Asking scheduler for work...")
            r = self._scheduler.get_work(
                worker=self._id,
                host=self.host,
                assistant=self._assistant,
                current_tasks=list(self._running_tasks.keys()),
            )
        else:
            logger.debug("Checking if tasks are still pending")
            r = self._scheduler.count_pending(worker=self._id)

        running_tasks = r['running_tasks']
        task_id = self._get_work_task_id(r)

        self._get_work_response_history.append({
            'task_id': task_id,
            'running_tasks': running_tasks,
        })

        if task_id is not None and task_id not in self._scheduled_tasks:
            logger.info('Did not schedule %s, will load it dynamically',
                        task_id)

            try:
                # TODO: we should obtain the module name from the server!
                self._scheduled_tasks[task_id] = \
                    load_task(module=r.get('task_module'),
                              task_name=r['task_family'],
                              params_str=r['task_params'])
            except TaskClassException as ex:
                self._handle_task_load_error(ex, [task_id])
                task_id = None
                self.run_succeeded = False

        if task_id is not None and 'batch_task_ids' in r:
            batch_tasks = filter(None, [
                self._scheduled_tasks.get(batch_id)
                for batch_id in r['batch_task_ids']
            ])
            self._batch_running_tasks[task_id] = batch_tasks

        return GetWorkResponse(
            task_id=task_id,
            running_tasks=running_tasks,
            n_pending_tasks=r['n_pending_tasks'],
            n_unique_pending=r['n_unique_pending'],

            # TODO: For a tiny amount of time (a month?) we'll keep forwards compatibility
            #  That is you can use a newer client than server (Sep 2016)
            n_pending_last_scheduled=r.get('n_pending_last_scheduled', 0),
            worker_state=r.get('worker_state', WORKER_STATE_ACTIVE),
        )
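
Judging by how it is constructed above (six fields, also passed positionally in the early return), GetWorkResponse is a plain namedtuple along these lines:

    import collections

    GetWorkResponse = collections.namedtuple('GetWorkResponse', (
        'task_id',
        'running_tasks',
        'n_pending_tasks',
        'n_unique_pending',
        'n_pending_last_scheduled',
        'worker_state',
    ))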
Example #7
    def _get_work(self):
        if self._stop_requesting_work:
            return None, 0, 0, 0, WORKER_STATE_DISABLED
        logger.debug("Asking scheduler for work...")
        r = self._scheduler.get_work(
            worker=self._id,
            host=self.host,
            assistant=self._assistant,
            current_tasks=list(self._running_tasks.keys()),
        )
        n_pending_tasks = r['n_pending_tasks']
        running_tasks = r['running_tasks']
        n_unique_pending = r['n_unique_pending']
        # TODO: For a tiny amount of time (a month?) we'll keep forwards compatibility
        # That is you can use a newer client than server (Sep 2016)
        worker_state = r.get('worker_state',
                             WORKER_STATE_ACTIVE)  # state according to server!
        task_id = self._get_work_task_id(r)

        self._get_work_response_history.append({
            'task_id': task_id,
            'running_tasks': running_tasks,
        })

        if task_id is not None and task_id not in self._scheduled_tasks:
            logger.info('Did not schedule %s, will load it dynamically',
                        task_id)

            try:
                # TODO: we should obtain the module name from the server!
                self._scheduled_tasks[task_id] = \
                    load_task(module=r.get('task_module'),
                              task_name=r['task_family'],
                              params_str=r['task_params'])
            except TaskClassException as ex:
                msg = 'Cannot find task for %s' % task_id
                logger.exception(msg)
                subject = 'Luigi: %s' % msg
                error_message = notifications.wrap_traceback(ex)
                notifications.send_error_email(subject, error_message)
                self._add_task(worker=self._id,
                               task_id=task_id,
                               status=FAILED,
                               runnable=False,
                               assistant=self._assistant)
                task_id = None
                self.run_succeeded = False

        if task_id is not None and 'batch_task_ids' in r:
            batch_tasks = filter(None, [
                self._scheduled_tasks.get(batch_id)
                for batch_id in r['batch_task_ids']
            ])
            self._batch_running_tasks[task_id] = batch_tasks

        return task_id, running_tasks, n_pending_tasks, n_unique_pending, worker_state
Example #8
    def _get_work(self):
        if self._stop_requesting_work:
            return GetWorkResponse(None, 0, 0, 0, 0, WORKER_STATE_DISABLED)

        if self.worker_processes > 0:
            logger.debug("Asking scheduler for work...")
            r = self._scheduler.get_work(
                worker=self._id,
                host=self.host,
                assistant_groups=self._config.assistant_groups if self._assistant else None,
                current_tasks=list(self._running_tasks.keys()),
            )
        else:
            logger.debug("Checking if tasks are still pending")
            r = self._scheduler.count_pending(worker=self._id)

        running_tasks = r['running_tasks']
        task_id = self._get_work_task_id(r)

        if self.show_execution_summary:
            self._get_work_response_history.append({
                'task_id': task_id,
                'running_tasks': running_tasks,
            })

        if task_id is not None and task_id not in self._scheduled_tasks:
            logger.info('Did not schedule %s, will load it dynamically', task_id)

            try:
                # TODO: we should obtain the module name from the server!
                self._scheduled_tasks[task_id] = \
                    load_task(module=r.get('task_module'),
                              task_name=r['task_family'],
                              params_str=r['task_params'])
            except Exception as ex:
                self._handle_task_load_error(ex, [task_id])
                task_id = None
                self.run_succeeded = False

        if task_id is not None and 'batch_task_ids' in r:
            batch_tasks = filter(None, [
                self._scheduled_tasks.get(batch_id) for batch_id in r['batch_task_ids']])
            self._batch_running_tasks[task_id] = batch_tasks

        return GetWorkResponse(
            task_id=task_id,
            running_tasks=running_tasks,
            n_pending_tasks=r['n_pending_tasks'],
            n_unique_pending=r['n_unique_pending'],

            # TODO: For a tiny amount of time (a month?) we'll keep forwards compatibility
            #  That is you can use a newer client than server (Sep 2016)
            n_pending_last_scheduled=r.get('n_pending_last_scheduled', 0),
            worker_state=r.get('worker_state', WORKER_STATE_ACTIVE),
        )
Example #9
    def _get_work(self):
        if self._stop_requesting_work:
            return None, 0, 0, 0
        logger.debug("Asking scheduler for work...")
        r = self._scheduler.get_work(
            worker=self._id,
            host=self.host,
            assistant=self._assistant,
            current_tasks=list(self._running_tasks.keys()),
        )
        n_pending_tasks = r['n_pending_tasks']
        running_tasks = r['running_tasks']
        n_unique_pending = r['n_unique_pending']
        task_id = self._get_work_task_id(r)

        self._get_work_response_history.append({
            'task_id': task_id,
            'running_tasks': running_tasks,
        })

        if task_id is not None and task_id not in self._scheduled_tasks:
            logger.info('Did not schedule %s, will load it dynamically', task_id)

            try:
                # TODO: we should obtain the module name from the server!
                self._scheduled_tasks[task_id] = \
                    load_task(module=r.get('task_module'),
                              task_name=r['task_family'],
                              params_str=r['task_params'])
            except TaskClassException as ex:
                msg = 'Cannot find task for %s' % task_id
                logger.exception(msg)
                subject = 'Luigi: %s' % msg
                error_message = notifications.wrap_traceback(ex)
                notifications.send_error_email(subject, error_message)
                self._add_task(worker=self._id, task_id=task_id, status=FAILED, runnable=False,
                               assistant=self._assistant)
                task_id = None
                self.run_succeeded = False

        if task_id is not None and 'batch_task_ids' in r:
            batch_tasks = filter(None, [
                self._scheduled_tasks.get(batch_id) for batch_id in r['batch_task_ids']])
            self._batch_running_tasks[task_id] = batch_tasks

        return task_id, running_tasks, n_pending_tasks, n_unique_pending
Example #10
    def _get_work(self):
        if self._stop_requesting_work:
            return None, 0, 0, 0, WORKER_STATE_DISABLED
        logger.debug("Asking scheduler for work...")
        r = self._scheduler.get_work(
            worker=self._id,
            host=self.host,
            assistant=self._assistant,
            current_tasks=list(self._running_tasks.keys()),
        )
        n_pending_tasks = r['n_pending_tasks']
        running_tasks = r['running_tasks']
        n_unique_pending = r['n_unique_pending']
        # TODO: For a tiny amount of time (a month?) we'll keep forwards compatibility
        # That is you can use a newer client than server (Sep 2016)
        worker_state = r.get('worker_state', WORKER_STATE_ACTIVE)  # state according to server!
        task_id = self._get_work_task_id(r)

        self._get_work_response_history.append({
            'task_id': task_id,
            'running_tasks': running_tasks,
        })

        if task_id is not None and task_id not in self._scheduled_tasks:
            logger.info('Did not schedule %s, will load it dynamically', task_id)

            try:
                # TODO: we should obtain the module name from the server!
                self._scheduled_tasks[task_id] = \
                    load_task(module=r.get('task_module'),
                              task_name=r['task_family'],
                              params_str=r['task_params'])
            except TaskClassException as ex:
                self._handle_task_load_error(ex, [task_id])
                task_id = None
                self.run_succeeded = False

        if task_id is not None and 'batch_task_ids' in r:
            batch_tasks = filter(None, [
                self._scheduled_tasks.get(batch_id) for batch_id in r['batch_task_ids']])
            self._batch_running_tasks[task_id] = batch_tasks

        return task_id, running_tasks, n_pending_tasks, n_unique_pending, worker_state
Example #11
 def _get_work_task_id(self, get_work_response):
     if get_work_response['task_id'] is not None:
         return get_work_response['task_id']
     elif 'batch_id' in get_work_response:
         task = load_task(
             module=get_work_response.get('task_module'),
             task_name=get_work_response['task_family'],
             params_str=get_work_response['task_params'],
         )
         self._scheduler.add_task(
             worker=self._id,
             task_id=task.task_id,
             module=get_work_response.get('task_module'),
             family=get_work_response['task_family'],
             params=task.to_str_params(),
             status=RUNNING,
             batch_id=get_work_response['batch_id'],
         )
         return task.task_id
Example #12
 def _get_work_task_id(self, get_work_response):
     if get_work_response['task_id'] is not None:
         return get_work_response['task_id']
     elif 'batch_id' in get_work_response:
         task = load_task(
             module=get_work_response.get('task_module'),
             task_name=get_work_response['task_family'],
             params_str=get_work_response['task_params'],
         )
         self._scheduler.add_task(
             worker=self._id,
             task_id=task.task_id,
             module=get_work_response.get('task_module'),
             family=get_work_response['task_family'],
             params=task.to_str_params(),
             status=RUNNING,
             batch_id=get_work_response['batch_id'],
         )
         return task.task_id
Example #13
    def inflate_cls(mod_cls, params):
        """
        Reuse Luigi's service that registers task types
        so that we can dynamically instantiate an instance of this class.

        Like Luigi, we assume that mod_cls is 'module.class' and that the user
        has put their pipeline location on PYTHONPATH.

        :param mod_cls: '<module>.<class>'
        :param params:  Dictionary of parameter to value
        :return:        Instance of the task in question
        """

        mod_path = mod_cls.split('.')
        mod = '.'.join(mod_path[:-1])
        cls = mod_path[-1]

        if mod == '':
            mod = None

        task = load_task(mod, cls, params)
        return task
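
A minimal usage sketch for inflate_cls; the module and class names below are hypothetical and must be importable from PYTHONPATH, and parameter values are passed as strings because load_task() builds the task from string parameters:

    # 'my_pipeline.tasks.MyTask' is a made-up task; any Task class reachable
    # from PYTHONPATH works the same way.
    task = inflate_cls('my_pipeline.tasks.MyTask', {'date': '2024-01-01'})
    print(task.task_id)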
Example #14
    def _handle_next_task(self):
        """
        We have to catch three ways a task can be "done":

        1. normal execution: the task runs/fails and puts a result back on the queue,
        2. new dependencies: the task yielded new deps that were not complete and
           will be rescheduled and dependencies added,
        3. child process dies: we need to catch this separately.
        """
        while True:
            self._purge_children()  # Deal with subprocess failures

            try:
                task_id, status, expl, missing, new_requirements = (
                    self._task_result_queue.get(
                        timeout=self._config.wait_interval))
            except Queue.Empty:
                return

            task = self._scheduled_tasks[task_id]
            if not task or task_id not in self._running_tasks:
                continue
                # Not a running task. Probably already removed.
                # Maybe it yielded something?

            # external task if run not implemented, retry-able if config option is enabled.
            external_task_retryable = _is_external(task) and self._config.retry_external_tasks
            if status == FAILED and not external_task_retryable:
                self._email_task_failure(task, expl)

            new_deps = []
            if new_requirements:
                new_req = [load_task(module, name, params)
                           for module, name, params in new_requirements]
                for t in new_req:
                    self.add(t)
                new_deps = [t.task_id for t in new_req]

            self._add_task(worker=self._id,
                           task_id=task_id,
                           status=status,
                           expl=json.dumps(expl),
                           resources=task.process_resources(),
                           runnable=None,
                           params=task.to_str_params(),
                           family=task.task_family,
                           module=task.task_module,
                           new_deps=new_deps,
                           assistant=self._assistant,
                           retry_policy_dict=_get_retry_policy_dict(task))

            self._running_tasks.pop(task_id)

            # re-add task to reschedule missing dependencies
            if missing:
                reschedule = True

                # keep out of infinite loops by not rescheduling too many times
                for task_id in missing:
                    self.unfulfilled_counts[task_id] += 1
                    if (self.unfulfilled_counts[task_id] >
                            self._config.max_reschedules):
                        reschedule = False
                if reschedule:
                    self.add(task)

            self.run_succeeded &= (status == DONE) or (len(new_deps) > 0)
            return
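
The new_requirements consumed above are (module, family, params) triples, matching the unpacking in the list comprehension; an illustrative value with invented names:

    new_requirements = [
        ('my_pipeline.tasks', 'FetchData', {'date': '2024-01-01'}),
        ('my_pipeline.tasks', 'CleanData', {'date': '2024-01-01'}),
    ]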
Example #15
 def test_external_tasks_loadable(self):
     task = load_task("luigi", "ExternalTask", {})
     assert isinstance(task, luigi.ExternalTask)
Example #16
    def _handle_next_task(self):
        """
        We have to catch three ways a task can be "done":

        1. normal execution: the task runs/fails and puts a result back on the queue,
        2. new dependencies: the task yielded new deps that were not complete and
           will be rescheduled and dependencies added,
        3. child process dies: we need to catch this separately.
        """
        while True:
            self._purge_children()  # Deal with subprocess failures

            try:
                task_id, status, expl, missing, new_requirements = (
                    self._task_result_queue.get(
                        timeout=self._config.wait_interval))
            except Queue.Empty:
                return

            task = self._scheduled_tasks[task_id]
            if not task or task_id not in self._running_tasks:
                continue
                # Not a running task. Probably already removed.
                # Maybe it yielded something?

            # external task if run not implemented, retry-able if config option is enabled.
            external_task_retryable = _is_external(task) and self._config.retry_external_tasks
            if status == FAILED and not external_task_retryable:
                self._email_task_failure(task, expl)

            new_deps = []
            if new_requirements:
                new_req = [load_task(module, name, params)
                           for module, name, params in new_requirements]
                for t in new_req:
                    self.add(t)
                new_deps = [t.task_id for t in new_req]

            self._add_task(worker=self._id,
                           task_id=task_id,
                           status=status,
                           expl=json.dumps(expl),
                           resources=task.process_resources(),
                           runnable=None,
                           params=task.to_str_params(),
                           family=task.task_family,
                           module=task.task_module,
                           new_deps=new_deps,
                           assistant=self._assistant)

            self._running_tasks.pop(task_id)

            # re-add task to reschedule missing dependencies
            if missing:
                reschedule = True

                # keep out of infinite loops by not rescheduling too many times
                for task_id in missing:
                    self.unfulfilled_counts[task_id] += 1
                    if (self.unfulfilled_counts[task_id] >
                            self._config.max_reschedules):
                        reschedule = False
                if reschedule:
                    self.add(task)

            self.run_succeeded &= (status == DONE) or (len(new_deps) > 0)
            return
Example #17
 def make_task(self, task_module: str) -> luigi.Task:
     """ Reifies the luigi.Task object from its name and saved parameters """
     return load_task(
         task_module,
         self.name,
         {name: param.value for name, param in self.parameters.items()})
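
make_task above reads self.name and self.parameters from its owning object; a hypothetical minimal holder consistent with that usage (all names invented here):

    class SavedTaskRecord:
        """Hypothetical owner of make_task(): a task family name plus saved parameters."""

        def __init__(self, name, parameters):
            self.name = name              # registered task family, e.g. 'MyTask'
            self.parameters = parameters  # param name -> object exposing a .value string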
Example #18
    def _handle_next_task(self):
        """
        We have to catch three ways a task can be "done":

        1. normal execution: the task runs/fails and puts a result back on the queue,
        2. new dependencies: the task yielded new deps that were not complete and
           will be rescheduled and dependencies added,
        3. child process dies: we need to catch this separately.
        """
        while True:
            self._purge_children()  # Deal with subprocess failures

            try:
                task_id, status, error_message, missing, new_requirements = (
                    self._task_result_queue.get(
                        timeout=float(self._config.wait_interval)))
            except Queue.Empty:
                return

            task = self._scheduled_tasks[task_id]
            if not task or task_id not in self._running_tasks:
                continue
                # Not a running task. Probably already removed.
                # Maybe it yielded something?
            new_deps = []
            if new_requirements:
                new_req = [
                    load_task(module, name, params)
                    for module, name, params in new_requirements
                ]
                for t in new_req:
                    self.add(t)
                new_deps = [t.task_id for t in new_req]

            self._scheduler.add_task(worker=self._id,
                                     task_id=task_id,
                                     status=status,
                                     expl=error_message,
                                     resources=task.process_resources(),
                                     runnable=None,
                                     params=task.to_str_params(),
                                     family=task.task_family,
                                     module=task.task_module,
                                     new_deps=new_deps,
                                     assistant=self._assistant)

            if status == RUNNING:
                continue
            self._running_tasks.pop(task_id)

            # re-add task to reschedule missing dependencies
            if missing:
                reschedule = True

                # keep out of infinite loops by not rescheduling too many times
                for task_id in missing:
                    self.unfulfilled_counts[task_id] += 1
                    if (self.unfulfilled_counts[task_id] >
                            self._config.max_reschedules):
                        reschedule = False
                if reschedule:
                    self.add(task)

            self.run_succeeded &= status in (DONE, SUSPENDED)
            return
Example #19
 def make_task(self, task_module: str) -> luigi.Task:
     """ Reifies the luigi.Task object from its name and saved parameters """
     return load_task(
         task_module,
         self.task_family,
         self.params)
Example #20
 def test_external_tasks_loadable(self):
     task = load_task("luigi", "ExternalTask", {})
     assert isinstance(task, luigi.ExternalTask)
Example #21
 def make_task(self, task_module: str) -> luigi.Task:
     """ Reifies the luigi.Task object from its name and saved parameters """
     return load_task(
         task_module, self.name,
         {name: param.value
          for name, param in self.parameters.items()})
Example #22
 def make_task(self, task_module: str) -> luigi.Task:
     """ Reifies the luigi.Task object from its name and saved parameters """
     return load_task(task_module, self.task_family, self.params)
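
Stripped of any wrapper object, every example above reduces to the same three-argument call. A hypothetical direct invocation (module, family and parameter values are made up):

    # load_task imports the module (if given), looks the family up in Luigi's
    # task register, and instantiates the task from string parameter values.
    task = load_task(
        'my_pipeline.tasks',      # module to import so the class gets registered
        'MyTask',                 # task family name
        {'date': '2024-01-01'},   # parameter name -> string value
    )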