Beispiel #1
0
    def _finish_task_execution(self):
        """
        Call on_finish hook, save metadata and verify products exist

        Raises
        ------
        TaskBuildError
            If ``self.product.exists()`` returns False after the metadata
            has been saved
        Exception
            Whatever the on_finish hook raises; the task is marked as
            ``TaskStatus.Errored`` before the exception is re-raised
        """
        # run on finish first, if this fails, we don't want to save metadata
        try:
            self._run_on_finish()
        except Exception:
            # NOTE: we also set the status in Task._build, which runs during
            # DAG.build() - but setting it here as well to prevent a DAG
            # inconsistent state when the user calls Task.build() directly
            self.exec_status = TaskStatus.Errored
            raise

        self.product.metadata.update(str(self.source))

        # For most Products, it's ok to do this check before
        # saving metadata, but not for GenericProduct, since the way
        # exists() works is by checking metadata, so we have to do it
        # here, after saving metadata
        if not self.product.exists():
            if isinstance(self.product, MetaProduct):
                # MetaProduct is the multi-product container, so we can only
                # tell that *some* product in it is missing
                raise TaskBuildError(
                    'Error building task "{}": '
                    'the task ran successfully but at least one of the '
                    'products in "{}" does not exist yet '
                    '(task.product.exists() returned False). '.format(
                        self.name, self.product))
            else:
                raise TaskBuildError(
                    'Error building task "{}": '
                    'the task ran successfully but product '
                    '"{}" does not exist yet '
                    '(task.product.exists() returned False). '.format(
                        self.name, self.product))
Beispiel #2
0
    def _run(self):
        """
        Execute the task, or download its product when the status is
        ``TaskStatus.WaitingDownload``, and report how long it took

        Returns
        -------
        The object built by ``TaskReport.with_data`` with the task name,
        ``ran=True`` and the elapsed time in seconds

        Raises
        ------
        TaskBuildError
            If the task status is WaitingRender, Aborted or Skipped, or if
            downloading the product fails
        """
        status = self.exec_status

        # the render step resolves params, upstream and product, there is
        # nothing we can run without it
        if status == TaskStatus.WaitingRender:
            raise TaskBuildError(
                f'Error building task "{self.name}". '
                'Cannot build task that has not been '
                'rendered, call DAG.render() first')

        if status == TaskStatus.Aborted:
            raise TaskBuildError(
                f'Attempted to run task "{self.name}", whose '
                'status is TaskStatus.Aborted')

        if status == TaskStatus.Skipped:
            raise TaskBuildError(
                f'Attempted to run task "{self.name}", whose '
                'status TaskStatus.Skipped. Render again and '
                'set force=True if you want to force '
                'execution')

        # NOTE: should we fetch metadata here? we need the latest version
        # before building
        self._logger.info('Starting execution: %s', repr(self))

        started_at = datetime.now()

        _ensure_parents_exist(self.product)

        must_download = self.exec_status == TaskStatus.WaitingDownload

        if not must_download:
            # NOTE: should we validate status here?
            # (i.e., check it's WaitingExecution)
            self.run()
        else:
            try:
                self.product.download()
            except Exception as e:
                raise TaskBuildError(
                    f'Error downloading Product {self.product!r} '
                    f'from task {self!r}. Check the full traceback above for '
                    'details') from e

        finished_at = datetime.now()
        elapsed = (finished_at - started_at).total_seconds()
        self._logger.info(f'Done. Operation took {elapsed:.1f} seconds')

        # TODO: also check that the Products were updated: products that did
        # not exist must exist now and existing ones must have a recent
        # timestamp. Maybe fetch metadata again and validate?

        return TaskReport.with_data(name=self.name, ran=True, elapsed=elapsed)
Beispiel #3
0
    def _post_run_actions(self):
        """
        Call on_finish hook, save metadata, verify products exist and upload
        product

        When the task status is ``TaskStatus.WaitingDownload`` the product
        was downloaded instead of built: metadata is cleared (so it is
        reloaded) rather than updated, and nothing is uploaded.

        Raises
        ------
        TaskBuildError
            If ``self.product.exists()`` returns False after the metadata
            step
        Exception
            Whatever the on_finish hook raises; the task is marked as
            ``TaskStatus.Errored`` before the exception is re-raised
        """
        # run on finish first, if this fails, we don't want to save metadata
        try:
            self._run_on_finish()
        except Exception:
            # NOTE: we also set the status in Task._build, which runs during
            # DAG.build() - but setting it here as well to prevent a DAG
            # inconsistent state when the user calls Task.build() directly
            self.exec_status = TaskStatus.Errored
            raise

        if self.exec_status == TaskStatus.WaitingDownload:
            # clear current metadata to force reload
            # and ensure the task uses the downloaded metadata
            self.product.metadata.clear()
        else:
            self.product.metadata.update(
                source_code=str(self.source),
                params=self.params.to_json_serializable(params_only=True))

        # For most Products, it's ok to do this check before
        # saving metadata, but not for GenericProduct, since the way
        # exists() works is by checking metadata, so we have to do it
        # here, after saving metadata
        if not self.product.exists():
            if isinstance(self.product, MetaProduct):
                # MetaProduct is the multi-product container, so we can only
                # tell that *some* product in it is missing
                raise TaskBuildError(
                    'Error building task "{}": '
                    'the task ran successfully but at least one of the '
                    'products in "{}" does not exist yet '
                    '(task.product.exists() returned False). '.format(
                        self.name, self.product))
            else:
                raise TaskBuildError(
                    'Error building task "{}": '
                    'the task ran successfully but product '
                    '"{}" does not exist yet '
                    '(task.product.exists() returned False). '.format(
                        self.name, self.product))

        # downloaded products already live in remote storage
        if self.exec_status != TaskStatus.WaitingDownload:
            self.product.upload()
Beispiel #4
0
    def debug(self, kind='ipdb'):
        """
        Call the task's function under a debugger.

        Parameters
        ----------
        kind : str ('ipdb' or 'pdb')
            Debugger to use: 'ipdb' for the IPython debugger, 'pdb' for the
            one in the standard library

        Raises
        ------
        ValueError
            If ``kind`` is not one of the supported values
        TaskBuildError
            If the task has not been rendered yet

        Notes
        -----
        Be careful when debugging tasks. If the task has run
        successfully, you overwrite products but don't save the
        updated source code, your DAG will enter an inconsistent state where
        the metadata won't match the overwritten product.
        """
        valid_kinds = {'ipdb', 'pdb'}

        if kind not in valid_kinds:
            raise ValueError(
                f'"kind" must be one of {valid_kinds}, got: "{kind}"')

        if self.exec_status == TaskStatus.WaitingRender:
            raise TaskBuildError(
                f'Error in task "{self.name}". '
                'Cannot call task.debug() on a task that has '
                'not been '
                'rendered, call DAG.render() first')

        # upstream products are only unserialized when there is both an
        # upstream entry and an unserializer
        must_unserialize = 'upstream' in self.params and self._unserializer

        if must_unserialize:
            call_kwargs = _unserialize_params(self.params, self._unserializer)
        else:
            call_kwargs = self.params.to_dict()

        # with a serializer in place, the function does not take "product"
        if self._serializer:
            call_kwargs.pop('product')

        if kind == 'pdb':
            pdb.runcall(self.source.primitive, **call_kwargs)
        elif kind == 'ipdb':
            try:
                # this seems to only work in a Terminal
                debugger = TerminalPdb()
            except AttributeError:
                # this works in a Jupyter notebook
                debugger = Pdb()

            debugger.runcall(self.source.primitive, **call_kwargs)
Beispiel #5
0
    def build(self, force=False, catch_exceptions=True):
        """Build this task in isolation

        Tasks normally execute through DAG.build(), but a single task can be
        built directly as long as none of its upstream dependencies is
        waiting to be rendered; if any is, call DAG.render() first.

        Examples
        --------
        >>> from pathlib import Path
        >>> from ploomber import DAG
        >>> from ploomber.tasks import PythonCallable
        >>> from ploomber.products import File
        >>> def fn(product):
        ...     Path(str(product)).touch()
        >>> PythonCallable(fn, File('file.txt'), dag=DAG()).build()

        Parameters
        ----------
        force : bool, default=False
            Forwarded to ``self.render``
        catch_exceptions : bool, default=True
            Forwarded to ``self._build``

        Returns
        -------
        The first element of the pair returned by ``self._build`` (the
        task report)

        Raises
        ------
        TaskBuildError
            If an upstream dependency has not been rendered, the build
            itself failed or the build succeeded but the on_finish hook
            failed

        DAGBuildEarlyStop
            If any task or on_finish hook raises a DAGBuildEarlyStop error
        """
        for dependency in self.upstream.values():
            if dependency.exec_status == TaskStatus.WaitingRender:
                raise TaskBuildError(
                    f'Cannot directly build task "{self.name}" as it '
                    'has upstream dependencies, call '
                    'dag.render() first')

        # public entry point for running tasks in isolation: render again so
        # the cached product status does not interfere with other render
        # calls
        self.render(force=force)

        report, _metadata = self._build(catch_exceptions=catch_exceptions)

        self.product.metadata.clear()

        return report
Beispiel #6
0
    def develop(self, app='notebook', args=None):
        """Open the task's function in Jupyter for interactive editing

        Parameters
        ----------
        app : str, {'notebook', 'lab'}, default='notebook'
            Jupyter application to launch

        args : str
            Extra command line arguments for the Jupyter application, split
            with ``shlex.split``

        Raises
        ------
        ValueError
            If ``app`` is not one of the supported values
        TaskBuildError
            If the task has not been rendered yet

        Notes
        -----
        Cells whose first line is an empty comment ("#"), will be removed
        when exporting back to the function, you can use this for temporary,
        exploratory work

        Be careful when developing tasks interactively. If the task has run
        successfully, you overwrite products but don't save the
        updated source code, your DAG will enter an inconsistent state where
        the metadata won't match the overwritten product.
        """
        supported_apps = {'notebook', 'lab'}

        if app not in supported_apps:
            raise ValueError(
                f'"app" must be one of {supported_apps}, got: "{app}"')

        if self.exec_status == TaskStatus.WaitingRender:
            raise TaskBuildError(
                f'Error in task "{self.name}". '
                'Cannot call task.develop() on a task that has '
                'not been '
                'rendered, call DAG.render() first')

        with self._interactive_developer() as tmp:
            try:
                extra_args = shlex.split(args or '')
                command = ['jupyter', app, tmp] + extra_args
                subprocess.run(command, check=True)
            except KeyboardInterrupt:
                # Ctrl+C is the regular way of closing the application
                print(f'Jupyter {app} application closed...')
Beispiel #7
0
def _from_ipynb(path_to_nb, extension, nbconvert_exporter_name):
    """
    Convert a notebook in place using nbconvert

    Parameters
    ----------
    path_to_nb : str or pathlib.Path
        Path to the notebook; the file is overwritten with the converted
        content
    extension : str
        File extension (e.g. '.html'). With the dots removed, it is used as
        the exporter name when ``nbconvert_exporter_name`` is None
    nbconvert_exporter_name : str or None
        Name of the nbconvert exporter to use

    Returns
    -------
    str or bytes
        The converted content, as returned by ``nbconvert.export``

    Raises
    ------
    TaskBuildError
        If no exporter name was passed and the extension does not match any
        exporter
    """
    if nbconvert_exporter_name is not None:
        exporter = nbconvert.get_exporter(nbconvert_exporter_name)
    else:
        try:
            exporter = nbconvert.get_exporter(extension.replace('.', ''))
        except ValueError as e:
            # chain the original error so the lookup failure is not lost
            raise TaskBuildError('Could not determine nbconvert exporter '
                                 'either specify in the path extension '
                                 'or pass a valid exporter name in '
                                 'the NotebookRunner constructor, '
                                 'valid exporters are: {}'.format(
                                     nbconvert.get_export_names())) from e

    path = Path(path_to_nb)

    nb = nbformat.v4.reads(path.read_text())
    # exclude_input=True: only the outputs end up in the exported file
    content, _ = nbconvert.export(exporter, nb, exclude_input=True)

    # binary exporters (e.g. PDF) return bytes, which write_text rejects
    if isinstance(content, bytes):
        path.write_bytes(content)
    else:
        path.write_text(content)

    return content
Beispiel #8
0
    def run(self):
        """
        Execute the rendered notebook with papermill and convert the output

        The rendered notebook is written to a temporary file and executed
        into the product's path with an ``.ipynb`` suffix. On success, the
        output is renamed to the product's path and ``self._converter`` is
        called. The temporary file is always removed.

        Raises
        ------
        TaskBuildError
            If ``pm.execute_notebook`` fails; the partially executed notebook
            is left at the ``.ipynb`` path for debugging
        """
        if isinstance(self.product, MetaProduct):
            path_to_out = Path(str(self.product[self.nb_product_key]))
        else:
            path_to_out = Path(str(self.product))

        # we will run the notebook with this extension, regardless of the
        # user's choice, if any error happens, this will allow them to debug
        # we will change the extension after the notebook runs successfully
        path_to_out_ipynb = path_to_out.with_suffix('.ipynb')

        fd, tmp = tempfile.mkstemp('.ipynb')
        os.close(fd)

        tmp = Path(tmp)

        # everything that runs while the temporary file exists goes inside
        # this block, so the file is removed even if preparing the run fails
        try:
            tmp.write_text(self.source.nb_str_rendered)

            if self.local_execution:
                self.papermill_params['cwd'] = str(self.source.loc.parent)

            # create parent folders if they don't exist
            path_to_out_ipynb.parent.mkdir(parents=True, exist_ok=True)

            try:
                # no need to pass parameters, they are already there
                pm.execute_notebook(str(tmp), str(path_to_out_ipynb),
                                    **self.papermill_params)
            except Exception as e:
                raise TaskBuildError('An error occurred when calling'
                                     ' papermil.execute_notebook, partially'
                                     ' executed notebook with traceback '
                                     'available at {}'.format(
                                         str(path_to_out_ipynb))) from e
        finally:
            tmp.unlink()

        path_to_out_ipynb.rename(path_to_out)
        self._converter.convert()
Beispiel #9
0
    def _build(self, catch_exceptions):
        """
        Private API for building DAGs, this is what executors should call.
        It does not render the task (that must happen through dag.render()),
        it runs the task, runs the post-run actions and keeps the execution
        status up-to-date

        Parameters
        ----------
        catch_exceptions : bool
            If True, catches exceptions during execution and shows a chained
            exception at the end: [original exception] then
            [exception with context info]. Set it to False when debugging
            tasks to drop-in a debugging session at the failing line.

        Returns
        -------
        tuple
            The value returned by ``self._run`` and the product metadata as
            a dictionary

        Raises
        ------
        TaskBuildError
            If running the task, the post-run actions or the on_failure hook
            fail
        DAGBuildEarlyStop
            If running the task or the post-run actions raise
            DAGBuildEarlyStop
        """
        if not catch_exceptions:
            report = self._run()
            self._post_run_actions()
            return report, self.product.metadata.to_dict()

        run_error = None

        try:
            # TODO: this calls download, if this happens. should
            # hooks be executed when downloading? if so, we could
            # change the ran? column from the task report to something
            # like:
            # ran/downloaded/skipped and use that to determine if we should
            # run hooks
            report = self._run()
        except Exception as e:
            run_msg = f'Error building task "{self.name}"'
            self._logger.exception(run_msg)
            self.exec_status = TaskStatus.Errored

            # if there isn't anything left to run, raise exception here
            if self.on_failure is None:
                if isinstance(e, DAGBuildEarlyStop):
                    raise DAGBuildEarlyStop(
                        f'Stopping task {self.name} gracefully') from e

                # FIXME: this makes the traceback longer, consider
                # removing it. The only information this nested
                # exception provides is the name of the task but we
                # are still able to provide that if the executor
                # has the option to capture exceptions turned on.
                raise TaskBuildError(run_msg) from e

            # keep a reference, the hook runs outside this except block
            run_error = e

        # error building task: run the on_failure hook, then report
        if run_error is not None:
            try:
                self._run_on_failure()
            except Exception as e:
                hook_msg = ('Exception when running on_failure '
                            f'for task "{self.name}": {e}')
                self._logger.exception(hook_msg)
                raise TaskBuildError(hook_msg) from e

            if isinstance(run_error, DAGBuildEarlyStop):
                raise DAGBuildEarlyStop(
                    f'Stopping task {self.name} gracefully') from run_error

            raise TaskBuildError(
                f'Error building task "{self.name}"') from run_error

        try:
            self._post_run_actions()
        except Exception as e:
            self.exec_status = TaskStatus.Errored
            finish_msg = ('Exception when running on_finish '
                          f'for task "{self.name}": {e}')
            self._logger.exception(finish_msg)

            if isinstance(e, DAGBuildEarlyStop):
                raise DAGBuildEarlyStop(
                    f'Stopping task {self.name} gracefully') from e

            raise TaskBuildError(finish_msg) from e

        # successful task execution, on_finish hook execution,
        # metadata saving and upload
        self.exec_status = TaskStatus.Executed

        return report, self.product.metadata.to_dict()
Beispiel #10
0
    def build(self, force=False, catch_exceptions=True):
        """Build a single task

        Although Tasks are primarily designed to execute via DAG.build(), it
        is possible to do so in isolation. However, this only works if the
        task does not have any unrendered upstream dependencies, if that's the
        case, you should call DAG.render() before calling Task.build()

        If the task is waiting for upstream dependencies, it can still be
        built when every upstream task is either skipped or waiting to be
        downloaded; in that case, the pending upstream products are
        downloaded first.

        Examples
        --------
        >>> from pathlib import Path
        >>> from ploomber import DAG
        >>> from ploomber.tasks import PythonCallable
        >>> from ploomber.products import File
        >>> def fn(product):
        ...     Path(str(product)).touch()
        >>> PythonCallable(fn, File('file.txt'), dag=DAG()).build()

        Parameters
        ----------
        force : bool, default=False
            Forwarded to ``self.render``
        catch_exceptions : bool, default=True
            Forwarded to ``self._build``

        Returns
        -------
        The first element of the pair returned by ``self._build`` (the
        task report)

        Raises
        ------
        TaskBuildError
            If the task failed to build because it has unrendered or missing
            upstream dependencies, the build itself failed or the build
            succeeded but the on_finish hook failed

        DAGBuildEarlyStop
            If any task or on_finish hook raises a DAGBuildEarlyStop error
        """
        # This is the public API for users who'd like to run tasks in
        # isolation, we have to make sure we clear product cache status,
        # otherwise this will interfere with other render calls
        self.render(force=force)

        upstream_tasks = list(self.upstream.values())

        if any(t.exec_status == TaskStatus.WaitingRender
               for t in upstream_tasks):
            raise TaskBuildError('Cannot directly build task "{}" as it '
                                 'has upstream dependencies, call '
                                 'dag.render() first'.format(self.name))

        # we can execute an individual task if missing up-to-date upstream
        # dependencies exist in remote storage
        if self.exec_status == TaskStatus.WaitingUpstream:
            available = {TaskStatus.Skipped, TaskStatus.WaitingDownload}

            # iterate in upstream order (instead of a set) so the names in
            # the error message always appear in the same order
            missing = [
                t.name for t in upstream_tasks
                if t.exec_status not in available
            ]

            if missing:
                raise TaskBuildError(
                    f'Cannot build task {self.name!r} because '
                    'the following upstream dependencies are '
                    f'missing: {missing!r}. Execute upstream '
                    'tasks first. If upstream tasks generate File(s) and you '
                    'configured a File.client, you may also upload '
                    'up-to-date copies to remote storage and they will be '
                    'automatically downloaded')

            download_products_in_parallel(
                t for t in upstream_tasks
                if t.exec_status == TaskStatus.WaitingDownload)

        # at this point the task must be WaitingDownload or WaitingExecution
        res, _ = self._build(catch_exceptions=catch_exceptions)

        self.product.metadata.clear()
        return res
Beispiel #11
0
    def build(self, force=False):
        """Run the task if needed by checking its dependencies

        The task runs when ``force`` is True, when the product does not
        exist, when the product has no metadata or when the product reports
        outdated data or code dependencies.

        Parameters
        ----------
        force : bool, default=False
            If True, skip all checks and run the task

        Returns
        -------
        Task
            The task itself (``self``). The outcome of the build (name,
            whether it ran and elapsed seconds) is stored in
            ``self.build_report``

        Raises
        ------
        TaskBuildError
            If the product does not exist after running the task or if the
            on_finish hook fails
        Exception
            Whatever ``self.run`` raises, after calling the on_failure hook
            (if any)
        """
        # TODO: if this is run in a task that has upstream dependencies
        # it will fail with a useless error since self.params does not have
        # upstream yet (added after rendering)

        # NOTE: should i fetch metadata here? I need to make sure I have
        # the latest before building

        self._logger.info(f'-----\nChecking {repr(self)}....')

        # do not run unless some of the conditions below match...
        run = False
        elapsed = 0

        if force:
            self._logger.info('Forcing run, skipping checks...')
            run = True
        else:
            # not forcing, need to check dependencies...
            p_exists = self.product.exists()

            # check dependencies only if the product exists and there is
            # metadata
            if p_exists and self.product.metadata is not None:

                outdated_data_deps = self.product._outdated_data_dependencies()
                outdated_code_dep = self.product._outdated_code_dependency()

                self._logger.info('Checking dependencies...')

                if outdated_data_deps:
                    run = True
                    self._logger.info('Outdated data deps...')
                else:
                    self._logger.info('Up-to-date data deps...')

                if outdated_code_dep:
                    run = True
                    self._logger.info('Outdated code dep...')
                else:
                    self._logger.info('Up-to-date code dep...')
            else:
                run = True

                # just log why it will run
                if not p_exists:
                    self._logger.info('Product does not exist...')

                if self.product.metadata is None:
                    self._logger.info('Product metadata is None...')

                self._logger.info('Running...')

        if run:
            self._logger.info(f'Starting execution: {repr(self)}')

            then = datetime.now()

            try:
                self.run()
            except Exception:
                if self.on_failure:
                    # only format the traceback when a hook will consume it
                    tb = traceback.format_exc()

                    try:
                        self.on_failure(self, tb)
                    except Exception:
                        # a failing hook must not hide the original error
                        self._logger.exception('Error executing on_failure '
                                               'callback')
                # bare raise re-raises the error from self.run() untouched
                raise

            now = datetime.now()
            elapsed = (now - then).total_seconds()
            self._logger.info(f'Done. Operation took {elapsed:.1f} seconds')

            # update metadata
            self.product.timestamp = datetime.now().timestamp()
            self.product.stored_source_code = self.source_code
            self.product.save_metadata()

            # TODO: also check that the Products were updated:
            # if they did not exist, they must exist now, if they already
            # exist, timestamp must be recent equal to the datetime.now()
            # used. maybe run fetch metadata again and validate?

            if not self.product.exists():
                raise TaskBuildError(f'Error building task "{self}": '
                                     'the task ran successfully but product '
                                     f'"{self.product}" does not exist yet '
                                     '(task.product.exist() returned False)')

            if self.on_finish:
                try:
                    # pass the client only to hooks that declare it
                    if 'client' in inspect.getfullargspec(self.on_finish).args:
                        self.on_finish(self, client=self.client)
                    else:
                        self.on_finish(self)

                except Exception as e:
                    # chain the hook's error so its traceback is preserved
                    raise TaskBuildError('Exception when running on_finish '
                                         'for task {}: {}'.format(self,
                                                                  e)) from e

        else:
            self._logger.info(f'No need to run {repr(self)}')

        self._logger.info('-----\n')

        # the status is set to Executed whether the task ran or was skipped
        self._status = TaskStatus.Executed

        for t in self._get_downstream():
            t._update_status()

        self.build_report = Row({'name': self.name, 'Ran?': run,
                                 'Elapsed (s)': elapsed, })

        return self
Beispiel #12
0
    def _build(self, catch_exceptions):
        """
        Private API for building DAGs, this is what executors should call.
        It does not render the task (that must happen through dag.render()),
        it runs the task, finishes its execution and keeps the execution
        status up-to-date

        Parameters
        ----------
        catch_exceptions : bool
            If True, catches exceptions during execution and shows a chained
            exception at the end: [original exception] then
            [exception with context info]. Set it to False when debugging
            tasks to drop-in a debugging session at the failing line.

        Returns
        -------
        tuple
            The value returned by ``self._run`` and the product metadata as
            a dictionary

        Raises
        ------
        TaskBuildError
            If running the task, finishing its execution or the on_failure
            hook fail
        DAGBuildEarlyStop
            If running the task or finishing its execution raise
            DAGBuildEarlyStop
        """
        if not catch_exceptions:
            report = self._run()
            self._finish_task_execution()
            return report, self.product.metadata.to_dict()

        failure = None

        try:
            report = self._run()
        except Exception as e:
            build_msg = f'Error building task "{self.name}"'
            self._logger.exception(build_msg)
            self.exec_status = TaskStatus.Errored

            # if there isn't anything left to run, raise exception here
            if self.on_failure is None:
                if isinstance(e, DAGBuildEarlyStop):
                    raise DAGBuildEarlyStop(
                        f'Stopping task {self.name} gracefully') from e

                # FIXME: this makes the traceback longer, consider
                # removing it. The only information this nested
                # exception provides is the name of the task but we
                # are still able to provide that if the executor
                # has the option to capture exceptions turned on.
                raise TaskBuildError(build_msg) from e

            # keep a reference, the hook runs outside this except block
            failure = e

        if failure is None:
            try:
                # FIXME: move metadata saving and product checking,
                # the error message is misleading: this not only runs the
                # hook, it also saves metadata and checks that the product
                # exists
                self._finish_task_execution()
            except Exception as e:
                self.exec_status = TaskStatus.Errored
                finish_msg = ('Exception when running on_finish '
                              f'for task "{self.name}": {e}')
                self._logger.exception(finish_msg)

                if isinstance(e, DAGBuildEarlyStop):
                    raise DAGBuildEarlyStop(
                        f'Stopping task {self.name} gracefully') from e

                raise TaskBuildError(finish_msg) from e

            self.exec_status = TaskStatus.Executed
            return report, self.product.metadata.to_dict()

        # the build failed and there is an on_failure hook to run
        try:
            self._run_on_failure()
        except Exception as e:
            hook_msg = ('Exception when running on_failure '
                        f'for task "{self.name}": {e}')
            self._logger.exception(hook_msg)
            raise TaskBuildError(hook_msg) from e

        if isinstance(failure, DAGBuildEarlyStop):
            raise DAGBuildEarlyStop(
                f'Stopping task {self.name} gracefully') from failure

        raise TaskBuildError(
            f'Error building task "{self.name}"') from failure