def _finish_task_execution(self):
    """
    Call on_finish hook, save metadata and verify products exist

    Raises
    ------
    TaskBuildError
        If ``self.product.exists()`` returns False after metadata has been
        saved
    Exception
        Anything raised by the on_finish hook is re-raised unchanged, after
        setting the task status to ``TaskStatus.Errored``
    """
    # run on finish first, if this fails, we don't want to save metadata
    try:
        self._run_on_finish()
    except Exception:
        # NOTE: we also set the status in Task._build, which runs during
        # DAG.build() - but setting it here as well to prevent DAG
        # inconsistent state when the user calls Task.build() directly
        self.exec_status = TaskStatus.Errored
        raise

    self.product.metadata.update(str(self.source))

    # For most Products, it's ok to do this check before
    # saving metadata, but not for GenericProduct, since the way
    # exists() works is by checking metadata, so we have to do it
    # here, after saving metadata
    if not self.product.exists():
        # MetaProduct is the container used when a task generates more
        # than one product, so it gets the "at least one of" wording; a
        # plain product gets the singular wording
        if isinstance(self.product, MetaProduct):
            raise TaskBuildError(
                'Error building task "{}": '
                'the task ran successfully but at least one of the '
                'products in "{}" does not exist yet '
                '(task.product.exists() returned False). '.format(
                    self.name, self.product))
        else:
            raise TaskBuildError(
                'Error building task "{}": '
                'the task ran successfully but product '
                '"{}" does not exist yet '
                '(task.product.exists() returned False). '.format(
                    self.name, self.product))
def _run(self):
    """
    Run the task, or download its product, and report how long it took.

    The task is refused (TaskBuildError) when its status is WaitingRender,
    Aborted or Skipped. When the status is WaitingDownload the product is
    downloaded instead of executing the task; any other status calls
    ``self.run()``.

    Returns
    -------
    The object built by ``TaskReport.with_data`` with the task name,
    ``ran=True`` and the elapsed time in seconds

    Raises
    ------
    TaskBuildError
        If the status does not allow execution, or if downloading the
        product fails (chained to the original exception)
    """
    status = self.exec_status

    # the render step resolves params, upstream and product, so nothing
    # can be executed before it happens
    if status == TaskStatus.WaitingRender:
        raise TaskBuildError(f'Error building task "{self.name}". '
                             'Cannot build task that has not been '
                             'rendered, call DAG.render() first')

    if status == TaskStatus.Aborted:
        raise TaskBuildError(f'Attempted to run task "{self.name}", whose '
                             'status is TaskStatus.Aborted')

    if status == TaskStatus.Skipped:
        raise TaskBuildError(f'Attempted to run task "{self.name}", whose '
                             'status TaskStatus.Skipped. Render again and '
                             'set force=True if you want to force '
                             'execution')

    # NOTE: should we fetch metadata here? We need to make sure we have
    # the latest before building
    self._logger.info('Starting execution: %s', repr(self))

    started = datetime.now()

    _ensure_parents_exist(self.product)

    # NOTE: should we validate status here?
    # (i.e., check it's WaitingExecution)
    if self.exec_status != TaskStatus.WaitingDownload:
        self.run()
    else:
        try:
            self.product.download()
        except Exception as exc:
            raise TaskBuildError(
                f'Error downloading Product {self.product!r} '
                f'from task {self!r}. Check the full traceback above for '
                'details') from exc

    elapsed = (datetime.now() - started).total_seconds()
    self._logger.info(f'Done. Operation took {elapsed:.1f} seconds')

    # TODO: also check that the Products were updated:
    # if they did not exist, they must exist now, if they already
    # exist, timestamp must be recent equal to the datetime.now()
    # used. maybe run fetch metadata again and validate?

    return TaskReport.with_data(name=self.name, ran=True, elapsed=elapsed)
def _post_run_actions(self):
    """
    Call on_finish hook, save metadata, verify products exist and upload
    product

    When the task status is ``TaskStatus.WaitingDownload`` the metadata is
    cleared (instead of updated) and the product is not uploaded.

    Raises
    ------
    TaskBuildError
        If ``self.product.exists()`` returns False after metadata has been
        saved/cleared
    Exception
        Anything raised by the on_finish hook is re-raised unchanged, after
        setting the task status to ``TaskStatus.Errored``
    """
    # run on finish first, if this fails, we don't want to save metadata
    try:
        self._run_on_finish()
    except Exception:
        # NOTE: we also set the status in Task._build, which runs during
        # DAG.build() - but setting it here as well to prevent DAG
        # inconsistent state when the user calls Task.build() directly
        self.exec_status = TaskStatus.Errored
        raise

    if self.exec_status == TaskStatus.WaitingDownload:
        # clear current metadata to force reload
        # and ensure the task uses the downloaded metadata
        self.product.metadata.clear()
    else:
        self.product.metadata.update(
            source_code=str(self.source),
            params=self.params.to_json_serializable(params_only=True))

    # For most Products, it's ok to do this check before
    # saving metadata, but not for GenericProduct, since the way
    # exists() works is by checking metadata, so we have to do it
    # here, after saving metadata
    if not self.product.exists():
        # MetaProduct is the container used when a task generates more
        # than one product, so it gets the "at least one of" wording; a
        # plain product gets the singular wording
        if isinstance(self.product, MetaProduct):
            raise TaskBuildError(
                'Error building task "{}": '
                'the task ran successfully but at least one of the '
                'products in "{}" does not exist yet '
                '(task.product.exists() returned False). '.format(
                    self.name, self.product))
        else:
            raise TaskBuildError(
                'Error building task "{}": '
                'the task ran successfully but product '
                '"{}" does not exist yet '
                '(task.product.exists() returned False). '.format(
                    self.name, self.product))

    # a downloaded product already lives in remote storage, only upload
    # products generated by actually running the task
    if self.exec_status != TaskStatus.WaitingDownload:
        self.product.upload()
def debug(self, kind='ipdb'):
    """
    Execute the task's callable under a debugger.

    Parameters
    ----------
    kind : str ('ipdb' or 'pdb')
        Debugger to use: 'ipdb' for the IPython debugger or 'pdb' for the
        one in the standard library

    Raises
    ------
    ValueError
        If ``kind`` is not one of the supported debuggers
    TaskBuildError
        If the task has not been rendered yet

    Notes
    -----
    Be careful when debugging tasks. If the task has run successfully,
    you overwrite products but don't save the updated source code, your
    DAG will enter an inconsistent state where the metadata won't match
    the overwritten product.
    """
    opts = {'ipdb', 'pdb'}

    if kind not in opts:
        raise ValueError('"kind" must be one of {}, got: "{}"'.format(
            opts, kind))

    if self.exec_status == TaskStatus.WaitingRender:
        raise TaskBuildError(f'Error in task "{self.name}". '
                             'Cannot call task.debug() on a task that has '
                             'not been '
                             'rendered, call DAG.render() first')

    # upstream products are unserialized only when there is something to
    # unserialize and an unserializer to do it with
    needs_unserializing = 'upstream' in self.params and self._unserializer

    if needs_unserializing:
        call_kwargs = _unserialize_params(self.params, self._unserializer)
    else:
        call_kwargs = self.params.to_dict()

    if self._serializer:
        call_kwargs.pop('product')

    if kind == 'pdb':
        pdb.runcall(self.source.primitive, **call_kwargs)
        return

    # kind == 'ipdb'
    try:
        # this seems to only work in a Terminal
        debugger = TerminalPdb()
    except AttributeError:
        # this works in a Jupyter notebook
        debugger = Pdb()

    debugger.runcall(self.source.primitive, **call_kwargs)
def build(self, force=False, catch_exceptions=True):
    """Build this task on its own

    Tasks normally run through DAG.build(), but a single task can be
    built in isolation as long as none of its upstream dependencies is
    still waiting to be rendered. If any of them is, call DAG.render()
    before calling Task.build()

    Examples
    --------
    >>> from pathlib import Path
    >>> from ploomber import DAG
    >>> from ploomber.tasks import PythonCallable
    >>> from ploomber.products import File
    >>> def fn(product):
    ...     Path(str(product)).touch()
    >>> PythonCallable(fn, File('file.txt'), dag=DAG()).build()

    Returns
    -------
    The first element of the pair returned by ``self._build`` (the build
    report for this task)

    Raises
    ------
    TaskBuildError
        If an upstream dependency has not been rendered, the build itself
        failed or the build succeeded but the on_finish hook failed
    DAGBuildEarlyStop
        If any task or on_finish hook raises a DAGBuildEarlyStop error
    """
    for dependency in self.upstream.values():
        if dependency.exec_status == TaskStatus.WaitingRender:
            raise TaskBuildError(f'Cannot directly build task "{self.name}" '
                                 'as it '
                                 'has upstream dependencies, call '
                                 'dag.render() first')

    # This is the public API for users who'd like to run tasks in
    # isolation, we have to make sure we clear product cache status,
    # otherwise this will interfere with other render calls
    self.render(force=force)

    report, _ = self._build(catch_exceptions=catch_exceptions)
    self.product.metadata.clear()

    return report
def develop(self, app='notebook', args=None):
    """Open the task's function in Jupyter for interactive editing

    Parameters
    ----------
    app : str, {'notebook', 'lab'}, default='notebook'
        Jupyter application to launch
    args : str
        Extra command line arguments for the selected Jupyter application
        (split with shlex before being passed)

    Raises
    ------
    ValueError
        If ``app`` is not a supported application
    TaskBuildError
        If the task has not been rendered yet

    Notes
    -----
    Cells whose first line is an empty comment ("#"), will be removed
    when exporting back to the function, you can use this for temporary,
    exploratory work

    Be careful when developing tasks interactively. If the task has run
    successfully, you overwrite products but don't save the updated
    source code, your DAG will enter an inconsistent state where the
    metadata won't match the overwritten product.
    """
    apps = {'notebook', 'lab'}

    if app not in apps:
        raise ValueError('"app" must be one of {}, got: "{}"'.format(
            apps, app))

    not_rendered = self.exec_status == TaskStatus.WaitingRender

    if not_rendered:
        raise TaskBuildError(
            f'Error in task "{self.name}". '
            'Cannot call task.develop() on a task that has '
            'not been '
            'rendered, call DAG.render() first')

    with self._interactive_developer() as tmp:
        try:
            command = ['jupyter', app, tmp, *shlex.split(args or '')]
            subprocess.run(command, check=True)
        except KeyboardInterrupt:
            # the user stopped the Jupyter server with Ctrl+C
            print(f'Jupyter {app} application closed...')
def _from_ipynb(path_to_nb, extension, nbconvert_exporter_name):
    """
    Convert a notebook in place using nbconvert

    The notebook at ``path_to_nb`` is read, exported (with
    ``exclude_input=True``) and the exported content overwrites the file
    at the same path.

    Parameters
    ----------
    path_to_nb : str or pathlib.Path
        Location of the .ipynb content to convert (overwritten)
    extension : str
        File extension (e.g. '.html'); used as the exporter name (dots
        removed) when ``nbconvert_exporter_name`` is None
    nbconvert_exporter_name : str or None
        Exporter name, takes precedence over ``extension``

    Returns
    -------
    str or bytes
        The exported content, as returned by ``nbconvert.export``

    Raises
    ------
    TaskBuildError
        If no exporter name was passed and looking up an exporter from the
        extension raises ValueError
    """
    if nbconvert_exporter_name is not None:
        exporter = nbconvert.get_exporter(nbconvert_exporter_name)
    else:
        try:
            exporter = nbconvert.get_exporter(extension.replace('.', ''))
        except ValueError as e:
            raise TaskBuildError('Could not determine nbconvert exporter '
                                 'either specify in the path extension '
                                 'or pass a valid exporter name in '
                                 'the NotebookRunner constructor, '
                                 'valid expoers are: {}'.format(
                                     nbconvert.get_export_names())) from e

    path = Path(path_to_nb)

    # notebook files are JSON encoded as UTF-8, do not rely on the locale's
    # default encoding
    nb = nbformat.v4.reads(path.read_text(encoding='utf-8'))
    content, _ = nbconvert.export(exporter, nb, exclude_input=True)

    # binary exporters (e.g. PDF) return bytes, which write_text rejects
    if isinstance(content, bytes):
        path.write_bytes(content)
    else:
        path.write_text(content, encoding='utf-8')

    return content
def run(self):
    """
    Execute the rendered notebook with papermill and convert the output

    The rendered notebook source is written to a temporary .ipynb file and
    executed into the product's path using the .ipynb extension. After a
    successful execution the output is moved to the product's real path
    and ``self._converter.convert()`` is called.

    Raises
    ------
    TaskBuildError
        If ``pm.execute_notebook`` raises; chained to the original
        exception. The partially executed notebook is left at the .ipynb
        path for debugging
    """
    if isinstance(self.product, MetaProduct):
        path_to_out = Path(str(self.product[self.nb_product_key]))
    else:
        path_to_out = Path(str(self.product))

    # we will run the notebook with this extension, regardless of the
    # user's choice, if any error happens, this will allow them to debug
    # we will change the extension after the notebook runs successfully
    path_to_out_ipynb = path_to_out.with_suffix('.ipynb')

    fd, tmp = tempfile.mkstemp('.ipynb')
    os.close(fd)
    tmp = Path(tmp)

    # everything that happens while the temporary file exists goes inside
    # this try, so the file is removed no matter which step fails
    try:
        tmp.write_text(self.source.nb_str_rendered)

        if self.local_execution:
            self.papermill_params['cwd'] = str(self.source.loc.parent)

        # create parent folders if they don't exist
        path_to_out_ipynb.parent.mkdir(parents=True, exist_ok=True)

        try:
            # no need to pass parameters, they are already there
            pm.execute_notebook(str(tmp), str(path_to_out_ipynb),
                                **self.papermill_params)
        except Exception as e:
            raise TaskBuildError('An error occurred when calling'
                                 ' papermil.execute_notebook, partially'
                                 ' executed notebook with traceback '
                                 'available at {}'.format(
                                     str(path_to_out_ipynb))) from e
    finally:
        tmp.unlink()

    # Path.rename fails on Windows if the target exists (e.g. the product
    # from a previous run), Path.replace overwrites on every platform
    path_to_out_ipynb.replace(path_to_out)
    self._converter.convert()
def _build(self, catch_exceptions):
    """
    Private API for building DAGs. This is what executors should call.
    Unlike the public method, this one does not call render, as it
    should happen via a dag.render() call. It takes care of running the
    task and updating status accordingly

    Parameters
    ----------
    catch_exceptions : bool
        If True, catches exceptions during execution and shows a chained
        exception at the end: [original exception] then
        [exception with context info]. Set it to False when debugging
        tasks to drop-in a debugging session at the failing line.

    Returns
    -------
    tuple
        ``(res, metadata)`` where ``res`` is whatever ``self._run()``
        returned and ``metadata`` is ``self.product.metadata.to_dict()``

    Raises
    ------
    TaskBuildError
        Only when ``catch_exceptions`` is True: if ``self._run()``,
        ``self._post_run_actions()`` or ``self._run_on_failure()`` raise
    DAGBuildEarlyStop
        Only when ``catch_exceptions`` is True: if ``self._run()`` or
        ``self._post_run_actions()`` raise DAGBuildEarlyStop, a new one is
        raised chained to it
    """
    if not catch_exceptions:
        # no wrapping: any exception propagates as-is and exec_status is
        # not modified in this branch
        res = self._run()
        self._post_run_actions()
        return res, self.product.metadata.to_dict()
    else:
        try:
            # TODO: this calls download, if this happens. should
            # hooks be executed when downloading? if so, we could
            # change the ran? column from the task report to something
            # like:
            # ran/downloaded/skipped and use that to determine if we should
            # run hooks
            res = self._run()
        except Exception as e:
            msg = 'Error building task "{}"'.format(self.name)
            self._logger.exception(msg)
            self.exec_status = TaskStatus.Errored

            # if there isn't anything left to run, raise exception here
            if self.on_failure is None:
                if isinstance(e, DAGBuildEarlyStop):
                    raise DAGBuildEarlyStop(
                        'Stopping task {} gracefully'.format(
                            self.name)) from e
                else:
                    # FIXME: this makes the traceback longer, consider
                    # removing it. The only information this nested
                    # exception provides is the name of the task but we
                    # are still able to provide that if the executor
                    # has the option to capture exceptions turned on.
                    # An option to consider is to
                    raise TaskBuildError(msg) from e

            # there is an on_failure hook to run: keep a reference to the
            # exception, since "e" is unbound once the except block ends
            build_success = False
            build_exception = e
        else:
            build_success = True
            build_exception = None

        if build_success:
            try:
                self._post_run_actions()
            except Exception as e:
                self.exec_status = TaskStatus.Errored
                msg = ('Exception when running on_finish '
                       'for task "{}": {}'.format(self.name, e))
                self._logger.exception(msg)

                if isinstance(e, DAGBuildEarlyStop):
                    raise DAGBuildEarlyStop(
                        'Stopping task {} gracefully'.format(
                            self.name)) from e
                else:
                    raise TaskBuildError(msg) from e
            else:
                # successful task execution, on_finish hook execution,
                # metadata saving and upload
                self.exec_status = TaskStatus.Executed

            return res, self.product.metadata.to_dict()
        # error building task
        else:
            try:
                self._run_on_failure()
            except Exception as e:
                msg = ('Exception when running on_failure '
                       'for task "{}": {}'.format(self.name, e))
                self._logger.exception(msg)
                raise TaskBuildError(msg) from e

            # the on_failure hook finished, now report the original build
            # error (this branch always raises)
            if isinstance(build_exception, DAGBuildEarlyStop):
                raise DAGBuildEarlyStop(
                    'Stopping task {} gracefully'.format(
                        self.name)) from build_exception
            else:
                msg = 'Error building task "{}"'.format(self.name)
                raise TaskBuildError(msg) from build_exception
def build(self, force=False, catch_exceptions=True):
    """Build a single task

    Although Tasks are primarily designed to execute via DAG.build(), it
    is possible to do so in isolation. However, this only works if the
    task does not have any unrendered upstream dependencies, if that's the
    case, you should call DAG.render() before calling Task.build()

    Examples
    --------
    >>> from pathlib import Path
    >>> from ploomber import DAG
    >>> from ploomber.tasks import PythonCallable
    >>> from ploomber.products import File
    >>> def fn(product):
    ...     Path(str(product)).touch()
    >>> PythonCallable(fn, File('file.txt'), dag=DAG()).build()

    Returns
    -------
    The first element of the pair returned by ``self._build`` (the build
    report for this task)

    Raises
    ------
    TaskBuildError
        If the task failed to build because it has unrendered or missing
        upstream dependencies, the build itself failed or build succeeded
        but on_finish hook failed
    DAGBuildEarlyStop
        If any task or on_finish hook raises a DAGBuildEarlyStop error
    """
    # This is the public API for users who'd like to run tasks in
    # isolation, we have to make sure we clear product cache status,
    # otherwise this will interfere with other render calls
    self.render(force=force)

    upstream_exec_status = [t.exec_status for t in self.upstream.values()]

    if any(exec_status == TaskStatus.WaitingRender
           for exec_status in upstream_exec_status):
        raise TaskBuildError('Cannot directly build task "{}" as it '
                             'has upstream dependencies, call '
                             'dag.render() first'.format(self.name))

    # we can execute an individual task if missing up-to-date upstream
    # dependencies exist in remote storage
    if self.exec_status == TaskStatus.WaitingUpstream:
        ok = {
            t
            for t in self.upstream.values() if t.exec_status in
            {TaskStatus.Skipped, TaskStatus.WaitingDownload}
        }

        not_ok = set(self.upstream.values()) - ok

        if not_ok:
            # NOTE: every fragment but the last must end with a space,
            # adjacent literals are concatenated verbatim
            raise TaskBuildError(
                f'Cannot build task {self.name!r} because '
                'the following upstream dependencies are '
                f'missing: {[t.name for t in not_ok]!r}. Execute upstream '
                'tasks first. If upstream tasks generate File(s) and you '
                'configured a File.client, you may also upload '
                'up-to-date copies to remote storage and they will be '
                'automatically downloaded')

        download_products_in_parallel(
            t for t in ok if t.exec_status == TaskStatus.WaitingDownload)

    # at this point the task must be WaitingDownload or WaitingExecution
    res, _ = self._build(catch_exceptions=catch_exceptions)

    self.product.metadata.clear()
    return res
def build(self, force=False):
    """Run the task if needed by checking its dependencies

    The task runs when ``force`` is True, when the product does not exist,
    when the product has no metadata, or when the product reports outdated
    data or code dependencies. After a run, product metadata is saved and
    the on_finish hook (if any) is called. In every case the task status is
    set to Executed, downstream tasks update their status and
    ``self.build_report`` is stored.

    Parameters
    ----------
    force : bool, default=False
        Run the task without checking dependencies

    Returns
    -------
    self
        The task itself; the report (name, whether it ran, elapsed
        seconds) is available in ``self.build_report``

    Raises
    ------
    TaskBuildError
        If the task ran but the product does not exist afterwards, or if
        the on_finish hook raises (chained to the original exception)
    Exception
        Anything raised by ``self.run()`` is re-raised unchanged after
        calling the on_failure hook (if any)
    """
    # TODO: if this is run in a task that has upstream dependencies
    # it will fail with a useless error since self.params does not have
    # upstream yet (added after rendering)

    # NOTE: should we fetch metadata here? We need to make sure we have
    # the latest before building
    self._logger.info(f'-----\nChecking {repr(self)}....')

    # do not run unless some of the conditions below match...
    run = False
    elapsed = 0

    if force:
        self._logger.info('Forcing run, skipping checks...')
        run = True
    else:
        # not forcing, need to check dependencies...
        p_exists = self.product.exists()

        # check dependencies only if the product exists and there is
        # metadata
        if p_exists and self.product.metadata is not None:
            outdated_data_deps = self.product._outdated_data_dependencies()
            outdated_code_dep = self.product._outdated_code_dependency()

            self._logger.info('Checking dependencies...')

            if outdated_data_deps:
                run = True
                self._logger.info('Outdated data deps...')
            else:
                self._logger.info('Up-to-date data deps...')

            if outdated_code_dep:
                run = True
                self._logger.info('Outdated code dep...')
            else:
                self._logger.info('Up-to-date code dep...')
        else:
            run = True

            # just log why it will run
            if not p_exists:
                self._logger.info('Product does not exist...')

            if self.product.metadata is None:
                self._logger.info('Product metadata is None...')

            self._logger.info('Running...')

    if run:
        self._logger.info(f'Starting execution: {repr(self)}')

        then = datetime.now()

        try:
            self.run()
        except Exception:
            if self.on_failure:
                # only format the traceback when there is a hook to
                # pass it to
                tb = traceback.format_exc()

                try:
                    self.on_failure(self, tb)
                except Exception:
                    # best effort: a failing hook must not hide the
                    # original error
                    self._logger.exception('Error executing on_failure '
                                           'callback')
            # bare raise re-raises the original exception with its
            # traceback untouched
            raise

        now = datetime.now()
        elapsed = (now - then).total_seconds()
        self._logger.info(f'Done. Operation took {elapsed:.1f} seconds')

        # update metadata
        self.product.timestamp = datetime.now().timestamp()
        self.product.stored_source_code = self.source_code
        self.product.save_metadata()

        # TODO: also check that the Products were updated:
        # if they did not exist, they must exist now, if they already
        # exist, timestamp must be recent equal to the datetime.now()
        # used. maybe run fetch metadata again and validate?

        if not self.product.exists():
            raise TaskBuildError(f'Error building task "{self}": '
                                 'the task ran successfully but product '
                                 f'"{self.product}" does not exist yet '
                                 '(task.product.exists() returned False)')

        if self.on_finish:
            try:
                # hooks may optionally request the client by declaring
                # a "client" argument
                if 'client' in inspect.getfullargspec(self.on_finish).args:
                    self.on_finish(self, client=self.client)
                else:
                    self.on_finish(self)
            except Exception as e:
                # chain explicitly so the traceback shows the hook error
                # as the direct cause
                raise TaskBuildError('Exception when running on_finish '
                                     'for task {}: {}'.format(self,
                                                              e)) from e
    else:
        self._logger.info(f'No need to run {repr(self)}')

    self._logger.info('-----\n')

    self._status = TaskStatus.Executed

    for t in self._get_downstream():
        t._update_status()

    self.build_report = Row({
        'name': self.name,
        'Ran?': run,
        'Elapsed (s)': elapsed,
    })

    return self
def _build(self, catch_exceptions):
    """
    Private API for building DAGs. This is what executors should call.
    Unlike the public method, this one does not call render, as it
    should happen via a dag.render() call. It takes care of running the
    task and updating status accordingly

    Parameters
    ----------
    catch_exceptions : bool
        If True, catches exceptions during execution and shows a chained
        exception at the end: [original exception] then
        [exception with context info]. Set it to False when debugging
        tasks to drop-in a debugging session at the failing line.

    Returns
    -------
    tuple
        ``(res, metadata)`` where ``res`` is whatever ``self._run()``
        returned and ``metadata`` is ``self.product.metadata.to_dict()``

    Raises
    ------
    TaskBuildError
        Only when ``catch_exceptions`` is True: if ``self._run()``,
        ``self._finish_task_execution()`` or ``self._run_on_failure()``
        raise
    DAGBuildEarlyStop
        Only when ``catch_exceptions`` is True: if ``self._run()`` or
        ``self._finish_task_execution()`` raise DAGBuildEarlyStop, a new
        one is raised chained to it
    """
    if not catch_exceptions:
        # no wrapping: any exception propagates as-is and exec_status is
        # not modified in this branch
        res = self._run()
        self._finish_task_execution()
        return res, self.product.metadata.to_dict()
    else:
        try:
            res = self._run()
        except Exception as e:
            msg = 'Error building task "{}"'.format(self.name)
            self._logger.exception(msg)
            self.exec_status = TaskStatus.Errored

            # if there isn't anything left to run, raise exception here
            if self.on_failure is None:
                if isinstance(e, DAGBuildEarlyStop):
                    raise DAGBuildEarlyStop(
                        'Stopping task {} gracefully'.format(
                            self.name)) from e
                else:
                    # FIXME: this makes the traceback longer, consider
                    # removing it. The only information this nested
                    # exception provides is the name of the task but we
                    # are still able to provide that if the executor
                    # has the option to capture exceptions turned on.
                    # An option to consider is to
                    raise TaskBuildError(msg) from e

            # there is an on_failure hook to run: keep a reference to the
            # exception, since "e" is unbound once the except block ends
            build_success = False
            build_exception = e
        else:
            build_success = True
            build_exception = None

        if build_success:
            try:
                # FIXME: move metadata saving and product checking,
                # the error message is misleading
                # this not only runs the hook, but also
                # calls save metadata and checks that the product exists
                self._finish_task_execution()
            except Exception as e:
                self.exec_status = TaskStatus.Errored
                msg = ('Exception when running on_finish '
                       'for task "{}": {}'.format(self.name, e))
                self._logger.exception(msg)

                if isinstance(e, DAGBuildEarlyStop):
                    raise DAGBuildEarlyStop(
                        'Stopping task {} gracefully'.format(
                            self.name)) from e
                else:
                    raise TaskBuildError(msg) from e
            else:
                self.exec_status = TaskStatus.Executed

            return res, self.product.metadata.to_dict()
        else:
            try:
                self._run_on_failure()
            except Exception as e:
                msg = ('Exception when running on_failure '
                       'for task "{}": {}'.format(self.name, e))
                self._logger.exception(msg)
                raise TaskBuildError(msg) from e

            # the on_failure hook finished, now report the original build
            # error (this branch always raises)
            if isinstance(build_exception, DAGBuildEarlyStop):
                raise DAGBuildEarlyStop(
                    'Stopping task {} gracefully'.format(
                        self.name)) from build_exception
            else:
                msg = 'Error building task "{}"'.format(self.name)
                raise TaskBuildError(msg) from build_exception