def run(*, inputname: str, outputname: str) -> None:
    from papermill.parameterize import parameterize_notebook
    from papermill.iorw import load_notebook_node, write_ipynb

    nb = load_notebook_node(inputname)
    nb = parameterize_notebook(nb, parameters={"data": "yyy"})
    write_ipynb(nb, outputname)
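# Usage sketch for the `run` helper above (hedged: the file names are
# illustrative, and "input.ipynb" is assumed to contain a cell tagged
# "parameters" that defines a `data` variable for papermill to override).
# Note that `run` only parameterizes and writes the notebook; executing the
# result is a separate step (e.g. papermill.execute_notebook).
if __name__ == "__main__":
    run(inputname="input.ipynb", outputname="parameterized.ipynb")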
def execute_retroactive_scaffold(notebook_path):
    nb = load_notebook_node(notebook_path)
    new_nb = copy.deepcopy(nb)

    import_cell_source = 'import dagstermill'
    import_cell = nbformat.v4.new_code_cell(source=import_cell_source)

    parameters_cell_source = 'context = dagstermill.get_context()'
    parameters_cell = nbformat.v4.new_code_cell(source=parameters_cell_source)
    parameters_cell.metadata['tags'] = ['parameters']

    new_nb.cells = [import_cell, parameters_cell] + nb.cells
    write_ipynb(new_nb, notebook_path)
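# Minimal usage sketch for the scaffold above (the path is illustrative).
# The notebook is rewritten in place with an import cell and a
# 'parameters'-tagged context cell prepended; 'parameters' is the tag
# papermill later looks for when injecting parameter overrides.
execute_retroactive_scaffold("existing_notebook.ipynb")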
def execute_notebook(notebook, output, parameters=None, kernel_name=None,
                     progress_bar=True, log_output=False):
    """Executes a single notebook locally.

    Args:
        notebook (str): Path to input notebook.
        output (str): Path to save executed notebook.
        parameters (dict): Arbitrary keyword arguments to pass to the notebook parameters.
        kernel_name (str): Name of kernel to execute the notebook against.
        progress_bar (bool): Flag for whether or not to show the progress bar.
        log_output (bool): Flag for whether or not to write notebook output to stderr.
    """
    print("Input Notebook: %s" % get_pretty_path(notebook))
    print("Output Notebook: %s" % get_pretty_path(output))
    nb = load_notebook_node(notebook)

    # Parameterize the Notebook.
    if parameters:
        _parameterize_notebook(nb, kernel_name, parameters)

    # Record specified environment variable values.
    nb.metadata.papermill['parameters'] = parameters
    nb.metadata.papermill['environment_variables'] = _fetch_environment_variables()
    nb.metadata.papermill['output_path'] = output

    # Execute the Notebook.
    t0 = datetime.datetime.utcnow()
    processor = ExecutePreprocessor(
        timeout=None,
        kernel_name=kernel_name or nb.metadata.kernelspec.name,
    )
    processor.progress_bar = progress_bar
    processor.log_output = log_output
    processor.preprocess(nb, {})
    t1 = datetime.datetime.utcnow()

    nb.metadata.papermill['start_time'] = t0.isoformat()
    nb.metadata.papermill['end_time'] = t1.isoformat()
    nb.metadata.papermill['duration'] = (t1 - t0).total_seconds()
    nb.metadata.papermill['exception'] = any(
        [cell.metadata.papermill.get('exception') for cell in nb.cells])

    # Write final Notebook to disk.
    write_ipynb(nb, output)
    raise_for_execution_errors(nb, output)
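# Hedged usage sketch for execute_notebook above. Paths and parameter values
# are illustrative; kernel_name falls back to the notebook's own kernelspec
# when omitted, and the parameters only take effect if the input notebook has
# a 'parameters'-tagged cell.
execute_notebook(
    "analysis.ipynb",          # input notebook
    "analysis_output.ipynb",   # executed copy, with papermill metadata recorded
    parameters={"alpha": 0.05, "n_samples": 1000},
    progress_bar=False,
    log_output=True,
)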
def execute_retroactive_scaffold(notebook_path, **kwargs):
    nb = load_notebook_node(notebook_path)
    new_nb = copy.deepcopy(nb)

    register_repo_info = get_register_repo_info(kwargs, allow_none=False)

    cell_source = 'import dagstermill as dm\n{import_statement}\n{declaration_statement}'.format(
        import_statement=register_repo_info.import_statement,
        declaration_statement=register_repo_info.declaration_statement,
    )

    newcell = nbformat.v4.new_code_cell(source=cell_source)
    newcell.metadata['tags'] = ['injected-repo-registration']
    new_nb.cells = [newcell] + nb.cells
    write_ipynb(new_nb, notebook_path)
def execute_notebook(notebook, output, parameters=None, kernel_name=None):
    """Executes a single notebook locally.

    Args:
        notebook (str): Path to input notebook.
        output (str): Path to save executed notebook.
        parameters (dict): Arbitrary keyword arguments to pass to the notebook parameters.
        kernel_name (str): Name of kernel to execute the notebook against.
    """
    nb = load_notebook_node(notebook)

    # Parameterize the Notebook.
    if parameters:
        _parameterize_notebook(nb, kernel_name, parameters)

    # Record specified environment variable values.
    nb.metadata.papermill['parameters'] = parameters
    nb.metadata.papermill['environment_variables'] = _fetch_environment_variables()
    nb.metadata.papermill['output_path'] = output

    # Execute the Notebook.
    t0 = datetime.datetime.utcnow()
    processor = ExecutePreprocessor(
        timeout=None,
        kernel_name=kernel_name or nb.metadata.kernelspec.name,
    )
    processor.preprocess(nb, {})
    t1 = datetime.datetime.utcnow()

    nb.metadata.papermill['start_time'] = t0.isoformat()
    nb.metadata.papermill['end_time'] = t1.isoformat()
    nb.metadata.papermill['duration'] = (t1 - t0).total_seconds()
    nb.metadata.papermill['exception'] = any(
        [cell.metadata.papermill.exception for cell in nb.cells])

    # Write final Notebook to disk.
    write_ipynb(nb, output)
def generate_notebooks_from_yml(input_nb_path: str, yml_parameters_path: str):
    """Generate a set of notebooks using the Papermill APIs.

    The input notebook must contain a cell with a `parameters` tag.
    Papermill will generate a set of notebooks based on the parameters
    defined in the input yaml.

    Args:
        input_nb_path: string
            Path to the source template notebook
        yml_parameters_path: string
            Path to the yaml spec with parameters

    Returns:
        list
            A list of (path, params_str) tuples for the generated notebooks
    """
    y = read_yaml_file(yml_parameters_path)
    input_nb = load_notebook_node(input_nb_path)

    # Create the cartesian product of the parameters
    hp_values = list(product(*y.values()))
    # Now recreate a dictionary with the correct keys
    hp_dicts = [dict(zip(y.keys(), x)) for x in hp_values]

    # For each combination of parameters generate a notebook from the template
    output_paths = list()
    for params in hp_dicts:
        params_str = print_dict_parametes(params)
        output_path = input_nb_path.replace(".ipynb", "") + params_str + ".ipynb"
        output_nb = parameterize_notebook(input_nb, parameters=params)
        # write the nb to file
        write_ipynb(output_nb, path=output_path)
        output_paths.append((output_path, params_str))

    # Return list of generated notebook paths
    return output_paths
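# Hedged sketch of driving generate_notebooks_from_yml above. Assuming a yaml
# spec such as:
#
#   lr: [0.1, 0.01]
#   layers: [2, 4]
#
# read_yaml_file would return {'lr': [0.1, 0.01], 'layers': [2, 4]}, and the
# cartesian product yields four parameter dicts, hence four notebooks next to
# the template. The exact filename suffix depends on print_dict_parametes,
# which is defined elsewhere in the module.
generated = generate_notebooks_from_yml("template.ipynb", "params.yml")
for path, params_str in generated:
    print(path, params_str)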
def raise_for_execution_errors(nb, output_path):
    error = None
    for cell in nb.cells:
        if cell.get("outputs") is None:
            continue

        for output in cell.outputs:
            if output.output_type == "error":
                error = PapermillExecutionError(
                    exec_count=cell.execution_count,
                    source=cell.source,
                    ename=output.ename,
                    evalue=output.evalue,
                    traceback=output.traceback,
                )
                break

    if error:
        # Write notebook back out with the Error Message at the top of the Notebook.
        error_msg = ERROR_MESSAGE_TEMPLATE % str(error.exec_count)
        error_msg_cell = nbformat.v4.new_markdown_cell(source=error_msg)
        nb.cells = [error_msg_cell] + nb.cells
        write_ipynb(nb, output_path)
        raise error
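# Hedged usage sketch: raise_for_execution_errors above is meant to run right
# after notebook execution, so a failed cell surfaces as a Python exception
# while the annotated notebook is still written to disk. ERROR_MESSAGE_TEMPLATE
# is defined elsewhere in the module; this stand-in is illustrative only.
ERROR_MESSAGE_TEMPLATE = "An exception was encountered at In [%s]. Scroll down for the traceback."

try:
    raise_for_execution_errors(nb, "output.ipynb")  # `nb` from load_notebook_node
except PapermillExecutionError as err:
    print("Notebook failed in cell", err.exec_count)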
def _t_fn(info, inputs):
    check.param_invariant(
        isinstance(info.context.environment_config, dict),
        'info',
        'TransformExecutionInfo must have valid environment_config',
    )

    base_dir = '/tmp/dagstermill/{run_id}/'.format(run_id=info.context.run_id)
    output_notebook_dir = os.path.join(base_dir, 'output_notebooks/')

    if not os.path.exists(output_notebook_dir):
        os.makedirs(output_notebook_dir)

    temp_path = os.path.join(
        output_notebook_dir, '{prefix}-out.ipynb'.format(prefix=str(uuid.uuid4())))

    output_log_path = os.path.join(base_dir, 'run.log')

    try:
        nb = load_notebook_node(notebook_path)
        nb_no_parameters = replace_parameters(
            info, nb, get_papermill_parameters(info, inputs, output_log_path))
        intermediate_path = os.path.join(
            output_notebook_dir, '{prefix}-inter.ipynb'.format(prefix=str(uuid.uuid4())))
        write_ipynb(nb_no_parameters, intermediate_path)

        with open(output_log_path, 'a') as f:
            f.close()

        # info.log.info("Output log path is {}".format(output_log_path))
        # info.log.info("info.context.event_callback {}".format(info.context.event_callback))

        process = subprocess.Popen(["papermill", intermediate_path, temp_path])
        # _source_nb = pm.execute_notebook(intermediate_path, temp_path)

        while process.poll() is None:  # while subprocess alive
            if info.context.event_callback:
                with open(output_log_path, 'r') as ff:
                    current_time = os.path.getmtime(output_log_path)
                    while process.poll() is None:
                        new_time = os.path.getmtime(output_log_path)
                        if new_time != current_time:
                            line = ff.readline()
                            if not line:
                                break
                            event_record_dict = json.loads(line)
                            event_record_dict['event_type'] = EventType(
                                event_record_dict['event_type'])
                            info.context.event_callback(EventRecord(**event_record_dict))
                            current_time = new_time

        if process.returncode != 0:
            # Throw event that is an execution error!
            info.log.debug("There was an error in Papermill!")
            info.log.debug('stderr was None' if process.stderr is None else process.stderr)
            exit()

        output_nb = pm.read_notebook(temp_path)

        info.log.debug(
            'Notebook execution complete for {name}. Data is {data}'.format(
                name=name, data=output_nb.data))
        info.log.info("Output notebook path is {}".format(output_notebook_dir))

        for output_def in info.solid_def.output_defs:
            if output_def.name in output_nb.data:
                value = read_value(output_def.runtime_type, output_nb.data[output_def.name])
                yield Result(value, output_def.name)
    finally:
        if do_cleanup and os.path.exists(temp_path):
            os.remove(temp_path)
def _t_fn(compute_context, inputs):
    check.inst_param(compute_context, "compute_context", SolidExecutionContext)
    check.param_invariant(
        isinstance(compute_context.run_config, dict),
        "context",
        "SystemComputeExecutionContext must have valid run_config",
    )

    system_compute_context = compute_context.get_system_context()

    with seven.TemporaryDirectory() as output_notebook_dir:
        with safe_tempfile_path() as output_log_path:
            parameterized_notebook_path = os.path.join(
                output_notebook_dir, "{prefix}-inter.ipynb".format(prefix=str(uuid.uuid4())))
            executed_notebook_path = os.path.join(
                output_notebook_dir, "{prefix}-out.ipynb".format(prefix=str(uuid.uuid4())))

            # Scaffold the registration here
            nb = load_notebook_node(notebook_path)
            nb_no_parameters = replace_parameters(
                system_compute_context,
                nb,
                get_papermill_parameters(system_compute_context, inputs, output_log_path),
            )
            write_ipynb(nb_no_parameters, parameterized_notebook_path)

            with user_code_error_boundary(
                DagstermillExecutionError,
                lambda: (
                    "Error occurred during the execution of Dagstermill solid "
                    "{solid_name}: {notebook_path}".format(
                        solid_name=name, notebook_path=notebook_path)
                ),
            ):
                try:
                    papermill_engines.register("dagstermill", DagstermillNBConvertEngine)
                    papermill.execute_notebook(
                        input_path=parameterized_notebook_path,
                        output_path=executed_notebook_path,
                        engine_name="dagstermill",
                        log_output=True,
                    )
                except Exception as exc:  # pylint: disable=broad-except
                    try:
                        with open(executed_notebook_path, "rb") as fd:
                            executed_notebook_file_handle = compute_context.resources.file_manager.write(
                                fd, mode="wb", ext="ipynb")
                            executed_notebook_materialization_path = (
                                executed_notebook_file_handle.path_desc)
                    except Exception as exc_inner:  # pylint: disable=broad-except
                        compute_context.log.warning(
                            "Error when attempting to materialize executed notebook using file manager (falling back to local): {exc}".format(
                                exc=exc_inner))
                        executed_notebook_materialization_path = executed_notebook_path

                    yield AssetMaterialization(
                        asset_key=(asset_key_prefix + [f"{name}_output_notebook"]),
                        description="Location of output notebook in file manager",
                        metadata_entries=[
                            EventMetadataEntry.fspath(
                                executed_notebook_materialization_path,
                                label="executed_notebook_path",
                            )
                        ],
                    )
                    raise exc

            system_compute_context.log.debug(
                "Notebook execution complete for {name} at {executed_notebook_path}.".format(
                    name=name,
                    executed_notebook_path=executed_notebook_path,
                ))

            executed_notebook_file_handle = None
            try:
                # use binary mode when moving the file since certain file_managers such as S3
                # may try to hash the contents
                with open(executed_notebook_path, "rb") as fd:
                    executed_notebook_file_handle = compute_context.resources.file_manager.write(
                        fd, mode="wb", ext="ipynb")
                    executed_notebook_materialization_path = executed_notebook_file_handle.path_desc
            except Exception as exc:  # pylint: disable=broad-except
                compute_context.log.warning(
                    "Error when attempting to materialize executed notebook using file manager (falling back to local): {exc}".format(
                        exc=str(exc)))
                executed_notebook_materialization_path = executed_notebook_path

            yield AssetMaterialization(
                asset_key=(asset_key_prefix + [f"{name}_output_notebook"]),
                description="Location of output notebook in file manager",
                metadata_entries=[
                    EventMetadataEntry.fspath(executed_notebook_materialization_path)
                ],
            )

            if output_notebook is not None:
                yield Output(executed_notebook_file_handle, output_notebook)

            # deferred import for perf
            import scrapbook

            output_nb = scrapbook.read_notebook(executed_notebook_path)

            for (output_name, output_def) in system_compute_context.solid_def.output_dict.items():
                data_dict = output_nb.scraps.data_dict
                if output_name in data_dict:
                    value = read_value(output_def.dagster_type, data_dict[output_name])
                    yield Output(value, output_name)

            for key, value in output_nb.scraps.items():
                if key.startswith("event-"):
                    with open(value.data, "rb") as fd:
                        yield pickle.loads(fd.read())
def _t_fn(step_context, inputs):
    check.inst_param(step_context, "step_context", SolidExecutionContext)
    check.param_invariant(
        isinstance(step_context.run_config, dict),
        "context",
        "StepExecutionContext must have valid run_config",
    )

    step_execution_context = step_context.get_step_execution_context()

    with tempfile.TemporaryDirectory() as output_notebook_dir:
        with safe_tempfile_path() as output_log_path:
            parameterized_notebook_path = os.path.join(
                output_notebook_dir, "{prefix}-inter.ipynb".format(prefix=str(uuid.uuid4())))
            executed_notebook_path = os.path.join(
                output_notebook_dir, "{prefix}-out.ipynb".format(prefix=str(uuid.uuid4())))

            # Scaffold the registration here
            nb = load_notebook_node(notebook_path)
            nb_no_parameters = replace_parameters(
                step_execution_context,
                nb,
                get_papermill_parameters(step_execution_context, inputs, output_log_path),
            )
            write_ipynb(nb_no_parameters, parameterized_notebook_path)

            try:
                papermill_engines.register("dagstermill", DagstermillNBConvertEngine)
                papermill.execute_notebook(
                    input_path=parameterized_notebook_path,
                    output_path=executed_notebook_path,
                    engine_name="dagstermill",
                    log_output=True,
                )
            except Exception as ex:  # pylint: disable=broad-except
                try:
                    with open(executed_notebook_path, "rb") as fd:
                        executed_notebook_file_handle = (
                            step_context.resources.file_manager.write(
                                fd, mode="wb", ext="ipynb"))
                        executed_notebook_materialization_path = (
                            executed_notebook_file_handle.path_desc)
                except Exception:  # pylint: disable=broad-except
                    step_context.log.warning(
                        "Error when attempting to materialize executed notebook using file manager (falling back to local): {exc}".format(
                            exc=str(serializable_error_info_from_exc_info(sys.exc_info()))))
                    executed_notebook_materialization_path = executed_notebook_path

                yield AssetMaterialization(
                    asset_key=(asset_key_prefix + [f"{name}_output_notebook"]),
                    description="Location of output notebook in file manager",
                    metadata_entries=[
                        EventMetadataEntry.fspath(
                            executed_notebook_materialization_path,
                            label="executed_notebook_path",
                        )
                    ],
                )

                # pylint: disable=no-member
                if isinstance(ex, PapermillExecutionError) and (
                        ex.ename == "RetryRequested" or ex.ename == "Failure"):
                    step_execution_context.log.warn(
                        f"Encountered raised {ex.ename} in notebook. Use dagstermill.yield_event "
                        "with RetryRequested or Failure to trigger their behavior."
                    )

                raise

            step_execution_context.log.debug(
                "Notebook execution complete for {name} at {executed_notebook_path}.".format(
                    name=name,
                    executed_notebook_path=executed_notebook_path,
                ))

            executed_notebook_file_handle = None
            try:
                # use binary mode when moving the file since certain file_managers such as S3
                # may try to hash the contents
                with open(executed_notebook_path, "rb") as fd:
                    executed_notebook_file_handle = step_context.resources.file_manager.write(
                        fd, mode="wb", ext="ipynb")
                    executed_notebook_materialization_path = executed_notebook_file_handle.path_desc
            except Exception:  # pylint: disable=broad-except
                step_context.log.warning(
                    "Error when attempting to materialize executed notebook using file manager (falling back to local): {exc}".format(
                        exc=str(serializable_error_info_from_exc_info(sys.exc_info()))))
                executed_notebook_materialization_path = executed_notebook_path

            yield AssetMaterialization(
                asset_key=(asset_key_prefix + [f"{name}_output_notebook"]),
                description="Location of output notebook in file manager",
                metadata_entries=[
                    EventMetadataEntry.fspath(executed_notebook_materialization_path)
                ],
            )

            if output_notebook is not None:
                yield Output(executed_notebook_file_handle, output_notebook)

            # deferred import for perf
            import scrapbook

            output_nb = scrapbook.read_notebook(executed_notebook_path)

            for (output_name, output_def) in step_execution_context.solid_def.output_dict.items():
                data_dict = output_nb.scraps.data_dict
                if output_name in data_dict:
                    value = read_value(output_def.dagster_type, data_dict[output_name])
                    yield Output(value, output_name)

            for key, value in output_nb.scraps.items():
                if key.startswith("event-"):
                    with open(value.data, "rb") as fd:
                        event = pickle.loads(fd.read())
                        if isinstance(event, (Failure, RetryRequested)):
                            raise event
                        else:
                            yield event
def execute_retroactive_scaffold(notebook_path):
    nb = load_notebook_node(notebook_path)
    new_nb = copy.deepcopy(nb)
    new_nb.cells = [get_import_cell(), get_parameters_cell()] + nb.cells
    write_ipynb(new_nb, notebook_path)
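# get_import_cell and get_parameters_cell are defined elsewhere in that
# module; a hedged sketch of what they plausibly return, consistent with the
# earlier scaffold variant above that builds the same cells inline (the exact
# cell sources are assumptions):
def get_import_cell():
    return nbformat.v4.new_code_cell(source='import dagstermill')

def get_parameters_cell():
    cell = nbformat.v4.new_code_cell(source='context = dagstermill.get_context()')
    cell.metadata['tags'] = ['parameters']
    return cell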
def _t_fn(transform_context, inputs):
    check.inst_param(transform_context, 'transform_context', TransformExecutionContext)
    check.param_invariant(
        isinstance(transform_context.environment_dict, dict),
        'context',
        'SystemTransformExecutionContext must have valid environment_dict',
    )

    system_transform_context = transform_context.get_system_context()

    base_dir = '/tmp/dagstermill/{run_id}/'.format(run_id=transform_context.run_id)
    output_notebook_dir = os.path.join(base_dir, 'output_notebooks/')

    if not os.path.exists(output_notebook_dir):
        os.makedirs(output_notebook_dir)

    temp_path = os.path.join(
        output_notebook_dir, '{prefix}-out.ipynb'.format(prefix=str(uuid.uuid4())))

    output_log_path = os.path.join(base_dir, 'run.log')

    try:
        nb = load_notebook_node(notebook_path)
        nb_no_parameters = replace_parameters(
            system_transform_context,
            nb,
            get_papermill_parameters(system_transform_context, inputs, output_log_path),
        )
        intermediate_path = os.path.join(
            output_notebook_dir, '{prefix}-inter.ipynb'.format(prefix=str(uuid.uuid4())))
        write_ipynb(nb_no_parameters, intermediate_path)

        with open(output_log_path, 'a') as f:
            f.close()

        process = subprocess.Popen(
            ['papermill', '--log-output', '--log-level', 'ERROR', intermediate_path, temp_path],
            stderr=subprocess.PIPE,
        )

        _stdout, stderr = process.communicate()

        # NOTE: communicate() above has already waited for the subprocess to
        # exit, so this polling loop will not iterate as written.
        while process.poll() is None:  # while subprocess alive
            if system_transform_context.event_callback:
                with open(output_log_path, 'r') as ff:
                    current_time = os.path.getmtime(output_log_path)
                    while process.poll() is None:
                        new_time = os.path.getmtime(output_log_path)
                        if new_time != current_time:
                            line = ff.readline()
                            if not line:
                                break
                            event_record_dict = json.loads(line)
                            system_transform_context.event_callback(
                                EventRecord(**event_record_dict))
                            current_time = new_time

        if process.returncode != 0:
            raise DagstermillError(
                'There was an error when Papermill tried to execute the notebook. '
                'The process stderr is \'{stderr}\''.format(stderr=stderr))

        output_nb = pm.read_notebook(temp_path)

        system_transform_context.log.debug(
            'Notebook execution complete for {name}. Data is {data}'.format(
                name=name, data=output_nb.data))

        yield Materialization(
            '{name} output notebook'.format(name=transform_context.solid.name),
            temp_path)

        for output_def in system_transform_context.solid_def.output_defs:
            if output_def.name in output_nb.data:
                value = read_value(output_def.runtime_type, output_nb.data[output_def.name])
                yield Result(value, output_def.name)
    finally:
        if do_cleanup and os.path.exists(temp_path):
            os.remove(temp_path)
def _t_fn(compute_context, inputs):
    check.inst_param(compute_context, 'compute_context', ComputeExecutionContext)
    check.param_invariant(
        isinstance(compute_context.environment_dict, dict),
        'context',
        'SystemComputeExecutionContext must have valid environment_dict',
    )

    system_compute_context = compute_context.get_system_context()

    base_dir = '/tmp/dagstermill/{run_id}/'.format(run_id=compute_context.run_id)
    output_notebook_dir = os.path.join(base_dir, 'output_notebooks/')
    mkdir_p(output_notebook_dir)

    temp_path = os.path.join(
        output_notebook_dir, '{prefix}-out.ipynb'.format(prefix=str(uuid.uuid4())))

    with safe_tempfile_path() as output_log_path:
        # Scaffold the registration here
        nb = load_notebook_node(notebook_path)
        nb_no_parameters = replace_parameters(
            system_compute_context,
            nb,
            get_papermill_parameters(system_compute_context, inputs, output_log_path),
        )
        intermediate_path = os.path.join(
            output_notebook_dir, '{prefix}-inter.ipynb'.format(prefix=str(uuid.uuid4())))
        write_ipynb(nb_no_parameters, intermediate_path)

        with user_code_error_boundary(
            DagstermillExecutionError,
            lambda: (
                'Error occurred during the execution of Dagstermill solid '
                '{solid_name}: {notebook_path}'.format(
                    solid_name=name, notebook_path=notebook_path)
            ),
        ):
            try:
                papermill_engines.register('dagstermill', DagstermillNBConvertEngine)
                papermill.execute_notebook(
                    intermediate_path, temp_path, engine_name='dagstermill', log_output=True)
            except Exception as exc:
                yield Materialization(
                    label='output_notebook',
                    description='Location of output notebook on the filesystem',
                    metadata_entries=[EventMetadataEntry.fspath(temp_path)],
                )
                raise exc

        # deferred import for perf
        import scrapbook

        output_nb = scrapbook.read_notebook(temp_path)

        system_compute_context.log.debug(
            'Notebook execution complete for {name}. Data is {data}'.format(
                name=name, data=output_nb.scraps))

        yield Materialization(
            label='output_notebook',
            description='Location of output notebook on the filesystem',
            metadata_entries=[EventMetadataEntry.fspath(temp_path)],
        )

        for (output_name, output_def) in system_compute_context.solid_def.output_dict.items():
            data_dict = output_nb.scraps.data_dict
            if output_name in data_dict:
                value = read_value(output_def.dagster_type, data_dict[output_name])
                yield Output(value, output_name)

        for key, value in output_nb.scraps.items():
            if key.startswith('event-'):
                with open(value.data, 'rb') as fd:
                    yield pickle.loads(fd.read())
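# Hedged sketch of the notebook-side counterpart that the scrap-reading loops
# above consume: inside the executed notebook, dagstermill records outputs as
# scrapbook scraps keyed by output name (via dagstermill.yield_result). A
# minimal stand-in using scrapbook directly, meant to run in a notebook cell:
#
#   import scrapbook as sb
#   sb.glue("result", 42)  # readable later via output_nb.scraps.data_dict["result"]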
def _t_fn(compute_context, inputs):
    check.inst_param(compute_context, 'compute_context', SolidExecutionContext)
    check.param_invariant(
        isinstance(compute_context.environment_dict, dict),
        'context',
        'SystemComputeExecutionContext must have valid environment_dict',
    )

    system_compute_context = compute_context.get_system_context()

    with seven.TemporaryDirectory() as output_notebook_dir:
        with safe_tempfile_path() as output_log_path:
            parameterized_notebook_path = os.path.join(
                output_notebook_dir, '{prefix}-inter.ipynb'.format(prefix=str(uuid.uuid4())))
            executed_notebook_path = os.path.join(
                output_notebook_dir, '{prefix}-out.ipynb'.format(prefix=str(uuid.uuid4())))

            # Scaffold the registration here
            nb = load_notebook_node(notebook_path)
            nb_no_parameters = replace_parameters(
                system_compute_context,
                nb,
                get_papermill_parameters(system_compute_context, inputs, output_log_path),
            )
            write_ipynb(nb_no_parameters, parameterized_notebook_path)

            with user_code_error_boundary(
                DagstermillExecutionError,
                lambda: (
                    'Error occurred during the execution of Dagstermill solid '
                    '{solid_name}: {notebook_path}'.format(
                        solid_name=name, notebook_path=notebook_path)
                ),
            ):
                try:
                    papermill_engines.register('dagstermill', DagstermillNBConvertEngine)
                    papermill.execute_notebook(
                        input_path=parameterized_notebook_path,
                        output_path=executed_notebook_path,
                        engine_name='dagstermill',
                        log_output=True,
                    )
                except Exception as exc:  # pylint: disable=broad-except
                    try:
                        with open(executed_notebook_path, 'r') as fd:
                            executed_notebook_file_handle = compute_context.file_manager.write(
                                fd, mode='w', ext='ipynb')
                            executed_notebook_materialization_path = (
                                executed_notebook_file_handle.path_desc)
                    except Exception as exc_inner:  # pylint: disable=broad-except
                        compute_context.log.warning(
                            'Error when attempting to materialize executed notebook using file manager (falling back to local): {exc}'.format(
                                exc=exc_inner))
                        executed_notebook_materialization_path = executed_notebook_path

                    yield Materialization(
                        label='output_notebook',
                        description='Location of output notebook in file manager',
                        metadata_entries=[
                            EventMetadataEntry.fspath(executed_notebook_materialization_path)
                        ],
                    )
                    raise exc

            system_compute_context.log.debug(
                'Notebook execution complete for {name} at {executed_notebook_path}.'.format(
                    name=name,
                    executed_notebook_path=executed_notebook_path,
                ))

            try:
                with open(executed_notebook_path, 'r') as fd:
                    executed_notebook_file_handle = compute_context.file_manager.write(
                        fd, mode='w', ext='ipynb')
                    executed_notebook_materialization_path = executed_notebook_file_handle.path_desc
            except Exception as exc:  # pylint: disable=broad-except
                compute_context.log.warning(
                    'Error when attempting to materialize executed notebook using file manager (falling back to local): {exc}'.format(
                        exc=str(exc)))
                executed_notebook_materialization_path = executed_notebook_path

            yield Materialization(
                label='output_notebook',
                description='Location of output notebook in file manager',
                metadata_entries=[
                    EventMetadataEntry.fspath(executed_notebook_materialization_path)
                ],
            )

            if output_notebook is not None:
                yield Output(executed_notebook_file_handle, output_notebook)

            # deferred import for perf
            import scrapbook

            output_nb = scrapbook.read_notebook(executed_notebook_path)

            for (output_name, output_def) in system_compute_context.solid_def.output_dict.items():
                data_dict = output_nb.scraps.data_dict
                if output_name in data_dict:
                    value = read_value(output_def.dagster_type, data_dict[output_name])
                    yield Output(value, output_name)

            for key, value in output_nb.scraps.items():
                if key.startswith('event-'):
                    with open(value.data, 'rb') as fd:
                        yield pickle.loads(fd.read())
def _t_fn(step_context, inputs):
    check.inst_param(step_context, "step_context", SolidExecutionContext)
    check.param_invariant(
        isinstance(step_context.run_config, dict),
        "context",
        "StepExecutionContext must have valid run_config",
    )

    step_execution_context = step_context.get_step_execution_context()

    with tempfile.TemporaryDirectory() as output_notebook_dir:
        with safe_tempfile_path() as output_log_path:
            prefix = str(uuid.uuid4())
            parameterized_notebook_path = os.path.join(
                output_notebook_dir, f"{prefix}-inter.ipynb")
            executed_notebook_path = os.path.join(output_notebook_dir, f"{prefix}-out.ipynb")

            # Scaffold the registration here
            nb = load_notebook_node(notebook_path)
            compute_descriptor = (
                "solid" if dagster_factory_name == "define_dagstermill_solid" else "op")
            nb_no_parameters = replace_parameters(
                step_execution_context,
                nb,
                get_papermill_parameters(
                    step_execution_context, inputs, output_log_path, compute_descriptor),
            )
            write_ipynb(nb_no_parameters, parameterized_notebook_path)

            try:
                papermill_engines.register("dagstermill", DagstermillEngine)
                papermill.execute_notebook(
                    input_path=parameterized_notebook_path,
                    output_path=executed_notebook_path,
                    engine_name="dagstermill",
                    log_output=True,
                )
            except Exception as ex:
                step_execution_context.log.warn(
                    "Error when attempting to materialize executed notebook: {exc}".format(
                        exc=str(serializable_error_info_from_exc_info(sys.exc_info()))))

                # pylint: disable=no-member
                # compat:
                if isinstance(ex, ExecutionError) and (
                        ex.ename == "RetryRequested" or ex.ename == "Failure"):
                    step_execution_context.log.warn(
                        f"Encountered raised {ex.ename} in notebook. Use dagstermill.yield_event "
                        "with RetryRequested or Failure to trigger their behavior."
                    )

                raise

            step_execution_context.log.debug(
                "Notebook execution complete for {name} at {executed_notebook_path}.".format(
                    name=name,
                    executed_notebook_path=executed_notebook_path,
                ))

            if output_notebook_name is not None:
                # yield output notebook binary stream as a solid output
                with open(executed_notebook_path, "rb") as fd:
                    yield Output(fd.read(), output_notebook_name)
            else:
                # backcompat
                executed_notebook_file_handle = None
                try:
                    # use binary mode when moving the file since certain file_managers such as S3
                    # may try to hash the contents
                    with open(executed_notebook_path, "rb") as fd:
                        executed_notebook_file_handle = step_context.resources.file_manager.write(
                            fd, mode="wb", ext="ipynb")
                        executed_notebook_materialization_path = (
                            executed_notebook_file_handle.path_desc)

                    yield AssetMaterialization(
                        asset_key=(asset_key_prefix + [f"{name}_output_notebook"]),
                        description="Location of output notebook in file manager",
                        metadata_entries=[
                            MetadataEntry.fspath(executed_notebook_materialization_path)
                        ],
                    )
                except Exception:
                    # if file manager writing errors, e.g. file manager is not provided, we throw a warning
                    # and fall back to the previously stored temp executed notebook.
                    step_context.log.warning(
                        "Error when attempting to materialize executed notebook using file manager: "
                        f"{str(serializable_error_info_from_exc_info(sys.exc_info()))}"
                        f"\nNow falling back to local: notebook execution was temporarily materialized at {executed_notebook_path}"
                        "\nIf you have supplied a file manager and expect to use it for materializing the "
                        'notebook, please include "file_manager" in the `required_resource_keys` argument '
                        f"to `{dagster_factory_name}`")

                if output_notebook is not None:
                    yield Output(executed_notebook_file_handle, output_notebook)

            # deferred import for perf
            import scrapbook

            output_nb = scrapbook.read_notebook(executed_notebook_path)

            for (output_name, _) in step_execution_context.solid_def.output_dict.items():
                data_dict = output_nb.scraps.data_dict
                if output_name in data_dict:
                    # read outputs that were passed out of process via io manager from `yield_result`
                    step_output_handle = StepOutputHandle(
                        step_key=step_execution_context.step.key, output_name=output_name)
                    output_context = step_execution_context.get_output_context(step_output_handle)
                    io_manager = step_execution_context.get_io_manager(step_output_handle)
                    value = io_manager.load_input(
                        build_input_context(upstream_output=output_context))
                    yield Output(value, output_name)

            for key, value in output_nb.scraps.items():
                if key.startswith("event-"):
                    with open(value.data, "rb") as fd:
                        event = pickle.loads(fd.read())
                        if isinstance(event, (Failure, RetryRequested)):
                            raise event
                        else:
                            yield event
def _t_fn(transform_context, inputs):
    check.inst_param(transform_context, 'transform_context', TransformExecutionContext)
    check.param_invariant(
        isinstance(transform_context.environment_dict, dict),
        'context',
        'SystemTransformExecutionContext must have valid environment_dict',
    )

    system_transform_context = transform_context.get_system_context()

    base_dir = '/tmp/dagstermill/{run_id}/'.format(run_id=transform_context.run_id)
    output_notebook_dir = os.path.join(base_dir, 'output_notebooks/')
    mkdir_p(output_notebook_dir)

    temp_path = os.path.join(
        output_notebook_dir, '{prefix}-out.ipynb'.format(prefix=str(uuid.uuid4())))

    with tempfile.NamedTemporaryFile() as output_log_file:
        output_log_path = output_log_file.name
        init_db(output_log_path)

        nb = load_notebook_node(notebook_path)
        nb_no_parameters = replace_parameters(
            system_transform_context,
            nb,
            get_papermill_parameters(system_transform_context, inputs, output_log_path),
        )
        intermediate_path = os.path.join(
            output_notebook_dir, '{prefix}-inter.ipynb'.format(prefix=str(uuid.uuid4())))
        write_ipynb(nb_no_parameters, intermediate_path)

        # Although the type of is_done is threading._Event in py2, not threading.Event,
        # it is still constructed using the threading.Event() factory
        is_done = threading.Event()

        def log_watcher_thread_target():
            log_watcher = JsonSqlite3LogWatcher(
                output_log_path, system_transform_context.log, is_done)
            log_watcher.watch()

        log_watcher_thread = threading.Thread(target=log_watcher_thread_target)
        log_watcher_thread.start()

        with user_code_error_boundary(
            DagstermillExecutionError,
            'Error occurred during the execution of Dagstermill solid '
            '{solid_name}: {notebook_path}'.format(
                solid_name=name, notebook_path=notebook_path),
        ):
            try:
                papermill.execute_notebook(intermediate_path, temp_path, log_output=True)
            except Exception as exc:
                yield Materialization(
                    path=temp_path,
                    description='{name} output notebook'.format(
                        name=transform_context.solid.name),
                )
                raise exc
            finally:
                is_done.set()
                log_watcher_thread.join()

        output_nb = scrapbook.read_notebook(temp_path)

        system_transform_context.log.debug(
            'Notebook execution complete for {name}. Data is {data}'.format(
                name=name, data=output_nb.scraps))

        yield Materialization(
            path=temp_path,
            description='{name} output notebook'.format(name=transform_context.solid.name),
        )

        for (output_name, output_def) in system_transform_context.solid_def.output_dict.items():
            data_dict = output_nb.scraps.data_dict
            if output_name in data_dict:
                value = read_value(output_def.runtime_type, data_dict[output_name])
                yield Result(value, output_name)

        for key, value in output_nb.scraps.items():
            print(output_nb.scraps)
            if key.startswith('materialization-'):
                with open(value.data, 'rb') as fd:
                    yield pickle.loads(fd.read())