Exemple #1
0
def run(*, inputname: str, outputname: str) -> None:
    """Inject a fixed parameter set into a notebook and save the result.

    The notebook at ``inputname`` is loaded, parameterized with
    ``{"data": "yyy"}`` through papermill, and the resulting notebook is
    written to ``outputname``.
    """
    # papermill is imported lazily, at call time.
    from papermill.parameterize import parameterize_notebook
    from papermill.iorw import load_notebook_node, write_ipynb

    injected_parameters = {"data": "yyy"}
    source_notebook = load_notebook_node(inputname)
    parameterized = parameterize_notebook(
        source_notebook, parameters=injected_parameters)
    write_ipynb(parameterized, outputname)
Exemple #2
0
    def get_params(self):
        """Collect literal assignments from the notebook's ``parameters`` cell.

        The notebook at ``self.input_path`` is loaded and the cell index
        returned by ``self._find_first_tagged_cell_index(nb, 'parameters')``
        is inspected.  When that index is non-negative:

        * ``self.doc`` is set to the text of the last ``'''``-quoted string
          token in the cell with every single-quote character removed
          (empty string when there is none);
        * ``self.params`` is set to a ``{name: value}`` dict built from every
          assignment in the cell whose first target is a plain name.  Values
          that are not bool/int/float/complex/str literals are recorded as
          ``None``;
        * ``self.ex_function`` is set to an ``execute(self, ...)`` call
          signature string when at least one parameter was found.

        Returns:
            ``self.params``.  When no tagged cell is found, this is whatever
            value the attribute already held.
        """
        # Literal kinds that are kept as parameter defaults; anything else
        # (bytes, Ellipsis, arbitrary expressions) is recorded as None.
        literal_types = (bool, int, float, complex, str)

        def process_node(node):
            """Return ``{name: literal}`` for an Assign node, or None to skip it."""
            target = node.targets[0]
            if not isinstance(target, ast.Name):
                # Tuple/attribute/subscript targets have no single name; the
                # previous code hit an unbound ``key`` here.
                return None
            valnode = node.value
            val = None
            # ast.Constant replaces the deprecated Num/Str/NameConstant node
            # classes (removed in Python 3.14).
            if (isinstance(valnode, ast.Constant)
                    and isinstance(valnode.value, literal_types)):
                val = valnode.value
            return {target.id: val}

        nb = load_notebook_node(notebook_path=self.input_path)
        index = self._find_first_tagged_cell_index(nb, 'parameters')
        if index >= 0:
            self.doc = ''
            s = str(nb.cells[index].source)
            for tok in tokenize.tokenize(BytesIO(s.encode('utf-8')).readline):
                if tok.type == tokenize.STRING and tok.string.startswith("'''"):
                    self.doc = tok.string.translate(str.maketrans('', '', "'"))
            a = ast.parse(nb.cells[index].source)
            candidates = (
                process_node(node) for node in ast.walk(a)
                if isinstance(node, ast.Assign)
            )
            output = [entry for entry in candidates if entry is not None]
            # ChainMap gives precedence to earlier maps, so for a name that is
            # assigned more than once the first assignment visited wins.
            self.params = dict(ChainMap(*output))
            if self.params:
                self.ex_function = 'execute(self,' + ', '.join(
                    "{!s}={!r}".format(key, val)
                    for (key, val) in self.params.items()) + ')'
        return self.params
Exemple #3
0
def get_outputs(nbname):
    """Parse the ``%%yaml OUTPUTS`` cell of a notebook.

    Args:
        nbname: Location of the notebook, passed to ``load_notebook_node``.

    Returns:
        ``None`` when no cell source starts with ``%%yaml OUTPUTS``;
        otherwise the result of ``parse`` applied to the YAML document that
        follows the magic line of the first such cell.

    Raises:
        IndexError: if the matching cell consists of the magic line only
            (there is nothing after the first newline).
    """
    incell = None
    nb = load_notebook_node(nbname)
    for cell in nb.cells:
        if cell['source'].startswith('%%yaml OUTPUTS'):
            incell = cell['source']
            break
    if incell is None:
        return None
    # remove first line (cell magic)
    incell = incell.split('\n', 1)[1]
    # safe_load: notebook content must not be able to construct arbitrary
    # Python objects, and yaml.load() without an explicit Loader is rejected
    # by PyYAML >= 6.
    out_dict = yaml.safe_load(incell)
    return parse(out_dict)
Exemple #4
0
def execute_retroactive_scaffold(notebook_path):
    """Prepend dagstermill scaffolding cells to a notebook, in place.

    Two code cells are inserted ahead of the notebook's existing cells: an
    ``import dagstermill`` cell and a cell tagged ``parameters`` that obtains
    the dagstermill context.  The result overwrites ``notebook_path``.
    """
    original = load_notebook_node(notebook_path)
    scaffolded = copy.deepcopy(original)

    dagstermill_import = nbformat.v4.new_code_cell(source='import dagstermill')

    context_cell = nbformat.v4.new_code_cell(
        source='context = dagstermill.get_context()')
    context_cell.metadata['tags'] = ['parameters']

    # The existing cells are taken from the loaded notebook, not the copy.
    injected = [dagstermill_import, context_cell]
    scaffolded.cells = injected + original.cells
    write_ipynb(scaffolded, notebook_path)
Exemple #5
0
def execute_notebook(notebook,
                     output,
                     parameters=None,
                     kernel_name=None,
                     progress_bar=True,
                     log_output=False):
    """Run one notebook locally and save the executed copy.

    Args:
        notebook (str): Path to the input notebook.
        output (str): Path where the executed notebook is saved.
        parameters (dict): Values handed to the notebook's parameters; when
            falsy the notebook is not parameterized.
        kernel_name (str): Kernel to execute against; falls back to the
            kernelspec name stored in the notebook metadata.
        progress_bar (bool): Whether to show the progress bar.
        log_output (bool): Whether to write notebook output to stderr.
    """
    pretty_input = get_pretty_path(notebook)
    print("Input Notebook:  %s" % pretty_input)
    pretty_output = get_pretty_path(output)
    print("Output Notebook: %s" % pretty_output)
    notebook_node = load_notebook_node(notebook)

    # Inject the parameters, if any were supplied.
    if parameters:
        _parameterize_notebook(notebook_node, kernel_name, parameters)

    # Record the run configuration in the papermill metadata.
    run_info = notebook_node.metadata.papermill
    run_info['parameters'] = parameters
    run_info['environment_variables'] = _fetch_environment_variables()
    run_info['output_path'] = output

    # Execute, timing the whole run.
    started_at = datetime.datetime.utcnow()
    selected_kernel = kernel_name or notebook_node.metadata.kernelspec.name
    executor = ExecutePreprocessor(timeout=None, kernel_name=selected_kernel)
    executor.progress_bar = progress_bar
    executor.log_output = log_output

    executor.preprocess(notebook_node, {})
    finished_at = datetime.datetime.utcnow()

    # Re-read the metadata after execution before recording the timings.
    run_info = notebook_node.metadata.papermill
    run_info['start_time'] = started_at.isoformat()
    run_info['end_time'] = finished_at.isoformat()
    run_info['duration'] = (finished_at - started_at).total_seconds()
    cell_exceptions = [
        cell.metadata.papermill.get('exception')
        for cell in notebook_node.cells
    ]
    run_info['exception'] = any(cell_exceptions)

    # Persist the executed notebook, then surface any execution errors.
    write_ipynb(notebook_node, output)
    raise_for_execution_errors(notebook_node, output)
Exemple #6
0
def execute_retroactive_scaffold(notebook_path, **kwargs):
    """Prepend a repository-registration cell to a notebook, in place.

    ``kwargs`` are forwarded to ``get_register_repo_info`` (with
    ``allow_none=False``); its import and declaration statements are placed
    after ``import dagstermill as dm`` in a new first cell tagged
    ``injected-repo-registration``.  The result overwrites ``notebook_path``.
    """
    original = load_notebook_node(notebook_path)
    scaffolded = copy.deepcopy(original)
    repo_info = get_register_repo_info(kwargs, allow_none=False)

    template = 'import dagstermill as dm\n{import_statement}\n{declaration_statement}'
    statements = {
        'import_statement': repo_info.import_statement,
        'declaration_statement': repo_info.declaration_statement,
    }

    registration_cell = nbformat.v4.new_code_cell(
        source=template.format(**statements))
    registration_cell.metadata['tags'] = ['injected-repo-registration']

    # The existing cells are taken from the loaded notebook, not the copy.
    scaffolded.cells = [registration_cell] + original.cells
    write_ipynb(scaffolded, notebook_path)
Exemple #7
0
    def __init__(self, node_or_path, translators=None):
        """Wrap either a notebook path or an already-loaded notebook node.

        Args:
            node_or_path: A string path ending in ``.ipynb`` (the notebook is
                then loaded with ``load_notebook_node``), or any other object,
                which is stored as the node as-is with an empty path.
            translators: Translator registry to use; ``translator_registry``
                when falsy.

        Raises:
            ValueError: if a string path does not end in ``.ipynb``.
        """
        is_path = isinstance(node_or_path, string_types)
        if is_path and not node_or_path.endswith(".ipynb"):
            raise ValueError(
                "Requires an '.ipynb' file extension. Provided path: '{}'".
                format(node_or_path))

        if is_path:
            self.path = node_or_path
            self.node = load_notebook_node(node_or_path)
        else:
            self.path = ""
            self.node = node_or_path

        self.translators = translators if translators else translator_registry

        # Lazily computed caches.
        self._scraps = None
        self._snaps = None
Exemple #8
0
def read_notebook(path):
    """
    Returns a Notebook object loaded from the location specified at 'path'.

    Args:
        path (str): Path to notebook ".ipynb" file.

    Returns:
        A Notebook object whose ``path`` is ``path`` and whose ``node`` is the
        result of ``load_notebook_node(path)``.

    Raises:
        PapermillException: if ``path`` does not end in ".ipynb".
    """
    if not path.endswith(".ipynb"):
        # Interpolate here: exceptions do not apply %-formatting to extra
        # positional arguments, so the path was never substituted into the
        # message.
        raise PapermillException(
            "Notebooks should have an '.ipynb' file extension. Provided path: '%s'"
            % path)

    nb = Notebook()
    nb.path = path
    nb.node = load_notebook_node(path)
    return nb
Exemple #9
0
def execute_notebook(notebook, output, parameters=None, kernel_name=None):
    """Run one notebook locally and save the executed copy.

    Args:
        notebook (str): Path to the input notebook.
        output (str): Path where the executed notebook is saved.
        parameters (dict): Values handed to the notebook's parameters; when
            falsy the notebook is not parameterized.
        kernel_name (str): Kernel to execute against; falls back to the
            kernelspec name stored in the notebook metadata.

    """
    notebook_node = load_notebook_node(notebook)

    # Inject the parameters, if any were supplied.
    if parameters:
        _parameterize_notebook(notebook_node, kernel_name, parameters)

    # Record the run configuration in the papermill metadata.
    run_info = notebook_node.metadata.papermill
    run_info['parameters'] = parameters
    run_info['environment_variables'] = _fetch_environment_variables()
    run_info['output_path'] = output

    # Execute, timing the whole run.
    started_at = datetime.datetime.utcnow()
    selected_kernel = kernel_name or notebook_node.metadata.kernelspec.name
    executor = ExecutePreprocessor(timeout=None, kernel_name=selected_kernel)
    executor.preprocess(notebook_node, {})
    finished_at = datetime.datetime.utcnow()

    # Re-read the metadata after execution before recording the timings.
    run_info = notebook_node.metadata.papermill
    run_info['start_time'] = started_at.isoformat()
    run_info['end_time'] = finished_at.isoformat()
    run_info['duration'] = (finished_at - started_at).total_seconds()
    cell_exceptions = [
        cell.metadata.papermill.exception for cell in notebook_node.cells
    ]
    run_info['exception'] = any(cell_exceptions)

    # Persist the executed notebook.
    write_ipynb(notebook_node, output)
Exemple #10
0
def generate_notebooks_from_yml(input_nb_path: str, yml_parameters_path: str):
    """
    Generate a set of notebooks using the Papermill APIs. The input notebook
    must contain a cell with a `parameters` tag. One notebook is generated for
    every combination (cartesian product) of the parameter values defined in
    the input yaml.

    Args:
        input_nb_path: string
                        Path to the source template notebook
        yml_parameters_path: string
                                Path to the yaml spec with parameters; each
                                key maps to an iterable of candidate values

    Returns: list
                A list of ``(output_path, params_str)`` tuples, one per
                generated notebook, where ``params_str`` is the string
                produced by ``print_dict_parametes`` for that combination

    """
    y = read_yaml_file(yml_parameters_path)
    input_nb = load_notebook_node(input_nb_path)

    # Strip only a trailing ".ipynb". A blanket str.replace would also eat
    # the text out of directory names such as ".ipynb_checkpoints".
    extension = ".ipynb"
    if input_nb_path.endswith(extension):
        base_path = input_nb_path[:-len(extension)]
    else:
        base_path = input_nb_path

    # Create the cartesian product of the parameters
    hp_values = list(product(*y.values()))

    # Now recreate a dictionary with the correct keys
    hp_dicts = [dict(zip(y.keys(), x)) for x in hp_values]

    # For each combination of parameters generate a notebook from the template
    output_paths = []
    for params in hp_dicts:
        params_str = print_dict_parametes(params)
        output_path = base_path + params_str + extension
        output_nb = parameterize_notebook(input_nb, parameters=params)
        # write the nb to file
        write_ipynb(output_nb, path=output_path)
        output_paths.append((output_path, params_str))

    # Return the (path, params_str) pairs of the generated notebooks
    return output_paths
Exemple #11
0
def execute_retroactive_scaffold(notebook_path):
    """Prepend the import and parameters cells to a notebook, in place.

    The cells come from ``get_import_cell()`` and ``get_parameters_cell()``
    and are placed ahead of the notebook's existing cells; the result
    overwrites ``notebook_path``.
    """
    original = load_notebook_node(notebook_path)
    scaffolded = copy.deepcopy(original)
    # The existing cells are taken from the loaded notebook, not the copy.
    injected = [get_import_cell(), get_parameters_cell()]
    scaffolded.cells = injected + original.cells
    write_ipynb(scaffolded, notebook_path)
Exemple #12
0
    def _t_fn(step_context, inputs):
        """Generator compute function that runs a notebook via papermill.

        Uses ``notebook_path``, ``name``, ``asset_key_prefix`` and
        ``output_notebook`` from the enclosing scope (not visible here).

        Steps:
          1. Load the notebook, substitute its parameters with
             ``replace_parameters`` and write the result to a temporary
             directory.
          2. Execute it with papermill using the ``dagstermill`` engine.  On
             failure, try to hand the executed notebook to the file manager,
             yield an ``AssetMaterialization`` for it and re-raise.
          3. On success, hand the executed notebook to the file manager
             (falling back to the local path), yield an
             ``AssetMaterialization`` and, if ``output_notebook`` is set, an
             ``Output`` carrying the file handle.
          4. Yield an ``Output`` for every solid output whose name appears in
             the notebook's scrapbook data, then replay every scrap whose key
             starts with ``event-``; ``Failure`` / ``RetryRequested`` events
             are raised rather than yielded.
        """
        check.inst_param(step_context, "step_context", SolidExecutionContext)
        check.param_invariant(
            isinstance(step_context.run_config, dict),
            "context",
            "StepExecutionContext must have valid run_config",
        )

        step_execution_context = step_context.get_step_execution_context()

        # Both notebooks live in a temporary directory that is removed when
        # this generator leaves the outer with-block.
        with tempfile.TemporaryDirectory() as output_notebook_dir:
            with safe_tempfile_path() as output_log_path:

                parameterized_notebook_path = os.path.join(
                    output_notebook_dir,
                    "{prefix}-inter.ipynb".format(prefix=str(uuid.uuid4())))

                executed_notebook_path = os.path.join(
                    output_notebook_dir,
                    "{prefix}-out.ipynb".format(prefix=str(uuid.uuid4())))

                # Scaffold the registration here
                nb = load_notebook_node(notebook_path)
                nb_no_parameters = replace_parameters(
                    step_execution_context,
                    nb,
                    get_papermill_parameters(step_execution_context, inputs,
                                             output_log_path),
                )
                write_ipynb(nb_no_parameters, parameterized_notebook_path)

                try:
                    papermill_engines.register("dagstermill",
                                               DagstermillNBConvertEngine)
                    papermill.execute_notebook(
                        input_path=parameterized_notebook_path,
                        output_path=executed_notebook_path,
                        engine_name="dagstermill",
                        log_output=True,
                    )

                except Exception as ex:  # pylint: disable=broad-except
                    # Best effort: still publish the (partially) executed
                    # notebook so the failure can be inspected.
                    try:
                        with open(executed_notebook_path, "rb") as fd:
                            executed_notebook_file_handle = (
                                step_context.resources.file_manager.write(
                                    fd, mode="wb", ext="ipynb"))
                            executed_notebook_materialization_path = (
                                executed_notebook_file_handle.path_desc)
                    except Exception:  # pylint: disable=broad-except
                        step_context.log.warning(
                            "Error when attempting to materialize executed notebook using file manager (falling back to local): {exc}"
                            .format(exc=str(
                                serializable_error_info_from_exc_info(
                                    sys.exc_info()))))
                        executed_notebook_materialization_path = executed_notebook_path

                    yield AssetMaterialization(
                        asset_key=(asset_key_prefix +
                                   [f"{name}_output_notebook"]),
                        description=
                        "Location of output notebook in file manager",
                        metadata_entries=[
                            EventMetadataEntry.fspath(
                                executed_notebook_materialization_path,
                                label="executed_notebook_path",
                            )
                        ],
                    )

                    # pylint: disable=no-member
                    # A RetryRequested/Failure raised directly inside the
                    # notebook only produces a warning here; the original
                    # exception is re-raised below either way.
                    if isinstance(ex, PapermillExecutionError) and (
                            ex.ename == "RetryRequested"
                            or ex.ename == "Failure"):
                        step_execution_context.log.warn(
                            f"Encountered raised {ex.ename} in notebook. Use dagstermill.yield_event "
                            "with RetryRequested or Failure to trigger their behavior."
                        )

                    raise

            step_execution_context.log.debug(
                "Notebook execution complete for {name} at {executed_notebook_path}."
                .format(
                    name=name,
                    executed_notebook_path=executed_notebook_path,
                ))

            # Stays None when the file manager write below fails, so the
            # Output yielded for `output_notebook` then carries None.
            executed_notebook_file_handle = None
            try:
                # use binary mode when moving the file since certain file_managers such as S3
                # may try to hash the contents
                with open(executed_notebook_path, "rb") as fd:
                    executed_notebook_file_handle = step_context.resources.file_manager.write(
                        fd, mode="wb", ext="ipynb")
                    executed_notebook_materialization_path = executed_notebook_file_handle.path_desc
            except Exception:  # pylint: disable=broad-except
                step_context.log.warning(
                    "Error when attempting to materialize executed notebook using file manager (falling back to local): {exc}"
                    .format(exc=str(
                        serializable_error_info_from_exc_info(
                            sys.exc_info()))))
                executed_notebook_materialization_path = executed_notebook_path

            yield AssetMaterialization(
                asset_key=(asset_key_prefix + [f"{name}_output_notebook"]),
                description="Location of output notebook in file manager",
                metadata_entries=[
                    EventMetadataEntry.fspath(
                        executed_notebook_materialization_path)
                ],
            )

            if output_notebook is not None:
                yield Output(executed_notebook_file_handle, output_notebook)

            # deferred import for perf
            import scrapbook

            output_nb = scrapbook.read_notebook(executed_notebook_path)

            # Emit an Output for each declared solid output that the notebook
            # recorded as a scrap.
            for (output_name, output_def
                 ) in step_execution_context.solid_def.output_dict.items():
                data_dict = output_nb.scraps.data_dict
                if output_name in data_dict:
                    value = read_value(output_def.dagster_type,
                                       data_dict[output_name])

                    yield Output(value, output_name)

            # Replay events the notebook pickled to disk.
            # NOTE(review): pickle.loads executes arbitrary code from the
            # file named by the scrap; this is only safe when the notebook is
            # trusted.
            for key, value in output_nb.scraps.items():
                if key.startswith("event-"):
                    with open(value.data, "rb") as fd:
                        event = pickle.loads(fd.read())
                        if isinstance(event, (Failure, RetryRequested)):
                            raise event
                        else:
                            yield event
Exemple #13
0
    def _t_fn(transform_context, inputs):
        """Generator transform function that runs a notebook via papermill.

        Uses ``notebook_path`` and ``name`` from the enclosing scope (not
        visible here).

        The notebook is loaded, its parameters are substituted with
        ``replace_parameters`` and it is executed with papermill while a
        background thread runs a ``JsonSqlite3LogWatcher`` on the log
        database.  A ``Materialization`` for the output notebook is yielded
        both on failure (before the exception is re-raised) and on success.
        After a successful run a ``Result`` is yielded for every solid output
        whose name appears in the notebook's scrapbook data, followed by every
        unpickled scrap whose key starts with ``materialization-``.
        """
        check.inst_param(transform_context, 'transform_context',
                         TransformExecutionContext)
        check.param_invariant(
            isinstance(transform_context.environment_dict, dict),
            'context',
            'SystemTransformExecutionContext must have valid environment_dict',
        )

        system_transform_context = transform_context.get_system_context()

        # Output notebooks are kept under a per-run directory in /tmp.
        base_dir = '/tmp/dagstermill/{run_id}/'.format(
            run_id=transform_context.run_id)
        output_notebook_dir = os.path.join(base_dir, 'output_notebooks/')
        mkdir_p(output_notebook_dir)

        temp_path = os.path.join(
            output_notebook_dir,
            '{prefix}-out.ipynb'.format(prefix=str(uuid.uuid4())))

        with tempfile.NamedTemporaryFile() as output_log_file:
            output_log_path = output_log_file.name
            init_db(output_log_path)

            nb = load_notebook_node(notebook_path)
            nb_no_parameters = replace_parameters(
                system_transform_context,
                nb,
                get_papermill_parameters(system_transform_context, inputs,
                                         output_log_path),
            )
            intermediate_path = os.path.join(
                output_notebook_dir,
                '{prefix}-inter.ipynb'.format(prefix=str(uuid.uuid4())))
            write_ipynb(nb_no_parameters, intermediate_path)

            # Although the type of is_done is threading._Event in py2, not threading.Event,
            # it is still constructed using the threading.Event() factory
            is_done = threading.Event()

            def log_watcher_thread_target():
                log_watcher = JsonSqlite3LogWatcher(
                    output_log_path, system_transform_context.log, is_done)
                log_watcher.watch()

            log_watcher_thread = threading.Thread(
                target=log_watcher_thread_target)

            log_watcher_thread.start()

            with user_code_error_boundary(
                    DagstermillExecutionError,
                    'Error occurred during the execution of Dagstermill solid '
                    '{solid_name}: {notebook_path}'.format(
                        solid_name=name, notebook_path=notebook_path),
            ):
                try:
                    papermill.execute_notebook(intermediate_path,
                                               temp_path,
                                               log_output=True)
                except Exception as exc:
                    # Publish the (partially) executed notebook before
                    # propagating the failure.
                    yield Materialization(
                        path=temp_path,
                        description='{name} output notebook'.format(
                            name=transform_context.solid.name),
                    )
                    raise exc
                finally:
                    # Always stop the watcher thread, success or failure.
                    is_done.set()
                    log_watcher_thread.join()

            output_nb = scrapbook.read_notebook(temp_path)

            system_transform_context.log.debug(
                'Notebook execution complete for {name}. Data is {data}'.
                format(name=name, data=output_nb.scraps))

            yield Materialization(
                path=temp_path,
                description='{name} output notebook'.format(
                    name=transform_context.solid.name),
            )

            # Emit a Result for each declared solid output that the notebook
            # recorded as a scrap.
            for (output_name, output_def
                 ) in system_transform_context.solid_def.output_dict.items():
                data_dict = output_nb.scraps.data_dict
                if output_name in data_dict:
                    value = read_value(output_def.runtime_type,
                                       data_dict[output_name])

                    yield Result(value, output_name)

            # Replay materializations the notebook pickled to disk.  (A stray
            # debug print of the whole scrap collection, executed once per
            # scrap, used to sit in this loop and has been removed.)
            # NOTE(review): pickle.loads executes arbitrary code from the
            # file named by the scrap; this is only safe when the notebook is
            # trusted.
            for key, value in output_nb.scraps.items():
                if key.startswith('materialization-'):
                    with open(value.data, 'rb') as fd:
                        yield pickle.loads(fd.read())
Exemple #14
0
    def _t_fn(transform_context, inputs):
        """Generator transform function that runs a notebook with the papermill CLI.

        Uses ``notebook_path``, ``name`` and ``do_cleanup`` from the enclosing
        scope (not visible here).

        The notebook is loaded, its parameters are substituted with
        ``replace_parameters`` and the result is executed by a ``papermill``
        subprocess.  A non-zero exit status raises ``DagstermillError`` with
        the captured stderr.  On success a ``Materialization`` for the output
        notebook is yielded, followed by a ``Result`` for every solid output
        definition whose name appears in the executed notebook's data.  When
        ``do_cleanup`` is truthy the output notebook is deleted on the way
        out.
        """
        check.inst_param(transform_context, 'transform_context',
                         TransformExecutionContext)
        check.param_invariant(
            isinstance(transform_context.environment_dict, dict),
            'context',
            'SystemTransformExecutionContext must have valid environment_dict',
        )

        system_transform_context = transform_context.get_system_context()

        # Output notebooks and the run log are kept under a per-run
        # directory in /tmp.
        base_dir = '/tmp/dagstermill/{run_id}/'.format(
            run_id=transform_context.run_id)
        output_notebook_dir = os.path.join(base_dir, 'output_notebooks/')

        if not os.path.exists(output_notebook_dir):
            os.makedirs(output_notebook_dir)

        temp_path = os.path.join(
            output_notebook_dir,
            '{prefix}-out.ipynb'.format(prefix=str(uuid.uuid4())))

        output_log_path = os.path.join(base_dir, 'run.log')

        try:
            nb = load_notebook_node(notebook_path)
            nb_no_parameters = replace_parameters(
                system_transform_context,
                nb,
                get_papermill_parameters(system_transform_context, inputs,
                                         output_log_path),
            )
            intermediate_path = os.path.join(
                output_notebook_dir,
                '{prefix}-inter.ipynb'.format(prefix=str(uuid.uuid4())))
            write_ipynb(nb_no_parameters, intermediate_path)

            # Make sure the log file exists (append mode creates it); the
            # explicit close() is redundant inside the with-block.
            with open(output_log_path, 'a') as f:
                f.close()

            process = subprocess.Popen(
                [
                    'papermill', '--log-output', '--log-level', 'ERROR',
                    intermediate_path, temp_path
                ],
                stderr=subprocess.PIPE,
            )
            # NOTE(review): communicate() waits for the subprocess to
            # terminate, so process.poll() should already be non-None when
            # the loop below is reached and the log-tailing code would never
            # run — confirm whether events are meant to be streamed while
            # the notebook is executing.
            _stdout, stderr = process.communicate()
            while process.poll() is None:  # while subprocess alive
                if system_transform_context.event_callback:
                    with open(output_log_path, 'r') as ff:
                        current_time = os.path.getmtime(output_log_path)
                        while process.poll() is None:
                            # Only read when the log's mtime has changed.
                            new_time = os.path.getmtime(output_log_path)
                            if new_time != current_time:
                                line = ff.readline()
                                if not line:
                                    break
                                # Each log line is a JSON-encoded event record.
                                event_record_dict = json.loads(line)

                                system_transform_context.event_callback(
                                    EventRecord(**event_record_dict))
                                current_time = new_time

            if process.returncode != 0:
                raise DagstermillError(
                    'There was an error when Papermill tried to execute the notebook. '
                    'The process stderr is \'{stderr}\''.format(stderr=stderr))

            output_nb = pm.read_notebook(temp_path)

            system_transform_context.log.debug(
                'Notebook execution complete for {name}. Data is {data}'.
                format(name=name, data=output_nb.data))

            yield Materialization(
                '{name} output notebook'.format(
                    name=transform_context.solid.name), temp_path)

            # Emit a Result for each declared output the notebook recorded.
            for output_def in system_transform_context.solid_def.output_defs:
                if output_def.name in output_nb.data:

                    value = read_value(output_def.runtime_type,
                                       output_nb.data[output_def.name])

                    yield Result(value, output_def.name)

        finally:
            # Only the executed notebook is removed; the intermediate
            # notebook and the log file are left in place.
            if do_cleanup and os.path.exists(temp_path):
                os.remove(temp_path)
Exemple #15
0
def load_notebook(notebook_name):
    """Load a notebook from the ``notebooks`` directory beside this module.

    The loaded node's papermill metadata gets ``input_path`` set to
    ``notebook_name`` (the bare name, not the resolved path) before the node
    is returned.
    """
    module_dir = os.path.dirname(os.path.abspath(__file__))
    resolved_path = os.path.join(module_dir, 'notebooks', notebook_name)
    notebook_node = load_notebook_node(resolved_path)
    notebook_node.metadata.papermill['input_path'] = notebook_name
    return notebook_node
Exemple #16
0
    def _t_fn(compute_context, inputs):
        """Generator compute function that runs a notebook via papermill.

        Uses ``notebook_path`` and ``name`` from the enclosing scope (not
        visible here).

        The notebook is loaded, its parameters are substituted with
        ``replace_parameters`` and it is executed with papermill using the
        ``dagstermill`` engine.  A ``Materialization`` for the output notebook
        is yielded both on failure (before the exception is re-raised) and on
        success.  After a successful run an ``Output`` is yielded for every
        solid output whose name appears in the notebook's scrapbook data,
        followed by every unpickled scrap whose key starts with ``event-``.
        """
        check.inst_param(compute_context, 'compute_context',
                         ComputeExecutionContext)
        check.param_invariant(
            isinstance(compute_context.environment_dict, dict),
            'context',
            'SystemComputeExecutionContext must have valid environment_dict',
        )

        system_compute_context = compute_context.get_system_context()

        # Output notebooks are kept under a per-run directory in /tmp.
        base_dir = '/tmp/dagstermill/{run_id}/'.format(
            run_id=compute_context.run_id)
        output_notebook_dir = os.path.join(base_dir, 'output_notebooks/')
        mkdir_p(output_notebook_dir)

        temp_path = os.path.join(
            output_notebook_dir,
            '{prefix}-out.ipynb'.format(prefix=str(uuid.uuid4())))

        with safe_tempfile_path() as output_log_path:
            # Scaffold the registration here
            nb = load_notebook_node(notebook_path)
            nb_no_parameters = replace_parameters(
                system_compute_context,
                nb,
                get_papermill_parameters(system_compute_context, inputs,
                                         output_log_path),
            )
            intermediate_path = os.path.join(
                output_notebook_dir,
                '{prefix}-inter.ipynb'.format(prefix=str(uuid.uuid4())))
            write_ipynb(nb_no_parameters, intermediate_path)

            # The error message is passed as a lambda rather than a string.
            with user_code_error_boundary(
                    DagstermillExecutionError,
                    lambda:
                ('Error occurred during the execution of Dagstermill solid '
                 '{solid_name}: {notebook_path}'.format(
                     solid_name=name, notebook_path=notebook_path)),
            ):
                try:
                    papermill_engines.register('dagstermill',
                                               DagstermillNBConvertEngine)
                    papermill.execute_notebook(intermediate_path,
                                               temp_path,
                                               engine_name='dagstermill',
                                               log_output=True)
                except Exception as exc:
                    # Publish the (partially) executed notebook before
                    # propagating the failure.
                    yield Materialization(
                        label='output_notebook',
                        description=
                        'Location of output notebook on the filesystem',
                        metadata_entries=[
                            EventMetadataEntry.fspath(temp_path)
                        ],
                    )
                    raise exc

            # deferred import for perf
            import scrapbook

            output_nb = scrapbook.read_notebook(temp_path)

            system_compute_context.log.debug(
                'Notebook execution complete for {name}. Data is {data}'.
                format(name=name, data=output_nb.scraps))

            yield Materialization(
                label='output_notebook',
                description='Location of output notebook on the filesystem',
                metadata_entries=[EventMetadataEntry.fspath(temp_path)],
            )

            # Emit an Output for each declared solid output that the notebook
            # recorded as a scrap.
            for (output_name, output_def
                 ) in system_compute_context.solid_def.output_dict.items():
                data_dict = output_nb.scraps.data_dict
                if output_name in data_dict:
                    value = read_value(output_def.dagster_type,
                                       data_dict[output_name])

                    yield Output(value, output_name)

            # Replay events the notebook pickled to disk.
            # NOTE(review): pickle.loads executes arbitrary code from the
            # file named by the scrap; this is only safe when the notebook is
            # trusted.
            for key, value in output_nb.scraps.items():
                if key.startswith('event-'):
                    with open(value.data, 'rb') as fd:
                        yield pickle.loads(fd.read())
Exemple #17
0
    def _t_fn(compute_context, inputs):
        """Generator compute function that runs a notebook via papermill.

        Uses ``notebook_path``, ``name`` and ``output_notebook`` from the
        enclosing scope (not visible here).

        Steps:
          1. Load the notebook, substitute its parameters with
             ``replace_parameters`` and write the result to a temporary
             directory.
          2. Execute it with papermill using the ``dagstermill`` engine.  On
             failure, try to hand the executed notebook to the file manager,
             yield a ``Materialization`` for it and re-raise.
          3. On success, hand the executed notebook to the file manager
             (falling back to the local path), yield a ``Materialization``
             and, if ``output_notebook`` is set, an ``Output`` carrying the
             file handle (``None`` when the file manager write failed).
          4. Yield an ``Output`` for every solid output whose name appears in
             the notebook's scrapbook data, then every unpickled scrap whose
             key starts with ``event-``.
        """
        check.inst_param(compute_context, 'compute_context',
                         SolidExecutionContext)
        check.param_invariant(
            isinstance(compute_context.environment_dict, dict),
            'context',
            'SystemComputeExecutionContext must have valid environment_dict',
        )

        system_compute_context = compute_context.get_system_context()

        # Both notebooks live in a temporary directory that is removed when
        # this generator leaves the outer with-block.
        with seven.TemporaryDirectory() as output_notebook_dir:
            with safe_tempfile_path() as output_log_path:

                parameterized_notebook_path = os.path.join(
                    output_notebook_dir,
                    '{prefix}-inter.ipynb'.format(prefix=str(uuid.uuid4())))

                executed_notebook_path = os.path.join(
                    output_notebook_dir,
                    '{prefix}-out.ipynb'.format(prefix=str(uuid.uuid4())))

                # Scaffold the registration here
                nb = load_notebook_node(notebook_path)
                nb_no_parameters = replace_parameters(
                    system_compute_context,
                    nb,
                    get_papermill_parameters(system_compute_context, inputs,
                                             output_log_path),
                )
                write_ipynb(nb_no_parameters, parameterized_notebook_path)

                with user_code_error_boundary(
                        DagstermillExecutionError,
                        lambda:
                    ('Error occurred during the execution of Dagstermill solid '
                     '{solid_name}: {notebook_path}'.format(
                         solid_name=name, notebook_path=notebook_path)),
                ):
                    try:
                        papermill_engines.register('dagstermill',
                                                   DagstermillNBConvertEngine)
                        papermill.execute_notebook(
                            input_path=parameterized_notebook_path,
                            output_path=executed_notebook_path,
                            engine_name='dagstermill',
                            log_output=True,
                        )

                    except Exception as exc:  # pylint: disable=broad-except
                        # Best effort: still publish the (partially) executed
                        # notebook so the failure can be inspected.
                        try:
                            with open(executed_notebook_path, 'r') as fd:
                                executed_notebook_file_handle = compute_context.file_manager.write(
                                    fd, mode='w', ext='ipynb')
                                executed_notebook_materialization_path = (
                                    executed_notebook_file_handle.path_desc)
                        except Exception as exc_inner:  # pylint: disable=broad-except
                            compute_context.log.warning(
                                'Error when attempting to materialize executed notebook using file manager (falling back to local): {exc}'
                                .format(exc=exc_inner))
                            executed_notebook_materialization_path = executed_notebook_path

                        yield Materialization(
                            label='output_notebook',
                            description=
                            'Location of output notebook in file manager',
                            metadata_entries=[
                                EventMetadataEntry.fspath(
                                    executed_notebook_materialization_path)
                            ],
                        )
                        raise exc

            system_compute_context.log.debug(
                'Notebook execution complete for {name} at {executed_notebook_path}.'
                .format(
                    name=name,
                    executed_notebook_path=executed_notebook_path,
                ))

            # Bind the handle up front: if the file manager write below fails
            # the name would otherwise be unbound when the `output_notebook`
            # Output is yielded.
            executed_notebook_file_handle = None
            try:
                with open(executed_notebook_path, 'r') as fd:
                    executed_notebook_file_handle = compute_context.file_manager.write(
                        fd, mode='w', ext='ipynb')
                    executed_notebook_materialization_path = executed_notebook_file_handle.path_desc
            except Exception as exc:  # pylint: disable=broad-except
                compute_context.log.warning(
                    'Error when attempting to materialize executed notebook using file manager (falling back to local): {exc}'
                    .format(exc=str(exc)))
                executed_notebook_materialization_path = executed_notebook_path

            yield Materialization(
                label='output_notebook',
                description='Location of output notebook in file manager',
                metadata_entries=[
                    EventMetadataEntry.fspath(
                        executed_notebook_materialization_path)
                ],
            )

            if output_notebook is not None:
                yield Output(executed_notebook_file_handle, output_notebook)

            # deferred import for perf
            import scrapbook

            output_nb = scrapbook.read_notebook(executed_notebook_path)

            # Emit an Output for each declared solid output that the notebook
            # recorded as a scrap.
            for (output_name, output_def
                 ) in system_compute_context.solid_def.output_dict.items():
                data_dict = output_nb.scraps.data_dict
                if output_name in data_dict:
                    value = read_value(output_def.dagster_type,
                                       data_dict[output_name])

                    yield Output(value, output_name)

            # Replay events the notebook pickled to disk.
            # NOTE(review): pickle.loads executes arbitrary code from the
            # file named by the scrap; this is only safe when the notebook is
            # trusted.
            for key, value in output_nb.scraps.items():
                if key.startswith('event-'):
                    with open(value.data, 'rb') as fd:
                        yield pickle.loads(fd.read())
 def setUp(self):
     """Load the ``sos_python3.ipynb`` notebook and set its papermill ``input_path``."""
     notebook_file = 'sos_python3.ipynb'
     self.notebook_name = notebook_file
     self.notebook_path = get_notebook_path(notebook_file)
     self.nb = load_notebook_node(self.notebook_path)
     # Record the notebook's own file name as the papermill input path.
     papermill_metadata = self.nb.metadata.papermill
     papermill_metadata['input_path'] = 'sos_python3.ipynb'
Exemple #19
0
    def _t_fn(compute_context, inputs):
        """Run the notebook through papermill and yield the resulting events.

        This is a generator. It parameterizes the notebook at ``notebook_path``,
        executes it with the "dagstermill" papermill engine inside a temporary
        directory, and then yields, in order:

        * an ``AssetMaterialization`` pointing at the executed notebook (on an
          execution failure this is yielded too, right before the error is
          re-raised);
        * ``Output(executed_notebook_file_handle, output_notebook)`` when
          ``output_notebook`` is not ``None``;
        * one ``Output`` per solid output whose name appears in the notebook's
          scrapbook data;
        * every object unpickled from a scrap whose key starts with ``"event-"``.

        NOTE(review): ``name``, ``notebook_path``, ``asset_key_prefix`` and
        ``output_notebook`` are not defined in this block; presumably they are
        closed over from the enclosing factory -- confirm.

        Args:
            compute_context: checked to be a ``SolidExecutionContext`` whose
                ``run_config`` is a dict.
            inputs: passed through to ``get_papermill_parameters``.

        Raises:
            Exception: any error raised while executing the notebook is
                re-raised from inside the ``user_code_error_boundary`` block.
        """
        check.inst_param(compute_context, "compute_context",
                         SolidExecutionContext)
        check.param_invariant(
            isinstance(compute_context.run_config, dict),
            "context",
            "SystemComputeExecutionContext must have valid run_config",
        )

        system_compute_context = compute_context.get_system_context()

        with seven.TemporaryDirectory() as output_notebook_dir:
            with safe_tempfile_path() as output_log_path:

                # Independent random prefixes for the parameterized input and
                # the executed output notebook.
                parameterized_notebook_path = os.path.join(
                    output_notebook_dir, f"{uuid.uuid4()}-inter.ipynb")

                executed_notebook_path = os.path.join(
                    output_notebook_dir, f"{uuid.uuid4()}-out.ipynb")

                # Scaffold the registration here
                nb = load_notebook_node(notebook_path)
                nb_no_parameters = replace_parameters(
                    system_compute_context,
                    nb,
                    get_papermill_parameters(system_compute_context, inputs,
                                             output_log_path),
                )
                write_ipynb(nb_no_parameters, parameterized_notebook_path)

                with user_code_error_boundary(
                        DagstermillExecutionError,
                        lambda:
                    ("Error occurred during the execution of Dagstermill solid "
                     f"{name}: {notebook_path}"),
                ):
                    try:
                        papermill_engines.register("dagstermill",
                                                   DagstermillNBConvertEngine)
                        papermill.execute_notebook(
                            input_path=parameterized_notebook_path,
                            output_path=executed_notebook_path,
                            engine_name="dagstermill",
                            log_output=True,
                        )

                    except Exception as exc:  # pylint: disable=broad-except
                        # Best effort: still surface the (partially) executed
                        # notebook before propagating the failure.
                        try:
                            with open(executed_notebook_path, "rb") as fd:
                                executed_notebook_file_handle = compute_context.resources.file_manager.write(
                                    fd, mode="wb", ext="ipynb")
                                executed_notebook_materialization_path = (
                                    executed_notebook_file_handle.path_desc)
                        except Exception as exc_inner:  # pylint: disable=broad-except
                            compute_context.log.warning(
                                "Error when attempting to materialize executed notebook using file manager (falling back to local): "
                                f"{exc_inner}")
                            executed_notebook_materialization_path = executed_notebook_path

                        yield AssetMaterialization(
                            asset_key=(asset_key_prefix +
                                       [f"{name}_output_notebook"]),
                            description=
                            "Location of output notebook in file manager",
                            metadata_entries=[
                                EventMetadataEntry.fspath(
                                    executed_notebook_materialization_path,
                                    label="executed_notebook_path",
                                )
                            ],
                        )
                        raise exc

            system_compute_context.log.debug(
                f"Notebook execution complete for {name} at {executed_notebook_path}."
            )

            # Stays None when the file manager write below fails; the Output
            # yielded for `output_notebook` then carries None.
            executed_notebook_file_handle = None
            try:
                # use binary mode when moving the file since certain file_managers such as S3
                # may try to hash the contents
                with open(executed_notebook_path, "rb") as fd:
                    executed_notebook_file_handle = compute_context.resources.file_manager.write(
                        fd, mode="wb", ext="ipynb")
                    executed_notebook_materialization_path = executed_notebook_file_handle.path_desc
            except Exception as exc:  # pylint: disable=broad-except
                compute_context.log.warning(
                    "Error when attempting to materialize executed notebook using file manager (falling back to local): "
                    f"{exc}")
                executed_notebook_materialization_path = executed_notebook_path

            yield AssetMaterialization(
                asset_key=(asset_key_prefix + [f"{name}_output_notebook"]),
                description="Location of output notebook in file manager",
                metadata_entries=[
                    EventMetadataEntry.fspath(
                        executed_notebook_materialization_path)
                ],
            )

            if output_notebook is not None:
                yield Output(executed_notebook_file_handle, output_notebook)

            # deferred import for perf
            import scrapbook

            output_nb = scrapbook.read_notebook(executed_notebook_path)

            # Read the scrap data once instead of once per declared output.
            data_dict = output_nb.scraps.data_dict
            for (output_name, output_def
                 ) in system_compute_context.solid_def.output_dict.items():
                if output_name in data_dict:
                    value = read_value(output_def.dagster_type,
                                       data_dict[output_name])

                    yield Output(value, output_name)

            for key, value in output_nb.scraps.items():
                if key.startswith("event-"):
                    # NOTE(review): pickle.loads executes arbitrary code from
                    # the file named in the scrap; only safe if the executed
                    # notebook is trusted.
                    with open(value.data, "rb") as fd:
                        yield pickle.loads(fd.read())
Exemple #20
0
    def _t_fn(info, inputs):
        """Run the notebook via the ``papermill`` CLI and yield its results.

        This is a generator. It writes a parameterized copy of the notebook at
        ``notebook_path`` under ``/tmp/dagstermill/<run_id>/output_notebooks/``,
        runs ``papermill`` on it in a subprocess and, while that runs, forwards
        JSON event records appended to ``run.log`` to
        ``info.context.event_callback`` (when one is set). Afterwards it yields
        a ``Result`` for every output definition whose name is present in the
        executed notebook's data.

        NOTE(review): ``notebook_path``, ``name`` and ``do_cleanup`` are not
        defined in this block; presumably they are closed over from the
        enclosing factory -- confirm.

        Args:
            info: execution info; ``info.context.environment_config`` must be a
                dict.
            inputs: passed through to ``get_papermill_parameters``.

        Raises:
            subprocess.CalledProcessError: if ``papermill`` exits with a
                non-zero status. (Previously this path called ``exit()``, which
                raised ``SystemExit`` with a success status.)
        """
        check.param_invariant(
            isinstance(info.context.environment_config, dict),
            'info',
            'TransformExecutionInfo must have valid environment_config',
        )

        # deferred import: only needed while polling the papermill subprocess
        import time

        base_dir = '/tmp/dagstermill/{run_id}/'.format(
            run_id=info.context.run_id)
        output_notebook_dir = os.path.join(base_dir, 'output_notebooks/')

        if not os.path.exists(output_notebook_dir):
            os.makedirs(output_notebook_dir)

        temp_path = os.path.join(
            output_notebook_dir,
            '{prefix}-out.ipynb'.format(prefix=str(uuid.uuid4())))

        output_log_path = os.path.join(base_dir, 'run.log')

        def _emit_event(line):
            """Decode one JSON log line and pass it to the event callback."""
            event_record_dict = json.loads(line)
            event_record_dict['event_type'] = EventType(
                event_record_dict['event_type'])
            info.context.event_callback(EventRecord(**event_record_dict))

        try:
            nb = load_notebook_node(notebook_path)
            nb_no_parameters = replace_parameters(
                info, nb,
                get_papermill_parameters(info, inputs, output_log_path))
            intermediate_path = os.path.join(
                output_notebook_dir,
                '{prefix}-inter.ipynb'.format(prefix=str(uuid.uuid4())))
            write_ipynb(nb_no_parameters, intermediate_path)

            # Make sure the log file exists before it is tailed below.
            with open(output_log_path, 'a'):
                pass

            command = ["papermill", intermediate_path, temp_path]
            process = subprocess.Popen(command)

            if info.context.event_callback:
                # Open the log once and keep the read position, so records are
                # never re-read from the start and re-emitted.
                with open(output_log_path, 'r') as log_file:
                    pending = ''
                    while True:
                        # Sample the exit status *before* reading so the last
                        # pass still drains everything written before exit.
                        finished = process.poll() is not None
                        pending += log_file.read()
                        lines = pending.split('\n')
                        # The last piece has no newline yet and may be a
                        # partially written record; keep it for the next pass.
                        pending = lines.pop()
                        for line in lines:
                            if line.strip():
                                _emit_event(line)
                        if finished:
                            break
                        # Avoid spinning a CPU core while papermill runs.
                        time.sleep(0.1)
                    if pending.strip():
                        _emit_event(pending)
            else:
                # Nothing to forward: block until papermill exits.
                process.wait()

            if process.returncode != 0:
                info.log.debug("There was an error in Papermill!")
                # stderr is not piped in the Popen call above, so
                # process.stderr is None here.
                info.log.debug('stderr was None'
                               if process.stderr is None else process.stderr)
                raise subprocess.CalledProcessError(process.returncode,
                                                    command)

            output_nb = pm.read_notebook(temp_path)

            info.log.debug(
                'Notebook execution complete for {name}. Data is {data}'.
                format(name=name, data=output_nb.data))

            info.log.info(
                "Output notebook path is {}".format(output_notebook_dir))

            for output_def in info.solid_def.output_defs:
                if output_def.name in output_nb.data:

                    value = read_value(output_def.runtime_type,
                                       output_nb.data[output_def.name])

                    yield Result(value, output_def.name)

        finally:
            # Only the executed notebook is removed; the intermediate notebook
            # and run.log are left in place, as before.
            if do_cleanup and os.path.exists(temp_path):
                os.remove(temp_path)
Exemple #21
0
    def _t_fn(step_context, inputs):
        """Run the notebook through papermill and yield the resulting events.

        This is a generator. It parameterizes the notebook at ``notebook_path``,
        executes it with the "dagstermill" papermill engine inside a temporary
        directory, and then yields, in order:

        * if ``output_notebook_name`` is set, the executed notebook's bytes as
          an ``Output``; otherwise (backcompat path) an ``AssetMaterialization``
          for the notebook written through ``resources.file_manager`` and, when
          ``output_notebook`` is set, the file handle as an ``Output``;
        * one ``Output`` per solid output whose name appears in the notebook's
          scrapbook data, loaded through the step's IO manager;
        * every object unpickled from a scrap whose key starts with
          ``"event-"``, except ``Failure`` / ``RetryRequested``, which are
          raised instead.

        NOTE(review): ``name``, ``notebook_path``, ``asset_key_prefix``,
        ``output_notebook``, ``output_notebook_name`` and
        ``dagster_factory_name`` are not defined in this block; presumably they
        are closed over from the enclosing factory -- confirm.

        Args:
            step_context: checked to be a ``SolidExecutionContext`` whose
                ``run_config`` is a dict.
            inputs: passed through to ``get_papermill_parameters``.

        Raises:
            Exception: any error raised while executing the notebook is logged
                and re-raised.
            Failure, RetryRequested: when such an event was stored in the
                notebook's scraps.
        """
        check.inst_param(step_context, "step_context", SolidExecutionContext)
        check.param_invariant(
            isinstance(step_context.run_config, dict),
            "context",
            "StepExecutionContext must have valid run_config",
        )

        step_execution_context = step_context.get_step_execution_context()

        with tempfile.TemporaryDirectory() as output_notebook_dir:
            with safe_tempfile_path() as output_log_path:

                prefix = str(uuid.uuid4())
                parameterized_notebook_path = os.path.join(
                    output_notebook_dir, f"{prefix}-inter.ipynb")

                executed_notebook_path = os.path.join(output_notebook_dir,
                                                      f"{prefix}-out.ipynb")

                # Scaffold the registration here
                nb = load_notebook_node(notebook_path)
                compute_descriptor = ("solid" if dagster_factory_name
                                      == "define_dagstermill_solid" else "op")
                nb_no_parameters = replace_parameters(
                    step_execution_context,
                    nb,
                    get_papermill_parameters(step_execution_context, inputs,
                                             output_log_path,
                                             compute_descriptor),
                )
                write_ipynb(nb_no_parameters, parameterized_notebook_path)

                try:
                    papermill_engines.register("dagstermill",
                                               DagstermillEngine)
                    papermill.execute_notebook(
                        input_path=parameterized_notebook_path,
                        output_path=executed_notebook_path,
                        engine_name="dagstermill",
                        log_output=True,
                    )

                except Exception as ex:
                    step_execution_context.log.warning(
                        "Error when attempting to materialize executed notebook: {exc}"
                        .format(exc=str(
                            serializable_error_info_from_exc_info(
                                sys.exc_info()))))
                    # pylint: disable=no-member
                    # compat:
                    if isinstance(ex, ExecutionError) and ex.ename in (
                            "RetryRequested", "Failure"):
                        step_execution_context.log.warning(
                            f"Encountered raised {ex.ename} in notebook. Use dagstermill.yield_event "
                            "with RetryRequested or Failure to trigger their behavior."
                        )

                    raise

            step_execution_context.log.debug(
                f"Notebook execution complete for {name} at {executed_notebook_path}."
            )
            if output_notebook_name is not None:
                # yield output notebook binary stream as a solid output
                with open(executed_notebook_path, "rb") as fd:
                    yield Output(fd.read(), output_notebook_name)

            else:
                # backcompat
                # Stays None when the file manager write below fails; the
                # Output yielded for `output_notebook` then carries None.
                executed_notebook_file_handle = None
                try:
                    # use binary mode when moving the file since certain file_managers such as S3
                    # may try to hash the contents
                    with open(executed_notebook_path, "rb") as fd:
                        executed_notebook_file_handle = step_context.resources.file_manager.write(
                            fd, mode="wb", ext="ipynb")
                        executed_notebook_materialization_path = (
                            executed_notebook_file_handle.path_desc)

                    yield AssetMaterialization(
                        asset_key=(asset_key_prefix +
                                   [f"{name}_output_notebook"]),
                        description=
                        "Location of output notebook in file manager",
                        metadata_entries=[
                            MetadataEntry.fspath(
                                executed_notebook_materialization_path)
                        ],
                    )

                except Exception:
                    # if file manager writing errors, e.g. file manager is not provided, we throw a warning
                    # and fall back to the previously stored temp executed notebook.
                    step_context.log.warning(
                        "Error when attempting to materialize executed notebook using file manager: "
                        f"{str(serializable_error_info_from_exc_info(sys.exc_info()))}"
                        f"\nNow falling back to local: notebook execution was temporarily materialized at {executed_notebook_path}"
                        "\nIf you have supplied a file manager and expect to use it for materializing the "
                        'notebook, please include "file_manager" in the `required_resource_keys` argument '
                        f"to `{dagster_factory_name}`")

                if output_notebook is not None:
                    yield Output(executed_notebook_file_handle,
                                 output_notebook)

            # deferred import for perf
            import scrapbook

            output_nb = scrapbook.read_notebook(executed_notebook_path)

            # Read the scrap data once instead of once per declared output.
            data_dict = output_nb.scraps.data_dict
            for output_name in step_execution_context.solid_def.output_dict:
                if output_name in data_dict:
                    # read outputs that were passed out of process via io manager from `yield_result`
                    step_output_handle = StepOutputHandle(
                        step_key=step_execution_context.step.key,
                        output_name=output_name)
                    output_context = step_execution_context.get_output_context(
                        step_output_handle)
                    io_manager = step_execution_context.get_io_manager(
                        step_output_handle)
                    value = io_manager.load_input(
                        build_input_context(upstream_output=output_context))

                    yield Output(value, output_name)

            for key, value in output_nb.scraps.items():
                if key.startswith("event-"):
                    # NOTE(review): pickle.loads executes arbitrary code from
                    # the file named in the scrap; only safe if the executed
                    # notebook is trusted.
                    with open(value.data, "rb") as fd:
                        event = pickle.loads(fd.read())
                        if isinstance(event, (Failure, RetryRequested)):
                            raise event
                        else:
                            yield event