Example #1
    def find(cls,
             env=None,
             reload=False,
             lazy_import=False,
             starting_dir=None,
             name=None):
        """
        Automatically find pipeline.yaml and return a DAGSpec object, which
        can be converted to a DAG using .to_dag()

        Parameters
        ----------
        env
            The environment to pass to the spec

        name : str, default=None
            Filename to search for. If None, it looks for a pipeline.yaml file,
            otherwise it looks for a file with that name.
        """
        starting_dir = starting_dir or os.getcwd()
        path_to_entry_point = default.entry_point_with_name(
            root_path=starting_dir, name=name)

        try:
            return cls(path_to_entry_point,
                       env=env,
                       lazy_import=lazy_import,
                       reload=reload)
        except Exception as e:
            exc = DAGSpecInitializationError('Error initializing DAG from '
                                             f'{path_to_entry_point!s}')
            raise exc from e
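
A minimal usage sketch for the snippet above, assuming DAGSpec is importable from ploomber.spec and a pipeline.yaml exists in (or above) the current working directory:

from ploomber.spec import DAGSpec

spec = DAGSpec.find()   # locates pipeline.yaml starting from the current directory
dag = spec.to_dag()     # convert the spec to a DAG object
dag.build()             # execute the pipeline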
Example #2
def _default_spec_load(starting_dir=None, lazy_import=False, reload=False):
    """
    NOTE: this is a private API. Use DAGSpec.find() instead

    Looks for a pipeline.yaml, generates a DAGSpec, and returns it.
    Currently, this is only used by the PloomberContentsManager and is not
    intended to be a public API: initializing specs from paths where we have
    to recursively look for a pipeline.yaml involves some considerations
    regarding relative paths that make this confusing; inside the contents
    manager, all of those things are handled for that use case.

    The pipeline.yaml parent folder is temporarily added to sys.path when
    calling DAGSpec.to_dag() to make sure imports work as expected

    Returns the DAGSpec, the directory where the pipeline.yaml file is
    located, and the path to the pipeline.yaml file.
    """
    root_path = starting_dir or os.getcwd()
    path_to_entry_point = default.entry_point(root_path=root_path)

    try:
        spec = DAGSpec(path_to_entry_point,
                       env=None,
                       lazy_import=lazy_import,
                       reload=reload)

        path_to_spec = Path(path_to_entry_point)
        return spec, path_to_spec.parent, path_to_spec

    except Exception as e:
        exc = DAGSpecInitializationError('Error initializing DAG from '
                                         f'{path_to_entry_point!s}')
        raise exc from e
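
A hedged sketch of how the returned tuple might be unpacked by a caller; the import path below is an assumption, since the function is private:

# hypothetical import path for the private helper shown above
from ploomber.spec.dagspec import _default_spec_load

spec, parent_dir, path_to_spec = _default_spec_load()
dag = spec.to_dag()  # parent_dir is the folder containing pipeline.yaml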
Example #3
def _try_product_init(class_, path_to_source, kwargs):
    """
    Try to initialize a product; raises a chained exception if not possible,
    to provide more context.
    """
    try:
        return class_(path_to_source, **kwargs)
    except Exception as e:
        kwargs_msg = f' and keyword arguments: {kwargs!r}' if kwargs else ''
        raise DAGSpecInitializationError(
            f'Error initializing {class_.__name__} with source: '
            f'{path_to_source!r}' + kwargs_msg) from e
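
For illustration, a small hedged call to the helper above using Ploomber's File product; the path is made up:

from ploomber.products import File

# returns File('output/data.csv'); on failure, a DAGSpecInitializationError
# is raised and chained to the original exception
product = _try_product_init(File, 'output/data.csv', {})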
Example #4
    def _to_dag(self):
        """
        Internal method to manage the different cases to convert to a DAG
        object
        """
        if 'location' in self:
            return dotted_path.call_dotted_path(self['location'])

        dag = DAG()

        if 'config' in self:
            dag._params = DAGConfiguration.from_dict(self['config'])

        if 'executor' in self:
            executor = self['executor']

            if isinstance(executor,
                          str) and executor in {'serial', 'parallel'}:
                if executor == 'parallel':
                    dag.executor = Parallel()
            elif isinstance(executor, Mapping):
                dag.executor = dotted_path.DottedPath(
                    executor, lazy_load=False, allow_return_none=False)()
            else:
                raise DAGSpecInitializationError(
                    '"executor" must be '
                    '"serial", "parallel", or a dotted path'
                    f', got: {executor!r}')

        clients = self.get('clients')

        if clients:
            for class_name, dotted_path_spec in clients.items():
                dps = dotted_path.DottedPath(dotted_path_spec,
                                             lazy_load=self._lazy_import,
                                             allow_return_none=False)

                if self._lazy_import:
                    dag.clients[class_name] = dps
                else:
                    dag.clients[class_name] = dps()

        for attr in ('serializer', 'unserializer', 'on_finish', 'on_render',
                     'on_failure'):
            if attr in self:
                setattr(
                    dag, attr,
                    dotted_path.DottedPath(self[attr],
                                           lazy_load=self._lazy_import))

        process_tasks(dag, self, root_path=self._parent_path)

        return dag
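
The keys handled above ('executor', 'clients', 'serializer', hooks) correspond to top-level entries in the spec. A minimal sketch, assuming DAGSpec accepts a dictionary and that the dotted path below exists in the user's project:

spec = DAGSpec({
    'executor': 'parallel',  # or a dotted path that returns an executor
    'clients': {
        # hypothetical dotted path returning a client instance
        'SQLScript': 'clients.get_db_client',
    },
    'tasks': [
        {'source': 'tasks/load.py', 'product': 'output/load.ipynb'},
    ],
})
dag = spec.to_dag()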
Example #5
def _find_product_class(task_class, task_dict, meta):
    key = 'product_default_class.' + task_class.__name__
    meta_product_default_class = get_value_at(meta, key)

    if 'product_class' in task_dict:
        return validate_product_class_name(task_dict.pop('product_class'))
    elif meta_product_default_class:
        return validate_product_class_name(meta_product_default_class)
    else:
        raise DAGSpecInitializationError(
            f'Could not determine a product class for task: '
            f'{task_dict!r}. Add an explicit value in the '
            '"product_class"')
Example #6
    def __init__(self,
                 data,
                 env=None,
                 lazy_import=False,
                 reload=False,
                 parent_path=None):
        if isinstance(data, (str, Path)):
            if parent_path is not None:
                raise ValueError('parent_path must be None when '
                                 f'initializing {type(self).__name__} with '
                                 'a path to a YAML spec')
            # this is only used to display an error message with the path
            # to the loaded file
            path_for_errors = data
            # resolve the parent path to make sources and products unambiguous
            # even if the current working directory changes
            path_to_entry_point = Path(data).resolve()
            self._parent_path = str(path_to_entry_point.parent)

            content = Path(data).read_text()

            try:
                data = yaml.safe_load(content)
            except (yaml.parser.ParserError,
                    yaml.constructor.ConstructorError) as e:
                error = e
            else:
                error = None

            if error:
                if '{{' in content or '}}' in content:
                    raise DAGSpecInitializationError(
                        'Failed to initialize spec. It looks like '
                        'you\'re using placeholders (i.e. {{placeholder}}). '
                        'Make sure values are enclosed in quotation marks '
                        '(e.g. key: "{{placeholder}}"). Original '
                        'parser error:\n\n'
                        f'{error}')
                else:
                    raise error

        else:
            path_for_errors = None
            # FIXME: add test cases, some of those features won't work if
            # _parent_path is None. We should make sure that we either raise
            # an error if _parent_path is needed or use the current working
            # directory if it's appropriate - this is mostly to make relative
            # paths consistent: they should be relative to the file that
            # contains them
            self._parent_path = (None if not parent_path else str(
                Path(parent_path).resolve()))

        # try to look for env.yaml in default locations
        env_default_path = default.path_to_env(self._parent_path)

        self.data = data

        if isinstance(self.data, list):
            self.data = {'tasks': self.data}

        # validate keys defined at the top (nested keys are not validated here)
        self._validate_top_keys(self.data, path_for_errors)

        logger.debug('DAGSpec environment:\n%s', pp.pformat(env))

        env = env or dict()

        # NOTE: when loading from a path, EnvDict recursively looks at parent
        # folders. This is useful when loading envs in nested directories
        # where scripts/functions need the env, but here, since we just need
        # this for the spec, we might want to turn it off. Should we add a
        # parameter to EnvDict to control this?
        if env_default_path:
            defaults = yaml.safe_load(Path(env_default_path).read_text())
            self.env = EnvDict(env,
                               path_to_here=self._parent_path,
                               defaults=defaults)
        else:
            self.env = EnvDict(env, path_to_here=self._parent_path)

        self.data = expand_raw_dictionary(self.data, self.env)

        logger.debug('Expanded DAGSpec:\n%s', pp.pformat(data))

        # if there is a "location" top key, we don't have to do anything else
        # as we will just load the dotted path when .to_dag() is called
        if 'location' not in self.data:

            Meta.initialize_inplace(self.data)

            import_tasks_from = self.data['meta']['import_tasks_from']

            if import_tasks_from is not None:
                # when using a relative path in "import_tasks_from", we must
                # make it absolute...
                if not Path(import_tasks_from).is_absolute():
                    # use _parent_path if there is one
                    if self._parent_path:
                        self.data['meta']['import_tasks_from'] = str(
                            Path(self._parent_path, import_tasks_from))
                    # otherwise just make it absolute
                    else:
                        self.data['meta']['import_tasks_from'] = str(
                            Path(import_tasks_from).resolve())

                imported = yaml.safe_load(
                    Path(self.data['meta']['import_tasks_from']).read_text())

                if self.env is not None:
                    imported = expand_raw_dictionaries(imported, self.env)

                # relative paths here are relative to the file where they
                # are declared
                base_path = Path(self.data['meta']['import_tasks_from']).parent

                for task in imported:
                    add_base_path_to_source_if_relative(task,
                                                        base_path=base_path)

                self.data['tasks'].extend(imported)

            self.data['tasks'] = [
                normalize_task(task) for task in self.data['tasks']
            ]

            # make sure the folder where the pipeline is located is in sys.path
            # otherwise dynamic imports needed by TaskSpec will fail
            with add_to_sys_path(self._parent_path, chdir=False):
                self.data['tasks'] = [
                    TaskSpec(t,
                             self.data['meta'],
                             project_root=self._parent_path,
                             lazy_import=lazy_import,
                             reload=reload) for t in self.data['tasks']
                ]
        else:
            self.data['meta'] = Meta.empty()
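
The constructor above accepts either a path to a YAML spec or an already-parsed structure; a brief hedged sketch of both modes (file names are illustrative):

# from a file: _parent_path is derived from the file's location
spec = DAGSpec('pipeline.yaml', env={'sample': True})

# from a list: it is wrapped as {'tasks': [...]} automatically
spec = DAGSpec([{'source': 'tasks/clean.py', 'product': 'output/clean.ipynb'}])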
Example #7
def _init_task(data, meta, project_root, lazy_import, dag):
    """Initialize a single task from a dictionary spec
    """
    task_dict = copy(data)
    class_ = task_dict.pop('class')

    product = _init_product(task_dict,
                            meta,
                            class_,
                            project_root,
                            lazy_import=lazy_import)

    _init_client(task_dict, lazy_import=lazy_import)

    source = task_dict.pop('source')

    name = task_dict.pop('name', None)

    on_finish = task_dict.pop('on_finish', None)
    on_render = task_dict.pop('on_render', None)
    on_failure = task_dict.pop('on_failure', None)

    if 'serializer' in task_dict:
        task_dict['serializer'] = dotted_path.DottedPath(
            task_dict['serializer'], lazy_load=lazy_import)

    if 'unserializer' in task_dict:
        task_dict['unserializer'] = dotted_path.DottedPath(
            task_dict['unserializer'], lazy_load=lazy_import)

    # edge case: if using lazy_import, we should not check if the kernel
    # is installed. This is used when exporting to Argo/Airflow using
    # soopervisor, since the exporting process should not require the ir
    # kernel to be installed. The same applies when Airflow has to convert
    # the DAG: the Airflow environment shouldn't require the ir kernel
    if (class_ == tasks.NotebookRunner and lazy_import
            and 'check_if_kernel_installed' not in task_dict):
        task_dict['check_if_kernel_installed'] = False

    # make paths to resources absolute
    if 'params' in task_dict:
        task_dict['params'] = resolve_resources(task_dict['params'],
                                                relative_to=project_root)

    try:
        task = class_(source=source,
                      product=product,
                      name=name,
                      dag=dag,
                      **task_dict)
    except Exception as e:
        msg = (f'Failed to initialize {class_.__name__} task with '
               f'source {str(source)!r}.')
        raise DAGSpecInitializationError(msg) from e

    if on_finish:
        task.on_finish = dotted_path.DottedPath(on_finish,
                                                lazy_load=lazy_import)

    if on_render:
        task.on_render = dotted_path.DottedPath(on_render,
                                                lazy_load=lazy_import)

    if on_failure:
        task.on_failure = dotted_path.DottedPath(on_failure,
                                                 lazy_load=lazy_import)

    return task
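
A hedged sketch of a task dictionary exercising the optional keys handled above; the dotted paths are assumptions about the user's project layout:

task = {
    'source': 'tasks/features.py',
    'product': 'output/features.parquet',
    'serializer': 'my_project.io.save',     # hypothetical dotted paths
    'unserializer': 'my_project.io.load',
    'on_finish': 'my_project.hooks.notify',
}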
Example #8
    def to_task(self, dag):
        """
        Convert the spec to a Task or TaskGroup and add it to the dag.
        Returns a (task, upstream) tuple with the Task instance and list of
        upstream dependencies (as described in the 'upstream' key, if any,
        empty if no 'upstream' key). If the spec has a 'grid' key, a TaskGroup
        instance is returned instead.

        Parameters
        ----------
        dag
            The DAG to add the task(s) to
        """
        data = copy(self.data)
        upstream = _make_iterable(data.pop('upstream'))

        if 'grid' in data:
            data_source_ = data["source"]
            data_source = str(data_source_ if not hasattr(
                data_source_, '__name__') else data_source_.__name__)

            if 'params' in data:
                raise DAGSpecInitializationError(
                    'Error initializing task with '
                    f'source {data_source!r}: '
                    '\'params\' is not allowed when using \'grid\'')

            if 'name' not in data:
                raise DAGSpecInitializationError(
                    f'Error initializing task with '
                    f'source {data_source!r}: '
                    'tasks with \'grid\' must have a \'name\'')

            task_class = data.pop('class')
            product_class = _find_product_class(task_class, data, self.meta)
            product = data.pop('product')
            name = data.pop('name')
            grid = data.pop('grid')

            # hooks
            on_render = data.pop('on_render', None)
            on_finish = data.pop('on_finish', None)
            on_failure = data.pop('on_failure', None)

            if on_render:
                on_render = dotted_path.DottedPath(on_render,
                                                   lazy_load=self.lazy_import)

            if on_finish:
                on_finish = dotted_path.DottedPath(on_finish,
                                                   lazy_load=self.lazy_import)

            if on_failure:
                on_failure = dotted_path.DottedPath(on_failure,
                                                    lazy_load=self.lazy_import)

            return TaskGroup.from_grid(task_class=task_class,
                                       product_class=product_class,
                                       product_primitive=product,
                                       task_kwargs=data,
                                       dag=dag,
                                       name=name,
                                       grid=grid,
                                       resolve_relative_to=self.project_root,
                                       on_render=on_render,
                                       on_finish=on_finish,
                                       on_failure=on_failure), upstream
        else:
            return _init_task(data=data,
                              meta=self.meta,
                              project_root=self.project_root,
                              lazy_import=self.lazy_import,
                              dag=dag), upstream
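
When a 'grid' key is present, one task is created per combination of parameter values, which is why an explicit 'name' is required. A hedged spec fragment (names and values are illustrative):

task = {
    'source': 'tasks/fit.py',
    'product': 'output/model.pickle',
    'name': 'fit-',   # used as the prefix for the generated task names
    'grid': {'n_estimators': [50, 100], 'learning_rate': [0.01, 0.1]},
}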
Example #9
    def _init(self, data, env, lazy_import, reload, parent_path,
              look_up_project_root_recursively):
        self._lazy_import = lazy_import

        # initialized with a path to a yaml file...
        if isinstance(data, (str, Path)):
            # TODO: test this
            if parent_path is not None:
                raise ValueError('parent_path must be None when '
                                 f'initializing {type(self).__name__} with '
                                 'a path to a YAML spec')
            # resolve the parent path to make sources and products unambiguous
            # even if the current working directory changes
            self._path = Path(data).resolve()
            self._parent_path = str(self._path.parent)

            if not Path(data).is_file():
                raise FileNotFoundError(
                    'Error initializing DAGSpec with argument '
                    f'{data!r}: Expected it to be a path to a YAML file, but '
                    'such file does not exist')

            content = Path(data).read_text()

            try:
                data = yaml.safe_load(content)
            except (yaml.parser.ParserError,
                    yaml.constructor.ConstructorError) as e:
                error = e
            else:
                error = None

            if error:
                if '{{' in content or '}}' in content:
                    raise DAGSpecInitializationError(
                        'Failed to initialize spec. It looks like '
                        'you\'re using placeholders (i.e. {{placeholder}}). '
                        'Make sure values are enclosed in quotation marks '
                        '(e.g. key: "{{placeholder}}"). Original '
                        'parser error:\n\n'
                        f'{error}')
                else:
                    raise error

        # initialized with a dictionary...
        else:
            self._path = None
            # FIXME: add test cases, some of those features won't work if
            # _parent_path is None. We should make sure that we either raise
            # an error if _parent_path is needed or use the current working
            # directory if it's appropriate - this is mostly to make relative
            # paths consistent: they should be relative to the file that
            # contains them
            self._parent_path = (None if not parent_path else str(
                Path(parent_path).resolve()))

        self.data = data

        if isinstance(self.data, list):
            self.data = {'tasks': self.data}

        # validate keys defined at the top (nested keys are not validated here)
        self._validate_top_keys(self.data, self._path)

        logger.debug('DAGSpec environment:\n%s', pp.pformat(env))

        env = env or dict()
        path_to_defaults = default.path_to_env_from_spec(
            path_to_spec=self._path)

        if path_to_defaults:
            defaults = yaml.safe_load(Path(path_to_defaults).read_text())
            self.env = EnvDict(env,
                               path_to_here=self._parent_path,
                               defaults=defaults)
        else:
            self.env = EnvDict(env, path_to_here=self._parent_path)

        self.data, tags = expand_raw_dictionary_and_extract_tags(
            self.data, self.env)

        logger.debug('Expanded DAGSpec:\n%s', pp.pformat(data))

        # if there is a "location" top key, we don't have to do anything else
        # as we will just load the dotted path when .to_dag() is called
        if 'location' not in self.data:

            Meta.initialize_inplace(self.data)

            import_tasks_from = self.data['meta']['import_tasks_from']

            if import_tasks_from is not None:
                # when using a relative path in "import_tasks_from", we must
                # make it absolute...
                if not Path(import_tasks_from).is_absolute():
                    # use _parent_path if there is one
                    if self._parent_path:
                        self.data['meta']['import_tasks_from'] = str(
                            Path(self._parent_path, import_tasks_from))
                    # otherwise just make it absolute
                    else:
                        self.data['meta']['import_tasks_from'] = str(
                            Path(import_tasks_from).resolve())

                imported = yaml.safe_load(
                    Path(self.data['meta']['import_tasks_from']).read_text())

                if self.env is not None:
                    (imported,
                     tags_other) = expand_raw_dictionaries_and_extract_tags(
                         imported, self.env)
                    tags = tags | tags_other

                # relative paths here are relative to the file where they
                # are declared
                base_path = Path(self.data['meta']['import_tasks_from']).parent

                for task in imported:
                    add_base_path_to_source_if_relative(task,
                                                        base_path=base_path)

                self.data['tasks'].extend(imported)

            # check if there are any params declared in env but not used in
            # the pipeline
            extra = set(self.env) - self.env.default_keys - tags

            if extra:
                warnings.warn('The following placeholders are declared in the '
                              'environment but '
                              f'unused in the spec: {extra}')

            self.data['tasks'] = [
                normalize_task(task) for task in self.data['tasks']
            ]

            # NOTE: for simple projects, project root is the parent folder
            # of pipeline.yaml; for package projects it is the parent folder
            # of setup.py
            if look_up_project_root_recursively:
                project_root = (
                    None if not self._parent_path else
                    default.find_root_recursively(
                        starting_dir=self._parent_path,
                        filename=None if not self._path else self._path.name))
            else:
                project_root = self._parent_path

            # make sure the folder where the pipeline is located is in sys.path
            # otherwise dynamic imports needed by TaskSpec will fail
            with add_to_sys_path(self._parent_path, chdir=False):
                self.data['tasks'] = [
                    TaskSpec(t,
                             self.data['meta'],
                             project_root=project_root,
                             lazy_import=lazy_import,
                             reload=reload) for t in self.data['tasks']
                ]
        else:
            self.data['meta'] = Meta.empty()
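
The extra-placeholder check above warns when env declares values never referenced by the spec; a small hedged illustration (assuming 'threshold' appears as {{threshold}} somewhere in pipeline.yaml):

# 'unused_key' is never referenced in the spec, so a warning is emitted
spec = DAGSpec('pipeline.yaml', env={'threshold': 0.5, 'unused_key': 1})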
Example #10
def process_tasks(dag, dag_spec, root_path=None):
    """
    Initialize Task objects from TaskSpecs, extract products and dependencies
    if needed, and set the dag dependency structure
    """
    root_path = root_path or '.'

    # options
    extract_up = dag_spec['meta']['extract_upstream']
    extract_prod = dag_spec['meta']['extract_product']

    # raw values extracted from the upstream key
    upstream_raw = {}

    # first pass: init tasks and add them to dag
    for task_dict in dag_spec['tasks']:
        # init source to extract product
        fn = task_dict['class']._init_source
        kwargs = {'kwargs': {}, **task_dict}
        source = call_with_dictionary(fn, kwargs=kwargs)

        if extract_prod:
            task_dict['product'] = source.extract_product()

        # convert to task, up has the content of "upstream" if any
        task, up = task_dict.to_task(dag)

        if isinstance(task, TaskGroup):
            for t in task:
                upstream_raw[t] = up
        else:
            if extract_prod:
                logger.debug('Extracted product for task "%s": %s', task.name,
                             task.product)
            upstream_raw[task] = up

    # second optional pass: extract upstream
    tasks = list(dag.values())
    task_names = list(dag._iter())
    # actual upstream values after matching wildcards
    upstream = {}

    # expand upstream dependencies (in case there are any wildcards)
    for task in tasks:
        if extract_up:
            upstream[task] = _expand_upstream(task.source.extract_upstream(),
                                              task_names)
        else:
            upstream[task] = _expand_upstream(upstream_raw[task], task_names)

        logger.debug('Extracted upstream dependencies for task %s: %s',
                     task.name, upstream[task])

    # Last pass: set upstream dependencies
    for task in tasks:
        if upstream[task]:
            for task_name, group_name in upstream[task].items():

                up = dag.get(task_name)

                if up is None:
                    names = [t.name for t in tasks]
                    raise DAGSpecInitializationError(
                        f'Task {task.name!r} '
                        'has an upstream dependency '
                        f'{task_name!r}, but such task '
                        'doesn\'t exist. Available tasks: '
                        f'{pretty_print.iterable(names)}')

                task.set_upstream(up, group_name=group_name)
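
The wildcard expansion above lets an 'upstream' entry match several task names at once. A hedged spec fragment (task and file names are illustrative):

tasks = [
    {'source': 'tasks/fit.py', 'name': 'fit-', 'product': 'output/model.pickle',
     'grid': {'model': ['rf', 'gbm']}},
    # 'fit-*' matches every task whose name starts with "fit-"
    {'source': 'tasks/compare.py', 'product': 'output/compare.ipynb',
     'upstream': ['fit-*']},
]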