def find(cls,
         env=None,
         reload=False,
         lazy_import=False,
         starting_dir=None,
         name=None):
    """
    Locate a spec entry point and build a spec object from it. According to
    the original documentation, the result is a DAGSpec that can be
    converted to a DAG using .to_dag()

    The entry point is resolved with ``default.entry_point_with_name`` and
    the resulting path is handed to ``cls`` together with the remaining
    arguments.

    Parameters
    ----------
    env
        The environment to pass to the spec

    reload
        Passed unchanged to ``cls``

    lazy_import
        Passed unchanged to ``cls``

    starting_dir
        Directory used as ``root_path`` when resolving the entry point. The
        current working directory is used when this value is falsy

    name : str, default=None
        Filename to search for. If None, it looks for a pipeline.yaml file,
        otherwise it looks for a file with such name.

    Raises
    ------
    DAGSpecInitializationError
        If ``cls`` raises any exception; the original exception is chained
        as the cause
    """
    search_root = starting_dir if starting_dir else os.getcwd()
    entry_point = default.entry_point_with_name(root_path=search_root,
                                                name=name)

    try:
        spec = cls(entry_point,
                   env=env,
                   lazy_import=lazy_import,
                   reload=reload)
    except Exception as cause:
        raise DAGSpecInitializationError(
            'Error initializing DAG from '
            f'{entry_point!s}') from cause

    return spec
def _default_spec_load(starting_dir=None, lazy_import=False, reload=False):
    """
    NOTE: this is a private API. Use DAGSpec.find() instead

    Resolves the default entry point with ``default.entry_point`` and loads
    it as a DAGSpec (always with ``env=None``).

    Per the original note, this is only used by the PloomberContentsManager
    and is not intended to be public: initializing specs from paths that
    are searched for recursively has some considerations regarding relative
    paths that make it confusing, and the contents manager takes care of
    those for its own use case.

    Parameters
    ----------
    starting_dir
        Directory used as ``root_path`` for the lookup. The current working
        directory is used when this value is falsy

    lazy_import
        Passed unchanged to DAGSpec

    reload
        Passed unchanged to DAGSpec

    Returns
    -------
    tuple
        Three elements: the DAGSpec, the directory that contains the entry
        point and the entry point itself (the last two are pathlib.Path
        objects)

    Raises
    ------
    DAGSpecInitializationError
        If anything fails while loading the spec; the original exception is
        chained as the cause
    """
    search_root = os.getcwd() if not starting_dir else starting_dir
    entry_point = default.entry_point(root_path=search_root)

    try:
        loaded = DAGSpec(entry_point,
                         env=None,
                         lazy_import=lazy_import,
                         reload=reload)
        spec_location = Path(entry_point)
        return loaded, spec_location.parent, spec_location
    except Exception as cause:
        raise DAGSpecInitializationError(
            'Error initializing DAG from '
            f'{entry_point!s}') from cause
def _try_product_init(class_, path_to_source, kwargs):
    """
    Call ``class_(path_to_source, **kwargs)`` and return the result.

    Any exception raised by that call is re-raised as a
    DAGSpecInitializationError (chained to the original one) whose message
    names the class, the source and, when ``kwargs`` is not empty, the
    keyword arguments, so the user gets more context.
    """
    try:
        product = class_(path_to_source, **kwargs)
    except Exception as cause:
        # only mention the keyword arguments if there are any
        suffix = ''

        if kwargs:
            suffix = f' and keyword arguments: {kwargs!r}'

        message = (f'Error initializing {class_.__name__} with source: '
                   f'{path_to_source!r}{suffix}')
        raise DAGSpecInitializationError(message) from cause

    return product
def _to_dag(self):
    """
    Internal method that builds the DAG object described by this spec

    If the spec has a 'location' key, the dotted path stored there is
    called and its result is returned as is. Otherwise, a new DAG is
    created and configured from the 'config', 'executor', 'clients' and
    hook/serializer keys (when present) and the tasks are added with
    ``process_tasks``.

    Raises
    ------
    DAGSpecInitializationError
        If 'executor' is neither "serial", "parallel" nor a mapping
    """
    if 'location' in self:
        return dotted_path.call_dotted_path(self['location'])

    dag = DAG()

    if 'config' in self:
        dag._params = DAGConfiguration.from_dict(self['config'])

    if 'executor' in self:
        executor = self['executor']

        if isinstance(executor, str) and executor in ('serial', 'parallel'):
            # "serial" leaves whatever executor DAG() was created with
            if executor == 'parallel':
                dag.executor = Parallel()
        elif isinstance(executor, Mapping):
            # a mapping is treated as a dotted path spec, it is loaded and
            # called right away to obtain the executor
            executor_factory = dotted_path.DottedPath(
                executor, lazy_load=False, allow_return_none=False)
            dag.executor = executor_factory()
        else:
            raise DAGSpecInitializationError(
                '"executor" must be '
                '"serial", "parallel", or a dotted path'
                f', got: {executor!r}')

    for class_name, dotted_path_spec in (self.get('clients') or {}).items():
        dps = dotted_path.DottedPath(dotted_path_spec,
                                     lazy_load=self._lazy_import,
                                     allow_return_none=False)
        # with lazy imports we keep the dotted path object, otherwise we
        # call it and store whatever it returns
        dag.clients[class_name] = dps if self._lazy_import else dps()

    dag_level_attrs = ('serializer', 'unserializer', 'on_finish',
                       'on_render', 'on_failure')

    for attr in dag_level_attrs:
        if attr not in self:
            continue

        setattr(
            dag, attr,
            dotted_path.DottedPath(self[attr], lazy_load=self._lazy_import))

    process_tasks(dag, self, root_path=self._parent_path)

    return dag
def _find_product_class(task_class, task_dict, meta):
    """
    Determine the product class for a task spec

    An explicit 'product_class' in ``task_dict`` wins (the key is removed
    from ``task_dict``), otherwise the value stored in ``meta`` under
    'product_default_class.{task class name}' is used. Either value goes
    through ``validate_product_class_name`` before being returned.

    Raises
    ------
    DAGSpecInitializationError
        If there is no explicit value and no (truthy) default in ``meta``
    """
    # NOTE: the default is looked up even if there is an explicit value
    default_class = get_value_at(
        meta, 'product_default_class.' + task_class.__name__)

    if 'product_class' in task_dict:
        return validate_product_class_name(task_dict.pop('product_class'))

    if default_class:
        return validate_product_class_name(default_class)

    raise DAGSpecInitializationError(
        'Could not determine a product class for task: '
        f'{task_dict!r}. Add an explicit value in the '
        '"product_class"')
def __init__(self,
             data,
             env=None,
             lazy_import=False,
             reload=False,
             parent_path=None):
    """
    Load, validate and expand a spec

    Parameters
    ----------
    data
        Either a path (str or pathlib.Path) to a YAML spec or the spec
        content already loaded. A list is interpreted as the list of tasks
        and wrapped as ``{'tasks': data}``

    env
        Values passed to EnvDict, which is then used to expand the
        placeholders in the spec. A falsy value is replaced by an empty
        dictionary

    lazy_import
        Passed unchanged to every TaskSpec

    reload
        Passed unchanged to every TaskSpec

    parent_path
        Folder used to resolve relative paths when ``data`` is not a path.
        Must be None when ``data`` is a path, since in such case the parent
        folder of the file is used

    Raises
    ------
    ValueError
        If ``data`` is a path and ``parent_path`` is not None

    DAGSpecInitializationError
        If the YAML file cannot be parsed and its content has '{{' or '}}'
        (otherwise the original parser/constructor error is raised)
    """
    if isinstance(data, (str, Path)):
        if parent_path is not None:
            raise ValueError('parent_path must be None when '
                             f'initializing {type(self).__name__} with '
                             'a path to a YAML spec')

        # this is only used to display an error message with the path
        # to the loaded file
        path_for_errors = data

        # resolve the parent path to make sources and products unambiguous
        # even if the current working directory changes
        path_to_entry_point = Path(data).resolve()
        self._parent_path = str(path_to_entry_point.parent)

        content = Path(data).read_text()

        # the error is stored and handled outside the except block, so the
        # exception raised below is not implicitly chained to this one
        try:
            data = yaml.safe_load(content)
        except (yaml.parser.ParserError,
                yaml.constructor.ConstructorError) as e:
            error = e
        else:
            error = None

        if error:
            if '{{' in content or '}}' in content:
                # NOTE(review): the message says "parentheses" but the
                # example it shows uses double quotes - confirm the wording
                raise DAGSpecInitializationError(
                    'Failed to initialize spec. It looks like '
                    'you\'re using placeholders (i.e. {{placeholder}}). '
                    'Make sure values are enclosed in parentheses '
                    '(e.g. key: "{{placeholder}}"). Original '
                    'parser error:\n\n'
                    f'{error}')
            else:
                raise error
    else:
        path_for_errors = None
        # FIXME: add test cases, some of those features wont work if
        # _parent_path is None. We should make sure that we either raise
        # an error if _parent_path is needed or use the current working
        # directory if it's appropriate - this is mostly to make relative
        # paths consistent: they should be relative to the file that
        # contains them
        self._parent_path = (None if not parent_path else str(
            Path(parent_path).resolve()))

    # try to look env.yaml in default locations
    env_default_path = default.path_to_env(self._parent_path)

    self.data = data

    if isinstance(self.data, list):
        self.data = {'tasks': self.data}

    # validate keys defined at the top (nested keys are not validated here)
    self._validate_top_keys(self.data, path_for_errors)

    logger.debug('DAGSpec enviroment:\n%s', pp.pformat(env))

    env = env or dict()

    # NOTE: when loading from a path, EnvDict recursively looks
    # at parent folders, this is useful when loading envs
    # in nested directories where scripts/functions need the env
    # but here, since we just need this for the spec, we might
    # want to turn it off. should we add a parameter to EnvDict
    # to control this?
    if env_default_path:
        defaults = yaml.safe_load(Path(env_default_path).read_text())
        self.env = EnvDict(env,
                           path_to_here=self._parent_path,
                           defaults=defaults)
    else:
        self.env = EnvDict(env, path_to_here=self._parent_path)

    self.data = expand_raw_dictionary(self.data, self.env)

    # log the expanded spec (self.data), the local "data" variable still
    # holds the content as it was before expanding placeholders
    logger.debug('Expanded DAGSpec:\n%s', pp.pformat(self.data))

    # if there is a "location" top key, we don't have to do anything else
    # as we will just load the dotted path when .to_dag() is called
    if 'location' not in self.data:

        Meta.initialize_inplace(self.data)

        import_tasks_from = self.data['meta']['import_tasks_from']

        if import_tasks_from is not None:
            # when using a relative path in "import_tasks_from", we must
            # make it absolute...
            if not Path(import_tasks_from).is_absolute():
                # use _parent_path if there is one
                if self._parent_path:
                    self.data['meta']['import_tasks_from'] = str(
                        Path(self._parent_path, import_tasks_from))
                # otherwise just make it absolute
                else:
                    self.data['meta']['import_tasks_from'] = str(
                        Path(import_tasks_from).resolve())

            imported = yaml.safe_load(
                Path(self.data['meta']['import_tasks_from']).read_text())

            if self.env is not None:
                imported = expand_raw_dictionaries(imported, self.env)

            # relative paths here are relative to the file where they
            # are declared
            base_path = Path(self.data['meta']['import_tasks_from']).parent

            for task in imported:
                add_base_path_to_source_if_relative(task,
                                                    base_path=base_path)

            self.data['tasks'].extend(imported)

        self.data['tasks'] = [
            normalize_task(task) for task in self.data['tasks']
        ]

        # make sure the folder where the pipeline is located is in sys.path
        # otherwise dynamic imports needed by TaskSpec will fail
        with add_to_sys_path(self._parent_path, chdir=False):
            self.data['tasks'] = [
                TaskSpec(t,
                         self.data['meta'],
                         project_root=self._parent_path,
                         lazy_import=lazy_import,
                         reload=reload) for t in self.data['tasks']
            ]
    else:
        self.data['meta'] = Meta.empty()
def _init_task(data, meta, project_root, lazy_import, dag):
    """
    Build one task object from its dictionary spec

    Works on a shallow copy of ``data``. The 'class', 'source', 'name' and
    hook keys are removed from the copy (``_init_product`` and
    ``_init_client`` also receive the copy) and whatever is left is passed
    to the task class as keyword arguments.

    Raises
    ------
    DAGSpecInitializationError
        If the task class raises when called; the original exception is
        chained as the cause
    """
    spec = copy(data)
    class_ = spec.pop('class')

    product = _init_product(spec,
                            meta,
                            class_,
                            project_root,
                            lazy_import=lazy_import)

    _init_client(spec, lazy_import=lazy_import)

    source = spec.pop('source')
    name = spec.pop('name', None)

    # hooks are not constructor arguments, they are set after the task is
    # created
    hook_names = ('on_finish', 'on_render', 'on_failure')
    hooks = {hook: spec.pop(hook, None) for hook in hook_names}

    for key in ('serializer', 'unserializer'):
        if key in spec:
            spec[key] = dotted_path.DottedPath(spec[key],
                                               lazy_load=lazy_import)

    # edge case: with lazy_import we should not check if the kernel is
    # installed (unless the user explicitly set the flag). this is used when
    # exporting to Argo/Airflow using soopervisor: neither the exporting
    # process nor the Airflow environment that converts the DAG should
    # require to have the ir kernel installed
    if (class_ == tasks.NotebookRunner and lazy_import
            and 'check_if_kernel_installed' not in spec):
        spec['check_if_kernel_installed'] = False

    # make paths to resources absolute
    if 'params' in spec:
        spec['params'] = resolve_resources(spec['params'],
                                           relative_to=project_root)

    try:
        task = class_(source=source,
                      product=product,
                      name=name,
                      dag=dag,
                      **spec)
    except Exception as cause:
        raise DAGSpecInitializationError(
            f'Failed to initialize {class_.__name__} task with '
            f'source {str(source)!r}.') from cause

    for hook in hook_names:
        hook_spec = hooks[hook]

        if hook_spec:
            setattr(
                task, hook,
                dotted_path.DottedPath(hook_spec, lazy_load=lazy_import))

    return task
def to_task(self, dag):
    """
    Convert the spec to a Task or TaskGroup and add it to the dag.

    Works on a shallow copy of ``self.data``, which must have an 'upstream'
    key. If the spec has a 'grid' key, the task(s) are created with
    ``TaskGroup.from_grid``, otherwise with ``_init_task``.

    Parameters
    ----------
    dag
        The DAG to add the task(s) to

    Returns
    -------
    tuple
        Two elements: whatever ``TaskGroup.from_grid`` or ``_init_task``
        returned and the value of the 'upstream' key after passing it
        through ``_make_iterable``

    Raises
    ------
    DAGSpecInitializationError
        If the spec has 'grid' and also 'params', or 'grid' but no 'name'
    """
    spec = copy(self.data)
    upstream = _make_iterable(spec.pop('upstream'))

    if 'grid' not in spec:
        task = _init_task(data=spec,
                          meta=self.meta,
                          project_root=self.project_root,
                          lazy_import=self.lazy_import,
                          dag=dag)
        return task, upstream

    # for error messages: show the name if the source has one (e.g., a
    # function), otherwise the source itself
    raw_source = spec["source"]
    source_label = str(getattr(raw_source, '__name__', raw_source))

    if 'params' in spec:
        raise DAGSpecInitializationError(
            'Error initializing task with '
            f'source {source_label!r}: '
            "'params' is not allowed when using 'grid'")

    if 'name' not in spec:
        raise DAGSpecInitializationError(
            'Error initializing task with '
            f'source {source_label!r}: '
            "tasks with 'grid' must have a 'name'")

    task_class = spec.pop('class')
    product_class = _find_product_class(task_class, spec, self.meta)
    product = spec.pop('product')
    name = spec.pop('name')
    grid = spec.pop('grid')

    # hooks: wrap them as dotted paths, falsy values are passed as they are
    hooks = {}

    for hook in ('on_render', 'on_finish', 'on_failure'):
        hook_spec = spec.pop(hook, None)

        if hook_spec:
            hook_spec = dotted_path.DottedPath(hook_spec,
                                               lazy_load=self.lazy_import)

        hooks[hook] = hook_spec

    # whatever is left in the spec is passed as task keyword arguments
    group = TaskGroup.from_grid(task_class=task_class,
                                product_class=product_class,
                                product_primitive=product,
                                task_kwargs=spec,
                                dag=dag,
                                name=name,
                                grid=grid,
                                resolve_relative_to=self.project_root,
                                **hooks)

    return group, upstream
def _init(self, data, env, lazy_import, reload, parent_path,
          look_up_project_root_recursively):
    """
    Load, validate and expand a spec

    Parameters
    ----------
    data
        Either a path (str or pathlib.Path) to an existing YAML spec or the
        spec content already loaded. A list is interpreted as the list of
        tasks and wrapped as ``{'tasks': data}``

    env
        Values passed to EnvDict, which is then used to expand the
        placeholders in the spec. A falsy value is replaced by an empty
        dictionary

    lazy_import
        Stored in ``self._lazy_import`` and passed to every TaskSpec

    reload
        Passed unchanged to every TaskSpec

    parent_path
        Folder used to resolve relative paths when ``data`` is not a path.
        Must be None when ``data`` is a path, since in such case the parent
        folder of the file is used

    look_up_project_root_recursively
        If truthy, the project root passed to every TaskSpec comes from
        ``default.find_root_recursively`` (None if there is no parent
        path), otherwise the parent path is used

    Raises
    ------
    ValueError
        If ``data`` is a path and ``parent_path`` is not None

    FileNotFoundError
        If ``data`` is a path but it is not an existing file

    DAGSpecInitializationError
        If the YAML file cannot be parsed and its content has '{{' or '}}'
        (otherwise the original parser/constructor error is raised)

    Warns
    -----
    Emits a warning (``warnings.warn``) listing the keys in the environment
    that are neither defaults nor used as placeholders in the spec
    """
    self._lazy_import = lazy_import

    # initialized with a path to a yaml file...
    if isinstance(data, (str, Path)):
        # TODO: test this
        if parent_path is not None:
            raise ValueError('parent_path must be None when '
                             f'initializing {type(self).__name__} with '
                             'a path to a YAML spec')
        # resolve the parent path to make sources and products unambiguous
        # even if the current working directory changes
        self._path = Path(data).resolve()
        self._parent_path = str(self._path.parent)

        if not Path(data).is_file():
            raise FileNotFoundError(
                'Error initializing DAGSpec with argument '
                f'{data!r}: Expected it to be a path to a YAML file, but '
                'such file does not exist')

        content = Path(data).read_text()

        # the error is stored and handled outside the except block, so the
        # exception raised below is not implicitly chained to this one
        try:
            data = yaml.safe_load(content)
        except (yaml.parser.ParserError,
                yaml.constructor.ConstructorError) as e:
            error = e
        else:
            error = None

        if error:
            if '{{' in content or '}}' in content:
                # NOTE(review): the message says "parentheses" but the
                # example it shows uses double quotes - confirm the wording
                raise DAGSpecInitializationError(
                    'Failed to initialize spec. It looks like '
                    'you\'re using placeholders (i.e. {{placeholder}}). '
                    'Make sure values are enclosed in parentheses '
                    '(e.g. key: "{{placeholder}}"). Original '
                    'parser error:\n\n'
                    f'{error}')
            else:
                raise error

    # initialized with a dictionary...
    else:
        self._path = None
        # FIXME: add test cases, some of those features wont work if
        # _parent_path is None. We should make sure that we either raise
        # an error if _parent_path is needed or use the current working
        # directory if it's appropriate - this is mostly to make relative
        # paths consistent: they should be relative to the file that
        # contains them
        self._parent_path = (None if not parent_path else str(
            Path(parent_path).resolve()))

    self.data = data

    if isinstance(self.data, list):
        self.data = {'tasks': self.data}

    # validate keys defined at the top (nested keys are not validated here)
    self._validate_top_keys(self.data, self._path)

    logger.debug('DAGSpec enviroment:\n%s', pp.pformat(env))

    env = env or dict()
    path_to_defaults = default.path_to_env_from_spec(
        path_to_spec=self._path)

    if path_to_defaults:
        defaults = yaml.safe_load(Path(path_to_defaults).read_text())
        self.env = EnvDict(env,
                           path_to_here=self._parent_path,
                           defaults=defaults)
    else:
        self.env = EnvDict(env, path_to_here=self._parent_path)

    self.data, tags = expand_raw_dictionary_and_extract_tags(
        self.data, self.env)

    # log the expanded spec (self.data), the local "data" variable still
    # holds the content as it was before expanding placeholders
    logger.debug('Expanded DAGSpec:\n%s', pp.pformat(self.data))

    # if there is a "location" top key, we don't have to do anything else
    # as we will just load the dotted path when .to_dag() is called
    if 'location' not in self.data:

        Meta.initialize_inplace(self.data)

        import_tasks_from = self.data['meta']['import_tasks_from']

        if import_tasks_from is not None:
            # when using a relative path in "import_tasks_from", we must
            # make it absolute...
            if not Path(import_tasks_from).is_absolute():
                # use _parent_path if there is one
                if self._parent_path:
                    self.data['meta']['import_tasks_from'] = str(
                        Path(self._parent_path, import_tasks_from))
                # otherwise just make it absolute
                else:
                    self.data['meta']['import_tasks_from'] = str(
                        Path(import_tasks_from).resolve())

            imported = yaml.safe_load(
                Path(self.data['meta']['import_tasks_from']).read_text())

            if self.env is not None:
                (imported,
                 tags_other) = expand_raw_dictionaries_and_extract_tags(
                     imported, self.env)
                tags = tags | tags_other

            # relative paths here are relative to the file where they
            # are declared
            base_path = Path(self.data['meta']['import_tasks_from']).parent

            for task in imported:
                add_base_path_to_source_if_relative(task,
                                                    base_path=base_path)

            self.data['tasks'].extend(imported)

        # check if there are any params declared in env, not used in
        # in the pipeline
        extra = set(self.env) - self.env.default_keys - tags

        if extra:
            warnings.warn('The following placeholders are declared in the '
                          'environment but '
                          f'unused in the spec: {extra}')

        self.data['tasks'] = [
            normalize_task(task) for task in self.data['tasks']
        ]

        # NOTE: for simple projects, project root is the parent folder
        # of pipeline.yaml, for package projects is the parent folder
        # of setup.py
        if look_up_project_root_recursively:
            project_root = (
                None if not self._parent_path else
                default.find_root_recursively(
                    starting_dir=self._parent_path,
                    filename=None if not self._path else self._path.name))
        else:
            project_root = self._parent_path

        # make sure the folder where the pipeline is located is in sys.path
        # otherwise dynamic imports needed by TaskSpec will fail
        with add_to_sys_path(self._parent_path, chdir=False):
            self.data['tasks'] = [
                TaskSpec(t,
                         self.data['meta'],
                         project_root=project_root,
                         lazy_import=lazy_import,
                         reload=reload) for t in self.data['tasks']
            ]
    else:
        self.data['meta'] = Meta.empty()
def process_tasks(dag, dag_spec, root_path=None):
    """
    Create the tasks declared in the spec, add them to the dag and set the
    upstream dependencies among them

    Product and upstream dependencies are taken from the task's source
    instead of the spec when meta's 'extract_product' and
    'extract_upstream' options (respectively) are turned on.

    Raises
    ------
    DAGSpecInitializationError
        If a task has an upstream dependency that is not in the dag
    """
    # NOTE: this value is not used anywhere else in this function
    root_path = root_path or '.'

    # options
    meta = dag_spec['meta']
    extract_up = meta['extract_upstream']
    extract_prod = meta['extract_product']

    # raw values extracted from the upstream key, one entry per task
    upstream_raw = {}

    # first pass: initialize tasks and add them to the dag
    for task_spec in dag_spec['tasks']:
        # the source must be initialized first, since the product might be
        # declared there
        init_source = task_spec['class']._init_source
        source = call_with_dictionary(init_source,
                                      kwargs={
                                          'kwargs': {},
                                          **task_spec
                                      })

        if extract_prod:
            task_spec['product'] = source.extract_product()

        # "declared" has the content of the "upstream" key, if any
        task, declared = task_spec.to_task(dag)

        if isinstance(task, TaskGroup):
            # all members of the group share the same declared upstream
            upstream_raw.update((member, declared) for member in task)
        else:
            if extract_prod:
                logger.debug('Extracted product for task "%s": %s',
                             task.name, task.product)

            upstream_raw[task] = declared

    # second pass: resolve upstream dependencies, expanding wildcards
    # (if any) using the names of the tasks in the dag
    tasks = list(dag.values())
    task_names = list(dag._iter())
    upstream = {}

    for task in tasks:
        raw = (task.source.extract_upstream()
               if extract_up else upstream_raw[task])
        upstream[task] = _expand_upstream(raw, task_names)

        logger.debug('Extracted upstream dependencies for task %s: %s',
                     task.name, upstream[task])

    # last pass: set upstream dependencies
    for task in tasks:
        expanded = upstream[task]

        if not expanded:
            continue

        for task_name, group_name in expanded.items():
            up = dag.get(task_name)

            if up is None:
                names = [t.name for t in tasks]
                raise DAGSpecInitializationError(
                    f'Task {task.name!r} '
                    'has an upstream dependency '
                    f'{task_name!r}, but such task '
                    "doesn't exist. Available tasks: "
                    f'{pretty_print.iterable(names)}')

            task.set_upstream(up, group_name=group_name)