Ejemplo n.º 1
0
    def _validate_top_keys(self, spec, path):
        """Validate keys at the top of the spec
        """
        if 'tasks' not in spec and 'location' not in spec:
            raise DAGSpecInitializationError(
                'Failed to initialize spec. Missing "tasks" key')

        if 'location' in spec:
            if len(spec) > 1:
                raise DAGSpecInitializationError(
                    'Failed to initialize spec. If '
                    'using the "location" key there should not '
                    'be other keys')

        else:
            valid = {
                'meta',
                'config',
                'clients',
                'tasks',
                'serializer',
                'unserializer',
                'executor',
                'on_finish',
                'on_render',
                'on_failure',
            }
            validate.keys(valid, spec.keys(), name='dag spec')
Ejemplo n.º 2
0
    def validate(self):
        """
        Check the task data against the expected schema

        Sets "upstream" to None in ``self.data`` when the key is missing,
        then verifies the required keys through ``validate.keys``: "source"
        always, plus "product" unless ``meta['extract_product']`` is on.

        Raises
        ------
        DAGSpecInitializationError
            If ``meta['extract_upstream']`` is on and the task declares a
            non-empty "upstream", or if ``meta['extract_product']`` is on
            and the task declares a non-empty "product"
        """
        data, meta = self.data, self.meta

        # make sure the key always exists so later lookups are uniform
        data.setdefault('upstream', None)

        required = ({'source'} if meta['extract_product'] else
                    {'product', 'source'})

        validate.keys(valid=None,
                      passed=data,
                      required=required,
                      name=repr(self))

        upstream_declared = data.get('upstream')

        if meta['extract_upstream'] and upstream_declared:
            raise DAGSpecInitializationError(
                'Error validating task "{}", if '
                'meta.extract_upstream is set to True, tasks '
                'should not have an "upstream" key'.format(data))

        product_declared = data.get('product')

        if meta['extract_product'] and product_declared:
            raise DAGSpecInitializationError(
                'Error validating task "{}", if '
                'meta.extract_product is set to True, tasks '
                'should not have a "product" key'.format(data))
Ejemplo n.º 3
0
    def _validate_top_keys(self, spec, path):
        """Validate keys at the top of the spec
        """
        if 'tasks' not in spec and 'location' not in spec:
            path_ = f'(file: "{path}")' if self._parent_path else ''
            raise KeyError('Invalid data to initialize DAGSpec, missing '
                           f'key "tasks" {path_}')

        if 'location' in spec:
            if len(spec) > 1:
                raise KeyError('If specifying dag through a "location" key '
                               'it must be the unique key in the spec')
        else:
            valid = {
                'meta', 'config', 'clients', 'tasks', 'serializer',
                'unserializer'
            }
            validate.keys(valid, spec.keys(), name='dag spec')
Ejemplo n.º 4
0
    def default_meta(cls, meta=None):
        """Fill missing values in a meta dictionary

        The keys in ``meta`` are first checked against ``cls.VALID`` (via
        ``validate.keys``), then the dictionary is completed in place.

        Parameters
        ----------
        meta : dict, optional
            Meta section to complete. A new dictionary is created if None

        Returns
        -------
        dict
            The same ``meta`` object with a value for every known key. If
            "source_loader" was passed, its value is replaced by a
            ``SourceLoader`` built using that value as keyword arguments.
            Entries missing from "product_default_class" are filled with
            the built-in defaults

        Raises
        ------
        Exception
            Any error raised while building the ``SourceLoader`` or while
            validating the task/product class names in
            "product_default_class" is re-raised (same type) with its
            message replaced by one that adds context
        """
        if meta is None:
            meta = {}

        validate.keys(cls.VALID, meta, name='dag spec')

        # keys whose default does not depend on anything else
        plain_defaults = {
            'extract_upstream': True,
            'extract_product': False,
            'product_relative_to_source': False,
            'jupyter_hot_reload': False,
            'jupyter_functions_as_notebooks': False,
            'import_tasks_from': None,
        }

        for key, value in plain_defaults.items():
            meta.setdefault(key, value)

        if 'source_loader' not in meta:
            meta['source_loader'] = None
        else:
            try:
                meta['source_loader'] = SourceLoader(**meta['source_loader'])
            except Exception as e:
                # e.args can be empty (e.g. ``raise ValueError()``); indexing
                # it blindly would raise IndexError here and replace the
                # actual error
                detail = e.args[0] if e.args else repr(e)
                msg = ('Error initializing SourceLoader with '
                       f'{meta["source_loader"]}. Error message: {detail}')
                e.args = (msg, )
                raise

        defaults = {
            'SQLDump': 'File',
            'NotebookRunner': 'File',
            'SQLScript': 'SQLRelation',
            'PythonCallable': 'File',
            'ShellScript': 'File',
        }

        if 'product_default_class' not in meta:
            meta['product_default_class'] = defaults
        else:
            # keep whatever was passed, only add the entries that are missing
            for class_, prod in defaults.items():
                if class_ not in meta['product_default_class']:
                    meta['product_default_class'][class_] = prod

        # validate keys and values in product_default_class
        for task_name, product_name in meta['product_default_class'].items():
            try:
                validate_task_class_name(task_name)
                validate_product_class_name(product_name)
            except Exception as e:
                # same guard as above: do not assume the exception has args
                detail = e.args[0] if e.args else repr(e)
                msg = f'Error validating product_default_class: {detail}'
                e.args = (msg, )
                raise

        return meta
Ejemplo n.º 5
0
    def build(self, input_data, copy=False):
        """Execute every task in the DAG and collect what they return

        Parameters
        ----------
        input_data : dict
            Maps the name of each root task to a dict of params. Root tasks
            are the ones without upstream dependencies; the dictionary
            given for a root task is passed to its source function as the
            ``input_data`` keyword argument

        copy : bool or callable
            Controls whether the output of an upstream task is copied
            before handing it to a downstream task. Leaving it off is
            recommended for memory efficiency, but if tasks are not pure
            functions (i.e. they mutate their inputs) this might lead to
            bugs. The best fix is to make all tasks pure functions, but
            this option can be enabled if memory consumption is not a
            problem. If True, ``copy.copy`` is used; if a callable is
            passed, that callable is used instead (for example,
            ``copy.deepcopy``)

        Returns
        -------
        dict
            Maps task names to their respective outputs

        Raises
        ------
        ValueError
            If the (post-processed) output of any task is None
        """
        expected_names = set(self.root_nodes)
        # FIXME: for this particular case, the error here should be
        # TypeError, not KeyError (the former is the one used when calling
        # functions with invalid arguments) - maybe add an argument to
        # validate.keys to choose which error to raise?
        validate.keys(valid=expected_names,
                      passed=set(input_data),
                      required=expected_names,
                      name='input_data')

        # pick how upstream outputs are handed over to downstream tasks
        if callable(copy):
            copier = copy
        elif copy is True:
            copier = copy_module.copy
        else:
            copier = _do_nothing

        results = {}

        for name in self.dag:
            task = self.dag[name]
            kwargs = task.params.to_dict()

            if name in self.root_nodes:
                kwargs = {**kwargs, 'input_data': input_data[name]}

            # swap each upstream entry for the value that task returned
            if 'upstream' in kwargs:
                fed = {}

                for dependency, _ in kwargs['upstream'].items():
                    fed[dependency] = copier(results[dependency])

                kwargs['upstream'] = fed

            # the source function is never called with "product"
            kwargs.pop('product', None)

            raw = task.source.primitive(**kwargs)
            result = self.return_postprocessor(raw)

            if result is None:
                raise ValueError(
                    'All callables in a {} must return a value. '
                    'Callable "{}", from task "{}" returned None'.format(
                        type(self).__name__, task.source.name, name))

            results[name] = result

        return results