Example #1
0
def test_initialization(spec, expected, tmp_sample_tasks, tmp_imports):
    """Check 'class' and 'source' after init, then convert to a Task."""
    overrides = {'extract_product': False, 'extract_upstream': True}
    task_spec = TaskSpec(spec,
                         meta=Meta.default_meta(overrides),
                         project_root='.')

    # values exposed by the spec once initialized
    assert task_spec['class'] == expected
    assert isinstance(task_spec['source'], Path)

    # conversion to a Task must not raise
    task_spec.to_task(dag=DAG())
Example #2
0
def test_error_if_client_dotted_path_returns_none(tmp_sample_tasks,
                                                  add_current_to_sys_path,
                                                  no_sys_modules_cache, key):
    """A dotted path under ``key`` whose function returns None is an error."""
    module = Path('client_dotted_path_returns_none.py')
    module.write_text("""
def get():
    return None
""")

    overrides = {
        'extract_product': False,
        'extract_upstream': True,
    }
    meta = Meta.default_meta(overrides)

    dag = DAG()

    # ``key`` goes last so it overrides 'source'/'product' if it matches one
    task_dict = {
        'source': 'sample.sql',
        'product': ['name', 'table'],
        key: 'client_dotted_path_returns_none.get',
    }

    with pytest.raises(TypeError) as excinfo:
        TaskSpec(task_dict, meta=meta, project_root='.').to_task(dag=dag)

    message = (
        "Error calling dotted path "
        "'client_dotted_path_returns_none.get'. Expected a value but got None")
    assert message in str(excinfo.value)
Example #3
0
def test_add_hook(tmp_directory, add_current_to_sys_path):
    """Hooks declared as dotted paths are set on the task built by the spec."""
    Path('source.py').write_text("""
# + tags=["parameters"]
# some code
    """)

    Path('hooks.py').write_text("""

def some_hook():
    pass
    """)

    hook = 'hooks.some_hook'
    task_dict = {
        'product': 'notebook.ipynb',
        'source': 'source.py',
        'on_finish': hook,
        'on_render': hook,
        'on_failure': hook,
    }

    meta = Meta.default_meta()
    meta['extract_product'] = False

    built, _ = TaskSpec(task_dict, meta, project_root='.').to_task(DAG())

    # every declared hook must be truthy on the resulting task
    for hook_name in ('on_finish', 'on_render', 'on_failure'):
        assert getattr(built, hook_name)
Example #4
0
def test_validate_missing_source(key):
    """A spec holding only ``key`` (mapped to None) raises KeyError."""
    incomplete = {key: None}
    meta = {'extract_product': False, 'extract_upstream': False}

    with pytest.raises(KeyError):
        TaskSpec(incomplete, meta, project_root='.')
Example #5
0
def test_grid_with_hook_lazy_import(backup_spec_with_functions_flat,
                                    tmp_imports):
    """With lazy_import=True, grid task hooks keep their dotted path unloaded.

    Each hook's ``callable`` must be None and its ``_spec.dotted_path`` must
    match the string declared in the spec.
    """
    hooks = {
        'on_render': 'hooks.on_render',
        'on_finish': 'hooks.on_finish',
        'on_failure': 'hooks.on_failure',
    }
    grid_spec = {
        'source': 'my_tasks_flat.raw.function',
        'name': 'function-',
        'product': 'some_file.txt',
        'grid': {
            'a': [1, 2],
            'b': [3, 4]
        },
        **hooks,
    }

    meta = Meta.default_meta()
    dag = DAG()

    task_spec = TaskSpec(grid_spec, meta, project_root='.', lazy_import=True)
    task_spec.to_task(dag=dag)

    # hooks must not have been imported...
    for hook_name in hooks:
        for task in dag.values():
            assert getattr(task, hook_name).callable is None

    # ...but they must remember where to import from
    for hook_name, dotted_path in hooks.items():
        for task in dag.values():
            assert getattr(task, hook_name)._spec.dotted_path == dotted_path
Example #6
0
def test_error_if_dotted_path_does_not_return_a_callable(
        backup_spec_with_functions_flat, add_current_to_sys_path,
        no_sys_modules_cache, key):
    """A dotted path under ``key`` pointing to a non-callable is a TypeError."""
    module_name = 'test_error_if_dotted_path_does_not_return_a_callable'
    dotted_path = f'{module_name}.some_non_function'

    Path(f'{module_name}.py').write_text("""
some_non_function = 1
""")

    meta = Meta.default_meta({'extract_product': False})

    # ``key`` goes last so it overrides 'source'/'product' if it matches one
    task_dict = {
        'source': 'my_tasks_flat.raw.function',
        'product': 'some_file.txt',
        key: dotted_path,
    }

    with pytest.raises(TypeError) as excinfo:
        TaskSpec(task_dict, meta=meta, project_root='.').to_task(dag=DAG())

    assert str(excinfo.value) == (
        f"Error loading dotted path '{dotted_path}'. Expected a "
        "callable object (i.e., some kind of function). Got "
        "1 (an object of type: int)")
Example #7
0
def test_error_on_invalid_value_for_file_product(backup_online, tmp_imports):
    """An integer product makes ``to_task`` raise a descriptive TypeError."""
    meta = Meta.default_meta()
    meta['extract_product'] = False

    task_dict = {'source': 'online_tasks.square', 'product': 1}
    task_spec = TaskSpec(task_dict, meta=meta, project_root='.')

    with pytest.raises(TypeError) as excinfo:
        task_spec.to_task(dag=DAG())

    message = str(excinfo.value)
    assert message == (
        'Error initializing File with argument 1 '
        '(expected str, bytes or os.PathLike object, not int)')
Example #8
0
def test_grid_with_missing_name(backup_spec_with_functions_flat,
                                add_current_to_sys_path, spec):
    """Without the 'name' key, building the task raises KeyError."""
    del spec['name']

    with pytest.raises(KeyError) as excinfo:
        task_spec = TaskSpec(spec, Meta.default_meta(), project_root='.')
        task_spec.to_task(dag=DAG())

    message = str(excinfo.value)
    assert 'Error initializing task with spec' in message
Example #9
0
def test_grid_and_params(backup_spec_with_functions_flat, tmp_imports,
                         grid_spec):
    """Adding 'params' to the grid spec fixture must be rejected."""
    grid_spec['params'] = dict(a=1)

    with pytest.raises(DAGSpecInitializationError) as excinfo:
        task_spec = TaskSpec(grid_spec,
                             Meta.default_meta(),
                             project_root='.')
        task_spec.to_task(dag=DAG())

    message = str(excinfo.value)
    assert "'params' is not allowed when using 'grid'" in message
Example #10
0
def test_grid_with_missing_name(backup_spec_with_functions_flat, tmp_imports,
                                grid_spec):
    """Without 'name', the spec raises DAGSpecInitializationError."""
    del grid_spec['name']

    with pytest.raises(DAGSpecInitializationError) as excinfo:
        task_spec = TaskSpec(grid_spec,
                             Meta.default_meta(),
                             project_root='.')
        task_spec.to_task(dag=DAG())

    message = str(excinfo.value)
    assert 'Error initializing task with source' in message
Example #11
0
def test_error_when_failing_to_init(spec, tmp_sample_tasks, tmp_imports):
    """Building a task from ``spec`` reports an SQLRelation init error."""
    overrides = {'extract_product': False, 'extract_upstream': True}
    meta = Meta.default_meta(overrides)

    dag = DAG()

    with pytest.raises(DAGSpecInitializationError) as excinfo:
        task_spec = TaskSpec(spec, meta=meta, project_root='.')
        task_spec.to_task(dag=dag)

    message = str(excinfo.value)
    assert 'Error initializing SQLRelation' in message
Example #12
0
def test_loads_serializer_and_unserializer(backup_online, tmp_imports):
    """'serializer'/'unserializer' dotted paths are loaded onto the task."""
    meta = Meta.default_meta()
    meta['extract_product'] = False

    task_dict = {
        'source': 'online_tasks.square',
        'product': 'output/square.parquet',
        'serializer': 'online_io.serialize',
        'unserializer': 'online_io.unserialize',
    }
    task_spec = TaskSpec(task_dict, meta=meta, project_root='.')

    dag = DAG()
    built, _ = task_spec.to_task(dag=dag)

    # imported only after the task is built, as in the original flow
    from online_io import serialize as expected_serializer
    from online_io import unserialize as expected_unserializer

    assert built._serializer.callable is expected_serializer
    assert built._unserializer.callable is expected_unserializer
Example #13
0
def test_lazy_load(tmp_directory, tmp_imports):
    """With lazy_import=True, non-importable dotted paths do not break init."""
    Path('my_module.py').write_text("""
def fn():
    pass
""")

    meta = Meta.default_meta()

    missing = 'not_a_module.not_a_function'
    task_dict = {'source': 'my_module.fn', 'product': 'report.ipynb'}

    # every hook and (un)serializer points to something that can't be imported
    for field in ('on_finish', 'on_render', 'on_failure', 'serializer',
                  'unserializer'):
        task_dict[field] = missing

    task_spec = TaskSpec(task_dict, meta, '.', lazy_import=True)

    assert task_spec.to_task(dag=DAG())
Example #14
0
def test_grid(backup_spec_with_functions_flat, add_current_to_sys_path, spec):
    """The grid spec yields four tasks with index-suffixed names/products."""
    meta = Meta.default_meta()
    dag = DAG()

    task_group, _ = TaskSpec(spec, meta, project_root='.').to_task(dag=dag)

    assert len(task_group) == 4

    # task 'function-<i>' must produce the resolved 'some_file-<i>.txt'
    for index in range(4):
        actual = str(dag[f'function-{index}'].product)
        assert actual == str(Path(f'some_file-{index}.txt').resolve())
Example #15
0
def test_error_on_invalid_class(backup_spec_with_functions_flat, tmp_imports):
    """An unrecognized 'class' value raises ValueError with an exact message."""
    meta = Meta.default_meta({'extract_product': False})

    task_dict = dict(source='my_tasks_flat.raw.function',
                     product='some_file.txt')
    task_dict['class'] = 'unknown_class'

    with pytest.raises(ValueError) as excinfo:
        task_spec = TaskSpec(task_dict, meta=meta, project_root='.')
        task_spec.to_task(dag=DAG())

    message = str(excinfo.value)
    assert message == ("Error validating Task spec (class field): "
                       "'unknown_class' is not a valid Task class name")
Example #16
0
def test_constructor_deep_copies_spec_and_meta(tmp_directory, tmp_imports):
    """TaskSpec must not share the spec/meta objects (or nested ones) passed."""
    prod_default_class = {'SQLScript': 'SQLRelation'}
    params = {'params': {'a': 1}}

    meta = Meta.default_meta(
        dict(extract_product=False,
             product_default_class=prod_default_class))
    spec = dict(source='sample.sql', product='some_file.txt', params=params)

    task_spec = TaskSpec(data=spec, meta=meta, project_root='.')

    # top-level containers are distinct objects
    assert task_spec.data is not spec
    assert task_spec.meta is not meta
    # and so are the nested ones
    assert task_spec.data['params'] is not params
    assert task_spec.meta['product_default_class'] is not prod_default_class
Example #17
0
def test_skips_source_loader_if_absolute_path(tmp_sample_tasks, tmp_imports):
    """A task builds when 'source' is a path under ``tmp_sample_tasks``.

    A 'source_loader' pointing to an empty 'templates' folder is configured.
    NOTE(review): presumably ``tmp_sample_tasks`` is an absolute path, as the
    test name suggests - confirm against the fixture.
    """
    Path('templates').mkdir()

    overrides = {
        'extract_product': False,
        'extract_upstream': True,
        'source_loader': {
            'path': 'templates'
        }
    }
    meta = Meta.default_meta(overrides)

    dag = DAG()

    source = Path(tmp_sample_tasks, 'sample.sql')
    task_dict = {
        'source': str(source),
        'product': ['name', 'table'],
        'client': 'db.get_client'
    }

    task_spec = TaskSpec(task_dict, meta=meta, project_root='.')
    assert task_spec.to_task(dag=dag)
Example #18
0
    def _init(self, data, env, lazy_import, reload, parent_path,
              look_up_project_root_recursively):
        """Load, validate and expand the spec, then build its TaskSpec objects.

        Parameters
        ----------
        data : str, pathlib.Path, dict or list
            Path to a YAML spec, or the already-loaded spec. A list is
            treated as the value of the 'tasks' key.
        env : dict or None
            Values passed to EnvDict; an empty dict is used when falsy.
        lazy_import
            Stored in ``self._lazy_import`` and forwarded to each TaskSpec.
        reload
            Forwarded to each TaskSpec.
        parent_path : str, pathlib.Path or None
            Only allowed when ``data`` is not a path. If truthy, it is
            resolved and stored in ``self._parent_path``.
        look_up_project_root_recursively
            If truthy, the project root is obtained with
            ``default.find_root_recursively`` starting at the parent path,
            otherwise the parent path itself is used.

        Raises
        ------
        ValueError
            If ``data`` is a path and ``parent_path`` is not None.
        FileNotFoundError
            If ``data`` is a path that is not an existing file.
        DAGSpecInitializationError
            If the YAML cannot be parsed and its content contains '{{' or
            '}}' (the parser error is attached as ``__cause__``).
        yaml.parser.ParserError, yaml.constructor.ConstructorError
            If the YAML cannot be parsed and no placeholders are present.
        """
        self._lazy_import = lazy_import

        # initialized with a path to a yaml file...
        if isinstance(data, (str, Path)):
            # TODO: test this
            if parent_path is not None:
                raise ValueError('parent_path must be None when '
                                 f'initializing {type(self).__name__} with '
                                 'a path to a YAML spec')
            # resolve the parent path to make sources and products unambiguous
            # even if the current working directory changes
            self._path = Path(data).resolve()
            self._parent_path = str(self._path.parent)

            if not Path(data).is_file():
                raise FileNotFoundError(
                    'Error initializing DAGSpec with argument '
                    f'{data!r}: Expected it to be a path to a YAML file, but '
                    'such file does not exist')

            content = Path(data).read_text()

            try:
                data = yaml.safe_load(content)
            except (yaml.parser.ParserError,
                    yaml.constructor.ConstructorError) as e:
                error = e
            else:
                error = None

            if error:
                if '{{' in content or '}}' in content:
                    # NOTE(review): the message says "parentheses" but the
                    # example it shows uses double quotes; wording is kept
                    # unchanged here in case callers/tests match on it
                    # chain the parser error so its traceback is not lost
                    raise DAGSpecInitializationError(
                        'Failed to initialize spec. It looks like '
                        'you\'re using placeholders (i.e. {{placeholder}}). '
                        'Make sure values are enclosed in parentheses '
                        '(e.g. key: "{{placeholder}}"). Original '
                        'parser error:\n\n'
                        f'{error}') from error
                else:
                    raise error

        # initialized with a dictionary...
        else:
            self._path = None
            # FIXME: add test cases, some of those features wont work if
            # _parent_path is None. We should make sure that we either raise
            # an error if _parent_path is needed or use the current working
            # directory if it's appropriate - this is mostly to make relative
            # paths consistent: they should be relative to the file that
            # contains them
            self._parent_path = (None if not parent_path else str(
                Path(parent_path).resolve()))

        self.data = data

        # a bare list is shorthand for a spec that only declares tasks
        if isinstance(self.data, list):
            self.data = {'tasks': self.data}

        # validate keys defined at the top (nested keys are not validated here)
        self._validate_top_keys(self.data, self._path)

        logger.debug('DAGSpec enviroment:\n%s', pp.pformat(env))

        env = env or dict()
        path_to_defaults = default.path_to_env_from_spec(
            path_to_spec=self._path)

        if path_to_defaults:
            defaults = yaml.safe_load(Path(path_to_defaults).read_text())
            self.env = EnvDict(env,
                               path_to_here=self._parent_path,
                               defaults=defaults)
        else:
            self.env = EnvDict(env, path_to_here=self._parent_path)

        self.data, tags = expand_raw_dictionary_and_extract_tags(
            self.data, self.env)

        # log the expanded spec (self.data), not the raw input (data)
        logger.debug('Expanded DAGSpec:\n%s', pp.pformat(self.data))

        # if there is a "location" top key, we don't have to do anything else
        # as we will just load the dotted path when .to_dag() is called
        if 'location' not in self.data:

            Meta.initialize_inplace(self.data)

            import_tasks_from = self.data['meta']['import_tasks_from']

            if import_tasks_from is not None:
                # when using a relative path in "import_tasks_from", we must
                # make it absolute...
                if not Path(import_tasks_from).is_absolute():
                    # use _parent_path if there is one
                    if self._parent_path:
                        self.data['meta']['import_tasks_from'] = str(
                            Path(self._parent_path, import_tasks_from))
                    # otherwise just make it absolute
                    else:
                        self.data['meta']['import_tasks_from'] = str(
                            Path(import_tasks_from).resolve())

                imported = yaml.safe_load(
                    Path(self.data['meta']['import_tasks_from']).read_text())

                if self.env is not None:
                    (imported,
                     tags_other) = expand_raw_dictionaries_and_extract_tags(
                         imported, self.env)
                    tags = tags | tags_other

                # relative paths here are relative to the file where they
                # are declared
                base_path = Path(self.data['meta']['import_tasks_from']).parent

                for task in imported:
                    add_base_path_to_source_if_relative(task,
                                                        base_path=base_path)

                self.data['tasks'].extend(imported)

            # check if there are any params declared in env, not used in
            # in the pipeline
            extra = set(self.env) - self.env.default_keys - tags

            if extra:
                warnings.warn('The following placeholders are declared in the '
                              'environment but '
                              f'unused in the spec: {extra}')

            self.data['tasks'] = [
                normalize_task(task) for task in self.data['tasks']
            ]

            # NOTE: for simple projects, project root is the parent folder
            # of pipeline.yaml, for package projects is the parent folder
            # of setup.py
            if look_up_project_root_recursively:
                project_root = (
                    None if not self._parent_path else
                    default.find_root_recursively(
                        starting_dir=self._parent_path,
                        filename=None if not self._path else self._path.name))
            else:
                project_root = self._parent_path

            # make sure the folder where the pipeline is located is in sys.path
            # otherwise dynamic imports needed by TaskSpec will fail
            with add_to_sys_path(self._parent_path, chdir=False):
                self.data['tasks'] = [
                    TaskSpec(t,
                             self.data['meta'],
                             project_root=project_root,
                             lazy_import=lazy_import,
                             reload=reload) for t in self.data['tasks']
                ]
        else:
            self.data['meta'] = Meta.empty()
Example #19
0
def test_error_if_extract_but_keys_declared(task, meta):
    """The ``task``/``meta`` pair raises DAGSpecInitializationError on init."""
    init_kwargs = dict(project_root='.')

    with pytest.raises(DAGSpecInitializationError):
        TaskSpec(task, meta, **init_kwargs)
Example #20
0
def test_error_if_extract_but_keys_declared(task, meta):
    """The ``task``/``meta`` pair raises ValueError on init."""
    init_kwargs = dict(project_root='.')

    with pytest.raises(ValueError):
        TaskSpec(task, meta, **init_kwargs)