Example #1
def test_copy(copy):
    def _assign_upstream(upstream):
        _assign_upstream.obj = upstream
        return 42

    dag_ = DAG()

    root = PythonCallable(_root,
                          File('root.parquet'),
                          dag_,
                          name='root',
                          serializer=serializer,
                          params={'input_data': {
                              'x': [0, 0, 0]
                          }})

    task = PythonCallable(_assign_upstream,
                          File('task.parquet'),
                          dag_,
                          name='task',
                          unserializer=unserializer,
                          serializer=serializer)

    root >> task

    dag = InMemoryDAG(dag_)

    out = dag.build({'root': {'x': [1]}}, copy=copy)

    # test that the function _assign_upstream received the same object
    # the task root returned in the upstream argument if copy is disabled.
    # if copying, then it should be a different object
    assert (_assign_upstream.obj['root'] is out['root']) is (not copy)
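The serializer, unserializer, and _root helpers used above are not shown in this snippet. A minimal sketch of what they might look like, assuming parquet-backed pandas DataFrames (the names and bodies here are assumptions, not the original fixtures):

import pandas as pd

def serializer(df, product):
    # hypothetical: persist a task's return value to its File product
    df.to_parquet(str(product))

def unserializer(product):
    # hypothetical: load a product back into memory for downstream tasks
    return pd.read_parquet(str(product))

def _root(input_data):
    # hypothetical root task: build a DataFrame from the raw input dict
    return pd.DataFrame(input_data)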
Example #2
def test_in_memory_dag(dag):

    dag_in_memory = InMemoryDAG(dag)

    out = dag_in_memory.build({'root': {'x': [1, 2, 3]}})

    assert out['root']['x'].tolist() == [1, 2, 3]
    assert out['task']['x'].tolist() == [2, 3, 4]
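The dag fixture is not shown here; judging by the assertions, it wires a root task that builds a DataFrame from its input and a downstream task that adds one to it. A possible sketch, assuming pytest and the names used in the other snippets (DAG, PythonCallable, File, serializer, unserializer, _root) are already imported, and with _add_one defined as a hypothetical helper:

def _add_one(upstream):
    # hypothetical: take whatever 'root' returned and add one to it
    return upstream['root'] + 1

@pytest.fixture
def dag():
    dag = DAG()

    root = PythonCallable(_root,
                          File('root.parquet'),
                          dag,
                          name='root',
                          serializer=serializer,
                          params={'input_data': {'x': [0]}})

    task = PythonCallable(_add_one,
                          File('task.parquet'),
                          dag,
                          name='task',
                          unserializer=unserializer,
                          serializer=serializer)

    root >> task

    return dag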
Example #3
def make_predict():
    """Instantiate a prediction DAG using a previously trained model
    """
    dag_pred = DAG()

    # this special function adds a task with name "get" that will just forward
    # whatever value we pass when calling .build(). You can pass a function
    # in the "preprocessor" argument to perform arbitrary logic like parsing
    # or validation
    input_data_passer(dag=dag_pred,
                      name='get',
                      preprocessor=validate_input_data)

    # we re-use the same code that we used for training!
    add_features(dag_pred)

    # load model generated by the training graph
    with open(Path('output', 'model.pickle'), 'rb') as f:
        model = pickle.load(f)

    # add the final task; this special function just executes whatever
    # function we pass as the first argument, and we can pass arbitrary
    # parameters using "params"
    predict_task = in_memory_callable(predict,
                                      dag=dag_pred,
                                      name='predict',
                                      params=dict(model=model))

    # predict after joining features
    dag_pred['join'] >> predict_task

    # convert our batch-processing pipeline to an in-memory one and return it
    return InMemoryDAG(dag_pred)
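A hypothetical way to call the resulting in-memory DAG: pass the raw input to .build() under the name of the input task and read the output of the 'predict' task from the returned dictionary. This assumes 'get' is the only root task and new_observations is a placeholder for the incoming data:

dag_pred = make_predict()

# each key passed to .build() must match the name of a root task;
# here 'get' is the input_data_passer task defined above
out = dag_pred.build({'get': new_observations})

# the result maps every task name to its return value
predictions = out['predict']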
Example #4
def test_input_data_passer():
    dag = DAG()

    root = input_data_passer(dag, name='root')
    task = PythonCallable(_add_one,
                          File('task.parquet'),
                          dag,
                          name='task',
                          unserializer=unserializer,
                          serializer=serializer)

    root >> task

    dag_ = InMemoryDAG(dag)

    assert dag_.build({'root': 1}) == {'root': 1, 'task': 2}
Example #5
def test_in_memory_callable():
    dag = DAG()

    def add_some(upstream, to_add):
        return upstream['root'] + to_add

    root = input_data_passer(dag, name='root')
    task = in_memory_callable(add_some,
                              dag,
                              name='task',
                              params=dict(to_add=2))

    root >> task

    dag_ = InMemoryDAG(dag)

    assert dag_.build({'root': 1}) == {'root': 1, 'task': 3}
Example #6
def test_error_if_a_task_returns_none():
    dag = DAG()

    PythonCallable(_return_none,
                   File('root.parquet'),
                   dag,
                   name='root',
                   params={'input_data': None},
                   serializer=serializer)

    dag_ = InMemoryDAG(dag)

    with pytest.raises(ValueError) as excinfo:
        dag_.build({'root': None})

    expected = ('All callables in a InMemoryDAG must return a value. '
                'Callable "_return_none", from task "root" returned None')
    assert str(excinfo.value) == expected
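_return_none is not shown; it is presumably a callable that returns nothing, which is what triggers the error above. A minimal sketch (the body is an assumption):

def _return_none(input_data):
    # hypothetical: a task that (incorrectly) returns no value
    return None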
Example #7
def test_error_if_non_compatible_tasks():
    dag = DAG()
    ShellScript('touch {{product}}', File('file.txt'), dag, name='task')

    with pytest.raises(TypeError) as excinfo:
        InMemoryDAG(dag)

    expected = ('All tasks in the DAG must be PythonCallable, '
                'got unallowed types: ShellScript')
    assert str(excinfo.value) == expected
Example #8
class OnlineDAG(abc.ABC):
    """
    Execute partial DAGs in-memory. This is an abstract class; to use it,
    create a subclass and provide the required static methods.

    See here for a complete example:
    https://github.com/ploomber/projects/blob/master/ml-online/src/ml_online/infer.py
    """

    # FIXME: add a way to customize
    def __init__(self):
        dag = self.init_dag_from_partial(self.get_partial())

        # TODO: add support for manually specifying upstream dependencies
        upstream = {
            name: dag[name].source.extract_upstream()
            for name in dag._iter()
        }

        # names of all tasks used as upstream
        upstream_tasks = chain(*upstream.values())

        # find tasks that are declared as upstream but do not exist in the dag
        missing = set(upstream_tasks) - set(dag)

        for name in missing:
            input_data_passer(dag, name=name)

        # TODO: maybe delete all upstream dependencies and set them again
        # (raise a warning if there are some upstream dependencies?)
        # this doesn't happen when we get a yaml file because we control
        # that using extract_upstream=False but might happen if we receive
        # a DAG object already
        # the dag is complete now, set all upstream dependencies
        for name in dag._iter():
            for dependency in upstream.get(name, []):
                dag[name].set_upstream(dag[dependency])

        # get all terminal nodes and make them a dependency of the terminal node
        terminal_current = [
            name for name, degree in dag._G.out_degree() if not degree
        ]

        # TODO: extract upstream and make sure they match with the ones in
        # terminal_current
        terminal = in_memory_callable(self.terminal_task,
                                      dag,
                                      name='terminal',
                                      params=self.terminal_params())

        for dependency in terminal_current:
            terminal.set_upstream(dag[dependency])

        self.in_memory = InMemoryDAG(dag)

    @classmethod
    def init_dag_from_partial(cls, partial):
        """Initialize partial returned by get_partial()
        """
        if isinstance(partial, (str, Path)):
            with open(partial) as f:
                tasks = yaml.safe_load(f)

            # cannot extract upstream because this is an incomplete DAG
            meta = {'extract_product': False, 'extract_upstream': False}
            spec = DAGSpec(
                {
                    'tasks': tasks,
                    'meta': meta
                },
                parent_path=Path(partial).parent,
            )

            return spec.to_dag()
        elif isinstance(partial, DAG):
            return partial
        else:
            raise TypeError(f'Expected {cls.__name__}.get_partial() to '
                            'return a str, pathlib.Path or ploomber.DAG, '
                            f'got {type(partial).__name__}')

    def predict(self, **kwargs):
        """
        Run the DAG

        Parameters
        ----------
        **kwargs
            One parameter per root task (task with no upstream dependencies)
            in the partial DAG.

        Returns
        -------
        A dictionary with {task_name: returned_value}
        """
        return self.in_memory.build(kwargs)

    @abc.abstractstaticmethod
    def get_partial():
        """
        Must return the location of a partial dag (str or pathlib.Path)
        """
        pass

    @abc.abstractstaticmethod
    def terminal_task(upstream, model):
        """
        Last function to execute. The ``upstream`` parameter contains the
        output of all tasks that have no downstream dependencies
        """
        pass

    @abc.abstractstaticmethod
    def terminal_params():
        """
        Must return a dictionary with parameters passed to ``terminal_task``
        """
        pass
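A hypothetical concrete subclass, showing the three static methods an implementation must provide. The spec path, model file, and the 'get'/'join' task names are assumptions for illustration (not part of the library), and pickle and pathlib.Path are assumed to be imported as in the make_predict example above:

class InferenceDAG(OnlineDAG):

    @staticmethod
    def get_partial():
        # location of the partial pipeline spec (assumed path)
        return Path('pipeline-features.yaml')

    @staticmethod
    def terminal_task(upstream, model):
        # receives the outputs of all terminal tasks plus terminal_params()
        return model.predict(upstream['join'])

    @staticmethod
    def terminal_params():
        # parameters forwarded to terminal_task (assumed model path)
        with open('model.pickle', 'rb') as f:
            return dict(model=pickle.load(f))

# usage: one keyword argument per root task in the partial DAG,
# assuming a single root task named 'get':
# predictions = InferenceDAG().predict(get=new_observations)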
Example #9
def test_error_input_data(input_data, dag):
    dag_ = InMemoryDAG(dag)

    with pytest.raises(KeyError):
        dag_.build(input_data)
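The input_data parametrization is not shown; presumably it supplies dictionaries whose keys do not match the DAG's root task names, which is what makes .build() raise KeyError. A possible decorator (an assumption):

@pytest.mark.parametrize('input_data', [{}, {'unknown': 1}])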