Ejemplo n.º 1
0
from ploomber.products import File


def test_unsupported_extension():
    """Loading a SQLDump whose product is a ``.json`` file is not implemented."""
    json_product = File('my_file.json')
    dump = SQLDump('SELECT * FROM table',
                   json_product,
                   DAG(),
                   name='task',
                   client=Mock())

    # .load() is expected to reject the extension rather than guess a reader
    with pytest.raises(NotImplementedError):
        dump.load()


@pytest.mark.parametrize(
    'product, kwargs',
    [
        (File('my_file.csv'), {}),
        (File('my_file.csv'), {'sep': ','}),
    ],
    ids=['simple', 'with-kwargs'],
)
def test_sqldump(product, kwargs, tmp_directory):
    """SQLDump.load() returns the CSV contents, with and without extra kwargs."""
    # write the file the product points to, so load() has something to read
    expected = pd.DataFrame({'a': [1, 2, 3]})
    expected.to_csv('my_file.csv', index=False)

    dump = SQLDump('SELECT * FROM table',
                   product,
                   DAG(),
                   name='task',
                   client=Mock())

    # kwargs are forwarded to load() unchanged
    assert expected.equals(dump.load(**kwargs))
Ejemplo n.º 2
0
def test_file_delete(tmp_directory):
    """File.delete() removes the underlying file from disk."""
    target = Path('file')
    target.touch()

    File('file').delete()

    assert not target.exists()
Ejemplo n.º 3
0
dag = DAG()

# Register one notebook task per entry in params_all.
for name, params in params_all.items():
    # NotebookRunner executes ipynb files (papermill under the hood). When
    # the input file has a different extension, as it does here, it is
    # converted to ipynb with jupytext first.
    NotebookRunner(
        notebook,
        name=name,
        dag=dag,
        # The report is saved as {name}.html inside `out`. ipynb is the
        # default output, but any format supported by the official
        # nbconvert package can be used instead.
        product=File(out / (name + '.html')),
        # parameters passed to the notebook
        params=params,
        ext_in='py',
        kernelspec_name='python3')
# -

# Build the DAG:

dag.build()

# That's it. After building the DAG, each model generates one report; you can see them here: [Ridge](https://ploomber.github.io/posts/model-selection/artifacts/ridge), [Random Forest](https://ploomber.github.io/posts/model-selection/artifacts/rf) and [NuSVR](https://ploomber.github.io/posts/model-selection/artifacts/nusvr).
#
# Splitting logic into separate files improves readability and maintainability: if we want to add another model, we only have to add a new dictionary with the parameter grid; if preprocessing is needed, we just add a factory in `pipelines.py`.
Ejemplo n.º 4
0
def test_is_a_file_like_object():
    """A File product is an instance of os.PathLike."""
    product = File('/path/to/file')
    assert isinstance(product, os.PathLike)
Ejemplo n.º 5
0
def test_file_initialized_with_path():
    """A File built from a pathlib.Path renders to that same path string."""
    location = Path('/path/to/file')
    product = File(location)

    # render with no params; there are no placeholders to fill in
    product.render({})

    assert str(product) == str(location)
Ejemplo n.º 6
0
def test_repr_absolute():
    """repr() of a File with an absolute path shows the path unchanged."""
    product = File('/a/b/c')
    assert repr(product) == "File('/a/b/c')"
Ejemplo n.º 7
0
def test_repr_short():
    """repr() of a File with a long path elides the middle with '...'."""
    shortened = repr(File(string.ascii_letters))
    assert shortened == "File('abcdefghijklmnopq...IJKLMNOPQRSTUVWXYZ')"
Ejemplo n.º 8
0
 def make():
     """Build a two-task DAG in which task 'a' runs before task 'b'."""
     dag = DAG()
     first = PythonCallable(touch_root, File('a.txt'), dag, name='a')
     second = PythonCallable(touch, File('b.txt'), dag, name='b')
     # declare the dependency: 'a' is upstream of 'b'
     first >> second
     return dag
Ejemplo n.º 9
0
def test_exception_is_not_masked_if_not_catching_them(executor):
    """Building a DAG with a failing task raises FailedTask."""
    failing_dag = DAG(executor=executor)
    PythonCallable(failing_root, File('file.txt'), failing_dag)

    with pytest.raises(FailedTask):
        failing_dag.build()
Ejemplo n.º 10
0
def test_params_are_accesible_after_init():
    """Params passed to the constructor are readable from task.params."""
    task = PythonCallable(fn,
                          File('file.txt'),
                          DAG(),
                          'callable',
                          params={'a': 1})

    # compare against a fresh dict, not the object handed to the task
    assert task.params == {'a': 1}
Ejemplo n.º 11
0
def test_can_execute_python_callable(tmp_directory):
    """A DAG containing a single PythonCallable builds with a truthy result."""
    dag = DAG()
    PythonCallable(fn, File('file.txt'), dag, 'callable', params={'a': 1})

    build_result = dag.build()

    assert build_result
Ejemplo n.º 12
0
def test_delete_metadata(tmp_directory, file_, metadata):
    """File._delete_metadata() removes an existing metadata file."""
    Path('dir').mkdir()
    metadata_path = Path(metadata)
    metadata_path.touch()

    File(file_)._delete_metadata()

    assert not metadata_path.exists()
Ejemplo n.º 13
0
def test_delete_non_existing_metadata(tmp_directory):
    """_delete_metadata() completes when no metadata file exists."""
    product = File('some_file')

    product._delete_metadata()

    assert not Path('.some_file.metadata').exists()
Ejemplo n.º 14
0
                          File('join.parquet'),
                          dag,
                          name='join')

    dag['get'] >> fts

    (dag['get'] + fts) >> join

    return dag


tmp_dir = Path(tempfile.mkdtemp())

# Training pipeline: get -> (tasks added by add_fts) -> fit
dag_fit = DAG()
get = PythonCallable(_get,
                     File(tmp_dir / 'data.parquet'),
                     dag_fit,
                     name='get')
dag_fit = add_fts(dag_fit)

# the fit task declares two products: a text report and a serialized model
fit = PythonCallable(
    _fit,
    dict(report=File(tmp_dir / 'report.txt'),
         model=File(tmp_dir / 'model.joblib')),
    dag_fit,
    name='fit',
)
dag_fit['join'] >> fit

###############################################################################
# Fit pipeline plot
dag_fit.plot(output='matplotlib')

dag_fit.build()
Ejemplo n.º 15
0
def test_error_when_initializing_with_obj_other_than_str_or_path():
    """File(...) raises TypeError with a fixed message for a dict argument."""
    expected = 'File must be initialized with a str or a pathlib.Path'

    with pytest.raises(TypeError) as excinfo:
        File({})

    assert str(excinfo.value) == expected
Ejemplo n.º 16
0
def test_early_stop(executor, tmp_directory):
    """dag.build() returns None when the only task is early_stop_root."""
    dag = DAG(executor=executor)
    PythonCallable(early_stop_root, File('file.txt'), dag)

    outcome = dag.build()

    assert outcome is None
Ejemplo n.º 17
0
def test_repr_relative():
    """repr() of a File with a relative path shows the path unchanged."""
    product = File('a/b/c')
    assert repr(product) == "File('a/b/c')"
Ejemplo n.º 18
0
def test_early_stop_from_task_level_on_finish(executor, tmp_directory):
    """dag.build() returns None when a task's on_finish hook is early_stop."""
    dag = DAG(executor=executor)
    task = PythonCallable(touch_root, File('file.txt'), dag)
    task.on_finish = early_stop

    outcome = dag.build()

    assert outcome is None
Ejemplo n.º 19
0
def test_repr_absolute_shows_as_relative_if_possible():
    """An absolute path under the resolved '.' is shown as relative in repr()."""
    absolute = Path('.').resolve().joinpath('a')
    product = File(absolute)
    assert repr(product) == "File('a')"
Ejemplo n.º 20
0
def make_task_dump(dag, env):
    """Create the 'raw' task in ``dag``, writing ``raw.parquet`` under env.path.raw.

    Returns the PythonCallable task wrapping ``_dump``.
    """
    raw_product = File(env.path.raw / 'raw.parquet')
    return PythonCallable(_dump, product=raw_product, dag=dag, name='raw')
Ejemplo n.º 21
0
def test_client_is_none_by_default():
    """A File attached to a task without a client reports client as None."""
    product = File('file.txt')
    PythonCallable(_touch, product, dag=DAG())

    assert product.client is None
Ejemplo n.º 22
0
# Setup: create a throwaway sqlite database with a single 'example' table
tmp_dir = Path(tempfile.mkdtemp())
uri = 'sqlite:///{}'.format(tmp_dir / 'example.db')
engine = create_engine(uri)
df = pd.DataFrame({'a': list(range(1, 6))})
df.to_sql('example', engine)

###############################################################################
# Pipeline declaration
# ---------------------

dag = DAG(executor=Serial(build_in_subprocess=False))

# First task: dump the table from the db into a local CSV file
task_dump = SQLDump(
    'SELECT * FROM example',
    File(tmp_dir / 'example.csv'),
    dag,
    name='dump',
    client=SQLAlchemyClient(uri),
    chunksize=None,
)


# since this task will have an upstream dependency, it has to accept the
# upstream parameter, all tasks must accept a product parameter
def _add_one(upstream, product):
    """Read the CSV produced by the 'dump' task, add 1 to column ``a``, save it.

    ``upstream['dump']`` and ``product`` are converted with ``str()`` and
    used as the input and output CSV paths respectively. The output is
    written without the index column.
    """
    frame = pd.read_csv(str(upstream['dump']))
    frame['a'] += 1
    frame.to_csv(str(product), index=False)
Ejemplo n.º 23
0
def test_file_initialized_with_str():
    """A File built from a str renders to that same string."""
    raw_path = '/path/to/file'
    product = File(raw_path)

    # render with no params; there are no placeholders to fill in
    product.render({})

    assert str(product) == '/path/to/file'
Ejemplo n.º 24
0
def test_file_is_pickable():
    """A File survives a pickle round trip without raising."""
    product = File('/path/to/file.csv')
    serialized = pickle.dumps(product)
    # only the absence of an exception is checked here
    pickle.loads(serialized)
Ejemplo n.º 25
0
def test_file_is_rendered_correctly():
    """The ``{{name}}`` placeholder in a File path is replaced on render."""
    product = File('/path/to/{{name}}')

    product.render(params={'name': 'file'})

    assert str(product) == '/path/to/file'
Ejemplo n.º 26
0
def get_data(product, dates, name):
    """Write a parquet file of random values, one row per day in ``dates``.

    Dummy code, in reality this would usually be a Task that pulls data
    from a database.

    Parameters
    ----------
    product
        Output location; converted with ``str()`` and passed to
        ``DataFrame.to_parquet``.
    dates
        Indexable pair ``(start, end)``. Rows cover every day from ``start``
        up to, but not including, ``end``.
    name
        Not used by this function; kept so the signature is unchanged.
    """
    range_kwargs = dict(start=dates[0], end=dates[1], freq='D')

    try:
        # pandas >= 1.4 spells the half-open interval as inclusive='left';
        # the old closed='left' keyword was removed in pandas 2.0
        dates_series = pd.date_range(inclusive='left', **range_kwargs)
    except TypeError:
        # older pandas versions only know the `closed` keyword
        dates_series = pd.date_range(closed='left', **range_kwargs)

    values = np.random.rand(dates_series.shape[0])
    df = pd.DataFrame({'dates': dates_series, 'values': values})
    df.to_parquet(str(product))


dag = DAG()
# the {{name}} placeholder is part of the product path
product = File('{{name}}.parquet')

# date boundaries and step used to build the 'dates' intervals
start_date = date(2010, 1, 1)
end_date = date(2019, 6, 1)
delta = relativedelta(years=1)

params_array = ParamGrid(
    {'dates': Interval(start_date, end_date, delta)},
).zip()


def namer(params):
    """Return ``get_data_<start>_<end>`` for the pair in ``params['dates']``.

    Both elements are converted with ``str()`` and every ``-`` is replaced
    by ``_``.
    """
    dates = params['dates']
    start, end = (str(dates[i]).replace('-', '_') for i in (0, 1))
    return 'get_data_{}_{}'.format(start, end)
Ejemplo n.º 27
0
def test_path_to_metadata():
    """The metadata path for 'file.txt' is '.file.txt.metadata'."""
    product = File('file.txt')
    expected = Path('.file.txt.metadata')
    assert product._path_to_metadata == expected
Ejemplo n.º 28
0
 def make():
     """Build a single-task DAG whose task uses hook_crashing as on_finish."""
     # NOTE: must run callables in the same process so counting works
     in_process = Serial(build_in_subprocess=False)
     dag = DAG(executor=in_process)
     task = PythonCallable(fn, File('file1.txt'), dag)
     task.on_finish = hook_crashing
     return dag