Example #1
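# this snippet assumes a DAG named dag and several tasks created earlier and
# not shown here: red_task, white_task, concat_task and upload_task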
sql = """
CREATE TABLE {{product}} AS
SELECT *,
       -- compare each row's pH against the overall average
       pH > (SELECT AVG(pH) FROM {{upstream['upload']}}) AS high_pH
FROM {{upstream['upload']}}
"""

features = SQLScript(sql,
                     product=SQLiteRelation((None, 'features', 'table')),
                     dag=dag,
                     name='features')


red_task >> concat_task
white_task >> concat_task

concat_task >> upload_task >> features

###############################################################################
# render resolves all placeholders so you can see exactly which SQL code
# will be executed
dag.render()

###############################################################################
# print source code for task "features"
print(dag['features'].source_code)


dag.plot(output='matplotlib')

dag.build()
Example #2
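# this snippet assumes imports and objects created earlier and not shown here:
# pandas imported as pd, a DAG named dag, a pathlib tmp_dir, and a task_dump
# task that dumps the raw data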
def _add_one(upstream, product):
    """Read the dumped data, add one to column 'a' and save the result"""
    df = pd.read_csv(str(upstream['dump']))
    df['a'] = df['a'] + 1
    df.to_csv(str(product), index=False)

# we convert the Python function into a Task
task_add_one = PythonCallable(_add_one,
                              File(tmp_dir / 'add_one.csv'),
                              dag,
                              name='add_one')

# declare how tasks relate to each other: first dump then add one
task_dump >> task_add_one


# plot the workflow, pending tasks are shown in red
dag.plot(output='matplotlib', clear_cached_status=True)

# run our sample pipeline
dag.build()



###############################################################################
# Each time the DAG runs, it saves the current timestamp and the source code
# of each task. On subsequent runs, it only executes the tasks needed to get
# everything up-to-date. The rule is simple: a task runs if its code (or the
# code of any task it depends on) has changed since the last time it ran.
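
# a quick sketch of that rule in action (assuming the pipeline we just built):
# calling build again without changing any source code skips every task,
# since all products are already up-to-date
dag.build()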

# Data processing pipelines consist of many small, long-running tasks which
# depend on each other. During early development phases things are expected to
Example #3
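# this snippet assumes a DAG and a SQLite client were created earlier; a
# minimal sketch of that setup (exact details not shown in the original):
#
#     from ploomber import DAG
#     from ploomber.clients import SQLAlchemyClient
#
#     dag = DAG()
#     client = SQLAlchemyClient('sqlite:///' + str(tmp_dir / 'data.db'))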
dag.clients[SQLiteRelation] = client
dag.clients[SQLScript] = client

source_loader = SourceLoader(tmp_dir)
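
# the two SQL files loaded below are not shown here; hypothetically,
# data_select.sql could be a plain SELECT over the source table, while
# subset_create.sql could import a Jinja macro (which is what the rendering
# check further down verifies), e.g.:
#
#     {% import "macros.sql" as m %}
#     CREATE TABLE {{product}} AS
#     SELECT * FROM {{upstream['transfer']}} WHERE {{m.my_filter()}}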

transfer = SQLTransfer(source_loader['data_select.sql'],
                       product=SQLiteRelation((None, 'data2', 'table')),
                       dag=dag,
                       name='transfer')

subset = SQLScript(source_loader['subset_create.sql'],
                   product=SQLiteRelation((None, 'subset', 'table')),
                   dag=dag,
                   name='subset')

transfer >> subset

dag.render()

###############################################################################
# Our macro is correctly rendered:

print(dag['subset'].source)

###############################################################################
# Plot and execute pipeline:

dag.plot()

dag.build()
Example #4
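# this snippet assumes several helpers defined earlier and not shown here:
# _get dumps the training data, add_fts appends feature-engineering tasks
# ending in one named 'join', _fit trains a model and writes a report,
# _new_obs writes a single new observation, and _pred scores it with the
# trained model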
# build training pipeline
dag_fit = DAG()
get = PythonCallable(_get, File(tmp_dir / 'data.parquet'), dag_fit, name='get')
dag_fit = add_fts(dag_fit)
fit = PythonCallable(_fit, {
    'report': File(tmp_dir / 'report.txt'),
    'model': File(tmp_dir / 'model.joblib')
},
                     dag_fit,
                     name='fit')
dag_fit['join'] >> fit

###############################################################################
# Fit pipeline plot
dag_fit.plot(output='matplotlib')

dag_fit.build()

# build prediction pipeline - pass a new observation with values [1, 0, 10, 2]
dag_pred = DAG()
get = PythonCallable(_new_obs,
                     File(tmp_dir / 'obs.parquet'),
                     dag_pred,
                     name='get',
                     params={'values': [1, 0, 10, 2]})

dag_pred = add_fts(dag_pred)
pred = PythonCallable(_pred,
                      File(tmp_dir / 'pred.csv'),
                      dag_pred,
                      name='pred')
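
# a sketch of how the prediction pipeline is presumably wired and built,
# mirroring the fit pipeline above (assuming _pred reads the model saved by
# the fit pipeline):
dag_pred['join'] >> pred

dag_pred.build()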