Example #1
# assumed imports; sqlite_client_and_tmp_dir is a pytest fixture (from the
# suite's conftest) that yields a client connected to a temporary SQLite database
from pathlib import Path

from ploomber import DAG
from ploomber.products import File, SQLiteRelation
from ploomber.tasks import PythonCallable, SQLScript


def touch_root(product):  # minimal stand-in for the suite's helper
    Path(str(product)).touch()


def test_dag_reports_sub_select_cols(sqlite_client_and_tmp_dir):
    client, _ = sqlite_client_and_tmp_dir
    dag = DAG()

    dag.clients[SQLScript] = client
    dag.clients[SQLiteRelation] = client

    PythonCallable(touch_root, File('some_file.txt'), dag, name='task')
    sql = 'CREATE TABLE {{product}} AS SELECT * FROM data'
    SQLScript(sql, SQLiteRelation(('data2', 'table')), dag, name='task2')

    # status() and build() return table-like summaries that support
    # sub-selecting columns
    assert dag.status()[['name', 'Last run']]
    assert dag.build()[['Ran?', 'Elapsed (s)']]
Example #2
# assumed imports; executor and tmp_directory are pytest fixtures from the
# suite's conftest
from pathlib import Path

from ploomber import DAG
from ploomber.constants import TaskStatus
from ploomber.products import File
from ploomber.tasks import PythonCallable


def touch_root(product):  # minimal stand-in for the suite's helper
    Path(str(product)).touch()


def test_status_cleared_after_reporting_status(executor, tmp_directory):
    # this is a pesky scenario: we try to avoid retrieving metadata when we
    # don't have to because it's slow, so we keep a local copy, but this means
    # we have to keep an eye on conditions where we must retrieve again;
    # here's one edge case
    dag = DAG(executor=executor)
    PythonCallable(touch_root, File('ok.txt'), dag, name='t1')

    # dag status requires retrieving metadata; we now have a local copy...
    dag.status()

    # building a task means saving metadata again. if the task runs in the
    # same process as the dag, metadata stays up-to-date because saving it
    # first overrides the local copy. the edge case happens when the task
    # runs in a child process: the local copy in the DAG's process is now
    # outdated and must be cleared
    dag.build()

    # this should not trigger any execution, because we just built
    dag.build()

    assert set(t.exec_status for t in dag.values()) == {TaskStatus.Skipped}
Example #3
###############################################################################
# Inspecting a pipeline
# *********************

# Many data pipelines start as experimental projects (e.g. developing a
# Machine Learning model), which causes them to grow unpredictably. As the
# pipeline evolves, it can span dozens of files whose intent is unclear. The
# DAG object serves as the primary reference for anyone seeking to understand
# the pipeline.


# Making a pipeline transparent helps others understand it quickly without
# digging through the code, and it eases debugging for developers.
# status() returns a summary of each task's status
dag.status()
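
# the returned object is table-like, so you can sub-select columns to narrow
# the summary; a minimal sketch, assuming the column names used in ploomber's
# own tests (they may vary by version):
dag.status()[['name', 'Last run']]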



###############################################################################
# Inspecting the `DAG` object
# ---------------------------
# A lot of data work is done interactively using Jupyter or similar tools;
# being able to interact with a pipeline in the same way is an effective way
# of experimenting with new methods.

# say you are adding a new feature to the add_one task; you can run your code
# with all upstream dependencies taken care of, like this

# run your task
dag['add_one'].build(force=True)
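
# after building, you can locate the task's output through its product
# (str() on a File product gives its path); the pandas call below is a
# sketch that assumes add_one writes a CSV file
import pandas as pd

df = pd.read_csv(str(dag['add_one'].product))
df.head()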