Esempio n. 1
0
def test_warns_if_sql_scipt_does_not_create_relation():
    dag = DAG()

    t = SQLScript('SELECT * FROM {{product}}',
                  SQLiteRelation((None, 'my_table', 'table')),
                  dag=dag,
                  client=Mock(),
                  name='sql')

    match = 'will not create any tables or views but the task has product'

    with pytest.warns(UserWarning, match=match):
        t.render()
Esempio n. 2
0
def test_postgresscript_with_relation():
    dag = DAG()
    t = SQLScript('CREATE TABLE {{product}} AS SELECT * FROM {{name}}',
                  PostgresRelation(('user', 'table', 'table'), client=Dummy()),
                  dag,
                  name='name',
                  params=dict(name='some_table'),
                  client=Dummy())

    t.render()

    assert str(t.product) == 'user.table'
    assert (str(
        t.source) == 'CREATE TABLE user.table AS SELECT * FROM some_table')
Esempio n. 3
0
def test_passing_upstream_and_product_in_postgres(pg_client, db_credentials):
    dag = DAG()

    client = SQLAlchemyClient(db_credentials['uri'])

    dag.clients[SQLScript] = client
    dag.clients[PostgresRelation] = client

    conn = pg_client.connection
    cur = conn.cursor()
    cur.execute('drop table if exists series;')
    conn.commit()
    conn.close()

    ta_t = """begin;
              drop table if exists {{product}};
              create table {{product}} as
              select * from generate_series(0, 15) as n;
              commit;"""
    ta_rel = PostgresRelation((None, 'series', 'table'))
    ta = SQLScript(ta_t, ta_rel, dag, 'ta')

    dag.build()

    assert ta_rel.exists()
Esempio n. 4
0
def test_clients_are_closed_after_build(tmp_directory):
    # TODO: same test but when the dag breaks (make sure clients are closed
    # even on that case)
    tmp = Path(tmp_directory)

    # create a db
    conn = sqlite3.connect(str(tmp / "database.db"))
    uri = 'sqlite:///{}'.format(tmp / "database.db")

    # make some data and save it in the db
    df = pd.DataFrame({'a': np.arange(0, 100), 'b': np.arange(100, 200)})
    df.to_sql('numbers', conn)
    conn.close()

    # create the task and run it
    dag = DAG()

    def mock_client():
        m = Mock(wraps=SQLAlchemyClient(uri))
        m.split_source = ';'
        return m

    clients = [mock_client() for _ in range(4)]

    dag.clients[SQLScript] = clients[0]
    dag.clients[SQLiteRelation] = clients[1]

    t1 = SQLScript("""
    CREATE TABLE {{product}} AS SELECT * FROM numbers
    """,
                   SQLiteRelation(('another', 'table')),
                   dag=dag,
                   name='t1')

    t2 = SQLScript("""
    CREATE TABLE {{product}} AS SELECT * FROM {{upstream['t1']}}
    """,
                   SQLiteRelation(('yet_another', 'table'), client=clients[2]),
                   dag=dag,
                   name='t2',
                   client=clients[3])

    t1 >> t2

    dag.build()

    assert all(client.close.called for client in clients)
Esempio n. 5
0
def test_placeholder_is_copied_upon_initialization():
    dag = DAG()
    dag.clients[SQLScript] = Mock()
    dag.clients[PostgresRelation] = Mock()

    p = Placeholder('CREATE TABLE {{product}} AS SELECT * FROM TABLE')

    t1 = SQLScript(p,
                   PostgresRelation(('schema', 'a_table', 'table')),
                   dag,
                   name='t1')
    t2 = SQLScript(p,
                   PostgresRelation(('schema', 'another_table', 'table')),
                   dag,
                   name='t2')

    assert t1.source._placeholder is not t2.source._placeholder
Esempio n. 6
0
    def make(client):
        dag = DAG()
        dag.clients[SQLScript] = client
        dag.clients[SQLiteRelation] = client

        SQLScript('CREATE TABLE {{product}} AS SELECT * FROM data',
                  SQLiteRelation(['data2', 'table']),
                  dag=dag,
                  name='task')

        return dag
Esempio n. 7
0
def test_warns_if_number_of_relations_does_not_match_products():
    dag = DAG()

    sql = """
    -- wrong sql, products must be used in CREATE statements
    CREATE TABLE {{product[0]}} AS
    SELECT * FROM my_table
    """

    t = SQLScript(sql, [
        SQLiteRelation((None, 'my_table', 'table')),
        SQLiteRelation((None, 'another_table', 'table'))
    ],
                  dag=dag,
                  client=Mock(),
                  name='sql')

    match = r'.*will create 1 relation\(s\) but you declared 2 product\(s\).*'

    with pytest.warns(UserWarning, match=match):
        t.render()
Esempio n. 8
0
def test_dag_reports_sub_select_cols(sqlite_client_and_tmp_dir):
    client, _ = sqlite_client_and_tmp_dir
    dag = DAG()

    dag.clients[SQLScript] = client
    dag.clients[SQLiteRelation] = client

    PythonCallable(touch_root, File('some_file.txt'), dag, name='task')
    sql = 'CREATE TABLE {{product}} AS SELECT * FROM data'
    SQLScript(sql, SQLiteRelation(('data2', 'table')), dag, name='task2')

    assert dag.status()[['name', 'Last run']]
    assert dag.build()[['Ran?', 'Elapsed (s)']]
Esempio n. 9
0
def test_drill_dag():
    code = """
    -- {{product.name}}
    CREATE TABLE dfs.tmp.`/test/` AS
    SELECT * FROM cp.`employee.json` LIMIT 1
    """

    drill = DrillClient()
    dag = DAG()

    SQLScript(code, File('/tmp/test/'), dag, client=drill)

    dag.build()
Esempio n. 10
0
def test_warns_if_sql_script_does_not_create_relation(
        sqlite_client_and_tmp_dir):
    client, _ = sqlite_client_and_tmp_dir
    dag = DAG()
    dag.clients[SQLiteRelation] = client

    mock_client = Mock()
    mock_client.split_source = ';'

    t = SQLScript('SELECT * FROM {{product}}',
                  SQLiteRelation((None, 'my_table', 'table')),
                  dag=dag,
                  client=mock_client,
                  name='sql')

    with pytest.warns(UserWarning) as record:
        t.render()

    assert len(record) == 1
    msg = ('It appears that your script will not create any tables/views but '
           "the product parameter is "
           "SQLiteRelation(('my_table', 'table'))")
    assert record[0].message.args[0] == msg
Esempio n. 11
0
def test_can_request_params(sqlite_client_and_tmp_dir):
    on_finish_sql.task = None
    on_finish_sql.product = None
    on_finish_sql.client = None

    client, _ = sqlite_client_and_tmp_dir
    dag = DAG()

    dag.clients[SQLScript] = client
    dag.clients[SQLiteRelation] = client

    t = SQLScript('CREATE TABLE {{product}} AS SELECT * FROM data',
                  SQLiteRelation(('new_table', 'table')),
                  dag,
                  name='t')

    t.on_finish = on_finish_sql

    dag.build()

    assert on_finish_sql.task is t
    assert on_finish_sql.product is t.product
    assert on_finish_sql.client is client
Esempio n. 12
0
def test_duplicated_sql_product(class1, class2, return_value):
    dag = DAG()

    client = Mock()
    client.split_source = ';'
    client.connection.cursor().fetchone.return_value = return_value

    dag.clients[SQLScript] = client
    dag.clients[class1] = client
    dag.clients[class2] = client

    SQLScript('CREATE TABLE {{product}} AS SELECT * FROM table',
              class1(('schema', 'name', 'table')),
              dag=dag,
              name='task')
    SQLScript('CREATE TABLE {{product}} AS SELECT * FROM table',
              class2(('schema', 'name', 'table')),
              dag=dag,
              name='another')

    with pytest.raises(DAGRenderError) as excinfo:
        dag.render()

    assert 'Tasks must generate unique Products.' in str(excinfo.value)
Esempio n. 13
0
def test_add_metadata_fields(client_and_prod):
    client, product, schema = client_and_prod
    dag = DAG()
    dag.clients[SQLScript] = client
    dag.clients[type(product)] = client

    query = 'CREATE TABLE {{product}} AS SELECT * FROM data'
    product.prepare_metadata = add_number_one

    SQLScript(query, product, dag, name='t1')

    dag.build()

    metadata = product.fetch_metadata()

    assert metadata['number'] == 1
Esempio n. 14
0
def test_sql_script_shows_executed_code_if_fails(tmp_directory, sample_data):

    dag = DAG()

    client = SQLAlchemyClient('sqlite:///database.db')
    dag.clients[SQLScript] = client
    dag.clients[SQLiteRelation] = client

    SQLScript('SOME INVALID SQL {{product}}',
              SQLiteRelation((None, 'another', 'table')),
              dag=dag,
              name='task')

    with pytest.raises(DAGBuildError) as excinfo:
        dag.build()

    assert 'SOME INVALID SQL' in str(excinfo.value)
    assert 'near "SOME": syntax error' in str(excinfo.value)
Esempio n. 15
0
def test_passing_upstream_and_product_in_postgres(pg_client_and_schema):
    client, _ = pg_client_and_schema
    dag = DAG()
    dag.clients[SQLScript] = client
    dag.clients[PostgresRelation] = client

    client.execute('drop table if exists series;')

    ta_t = """begin;
              drop table if exists {{product}};
              create table {{product}} as
              select * from generate_series(0, 15) as n;
              commit;"""
    ta_rel = PostgresRelation((None, 'series', 'table'))
    SQLScript(ta_t, ta_rel, dag, 'ta')

    dag.build()

    assert ta_rel.exists()
Esempio n. 16
0
def make(env, artist_name, language):
    artist_name_ = artist_name.lower().replace(' ', '_')

    dag = DAG(executor=Serial(False))

    loader = SourceLoader(path='sql', module='ploomber_lyrics')

    db_location = env.path.data / 'clean.db'

    client = DBAPIClient(sqlite3.connect,
                         {'database': db_location},
                         split_source=True)
    dag.clients[SQLScript] = client
    dag.clients[SQLiteRelation] = client

    t = DownloadFromURL(env.source.lyrics,
                        File(env.path.data / 'lyrics.db'),
                        dag, name='raw_db')

    t2 = DownloadFromURL(env.source.metadata,
                         File(env.path.data / 'metadata.db'),
                         dag, name='raw_metadata_db')

    t3 = DownloadFromURL(env.source.artist_tags,
                         File(env.path.data / 'artist_tags.db'),
                         dag, name='raw_artist_tags_db')

    top = SQLScript(loader['get_top_words.sql'],
                    SQLiteRelation(('top_words_{{artist_name_}}', 'table')),
                    dag,
                    params={'artist_name': artist_name,
                            'artist_name_': artist_name_})

    wc = PythonCallable(tasks.wordcloud,
                        File(env.path.data / (artist_name_ + '.png')),
                        dag,
                        params={'db_location': db_location,
                                'language': language})

    (t + t2 + t3) >> top >> wc

    return dag
def test_ignores_non_file_products(tmp_directory, monkeypatch):
    mock_remote = Mock()
    monkeypatch.setattr(file._RemoteFile, '_fetch_remote_metadata',
                        mock_remote)

    client = SQLAlchemyClient('sqlite:///my.db')

    dag = DAG()
    dag.clients[SQLScript] = client
    dag.clients[SQLiteRelation] = client
    dag.clients[File] = LocalStorageClient('remote', path_to_project_root='.')

    SQLScript('CREATE TABLE {{product}} AS SELECT * FROM data',
              SQLiteRelation(['data2', 'table']),
              dag=dag,
              name='task')

    dag.render()
    client.close()

    mock_remote.assert_not_called()
Esempio n. 18
0
def test_sqlscript_load(tmp_directory, sample_data, client):

    dag = DAG()

    dag.clients[SQLScript] = client
    dag.clients[SQLiteRelation] = client

    SQLScript('CREATE TABLE {{product}} AS SELECT * FROM numbers',
              SQLiteRelation((None, 'another', 'table')),
              dag=dag,
              name='task')

    dag.build(close_clients=False)

    df = dag['task'].load()

    dag.close_clients()

    assert df.to_dict(orient='list') == {
        'a': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
        'b': [100, 101, 102, 103, 104, 105, 106, 107, 108, 109]
    }
Esempio n. 19
0

###############################################################################
# you can use jinja2 to parametrize SQL, {{upstream}} and {{product}}
# are available for your script. this way you could switch products without
# changing your source code (e.g. each Data Scientist in your team writes
# to his/her own db schema to have isolated runs)
sql = """
CREATE TABLE {{product}} AS
SELECT *,
       pH > AVG(pH) AS high_pH
FROM {{upstream['upload']}}
"""

features = SQLScript(sql,
                     product=SQLiteRelation((None, 'features', 'table')),
                     dag=dag,
                     name='features')


red_task >> concat_task
white_task >> concat_task

concat_task >> upload_task >> features

###############################################################################
# render will pass all parameters so you can see exactly which SQL code
# will be executed
dag.render()

###############################################################################
# print source code for task "features"
Esempio n. 20
0
                       name='red')
sample >> red_task

white_task = ShellScript(('csvsql --db {{uri}} --tables {{product.name}} --insert {{upstream["sample"][1]}} '
                          '--overwrite'),
                         PostgresRelation(('public', 'white', 'table')),
                         dag,
                         params=dict(uri=uri),
                         name='white')
sample >> white_task


# let's introduce a new type of task, a SQLScript, this task will execute
# a script in a SQL database (any database supported by sql alchemy)
wine_task = SQLScript(Path('sql', 'create_wine.sql'),
                      PostgresRelation(('public', 'wine', 'table')),
                      dag, name='wine')
(red_task + white_task) >> wine_task


dataset_task = SQLScript(Path('sql', 'create_dataset.sql'),
                         PostgresRelation(('public', 'dataset', 'table')),
                         dag, name='dataset')
wine_task >> dataset_task


training_task = SQLScript(Path('sql', 'create_training.sql'),
                          PostgresRelation(('public', 'training', 'table')),
                          dag, name='training')
dataset_task >> training_task
Esempio n. 21
0
# DAG declaration

dag = DAG(executor=Serial(build_in_subprocess=False))
dag.clients[SQLTransfer] = client
dag.clients[SQLiteRelation] = client
dag.clients[SQLScript] = client

source_loader = SourceLoader(tmp_dir)

transfer = SQLTransfer(source_loader['data_select.sql'],
                       product=SQLiteRelation((None, 'data2', 'table')),
                       dag=dag,
                       name='transfer')

subset = SQLScript(source_loader['subset_create.sql'],
                   product=SQLiteRelation((None, 'subset', 'table')),
                   dag=dag,
                   name='subset')

transfer >> subset

dag.render()

###############################################################################
# Our macro is correctly rendered:

print(dag['subset'].source)

###############################################################################
# Plot and execute pipeline:

dag.plot()
Esempio n. 22
0
# this here to make this example standalone. SQL is a language that people
# from a lot of backgrounds understand, you could easily communicate your
# analysis with business analysts to make your your data assumptions are
# correct
_clean = """
/*
Cleaning dataset, we decided to ignore rows where magnesium is over 100
since we believe the data is corrupted
*/
CREATE TABLE {{product}} AS
SELECT * FROM {{upstream['upload']}}
WHERE magnesium < 100.0
"""

clean = SQLScript(_clean,
                  product=SQLiteRelation((None, 'wine_clean', 'table')),
                  dag=dag,
                  name='clean')


get_data >> upload >> clean

###############################################################################
# Pipeline build
# --------------

dag.build()


###############################################################################
# Pipeline plot
# -------------