Ejemplo n.º 1
0
def test_sqlite_product_fetch_metadata_none_if_not_exists(tmp_directory):
    """fetch_metadata() returns None when nothing was written to the db."""
    db_path = Path(tmp_directory) / "database.db"
    client = SQLAlchemyClient('sqlite:///{}'.format(db_path))

    # this test never creates the 'numbers' table before fetching
    relation = SQLiteRelation((None, 'numbers', 'table'), client)
    relation.render({})

    metadata = relation.fetch_metadata()
    assert metadata is None
Ejemplo n.º 2
0
def test_sqlite_product_fetch_metadata_none_if_empty_metadata(tmp_directory):
    """fetch_metadata() returns None when the table has no saved metadata."""
    db_path = Path(tmp_directory) / "database.db"
    client = SQLAlchemyClient('sqlite:///{}'.format(db_path))

    # create the table directly with pandas; save_metadata is never called
    frame = pd.DataFrame({'a': np.arange(0, 100), 'b': np.arange(100, 200)})
    frame.to_sql('numbers', client.engine)

    relation = SQLiteRelation((None, 'numbers', 'table'), client)
    relation.render({})

    metadata = relation.fetch_metadata()
    assert metadata is None
Ejemplo n.º 3
0
def client_and_prod(request, sqlite_client_and_tmp_dir, pg_client_and_schema):
    """Yield ``(client, product, schema)`` for the requested backend.

    ``request.param == 'sqlite'`` selects a SQLiteRelation with no schema;
    any other value selects a PostgresRelation in the schema supplied by
    ``pg_client_and_schema``. After the consumer resumes this generator,
    the product is deleted.
    """
    # Based on: https://github.com/pytest-dev/pytest/issues/349#issue-88534390
    wants_sqlite = request.param == 'sqlite'

    if not wants_sqlite:
        client, schema = pg_client_and_schema
        product = PostgresRelation((schema, 'numbers', 'table'), client)
    else:
        client, _ = sqlite_client_and_tmp_dir
        schema = None
        product = SQLiteRelation((None, 'numbers', 'table'), client)

    yield client, product, schema

    # teardown: drop whatever was created under this product
    product.delete()
Ejemplo n.º 4
0
def test_old_metadata_is_replaced(arg, sqlite_client_and_tmp_dir):
    """Saving metadata twice leaves exactly one ``_metadata`` row."""
    client, _ = sqlite_client_and_tmp_dir
    schema, name, _ = arg

    product = SQLiteRelation(arg, client=client)
    product.render({})

    # save twice in a row; the row count below must still be 1
    for _ in range(2):
        product.save_metadata({
            'timestamp': datetime.now().timestamp(),
            'stored_source_code': 'some code'
        })

    # only filter by schema when the product actually declares one
    conditions = ["name='{}'".format(name)]
    if schema is not None:
        conditions.append("schema='{}'".format(schema))

    query = "SELECT COUNT(*) FROM _metadata WHERE " + " AND ".join(conditions)

    rows = list(client.engine.execute(query))
    count = rows[0][0]

    assert count == 1
Ejemplo n.º 5
0
def test_can_transfer_sqlite(tmp_directory):
    """Transfer a table between two SQLite databases and compare contents.

    >>> import tempfile
    >>> tmp_directory = tempfile.mkdtemp()
    """
    root = Path(tmp_directory)

    # create connections to 2 dbs: the source and the destination
    path_in = root / "database_in.db"
    path_out = root / "database_out.db"
    client_in = SQLAlchemyClient('sqlite:///{}'.format(path_in))
    client_out = SQLAlchemyClient('sqlite:///{}'.format(path_out))

    # seed the source database with some data
    frame = pd.DataFrame({'a': np.arange(0, 100), 'b': np.arange(100, 200)})
    frame.to_sql('numbers', client_in.engine, index=False)

    # declare the transfer task (10 rows per chunk) and run it
    dag = DAG()
    destination = SQLiteRelation((None, 'numbers2', 'table'),
                                 client=client_out)
    SQLTransfer('SELECT * FROM numbers',
                destination,
                dag,
                name='transfer',
                client=client_in,
                chunksize=10)
    dag.build()

    # read both tables back
    original = pd.read_sql_query('SELECT * FROM numbers', client_in.engine)
    transfer = pd.read_sql_query('SELECT * FROM numbers2', client_out.engine)

    for opened in (client_in, client_out):
        opened.close()

    # make sure they are the same
    assert original.equals(transfer)
Ejemplo n.º 6
0
def test_custom_io_handler(tmp_directory):
    """SQLUpload accepts a user-supplied ``io_handler`` to read the file."""
    dag = DAG()
    client = SQLAlchemyClient('sqlite:///database.db')
    for class_ in (SQLUpload, SQLiteRelation):
        dag.clients[class_] = client

    # write a tab-separated file to upload
    expected = pd.DataFrame({'a': [1, 2, 3], 'b': [1, 2, 3]})
    expected.to_csv('some-file.tsv', sep='\t', index=False)

    def my_reading_fn(path):
        return pd.read_csv(path, sep='\t')

    product = SQLiteRelation(('my-table', 'table'))
    SQLUpload('some-file.tsv',
              product,
              dag=dag,
              name='task',
              io_handler=my_reading_fn,
              to_sql_kwargs={'index': False})

    dag.build()

    uploaded = pd.read_sql('SELECT * FROM "my-table"', con=client)

    client.close()

    assert uploaded.equals(expected)
Ejemplo n.º 7
0
def test_clients_are_closed_after_build(tmp_directory):
    """Every client (dag-level, task-level, product-level) is closed by build.
    """
    # TODO: same test but when the dag breaks (make sure clients are closed
    # even on that case)
    db_path = Path(tmp_directory) / "database.db"

    # create a db
    conn = sqlite3.connect(str(db_path))
    uri = 'sqlite:///{}'.format(db_path)

    # make some data and save it in the db
    frame = pd.DataFrame({'a': np.arange(0, 100), 'b': np.arange(100, 200)})
    frame.to_sql('numbers', conn)
    conn.close()

    # create the tasks and run them
    dag = DAG()

    def mock_client():
        # wrap a real client so calls still execute but are recorded
        wrapped = Mock(wraps=SQLAlchemyClient(uri))
        wrapped.split_source = ';'
        return wrapped

    clients = [mock_client() for _ in range(4)]
    dag_script_client, dag_relation_client, product_client, task_client = (
        clients)

    dag.clients[SQLScript] = dag_script_client
    dag.clients[SQLiteRelation] = dag_relation_client

    # t1 relies on the dag-level clients only
    first = SQLScript("""
    CREATE TABLE {{product}} AS SELECT * FROM numbers
    """,
                      SQLiteRelation(('another', 'table')),
                      dag=dag,
                      name='t1')

    # t2 brings its own task-level and product-level clients
    second = SQLScript("""
    CREATE TABLE {{product}} AS SELECT * FROM {{upstream['t1']}}
    """,
                       SQLiteRelation(('yet_another', 'table'),
                                      client=product_client),
                       dag=dag,
                       name='t2',
                       client=task_client)

    first >> second

    dag.build()

    still_open = [c for c in clients if not c.close.called]
    assert not still_open
Ejemplo n.º 8
0
def test_doesnt_warn_if_relations_match(sql, split_source):
    """Rendering emits no warnings when the script matches the product.

    Parameters
    ----------
    sql
        SQL source passed to SQLScriptSource (supplied by the test's
        parametrization, which is outside this block).
    split_source
        Forwarded to SQLScriptSource as ``split_source``.
    """
    # local import: the file's top-level import block is not visible here
    import warnings

    product = SQLiteRelation((None, 'my_table', 'table'))

    source = SQLScriptSource(sql, split_source=split_source)

    # pytest.warns(None) is deprecated since pytest 7 and is an error in
    # pytest 8; record warnings with the standard library instead. The
    # "always" filter keeps already-seen warnings from being suppressed.
    with warnings.catch_warnings(record=True) as record:
        warnings.simplefilter('always')
        source.render({'product': product})

    assert not record
Ejemplo n.º 9
0
def test_sqlite_product_save_metadata(tmp_directory):
    """Metadata saved on a SQLiteRelation is fetched back unchanged."""
    db_path = Path(tmp_directory) / "database.db"
    client = SQLAlchemyClient('sqlite:///{}'.format(db_path))

    relation = SQLiteRelation((None, 'numbers', 'table'), client)
    relation.render({})

    # fill in both metadata fields, then persist them
    relation.metadata['timestamp'] = datetime.now().timestamp()
    relation.metadata['stored_source_code'] = 'some code'
    relation.save_metadata()

    assert relation.fetch_metadata() == relation.metadata
Ejemplo n.º 10
0
    def make(client):
        """Return a DAG with one SQLScript task that creates ``data2``.

        ``client`` is registered as the dag-level client for both SQLScript
        and SQLiteRelation.
        """
        dag = DAG()
        for class_ in (SQLScript, SQLiteRelation):
            dag.clients[class_] = client

        product = SQLiteRelation(['data2', 'table'])
        SQLScript('CREATE TABLE {{product}} AS SELECT * FROM data',
                  product,
                  dag=dag,
                  name='task')

        return dag
Ejemplo n.º 11
0
def test_warns_if_number_of_relations_does_not_match_number_of_products():
    """One CREATE statement with two declared products emits one warning."""
    table_names = ('my_table', 'another_table')
    product = [SQLiteRelation((None, name, 'table')) for name in table_names]

    # the script only creates the first of the two products
    source = SQLScriptSource("""
    CREATE TABLE {{product[0]}} AS
    SELECT * FROM my_table
    GROUP BY some_column
    """)

    with pytest.warns(UserWarning) as record:
        source.render({'product': product})

    assert len(record) == 1

    expected = ('It appears that your script will create 1 relation(s) '
                'but you declared 2 product(s): '
                "[SQLiteRelation(('my_table', 'table')), "
                "SQLiteRelation(('another_table', 'table'))]")
    actual = record[0].message.args[0]
    assert actual == expected
Ejemplo n.º 12
0
def test_warns_if_number_of_relations_does_not_match_products():
    """Rendering a task that creates 1 of its 2 products emits a warning."""
    dag = DAG()

    sql = """
    -- wrong sql, products must be used in CREATE statements
    CREATE TABLE {{product[0]}} AS
    SELECT * FROM my_table
    """

    products = [
        SQLiteRelation((None, table, 'table'))
        for table in ('my_table', 'another_table')
    ]
    task = SQLScript(sql, products, dag=dag, client=Mock(), name='sql')

    pattern = (
        r'.*will create 1 relation\(s\) but you declared 2 product\(s\).*')

    with pytest.warns(UserWarning, match=pattern):
        task.render()
Ejemplo n.º 13
0
def test_warns_if_sql_scipt_does_not_create_relation():
    """Rendering a script with no CREATE statement emits a UserWarning."""
    dag = DAG()

    # the script only reads from the product, it never creates it
    product = SQLiteRelation((None, 'my_table', 'table'))
    task = SQLScript('SELECT * FROM {{product}}',
                     product,
                     dag=dag,
                     client=Mock(),
                     name='sql')

    pattern = 'will not create any tables or views but the task has product'

    with pytest.warns(UserWarning, match=pattern):
        task.render()
Ejemplo n.º 14
0
def test_dag_reports_sub_select_cols(sqlite_client_and_tmp_dir):
    """status() and build() reports can be indexed with a list of columns."""
    client, _ = sqlite_client_and_tmp_dir
    dag = DAG()

    for class_ in (SQLScript, SQLiteRelation):
        dag.clients[class_] = client

    # one Python task and one SQL task
    PythonCallable(touch_root, File('some_file.txt'), dag, name='task')
    sql = 'CREATE TABLE {{product}} AS SELECT * FROM data'
    sql_product = SQLiteRelation(('data2', 'table'))
    SQLScript(sql, sql_product, dag, name='task2')

    status_report = dag.status()
    assert status_report[['name', 'Last run']]

    build_report = dag.build()
    assert build_report[['Ran?', 'Elapsed (s)']]
Ejemplo n.º 15
0
def test_source_loader_and_task(sqlite_client_and_tmp_dir):
    """A source obtained from a SourceLoader can drive a SQLTransfer task."""
    client, tmp_dir = sqlite_client_and_tmp_dir

    # write the query to disk so the loader can find it
    query_path = Path(tmp_dir, 'data_query.sql')
    query_path.write_text('SELECT * FROM data')
    source_loader = SourceLoader(str(tmp_dir))

    dag = DAG()
    for class_ in (SQLTransfer, SQLiteRelation):
        dag.clients[class_] = client

    query = source_loader['data_query.sql']
    SQLTransfer(query,
                product=SQLiteRelation((None, 'data2', 'table')),
                dag=dag,
                name='transfer')

    dag.build()
Ejemplo n.º 16
0
def test_warns_if_no_create_statement_found():
    """A script without a CREATE statement emits exactly one warning."""
    product = SQLiteRelation((None, 'my_table', 'table'))

    # the product tag only appears inside a SQL comment
    source = SQLScriptSource("""
    -- {{product}} without CREATE statement
    SELECT * FROM my_table
    GROUP BY some_column
    """)

    with pytest.warns(UserWarning) as record:
        source.render({'product': product})

    assert len(record) == 1

    expected = ('It appears that your script will not create any tables/views '
                'but the product parameter is '
                "SQLiteRelation(('my_table', 'table'))")
    actual = record[0].message.args[0]
    assert actual == expected
Ejemplo n.º 17
0
def test_sqlite_product_delete(tmp_directory):
    """After delete(), the relation no longer reports that it exists.

    >>> import tempfile
    >>> tmp_directory = tempfile.mkdtemp()
    """
    db_path = Path(tmp_directory) / "database.db"
    client = SQLAlchemyClient('sqlite:///{}'.format(db_path))

    # create the table that is about to be deleted
    frame = pd.DataFrame({'a': np.arange(0, 100), 'b': np.arange(100, 200)})
    frame.to_sql('numbers', client.engine)

    relation = SQLiteRelation((None, 'numbers', 'table'), client)
    relation.render({})
    relation.delete()

    still_there = relation.exists()
    assert not still_there
Ejemplo n.º 18
0
def test_sql_script_shows_executed_code_if_fails(tmp_directory, sample_data):
    """A failed build reports both the executed SQL and the database error."""
    dag = DAG()

    client = SQLAlchemyClient('sqlite:///database.db')
    for class_ in (SQLScript, SQLiteRelation):
        dag.clients[class_] = client

    # deliberately invalid SQL
    product = SQLiteRelation((None, 'another', 'table'))
    SQLScript('SOME INVALID SQL {{product}}',
              product,
              dag=dag,
              name='task')

    with pytest.raises(DAGBuildError) as excinfo:
        dag.build()

    assert 'SOME INVALID SQL' in str(excinfo.value)
    assert 'near "SOME": syntax error' in str(excinfo.value)
Ejemplo n.º 19
0
def test_warns_if_inferred_relations_do_not_match_product():
    """Creating a relation other than the product emits one warning."""
    product = SQLiteRelation((None, 'my_table', 'table'))

    # the CREATE statement targets some_table instead of the product
    source = SQLScriptSource("""
    -- using {{product}} in the wrong place
    CREATE TABLE some_table AS SELECT * FROM another_table
    """)

    with pytest.warns(UserWarning) as record:
        source.render({'product': product})

    assert len(record) == 1

    expected = ("It appears that your script will create relations "
                "{ParsedSQLRelation(('some_table', 'table'))}, "
                "which doesn't match "
                "products: {SQLiteRelation(('my_table', 'table'))}. "
                "Make sure schema, "
                "name and kind (table or view) match")
    actual = record[0].message.args[0]
    assert actual == expected
Ejemplo n.º 20
0
def test_hot_reload_sql_sources(class_, tmp_directory):
    """With hot_reload=True, edits to the file show up on the next render."""
    script = Path(tmp_directory, 'script.sql')
    script.write_text('/*doc*/\n{{product}}')

    product = SQLiteRelation(('some_table', 'table'))
    source = class_(script, hot_reload=True)

    # first render: original file content
    first_params = {'product': product}
    source.render(first_params)

    assert str(source) == '/*doc*/\nsome_table'
    assert source.variables == {'product'}
    assert source.doc == 'doc'

    # rewrite the file on disk, then render again with the extra tag
    script.write_text('/*new doc*/\n{{product}} {{new_tag}}')
    second_params = {'product': product, 'new_tag': 'modified'}
    source.render(second_params)

    assert str(source) == '/*new doc*/\nsome_table modified'
    assert source.variables == {'product', 'new_tag'}
    assert source.doc == 'new doc'
Ejemplo n.º 21
0
def make(env, artist_name, language):
    """Assemble the lyrics DAG for a single artist.

    Three download tasks feed a SQL task that writes a
    ``top_words_<artist>`` table, which in turn feeds the word cloud task.

    Parameters
    ----------
    env
        Object exposing ``path.data`` and ``source.lyrics`` /
        ``source.metadata`` / ``source.artist_tags``.
    artist_name
        Artist name; lower-cased with spaces replaced by underscores to
        name the table and the output image.
    language
        Passed to the word cloud task as the ``language`` parameter.

    Returns
    -------
    The assembled DAG.
    """
    artist_slug = artist_name.lower().replace(' ', '_')

    dag = DAG(executor=Serial(False))

    loader = SourceLoader(path='sql', module='ploomber_lyrics')

    data_dir = env.path.data
    db_location = data_dir / 'clean.db'

    client = DBAPIClient(sqlite3.connect,
                         {'database': db_location},
                         split_source=True)
    for class_ in (SQLScript, SQLiteRelation):
        dag.clients[class_] = client

    # raw inputs, each downloaded to its own file
    raw_db = DownloadFromURL(env.source.lyrics,
                             File(data_dir / 'lyrics.db'),
                             dag, name='raw_db')

    raw_metadata_db = DownloadFromURL(env.source.metadata,
                                      File(data_dir / 'metadata.db'),
                                      dag, name='raw_metadata_db')

    raw_artist_tags_db = DownloadFromURL(env.source.artist_tags,
                                         File(data_dir / 'artist_tags.db'),
                                         dag, name='raw_artist_tags_db')

    top_words_params = {'artist_name': artist_name,
                        'artist_name_': artist_slug}
    top_words = SQLScript(
        loader['get_top_words.sql'],
        SQLiteRelation(('top_words_{{artist_name_}}', 'table')),
        dag,
        params=top_words_params)

    wordcloud_params = {'db_location': db_location,
                        'language': language}
    wordcloud = PythonCallable(tasks.wordcloud,
                               File(data_dir / (artist_slug + '.png')),
                               dag,
                               params=wordcloud_params)

    # all three downloads must finish before computing the top words
    (raw_db + raw_metadata_db + raw_artist_tags_db) >> top_words >> wordcloud

    return dag
def test_ignores_non_file_products(tmp_directory, monkeypatch):
    """Rendering a DAG with only a SQL product never fetches remote metadata.
    """
    # replace the remote fetch with a mock so calls can be detected
    mock_remote = Mock()
    monkeypatch.setattr(file._RemoteFile, '_fetch_remote_metadata',
                        mock_remote)

    client = SQLAlchemyClient('sqlite:///my.db')

    dag = DAG()
    for class_ in (SQLScript, SQLiteRelation):
        dag.clients[class_] = client
    dag.clients[File] = LocalStorageClient('remote', path_to_project_root='.')

    product = SQLiteRelation(['data2', 'table'])
    SQLScript('CREATE TABLE {{product}} AS SELECT * FROM data',
              product,
              dag=dag,
              name='task')

    dag.render()
    client.close()

    mock_remote.assert_not_called()
Ejemplo n.º 23
0
def test_sqlscript_load(tmp_directory, sample_data, client):
    """load() on a built SQLScript task returns the created table's rows."""
    dag = DAG()

    for class_ in (SQLScript, SQLiteRelation):
        dag.clients[class_] = client

    product = SQLiteRelation((None, 'another', 'table'))
    SQLScript('CREATE TABLE {{product}} AS SELECT * FROM numbers',
              product,
              dag=dag,
              name='task')

    # keep clients open: load() runs after the build
    dag.build(close_clients=False)

    loaded = dag['task'].load()

    dag.close_clients()

    expected = {
        'a': list(range(10)),
        'b': list(range(100, 110)),
    }
    assert loaded.to_dict(orient='list') == expected
Ejemplo n.º 24
0
def test_can_request_params(sqlite_client_and_tmp_dir):
    """An on_finish hook ends up holding the task, its product and client."""
    # reset the attributes that the assertions below inspect
    for attribute in ('task', 'product', 'client'):
        setattr(on_finish_sql, attribute, None)

    client, _ = sqlite_client_and_tmp_dir
    dag = DAG()

    for class_ in (SQLScript, SQLiteRelation):
        dag.clients[class_] = client

    product = SQLiteRelation(('new_table', 'table'))
    task = SQLScript('CREATE TABLE {{product}} AS SELECT * FROM data',
                     product,
                     dag,
                     name='t')

    task.on_finish = on_finish_sql

    dag.build()

    assert on_finish_sql.task is task
    assert on_finish_sql.product is task.product
    assert on_finish_sql.client is client
Ejemplo n.º 25
0
def test_warns_if_sql_script_does_not_create_relation(
        sqlite_client_and_tmp_dir):
    """Rendering a SELECT-only script emits exactly one UserWarning."""
    client, _ = sqlite_client_and_tmp_dir
    dag = DAG()
    dag.clients[SQLiteRelation] = client

    # the task itself gets a mock client with an explicit split_source
    task_client = Mock()
    task_client.split_source = ';'

    product = SQLiteRelation((None, 'my_table', 'table'))
    task = SQLScript('SELECT * FROM {{product}}',
                     product,
                     dag=dag,
                     client=task_client,
                     name='sql')

    with pytest.warns(UserWarning) as record:
        task.render()

    assert len(record) == 1

    expected = (
        'It appears that your script will not create any tables/views but '
        "the product parameter is "
        "SQLiteRelation(('my_table', 'table'))")
    actual = record[0].message.args[0]
    assert actual == expected
Ejemplo n.º 26
0
                          name='red',
                          params={'filename': 'winequality-red.csv'})

# Task 'white': calls get_data with filename='winequality-white.csv' and
# declares tmp_dir / 'white.parquet' as its product.
white_task = PythonCallable(get_data,
                            product=File(tmp_dir / 'white.parquet'),
                            dag=dag,
                            name='white',
                            params={'filename': 'winequality-white.csv'})

# Task 'all': calls concat_data and declares tmp_dir / 'all.parquet' as its
# product. No params are passed here.
concat_task = PythonCallable(concat_data,
                             product=File(tmp_dir / 'all.parquet'),
                             dag=dag, name='all')


# Task 'upload': takes the all.parquet file as source and declares the
# SQLite table 'data' (no schema) as its product.
upload_task = SQLUpload(tmp_dir / 'all.parquet',
                        product=SQLiteRelation((None, 'data', 'table')),
                        dag=dag,
                        name='upload')


###############################################################################
# You can use jinja2 to parametrize SQL: {{upstream}} and {{product}}
# are available in your script. This way you can switch products without
# changing your source code (e.g. each Data Scientist in your team writes
# to their own db schema to have isolated runs).
sql = """
CREATE TABLE {{product}} AS
SELECT *,
       pH > AVG(pH) AS high_pH
FROM {{upstream['upload']}}
"""
Ejemplo n.º 27
0
    with pytest.raises(MissingClientError):
        prod.client


@pytest.mark.parametrize(
    'task_class, task_arg, product',
    [
        (SQLDump, 'SELECT * FROM my_table', File('data.csv')),
        (
            SQLScript,
            'CREATE TABLE {{product}} AS SELECT * FROM my_table',
            SQLRelation(['schema', 'name', 'table']),
        ),
        (
            SQLTransfer,
            'SELECT * FROM my_table',
            SQLiteRelation(['schema', 'name', 'table']),
        ),
        (
            SQLUpload,
            'SELECT * FROM my_table',
            SQLiteRelation(['schema', 'name', 'table']),
        ),
        (
            PostgresCopyFrom,
            'SELECT * FROM my_table',
            PostgresRelation(['schema', 'name', 'table']),
        ),
    ])
def test_exception_if_missing_task_client(task_class, task_arg, product):
    """Accessing ``.client`` on a task with no client raises an error.

    No client is passed to the task and none is registered in the DAG.
    """
    dag = DAG()
    task = task_class(task_arg, product, dag=dag, name='task')

    with pytest.raises(MissingClientError):
        task.client
Ejemplo n.º 28
0
def make_task_upload(env, dag):
    """Return the 'upload' task: upstream 'raw' into the SQLite table 'raw'.

    ``to_sql_kwargs`` sets ``if_exists`` to ``'replace'``. ``env`` is
    accepted but not used in this function.
    """
    product = SQLiteRelation((None, 'raw', 'table'))
    to_sql_kwargs = {'if_exists': 'replace'}

    upload = SQLUpload('{{upstream["raw"]}}',
                       product=product,
                       dag=dag,
                       name='upload',
                       to_sql_kwargs=to_sql_kwargs)
    return upload
Ejemplo n.º 29
0
def make(tmp):
    """Build the incremental dump >> plus_one >> upload DAG.

    ``tmp`` is the directory holding ``source.db`` and ``target.db`` and
    where the intermediate CSV files are written. If the target database
    already has a ``plus_one`` table, only source rows with an id greater
    than its current maximum are dumped.
    """
    root = Path(tmp)
    dag = DAG()

    # db with the source data
    client_source = SQLAlchemyClient('sqlite:///' + str(root / 'source.db'))
    # db where we'll insert processed data (can be the same as the
    # source db)
    client_target = SQLAlchemyClient('sqlite:///' + str(root / 'target.db'))

    dag.clients[SQLDump] = client_source
    for class_ in (SQLUpload, SQLiteRelation):
        dag.clients[class_] = client_target

    # find the highest id uploaded so far, if the target table exists
    table_lookup = client_target.connection.execute("""
    SELECT name FROM sqlite_master WHERE type='table' AND name='plus_one'""")

    last_id = None
    if table_lookup.fetchone():
        max_lookup = client_target.connection.execute(
            'SELECT MAX(id) FROM plus_one')
        last_id = max_lookup.fetchone()[0]

    # we dump new observations to this file
    dumped_data = File(root / 'x.csv')
    # we add a hook that allows us to save info on the latest seen value
    dumped_data.prepare_metadata = add_last_value

    # the WHERE clause is only rendered when last_id is truthy
    dump_query = """
        SELECT * FROM data
        {% if last_id %}
        WHERE id > {{last_id}}
        {% endif %}
    """

    # the actual task that dumps data
    dump = SQLDump(dump_query,
                   dumped_data,
                   dag=dag,
                   name='dump',
                   chunksize=None,
                   params={'last_id': last_id})

    # on finish hook, will stop DAG execution if there aren't new observations
    dump.on_finish = dump_on_finish

    # a dummy task to modify the data
    plus_one = PythonCallable(_plus_one,
                              File(root / 'plus_one.csv'),
                              dag=dag,
                              name='plus_one')

    # append observations if the table already exists
    upload_kwargs = {
        'if_exists': 'append',
        'index': False
    }

    # upload the data to the target database
    upload = SQLUpload('{{upstream["plus_one"]}}',
                       product=SQLiteRelation((None, 'plus_one', 'table')),
                       dag=dag,
                       name='upload',
                       to_sql_kwargs=upload_kwargs)

    dump >> plus_one >> upload

    return dag
Ejemplo n.º 30
0
SELECT * FROM
{{upstream["transfer"]}} WHERE x > 1
"""))

###############################################################################
# DAG declaration

# Serial executor configured with build_in_subprocess=False
dag = DAG(executor=Serial(build_in_subprocess=False))
# the same client serves both task classes and the SQLite products
dag.clients[SQLTransfer] = client
dag.clients[SQLiteRelation] = client
dag.clients[SQLScript] = client

# SQL sources are looked up by file name in tmp_dir
source_loader = SourceLoader(tmp_dir)

# 'transfer': uses data_select.sql as source, product is the table 'data2'
transfer = SQLTransfer(source_loader['data_select.sql'],
                       product=SQLiteRelation((None, 'data2', 'table')),
                       dag=dag,
                       name='transfer')

# 'subset': uses subset_create.sql as source, product is the table 'subset'
subset = SQLScript(source_loader['subset_create.sql'],
                   product=SQLiteRelation((None, 'subset', 'table')),
                   dag=dag,
                   name='subset')

# 'subset' is downstream of 'transfer'
transfer >> subset

# render only; nothing is built here
dag.render()

###############################################################################
# Our macro is correctly rendered: