def test_warns_if_sql_scipt_does_not_create_relation(): dag = DAG() t = SQLScript('SELECT * FROM {{product}}', SQLiteRelation((None, 'my_table', 'table')), dag=dag, client=Mock(), name='sql') match = 'will not create any tables or views but the task has product' with pytest.warns(UserWarning, match=match): t.render()
def test_postgresscript_with_relation(): dag = DAG() t = SQLScript('CREATE TABLE {{product}} AS SELECT * FROM {{name}}', PostgresRelation(('user', 'table', 'table'), client=Dummy()), dag, name='name', params=dict(name='some_table'), client=Dummy()) t.render() assert str(t.product) == 'user.table' assert (str( t.source) == 'CREATE TABLE user.table AS SELECT * FROM some_table')
def test_passing_upstream_and_product_in_postgres(pg_client, db_credentials): dag = DAG() client = SQLAlchemyClient(db_credentials['uri']) dag.clients[SQLScript] = client dag.clients[PostgresRelation] = client conn = pg_client.connection cur = conn.cursor() cur.execute('drop table if exists series;') conn.commit() conn.close() ta_t = """begin; drop table if exists {{product}}; create table {{product}} as select * from generate_series(0, 15) as n; commit;""" ta_rel = PostgresRelation((None, 'series', 'table')) ta = SQLScript(ta_t, ta_rel, dag, 'ta') dag.build() assert ta_rel.exists()
def test_clients_are_closed_after_build(tmp_directory): # TODO: same test but when the dag breaks (make sure clients are closed # even on that case) tmp = Path(tmp_directory) # create a db conn = sqlite3.connect(str(tmp / "database.db")) uri = 'sqlite:///{}'.format(tmp / "database.db") # make some data and save it in the db df = pd.DataFrame({'a': np.arange(0, 100), 'b': np.arange(100, 200)}) df.to_sql('numbers', conn) conn.close() # create the task and run it dag = DAG() def mock_client(): m = Mock(wraps=SQLAlchemyClient(uri)) m.split_source = ';' return m clients = [mock_client() for _ in range(4)] dag.clients[SQLScript] = clients[0] dag.clients[SQLiteRelation] = clients[1] t1 = SQLScript(""" CREATE TABLE {{product}} AS SELECT * FROM numbers """, SQLiteRelation(('another', 'table')), dag=dag, name='t1') t2 = SQLScript(""" CREATE TABLE {{product}} AS SELECT * FROM {{upstream['t1']}} """, SQLiteRelation(('yet_another', 'table'), client=clients[2]), dag=dag, name='t2', client=clients[3]) t1 >> t2 dag.build() assert all(client.close.called for client in clients)
def test_placeholder_is_copied_upon_initialization(): dag = DAG() dag.clients[SQLScript] = Mock() dag.clients[PostgresRelation] = Mock() p = Placeholder('CREATE TABLE {{product}} AS SELECT * FROM TABLE') t1 = SQLScript(p, PostgresRelation(('schema', 'a_table', 'table')), dag, name='t1') t2 = SQLScript(p, PostgresRelation(('schema', 'another_table', 'table')), dag, name='t2') assert t1.source._placeholder is not t2.source._placeholder
def make(client): dag = DAG() dag.clients[SQLScript] = client dag.clients[SQLiteRelation] = client SQLScript('CREATE TABLE {{product}} AS SELECT * FROM data', SQLiteRelation(['data2', 'table']), dag=dag, name='task') return dag
def test_warns_if_number_of_relations_does_not_match_products(): dag = DAG() sql = """ -- wrong sql, products must be used in CREATE statements CREATE TABLE {{product[0]}} AS SELECT * FROM my_table """ t = SQLScript(sql, [ SQLiteRelation((None, 'my_table', 'table')), SQLiteRelation((None, 'another_table', 'table')) ], dag=dag, client=Mock(), name='sql') match = r'.*will create 1 relation\(s\) but you declared 2 product\(s\).*' with pytest.warns(UserWarning, match=match): t.render()
def test_dag_reports_sub_select_cols(sqlite_client_and_tmp_dir): client, _ = sqlite_client_and_tmp_dir dag = DAG() dag.clients[SQLScript] = client dag.clients[SQLiteRelation] = client PythonCallable(touch_root, File('some_file.txt'), dag, name='task') sql = 'CREATE TABLE {{product}} AS SELECT * FROM data' SQLScript(sql, SQLiteRelation(('data2', 'table')), dag, name='task2') assert dag.status()[['name', 'Last run']] assert dag.build()[['Ran?', 'Elapsed (s)']]
def test_drill_dag(): code = """ -- {{product.name}} CREATE TABLE dfs.tmp.`/test/` AS SELECT * FROM cp.`employee.json` LIMIT 1 """ drill = DrillClient() dag = DAG() SQLScript(code, File('/tmp/test/'), dag, client=drill) dag.build()
def test_warns_if_sql_script_does_not_create_relation( sqlite_client_and_tmp_dir): client, _ = sqlite_client_and_tmp_dir dag = DAG() dag.clients[SQLiteRelation] = client mock_client = Mock() mock_client.split_source = ';' t = SQLScript('SELECT * FROM {{product}}', SQLiteRelation((None, 'my_table', 'table')), dag=dag, client=mock_client, name='sql') with pytest.warns(UserWarning) as record: t.render() assert len(record) == 1 msg = ('It appears that your script will not create any tables/views but ' "the product parameter is " "SQLiteRelation(('my_table', 'table'))") assert record[0].message.args[0] == msg
def test_can_request_params(sqlite_client_and_tmp_dir): on_finish_sql.task = None on_finish_sql.product = None on_finish_sql.client = None client, _ = sqlite_client_and_tmp_dir dag = DAG() dag.clients[SQLScript] = client dag.clients[SQLiteRelation] = client t = SQLScript('CREATE TABLE {{product}} AS SELECT * FROM data', SQLiteRelation(('new_table', 'table')), dag, name='t') t.on_finish = on_finish_sql dag.build() assert on_finish_sql.task is t assert on_finish_sql.product is t.product assert on_finish_sql.client is client
def test_duplicated_sql_product(class1, class2, return_value): dag = DAG() client = Mock() client.split_source = ';' client.connection.cursor().fetchone.return_value = return_value dag.clients[SQLScript] = client dag.clients[class1] = client dag.clients[class2] = client SQLScript('CREATE TABLE {{product}} AS SELECT * FROM table', class1(('schema', 'name', 'table')), dag=dag, name='task') SQLScript('CREATE TABLE {{product}} AS SELECT * FROM table', class2(('schema', 'name', 'table')), dag=dag, name='another') with pytest.raises(DAGRenderError) as excinfo: dag.render() assert 'Tasks must generate unique Products.' in str(excinfo.value)
def test_add_metadata_fields(client_and_prod): client, product, schema = client_and_prod dag = DAG() dag.clients[SQLScript] = client dag.clients[type(product)] = client query = 'CREATE TABLE {{product}} AS SELECT * FROM data' product.prepare_metadata = add_number_one SQLScript(query, product, dag, name='t1') dag.build() metadata = product.fetch_metadata() assert metadata['number'] == 1
def test_sql_script_shows_executed_code_if_fails(tmp_directory, sample_data): dag = DAG() client = SQLAlchemyClient('sqlite:///database.db') dag.clients[SQLScript] = client dag.clients[SQLiteRelation] = client SQLScript('SOME INVALID SQL {{product}}', SQLiteRelation((None, 'another', 'table')), dag=dag, name='task') with pytest.raises(DAGBuildError) as excinfo: dag.build() assert 'SOME INVALID SQL' in str(excinfo.value) assert 'near "SOME": syntax error' in str(excinfo.value)
def test_passing_upstream_and_product_in_postgres(pg_client_and_schema): client, _ = pg_client_and_schema dag = DAG() dag.clients[SQLScript] = client dag.clients[PostgresRelation] = client client.execute('drop table if exists series;') ta_t = """begin; drop table if exists {{product}}; create table {{product}} as select * from generate_series(0, 15) as n; commit;""" ta_rel = PostgresRelation((None, 'series', 'table')) SQLScript(ta_t, ta_rel, dag, 'ta') dag.build() assert ta_rel.exists()
def make(env, artist_name, language): artist_name_ = artist_name.lower().replace(' ', '_') dag = DAG(executor=Serial(False)) loader = SourceLoader(path='sql', module='ploomber_lyrics') db_location = env.path.data / 'clean.db' client = DBAPIClient(sqlite3.connect, {'database': db_location}, split_source=True) dag.clients[SQLScript] = client dag.clients[SQLiteRelation] = client t = DownloadFromURL(env.source.lyrics, File(env.path.data / 'lyrics.db'), dag, name='raw_db') t2 = DownloadFromURL(env.source.metadata, File(env.path.data / 'metadata.db'), dag, name='raw_metadata_db') t3 = DownloadFromURL(env.source.artist_tags, File(env.path.data / 'artist_tags.db'), dag, name='raw_artist_tags_db') top = SQLScript(loader['get_top_words.sql'], SQLiteRelation(('top_words_{{artist_name_}}', 'table')), dag, params={'artist_name': artist_name, 'artist_name_': artist_name_}) wc = PythonCallable(tasks.wordcloud, File(env.path.data / (artist_name_ + '.png')), dag, params={'db_location': db_location, 'language': language}) (t + t2 + t3) >> top >> wc return dag
def test_ignores_non_file_products(tmp_directory, monkeypatch): mock_remote = Mock() monkeypatch.setattr(file._RemoteFile, '_fetch_remote_metadata', mock_remote) client = SQLAlchemyClient('sqlite:///my.db') dag = DAG() dag.clients[SQLScript] = client dag.clients[SQLiteRelation] = client dag.clients[File] = LocalStorageClient('remote', path_to_project_root='.') SQLScript('CREATE TABLE {{product}} AS SELECT * FROM data', SQLiteRelation(['data2', 'table']), dag=dag, name='task') dag.render() client.close() mock_remote.assert_not_called()
def test_sqlscript_load(tmp_directory, sample_data, client): dag = DAG() dag.clients[SQLScript] = client dag.clients[SQLiteRelation] = client SQLScript('CREATE TABLE {{product}} AS SELECT * FROM numbers', SQLiteRelation((None, 'another', 'table')), dag=dag, name='task') dag.build(close_clients=False) df = dag['task'].load() dag.close_clients() assert df.to_dict(orient='list') == { 'a': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], 'b': [100, 101, 102, 103, 104, 105, 106, 107, 108, 109] }
############################################################################### # you can use jinja2 to parametrize SQL, {{upstream}} and {{product}} # are available for your script. this way you could switch products without # changing your source code (e.g. each Data Scientist in your team writes # to his/her own db schema to have isolated runs) sql = """ CREATE TABLE {{product}} AS SELECT *, pH > AVG(pH) AS high_pH FROM {{upstream['upload']}} """ features = SQLScript(sql, product=SQLiteRelation((None, 'features', 'table')), dag=dag, name='features') red_task >> concat_task white_task >> concat_task concat_task >> upload_task >> features ############################################################################### # render will pass all parameters so you can see exactly which SQL code # will be executed dag.render() ############################################################################### # print source code for task "features"
name='red') sample >> red_task white_task = ShellScript(('csvsql --db {{uri}} --tables {{product.name}} --insert {{upstream["sample"][1]}} ' '--overwrite'), PostgresRelation(('public', 'white', 'table')), dag, params=dict(uri=uri), name='white') sample >> white_task # let's introduce a new type of task, a SQLScript, this task will execute # a script in a SQL database (any database supported by sql alchemy) wine_task = SQLScript(Path('sql', 'create_wine.sql'), PostgresRelation(('public', 'wine', 'table')), dag, name='wine') (red_task + white_task) >> wine_task dataset_task = SQLScript(Path('sql', 'create_dataset.sql'), PostgresRelation(('public', 'dataset', 'table')), dag, name='dataset') wine_task >> dataset_task training_task = SQLScript(Path('sql', 'create_training.sql'), PostgresRelation(('public', 'training', 'table')), dag, name='training') dataset_task >> training_task
# DAG declaration dag = DAG(executor=Serial(build_in_subprocess=False)) dag.clients[SQLTransfer] = client dag.clients[SQLiteRelation] = client dag.clients[SQLScript] = client source_loader = SourceLoader(tmp_dir) transfer = SQLTransfer(source_loader['data_select.sql'], product=SQLiteRelation((None, 'data2', 'table')), dag=dag, name='transfer') subset = SQLScript(source_loader['subset_create.sql'], product=SQLiteRelation((None, 'subset', 'table')), dag=dag, name='subset') transfer >> subset dag.render() ############################################################################### # Our macro is correctly rendered: print(dag['subset'].source) ############################################################################### # Plot and execute pipeline: dag.plot()
# this here to make this example standalone. SQL is a language that people # from a lot of backgrounds understand, you could easily communicate your # analysis with business analysts to make your your data assumptions are # correct _clean = """ /* Cleaning dataset, we decided to ignore rows where magnesium is over 100 since we believe the data is corrupted */ CREATE TABLE {{product}} AS SELECT * FROM {{upstream['upload']}} WHERE magnesium < 100.0 """ clean = SQLScript(_clean, product=SQLiteRelation((None, 'wine_clean', 'table')), dag=dag, name='clean') get_data >> upload >> clean ############################################################################### # Pipeline build # -------------- dag.build() ############################################################################### # Pipeline plot # -------------