Example #1
import pandas as pd

from ploomber import DAG
from ploomber.clients import SQLAlchemyClient
from ploomber.tasks import SQLUpload
from ploomber.products import SQLiteRelation


def test_custom_io_handler(tmp_directory):
    dag = DAG()
    client = SQLAlchemyClient('sqlite:///database.db')
    dag.clients[SQLUpload] = client
    dag.clients[SQLiteRelation] = client

    df = pd.DataFrame({'a': [1, 2, 3], 'b': [1, 2, 3]})
    df.to_csv('some-file.tsv', sep='\t', index=False)

    def my_reading_fn(path):
        return pd.read_csv(path, sep='\t')

    SQLUpload('some-file.tsv',
              SQLiteRelation(('my-table', 'table')),
              dag=dag,
              name='task',
              io_handler=my_reading_fn,
              to_sql_kwargs=dict(index=False))

    dag.build()

    other = pd.read_sql('SELECT * FROM "my-table"', con=client.engine)

    client.close()

    assert other.equals(df)
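
The io_handler argument accepts any callable that takes a path and returns a DataFrame, so the same pattern covers formats pandas cannot infer from the file extension. A minimal sketch under that assumption (the JSON-lines handler and table name below are hypothetical, not part of the test above):

def read_jsonlines(path):
    # hypothetical handler: same contract as my_reading_fn, different format
    return pd.read_json(path, lines=True)

SQLUpload('some-file.jsonl',
          SQLiteRelation(('my-other-table', 'table')),
          dag=dag,
          name='upload-jsonl',
          io_handler=read_jsonlines,
          to_sql_kwargs=dict(index=False))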
Example #2
def test_append_rows(tmp_directory, pg_client_and_schema):
    pg_client, schema = pg_client_and_schema

    df = pd.DataFrame({'a': [1, 2, 3]})
    df.to_csv('data.csv', index=False)

    dag = DAG()

    dag.clients[SQLUpload] = pg_client
    dag.clients[PostgresRelation] = pg_client

    # create the table beforehand so the upload below appends to existing rows
    df.to_sql('test_append',
              pg_client.engine,
              schema=schema,
              if_exists='replace',
              index=False)

    SQLUpload('data.csv',
              product=PostgresRelation((schema, 'test_append', 'table')),
              dag=dag,
              name='upload',
              to_sql_kwargs={
                  'if_exists': 'append',
                  'index': False
              })

    dag.build()

    df_all = pd.read_sql('SELECT * FROM {}.test_append'.format(schema),
                         pg_client.engine)

    # 3 pre-existing rows plus 3 appended rows
    assert df_all.shape[0] == 6
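
A note on re-running: assuming ploomber's default incremental behavior (a task is skipped when its source code and product metadata are unchanged), building again would not append the rows twice:

dag.build()  # 'upload' is up to date, so it is skipped; the table keeps 6 rows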
Example #3
def test_can_upload_file_from_upstream_dependency(tmp_directory,
                                                  pg_client_and_schema):

    pg_client, schema = pg_client_and_schema

    dag = DAG()

    dag.clients[SQLUpload] = pg_client
    dag.clients[PostgresRelation] = pg_client

    make = PythonCallable(make_data,
                          product=File('data.parquet'),
                          dag=dag,
                          name='make')

    name = 'test_can_upload_file_from_upstream_dependency'
    pg = SQLUpload('{{upstream["make"]}}',
                   product=PostgresRelation((schema, name, 'table')),
                   dag=dag,
                   name='upload',
                   to_sql_kwargs={'if_exists': 'replace'})

    make >> pg

    dag.build()
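
make_data is not defined in the snippet; the '{{upstream["make"]}}' placeholder resolves to its File product, so any callable that writes the declared file works. A hypothetical stand-in consistent with File('data.parquet'):

def make_data(product):
    # hypothetical helper: PythonCallable passes the declared product and
    # the function is responsible for writing it
    df = pd.DataFrame({'a': [1, 2, 3]})
    df.to_parquet(str(product))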
Example #4
def test_can_upload_a_file_using_a_path(tmp_directory, pg_client_and_schema):
    pg_client, schema = pg_client_and_schema

    df = pd.DataFrame({'a': [1, 2, 3]})
    df.to_parquet('data.parquet')

    dag = DAG()

    dag.clients[PostgresRelation] = pg_client
    dag.clients[SQLUpload] = pg_client

    SQLUpload(Path('data.parquet'),
              product=PostgresRelation(
                  (schema, 'test_can_upload_a_file', 'table')),
              dag=dag,
              name='upload')

    dag.build()
Example #5
def test_can_upload_a_file(serializer, task_arg, tmp_directory,
                           pg_client_and_schema):
    pg_client, schema = pg_client_and_schema

    df = pd.DataFrame({'a': [1, 2, 3]})
    getattr(df, serializer)(task_arg)

    dag = DAG()

    dag.clients[SQLUpload] = pg_client
    dag.clients[PostgresRelation] = pg_client

    SQLUpload(task_arg,
              product=PostgresRelation(
                  (schema, 'test_can_upload_a_file', 'table')),
              dag=dag,
              name='upload',
              to_sql_kwargs={'if_exists': 'replace'})

    dag.build()
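
serializer and task_arg come from pytest parametrization that the snippet omits; a plausible reconstruction, assuming method-name/path pairs that match how they are used above:

import pytest

@pytest.mark.parametrize('serializer, task_arg', [
    ('to_csv', 'data.csv'),
    ('to_parquet', 'data.parquet'),
])
def test_can_upload_a_file(serializer, task_arg, tmp_directory,
                           pg_client_and_schema):
    ...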
Example #6
def test_upload_a_file_with_generic_relation(serializer, task_arg,
                                             sqlite_client_and_tmp_dir,
                                             pg_client_and_schema):
    client, _ = sqlite_client_and_tmp_dir
    pg_client, schema = pg_client_and_schema

    df = pd.DataFrame({'a': [1, 2, 3]})
    getattr(df, serializer)(task_arg)

    dag = DAG()

    dag.clients[SQLUpload] = pg_client
    dag.clients[GenericSQLRelation] = client

    SQLUpload(task_arg,
              product=GenericSQLRelation(
                  (schema, 'test_can_upload_a_file', 'table')),
              dag=dag,
              name='upload',
              to_sql_kwargs={'if_exists': 'replace'})

    dag.build()
Example #7
red_task = PythonCallable(get_data,
                          product=File(tmp_dir / 'red.parquet'),
                          dag=dag,
                          name='red',
                          params={'filename': 'winequality-red.csv'})

white_task = PythonCallable(get_data,
                            product=File(tmp_dir / 'white.parquet'),
                            dag=dag,
                            name='white',
                            params={'filename': 'winequality-white.csv'})

concat_task = PythonCallable(concat_data,
                             product=File(tmp_dir / 'all.parquet'),
                             dag=dag, name='all')


upload_task = SQLUpload(tmp_dir / 'all.parquet',
                        product=SQLiteRelation((None, 'data', 'table')),
                        dag=dag,
                        name='upload')


###############################################################################
# You can use jinja2 to parametrize SQL: {{upstream}} and {{product}} are
# available to your script. This way you can switch products without changing
# your source code (e.g., each data scientist on your team writes to their
# own db schema to get isolated runs)
sql = """
CREATE TABLE {{product}} AS
SELECT *,
       pH > AVG(pH) AS high_pH
FROM {{upstream['upload']}}
"""
Example #8
def make_task_upload(env, dag):
    return SQLUpload('{{upstream["raw"]}}',
                     product=SQLiteRelation((None, 'raw', 'table')),
                     dag=dag,
                     name='upload',
                     to_sql_kwargs={'if_exists': 'replace'})
Example #9
def make(tmp):
    """Make the dag
    """
    tmp = Path(tmp)
    dag = DAG()

    # db with the source data
    client_source = SQLAlchemyClient('sqlite:///' + str(tmp / 'source.db'))
    # db where we'll insert processed data (can be the same as the
    # source db)
    client_target = SQLAlchemyClient('sqlite:///' + str(tmp / 'target.db'))

    dag.clients[SQLDump] = client_source
    dag.clients[SQLUpload] = client_target
    dag.clients[SQLiteRelation] = client_target

    cur = client_target.connection.execute("""
    SELECT name FROM sqlite_master WHERE type='table' AND name='plus_one'""")

    if cur.fetchone():
        cur = client_target.connection.execute('SELECT MAX(id) FROM plus_one')
        last_id = cur.fetchone()[0]
    else:
        last_id = None

    # we dump new observations to this file
    dumped_data = File(tmp / 'x.csv')
    # we add a hook that allows us to save info on the latest seen value
    dumped_data.prepare_metadata = add_last_value

    # the actual task that dumps data
    dump = SQLDump("""
        SELECT * FROM data
        {% if last_id %}
        WHERE id > {{last_id}}
        {% endif %}
    """,
                   dumped_data,
                   dag=dag,
                   name='dump',
                   chunksize=None,
                   params=dict(last_id=last_id))

    # on_finish hook: stops DAG execution if there are no new observations
    dump.on_finish = dump_on_finish

    # a dummy task to modify the data
    plus_one = PythonCallable(_plus_one,
                              File(tmp / 'plus_one.csv'),
                              dag=dag,
                              name='plus_one')

    # upload the data to the target database
    upload = SQLUpload(
        '{{upstream["plus_one"]}}',
        product=SQLiteRelation((None, 'plus_one', 'table')),
        dag=dag,
        name='upload',
        # append observations if the table already exists
        to_sql_kwargs={
            'if_exists': 'append',
            'index': False
        })

    dump >> plus_one >> upload

    return dag
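
Neither add_last_value nor dump_on_finish is defined in the snippet. Hypothetical sketches consistent with how they are used (the names, signatures, and the id column are assumptions, not part of the source):

import pandas as pd
from ploomber.exceptions import DAGBuildEarlyStop


def add_last_value(metadata, product):
    # hypothetical: record the max id seen so far in the product's metadata
    df = pd.read_csv(str(product))
    metadata['last_value'] = None if df.empty else int(df['id'].max())
    return metadata


def dump_on_finish(product):
    # hypothetical: stop the DAG early when the dump produced no new rows
    df = pd.read_csv(str(product))
    if df.empty:
        raise DAGBuildEarlyStop('no new observations')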
Example #10
def _get_data(product):
    wine = datasets.load_wine()
    df = pd.DataFrame(wine.data, columns=wine.feature_names)
    df['label'] = wine.target
    df.to_parquet(str(product))


dag = DAG()
dag.clients[SQLUpload] = client
dag.clients[SQLiteRelation] = client
dag.clients[SQLScript] = client

get_data = PythonCallable(_get_data,
                          product=File(tmp_dir / 'wine.parquet'),
                          dag=dag,
                          name='get')

upload = SQLUpload('{{upstream["get"]}}',
                   product=SQLiteRelation((None, 'wine', 'table')),
                   dag=dag,
                   name='upload')


###############################################################################
# In a real project, your SQL scripts should be separate files; we inline this
# one to keep the example standalone. SQL is a language that people from many
# backgrounds understand, so you can easily walk business analysts through
# your analysis and make sure your data assumptions are correct
_clean = """
/*
Cleaning dataset, we decided to ignore rows where magnesium is over 100
since we believe the data is corrupted
*/
CREATE TABLE {{product}} AS
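
A sketch of the task that would run the script, mirroring the pattern already set up in this snippet (the wine_clean table name is illustrative):

clean = SQLScript(_clean,
                  product=SQLiteRelation((None, 'wine_clean', 'table')),
                  dag=dag,
                  name='clean')

upload >> clean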
Example #11
def make_task_upload(dag, env):
    return SQLUpload('{{upstream["raw"]}}',
                     product=SQLiteRelation((None, 'raw', 'table')),
                     dag=dag,
                     name='upload')
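
A factory like this is typically invoked when assembling the DAG. A hypothetical wiring (make_task_raw is assumed, not shown in the source; it would create the 'raw' task that the upstream placeholder references):

dag = DAG()
raw = make_task_raw(dag, env)  # hypothetical sibling factory
upload = make_task_upload(dag, env)
raw >> upload
dag.build()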