Example #1
def test_can_transfer_sqlite(tmp_directory):
    """
    >>> import tempfile
    >>> tmp_directory = tempfile.mkdtemp()
    """
    tmp = Path(tmp_directory)

    # create clients for the two databases
    client_in = SQLAlchemyClient('sqlite:///{}'.format(tmp / "database_in.db"))
    client_out = SQLAlchemyClient('sqlite:///{}'.format(tmp /
                                                        "database_out.db"))

    # make some data and save it in the db
    df = pd.DataFrame({'a': np.arange(0, 100), 'b': np.arange(100, 200)})
    df.to_sql('numbers', client_in.engine, index=False)

    # create the task and run it
    dag = DAG()
    SQLTransfer('SELECT * FROM numbers',
                SQLiteRelation((None, 'numbers2', 'table'), client=client_out),
                dag,
                name='transfer',
                client=client_in,
                chunksize=10)
    dag.build()

    # load dumped data and data from the db
    original = pd.read_sql_query('SELECT * FROM numbers', client_in.engine)
    transfer = pd.read_sql_query('SELECT * FROM numbers2', client_out.engine)

    client_in.close()
    client_out.close()

    # make sure they are the same
    assert original.equals(transfer)
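These snippets come from a test suite and omit their imports. The header below is a best-guess sketch (module paths assumed from ploomber's public API, not copied from the original files) that should cover most of the examples:

from pathlib import Path
from datetime import datetime
from sqlite3 import connect

import numpy as np
import pandas as pd
import pytest

from ploomber import DAG
from ploomber import io, testing  # io.CSVIO / io.ParquetIO, testing.sql helpers
from ploomber.clients import SQLAlchemyClient
from ploomber.tasks import (SQLDump, SQLScript, SQLTransfer, SQLUpload,
                            UploadToS3)
from ploomber.products import (File, GenericProduct, PostgresRelation,
                               SQLiteRelation)
from ploomber.executors import Serial
from ploomber.exceptions import DAGBuildError
from ploomber.constants import TaskStatus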
Example #2
def make_dag(params, env):
    dag = DAG()
    dag.clients[SQLUpload] = SQLAlchemyClient(env.db_uri)
    dag.clients[SQLiteRelation] = SQLAlchemyClient(env.db_uri)
    dump = make_task_dump(dag)
    upload = make_task_upload(dag)
    dump >> upload
    return dag
Example #3
def make_dag(env, params):
    dag = DAG(executor=Serial(build_in_subprocess=False))
    dag.clients[SQLUpload] = SQLAlchemyClient(env.db_uri)
    dag.clients[SQLiteRelation] = SQLAlchemyClient(env.db_uri)
    dump = make_task_dump(dag)
    upload = make_task_upload(dag)
    dump >> upload
    return dag
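Examples #2 and #3 call make_task_dump and make_task_upload, which are not shown here. A hypothetical pair, modeled on how SQLDump and SQLUpload appear in the other examples, might look like this:

def make_task_dump(dag):
    # hypothetical: dump a query into a local file
    # (assumes dag.clients[SQLDump] is registered as well)
    return SQLDump('SELECT * FROM data',
                   File('data.csv'),
                   dag,
                   name='dump')


def make_task_upload(dag):
    # hypothetical: upload the dumped file into a SQLite table;
    # SQLUpload and SQLiteRelation pick up their clients from dag.clients
    return SQLUpload('data.csv',
                     SQLiteRelation((None, 'data', 'table')),
                     dag=dag,
                     name='upload')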
Example #4
def test_custom_io_handler(tmp_directory):
    dag = DAG()
    client = SQLAlchemyClient('sqlite:///database.db')
    dag.clients[SQLUpload] = client
    dag.clients[SQLiteRelation] = client

    df = pd.DataFrame({'a': [1, 2, 3], 'b': [1, 2, 3]})
    df.to_csv('some-file.tsv', sep='\t', index=False)

    def my_reading_fn(path):
        return pd.read_csv(path, sep='\t')

    SQLUpload('some-file.tsv',
              SQLiteRelation(('my-table', 'table')),
              dag=dag,
              name='task',
              io_handler=my_reading_fn,
              to_sql_kwargs=dict(index=False))

    dag.build()

    other = pd.read_sql('SELECT * FROM "my-table"', con=client.engine)

    client.close()

    assert other.equals(df)
Example #5
def test_can_check_range(tmp_directory):
    client = SQLAlchemyClient('sqlite:///' + str(Path(tmp_directory, 'db.db')))

    df = pd.DataFrame({'x': [1, 2, 3, 4, 5, 1000]})
    df.to_sql('my_table', client.engine)

    assert testing.sql.range_in_column(client, 'x', 'my_table') == (1, 1000)
Example #6
def test_sqldump_does_not_required_product_tag(tmp_directory):
    tmp = Path(tmp_directory)

    # create a db
    conn = connect(str(tmp / "database.db"))
    client = SQLAlchemyClient('sqlite:///{}'.format(tmp / "database.db"))
    # dump output path
    out = tmp / 'dump'

    # make some data and save it in the db
    df = pd.DataFrame({'a': np.arange(0, 100), 'b': np.arange(100, 200)})
    df.to_sql('numbers', conn)

    # create the task and run it
    dag = DAG()

    # pass templated SQL so it's treated as a placeholder; this forces the
    # render step
    SQLDump('SELECT * FROM numbers LIMIT {{limit}}',
            File(out),
            dag,
            name='dump.csv',
            client=client,
            chunksize=None,
            io_handler=io.CSVIO,
            params={'limit': 10})

    dag.render()
Example #7
def test_render_remote_with_non_file_products(tmp_directory):
    client = SQLAlchemyClient('sqlite:///my.db')

    pd.DataFrame({'x': range(3)}).to_sql('data', client.engine)

    def make(client):
        dag = DAG()
        dag.clients[SQLScript] = client
        dag.clients[SQLiteRelation] = client

        SQLScript('CREATE TABLE {{product}} AS SELECT * FROM data',
                  SQLiteRelation(['data2', 'table']),
                  dag=dag,
                  name='task')

        return dag

    make(client).build()

    dag = make(client)
    dag.render(remote=True)
    client.close()

    # should match the local status
    assert {t.exec_status for t in dag.values()} == {TaskStatus.Skipped}
Example #8
def test_can_dump_sqlite_to_parquet(tmp_directory):
    tmp = Path(tmp_directory)

    # create a db
    conn = connect(str(tmp / "database.db"))
    client = SQLAlchemyClient('sqlite:///{}'.format(tmp / "database.db"))
    # dump output path
    out = tmp / 'dump'

    # make some data and save it in the db
    df = pd.DataFrame({'a': np.arange(0, 100), 'b': np.arange(100, 200)})
    df.to_sql('numbers', conn)

    cur = conn.cursor()
    cur.execute('select * from numbers')

    # create the task and run it
    dag = DAG()
    SQLDump('SELECT * FROM numbers',
            File(out),
            dag,
            name='dump',
            client=client,
            chunksize=10,
            io_handler=io.ParquetIO)
    dag.build()

    # load dumped data and data from the db
    dump = pd.read_parquet(out)
    db = pd.read_sql_query('SELECT * FROM numbers', conn)

    conn.close()

    # make sure they are the same
    assert dump.equals(db)
Example #9
def pg_client_and_schema():
    """
    Creates a temporary schema for the testing session and drops everything
    at the end
    """
    db = _load_db_credentials()

    # set a new schema for this session, otherwise if two test sessions
    # are run at the same time, tests might conflict with each other
    # NOTE: avoid upper case characters, pandas.DataFrame.to_sql does not like
    # them
    schema = (''.join(random.choice(string.ascii_letters)
                      for i in range(12))).lower()

    # initialize client, set default schema
    # info: https://www.postgresonline.com/article_pfriendly/279.html
    client = SQLAlchemyClient(db['uri'],
                              create_engine_kwargs=dict(connect_args=dict(
                                  options=f'-c search_path={schema}')))

    # create schema
    client.execute('CREATE SCHEMA {};'.format(schema))

    df = pd.DataFrame({'x': range(10)})
    df.to_sql('data', client.engine)

    yield client, schema

    # clean up schema
    client.execute('DROP SCHEMA {} CASCADE;'.format(schema))
    client.close()
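The _load_db_credentials helper used here (and in Example #30) is also not shown. A hypothetical version, modeled on the lookup order in Example #29, could be:

import base64
import json
import os
from pathlib import Path


def _load_db_credentials():
    # hypothetical helper, following the same lookup as Example #29:
    # read a local credentials file, fall back to the POSTGRES env variable
    path = Path('~', '.auth', 'postgres-ploomber.json').expanduser()

    if path.exists():
        return json.loads(path.read_text())

    json_str = base64.b64decode(os.environ['POSTGRES']).decode()
    return json.loads(json_str)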
Example #10
def test_passing_upstream_and_product_in_postgres(pg_client, db_credentials):
    dag = DAG()

    client = SQLAlchemyClient(db_credentials['uri'])

    dag.clients[SQLScript] = client
    dag.clients[PostgresRelation] = client

    conn = pg_client.connection
    cur = conn.cursor()
    cur.execute('drop table if exists series;')
    conn.commit()
    conn.close()

    ta_t = """begin;
              drop table if exists {{product}};
              create table {{product}} as
              select * from generate_series(0, 15) as n;
              commit;"""
    ta_rel = PostgresRelation((None, 'series', 'table'))
    ta = SQLScript(ta_t, ta_rel, dag, 'ta')

    dag.build()

    assert ta_rel.exists()
Example #11
def test_exists_row_where(tmp_directory):
    client = SQLAlchemyClient('sqlite:///' + str(Path(tmp_directory, 'db.db')))

    df = pd.DataFrame({'x': [1, 2, 3, 4]})
    df.to_sql('my_table', client.engine)

    assert testing.sql.exists_row_where(client, 'x > 1', 'my_table')
    assert not testing.sql.exists_row_where(client, 'x > 4', 'my_table')
Example #12
def get_client(env):
    path = env.path.products_root / 'data.db'

    # create parent folders if they don't exist, otherwise sqlalchemy fails
    if not path.parent.exists():
        path.parent.mkdir(exist_ok=True, parents=True)

    return SQLAlchemyClient(f'sqlite:///{path}')
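A factory like get_client is normally wired into a DAG the same way the other examples register clients; a minimal hedged usage sketch (env.path.products_root is assumed to exist, as above):

dag = DAG()
client = get_client(env)
dag.clients[SQLUpload] = client
dag.clients[SQLiteRelation] = client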
Example #13
def test_sqlite_product_fetch_metadata_none_if_not_exists(tmp_directory):
    tmp = Path(tmp_directory)
    conn = SQLAlchemyClient('sqlite:///{}'.format(tmp / "database.db"))

    numbers = SQLiteRelation((None, 'numbers', 'table'), conn)
    numbers.render({})

    assert numbers.fetch_metadata() is None
Example #14
def test_can_check_nulls(tmp_directory):
    client = SQLAlchemyClient('sqlite:///' + str(Path(tmp_directory, 'db.db')))

    df = pd.DataFrame({'no_nas': [1, 2, 1], 'nas': [1, np.nan, 1]})
    df.to_sql('my_table', client.engine)

    assert not testing.sql.nulls_in_columns(client, ['no_nas'], 'my_table')
    assert testing.sql.nulls_in_columns(client, ['nas'], 'my_table')
    assert testing.sql.nulls_in_columns(client, ['no_nas', 'nas'], 'my_table')
Example #15
def test_can_check_duplicates(tmp_directory):
    client = SQLAlchemyClient('sqlite:///' + str(Path(tmp_directory, 'db.db')))

    df = pd.DataFrame({'duplicates': [1, 1], 'no_duplicates': [1, 2]})
    df.to_sql('my_table', client.engine)

    assert not testing.sql.duplicates_in_column(client, 'no_duplicates',
                                                'my_table')
    assert testing.sql.duplicates_in_column(client, 'duplicates', 'my_table')
Example #16
def test_can_check_distinct(tmp_directory):
    client = SQLAlchemyClient('sqlite:///' + str(Path(tmp_directory, 'db.db')))

    df = pd.DataFrame({'no_nas': [1, 2, 1], 'nas': [1, np.nan, 1]})
    df.to_sql('my_table', client.engine)

    assert (testing.sql.distinct_values_in_column(client, 'no_nas',
                                                  'my_table') == {1, 2})

    assert (testing.sql.distinct_values_in_column(client, 'nas',
                                                  'my_table') == {1.0, None})
Example #17
def test_custom_create_engine_kwargs(monkeypatch):

    mock = Mock()
    monkeypatch.setattr(db, 'create_engine', mock)
    client = SQLAlchemyClient('sqlite:///my_db.db',
                              create_engine_kwargs=dict(key='value'))

    # trigger call to create_engine
    client.engine

    mock.assert_called_once_with('sqlite:///my_db.db', key='value')
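This test patches create_engine on the module that defines SQLAlchemyClient, so it presumably relies on imports along these lines (the db module path is an assumption):

from unittest.mock import Mock

# assumed location of the module where SQLAlchemyClient calls create_engine
from ploomber.clients import db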
Example #18
def test_sqlite_product_fetch_metadata_none_if_empty_metadata(tmp_directory):
    tmp = Path(tmp_directory)
    conn = SQLAlchemyClient('sqlite:///{}'.format(tmp / "database.db"))

    df = pd.DataFrame({'a': np.arange(0, 100), 'b': np.arange(100, 200)})
    df.to_sql('numbers', conn.engine)

    numbers = SQLiteRelation((None, 'numbers', 'table'), conn)
    numbers.render({})

    assert numbers.fetch_metadata() is None
Example #19
def test_assert_no_duplicates_in_column(tmp_directory, stats):
    client = SQLAlchemyClient('sqlite:///' + str(Path(tmp_directory, 'db.db')))

    df = pd.DataFrame({'a': [1, 1], 'b': [1, 2]})
    df.to_sql('my_table', client.engine)

    with pytest.raises(AssertionError):
        testing.sql.assert_no_duplicates_in_column(client,
                                                   'a',
                                                   'my_table',
                                                   stats=stats)
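The stats fixture is not shown. Since assert_no_duplicates_in_column accepts a boolean stats flag, a plausible fixture simply parametrizes it:

@pytest.fixture(params=[False, True])
def stats(request):
    # hypothetical fixture: exercise the assertion with and without stats
    return request.param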
Example #20
def sqlite_client_and_tmp_dir():
    """
    Creates a SQLite db with sample data and yields an initialized client
    along with the temporary directory location
    """
    tmp_dir = Path(tempfile.mkdtemp())
    client = SQLAlchemyClient('sqlite:///' + str(tmp_dir / 'my_db.db'))
    df = pd.DataFrame({'x': range(10)})
    df.to_sql('data', client.engine)
    yield client, tmp_dir
    client.close()
Example #21
def test_duplicates_stats(tmp_directory):
    client = SQLAlchemyClient('sqlite:///' + str(Path(tmp_directory, 'db.db')))

    df = pd.DataFrame({'a': [1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1]})
    df.to_sql('my_table', client.engine)

    n_rows, n_unique, n_duplicates = (testing.sql.duplicates_stats(
        client, 'a', 'my_table'))

    assert n_rows == 11
    assert n_unique == 5
    assert n_duplicates == 6
Example #22
def test_sqlite_product_save_metadata(tmp_directory):
    tmp = Path(tmp_directory)
    conn = SQLAlchemyClient('sqlite:///{}'.format(tmp / "database.db"))

    numbers = SQLiteRelation((None, 'numbers', 'table'), conn)
    numbers.render({})

    numbers.metadata['timestamp'] = datetime.now().timestamp()
    numbers.metadata['stored_source_code'] = 'some code'

    numbers.save_metadata()

    fetched = numbers.fetch_metadata()

    assert fetched == numbers.metadata
Example #23
def test_upload_to_s3(s3, tmp_directory):
    dag = DAG()
    dag.clients[GenericProduct] = SQLAlchemyClient('sqlite://')
    Path('somefile.txt').touch()
    UploadToS3('somefile.txt',
               GenericProduct('somefile-in-s3.txt'),
               dag,
               bucket='some-bucket',
               name='s3_upload')

    dag.build()

    contents = s3.list_objects(Bucket='some-bucket')['Contents']

    assert len(contents) == 1
    assert contents[0]['Key'] == 'somefile-in-s3.txt'
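The s3 fixture is not included either. A hypothetical version using moto (mock_s3 in older moto releases; newer ones expose mock_aws instead) that creates the bucket the task expects:

import boto3
import pytest
from moto import mock_s3


@pytest.fixture
def s3():
    # hypothetical fixture: fake S3 backend with the bucket the task uses
    with mock_s3():
        client = boto3.client('s3', region_name='us-east-1')
        client.create_bucket(Bucket='some-bucket')
        yield client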
Example #24
def test_can_check_duplicates(tmp_directory):
    client = SQLAlchemyClient('sqlite:///' + str(Path(tmp_directory, 'db.db')))

    df = pd.DataFrame({'duplicates': [1, 1], 'no_duplicates': [1, 2]})
    df.to_sql('my_table', client.engine)

    assert not testing.sql.duplicates_in_column(client, 'no_duplicates',
                                                'my_table')
    assert testing.sql.duplicates_in_column(client, 'duplicates', 'my_table')

    # columns that have duplicates individually but no duplicated (a, b) pairs

    df = pd.DataFrame({'a': [1, 1, 1], 'b': [1, 2, 3]})
    df.to_sql('another_table', client.engine)
    assert not testing.sql.duplicates_in_column(client, ['a', 'b'],
                                                'another_table')
Example #25
def test_sqlite_product_delete(tmp_directory):
    """
    >>> import tempfile
    >>> tmp_directory = tempfile.mkdtemp()
    """
    tmp = Path(tmp_directory)
    conn = SQLAlchemyClient('sqlite:///{}'.format(tmp / "database.db"))

    df = pd.DataFrame({'a': np.arange(0, 100), 'b': np.arange(100, 200)})
    df.to_sql('numbers', conn.engine)

    numbers = SQLiteRelation((None, 'numbers', 'table'), conn)
    numbers.render({})
    numbers.delete()

    assert not numbers.exists()
Example #26
def pg_client():
    db = _load_db_credentials()

    client = SQLAlchemyClient(db['uri'])

    # set a new schema for this session, otherwise if two test sessions
    # are run at the same time, tests might conflict with each other
    schema = (''.join(random.choice(string.ascii_letters) for i in range(8)))

    client.execute('CREATE SCHEMA {};'.format(schema))
    client.execute('SET search_path TO {};'.format(schema))

    yield client

    # clean up schema
    client.execute('drop schema {} cascade;'.format(schema))

    client.close()
Example #27
def test_sql_script_shows_executed_code_if_fails(tmp_directory, sample_data):

    dag = DAG()

    client = SQLAlchemyClient('sqlite:///database.db')
    dag.clients[SQLScript] = client
    dag.clients[SQLiteRelation] = client

    SQLScript('SOME INVALID SQL {{product}}',
              SQLiteRelation((None, 'another', 'table')),
              dag=dag,
              name='task')

    with pytest.raises(DAGBuildError) as excinfo:
        dag.build()

    assert 'SOME INVALID SQL' in str(excinfo.value)
    assert 'near "SOME": syntax error' in str(excinfo.value)
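sample_data is a fixture that is not shown. Given that the script runs against sqlite:///database.db, a hypothetical version could seed that file with a small table:

@pytest.fixture
def sample_data():
    # hypothetical fixture: seed database.db with a small table to query
    client = SQLAlchemyClient('sqlite:///database.db')
    pd.DataFrame({'x': range(10)}).to_sql('data', client.engine, index=False)
    client.close()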
Example #28
def test_sql_dump_shows_executed_code_if_fails(tmp_directory):
    tmp = Path(tmp_directory)

    client = SQLAlchemyClient('sqlite:///{}'.format(tmp / "database.db"))

    dag = DAG()

    SQLDump('SOME INVALID SQL',
            File('data.parquet'),
            dag,
            name='data',
            client=client)

    with pytest.raises(DAGBuildError) as excinfo:
        dag.build()

    assert 'SOME INVALID SQL' in str(excinfo.value)
    assert 'near "SOME": syntax error' in str(excinfo.value)
Example #29
def get_pg_client():
    p = Path('~', '.auth', 'postgres-ploomber.json').expanduser()

    # if running locally, load from file
    if p.exists():
        with open(p) as f:
            uri = json.load(f)['uri']

    # if no credentials file, use env variable (used in windows CI)
    elif 'POSTGRES' in os.environ:
        b64 = os.environ['POSTGRES']
        json_str = base64.b64decode(b64).decode()
        uri = json.loads(json_str)['uri']

    # otherwise, use the local postgres db (used in linux CI)
    else:
        uri = 'postgresql://*****:*****@localhost:5432/postgres'

    return SQLAlchemyClient(uri)
Example #30
def pg_client_and_schema():
    db = _load_db_credentials()

    client = SQLAlchemyClient(db['uri'])

    # set a new schema for this session, otherwise if two test sessions
    # are run at the same time, tests might conflict with each other
    # NOTE: avoid upper case characters, pandas.DataFrame.to_sql does not like
    # them
    schema = (''.join(random.choice(string.ascii_letters)
                      for i in range(12))).lower()

    client.execute('CREATE SCHEMA {};'.format(schema))
    client.execute('SET search_path TO {};'.format(schema))

    yield client, schema

    # clean up schema
    client.execute('drop schema {} cascade;'.format(schema))

    client.close()
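Fixtures like this one yield a (client, schema) tuple, so a consuming test unpacks it. A brief hedged sketch:

def test_schema_was_created(pg_client_and_schema):
    # hypothetical consumer: the fixture yields (client, schema)
    client, schema = pg_client_and_schema
    schemas = pd.read_sql(
        'SELECT schema_name FROM information_schema.schemata', client.engine)
    assert schema in set(schemas['schema_name'])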