def dataset_db_with_swap_table(metadata_db):
    """Seed ``my_database`` with a ``dataset_test`` table and a timestamped
    ``_swap`` copy, returning the corresponding Database model instance.

    The connection context manager commits the DDL/DML on exit.
    """
    database = factories.DatabaseFactory(memorable_name="my_database")
    dsn = database_dsn(settings.DATABASES_DATA["my_database"])
    with psycopg2.connect(dsn) as connection:
        cursor = connection.cursor()
        cursor.execute(
            """
            CREATE TABLE IF NOT EXISTS dataset_test (
                id INT,
                name VARCHAR(255),
                date DATE
            );
            DELETE FROM dataset_test;
            INSERT INTO dataset_test values(1,'test','2022-01-01');
            CREATE TABLE IF NOT EXISTS dataset_test_20220101t000000_swap (
                id INT,
                name VARCHAR(255),
                date DATE
            );
            DELETE FROM dataset_test_20220101t000000_swap;
            INSERT INTO dataset_test_20220101t000000_swap values(1,'test','2022-01-01');
            INSERT INTO dataset_test_20220101t000000_swap values(2,'test_2','2022-01-02');
            """
        )
    return database
def test_view_download(self):
    """A source view download streams CSV rows and logs a download event."""
    dsn = database_dsn(settings.DATABASES_DATA['my_database'])
    with connect(dsn) as conn, conn.cursor() as cursor:
        cursor.execute('''
            CREATE TABLE if not exists download_test_table (field2 int,field1 varchar(255));
            TRUNCATE TABLE download_test_table;
            INSERT INTO download_test_table VALUES(1, 'record1');
            INSERT INTO download_test_table VALUES(2, 'record2');
            CREATE OR REPLACE VIEW download_test_view AS SELECT * FROM download_test_table;
        ''')

    dataset = factories.DataSetFactory(
        user_access_type='REQUIRES_AUTHENTICATION')
    source_view = factories.SourceViewFactory(
        dataset=dataset,
        database=factories.DatabaseFactory(memorable_name='my_database'),
        schema='public',
        view='download_test_view',
    )

    # Capture the counters before the request so deltas can be asserted.
    events_before = EventLog.objects.count()
    downloads_before = dataset.number_of_downloads

    response = self._authenticated_get(source_view.get_absolute_url())
    self.assertEqual(response.status_code, 200)
    self.assertEqual(
        b''.join(response.streaming_content),
        b'"field2","field1"\r\n1,"record1"\r\n2,"record2"\r\n"Number of rows: 2"\r\n',
    )

    # Exactly one new event of the expected type, and one more download.
    self.assertEqual(EventLog.objects.count(), events_before + 1)
    self.assertEqual(
        EventLog.objects.latest().event_type,
        EventLog.TYPE_DATASET_SOURCE_VIEW_DOWNLOAD,
    )
    self.assertEqual(
        DataSet.objects.get(pk=dataset.id).number_of_downloads,
        downloads_before + 1)
def test_view_download(self, access_type, request_client, published):
    """A source view download streams CSV rows and records the event,
    across the parametrized access type / client / published combinations."""
    dsn = database_dsn(settings.DATABASES_DATA["my_database"])
    with psycopg2.connect(dsn) as conn, conn.cursor() as cursor:
        cursor.execute("""
            CREATE TABLE if not exists download_test_table (field2 int,field1 varchar(255));
            TRUNCATE TABLE download_test_table;
            INSERT INTO download_test_table VALUES(1, 'record1');
            INSERT INTO download_test_table VALUES(2, 'record2');
            CREATE OR REPLACE VIEW download_test_view AS SELECT * FROM download_test_table;
        """)

    dataset = factories.DataSetFactory(
        user_access_type=access_type, published=published)
    source_view = factories.SourceViewFactory(
        dataset=dataset,
        database=factories.DatabaseFactory(memorable_name="my_database"),
        schema="public",
        view="download_test_view",
    )

    # Snapshot counters so the assertions below can check the deltas.
    events_before = EventLog.objects.count()
    downloads_before = dataset.number_of_downloads

    response = request_client.get(source_view.get_absolute_url())
    assert response.status_code == 200
    body = b"".join(response.streaming_content)
    assert body == (
        b'"field2","field1"\r\n1,"record1"\r\n2,"record2"\r\n"Number of rows: 2"\r\n'
    )

    assert EventLog.objects.count() == events_before + 1
    latest_event = EventLog.objects.latest()
    assert latest_event.event_type == EventLog.TYPE_DATASET_SOURCE_VIEW_DOWNLOAD
    refreshed = DataSet.objects.get(pk=dataset.id)
    assert refreshed.number_of_downloads == downloads_before + 1
def test_materialized_view_download(self, request_client, published):
    """Downloading a materialized source view streams its rows as CSV and
    records a download event."""
    dsn = database_dsn(settings.DATABASES_DATA['my_database'])
    with psycopg2.connect(dsn) as conn, conn.cursor() as cursor:
        cursor.execute('''
            CREATE TABLE if not exists materialized_test_table (field2 int,field1 varchar(255));
            TRUNCATE TABLE materialized_test_table;
            INSERT INTO materialized_test_table VALUES(1, 'record1');
            INSERT INTO materialized_test_table VALUES(2, 'record2');
            DROP MATERIALIZED VIEW IF EXISTS materialized_test_view;
            CREATE MATERIALIZED VIEW materialized_test_view AS
                SELECT * FROM materialized_test_table;
        ''')

    dataset = factories.DataSetFactory(
        user_access_type='REQUIRES_AUTHENTICATION', published=published)
    source_view = factories.SourceViewFactory(
        dataset=dataset,
        database=factories.DatabaseFactory(memorable_name='my_database'),
        schema='public',
        view='materialized_test_view',
    )

    # Record pre-request counts so the post-request deltas can be checked.
    events_before = EventLog.objects.count()
    downloads_before = dataset.number_of_downloads

    response = request_client.get(source_view.get_absolute_url())
    assert response.status_code == 200
    streamed = b''.join(response.streaming_content)
    assert streamed == (
        b'"field2","field1"\r\n1,"record1"\r\n2,"record2"\r\n"Number of rows: 2"\r\n'
    )

    assert EventLog.objects.count() == events_before + 1
    assert (
        EventLog.objects.latest().event_type
        == EventLog.TYPE_DATASET_SOURCE_VIEW_DOWNLOAD
    )
    assert (
        DataSet.objects.get(pk=dataset.id).number_of_downloads
        == downloads_before + 1
    )
def metadata_db(db):
    """Create the ``dataflow.metadata`` bookkeeping table in ``my_database``
    and populate it with four known rows; returns the Database model."""
    database = factories.DatabaseFactory(memorable_name="my_database")
    dsn = database_dsn(settings.DATABASES_DATA["my_database"])
    with psycopg2.connect(dsn) as connection, connection.cursor() as cur:
        cur.execute(
            """
            CREATE SCHEMA IF NOT EXISTS dataflow;
            CREATE TABLE IF NOT EXISTS dataflow.metadata (
                id SERIAL,
                table_schema TEXT,
                table_name TEXT,
                source_data_modified_utc TIMESTAMP WITHOUT TIME ZONE,
                dataflow_swapped_tables_utc TIMESTAMP WITHOUT TIME ZONE,
                table_structure JSONB,
                data_ids TEXT[],
                data_type INTEGER NOT NULL,
                data_hash_v1 TEXT,
                primary_keys TEXT[]
            );
            TRUNCATE TABLE dataflow.metadata;
            INSERT INTO dataflow.metadata (
                table_schema, table_name, source_data_modified_utc,
                dataflow_swapped_tables_utc, table_structure, data_type
            )
            VALUES
            ('public','table1','2020-09-02 00:01:00.0','2020-09-02 00:01:00.0','{"field1":"int","field2":"varchar"}',1),
            ('public','table2','2020-09-01 00:01:00.0','2020-09-02 00:01:00.0',NULL,1),
            ('public','table1','2020-01-01 00:01:00.0','2020-09-02 00:01:00.0',NULL,1),
            ('public','table4', NULL,'2021-12-01 00:00:00.0',NULL,1);
            """
        )
        # Commit explicitly so the rows are visible to other connections.
        connection.commit()
    return database
def test_rds_sync(self, mock_client, dataset_db):
    """Sync tool-query logs from mocked RDS log files and verify that only
    valid pgaudit entries for recognised users/databases become audit rows.

    Fixes: the cache key was accidentally split into two implicitly
    concatenated literals ("q" "uery_tool_logs_last_run"), obscuring the key
    and diverging from the sibling test_docker_sync; the result queryset was
    also re-materialised (re-queried) for every assertion.
    """
    # Clear the last-run marker so the sync processes everything.
    cache.delete("query_tool_logs_last_run")
    log_count = ToolQueryAuditLog.objects.count()
    table_count = ToolQueryAuditLogTable.objects.count()
    factories.DatabaseFactory(memorable_name="my_database")
    factories.DatabaseUserFactory.create(username="******")
    factories.SourceTableFactory.create(schema="public", table="test_dataset")
    mock_client.return_value.describe_db_log_files.return_value = {
        "DescribeDBLogFiles": [
            {"LogFileName": "/file/1.csv"},
            {"LogFileName": "/file/2.csv"},
        ]
    }
    mock_client.return_value.download_db_log_file_portion.side_effect = [
        {
            "Marker": "1",
            "AdditionalDataPending": True,
            "LogFileData": (
                # Valid user and db select statement
                self.log_data[0]
                # Non-pgaudit log
                + self.log_data[1]
            ),
        },
        {
            "Marker": None,
            "AdditionalDataPending": False,
            "LogFileData": (
                # Unrecognised user
                self.log_data[2]
                # Unrecognised database
                + self.log_data[3]
            ),
        },
        {
            "Marker": None,
            "AdditionalDataPending": False,
            "LogFileData": (
                # Valid username and db insert statement
                self.log_data[4]
                # Timestamp out of range
                + self.log_data[5]
                # No timestamp
                + self.log_data[6]
                # Duplicate log entry
                + self.log_data[7]
            ),
        },
    ]
    _do_sync_tool_query_logs()
    queries = ToolQueryAuditLog.objects.all()
    tables = ToolQueryAuditLogTable.objects.all()
    assert queries.count() == log_count + 2
    assert tables.count() == table_count + 1
    # Materialise once instead of issuing a fresh query per assertion.
    query_list = list(queries)
    assert query_list[-2].query_sql == "SELECT * FROM dataset_test"
    assert query_list[-2].connection_from == "172.19.0.4"
    assert query_list[-1].query_sql == "INSERT INTO dataset_test VALUES(1);"
    assert query_list[-1].connection_from == "172.19.0.5"
def test_missing_view(self, client):
    """Requesting a source view whose backing view does not exist yields a
    404 and does not bump the dataset's download counter."""
    dataset = factories.DataSetFactory(user_access_type='REQUIRES_AUTHENTICATION')
    source_view = factories.SourceViewFactory(
        dataset=dataset,
        database=factories.DatabaseFactory(memorable_name='my_database'),
    )
    downloads_before = dataset.number_of_downloads

    response = client.get(source_view.get_absolute_url())

    assert response.status_code == 404
    refreshed = DataSet.objects.get(pk=dataset.id)
    assert refreshed.number_of_downloads == downloads_before
def test_db(self, db):
    """Yield a Database backed by a real ``test_table``; drop it afterwards."""
    database = factories.DatabaseFactory(memorable_name='my_database')
    dsn = database_dsn(settings.DATABASES_DATA['my_database'])
    with psycopg2.connect(dsn) as conn:
        conn.cursor().execute('''
            CREATE TABLE IF NOT EXISTS test_table AS (
                SELECT 1 as a, 2 as b
            );
        ''')
        conn.commit()
        yield database
        # Teardown: remove the table; the connection context manager
        # commits the DROP when the block exits.
        conn.cursor().execute('DROP TABLE test_table')
def test_missing_view(self):
    """A source view without a backing database view returns 404 and leaves
    the download counter unchanged."""
    dataset = factories.DataSetFactory(
        user_access_type='REQUIRES_AUTHENTICATION')
    source_view = factories.SourceViewFactory(
        dataset=dataset,
        database=factories.DatabaseFactory(memorable_name='my_database'),
    )
    downloads_before = dataset.number_of_downloads

    response = self._authenticated_get(source_view.get_absolute_url())

    self.assertEqual(response.status_code, 404)
    refreshed = DataSet.objects.get(pk=dataset.id)
    self.assertEqual(refreshed.number_of_downloads, downloads_before)
def setUp(self):
    """Create the shared Database record and a populated
    ``custom_query_test`` table for the tests in this class."""
    super().setUp()
    self.database = factories.DatabaseFactory(memorable_name='my_database')
    self.dsn = database_dsn(settings.DATABASES_DATA['my_database'])
    with connect(self.dsn) as conn:
        with conn.cursor() as cursor:
            cursor.execute('''
                CREATE TABLE IF NOT EXISTS custom_query_test (
                    id INT,
                    name VARCHAR(255),
                    date DATE
                );
                TRUNCATE TABLE custom_query_test;
                INSERT INTO custom_query_test VALUES(1, 'the first record', NULL);
                INSERT INTO custom_query_test VALUES(2, 'the second record', '2019-01-01');
                INSERT INTO custom_query_test VALUES(3, 'the last record', NULL);
            ''')
def metadata_db(db):
    """Create and seed the legacy four-column ``dataflow.metadata`` table,
    returning the Database model for ``my_database``.

    NOTE(review): every seeded row reuses id 1 — presumably the tests only
    key on schema/table/timestamp; confirm before relying on ``id``.
    """
    database = factories.DatabaseFactory(memorable_name='my_database')
    dsn = database_dsn(settings.DATABASES_DATA['my_database'])
    with psycopg2.connect(dsn) as connection, connection.cursor() as cur:
        cur.execute('''
            CREATE SCHEMA IF NOT EXISTS dataflow;
            CREATE TABLE IF NOT EXISTS dataflow.metadata (
                id int,
                table_schema text,
                table_name text,
                source_data_modified_utc timestamp
            );
            TRUNCATE TABLE dataflow.metadata;
            INSERT INTO dataflow.metadata VALUES(1, 'public', 'table1', '2020-09-02 00:01:00.0');
            INSERT INTO dataflow.metadata VALUES(1, 'public', 'table2', '2020-09-01 00:01:00.0');
            INSERT INTO dataflow.metadata VALUES(1, 'public', 'table1', '2020-01-01 00:01:00.0');
            INSERT INTO dataflow.metadata VALUES(1, 'public', 'table4', NULL);
        ''')
        connection.commit()
    return database
def dataset_finder_db(metadata_db):
    """Seed ``my_database`` with the tables the dataset-finder tests query:
    source-table/catalogue-item mirrors plus ``data`` and ``country_stats``."""
    database = factories.DatabaseFactory(memorable_name="my_database")
    dsn = database_dsn(settings.DATABASES_DATA["my_database"])
    with psycopg2.connect(dsn) as connection:
        connection.cursor().execute(
            """
            CREATE TABLE IF NOT EXISTS dataworkspace__source_tables (
                id INT,
                name VARCHAR(255),
                dataset_id UUID,
                schema VARCHAR(255),
                "table" VARCHAR(255)
            );
            CREATE TABLE IF NOT EXISTS dataworkspace__catalogue_items (
                id UUID,
                name VARCHAR(255),
                slug VARCHAR(255)
            );
            INSERT INTO dataworkspace__source_tables VALUES(
                1,
                'public.data',
                '0dea6147-d355-4b6d-a140-0304ef9cfeca',
                'public',
                'data'
            );
            INSERT INTO dataworkspace__catalogue_items VALUES(
                '0dea6147-d355-4b6d-a140-0304ef9cfeca',
                'public.data',
                '1'
            );
            CREATE SCHEMA IF NOT EXISTS public;
            CREATE TABLE IF NOT EXISTS data (
                id int,
                name VARCHAR(255),
                database VARCHAR(255),
                schema VARCHAR(255),
                frequency VARCHAR(255),
                "table" VARCHAR(255)
            );
            CREATE TABLE IF NOT EXISTS country_stats (
                date DATE,
                driving NUMERIC,
                country VARCHAR(255)
            );
            """
        )
    return database
def test_db(self, db):
    """Yield the memorable name of a database containing ``test_sample``
    (rows with varying NULL patterns); drop the table on teardown."""
    database = factories.DatabaseFactory(memorable_name="my_database")
    dsn = database_dsn(settings.DATABASES_DATA["my_database"])
    with psycopg2.connect(dsn) as conn:
        conn.cursor().execute("""
            CREATE TABLE IF NOT EXISTS test_sample AS (
                with data (x, y, z) as (values
                    (NULL, NULL, NULL),
                    ('a', 'b', 'c'),
                    ('a', 'b', NULL),
                    ('a', NULL, NULL)
                )
                SELECT * from data
            );
        """)
        conn.commit()
        yield database.memorable_name
        # Teardown: the connection context manager commits the DROP on exit.
        conn.cursor().execute("DROP TABLE test_sample")
def test_docker_sync(self, mock_os, dataset_db):
    """Sync tool-query logs from mocked local log files and verify the
    expected audit-log rows and table references are created."""
    # Clear the last-run marker so the sync processes everything.
    cache.delete("query_tool_logs_last_run")
    table_count = ToolQueryAuditLogTable.objects.count()
    log_count = ToolQueryAuditLog.objects.count()
    factories.DatabaseFactory(memorable_name="my_database")
    factories.DatabaseUserFactory.create(username="******")
    factories.SourceTableFactory.create(schema="public", table="test_dataset")

    # Only the .csv file should be picked up; make both look recent.
    mock_os.listdir.return_value = ["file1.csv", "file2.log"]
    mock_os.path.getmtime.return_value = datetime.datetime.now().timestamp()

    _do_sync_tool_query_logs()

    queries = ToolQueryAuditLog.objects.all()
    tables = ToolQueryAuditLogTable.objects.all()
    assert queries.count() == log_count + 2
    assert tables.count() == table_count + 1
    # Materialise once rather than re-querying per assertion.
    query_list = list(queries)
    assert query_list[-2].query_sql == "SELECT * FROM dataset_test"
    assert query_list[-1].query_sql == "INSERT INTO dataset_test VALUES(1);"
def dataset_db(metadata_db):
    """Create ``dataset_test``/``dataset_test2`` tables and a ``dataset_view``
    over the first, returning the ``my_database`` Database model."""
    database = factories.DatabaseFactory(memorable_name='my_database')
    dsn = database_dsn(settings.DATABASES_DATA['my_database'])
    with psycopg2.connect(dsn) as connection:
        connection.cursor().execute('''
            CREATE TABLE IF NOT EXISTS dataset_test (
                id INT,
                name VARCHAR(255),
                date DATE
            );
            CREATE TABLE IF NOT EXISTS dataset_test2 (
                id INT,
                name VARCHAR(255)
            );
            CREATE OR REPLACE VIEW dataset_view AS (SELECT * FROM dataset_test);
        ''')
    return database
def test_download_filtered_columns(client, mocker, dataset_finder_db):
    """Downloading finder grid results with a column filter streams a CSV
    containing only the requested columns.

    The Elasticsearch client is fully mocked: the first canned response
    resolves the search to one index, the second returns the matching
    documents; the POST body then restricts the output to the ``country``
    column.
    """
    source_table = factories.SourceTableFactory.create(
        dataset=factories.MasterDataSetFactory.create(
            user_access_type=UserAccessType.OPEN),
        schema="public",
        table="country_stats",
        database=factories.DatabaseFactory(memorable_name="my_database"),
    )
    # The grid needs the field list to build its columns.
    get_fields = mocker.patch(
        "dataworkspace.apps.finder.elasticsearch.ElasticsearchClient.get_fields"
    )
    get_fields.return_value = ["date", "country", "driving"]
    dataset_search = mocker.patch("elasticsearch.Elasticsearch.search")
    dataset_search.side_effect = [
        # First call: aggregation query that maps the search term to the
        # index holding the table's documents.
        {
            "took": 11,
            "timed_out": False,
            "_shards": {
                "total": 45,
                "successful": 45,
                "skipped": 0,
                "failed": 0
            },
            "hits": {
                "total": {
                    "value": 1260,
                    "relation": "eq"
                },
                "max_score": None,
                "hits": [],
            },
            "aggregations": {
                "indexes": {
                    "doc_count_error_upper_bound": 0,
                    "sum_other_doc_count": 0,
                    "buckets": [
                        {
                            "key": "20210316t070000--public--country_stats--1",
                            "doc_count": 3,
                        },
                    ],
                }
            },
        },
        # Second call: the actual document fetch from that index.
        {
            "took": 11,
            "timed_out": False,
            "_shards": {
                "total": 1,
                "successful": 1,
                "skipped": 0,
                "failed": 0
            },
            "hits": {
                "total": {
                    "value": 3,
                    "relation": "eq"
                },
                "max_score": None,
                "hits": [
                    {
                        "_index": "20210316t070000--public--country_stats--1",
                        "_type": "_doc",
                        "_id": "1",
                        "_score": 1.0,
                        "_source": {
                            "country": "Albania",
                            "date": "2020-01-13",
                            "driving": 0.0,
                        },
                    },
                    {
                        "_index": "20210316t070000--public--country_stats--1",
                        "_type": "_doc",
                        "_id": "1",
                        "_score": 1.0,
                        "_source": {
                            "country": "Algeria",
                            "date": "2020-01-14",
                            "driving": 1.5,
                        },
                    },
                ],
            },
        },
    ]
    params = {
        "q": "albania",
        "name": "test",
        "uuid": source_table.dataset.id,
        "slug": "slug",
        "download": "1",
    }
    # POST with columns=["country"] so only that column is exported.
    response = client.post(
        reverse(
            "finder:data_grid_results",
            kwargs={
                "schema": "public",
                "table": "country_stats"
            },
        )
        + "?"
        + urlencode(params),
        {
            "columns": ["country"],
            "filters": {}
        },
    )
    assert response.status_code == 200
    # Streamed CSV holds the header plus one value per mocked document.
    assert b"".join(response.streaming_content) == (
        b'"country"\r\n"Albania"\r\n"Algeria"\r\n')
def _get_database(self):
    """Return a Database model instance named ``my_database``."""
    database = factories.DatabaseFactory(memorable_name='my_database')
    return database
def test_db(self, db):
    """Yield a freshly created ``my_database`` Database model instance."""
    database = factories.DatabaseFactory(memorable_name='my_database')
    yield database