def test_refresh_schema_creates_tables(self):
    """Check the rows stored after a schema refresh and a sample update.

    Runs ``refresh_schema`` and then ``update_sample`` for the factory data
    source, and verifies that exactly one ``TableMetadata`` row and one
    ``ColumnMetadata`` row exist and that their ``to_dict()`` output matches
    the expected dictionaries.
    """
    expected_table = {
        'id': 1,
        'org_id': 1,
        'data_source_id': 1,
        'name': 'table',
        'description': None,
        'exists': True,
        'visible': True,
        'column_metadata': True,
        'sample_query': None,
        'sample_updated_at': None,
    }
    ds_id = self.factory.data_source.id

    refresh_schema(ds_id)
    update_sample(ds_id, 'table', 1, "2019-05-09T17:07:52.386910Z")

    tables = TableMetadata.query.all()
    columns = ColumnMetadata.query.all()

    self.assertEqual(len(tables), 1)
    self.assertEqual(len(columns), 1)
    self.assertEqual(tables[0].to_dict(), expected_table)
    self.assertEqual(columns[0].to_dict(), self.EXPECTED_COLUMN_METADATA)
def test_refresh_schema_creates_tables(self):
    """Check the rows stored after a schema refresh and a sample fetch.

    Runs ``refresh_schema`` and then ``get_table_sample_data`` for the
    factory data source, and verifies that exactly one ``TableMetadata`` row
    and one ``ColumnMetadata`` row exist and that their ``to_dict()`` output
    matches the expected dictionaries.
    """
    expected_table = {
        'id': 1,
        'org_id': 1,
        'data_source_id': 1,
        'name': 'table',
        'description': None,
        'exists': True,
        'column_metadata': True,
        'sample_query': None,
    }
    ds_id = self.factory.data_source.id

    refresh_schema(ds_id)
    sampled_table = {"name": 'table', "columns": [self.COLUMN_NAME]}
    get_table_sample_data(ds_id, sampled_table, 1)

    tables = TableMetadata.query.all()
    columns = ColumnMetadata.query.all()

    self.assertEqual(len(tables), 1)
    self.assertEqual(len(columns), 1)
    self.assertEqual(tables[0].to_dict(), expected_table)
    self.assertEqual(columns[0].to_dict(), self.EXPECTED_COLUMN_METADATA)
def test_refresh_schema_update_column(self):
    """Check that a changed column type is picked up by a schema refresh.

    After an initial refresh and sample update the serialized column must
    equal ``EXPECTED_COLUMN_METADATA``. The patched schema is then altered so
    the first column reports a different type; after a second refresh the
    serialized column must differ from the expected dictionary and carry the
    new type.
    """
    new_type = "varchar"
    ds_id = self.factory.data_source.id

    def serialized(column):
        return ColumnMetadataSerializer(column).serialize()

    refresh_schema(ds_id)
    ninety_days_ago = utils.utcnow() - datetime.timedelta(days=90)
    update_sample(ds_id, "table", 1, ninety_days_ago)

    columns = ColumnMetadata.query.all()
    self.assertEqual(serialized(columns[0]), self.EXPECTED_COLUMN_METADATA)

    # Work on a deep copy so the shared default schema stays untouched.
    changed_schema = copy.deepcopy(self.default_schema_return_value)
    changed_schema[0]["metadata"][0]["type"] = new_type
    self.patched_get_schema.return_value = changed_schema

    refresh_schema(ds_id)

    columns = ColumnMetadata.query.all()
    self.assertNotEqual(serialized(columns[0]), self.EXPECTED_COLUMN_METADATA)
    self.assertEqual(serialized(columns[0])["type"], new_type)
def test_refresh_schema_delete_column(self):
    """Check the ``exists`` flags after the schema swaps one column for another.

    The first refresh stores the default column. The patched schema is then
    replaced by one that lists only a new column; after the second refresh
    two column rows must be present, with ``exists`` False on one and True on
    the other.
    """
    replacement_name = "new_column"
    ds_id = self.factory.data_source.id

    def exists_flag(column):
        return ColumnMetadataSerializer(column).serialize()["exists"]

    refresh_schema(ds_id)
    columns = ColumnMetadata.query.all()
    self.assertTrue(exists_flag(columns[0]))

    replacement_column = {
        "name": replacement_name,
        "type": self.COLUMN_TYPE,
    }
    self.patched_get_schema.return_value = [{
        "name": "table",
        "columns": [replacement_name],
        "metadata": [replacement_column],
    }]
    refresh_schema(ds_id)

    columns = ColumnMetadata.query.all()
    self.assertEqual(len(columns), 2)
    # NOTE(review): the two index-based checks below depend on the row order
    # returned by a query without an explicit ORDER BY -- confirm this is
    # stable on the database used by the test suite.
    self.assertFalse(exists_flag(columns[1]))
    self.assertTrue(exists_flag(columns[0]))
def test_recent_empty_sample_refreshs(self):
    """Check that a column without an example gets one from ``update_sample``.

    With ``samples`` enabled on the query runner configuration, a schema
    refresh leaves the first column's ``example`` unset. ``update_sample`` is
    then called with an ISO timestamp five days in the past (presumably the
    time of the previous sample -- confirm against ``update_sample``), after
    which the column's ``example`` must equal ``COLUMN_EXAMPLE``.
    """
    self.factory.data_source.query_runner.configuration['samples'] = True
    refresh_schema(self.factory.data_source.id)

    # Confirm no sample exists
    column_metadata = ColumnMetadata.query.first()
    self.assertIsNone(column_metadata.example)

    last_update = utils.utcnow() - datetime.timedelta(days=5)
    update_sample(
        self.factory.data_source.id,
        'table',
        1,
        last_update.isoformat(),
    )

    column_metadata = ColumnMetadata.query.first()
    self.assertEqual(column_metadata.example, self.COLUMN_EXAMPLE)
def test_refresh_samples_applied_to_one_data_source(self):
    """Check that ``refresh_samples`` only touches the given data source.

    Two data sources get ``samples`` enabled and their schemas refreshed, but
    samples are refreshed for the first one only. The number of tables with a
    non-null ``sample_updated_at`` must then equal the number of tables in
    the default schema.
    """
    first_source = self.factory.create_data_source()
    second_source = self.factory.create_data_source()

    for source in (first_source, second_source):
        source.query_runner.configuration['samples'] = True

    refresh_schema(first_source.id)
    refresh_schema(second_source.id)

    refresh_samples(first_source.id, 50)

    sampled_tables = TableMetadata.query.filter(
        TableMetadata.sample_updated_at.isnot(None)
    )
    expected_count = len(self.default_schema_return_value)
    self.assertEqual(sampled_tables.count(), expected_count)
def test_refresh_schema_doesnt_overwrite_samples(self):
    """Check that a schema refresh keeps an already stored column example.

    With ``samples`` enabled, the first refresh leaves ``example`` unset,
    ``update_sample`` fills it with ``COLUMN_EXAMPLE``, and a further
    ``refresh_schema`` call must leave that value in place.
    """
    self.factory.data_source.query_runner.configuration['samples'] = True

    refresh_schema(self.factory.data_source.id)
    column_metadata = ColumnMetadata.query.first()
    self.assertIsNone(column_metadata.example)

    update_sample(
        self.factory.data_source.id,
        'table',
        1,
        "2019-05-09T17:07:52.386910Z",
    )
    column_metadata = ColumnMetadata.query.first()
    self.assertEqual(column_metadata.example, self.COLUMN_EXAMPLE)

    # Check that a schema refresh doesn't overwrite examples
    refresh_schema(self.factory.data_source.id)
    column_metadata = ColumnMetadata.query.first()
    self.assertEqual(column_metadata.example, self.COLUMN_EXAMPLE)
def test_refresh_samples_rate_limits(self):
    """Check that each ``refresh_samples`` call handles at most 50 tables.

    The patched schema reports 105 tables. After each of three
    ``refresh_samples(..., 50)`` calls the number of tables whose
    ``sample_updated_at`` is still null must be 55, 5 and 0 respectively.
    """
    column_name = 'new_column'
    table_total = 105

    self.patched_get_schema.return_value = [
        {
            'name': 'table{}'.format(index),
            'columns': [column_name],
            'metadata': [{
                'name': column_name,
                'type': self.COLUMN_TYPE,
            }],
        }
        for index in range(table_total)
    ]
    self.factory.data_source.query_runner.configuration['samples'] = True
    ds_id = self.factory.data_source.id

    def unsampled_tables():
        return TableMetadata.query.filter(
            TableMetadata.sample_updated_at.is_(None)
        ).all()

    refresh_schema(ds_id)
    refresh_samples(ds_id, 50)

    # Every table from the patched schema was stored.
    self.assertEqual(TableMetadata.query.count(), table_total)

    # First call: 50 tables sampled, 55 still pending.
    self.assertEqual(len(unsampled_tables()), 55)

    # Second call: another 50 sampled, 5 still pending.
    refresh_samples(ds_id, 50)
    self.assertEqual(len(unsampled_tables()), 5)

    # Third call: nothing left pending.
    refresh_samples(ds_id, 50)
    self.assertEqual(len(unsampled_tables()), 0)
def test_refresh_samples_refreshes(self):
    """Check that ``refresh_samples`` re-samples tables with an old timestamp.

    Five tables are sampled once and their ``sample_updated_at`` must be
    later than a timestamp taken at the start of the test. The timestamps
    are then bulk-updated to 30 days in the past; after a second
    ``refresh_samples`` call the first sampled table's ``sample_updated_at``
    must be later than a timestamp taken just before that call.
    """
    new_column_name = 'new_column'
    num_tables = 5
    before_first_refresh = utils.utcnow()

    tables = [
        {
            'name': 'table{}'.format(i),
            'columns': [new_column_name],
            'metadata': [{
                'name': new_column_name,
                'type': self.COLUMN_TYPE,
            }],
        }
        for i in range(num_tables)
    ]
    self.patched_get_schema.return_value = tables
    self.factory.data_source.query_runner.configuration['samples'] = True

    refresh_schema(self.factory.data_source.id)
    refresh_samples(self.factory.data_source.id, 50)

    # There's a total of 5 processed tables
    table_metadata = TableMetadata.query.filter(
        TableMetadata.sample_updated_at.isnot(None))
    self.assertEqual(table_metadata.count(), num_tables)
    # assertGreater reports both operands on failure, unlike assertTrue(a > b).
    self.assertGreater(
        table_metadata.first().sample_updated_at, before_first_refresh)

    # Push every sample timestamp 30 days into the past.
    table_metadata.update({
        'sample_updated_at': utils.utcnow() - datetime.timedelta(days=30)
    })
    models.db.session.commit()

    before_second_refresh = utils.utcnow()
    refresh_samples(self.factory.data_source.id, 50)

    table_metadata_list = TableMetadata.query.filter(
        TableMetadata.sample_updated_at.isnot(None))
    self.assertGreater(
        table_metadata_list.first().sample_updated_at, before_second_refresh)
def test_refresh_schema_update_column(self):
    """Check that a changed column type is picked up by a schema refresh.

    After an initial refresh and sample update the column's ``to_dict()``
    must equal ``EXPECTED_COLUMN_METADATA``. The patched schema is then
    altered so the first column reports a different type; after a second
    refresh the dictionary must differ from the expected one and carry the
    new type.
    """
    new_type = 'varchar'
    ds_id = self.factory.data_source.id

    refresh_schema(ds_id)
    update_sample(ds_id, 'table', 1, "2019-05-09T17:07:52.386910Z")

    columns = ColumnMetadata.query.all()
    self.assertEqual(columns[0].to_dict(), self.EXPECTED_COLUMN_METADATA)

    # Work on a deep copy so the shared default schema stays untouched.
    changed_schema = copy.deepcopy(self.default_schema_return_value)
    changed_schema[0]['metadata'][0]['type'] = new_type
    self.patched_get_schema.return_value = changed_schema

    refresh_schema(ds_id)

    columns = ColumnMetadata.query.all()
    first_column = columns[0]
    self.assertNotEqual(first_column.to_dict(), self.EXPECTED_COLUMN_METADATA)
    self.assertEqual(first_column.to_dict()['type'], new_type)
def test_refresh_schema_doesnt_overwrite_samples(self):
    """Check that a schema refresh keeps an already stored column example.

    With ``samples`` enabled, the first refresh leaves ``example`` unset,
    ``update_sample`` (called with a timestamp 90 days in the past) fills it
    with ``COLUMN_EXAMPLE``, and a further ``refresh_schema`` call must
    leave that value in place.
    """
    self.factory.data_source.query_runner.configuration["samples"] = True

    refresh_schema(self.factory.data_source.id)
    column_metadata = ColumnMetadata.query.first()
    self.assertIsNone(column_metadata.example)

    update_sample(
        self.factory.data_source.id,
        "table",
        1,
        utils.utcnow() - datetime.timedelta(days=90),
    )
    column_metadata = ColumnMetadata.query.first()
    self.assertEqual(column_metadata.example, self.COLUMN_EXAMPLE)

    # Check that a schema refresh doesn't overwrite examples
    refresh_schema(self.factory.data_source.id)
    column_metadata = ColumnMetadata.query.first()
    self.assertEqual(column_metadata.example, self.COLUMN_EXAMPLE)
def test_recent_non_empty_sample_doesnt_refresh(self):
    """Check that an existing example is kept when the last update is recent.

    A first ``update_sample`` call (with ``None`` as its last argument)
    stores ``COLUMN_EXAMPLE``. The patched sample source is then changed to
    return a different value and ``update_sample`` is called again with an
    ISO timestamp five days in the past; the stored example must still be
    ``COLUMN_EXAMPLE``.
    """
    self.factory.data_source.query_runner.configuration['samples'] = True
    ds_id = self.factory.data_source.id

    refresh_schema(ds_id)
    update_sample(ds_id, 'table', 1, None)

    # The first update must have stored a sample.
    first_column = ColumnMetadata.query.first()
    self.assertEqual(first_column.example, self.COLUMN_EXAMPLE)

    replacement_sample = {self.COLUMN_NAME: "a new example"}
    self.patched_get_table_sample.return_value = replacement_sample

    five_days_ago = utils.utcnow() - datetime.timedelta(days=5)
    update_sample(ds_id, 'table', 1, five_days_ago.isoformat())

    # The newly returned value must not replace the stored example.
    first_column = ColumnMetadata.query.first()
    self.assertEqual(first_column.example, self.COLUMN_EXAMPLE)
def test_refresh_schema_table_with_new_metadata_updated(self):
    """Check that the table's ``column_metadata`` flag follows the schema.

    The flag from ``to_dict()`` must be True for the default schema, False
    once the patched schema omits the ``metadata`` key, and True again after
    the default schema is restored.
    """
    ds_id = self.factory.data_source.id

    def refresh_and_get_only_table():
        # Refresh, then verify a single table row and a single column row.
        refresh_schema(ds_id)
        tables = TableMetadata.query.all()
        columns = ColumnMetadata.query.all()
        self.assertEqual(len(tables), 1)
        self.assertEqual(len(columns), 1)
        return tables[0]

    table = refresh_and_get_only_table()
    self.assertTrue(table.to_dict()['column_metadata'])

    # The schema entry carries no `metadata` key, so the flag must be False.
    self.patched_get_schema.return_value = [{
        'name': 'table',
        'columns': [self.COLUMN_NAME],
    }]
    table = refresh_and_get_only_table()
    self.assertFalse(table.to_dict()['column_metadata'])

    # With the default schema restored the flag must be True again.
    self.patched_get_schema.return_value = self.default_schema_return_value
    refresh_schema(ds_id)
    tables = TableMetadata.query.all()
    self.assertTrue(tables[0].to_dict()['column_metadata'])
def test_refresh_schema_deleted_table_marked(self):
    """Check that the table's ``exists`` flag follows the schema.

    The flag from ``to_dict()`` must be True for the default schema, False
    once the patched schema is empty (the row itself is kept), and True
    again after the default schema is restored.
    """
    ds_id = self.factory.data_source.id

    def refresh_and_get_only_table():
        # Refresh, then verify a single table row and a single column row.
        refresh_schema(ds_id)
        tables = TableMetadata.query.all()
        columns = ColumnMetadata.query.all()
        self.assertEqual(len(tables), 1)
        self.assertEqual(len(columns), 1)
        return tables[0]

    table = refresh_and_get_only_table()
    self.assertTrue(table.to_dict()['exists'])

    # The schema no longer lists the table, so `exists` must be False.
    self.patched_get_schema.return_value = []
    table = refresh_and_get_only_table()
    self.assertFalse(table.to_dict()['exists'])

    # With the default schema restored `exists` must be True again.
    self.patched_get_schema.return_value = self.default_schema_return_value
    refresh_schema(ds_id)
    tables = TableMetadata.query.all()
    self.assertTrue(tables[0].to_dict()['exists'])
def test_refresh_schema_deleted_table_marked(self):
    """Check that the serialized table's ``exists`` flag follows the schema.

    The flag must be True for the default schema, False once the patched
    schema is empty (the row itself is kept), and True again after the
    default schema is restored.
    """
    ds_id = self.factory.data_source.id

    def exists_flag(table):
        serializer = TableMetadataSerializer(table, with_favorite_state=False)
        return serializer.serialize()["exists"]

    def refresh_and_get_only_table():
        # Refresh, then verify a single table row and a single column row.
        refresh_schema(ds_id)
        tables = TableMetadata.query.all()
        columns = ColumnMetadata.query.all()
        self.assertEqual(len(tables), 1)
        self.assertEqual(len(columns), 1)
        return tables[0]

    table = refresh_and_get_only_table()
    self.assertTrue(exists_flag(table))

    # The schema no longer lists the table, so `exists` must be False.
    self.patched_get_schema.return_value = []
    table = refresh_and_get_only_table()
    self.assertFalse(exists_flag(table))

    # With the default schema restored `exists` must be True again.
    self.patched_get_schema.return_value = self.default_schema_return_value
    refresh_schema(ds_id)
    tables = TableMetadata.query.all()
    self.assertTrue(exists_flag(tables[0]))
def test_refresh_schema_update_column(self):
    """Check that a changed column type is picked up by a schema refresh.

    After an initial refresh and a ``get_table_sample_data`` call the
    column's ``to_dict()`` must equal ``EXPECTED_COLUMN_METADATA``. The
    patched schema is then altered so the first column reports a different
    type; after a second refresh the dictionary must differ from the
    expected one and carry the new type.
    """
    new_type = 'varchar'
    ds_id = self.factory.data_source.id

    refresh_schema(ds_id)
    sampled_table = {"name": 'table', "columns": [self.COLUMN_NAME]}
    get_table_sample_data(ds_id, sampled_table, 1)

    columns = ColumnMetadata.query.all()
    self.assertEqual(columns[0].to_dict(), self.EXPECTED_COLUMN_METADATA)

    # Work on a deep copy so the shared default schema stays untouched.
    changed_schema = copy.deepcopy(self.default_schema_return_value)
    changed_schema[0]['metadata'][0]['type'] = new_type
    self.patched_get_schema.return_value = changed_schema

    refresh_schema(ds_id)

    columns = ColumnMetadata.query.all()
    first_column = columns[0]
    self.assertNotEqual(first_column.to_dict(), self.EXPECTED_COLUMN_METADATA)
    self.assertEqual(first_column.to_dict()['type'], new_type)
def test_refresh_schema_delete_column(self):
    """Check the ``exists`` flags after the schema swaps one column for another.

    The first refresh stores the default column. The patched schema is then
    replaced by one that lists only a new column; after the second refresh
    two column rows must be present, with ``exists`` False on one and True on
    the other.
    """
    replacement_name = 'new_column'
    ds_id = self.factory.data_source.id

    refresh_schema(ds_id)
    columns = ColumnMetadata.query.all()
    self.assertTrue(columns[0].to_dict()['exists'])

    replacement_column = {
        'name': replacement_name,
        'type': self.COLUMN_TYPE,
    }
    self.patched_get_schema.return_value = [{
        'name': 'table',
        'columns': [replacement_name],
        'metadata': [replacement_column],
    }]
    refresh_schema(ds_id)

    columns = ColumnMetadata.query.all()
    self.assertEqual(len(columns), 2)
    # NOTE(review): the two index-based checks below depend on the row order
    # returned by a query without an explicit ORDER BY -- confirm this is
    # stable on the database used by the test suite.
    self.assertFalse(columns[1].to_dict()['exists'])
    self.assertTrue(columns[0].to_dict()['exists'])
def test_refresh_schema_creates_tables(self):
    """Check the serialized rows after a schema refresh and a sample update.

    Runs ``refresh_schema`` and then ``update_sample`` (with a timestamp 90
    days in the past) for the factory data source, and verifies that exactly
    one table row and one column row exist and that their serialized forms
    match the expected dictionaries.
    """
    expected_table = {
        "id": 1,
        "org_id": 1,
        "data_source_id": 1,
        "name": u"table",
        "description": None,
        "exists": True,
        "visible": True,
        "column_metadata": True,
        "sample_updated_at": None,
        "sample_queries": {},
        "columns": [self.EXPECTED_COLUMN_METADATA],
    }
    ds_id = self.factory.data_source.id

    refresh_schema(ds_id)
    ninety_days_ago = utils.utcnow() - datetime.timedelta(days=90)
    update_sample(ds_id, "table", 1, ninety_days_ago)

    tables = TableMetadata.query.all()
    columns = ColumnMetadata.query.all()

    self.assertEqual(len(tables), 1)
    self.assertEqual(len(columns), 1)

    serialized_table = TableMetadataSerializer(
        tables[0], with_favorite_state=False
    ).serialize()
    self.assertEqual(serialized_table, expected_table)

    serialized_column = ColumnMetadataSerializer(columns[0]).serialize()
    self.assertEqual(serialized_column, self.EXPECTED_COLUMN_METADATA)
def test_refresh_schema_table_with_new_metadata_updated(self):
    """Check that the serialized ``column_metadata`` flag follows the schema.

    The flag must be True for the default schema, False once the patched
    schema omits the ``metadata`` key, and True again after the default
    schema is restored.
    """
    ds_id = self.factory.data_source.id

    def has_column_metadata(table):
        serializer = TableMetadataSerializer(table, with_favorite_state=False)
        return serializer.serialize()["column_metadata"]

    def refresh_and_get_only_table():
        # Refresh, then verify a single table row and a single column row.
        refresh_schema(ds_id)
        tables = TableMetadata.query.all()
        columns = ColumnMetadata.query.all()
        self.assertEqual(len(tables), 1)
        self.assertEqual(len(columns), 1)
        return tables[0]

    table = refresh_and_get_only_table()
    self.assertTrue(has_column_metadata(table))

    # The schema entry carries no `metadata` key, so the flag must be False.
    self.patched_get_schema.return_value = [{
        "name": "table",
        "columns": [self.COLUMN_NAME],
    }]
    table = refresh_and_get_only_table()
    self.assertFalse(has_column_metadata(table))

    # With the default schema restored the flag must be True again.
    self.patched_get_schema.return_value = self.default_schema_return_value
    refresh_schema(ds_id)
    tables = TableMetadata.query.all()
    self.assertTrue(has_column_metadata(tables[0]))