def test_dashboard_table_records(self) -> None:
    """Each table id yields one serialized MySQL record; afterwards create_next_record() gives None."""
    dashboard_rk = 'product_id_dashboard://cluster_id.dashboard_group_id/dashboard_id'
    first_table_rk = 'hive://gold.schema/table1'
    second_table_rk = 'hive://gold.schema/table2'

    dashboard_table = DashboardTable(
        table_ids=[first_table_rk, second_table_rk],
        cluster='cluster_id',
        product='product_id',
        dashboard_id='dashboard_id',
        dashboard_group_id='dashboard_group_id',
    )

    # Records come back one per table id, in the order the ids were supplied.
    first_record = dashboard_table.create_next_record()
    first_serialized = mysql_serializer.serialize_record(first_record)
    first_expected = {
        'dashboard_rk': dashboard_rk,
        'table_rk': first_table_rk,
    }

    second_record = dashboard_table.create_next_record()
    second_serialized = mysql_serializer.serialize_record(second_record)
    second_expected = {
        'dashboard_rk': dashboard_rk,
        'table_rk': second_table_rk,
    }

    assert first_record is not None
    self.assertDictEqual(first_expected, first_serialized)

    assert second_record is not None
    self.assertDictEqual(second_expected, second_serialized)

    # Both table ids are consumed, so nothing is left to emit.
    self.assertIsNone(dashboard_table.create_next_record())
def test_dashboard_table_nodes(self) -> None:
    """The first create_next_node() call on a DashboardTable is expected to return None."""
    table_ids = ['hive://gold.schema/table1', 'hive://gold.schema/table2']
    dashboard_table = DashboardTable(
        table_ids=table_ids,
        cluster='cluster_id',
        product='product_id',
        dashboard_id='dashboard_id',
        dashboard_group_id='dashboard_group_id',
    )

    self.assertIsNone(dashboard_table.create_next_node())
def test_dashboard_table_with_slash_as_name(self) -> None:
    """A table id with extra slashes in its parts is expected to produce no relation."""
    slashed_table_id = 'bq/name://project/id.schema/name/table/name'
    dashboard_table = DashboardTable(
        table_ids=[slashed_table_id],
        cluster='cluster_id',
        product='product_id',
        dashboard_id='dashboard_id',
        dashboard_group_id='dashboard_group_id',
    )

    self.assertIsNone(dashboard_table.create_next_relation())
def test_dashboard_table_relations(self) -> None:
    """Check the Neo4j and Neptune serializations of the dashboard-to-table relation."""
    table_key = 'hive://gold.schema/table1'
    dashboard_key = 'product_id_dashboard://cluster_id.dashboard_group_id/dashboard_id'
    # Vertex ids as they appear in the expected Neptune output.
    table_vertex = 'Table:hive://gold.schema/table1'
    dashboard_vertex = 'Dashboard:product_id_dashboard://cluster_id.dashboard_group_id/dashboard_id'
    edge_id_template = "{label}:{from_vertex_id}_{to_vertex_id}"

    dashboard_table = DashboardTable(
        table_ids=[table_key],
        cluster='cluster_id',
        product='product_id',
        dashboard_id='dashboard_id',
        dashboard_group_id='dashboard_group_id',
    )

    relation = dashboard_table.create_next_relation()
    neo4j_actual = neo4_serializer.serialize_relationship(relation)
    neptune_actual = neptune_serializer.convert_relationship(relation)

    neo4j_expected = {
        RELATION_END_KEY: table_key,
        RELATION_START_LABEL: 'Dashboard',
        RELATION_END_LABEL: 'Table',
        RELATION_START_KEY: dashboard_key,
        RELATION_TYPE: 'DASHBOARD_WITH_TABLE',
        RELATION_REVERSE_TYPE: 'TABLE_OF_DASHBOARD',
    }

    # The same edge id is expected under both the id header and the metadata key property.
    forward_edge_id = edge_id_template.format(
        from_vertex_id=dashboard_vertex,
        to_vertex_id=table_vertex,
        label='DASHBOARD_WITH_TABLE',
    )
    neptune_forward_expected = {
        NEPTUNE_HEADER_ID: forward_edge_id,
        METADATA_KEY_PROPERTY_NAME: forward_edge_id,
        NEPTUNE_RELATIONSHIP_HEADER_FROM: dashboard_vertex,
        NEPTUNE_RELATIONSHIP_HEADER_TO: table_vertex,
        NEPTUNE_HEADER_LABEL: 'DASHBOARD_WITH_TABLE',
        NEPTUNE_LAST_EXTRACTED_AT_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT: ANY,
        NEPTUNE_CREATION_TYPE_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT: NEPTUNE_CREATION_TYPE_JOB,
    }

    reversed_edge_id = edge_id_template.format(
        from_vertex_id=table_vertex,
        to_vertex_id=dashboard_vertex,
        label='TABLE_OF_DASHBOARD',
    )
    neptune_reversed_expected = {
        NEPTUNE_HEADER_ID: reversed_edge_id,
        METADATA_KEY_PROPERTY_NAME: reversed_edge_id,
        NEPTUNE_RELATIONSHIP_HEADER_FROM: table_vertex,
        NEPTUNE_RELATIONSHIP_HEADER_TO: dashboard_vertex,
        NEPTUNE_HEADER_LABEL: 'TABLE_OF_DASHBOARD',
        NEPTUNE_LAST_EXTRACTED_AT_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT: ANY,
        NEPTUNE_CREATION_TYPE_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT: NEPTUNE_CREATION_TYPE_JOB,
    }

    assert relation is not None
    self.assertDictEqual(neo4j_actual, neo4j_expected)
    # The converted output is indexed as [0] = forward edge, [1] = reversed edge.
    self.assertDictEqual(neptune_actual[0], neptune_forward_expected)
    self.assertDictEqual(neptune_actual[1], neptune_reversed_expected)
def test_dashboard_table_without_dot_as_name(self) -> None:
    """A hyphenated table id is expected to serialize into the usual dashboard-to-table relation."""
    table_key = 'bq-name://project-id.schema-name/table-name'
    dashboard_table = DashboardTable(
        table_ids=[table_key],
        cluster='cluster_id',
        product='product_id',
        dashboard_id='dashboard_id',
        dashboard_group_id='dashboard_group_id',
    )

    relation = dashboard_table.create_next_relation()
    serialized = neo4_serializer.serialize_relationship(relation)

    expected = {
        RELATION_END_KEY: table_key,
        RELATION_START_LABEL: 'Dashboard',
        RELATION_END_LABEL: 'Table',
        RELATION_START_KEY: 'product_id_dashboard://cluster_id.dashboard_group_id/dashboard_id',
        RELATION_TYPE: 'DASHBOARD_WITH_TABLE',
        RELATION_REVERSE_TYPE: 'TABLE_OF_DASHBOARD',
    }

    assert relation is not None
    self.assertDictEqual(serialized, expected)
def test_dashboard_table_relations(self):
    # type: () -> None
    """Compare the value returned by create_next_relation() directly with the expected mapping."""
    table_key = 'hive://gold.schema/table1'
    dashboard_table = DashboardTable(
        table_ids=[table_key],
        cluster='cluster_id',
        product='product_id',
        dashboard_id='dashboard_id',
        dashboard_group_id='dashboard_group_id',
    )

    relation = dashboard_table.create_next_relation()

    expected = {
        RELATION_END_KEY: table_key,
        RELATION_START_LABEL: 'Dashboard',
        RELATION_END_LABEL: 'Table',
        RELATION_START_KEY: 'product_id_dashboard://cluster_id.dashboard_group_id/dashboard_id',
        RELATION_TYPE: 'DASHBOARD_WITH_TABLE',
        RELATION_REVERSE_TYPE: 'TABLE_OF_DASHBOARD',
    }

    # No serializer is involved here: the returned relation itself is compared.
    self.assertDictEqual(relation, expected)
def test_create_next_atlas_relation(self) -> None:
    """Drain every Atlas relation from a DashboardTable and compare the serialized list."""
    dashboard_qualified_name = 'product_id_dashboard://cluster_id.dashboard_group_id/dashboard_id'

    dashboard_table = DashboardTable(
        table_ids=['hive://gold.schema/table1', 'hive_table://gold.schema/table2'],
        cluster='cluster_id',
        product='product_id',
        dashboard_id='dashboard_id',
        dashboard_group_id='dashboard_group_id',
    )

    # 'hive' is the db name compatible with Amundsen Databuilder sourced data;
    # in that case the qualified name equals the Amundsen key.
    # 'hive_table' is the db name compatible with data sourced from the Atlas Hive Hook;
    # in that case a custom qualified name is used.
    table_qualified_names = ("hive://gold.schema/table1", "schema.table2@gold")
    expected = [
        {
            "relationshipType": "Table__Dashboard",
            "entityType1": "Table",
            "entityQualifiedName1": table_qualified_name,
            "entityType2": "Dashboard",
            "entityQualifiedName2": dashboard_qualified_name,
        }
        for table_qualified_name in table_qualified_names
    ]

    actual = []
    relationship = dashboard_table.create_next_atlas_relation()  # type: ignore
    # Keep pulling relations until a falsy value signals there are none left.
    while relationship:
        actual.append(atlas_serializer.serialize_relationship(relationship))
        relationship = dashboard_table.create_next_atlas_relation()

    self.assertEqual(expected, actual)
def _get_extract_iter(self) -> Iterator[Any]:
    """
    Generate the dashboard model objects for every published Redash dashboard.

    Records are pulled from ``self._extractor`` until it returns a falsy value. Each
    record is passed through ``self._transformer`` and skipped unless
    ``self._is_published_dashboard`` accepts it.

    For every remaining dashboard the generator yields, in this order:
    DashboardMetadata, DashboardLastModifiedTimestamp, DashboardOwner, then a
    DashboardQuery followed by a DashboardChart for each visualization widget, and
    finally a single DashboardTable if the optional table parser produced any keys.

    :return: iterator over the model objects listed above
    """
    while True:
        record = self._extractor.extract()
        if not record:
            break  # the end.

        record = next(self._transformer.transform(record=record), None)

        if not self._is_published_dashboard(record):
            continue  # filter this one out

        # Fields shared by every model emitted for this dashboard.
        identity_data = {
            'cluster': self._cluster,
            'product': RedashDashboardExtractor.PRODUCT,
            'dashboard_group_id': str(RedashDashboardExtractor.DASHBOARD_GROUP_ID),
            'dashboard_id': str(record['dashboard_id']),
        }

        dash_data = {
            'dashboard_group': RedashDashboardExtractor.DASHBOARD_GROUP_NAME,
            'dashboard_group_url': self._redash_base_url,
            'dashboard_name': record['dashboard_name'],
            'dashboard_url': f'{self._redash_base_url}/dashboards/{record["dashboard_id"]}',
            'created_timestamp': record['created_timestamp'],
        }
        dash_data.update(identity_data)

        widgets = sort_widgets(record['widgets'])
        text_widgets = get_text_widgets(widgets)
        viz_widgets = get_visualization_widgets(widgets)

        # generate a description for this dashboard, since Redash does not have descriptions
        dash_data['description'] = generate_dashboard_description(text_widgets, viz_widgets)

        yield DashboardMetadata(**dash_data)

        last_mod_data = {'last_modified_timestamp': record['last_modified_timestamp']}
        last_mod_data.update(identity_data)

        yield DashboardLastModifiedTimestamp(**last_mod_data)

        owner_data = {'email': record['user']['email']}
        owner_data.update(identity_data)

        yield DashboardOwner(**owner_data)

        # Collected in a set so a table referenced by several widgets is reported once.
        table_keys = set()

        for viz in viz_widgets:
            query_data = {
                'query_id': str(viz.query_id),
                'query_name': viz.query_name,
                'url': self._redash_base_url + viz.query_relative_url,
                'query_text': viz.raw_query,
            }
            query_data.update(identity_data)
            yield DashboardQuery(**query_data)

            chart_data = {
                'query_id': str(viz.query_id),
                'chart_id': str(viz.visualization_id),
                'chart_name': viz.visualization_name,
                'chart_type': viz.visualization_type,
            }
            chart_data.update(identity_data)
            yield DashboardChart(**chart_data)

            # if a table parser is provided, retrieve tables from this viz
            if self._parse_tables:
                for tbl in self._parse_tables(viz):
                    table_keys.add(tbl.key)

        if table_keys:
            # A set has no stable iteration order; sort the keys so the emitted
            # table_ids (and everything derived from them) are reproducible across runs.
            yield DashboardTable(table_ids=sorted(table_keys), **identity_data)
def test_with_one_dashboard(self) -> None:
    """Run the Redash extractor against a mocked API holding one dashboard and check each record."""

    def mock_api_get(url: str, *args: Any, **kwargs: Any) -> MockApiResponse:
        # URLs containing the dashboard slug get the detail payload; anything else
        # gets the paginated dashboard listing.
        if 'test-dash' in url:
            payload = {
                'id': 123,
                'widgets': [
                    {
                        'visualization': {
                            'query': {
                                'data_source_id': 1,
                                'id': '1234',
                                'name': 'Test Query',
                                'query': 'SELECT id FROM users'
                            }
                        },
                        'options': {}
                    }
                ]
            }
        else:
            payload = {
                'page': 1,
                'count': 1,
                'page_size': 50,
                'results': [
                    {
                        'id': 123,
                        'name': 'Test Dash',
                        'slug': 'test-dash',
                        'created_at': '2020-01-01T00:00:00.000Z',
                        'updated_at': '2020-01-02T00:00:00.000Z',
                        'is_archived': False,
                        'is_draft': False,
                        'user': {'email': '*****@*****.**'}
                    }
                ]
            }
        return MockApiResponse(payload)

    redash_base_url = 'https://redash.example.com'
    table_parser_path = 'tests.unit.extractor.dashboard.redash.test_redash_dashboard_extractor.dummy_tables'
    config = ConfigFactory.from_dict({
        'extractor.redash_dashboard.redash_base_url': redash_base_url,
        'extractor.redash_dashboard.api_base_url': redash_base_url,  # probably not but doesn't matter
        'extractor.redash_dashboard.api_key': 'abc123',
        'extractor.redash_dashboard.table_parser': table_parser_path,
    })

    with patch('databuilder.rest_api.rest_api_query.requests.get') as mock_get:
        mock_get.side_effect = mock_api_get

        extractor = RedashDashboardExtractor()
        scoped_conf = Scoped.get_scoped_conf(conf=config, scope=extractor.get_scope())
        extractor.init(scoped_conf)

        # DashboardMetadata
        metadata = extractor.extract()
        self.assertEqual(metadata.dashboard_id, 123)
        self.assertEqual(metadata.dashboard_name, 'Test Dash')
        self.assertEqual(metadata.dashboard_group_id, RedashDashboardExtractor.DASHBOARD_GROUP_ID)
        self.assertEqual(metadata.dashboard_group, RedashDashboardExtractor.DASHBOARD_GROUP_NAME)
        self.assertEqual(metadata.product, RedashDashboardExtractor.PRODUCT)
        self.assertEqual(metadata.cluster, RedashDashboardExtractor.DEFAULT_CLUSTER)
        self.assertEqual(metadata.created_timestamp, 1577836800)
        self.assertIn(redash_base_url, metadata.dashboard_url)
        self.assertIn('test-dash', metadata.dashboard_url)

        # Identity fields shared by all remaining expected records.
        identity: Dict[str, Any] = {
            'dashboard_id': 123,
            'dashboard_group_id': RedashDashboardExtractor.DASHBOARD_GROUP_ID,
            'product': RedashDashboardExtractor.PRODUCT,
            'cluster': u'prod'
        }

        # DashboardLastModified
        last_modified = extractor.extract()
        expected_timestamp = DashboardLastModifiedTimestamp(
            last_modified_timestamp=1577923200,
            **identity
        )
        self.assertEqual(repr(last_modified), repr(expected_timestamp))

        # DashboardOwner
        owner = extractor.extract()
        expected_owner = DashboardOwner(email='*****@*****.**', **identity)
        self.assertEqual(repr(owner), repr(expected_owner))

        # DashboardQuery
        query = extractor.extract()
        expected_query = DashboardQuery(
            query_id='1234',
            query_name='Test Query',
            url=u'{base}/queries/1234'.format(base=redash_base_url),
            query_text='SELECT id FROM users',
            **identity
        )
        self.assertEqual(repr(query), repr(expected_query))

        # DashboardTable
        table = extractor.extract()
        expected_table_key = TableRelationData('some_db', 'prod', 'public', 'users').key
        expected_table = DashboardTable(table_ids=[expected_table_key], **identity)
        self.assertEqual(repr(table), repr(expected_table))
def _get_extract_iter(self) -> Iterator[Union[DashboardTable, None]]:
    """
    Yield one DashboardTable per dashboard, grouping the table keys of its datasets.

    Details and related objects are fetched for every 'dataset' resource id. A dataset
    contributes a table key only when it has a database id and no SQL of its own. The
    key is built with ``TableMetadata.TABLE_KEY_FORMAT`` from the database's
    SQLAlchemy URI (driver name and database), the configured driver and cluster
    mappings, and the dataset's table name.

    :return: iterator of DashboardTable objects, one per dashboard id that received
        at least one table key
    """
    tables_by_dashboard: Dict[str, set] = dict()

    dataset_ids = self._get_resource_ids('dataset')

    # Fetch everything up front, before any grouping takes place.
    datasets = [
        (self._get_dataset_details(dataset_id), self._get_dataset_related_objects(dataset_id))
        for dataset_id in dataset_ids
    ]

    for dataset_details, dataset_objects in datasets:
        database_id = self.get_nested_field(dataset_details, 'result.database.id')
        if not database_id:
            continue

        database_details = self._get_database_details(database_id)

        sql = self.get_nested_field(dataset_details, 'result.sql') or ''

        # if sql exists then table_name cannot be associated with physical table in db
        if len(sql) > 0:
            continue

        uri = self.get_nested_field(database_details, 'result.sqlalchemy_uri')
        database_spec = make_url(uri)

        # Fall back to the raw driver name when no mapping is configured for it.
        driver_name = database_spec.drivername
        db = self.driver_mapping.get(driver_name, driver_name)
        schema = database_spec.database

        cluster = self.cluster_mapping.get(str(database_id), self.cluster)
        tbl = self.get_nested_field(dataset_details, 'result.table_name')

        table_key = TableMetadata.TABLE_KEY_FORMAT.format(db=db, cluster=cluster, schema=schema, tbl=tbl)

        related_dashboards = dataset_objects.get('dashboards', dict()).get('result', [])
        for dashboard in related_dashboards:
            related_id = str(dashboard.get('id'))
            tables_by_dashboard.setdefault(related_id, set()).add(table_key)

    for dashboard_id, table_keys in tables_by_dashboard.items():
        table_metadata: Dict[str, Any] = {
            'dashboard_id': dashboard_id,
            'table_ids': table_keys,
        }
        table_metadata.update(**self.common_params)

        yield DashboardTable(**table_metadata)