def setUp(self) -> None:
    """Create the ``TableSource`` instance stored on ``self.table_source``."""
    super(TestTableSource, self).setUp()
    self.table_source = TableSource(
        db_name='hive',
        schema=SCHEMA,
        table_name=TABLE,
        cluster=CLUSTER,
        source=SOURCE,
    )
def setUp(self) -> None:
    """Create the ``TableSource`` fixture and precompute two key strings."""
    super(TestTableSource, self).setUp()
    self.table_source = TableSource(
        db_name='hive',
        schema=SCHEMA,
        table_name=TABLE,
        cluster=CLUSTER,
        source=SOURCE,
    )

    # The two keys share a prefix; start_key additionally ends in '/_source'.
    self.end_key = f'{DB}://{CLUSTER}.{SCHEMA}/{TABLE}'
    self.start_key = f'{DB}://{CLUSTER}.{SCHEMA}/{TABLE}/_source'
class TestTableSource(unittest.TestCase):
    """Tests for the keys, nodes and relations produced by ``TableSource``."""

    def setUp(self):
        # type: () -> None
        """Create the ``TableSource`` instance used by every test."""
        super(TestTableSource, self).setUp()
        self.table_source = TableSource(db_name='hive',
                                        schema_name=SCHEMA,
                                        table_name=TABLE,
                                        cluster=CLUSTER,
                                        source=SOURCE)

    def test_get_source_model_key(self):
        # type: () -> None
        """The source model key is the table key followed by ``/_source``."""
        source = self.table_source.get_source_model_key()
        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual(source, '{db}://{cluster}.{schema}/{tbl}/_source'.format(db=DB,
                                                                                  schema=SCHEMA,
                                                                                  tbl=TABLE,
                                                                                  cluster=CLUSTER,
                                                                                  ))

    def test_get_metadata_model_key(self):
        # type: () -> None
        """The metadata model key matches the expected literal table key."""
        metadata = self.table_source.get_metadata_model_key()
        self.assertEqual(metadata, 'hive://default.base/test')

    def test_create_nodes(self):
        # type: () -> None
        """``create_nodes`` returns exactly one node."""
        nodes = self.table_source.create_nodes()
        self.assertEqual(len(nodes), 1)

    def test_create_relation(self):
        # type: () -> None
        """``create_relation`` returns exactly one relation with the expected fields."""
        relations = self.table_source.create_relation()
        self.assertEqual(len(relations), 1)

        start_key = '{db}://{cluster}.{schema}/{tbl}/_source'.format(db=DB,
                                                                     schema=SCHEMA,
                                                                     tbl=TABLE,
                                                                     cluster=CLUSTER)
        end_key = '{db}://{cluster}.{schema}/{tbl}'.format(db=DB,
                                                           schema=SCHEMA,
                                                           tbl=TABLE,
                                                           cluster=CLUSTER)
        expected_relation = {
            RELATION_START_KEY: start_key,
            RELATION_START_LABEL: TableSource.LABEL,
            RELATION_END_KEY: end_key,
            RELATION_END_LABEL: 'Table',
            RELATION_TYPE: TableSource.SOURCE_TABLE_RELATION_TYPE,
            RELATION_REVERSE_TYPE: TableSource.TABLE_SOURCE_RELATION_TYPE
        }
        # assertIn reports both operands on failure, unlike assertTrue(x in y).
        self.assertIn(expected_relation, relations)
class TestTableSource(unittest.TestCase):
    """Tests for ``TableSource`` keys, nodes and its serialized relation."""

    def setUp(self) -> None:
        """Create the ``TableSource`` instance used by every test."""
        super(TestTableSource, self).setUp()
        self.table_source = TableSource(db_name='hive',
                                        schema=SCHEMA,
                                        table_name=TABLE,
                                        cluster=CLUSTER,
                                        source=SOURCE)

    def test_get_source_model_key(self) -> None:
        """The source model key is the table key followed by ``/_source``."""
        source = self.table_source.get_source_model_key()
        self.assertEqual(source, f'{DB}://{CLUSTER}.{SCHEMA}/{TABLE}/_source')

    def test_get_metadata_model_key(self) -> None:
        """The metadata model key matches the expected literal table key."""
        metadata = self.table_source.get_metadata_model_key()
        self.assertEqual(metadata, 'hive://default.base/test')

    def test_create_nodes(self) -> None:
        """``create_nodes`` returns exactly one node."""
        nodes = self.table_source.create_nodes()
        self.assertEqual(len(nodes), 1)

    def test_create_relation(self) -> None:
        """The single relation serializes to the expected dictionary."""
        relations = self.table_source.create_relation()
        # assertEqual replaces the deprecated assertEquals alias and matches
        # the other tests in this class.
        self.assertEqual(len(relations), 1)
        serialized_relation = neo4_serializer.serialize_relationship(relations[0])

        start_key = f'{DB}://{CLUSTER}.{SCHEMA}/{TABLE}/_source'
        end_key = f'{DB}://{CLUSTER}.{SCHEMA}/{TABLE}'

        expected_relation = {
            RELATION_START_KEY: start_key,
            RELATION_START_LABEL: TableSource.LABEL,
            RELATION_END_KEY: end_key,
            RELATION_END_LABEL: 'Table',
            RELATION_TYPE: TableSource.SOURCE_TABLE_RELATION_TYPE,
            RELATION_REVERSE_TYPE: TableSource.TABLE_SOURCE_RELATION_TYPE
        }

        self.assertDictEqual(expected_relation, serialized_relation)
class TestTableSource(unittest.TestCase):
    """Tests for ``TableSource`` keys, nodes and relation serialization.

    The relation is checked through both ``neo4_serializer`` and
    ``neptune_serializer``.
    """

    def setUp(self) -> None:
        """Create the ``TableSource`` fixture and the keys the relation tests expect."""
        super(TestTableSource, self).setUp()
        self.table_source = TableSource(db_name='hive',
                                        schema=SCHEMA,
                                        table_name=TABLE,
                                        cluster=CLUSTER,
                                        source=SOURCE)

        self.start_key = f'{DB}://{CLUSTER}.{SCHEMA}/{TABLE}/_source'
        self.end_key = f'{DB}://{CLUSTER}.{SCHEMA}/{TABLE}'

    def test_get_source_model_key(self) -> None:
        """The source model key is the table key followed by ``/_source``."""
        source = self.table_source.get_source_model_key()
        self.assertEqual(source, f'{DB}://{CLUSTER}.{SCHEMA}/{TABLE}/_source')

    def test_get_metadata_model_key(self) -> None:
        """The metadata model key matches the expected literal table key."""
        metadata = self.table_source.get_metadata_model_key()
        self.assertEqual(metadata, 'hive://default.base/test')

    def test_create_nodes(self) -> None:
        """``create_nodes`` returns exactly one node."""
        nodes = self.table_source.create_nodes()
        self.assertEqual(len(nodes), 1)

    def test_create_relation(self) -> None:
        """The single relation serializes to the expected dictionary."""
        relations = self.table_source.create_relation()
        # assertEqual replaces the deprecated assertEquals alias and matches
        # the other tests in this class.
        self.assertEqual(len(relations), 1)
        serialized_relation = neo4_serializer.serialize_relationship(relations[0])

        expected_relation = {
            RELATION_START_KEY: self.start_key,
            RELATION_START_LABEL: TableSource.LABEL,
            RELATION_END_KEY: self.end_key,
            RELATION_END_LABEL: 'Table',
            RELATION_TYPE: TableSource.SOURCE_TABLE_RELATION_TYPE,
            RELATION_REVERSE_TYPE: TableSource.TABLE_SOURCE_RELATION_TYPE
        }

        self.assertDictEqual(expected_relation, serialized_relation)

    def test_create_relation_neptune(self) -> None:
        """The relation converts to a forward and a reverse Neptune edge row."""
        relations = self.table_source.create_relation()
        serialized_relations = neptune_serializer.convert_relationship(relations[0])

        expected = [
            # Edge from start_key to end_key.
            {
                NEPTUNE_HEADER_ID: "{from_vertex_id}_{to_vertex_id}_{label}".format(
                    from_vertex_id=self.start_key,
                    to_vertex_id=self.end_key,
                    label=TableSource.SOURCE_TABLE_RELATION_TYPE
                ),
                NEPTUNE_RELATIONSHIP_HEADER_FROM: self.start_key,
                NEPTUNE_RELATIONSHIP_HEADER_TO: self.end_key,
                NEPTUNE_HEADER_LABEL: TableSource.SOURCE_TABLE_RELATION_TYPE,
                # ANY: the extraction timestamp value is not pinned by this test.
                NEPTUNE_LAST_EXTRACTED_AT_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT: ANY,
                NEPTUNE_CREATION_TYPE_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT: NEPTUNE_CREATION_TYPE_JOB
            },
            # Edge from end_key back to start_key.
            {
                NEPTUNE_HEADER_ID: "{from_vertex_id}_{to_vertex_id}_{label}".format(
                    from_vertex_id=self.end_key,
                    to_vertex_id=self.start_key,
                    label=TableSource.TABLE_SOURCE_RELATION_TYPE
                ),
                NEPTUNE_RELATIONSHIP_HEADER_FROM: self.end_key,
                NEPTUNE_RELATIONSHIP_HEADER_TO: self.start_key,
                NEPTUNE_HEADER_LABEL: TableSource.TABLE_SOURCE_RELATION_TYPE,
                NEPTUNE_LAST_EXTRACTED_AT_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT: ANY,
                NEPTUNE_CREATION_TYPE_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT: NEPTUNE_CREATION_TYPE_JOB
            }
        ]

        self.assertListEqual(expected, serialized_relations)
def _get_extract_iter(self) -> Iterator[Union[TableMetadata, BadgeMetadata, TableSource, TableLineage]]:
    """
    Yield every model record derived from the dbt manifest and catalog.

    For each manifest node that is a dbt model and is also present in the
    catalog, a ``TableMetadata`` is built. That table, a ``BadgeMetadata``
    and a ``TableSource`` are then yielded, each gated by the matching
    extractor setting and by the schema filter. Once all nodes have been
    visited, ``TableLineage`` records are yielded from the manifest's
    ``child_map`` if lineage extraction is enabled.
    """
    table_key_by_dbt_id = {}

    for node_id, node_manifest in self._dbt_manifest['nodes'].items():
        # Only dbt models that also appear in the catalog are extracted
        if node_manifest['resource_type'] != DBT_MODEL_TYPE or node_id not in self._dbt_catalog['nodes']:
            continue

        LOGGER.info(
            'Extracting dbt {}.{}'.format(node_manifest['schema'], node_manifest[self._model_name_key])
        )

        node_catalog = self._dbt_catalog['nodes'][node_id]

        columns: List[ColumnMetadata] = self._get_column_values(
            manifest_columns=node_manifest['columns'],
            catalog_columns=node_catalog['columns']
        )
        description, description_source = self._get_table_descriptions(node_manifest)
        tags, badge_names = self._get_table_tags_badges(node_manifest)

        table = TableMetadata(
            database=self._default_sanitize(self._database_name),
            # The dbt "database" is the cluster here
            cluster=self._default_sanitize(node_manifest['database']),
            schema=self._default_sanitize(node_manifest['schema']),
            name=self._default_sanitize(node_manifest[self._model_name_key]),
            is_view=node_catalog['metadata']['type'] == 'view',
            columns=columns,
            tags=tags,
            description=description,
            description_source=description_source
        )

        # Recorded for every model (even filtered-out schemas) so lineage can resolve it
        table_key_by_dbt_id[node_id] = table._get_table_key()

        # Optional schema filter: nothing below is yielded for excluded schemas
        if not self._can_yield_schema(node_manifest['schema']):
            continue

        if self._extract_tables:
            yield table

        if self._extract_tags and badge_names:
            yield BadgeMetadata(
                start_label=TableMetadata.TABLE_NODE_LABEL,
                start_key=table._get_table_key(),
                badges=[Badge(badge_name, 'table') for badge_name in badge_names]
            )

        if self._source_url:
            source_location = os.path.join(self._source_url,
                                           node_manifest.get('original_file_path'))
            yield TableSource(
                db_name=table.database,
                cluster=table.cluster,
                schema=table.schema,
                table_name=table.name,
                source=source_location
            )

    if not self._extract_lineage:
        return

    for upstream_id, downstream_ids in self._dbt_manifest['child_map'].items():
        if upstream_id not in table_key_by_dbt_id:
            continue

        # Keep only downstream models for which a table key was recorded above
        downstream_keys = []
        for downstream_id in downstream_ids:
            if not downstream_id.startswith(DBT_MODEL_PREFIX):
                continue
            downstream_key = table_key_by_dbt_id.get(downstream_id)
            if downstream_key:
                downstream_keys.append(downstream_key)

        if downstream_keys:
            yield TableLineage(
                table_key=table_key_by_dbt_id[upstream_id],
                downstream_deps=downstream_keys
            )
class TestTableSource(unittest.TestCase):
    """Tests for ``TableSource`` keys and its serialized nodes, relations and records.

    Nodes and relations are pulled one at a time through the ``create_next_*``
    methods and serialized with ``neo4_serializer``, ``neptune_serializer``
    and ``mysql_serializer``.
    """

    def setUp(self) -> None:
        """Create the ``TableSource`` fixture and the keys the relation tests expect."""
        super(TestTableSource, self).setUp()
        self.table_source = TableSource(
            db_name='hive',
            schema=SCHEMA,
            table_name=TABLE,
            cluster=CLUSTER,
            source=SOURCE,
        )

        self.end_key = f'{DB}://{CLUSTER}.{SCHEMA}/{TABLE}'
        self.start_key = f'{DB}://{CLUSTER}.{SCHEMA}/{TABLE}/_source'

    def test_get_source_model_key(self) -> None:
        """The source model key is the table key followed by ``/_source``."""
        source_key = self.table_source.get_source_model_key()
        self.assertEqual(source_key, f'{DB}://{CLUSTER}.{SCHEMA}/{TABLE}/_source')

    def test_get_metadata_model_key(self) -> None:
        """The metadata model key matches the expected literal table key."""
        metadata_key = self.table_source.get_metadata_model_key()
        self.assertEqual(metadata_key, 'hive://default.base/test')

    def test_create_nodes(self) -> None:
        """Draining ``create_next_node`` gives exactly the one expected serialized node."""
        serialized_nodes = []
        while True:
            node = self.table_source.create_next_node()
            if not node:
                break
            serialized_nodes.append(neo4_serializer.serialize_node(node))

        expected_nodes = [{
            'LABEL': 'Source',
            'KEY': f'{DB}://{CLUSTER}.{SCHEMA}/{TABLE}/_source',
            'source': SOURCE,
            'source_type': 'github'
        }]

        self.assertEqual(expected_nodes, serialized_nodes)

    def test_create_relation(self) -> None:
        """Draining ``create_next_relation`` gives exactly the one expected serialized relation."""
        serialized_relations = []
        while True:
            relation = self.table_source.create_next_relation()
            if not relation:
                break
            serialized_relations.append(neo4_serializer.serialize_relationship(relation))

        expected_relations = [{
            RELATION_START_KEY: self.start_key,
            RELATION_START_LABEL: TableSource.LABEL,
            RELATION_END_KEY: self.end_key,
            RELATION_END_LABEL: 'Table',
            RELATION_TYPE: TableSource.SOURCE_TABLE_RELATION_TYPE,
            RELATION_REVERSE_TYPE: TableSource.TABLE_SOURCE_RELATION_TYPE
        }]

        self.assertEqual(expected_relations, serialized_relations)

    def test_create_relation_neptune(self) -> None:
        """Each relation converts to a forward and a reverse Neptune edge row."""
        converted_relations = []
        while True:
            relation = self.table_source.create_next_relation()
            if not relation:
                break
            converted_relations.append(neptune_serializer.convert_relationship(relation))

        # Vertex ids are the node keys prefixed with their label.
        source_vertex = "Source:" + self.start_key
        table_vertex = "Table:" + self.end_key

        # Each edge id is expected under both the id header and the metadata key property.
        forward_edge_id = "{label}:{from_vertex_id}_{to_vertex_id}".format(
            from_vertex_id=source_vertex,
            to_vertex_id=table_vertex,
            label=TableSource.SOURCE_TABLE_RELATION_TYPE
        )
        reverse_edge_id = "{label}:{from_vertex_id}_{to_vertex_id}".format(
            from_vertex_id=table_vertex,
            to_vertex_id=source_vertex,
            label=TableSource.TABLE_SOURCE_RELATION_TYPE
        )

        forward_edge = {
            NEPTUNE_HEADER_ID: forward_edge_id,
            METADATA_KEY_PROPERTY_NAME: forward_edge_id,
            NEPTUNE_RELATIONSHIP_HEADER_FROM: source_vertex,
            NEPTUNE_RELATIONSHIP_HEADER_TO: table_vertex,
            NEPTUNE_HEADER_LABEL: TableSource.SOURCE_TABLE_RELATION_TYPE,
            # ANY: the extraction timestamp value is not pinned by this test.
            NEPTUNE_LAST_EXTRACTED_AT_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT: ANY,
            NEPTUNE_CREATION_TYPE_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT: NEPTUNE_CREATION_TYPE_JOB
        }
        reverse_edge = {
            NEPTUNE_HEADER_ID: reverse_edge_id,
            METADATA_KEY_PROPERTY_NAME: reverse_edge_id,
            NEPTUNE_RELATIONSHIP_HEADER_FROM: table_vertex,
            NEPTUNE_RELATIONSHIP_HEADER_TO: source_vertex,
            NEPTUNE_HEADER_LABEL: TableSource.TABLE_SOURCE_RELATION_TYPE,
            NEPTUNE_LAST_EXTRACTED_AT_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT: ANY,
            NEPTUNE_CREATION_TYPE_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT: NEPTUNE_CREATION_TYPE_JOB
        }

        # One inner list per relation, holding its forward and reverse rows.
        expected = [[forward_edge, reverse_edge]]

        self.assertListEqual(expected, converted_relations)

    def test_create_records(self) -> None:
        """Draining ``create_next_record`` gives exactly the one expected serialized record."""
        serialized_records = []
        while True:
            record = self.table_source.create_next_record()
            if not record:
                break
            serialized_records.append(mysql_serializer.serialize_record(record))

        expected = [{
            'rk': self.table_source.get_source_model_key(),
            'source': self.table_source.source,
            'source_type': self.table_source.source_type,
            'table_rk': self.table_source.get_metadata_model_key()
        }]

        self.assertEqual(expected, serialized_records)