def _create_column_nodes(self, col: ColumnMetadata) -> Iterator[GraphNode]:
    """
    Yield every graph node derived from a single column.

    Emission order: the column node, the description node (only when the
    column has a description), the badge nodes (only when the column has
    badges) and finally the nodes produced by the column's type metadata,
    if any.

    :param col: column to convert into graph nodes
    :return: iterator over the resulting GraphNode objects
    """
    yield GraphNode(
        key=self._get_col_key(col),
        label=ColumnMetadata.COLUMN_NODE_LABEL,
        attributes={
            ColumnMetadata.COLUMN_NAME: col.name,
            ColumnMetadata.COLUMN_TYPE: col.type,
            ColumnMetadata.COLUMN_ORDER: col.sort_order,
        },
    )

    if col.description:
        description_key = self._get_col_description_key(col, col.description)
        yield col.description.get_node(description_key)

    if col.badges:
        badge_source = BadgeMetadata(
            start_label=ColumnMetadata.COLUMN_NODE_LABEL,
            start_key=self._get_col_key(col),
            badges=col.badges,
        )
        yield from badge_source.get_badge_nodes()

    nested_types = col.get_type_metadata()
    if nested_types:
        yield from nested_types.create_node_iterator()
def test_create_relation(self) -> None:
    """
    create_relation() must return exactly two relations, and their neo4j
    serialization must contain one column-to-badge relation per badge.
    """
    relations = self.badge_metada.create_relation()
    serialized_relations = [
        neo4_serializer.serialize_relationship(relation)
        for relation in relations
    ]
    self.assertEqual(len(relations), 2)

    for badge in (badge1, badge2):
        expected_relation = {
            RELATION_START_LABEL: self.badge_metada.start_label,
            RELATION_END_LABEL: BadgeMetadata.BADGE_NODE_LABEL,
            RELATION_START_KEY: self.badge_metada.start_key,
            RELATION_END_KEY: BadgeMetadata.get_badge_key(badge.name),
            RELATION_TYPE: BadgeMetadata.BADGE_RELATION_TYPE,
            RELATION_REVERSE_TYPE: BadgeMetadata.INVERSE_BADGE_RELATION_TYPE,
        }
        # assertIn reports both the missing item and the container on
        # failure, unlike assertTrue(x in y) which only says "False is not true".
        self.assertIn(expected_relation, serialized_relations)
def test_create_relation(self) -> None:
    """
    Drain create_next_relation() and compare the neo4j-serialized output
    with one expected relation per badge, in badge order.
    """
    actual = []
    while True:
        relation = self.badge_metada.create_next_relation()
        if not relation:
            break
        actual.append(neo4_serializer.serialize_relationship(relation))

    expected = [
        {
            RELATION_START_LABEL: self.badge_metada.start_label,
            RELATION_END_LABEL: BadgeMetadata.BADGE_NODE_LABEL,
            RELATION_START_KEY: self.badge_metada.start_key,
            RELATION_END_KEY: BadgeMetadata.get_badge_key(badge.name),
            RELATION_TYPE: BadgeMetadata.BADGE_RELATION_TYPE,
            RELATION_REVERSE_TYPE: BadgeMetadata.INVERSE_BADGE_RELATION_TYPE,
        }
        for badge in (badge1, badge2)
    ]

    self.assertEqual(expected, actual)
def _create_column_relations(
        self, col: ColumnMetadata) -> Iterator[GraphRelationship]:
    """
    Yield every graph relationship derived from a single column.

    Emission order: the table-to-column relationship, the
    column-to-description relationship (only when the column has a
    description), the badge relationships (only when the column has
    badges) and finally the relationships produced by the column's type
    metadata, if any.

    :param col: column to convert into graph relationships
    :return: iterator over the resulting GraphRelationship objects
    """
    yield GraphRelationship(
        start_label=TableMetadata.TABLE_NODE_LABEL,
        start_key=self._get_table_key(),
        end_label=ColumnMetadata.COLUMN_NODE_LABEL,
        end_key=self._get_col_key(col),
        type=TableMetadata.TABLE_COL_RELATION_TYPE,
        reverse_type=TableMetadata.COL_TABLE_RELATION_TYPE,
        attributes={},
    )

    if col.description:
        yield col.description.get_relation(
            ColumnMetadata.COLUMN_NODE_LABEL,
            self._get_col_key(col),
            self._get_col_description_key(col, col.description),
        )

    if col.badges:
        badge_source = BadgeMetadata(
            start_label=ColumnMetadata.COLUMN_NODE_LABEL,
            start_key=self._get_col_key(col),
            badges=col.badges,
        )
        yield from badge_source.get_badge_relations()

    nested_types = col.get_type_metadata()
    if nested_types:
        yield from nested_types.create_relation_iterator()
def setUp(self) -> None:
    """Build the BadgeMetadata fixture (two badges on a hive column) used by the tests."""
    super(TestBadge, self).setUp()
    fixture_kwargs = dict(
        db_name='hive',
        schema=SCHEMA,
        start_label='Column',
        start_key='hive://default.base/test/ds',
        cluster=CLUSTER,
        badges=[badge1, badge2],
    )
    self.badge_metada = BadgeMetadata(**fixture_kwargs)
def _create_next_node(self) -> Iterator[GraphNode]:
    """
    Yield all graph nodes for this table.

    Emission order: table node, table description node (if any), tag
    nodes, then for each column its node, description node and badge
    nodes, and finally the database / cluster / schema nodes that have not
    been emitted yet according to TableMetadata.serialized_nodes_keys.

    :return: iterator over GraphNode objects
    """
    yield self._create_table_node()

    if self.description:
        yield self.description.get_node(
            self._get_table_description_key(self.description))

    # Create the table tag nodes
    for tag in self.tags or ():
        yield TagMetadata(tag).get_node()

    for col in self.columns:
        yield GraphNode(
            key=self._get_col_key(col),
            label=ColumnMetadata.COLUMN_NODE_LABEL,
            attributes={
                ColumnMetadata.COLUMN_NAME: col.name,
                ColumnMetadata.COLUMN_TYPE: col.type,
                ColumnMetadata.COLUMN_ORDER: col.sort_order,
            },
        )

        if col.description:
            description_key = self._get_col_description_key(col, col.description)
            yield col.description.get_node(description_key)

        if col.badges:
            badge_source = BadgeMetadata(
                start_label=ColumnMetadata.COLUMN_NODE_LABEL,
                start_key=self._get_col_key(col),
                badges=col.badges,
            )
            yield from badge_source.get_badge_nodes()

    # Database, cluster, schema
    shared_nodes = [
        GraphNode(key=self._get_database_key(),
                  label=TableMetadata.DATABASE_NODE_LABEL,
                  attributes={'name': self.database}),
        GraphNode(key=self._get_cluster_key(),
                  label=TableMetadata.CLUSTER_NODE_LABEL,
                  attributes={'name': self.cluster}),
        GraphNode(key=self._get_schema_key(),
                  label=TableMetadata.SCHEMA_NODE_LABEL,
                  attributes={'name': self.schema}),
    ]
    for shared_node in shared_nodes:
        # Keys are recorded on the class, so a node already emitted is skipped.
        if shared_node.key in TableMetadata.serialized_nodes_keys:
            continue
        TableMetadata.serialized_nodes_keys.add(shared_node.key)
        yield shared_node
def _load_csv(self) -> None:
    """
    Read the badge and table CSV files and prepare the result iterator.

    Rows of the badge file are grouped by the key of the table they refer
    to (built with ``self._get_key``). One BadgeMetadata is then created
    per row of the table file, carrying the badges collected for that
    table (an empty list when there are none). The resulting iterator is
    stored in ``self._iter``; the raw badge rows are kept in
    ``self.badges``.
    """
    # newline='' is what the csv module asks for so that newlines embedded
    # in quoted fields are interpreted by the reader itself.
    with open(self.badge_file_location, 'r', newline='') as fin:
        self.badges = [dict(row) for row in csv.DictReader(fin)]

    parsed_badges = defaultdict(list)
    for badge_dict in self.badges:
        table_key = self._get_key(badge_dict['database'],
                                  badge_dict['cluster'],
                                  badge_dict['schema'],
                                  badge_dict['table_name'])
        parsed_badges[table_key].append(
            Badge(name=badge_dict['name'], category=badge_dict['category']))

    with open(self.table_file_location, 'r', newline='') as fin:
        tables = [dict(row) for row in csv.DictReader(fin)]

    results = []
    for table_dict in tables:
        table_key = self._get_key(table_dict['database'],
                                  table_dict['cluster'],
                                  table_dict['schema'],
                                  table_dict['name'])
        # .get() avoids inserting empty entries into the defaultdict; a
        # lookup on it can never yield None, so no None check is needed.
        results.append(
            BadgeMetadata(start_label=TableMetadata.TABLE_NODE_LABEL,
                          start_key=table_key,
                          badges=parsed_badges.get(table_key, [])))

    self._iter = iter(results)
def test_create_relation_neptune(self) -> None:
    """
    The Neptune serialization of create_relation() must contain a forward
    and a reversed edge for each of the two badges.
    """
    relations = self.badge_metada.create_relation()
    # Flatten the per-relation lists in a single pass; sum(lists, [])
    # re-copies the accumulator for every element.
    serialized_relations: List[Dict] = [
        edge
        for rel in relations
        for edge in neptune_serializer.convert_relationship(rel)
    ]

    def expected_edge(from_id, to_id, label):
        # Expected bulk-loader row for one directed edge.
        return {
            NEPTUNE_HEADER_ID: "{from_vertex_id}_{to_vertex_id}_{label}".format(
                from_vertex_id=from_id,
                to_vertex_id=to_id,
                label=label,
            ),
            NEPTUNE_RELATIONSHIP_HEADER_FROM: from_id,
            NEPTUNE_RELATIONSHIP_HEADER_TO: to_id,
            NEPTUNE_HEADER_LABEL: label,
            NEPTUNE_LAST_EXTRACTED_AT_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT: ANY,
            NEPTUNE_CREATION_TYPE_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT: NEPTUNE_CREATION_TYPE_JOB,
        }

    start_key = self.badge_metada.start_key
    for badge in (badge1, badge2):
        badge_key = BadgeMetadata.get_badge_key(badge.name)
        # assertIn shows the missing edge and the container on failure.
        self.assertIn(
            expected_edge(start_key, badge_key,
                          BadgeMetadata.BADGE_RELATION_TYPE),
            serialized_relations)
        self.assertIn(
            expected_edge(badge_key, start_key,
                          BadgeMetadata.INVERSE_BADGE_RELATION_TYPE),
            serialized_relations)
def _create_next_relation(self) -> Iterator[GraphRelationship]:
    """
    Yield all graph relationships for this table.

    Emission order: schema-to-table, table-to-description (if any),
    table-to-tag for each tag, then for each column its table-to-column,
    column-to-description and badge relationships, and finally the
    database-to-cluster and cluster-to-schema relationships that have not
    been emitted yet according to TableMetadata.serialized_rels_keys.

    :return: iterator over GraphRelationship objects
    """
    yield GraphRelationship(
        start_key=self._get_schema_key(),
        start_label=TableMetadata.SCHEMA_NODE_LABEL,
        end_key=self._get_table_key(),
        end_label=TableMetadata.TABLE_NODE_LABEL,
        type=TableMetadata.SCHEMA_TABLE_RELATION_TYPE,
        reverse_type=TableMetadata.TABLE_SCHEMA_RELATION_TYPE,
        attributes={},
    )

    if self.description:
        yield self.description.get_relation(
            TableMetadata.TABLE_NODE_LABEL,
            self._get_table_key(),
            self._get_table_description_key(self.description),
        )

    for tag in self.tags or ():
        yield GraphRelationship(
            start_label=TableMetadata.TABLE_NODE_LABEL,
            start_key=self._get_table_key(),
            end_label=TagMetadata.TAG_NODE_LABEL,
            end_key=TagMetadata.get_tag_key(tag),
            type=TableMetadata.TABLE_TAG_RELATION_TYPE,
            reverse_type=TableMetadata.TAG_TABLE_RELATION_TYPE,
            attributes={},
        )

    for col in self.columns:
        yield GraphRelationship(
            start_label=TableMetadata.TABLE_NODE_LABEL,
            start_key=self._get_table_key(),
            end_label=ColumnMetadata.COLUMN_NODE_LABEL,
            end_key=self._get_col_key(col),
            type=TableMetadata.TABLE_COL_RELATION_TYPE,
            reverse_type=TableMetadata.COL_TABLE_RELATION_TYPE,
            attributes={},
        )

        if col.description:
            yield col.description.get_relation(
                ColumnMetadata.COLUMN_NODE_LABEL,
                self._get_col_key(col),
                self._get_col_description_key(col, col.description),
            )

        if col.badges:
            badge_source = BadgeMetadata(
                start_label=ColumnMetadata.COLUMN_NODE_LABEL,
                start_key=self._get_col_key(col),
                badges=col.badges,
            )
            yield from badge_source.create_relation()

    shared_relationships = [
        GraphRelationship(
            start_label=TableMetadata.DATABASE_NODE_LABEL,
            end_label=TableMetadata.CLUSTER_NODE_LABEL,
            start_key=self._get_database_key(),
            end_key=self._get_cluster_key(),
            type=TableMetadata.DATABASE_CLUSTER_RELATION_TYPE,
            reverse_type=TableMetadata.CLUSTER_DATABASE_RELATION_TYPE,
            attributes={},
        ),
        GraphRelationship(
            start_label=TableMetadata.CLUSTER_NODE_LABEL,
            end_label=TableMetadata.SCHEMA_NODE_LABEL,
            start_key=self._get_cluster_key(),
            end_key=self._get_schema_key(),
            type=TableMetadata.CLUSTER_SCHEMA_RELATION_TYPE,
            reverse_type=TableMetadata.SCHEMA_CLUSTER_RELATION_TYPE,
            attributes={},
        ),
    ]
    for shared_rel in shared_relationships:
        # (start, end, type) triples are recorded on the class, so a
        # relationship already emitted is skipped.
        dedup_key = (shared_rel.start_key, shared_rel.end_key, shared_rel.type)
        if dedup_key in TableMetadata.serialized_rels_keys:
            continue
        TableMetadata.serialized_rels_keys.add(dedup_key)
        yield shared_rel
class TestBadge(unittest.TestCase):
    """
    Tests for BadgeMetadata built from two badges attached to a column:
    key generation, constructor validation, and the neo4j, Neptune and
    MySQL serializations of the nodes, relations and records it produces.
    """

    def setUp(self) -> None:
        super(TestBadge, self).setUp()
        self.badge_metada = BadgeMetadata(
            start_label='Column',
            start_key='hive://default.base/test/ds',
            badges=[badge1, badge2],
        )

    @staticmethod
    def _drain(fetch_next, convert) -> list:
        """Call ``fetch_next`` until it returns a falsy value; return each item passed through ``convert``."""
        converted = []
        while True:
            item = fetch_next()
            if not item:
                return converted
            converted.append(convert(item))

    @staticmethod
    def _neptune_edge(label, from_id, to_id) -> dict:
        """Expected Neptune bulk-loader row for one directed edge."""
        edge_id = "{label}:{from_vertex_id}_{to_vertex_id}".format(
            from_vertex_id=from_id,
            to_vertex_id=to_id,
            label=label,
        )
        return {
            NEPTUNE_HEADER_ID: edge_id,
            METADATA_KEY_PROPERTY_NAME_BULK_LOADER_FORMAT: edge_id,
            NEPTUNE_RELATIONSHIP_HEADER_FROM: from_id,
            NEPTUNE_RELATIONSHIP_HEADER_TO: to_id,
            NEPTUNE_HEADER_LABEL: label,
            NEPTUNE_LAST_EXTRACTED_AT_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT: ANY,
            NEPTUNE_CREATION_TYPE_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT: NEPTUNE_CREATION_TYPE_JOB,
        }

    def test_get_badge_key(self) -> None:
        self.assertEqual(self.badge_metada.get_badge_key(badge1.name),
                         badge1.name)

    def test_create_nodes(self) -> None:
        expected = [
            {
                NODE_KEY: BadgeMetadata.BADGE_KEY_FORMAT.format(badge=badge.name),
                NODE_LABEL: BadgeMetadata.BADGE_NODE_LABEL,
                BadgeMetadata.BADGE_CATEGORY: badge.category,
            }
            for badge in (badge1, badge2)
        ]

        actual = self._drain(self.badge_metada.create_next_node,
                             neo4_serializer.serialize_node)

        self.assertEqual(expected, actual)

    def test_create_nodes_neptune(self) -> None:
        actual = self._drain(self.badge_metada.create_next_node,
                             neptune_serializer.convert_node)

        expected = []
        for badge in (badge1, badge2):
            node_key = BadgeMetadata.BADGE_KEY_FORMAT.format(badge=badge.name)
            node_id = BadgeMetadata.BADGE_NODE_LABEL + ":" + node_key
            expected.append({
                NEPTUNE_HEADER_ID: node_id,
                METADATA_KEY_PROPERTY_NAME_BULK_LOADER_FORMAT: node_key,
                NEPTUNE_HEADER_LABEL: BadgeMetadata.BADGE_NODE_LABEL,
                NEPTUNE_LAST_EXTRACTED_AT_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT: ANY,
                NEPTUNE_CREATION_TYPE_NODE_PROPERTY_NAME_BULK_LOADER_FORMAT: NEPTUNE_CREATION_TYPE_JOB,
                BadgeMetadata.BADGE_CATEGORY + ':String(single)': badge.category,
            })

        self.assertEqual(expected, actual)

    def test_bad_key_entity_match(self) -> None:
        # A 'Column' label combined with a table-shaped key must be rejected.
        column_label = 'Column'
        table_key = 'hive://default.base/test'

        with self.assertRaises(Exception):
            BadgeMetadata(start_label=column_label,
                          start_key=table_key,
                          badges=[badge1, badge2])

    def test_bad_entity_label(self) -> None:
        # A 'User' start label must be rejected.
        user_label = 'User'
        table_key = 'hive://default.base/test'

        with self.assertRaises(Exception):
            BadgeMetadata(start_label=user_label,
                          start_key=table_key,
                          badges=[badge1, badge2])

    def test_create_relation(self) -> None:
        actual = self._drain(self.badge_metada.create_next_relation,
                             neo4_serializer.serialize_relationship)

        expected = [
            {
                RELATION_START_LABEL: self.badge_metada.start_label,
                RELATION_END_LABEL: BadgeMetadata.BADGE_NODE_LABEL,
                RELATION_START_KEY: self.badge_metada.start_key,
                RELATION_END_KEY: BadgeMetadata.get_badge_key(badge.name),
                RELATION_TYPE: BadgeMetadata.BADGE_RELATION_TYPE,
                RELATION_REVERSE_TYPE: BadgeMetadata.INVERSE_BADGE_RELATION_TYPE,
            }
            for badge in (badge1, badge2)
        ]

        self.assertEqual(expected, actual)

    def test_create_relation_neptune(self) -> None:
        actual = self._drain(self.badge_metada.create_next_relation,
                             neptune_serializer.convert_relationship)

        start_key = self.badge_metada.start_label + ':' + self.badge_metada.start_key

        # Each relation serializes to a [forward, reversed] pair of edges.
        expected = []
        for badge in (badge1, badge2):
            badge_id = BadgeMetadata.BADGE_NODE_LABEL + ':' + BadgeMetadata.get_badge_key(badge.name)
            expected.append([
                self._neptune_edge(BadgeMetadata.BADGE_RELATION_TYPE,
                                   start_key, badge_id),
                self._neptune_edge(BadgeMetadata.INVERSE_BADGE_RELATION_TYPE,
                                   badge_id, start_key),
            ])

        self.assertEqual(expected, actual)

    def test_create_records(self) -> None:
        expected = [
            {
                'rk': BadgeMetadata.BADGE_KEY_FORMAT.format(badge=badge.name),
                'category': badge.category,
            }
            for badge in (badge1, badge2)
        ]

        actual = self._drain(self.badge_metada.create_next_record,
                             mysql_serializer.serialize_record)

        self.assertEqual(expected, actual)
def test_create_relation_neptune(self) -> None:
    """
    Drain create_next_relation() and check that every relation converts
    to a [forward, reversed] pair of Neptune edges, in badge order.
    """
    actual = []
    while True:
        relation = self.badge_metada.create_next_relation()
        if not relation:
            break
        actual.append(neptune_serializer.convert_relationship(relation))

    def edge_row(label, from_id, to_id):
        # Expected bulk-loader row for one directed edge.
        edge_id = "{label}:{from_vertex_id}_{to_vertex_id}".format(
            from_vertex_id=from_id,
            to_vertex_id=to_id,
            label=label,
        )
        return {
            NEPTUNE_HEADER_ID: edge_id,
            METADATA_KEY_PROPERTY_NAME_BULK_LOADER_FORMAT: edge_id,
            NEPTUNE_RELATIONSHIP_HEADER_FROM: from_id,
            NEPTUNE_RELATIONSHIP_HEADER_TO: to_id,
            NEPTUNE_HEADER_LABEL: label,
            NEPTUNE_LAST_EXTRACTED_AT_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT: ANY,
            NEPTUNE_CREATION_TYPE_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT: NEPTUNE_CREATION_TYPE_JOB,
        }

    start_key = self.badge_metada.start_label + ':' + self.badge_metada.start_key

    expected = []
    for badge in (badge1, badge2):
        badge_id = BadgeMetadata.BADGE_NODE_LABEL + ':' + BadgeMetadata.get_badge_key(badge.name)
        expected.append([
            edge_row(BadgeMetadata.BADGE_RELATION_TYPE, start_key, badge_id),
            edge_row(BadgeMetadata.INVERSE_BADGE_RELATION_TYPE, badge_id, start_key),
        ])

    self.assertEqual(expected, actual)
def _create_next_relation(self) -> Iterator[Any]:
    """
    Yield all relations for this table.

    Relations built here are plain dictionaries keyed by the RELATION_*
    constants; description and badge relations are yielded as returned by
    their own metadata objects. Emission order: schema-to-table,
    table-to-description (if any), table-to-tag for each tag, then for
    each column its table-to-column, column-to-description and badge
    relations, and finally the database-to-cluster and cluster-to-schema
    relations not yet recorded in TableMetadata.serialized_rels.

    :return: iterator over the relations
    """

    def as_relation(start_label, end_label, start_key, end_key, rel_type, reverse_type):
        # Assemble the relation dictionary, keeping the key order stable.
        return {
            RELATION_START_LABEL: start_label,
            RELATION_END_LABEL: end_label,
            RELATION_START_KEY: start_key,
            RELATION_END_KEY: end_key,
            RELATION_TYPE: rel_type,
            RELATION_REVERSE_TYPE: reverse_type,
        }

    yield as_relation(
        TableMetadata.SCHEMA_NODE_LABEL,
        TableMetadata.TABLE_NODE_LABEL,
        self._get_schema_key(),
        self._get_table_key(),
        TableMetadata.SCHEMA_TABLE_RELATION_TYPE,
        TableMetadata.TABLE_SCHEMA_RELATION_TYPE,
    )

    if self.description:
        yield self.description.get_relation(
            TableMetadata.TABLE_NODE_LABEL,
            self._get_table_key(),
            self._get_table_description_key(self.description),
        )

    for tag in self.tags or ():
        yield as_relation(
            TableMetadata.TABLE_NODE_LABEL,
            TagMetadata.TAG_NODE_LABEL,
            self._get_table_key(),
            TagMetadata.get_tag_key(tag),
            TableMetadata.TABLE_TAG_RELATION_TYPE,
            TableMetadata.TAG_TABLE_RELATION_TYPE,
        )

    for col in self.columns:
        yield as_relation(
            TableMetadata.TABLE_NODE_LABEL,
            ColumnMetadata.COLUMN_NODE_LABEL,
            self._get_table_key(),
            self._get_col_key(col),
            TableMetadata.TABLE_COL_RELATION_TYPE,
            TableMetadata.COL_TABLE_RELATION_TYPE,
        )

        if col.description:
            yield col.description.get_relation(
                ColumnMetadata.COLUMN_NODE_LABEL,
                self._get_col_key(col),
                self._get_col_description_key(col, col.description),
            )

        if col.badges:
            badge_source = BadgeMetadata(
                db_name=self._get_database_key(),
                schema=self._get_schema_key(),
                start_label=ColumnMetadata.COLUMN_NODE_LABEL,
                start_key=self._get_col_key(col),
                badges=col.badges,
                cluster=self._get_cluster_key(),
            )
            yield from badge_source.create_relation()

    shared_rels = [
        RelTuple(start_label=TableMetadata.DATABASE_NODE_LABEL,
                 end_label=TableMetadata.CLUSTER_NODE_LABEL,
                 start_key=self._get_database_key(),
                 end_key=self._get_cluster_key(),
                 type=TableMetadata.DATABASE_CLUSTER_RELATION_TYPE,
                 reverse_type=TableMetadata.CLUSTER_DATABASE_RELATION_TYPE),
        RelTuple(start_label=TableMetadata.CLUSTER_NODE_LABEL,
                 end_label=TableMetadata.SCHEMA_NODE_LABEL,
                 start_key=self._get_cluster_key(),
                 end_key=self._get_schema_key(),
                 type=TableMetadata.CLUSTER_SCHEMA_RELATION_TYPE,
                 reverse_type=TableMetadata.SCHEMA_CLUSTER_RELATION_TYPE),
    ]
    for shared_rel in shared_rels:
        # Tuples are recorded on the class, so a relation already emitted
        # is skipped.
        if shared_rel in TableMetadata.serialized_rels:
            continue
        TableMetadata.serialized_rels.add(shared_rel)
        yield as_relation(
            shared_rel.start_label,
            shared_rel.end_label,
            shared_rel.start_key,
            shared_rel.end_key,
            shared_rel.type,
            shared_rel.reverse_type,
        )
def _create_record_iterator(self) -> Iterator[RDSModel]:
    """
    Yield all relational records for this table.

    Emission order: database / cluster / schema records not yet recorded
    in TableMetadata.serialized_records_keys, the table record, the table
    description record (if any), a tag record plus a table-tag link per
    tag, then for each column its record, its description record and, per
    badge, the badge record followed by the column-badge link.

    :return: iterator over RDSModel records
    """
    # Database, Cluster, Schema
    shared_records: List[RDSModel] = [
        RDSDatabase(rk=self._get_database_key(), name=self.database),
        RDSCluster(rk=self._get_cluster_key(),
                   name=self.cluster,
                   database_rk=self._get_database_key()),
        RDSSchema(rk=self._get_schema_key(),
                  name=self.schema,
                  cluster_rk=self._get_cluster_key()),
    ]
    for shared_record in shared_records:
        if shared_record.rk in TableMetadata.serialized_records_keys:
            continue
        TableMetadata.serialized_records_keys.add(shared_record.rk)
        yield shared_record

    # Table
    yield RDSTable(rk=self._get_table_key(),
                   name=self.name,
                   is_view=self.is_view,
                   schema_rk=self._get_schema_key())

    # Table description
    if self.description:
        table_description_rk = self._get_table_description_key(self.description)
        # The description's label selects the model class that stores it.
        if self.description.label == DescriptionMetadata.DESCRIPTION_NODE_LABEL:
            description_model = RDSTableDescription
        else:
            description_model = RDSTableProgrammaticDescription
        yield description_model(
            rk=table_description_rk,
            description_source=self.description.source,
            description=self.description.text,
            table_rk=self._get_table_key(),
        )

    # Tag
    for tag in self.tags:
        yield TagMetadata(tag).get_record()
        yield RDSTableTag(table_rk=self._get_table_key(),
                          tag_rk=TagMetadata.get_tag_key(tag))

    # Column
    for col in self.columns:
        yield RDSTableColumn(rk=self._get_col_key(col),
                             name=col.name,
                             type=col.type,
                             sort_order=col.sort_order,
                             table_rk=self._get_table_key())

        if col.description:
            column_description_rk = self._get_col_description_key(col, col.description)
            yield RDSColumnDescription(
                rk=column_description_rk,
                description_source=col.description.source,
                description=col.description.text,
                column_rk=self._get_col_key(col),
            )

        if col.badges:
            badge_source = BadgeMetadata(
                start_label=ColumnMetadata.COLUMN_NODE_LABEL,
                start_key=self._get_col_key(col),
                badges=col.badges,
            )
            for badge_record in badge_source.get_badge_records():
                yield badge_record
                # The link is yielded after the badge record it points to.
                yield RDSColumnBadge(column_rk=self._get_col_key(col),
                                     badge_rk=badge_record.rk)
class TestBadge(unittest.TestCase):
    """
    Tests for BadgeMetadata built from two badges attached to a column:
    key generation, constructor validation, and the neo4j serialization
    of the nodes and relations it creates.
    """

    def setUp(self) -> None:
        super(TestBadge, self).setUp()
        self.badge_metada = BadgeMetadata(
            start_label='Column',
            start_key='hive://default.base/test/ds',
            badges=[badge1, badge2])

    def test_get_badge_key(self) -> None:
        badge_key = self.badge_metada.get_badge_key(badge1.name)
        self.assertEqual(badge_key, badge1.name)

    def test_create_nodes(self) -> None:
        nodes = self.badge_metada.create_nodes()
        self.assertEqual(len(nodes), 2)

        serialized_nodes = [
            neo4_serializer.serialize_node(node) for node in nodes
        ]

        for badge in (badge1, badge2):
            expected_node = {
                NODE_KEY: BadgeMetadata.BADGE_KEY_FORMAT.format(badge=badge.name),
                NODE_LABEL: BadgeMetadata.BADGE_NODE_LABEL,
                BadgeMetadata.BADGE_CATEGORY: badge.category,
            }
            # assertIn reports the missing node and the container on
            # failure, unlike assertTrue(x in y).
            self.assertIn(expected_node, serialized_nodes)

    def test_bad_key_entity_match(self) -> None:
        # A 'Column' label combined with a table-shaped key must be rejected.
        column_label = 'Column'
        table_key = 'hive://default.base/test'

        self.assertRaises(Exception,
                          BadgeMetadata,
                          start_label=column_label,
                          start_key=table_key,
                          badges=[badge1, badge2])

    def test_bad_entity_label(self) -> None:
        # A 'User' start label must be rejected.
        user_label = 'User'
        table_key = 'hive://default.base/test'

        self.assertRaises(Exception,
                          BadgeMetadata,
                          start_label=user_label,
                          start_key=table_key,
                          badges=[badge1, badge2])

    def test_create_relation(self) -> None:
        relations = self.badge_metada.create_relation()
        serialized_relations = [
            neo4_serializer.serialize_relationship(relation)
            for relation in relations
        ]
        self.assertEqual(len(relations), 2)

        for badge in (badge1, badge2):
            expected_relation = {
                RELATION_START_LABEL: self.badge_metada.start_label,
                RELATION_END_LABEL: BadgeMetadata.BADGE_NODE_LABEL,
                RELATION_START_KEY: self.badge_metada.start_key,
                RELATION_END_KEY: BadgeMetadata.get_badge_key(badge.name),
                RELATION_TYPE: BadgeMetadata.BADGE_RELATION_TYPE,
                RELATION_REVERSE_TYPE: BadgeMetadata.INVERSE_BADGE_RELATION_TYPE,
            }
            self.assertIn(expected_relation, serialized_relations)
class TestBadge(unittest.TestCase):
    """
    Tests for the nodes and relations produced by ``BadgeMetadata``, checked
    through both the neo4j and the Neptune serializers.

    ``badge1`` and ``badge2`` are fixtures defined outside this class.
    """

    def setUp(self) -> None:
        """Build a ``BadgeMetadata`` carrying both fixture badges."""
        super().setUp()
        # The attribute name is kept exactly as spelled ("badge_metada") so
        # that any code outside this class that reads it keeps working.
        self.badge_metada = BadgeMetadata(
            start_label='Column',
            start_key='hive://default.base/test/ds',
            badges=[badge1, badge2])

    def test_get_badge_key(self) -> None:
        """The badge key returned for a badge name equals the name itself."""
        badge_key = self.badge_metada.get_badge_key(badge1.name)
        self.assertEqual(badge_key, badge1.name)

    def test_create_nodes(self) -> None:
        """Exactly two nodes are created, one per badge (neo4j format)."""
        nodes = self.badge_metada.create_nodes()
        self.assertEqual(len(nodes), 2)

        serialized_nodes = [
            neo4_serializer.serialize_node(node)
            for node in nodes
        ]

        for badge in (badge1, badge2):
            expected_node = {
                NODE_KEY: BadgeMetadata.BADGE_KEY_FORMAT.format(badge=badge.name),
                NODE_LABEL: BadgeMetadata.BADGE_NODE_LABEL,
                BadgeMetadata.BADGE_CATEGORY: badge.category
            }
            # assertIn reports both the expected dict and the actual list on
            # failure, unlike assertTrue(expected in actual).
            self.assertIn(expected_node, serialized_nodes)

    def test_create_nodes_neptune(self) -> None:
        """Each badge appears among the Neptune-converted nodes."""
        nodes = self.badge_metada.create_nodes()
        serialized_nodes = [
            neptune_serializer.convert_node(node)
            for node in nodes
        ]

        for badge in (badge1, badge2):
            expected_node = {
                NEPTUNE_HEADER_ID: BadgeMetadata.BADGE_KEY_FORMAT.format(badge=badge.name),
                NEPTUNE_HEADER_LABEL: BadgeMetadata.BADGE_NODE_LABEL,
                # ANY: the extraction timestamp is not pinned by this test.
                NEPTUNE_LAST_EXTRACTED_AT_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT: ANY,
                NEPTUNE_CREATION_TYPE_NODE_PROPERTY_NAME_BULK_LOADER_FORMAT: NEPTUNE_CREATION_TYPE_JOB,
                BadgeMetadata.BADGE_CATEGORY + ':String(single)': badge.category
            }
            self.assertIn(expected_node, serialized_nodes)

    def test_bad_key_entity_match(self) -> None:
        """Construction with a 'Column' label and this start key must raise."""
        column_label = 'Column'
        # NOTE(review): presumably rejected because the key does not look
        # like a column key; confirm against BadgeMetadata's validation.
        table_key = 'hive://default.base/test'

        with self.assertRaises(Exception):
            BadgeMetadata(start_label=column_label,
                          start_key=table_key,
                          badges=[badge1, badge2])

    def test_bad_entity_label(self) -> None:
        """Construction with a 'User' start label must raise."""
        user_label = 'User'
        table_key = 'hive://default.base/test'

        with self.assertRaises(Exception):
            BadgeMetadata(start_label=user_label,
                          start_key=table_key,
                          badges=[badge1, badge2])

    def test_create_relation(self) -> None:
        """Exactly two relations are created, one from the start key to each badge."""
        relations = self.badge_metada.create_relation()
        serialized_relations = [
            neo4_serializer.serialize_relationship(relation)
            for relation in relations
        ]
        self.assertEqual(len(relations), 2)

        for badge in (badge1, badge2):
            expected_relation = {
                RELATION_START_LABEL: self.badge_metada.start_label,
                RELATION_END_LABEL: BadgeMetadata.BADGE_NODE_LABEL,
                RELATION_START_KEY: self.badge_metada.start_key,
                RELATION_END_KEY: BadgeMetadata.get_badge_key(badge.name),
                RELATION_TYPE: BadgeMetadata.BADGE_RELATION_TYPE,
                RELATION_REVERSE_TYPE: BadgeMetadata.INVERSE_BADGE_RELATION_TYPE,
            }
            self.assertIn(expected_relation, serialized_relations)

    def test_create_relation_neptune(self) -> None:
        """Each badge yields a forward and a reversed Neptune edge."""
        relations = self.badge_metada.create_relation()
        # convert_relationship returns a list per relation; flatten them with
        # a comprehension rather than the quadratic sum(..., []).
        serialized_relations: List[Dict] = [
            edge
            for rel in relations
            for edge in neptune_serializer.convert_relationship(rel)
        ]

        start_key = self.badge_metada.start_key
        for badge in (badge1, badge2):
            badge_key = BadgeMetadata.get_badge_key(badge.name)

            neptune_forward_expected = {
                NEPTUNE_HEADER_ID: "{from_vertex_id}_{to_vertex_id}_{label}".format(
                    from_vertex_id=start_key,
                    to_vertex_id=badge_key,
                    label=BadgeMetadata.BADGE_RELATION_TYPE,
                ),
                NEPTUNE_RELATIONSHIP_HEADER_FROM: start_key,
                NEPTUNE_RELATIONSHIP_HEADER_TO: badge_key,
                NEPTUNE_HEADER_LABEL: BadgeMetadata.BADGE_RELATION_TYPE,
                NEPTUNE_LAST_EXTRACTED_AT_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT: ANY,
                NEPTUNE_CREATION_TYPE_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT: NEPTUNE_CREATION_TYPE_JOB
            }
            # Same edge with endpoints swapped and the inverse relation type.
            neptune_reversed_expected = {
                NEPTUNE_HEADER_ID: "{from_vertex_id}_{to_vertex_id}_{label}".format(
                    from_vertex_id=badge_key,
                    to_vertex_id=start_key,
                    label=BadgeMetadata.INVERSE_BADGE_RELATION_TYPE,
                ),
                NEPTUNE_RELATIONSHIP_HEADER_FROM: badge_key,
                NEPTUNE_RELATIONSHIP_HEADER_TO: start_key,
                NEPTUNE_HEADER_LABEL: BadgeMetadata.INVERSE_BADGE_RELATION_TYPE,
                NEPTUNE_LAST_EXTRACTED_AT_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT: ANY,
                NEPTUNE_CREATION_TYPE_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT: NEPTUNE_CREATION_TYPE_JOB
            }

            self.assertIn(neptune_forward_expected, serialized_relations)
            self.assertIn(neptune_reversed_expected, serialized_relations)
def _get_extract_iter(self) -> Iterator[Union[TableMetadata, BadgeMetadata, TableSource, TableLineage]]:
    """
    Generates the extract iterator for all of the model types created by the dbt files.

    Every manifest node that is a dbt model and is also present in the
    catalog is turned into a ``TableMetadata``. For such a node, and only
    when its schema passes ``_can_yield_schema``, the iterator yields in order:
    the table metadata (if ``_extract_tables``), a ``BadgeMetadata`` (if
    ``_extract_tags`` and the node has badges) and a ``TableSource`` (if
    ``_source_url`` is set). After all nodes, if ``_extract_lineage`` is set,
    one ``TableLineage`` is yielded per upstream model that has at least one
    known downstream model.
    """
    table_key_by_dbt_id = {}

    for dbt_id, manifest_entry in self._dbt_manifest['nodes'].items():
        # Only dbt models that the catalog also knows about are extracted.
        is_model = manifest_entry['resource_type'] == DBT_MODEL_TYPE
        if not is_model or dbt_id not in self._dbt_catalog['nodes']:
            continue

        LOGGER.info(
            'Extracting dbt {}.{}'.format(manifest_entry['schema'], manifest_entry[self._model_name_key])
        )

        catalog_entry = self._dbt_catalog['nodes'][dbt_id]

        columns: List[ColumnMetadata] = self._get_column_values(
            manifest_columns=manifest_entry['columns'],
            catalog_columns=catalog_entry['columns']
        )
        description, description_source = self._get_table_descriptions(manifest_entry)
        tags, badge_names = self._get_table_tags_badges(manifest_entry)

        table = TableMetadata(
            database=self._default_sanitize(self._database_name),
            # The dbt "database" is the cluster here
            cluster=self._default_sanitize(manifest_entry['database']),
            schema=self._default_sanitize(manifest_entry['schema']),
            name=self._default_sanitize(manifest_entry[self._model_name_key]),
            is_view=catalog_entry['metadata']['type'] == 'view',
            columns=columns,
            tags=tags,
            description=description,
            description_source=description_source
        )

        # Remember the key even for filtered-out schemas: lineage needs it.
        table_key_by_dbt_id[dbt_id] = table._get_table_key()

        # Optionally filter schemas in the output
        schema_allowed = self._can_yield_schema(manifest_entry['schema'])

        if self._extract_tables and schema_allowed:
            yield table

        if self._extract_tags and badge_names and schema_allowed:
            yield BadgeMetadata(
                start_label=TableMetadata.TABLE_NODE_LABEL,
                start_key=table._get_table_key(),
                badges=[Badge(badge, 'table') for badge in badge_names]
            )

        if self._source_url and schema_allowed:
            yield TableSource(
                db_name=table.database,
                cluster=table.cluster,
                schema=table.schema,
                table_name=table.name,
                source=os.path.join(self._source_url, manifest_entry.get('original_file_path'))
            )

    if not self._extract_lineage:
        return

    for upstream_id, downstream_ids in self._dbt_manifest['child_map'].items():
        if upstream_id not in table_key_by_dbt_id:
            continue

        # Keep only downstream models whose table key was recorded above.
        downstream_keys = []
        for downstream_id in downstream_ids:
            if downstream_id.startswith(DBT_MODEL_PREFIX) and table_key_by_dbt_id.get(downstream_id):
                downstream_keys.append(table_key_by_dbt_id[downstream_id])

        if downstream_keys:
            yield TableLineage(
                table_key=table_key_by_dbt_id[upstream_id],
                downstream_deps=downstream_keys
            )
def setUp(self) -> None:
    """Build the ``BadgeMetadata`` under test with both fixture badges."""
    super(TestBadge, self).setUp()

    fixture_badges = [badge1, badge2]
    self.badge_metada = BadgeMetadata(
        start_label='Column',
        start_key='hive://default.base/test/ds',
        badges=fixture_badges,
    )
def _create_next_node(self) -> Iterator[Any]:  # noqa: C901
    """
    Yield every node derived from this table.

    Order of emission: the table node; the table description node (if any);
    one node per tag; then for each column its node, its description node
    (if any) and its badge nodes (if any); finally the database, cluster and
    schema nodes that are not yet recorded in
    ``TableMetadata.serialized_nodes``.
    """
    table_node = {
        NODE_LABEL: TableMetadata.TABLE_NODE_LABEL,
        NODE_KEY: self._get_table_key(),
        TableMetadata.TABLE_NAME: self.name,
        TableMetadata.IS_VIEW: self.is_view,
    }
    # Extra attributes never override the core keys set above.
    for attr_name, attr_value in (self.attrs or {}).items():
        table_node.setdefault(attr_name, attr_value)
    yield table_node

    if self.description:
        description_key = self._get_table_description_key(self.description)
        yield self.description.get_node_dict(description_key)

    # Create the table tag node
    for tag in self.tags or ():
        yield TagMetadata.create_tag_node(tag)

    for col in self.columns:
        column_node = {
            NODE_LABEL: ColumnMetadata.COLUMN_NODE_LABEL,
            NODE_KEY: self._get_col_key(col),
            ColumnMetadata.COLUMN_NAME: col.name,
            ColumnMetadata.COLUMN_TYPE: col.type,
            ColumnMetadata.COLUMN_ORDER: col.sort_order,
        }
        yield column_node

        if col.description:
            description_key = self._get_col_description_key(col, col.description)
            yield col.description.get_node_dict(description_key)

        if col.badges:
            column_badges = BadgeMetadata(
                db_name=self._get_database_key(),
                schema=self._get_schema_key(),
                start_label=ColumnMetadata.COLUMN_NODE_LABEL,
                start_key=self._get_col_key(col),
                badges=col.badges,
                cluster=self._get_cluster_key(),
            )
            yield from column_badges.create_nodes()

    # Database, cluster, schema
    container_nodes = (
        NodeTuple(key=self._get_database_key(),
                  name=self.database,
                  label=TableMetadata.DATABASE_NODE_LABEL),
        NodeTuple(key=self._get_cluster_key(),
                  name=self.cluster,
                  label=TableMetadata.CLUSTER_NODE_LABEL),
        NodeTuple(key=self._get_schema_key(),
                  name=self.schema,
                  label=TableMetadata.SCHEMA_NODE_LABEL),
    )

    for container in container_nodes:
        # Each container node is emitted once per class-level registry.
        if container in TableMetadata.serialized_nodes:
            continue
        TableMetadata.serialized_nodes.add(container)
        yield {
            NODE_LABEL: container.label,
            NODE_KEY: container.key,
            'name': container.name
        }