def create_table_metadata(self, table: ScrapedTableMetadata) -> TableMetadata:
    """Convert a ScrapedTableMetadata record into Amundsen's TableMetadata model."""
    # A missing/empty column list yields a table with no columns.
    columns = [
        ColumnMetadata(name=col.name,
                       description=col.description,
                       col_type=col.data_type,
                       sort_order=col.sort_order)
        for col in (table.columns or [])
    ]
    return TableMetadata(self._db,
                         self._cluster,
                         table.schema,
                         table.table,
                         table.get_table_description(),
                         columns,
                         table.is_view)
def test_extraction_with_single_result(self):
    # type: () -> None
    """Column rows for one table group into a single TableMetadata, then None."""
    with patch.object(SQLAlchemyExtractor, '_get_connection') as mock_connection:
        connection = MagicMock()
        mock_connection.return_value = connection
        sql_execute = MagicMock()
        connection.execute = sql_execute
        # Table-level fields repeated on every column row the SQL returns.
        table = {'schema': 'test_schema',
                 'name': 'test_table',
                 'description': 'a table for testing'}
        sql_execute.return_value = [
            self._union(
                {'col_name': 'col_id1', 'col_type': 'bigint', 'col_description': 'description of id1', 'col_sort_order': 0}, table),
            self._union(
                {'col_name': 'col_id2', 'col_type': 'bigint', 'col_description': 'description of id2', 'col_sort_order': 1}, table),
            self._union(
                {'col_name': 'is_active', 'col_type': 'boolean', 'col_description': None, 'col_sort_order': 2}, table),
            self._union(
                {'col_name': 'source', 'col_type': 'varchar', 'col_description': 'description of source', 'col_sort_order': 3}, table),
            self._union(
                {'col_name': 'etl_created_at', 'col_type': 'timestamp', 'col_description': 'description of etl_created_at', 'col_sort_order': 4}, table),
            self._union(
                {'col_name': 'ds', 'col_type': 'varchar', 'col_description': None, 'col_sort_order': 5}, table)
        ]
        extractor = HiveTableMetadataExtractor()
        extractor.init(self.conf)
        actual = extractor.extract()
        expected = TableMetadata('hive', 'gold', 'test_schema', 'test_table', 'a table for testing',
                                 [ColumnMetadata('col_id1', 'description of id1', 'bigint', 0),
                                  ColumnMetadata('col_id2', 'description of id2', 'bigint', 1),
                                  ColumnMetadata('is_active', None, 'boolean', 2),
                                  ColumnMetadata('source', 'description of source', 'varchar', 3),
                                  ColumnMetadata('etl_created_at', 'description of etl_created_at', 'timestamp', 4),
                                  ColumnMetadata('ds', None, 'varchar', 5)])
        self.assertEqual(expected.__repr__(), actual.__repr__())
        # Only one table was present, so a second extract() is exhausted.
        self.assertIsNone(extractor.extract())
def test_feature_table_extraction(self) -> None:
    """A mocked feature table is extracted exactly once as a TableMetadata."""
    self._init_extractor(programmatic_description_enabled=False)
    self.extractor._client.list_projects.return_value = ["default"]
    self._mock_feature_table()

    extracted = self.extractor.extract()

    self.extractor._client.get_entity.assert_called_with("driver_id", project="default")
    expected_columns = [
        ColumnMetadata("driver_id", "Internal identifier of the driver", "INT64", 0),
        ColumnMetadata("trips_today", None, "INT32", 1),
    ]
    expected = TableMetadata(
        database="feast",
        cluster="unittest-feast-instance",
        schema="default",
        name="driver_trips",
        description=None,
        columns=expected_columns,
    )
    self.assertEqual(repr(expected), repr(extracted))
    # The extractor is exhausted after the single feature table.
    self.assertIsNone(self.extractor.extract())
def test_tags_arent_populated_from_empty_list_and_str(self) -> None:
    """Neither an empty tag list nor an empty tag string may emit Tag nodes."""
    self.table_metadata6 = TableMetadata(
        'hive', 'gold', 'test_schema6', 'test_table6', 'test_table6',
        [ColumnMetadata('test_id1', 'description of test_table1', 'bigint', 0)],
        tags=[])
    self.table_metadata7 = TableMetadata(
        'hive', 'gold', 'test_schema7', 'test_table7', 'test_table7',
        [ColumnMetadata('test_id1', 'description of test_table1', 'bigint', 0)],
        tags="")
    # Drain every node of both tables; none may carry the 'Tag' label.
    for metadata in (self.table_metadata6, self.table_metadata7):
        node_row = metadata.next_node()
        while node_row:
            serialized = neo4_serializer.serialize_node(node_row)
            self.assertNotEqual(serialized.get('LABEL'), 'Tag')
            node_row = metadata.next_node()
def test_create_table_metadata(self) -> None:
    """create_table_metadata maps scraped schema/table/columns straight across."""
    scraped = ScrapedTableMetadata(schema="test_schema1", table="test_table1")
    scraped.set_columns([
        ScrapedColumnMetadata(name="a", description=None, data_type="string", sort_order=0),
        ScrapedColumnMetadata(name="b", description=None, data_type="int", sort_order=1),
    ])

    actual = self.dExtractor.create_table_metadata(scraped)

    expected = TableMetadata(
        "test_database",
        "test_cluster",
        "test_schema1",
        "test_table1",
        description=None,
        columns=[
            ColumnMetadata("a", None, "string", 0),
            ColumnMetadata("b", None, "int", 1),
        ],
    )
    self.assertEqual(str(expected), str(actual))
def test_col_badge_field(self) -> None:
    """Column badges serialize as Badge nodes and Column->Badge HAS_BADGE relations."""
    self.table_metadata4 = TableMetadata(
        'hive', 'gold', 'test_schema4', 'test_table4', 'test_table4',
        [ColumnMetadata('test_id1', 'description of test_table1', 'bigint', 0,
                        ['col-badge1', 'col-badge2'])],
        is_view=False, attr1='uri', attr2='attr2')
    node_row = self.table_metadata4.next_node()
    actual = []
    while node_row:
        actual.append(node_row)
        node_row = self.table_metadata4.next_node()
    # Badge nodes land at indices 4 and 5, after the table, description,
    # column and column-description nodes.
    self.assertEqual(actual[4].get('KEY'), 'col-badge1')
    self.assertEqual(actual[5].get('KEY'), 'col-badge2')
    relation_row = self.table_metadata4.next_relation()
    actual = []
    while relation_row:
        actual.append(relation_row)
        relation_row = self.table_metadata4.next_relation()
    expected_col_badge_rel1 = {'END_KEY': 'col-badge1', 'START_LABEL': 'Column',
                               'END_LABEL': 'Badge',
                               'START_KEY': 'hive://gold.test_schema4/test_table4/test_id1',
                               'TYPE': 'HAS_BADGE', 'REVERSE_TYPE': 'BADGE_FOR'}
    expected_col_badge_rel2 = {'END_KEY': 'col-badge2', 'START_LABEL': 'Column',
                               'END_LABEL': 'Badge',
                               'START_KEY': 'hive://gold.test_schema4/test_table4/test_id1',
                               'TYPE': 'HAS_BADGE', 'REVERSE_TYPE': 'BADGE_FOR'}
    self.assertEqual(actual[4], expected_col_badge_rel1)
    self.assertEqual(actual[5], expected_col_badge_rel2)
def _load_csv(self) -> None:
    """
    Load the table and column CSV files and build an iterator of
    TableMetadata records, one per table row.
    """
    # Group column records by their owning table's key so each table can be
    # assembled with its full column list in one pass.
    with open(self.column_file_location, 'r') as fin:
        self.columns = [dict(i) for i in csv.DictReader(fin)]

    parsed_columns = defaultdict(list)
    for column_dict in self.columns:
        db = column_dict['database']
        cluster = column_dict['cluster']
        schema = column_dict['schema']
        table_name = column_dict['table_name']
        table_key = self._get_key(db, cluster, schema, table_name)
        column = ColumnMetadata(name=column_dict['name'],
                                description=column_dict['description'],
                                col_type=column_dict['col_type'],
                                sort_order=int(column_dict['sort_order']))
        parsed_columns[table_key].append(column)

    # Create Table Dictionary
    with open(self.table_file_location, 'r') as fin:
        tables = [dict(i) for i in csv.DictReader(fin)]

    results = []
    for table_dict in tables:
        db = table_dict['database']
        cluster = table_dict['cluster']
        schema = table_dict['schema']
        table_name = table_dict['name']
        table_key = self._get_key(db, cluster, schema, table_name)
        # defaultdict returns [] for tables without columns, so no None
        # check is needed.
        columns = parsed_columns[table_key]
        table = TableMetadata(
            database=table_dict['database'],
            cluster=table_dict['cluster'],
            schema=table_dict['schema'],
            name=table_dict['name'],
            description=table_dict['description'],
            columns=columns,
            # Parse the stringified boolean explicitly: the previous
            # bool(...) treated any non-empty string ('false' included)
            # as True.
            is_view=table_dict['is_view'].strip().lower() in ('true', '1', 'yes'),
            tags=table_dict['tags'])
        results.append(table)
    self._iter = iter(results)
def test_tags_populated_from_str(self) -> None: self.table_metadata5 = TableMetadata( 'hive', 'gold', 'test_schema5', 'test_table5', 'test_table5', [ ColumnMetadata('test_id1', 'description of test_table1', 'bigint', 0) ], tags="tag3, tag4") # Test table tag field populated from str node_row = self.table_metadata5.next_node() actual = [] while node_row: actual.append(node_row) node_row = self.table_metadata5.next_node() self.assertEqual(actual[2].get('LABEL'), 'Tag') self.assertEqual(actual[2].get('KEY'), 'tag3') self.assertEqual(actual[3].get('KEY'), 'tag4') relation_row = self.table_metadata5.next_relation() actual = [] while relation_row: actual.append(relation_row) relation_row = self.table_metadata5.next_relation() # Table tag relationship expected_tab_tag_rel3 = { 'END_KEY': 'tag3', 'START_LABEL': 'Table', 'END_LABEL': 'Tag', 'START_KEY': 'hive://gold.test_schema5/test_table5', 'TYPE': 'TAGGED_BY', 'REVERSE_TYPE': 'TAG' } expected_tab_tag_rel4 = { 'END_KEY': 'tag4', 'START_LABEL': 'Table', 'END_LABEL': 'Tag', 'START_KEY': 'hive://gold.test_schema5/test_table5', 'TYPE': 'TAGGED_BY', 'REVERSE_TYPE': 'TAG' } self.assertEqual(actual[2], expected_tab_tag_rel3) self.assertEqual(actual[3], expected_tab_tag_rel4)
def test_extraction_with_single_result(self):
    # type: () -> None
    """One Glue table with mixed commented/uncommented columns extracts cleanly."""
    with patch.object(GlueExtractor, '_search_tables') as mock_search:
        mock_search.return_value = [{
            'Name': 'test_table',
            'DatabaseName': 'test_schema',
            'Description': 'a table for testing',
            'StorageDescriptor': {
                'Columns': [{'Name': 'col_id1', 'Type': 'bigint', 'Comment': 'description of id1'},
                            {'Name': 'col_id2', 'Type': 'bigint', 'Comment': 'description of id2'},
                            # Columns without 'Comment' must map to a None description.
                            {'Name': 'is_active', 'Type': 'boolean'},
                            {'Name': 'source', 'Type': 'varchar', 'Comment': 'description of source'},
                            {'Name': 'etl_created_at', 'Type': 'timestamp', 'Comment': 'description of etl_created_at'},
                            {'Name': 'ds', 'Type': 'varchar'}]
            }
        }]
        extractor = GlueExtractor()
        extractor.init(self.conf)
        actual = extractor.extract()
        expected = TableMetadata(
            'glue', 'gold', 'test_schema', 'test_table', 'a table for testing',
            [ColumnMetadata('col_id1', 'description of id1', 'bigint', 0),
             ColumnMetadata('col_id2', 'description of id2', 'bigint', 1),
             ColumnMetadata('is_active', None, 'boolean', 2),
             ColumnMetadata('source', 'description of source', 'varchar', 3),
             ColumnMetadata('etl_created_at', 'description of etl_created_at', 'timestamp', 4),
             ColumnMetadata('ds', None, 'varchar', 5)])
        self.assertEqual(expected.__repr__(), actual.__repr__())
        # Single table only, so the extractor is now exhausted.
        self.assertIsNone(extractor.extract())
def setUp(self) -> None:
    """Build the query / query-execution fixtures shared by the tests."""
    super(TestQueryExecution, self).setUp()
    self.maxDiff = None  # show full diffs on assertion failures
    self.table_metadata = TableMetadata(
        'hive', 'gold', 'test_schema1', 'test_table1', 'test_table1',
        [ColumnMetadata('field', '', '', 0)],
    )
    self.query_metadata = QueryMetadata(
        sql="select * from table a where a.field > 3",
        tables=[self.table_metadata],
    )
    self.query_join_metadata = QueryExecutionsMetadata(
        query_metadata=self.query_metadata,
        start_time=10,
        execution_count=7,
    )
    self._expected_key = '748c28f86de411b1d2b9deb6ae105eba-10'
def test_tags_field(self):
    # type: () -> None
    """Table-level and column-level tags serialize as Tag nodes and relations."""
    self.table_metadata4 = TableMetadata(
        'hive', 'gold', 'test_schema4', 'test_table4', 'test_table4',
        [ColumnMetadata('test_id1', 'description of test_table1', 'bigint', 0,
                        ['col-tag1', 'col-tag2'])],
        is_view=False, tags=['tag1', 'tag2'], attr1='uri', attr2='attr2')
    node_row = self.table_metadata4.next_node()
    actual = []
    while node_row:
        actual.append(node_row)
        node_row = self.table_metadata4.next_node()
    # Custom attributes land on the first (Table) node; tag nodes follow
    # the table + description nodes, column tags come after column nodes.
    self.assertEqual(actual[0].get('attr1'), 'uri')
    self.assertEqual(actual[0].get('attr2'), 'attr2')
    self.assertEqual(actual[2].get('LABEL'), 'Tag')
    self.assertEqual(actual[2].get('KEY'), 'tag1')
    self.assertEqual(actual[3].get('KEY'), 'tag2')
    self.assertEqual(actual[6].get('KEY'), 'col-tag1')
    self.assertEqual(actual[7].get('KEY'), 'col-tag2')
    relation_row = self.table_metadata4.next_relation()
    actual = []
    while relation_row:
        actual.append(relation_row)
        relation_row = self.table_metadata4.next_relation()
    # Table tag relationship
    expected_tab_tag_rel1 = {'END_KEY': 'tag1', 'START_LABEL': 'Table',
                             'END_LABEL': 'Tag',
                             'START_KEY': 'hive://gold.test_schema4/test_table4',
                             'TYPE': 'TAGGED_BY', 'REVERSE_TYPE': 'TAG'}
    expected_tab_tag_rel2 = {'END_KEY': 'tag2', 'START_LABEL': 'Table',
                             'END_LABEL': 'Tag',
                             'START_KEY': 'hive://gold.test_schema4/test_table4',
                             'TYPE': 'TAGGED_BY', 'REVERSE_TYPE': 'TAG'}
    # NOTE(review): the column-tag relations below start from the *table*
    # key/label rather than the column's — confirm this matches
    # TableMetadata's serialization of column tags and is not a fixture typo.
    expected_col_tag_rel1 = {'END_KEY': 'col-tag1', 'START_LABEL': 'Table',
                             'END_LABEL': 'Tag',
                             'START_KEY': 'hive://gold.test_schema4/test_table4',
                             'TYPE': 'TAGGED_BY', 'REVERSE_TYPE': 'TAG'}
    expected_col_tag_rel2 = {'END_KEY': 'col-tag2', 'START_LABEL': 'Table',
                             'END_LABEL': 'Tag',
                             'START_KEY': 'hive://gold.test_schema4/test_table4',
                             'TYPE': 'TAGGED_BY', 'REVERSE_TYPE': 'TAG'}
    self.assertEqual(actual[2], expected_tab_tag_rel1)
    self.assertEqual(actual[3], expected_tab_tag_rel2)
    self.assertEqual(actual[6], expected_col_tag_rel1)
    self.assertEqual(actual[7], expected_col_tag_rel2)
def _load_csv(self) -> None:
    """
    Load the table and column CSV files and build an iterator of
    TableMetadata records, one per table row.
    """
    with open(self.column_file_location, 'r') as fin:
        self.columns = [dict(i) for i in csv.DictReader(fin)]

    # Group columns by their owning table's key so each table can pick
    # up its full column list in one pass.
    parsed_columns = defaultdict(list)
    for column_dict in self.columns:
        db = column_dict['database']
        cluster = column_dict['cluster']
        schema = column_dict['schema']
        table_name = column_dict['table_name']
        table_key = self._get_key(db, cluster, schema, table_name)
        column = ColumnMetadata(name=column_dict['name'],
                                description=column_dict['description'],
                                col_type=column_dict['col_type'],
                                sort_order=int(column_dict['sort_order']))
        parsed_columns[table_key].append(column)

    # Create Table Dictionary
    with open(self.table_file_location, 'r') as fin:
        tables = [dict(i) for i in csv.DictReader(fin)]

    results = []
    for table_dict in tables:
        db = table_dict['database']
        cluster = table_dict['cluster']
        schema = table_dict['schema']
        table_name = table_dict['name']
        table_key = self._get_key(db, cluster, schema, table_name)
        # defaultdict returns [] for unknown keys; no None check required.
        columns = parsed_columns[table_key]
        # Parse the stringified boolean explicitly: passing the raw CSV
        # string made 'false' truthy downstream.
        is_view = table_dict['is_view'].strip().lower() in ('true', '1', 'yes')
        results.append(TableMetadata(database=table_dict['database'],
                                     cluster=table_dict['cluster'],
                                     schema=table_dict['schema'],
                                     name=table_dict['name'],
                                     description=table_dict['description'],
                                     columns=columns,
                                     is_view=is_view,
                                     tags=table_dict['tags']))
    self._iter = iter(results)
def _get_extract_iter(self) -> Iterator[TableMetadata]:
    """
    Group the raw row iterator by table key with itertools.groupby and
    yield one TableMetadata per table, carrying that table's columns.
    :return:
    """
    for _, table_rows in groupby(self._get_raw_extract_iter(), self._get_table_key):
        columns = []
        for last_row in table_rows:
            columns.append(
                ColumnMetadata(last_row['col_name'],
                               last_row['col_description'],
                               last_row['col_type'],
                               last_row['col_sort_order']))
        # Table-level fields are identical on every row; read them off the
        # final row of the group.
        yield TableMetadata(self._database,
                            last_row['cluster'],
                            last_row['schema'],
                            last_row['name'],
                            last_row['description'],
                            columns)
def _get_extract_iter(self) -> Iterator[TableMetadata]:
    '''
    Group the raw row iterator by table key with itertools.groupby and
    yield one TableMetadata per table.
    :return:
    '''
    for _, table_rows in groupby(self._get_raw_extract_iter(), self._get_table_key):
        columns = []
        for last_row in table_rows:
            columns.append(
                ColumnMetadata(last_row['col_name'],
                               last_row['col_description'],
                               last_row['col_type'],
                               last_row['col_sort_order']))
        # Tags arrive as a separator-joined string; empty means no tags.
        tags = last_row['tags'].split(self._tags_separator) if last_row['tags'] else None
        yield TableMetadata(last_row['database'],
                            last_row['cluster'],
                            last_row['schema'],
                            last_row['name'],
                            last_row['description'],
                            columns,
                            last_row['is_view'] == 'true',
                            tags)
def _get_extract_iter(self):
    # type: () -> Iterator[TableMetadata]
    """
    Yield one TableMetadata per Glue table returned by the raw iterator.
    :return:
    """
    for row in self._get_raw_extract_iter():
        # Column position within the StorageDescriptor defines sort order.
        columns = [
            ColumnMetadata(
                column['Name'],
                column.get('Comment'),  # Glue column comments are optional
                column['Type'],
                i)
            for i, column in enumerate(row['StorageDescriptor']['Columns'])
        ]
        yield TableMetadata(
            'glue',
            self._cluster,
            row['DatabaseName'],
            row['Name'],
            row.get('Description'),  # table description is optional too
            columns)
def _extract_topic_data(self) -> Iterator[TableMetadata]:
    """
    Yield one TableMetadata per schema-registry subject, with one column
    per top-level field of the subject's schema.
    """
    for subject in self.subjects:
        # TODO: only version 1 is fetched; iterate all subject versions.
        response = requests.get(
            f"http://localhost:8081/subjects/{subject}/versions/1").json()
        schema = json.loads(response['schema'])
        # TODO: flatten nested record fields recursively instead of only
        # reading the top level.
        fields = [
            ColumnMetadata(field['name'].replace("-", "_"),
                           'a comment',
                           str(field['type']),
                           sort_order)
            for sort_order, field in enumerate(schema['fields'])
        ]
        # TODO: introduce a dedicated Kafka metadata model instead of
        # reusing TableMetadata with placeholder values.
        yield TableMetadata('kafka2', 'gold', 'test_schema',
                            subject.replace("-", "_"),
                            'description', fields, True,
                            ["a tag", "second tag"])
def _get_extract_iter(self):
    # type: () -> Iterator[TableMetadata]
    """
    Group the raw row iterator by table key and yield one TableMetadata per
    Druid datasource. The source exposes no table or column descriptions,
    so empty strings are emitted for both.
    :return:
    """
    for _, table_rows in groupby(self._get_raw_extract_iter(), self._get_table_key):
        columns = []
        for last_row in table_rows:
            columns.append(ColumnMetadata(name=last_row['col_name'],
                                          description='',
                                          col_type=last_row['col_type'],
                                          sort_order=last_row['col_sort_order']))
        yield TableMetadata(database='druid',
                            cluster=self._cluster,
                            schema=last_row['schema'],
                            name=last_row['name'],
                            description='',
                            columns=columns)
def test(self):
    # type: () -> None
    """A SQL statement resolves to a per-table '*' ColumnReader via Hive metadata."""
    config = ConfigFactory.from_dict({
        SqlToTblColUsageTransformer.DATABASE_NAME: 'database',
        SqlToTblColUsageTransformer.USER_EMAIL_ATTRIBUTE_NAME: 'email',
        SqlToTblColUsageTransformer.SQL_STATEMENT_ATTRIBUTE_NAME: 'statement'
    })
    with patch.object(HiveTableMetadataExtractor, 'extract') as mock_extract,\
            patch.object(HiveTableMetadataExtractor, 'init'):
        # First call supplies the table definition; trailing None signals
        # extractor exhaustion.
        mock_extract.side_effect = [
            TableMetadata('hive', 'gold', 'test_schema1', 'test_table1', 'test_table1',
                          [ColumnMetadata('test_id1', 'description of test_table1', 'bigint', 0),
                           ColumnMetadata('test_id2', 'description of test_id2', 'bigint', 1),
                           ColumnMetadata('is_active', None, 'boolean', 2),
                           ColumnMetadata('source', 'description of source', 'varchar', 3),
                           ColumnMetadata('etl_created_at', 'description of etl_created_at', 'timestamp', 4),
                           ColumnMetadata('ds', None, 'varchar', 5)]),
            None
        ]
        transformer = SqlToTblColUsageTransformer()
        transformer.init(config)
        foo = Foo(email='*****@*****.**', statement='SELECT foo, bar FROM test_table1')
        actual = transformer.transform(foo)
        expected = TableColumnUsage(col_readers=[
            ColumnReader(database=u'database', cluster=u'gold', schema='test_schema1',
                         table='test_table1', column='*',
                         user_email='*****@*****.**')
        ])
        self.assertEqual(expected.__repr__(), actual.__repr__())
def _get_extract_iter(self):
    # type: () -> Iterator[TableMetadata]
    """
    Group the raw row iterator by table key and yield one TableMetadata per
    Athena table. The column description prefers the 'extras' field and
    falls back to 'col_description'; the table description is empty.
    :return:
    """
    for _, table_rows in groupby(self._get_raw_extract_iter(), self._get_table_key):
        columns = []
        for last_row in table_rows:
            if last_row['extras'] is not None:
                description = last_row['extras']
            else:
                description = last_row['col_description']
            columns.append(ColumnMetadata(last_row['col_name'],
                                          description,
                                          last_row['col_type'],
                                          last_row['col_sort_order']))
        yield TableMetadata('athena',
                            last_row['cluster'],
                            last_row['schema_name'],
                            last_row['name'],
                            '',
                            columns)
def _extract_table_metadata(self, object_name: str, data: Dict[str, Any]) -> TableMetadata:
    """Build a TableMetadata for one object's describe payload."""
    # Amundsen requires a column sort order and the response carries none,
    # so columns are ordered alphabetically by field name.
    ordered_fields = sorted(data["fields"], key=lambda field: field["name"])
    columns = []
    for sort_order, field in enumerate(ordered_fields):
        columns.append(ColumnMetadata(name=field["name"],
                                      description=field["inlineHelpText"],
                                      col_type=field["type"],
                                      sort_order=sort_order))
    # TODO: Can we extract table description / does it exist?
    return TableMetadata(database=self._database,
                         cluster=self._cluster,
                         schema=self._schema,
                         name=object_name,
                         description=None,
                         columns=columns)
def test_table_attributes(self):
    # type: () -> None
    """Extra keyword attributes surface on the serialized Table node."""
    fixture_columns = [
        ColumnMetadata('test_id1', 'description of test_table1', 'bigint', 0),
        ColumnMetadata('test_id2', 'description of test_id2', 'bigint', 1),
        ColumnMetadata('is_active', None, 'boolean', 2),
        ColumnMetadata('source', 'description of source', 'varchar', 3),
        ColumnMetadata('etl_created_at', 'description of etl_created_at', 'timestamp', 4),
        ColumnMetadata('ds', None, 'varchar', 5),
    ]
    self.table_metadata3 = TableMetadata('hive', 'gold', 'test_schema3',
                                         'test_table3', 'test_table3',
                                         fixture_columns, is_view=False,
                                         attr1='uri', attr2='attr2')
    nodes = []
    node_row = self.table_metadata3.next_node()
    while node_row:
        nodes.append(node_row)
        node_row = self.table_metadata3.next_node()
    # The first serialized node is the Table node carrying the attributes.
    self.assertEqual(nodes[0].get('attr1'), 'uri')
    self.assertEqual(nodes[0].get('attr2'), 'attr2')
def _get_column_metadata(self, view_original_text: str) -> List[ColumnMetadata]:
    """
    Decode the Presto view payload embedded in VIEW_ORIGINAL_TEXT and
    return its columns, ordered as they appear in the view definition SQL.
    :param view_original_text:
    :return:
    """
    # Strip the Presto prefix and suffix wrapping the encoded view data.
    after_prefix = view_original_text.split(
        PrestoViewMetadataExtractor.PRESTO_VIEW_PREFIX, 1)[-1]
    encoded_view_info = after_prefix.rsplit(
        PrestoViewMetadataExtractor.PRESTO_VIEW_SUFFIX, 1)[0]
    # The remaining payload is base64-encoded JSON:
    # https://github.com/prestodb/presto/blob/43bd519052ba4c56ff1f4fc807075637ab5f4f10/presto-hive/src/main/java/com/facebook/presto/hive/HiveUtil.java#L602-L605
    view_info = json.loads(base64.b64decode(encoded_view_info))
    return [ColumnMetadata(name=column['name'],
                           description=None,
                           col_type=column['type'],
                           sort_order=sort_order)
            for sort_order, column in enumerate(view_info.get('columns'))]
def _get_extract_iter(self):
    # type: () -> Iterator[TableMetadata]
    """
    Group the raw row iterator by table key and yield one TableMetadata per
    table, transliterating non-ASCII descriptions via unidecode.
    :return:
    """
    for _, table_rows in groupby(self._get_raw_extract_iter(), self._get_table_key):
        columns = []
        for last_row in table_rows:
            col_description = last_row['col_description']
            columns.append(ColumnMetadata(
                last_row['col_name'],
                unidecode(col_description) if col_description else None,
                last_row['col_type'],
                last_row['col_sort_order']))
        table_description = last_row['description']
        yield TableMetadata(
            self._database,
            last_row['cluster'],
            last_row['schema_name'],
            last_row['name'],
            unidecode(table_description) if table_description else None,
            columns,
            last_row['is_view'] == 'true')
def test_extraction_with_resource_link_result(self) -> None:
    """Glue resource links (entries with TargetTable) are skipped by extraction."""
    with patch.object(GlueExtractor, '_search_tables') as mock_search:
        mock_search.return_value = [
            test_table,
            # A resource link: points at a table in another catalog and
            # must not itself be extracted.
            {
                "Name": "test_resource_link",
                "DatabaseName": "test_schema",
                "TargetTable": {
                    "CatalogId": "111111111111",
                    "DatabaseName": "test_schema_external",
                    "Name": "test_table"
                },
                "CatalogId": "222222222222"
            }
        ]
        extractor = GlueExtractor()
        extractor.init(self.conf)
        actual = extractor.extract()
        expected = TableMetadata(
            'glue', 'gold', 'test_schema', 'test_table', 'a table for testing',
            [ColumnMetadata('col_id1', 'description of id1', 'bigint', 0),
             ColumnMetadata('col_id2', 'description of id2', 'bigint', 1),
             ColumnMetadata('is_active', None, 'boolean', 2),
             ColumnMetadata('source', 'description of source', 'varchar', 3),
             ColumnMetadata('etl_created_at', 'description of etl_created_at', 'timestamp', 4),
             ColumnMetadata('ds', None, 'varchar', 5),
             ColumnMetadata('partition_key1', 'description of partition_key1', 'string', 6)],
            False)
        self.assertEqual(expected.__repr__(), actual.__repr__())
        # Only the real table is extracted; the link contributes nothing.
        self.assertIsNone(extractor.extract())
def test_z_custom_sources(self) -> None:
    """A custom description_source serializes as a Programmatic_Description node."""
    self.custom_source = TableMetadata(
        'hive', 'gold', 'test_schema3', 'test_table4', 'test_table4',
        [ColumnMetadata('test_id1', 'description of test_table1', 'bigint', 0),
         ColumnMetadata('test_id2', 'description of test_id2', 'bigint', 1),
         ColumnMetadata('is_active', None, 'boolean', 2),
         ColumnMetadata('source', 'description of source', 'varchar', 3),
         ColumnMetadata('etl_created_at', 'description of etl_created_at', 'timestamp', 4),
         ColumnMetadata('ds', None, 'varchar', 5)],
        is_view=False, description_source="custom")
    node_row = self.custom_source.next_node()
    actual = []
    while node_row:
        node_row_serialized = neo4_serializer.serialize_node(node_row)
        actual.append(node_row_serialized)
        node_row = self.custom_source.next_node()
    # The description node key is suffixed with '_<source>_description'
    # and the node carries the source name.
    expected = {'LABEL': 'Programmatic_Description',
                'KEY': 'hive://gold.test_schema3/test_table4/_custom_description',
                'description_source': 'custom',
                'description': 'test_table4'}
    self.assertEqual(actual[1], expected)
def test_serialize(self):
    # type: () -> None
    """First serialization emits all nodes/relations including the
    database/cluster/schema hierarchy; an identical second record skips
    those already-serialized hierarchy entities (dedup)."""
    self.table_metadata = TableMetadata(
        'hive', 'gold', 'test_schema1', 'test_table1', 'test_table1',
        [ColumnMetadata('test_id1', 'description of test_table1', 'bigint', 0),
         ColumnMetadata('test_id2', 'description of test_id2', 'bigint', 1),
         ColumnMetadata('is_active', None, 'boolean', 2),
         ColumnMetadata('source', 'description of source', 'varchar', 3),
         ColumnMetadata('etl_created_at', 'description of etl_created_at', 'timestamp', 4),
         ColumnMetadata('ds', None, 'varchar', 5)])
    # Identical second record, used to exercise deduplication.
    self.table_metadata2 = TableMetadata(
        'hive', 'gold', 'test_schema1', 'test_table1', 'test_table1',
        [ColumnMetadata('test_id1', 'description of test_table1', 'bigint', 0),
         ColumnMetadata('test_id2', 'description of test_id2', 'bigint', 1),
         ColumnMetadata('is_active', None, 'boolean', 2),
         ColumnMetadata('source', 'description of source', 'varchar', 3),
         ColumnMetadata('etl_created_at', 'description of etl_created_at', 'timestamp', 4),
         ColumnMetadata('ds', None, 'varchar', 5)])
    # Columns with a None description (is_active, ds) emit no Description node.
    self.expected_nodes_deduped = [
        {'name': 'test_table1', 'KEY': 'hive://gold.test_schema1/test_table1',
         'LABEL': 'Table', 'is_view:UNQUOTED': False},
        {'description': 'test_table1',
         'KEY': 'hive://gold.test_schema1/test_table1/_description',
         'LABEL': 'Description'},
        {'sort_order:UNQUOTED': 0, 'type': 'bigint', 'name': 'test_id1',
         'KEY': 'hive://gold.test_schema1/test_table1/test_id1', 'LABEL': 'Column'},
        {'description': 'description of test_table1',
         'KEY': 'hive://gold.test_schema1/test_table1/test_id1/_description',
         'LABEL': 'Description'},
        {'sort_order:UNQUOTED': 1, 'type': 'bigint', 'name': 'test_id2',
         'KEY': 'hive://gold.test_schema1/test_table1/test_id2', 'LABEL': 'Column'},
        {'description': 'description of test_id2',
         'KEY': 'hive://gold.test_schema1/test_table1/test_id2/_description',
         'LABEL': 'Description'},
        {'sort_order:UNQUOTED': 2, 'type': 'boolean', 'name': 'is_active',
         'KEY': 'hive://gold.test_schema1/test_table1/is_active', 'LABEL': 'Column'},
        {'sort_order:UNQUOTED': 3, 'type': 'varchar', 'name': 'source',
         'KEY': 'hive://gold.test_schema1/test_table1/source', 'LABEL': 'Column'},
        {'description': 'description of source',
         'KEY': 'hive://gold.test_schema1/test_table1/source/_description',
         'LABEL': 'Description'},
        {'sort_order:UNQUOTED': 4, 'type': 'timestamp', 'name': 'etl_created_at',
         'KEY': 'hive://gold.test_schema1/test_table1/etl_created_at', 'LABEL': 'Column'},
        {'description': 'description of etl_created_at',
         'KEY': 'hive://gold.test_schema1/test_table1/etl_created_at/_description',
         'LABEL': 'Description'},
        {'sort_order:UNQUOTED': 5, 'type': 'varchar', 'name': 'ds',
         'KEY': 'hive://gold.test_schema1/test_table1/ds', 'LABEL': 'Column'}]
    # The first serialization additionally emits Database/Cluster/Schema nodes.
    self.expected_nodes = copy.deepcopy(self.expected_nodes_deduped)
    self.expected_nodes.append({'name': 'hive', 'KEY': 'database://hive',
                                'LABEL': 'Database'})
    self.expected_nodes.append({'name': 'gold', 'KEY': 'hive://gold',
                                'LABEL': 'Cluster'})
    self.expected_nodes.append({'name': 'test_schema1', 'KEY': 'hive://gold.test_schema1',
                                'LABEL': 'Schema'})
    self.expected_rels_deduped = [
        {'END_KEY': 'hive://gold.test_schema1/test_table1', 'START_LABEL': 'Schema',
         'END_LABEL': 'Table', 'START_KEY': 'hive://gold.test_schema1',
         'TYPE': 'TABLE', 'REVERSE_TYPE': 'TABLE_OF'},
        {'END_KEY': 'hive://gold.test_schema1/test_table1/_description',
         'START_LABEL': 'Table', 'END_LABEL': 'Description',
         'START_KEY': 'hive://gold.test_schema1/test_table1',
         'TYPE': 'DESCRIPTION', 'REVERSE_TYPE': 'DESCRIPTION_OF'},
        {'END_KEY': 'hive://gold.test_schema1/test_table1/test_id1',
         'START_LABEL': 'Table', 'END_LABEL': 'Column',
         'START_KEY': 'hive://gold.test_schema1/test_table1',
         'TYPE': 'COLUMN', 'REVERSE_TYPE': 'COLUMN_OF'},
        {'END_KEY': 'hive://gold.test_schema1/test_table1/test_id1/_description',
         'START_LABEL': 'Column', 'END_LABEL': 'Description',
         'START_KEY': 'hive://gold.test_schema1/test_table1/test_id1',
         'TYPE': 'DESCRIPTION', 'REVERSE_TYPE': 'DESCRIPTION_OF'},
        {'END_KEY': 'hive://gold.test_schema1/test_table1/test_id2',
         'START_LABEL': 'Table', 'END_LABEL': 'Column',
         'START_KEY': 'hive://gold.test_schema1/test_table1',
         'TYPE': 'COLUMN', 'REVERSE_TYPE': 'COLUMN_OF'},
        {'END_KEY': 'hive://gold.test_schema1/test_table1/test_id2/_description',
         'START_LABEL': 'Column', 'END_LABEL': 'Description',
         'START_KEY': 'hive://gold.test_schema1/test_table1/test_id2',
         'TYPE': 'DESCRIPTION', 'REVERSE_TYPE': 'DESCRIPTION_OF'},
        {'END_KEY': 'hive://gold.test_schema1/test_table1/is_active',
         'START_LABEL': 'Table', 'END_LABEL': 'Column',
         'START_KEY': 'hive://gold.test_schema1/test_table1',
         'TYPE': 'COLUMN', 'REVERSE_TYPE': 'COLUMN_OF'},
        {'END_KEY': 'hive://gold.test_schema1/test_table1/source',
         'START_LABEL': 'Table', 'END_LABEL': 'Column',
         'START_KEY': 'hive://gold.test_schema1/test_table1',
         'TYPE': 'COLUMN', 'REVERSE_TYPE': 'COLUMN_OF'},
        {'END_KEY': 'hive://gold.test_schema1/test_table1/source/_description',
         'START_LABEL': 'Column', 'END_LABEL': 'Description',
         'START_KEY': 'hive://gold.test_schema1/test_table1/source',
         'TYPE': 'DESCRIPTION', 'REVERSE_TYPE': 'DESCRIPTION_OF'},
        {'END_KEY': 'hive://gold.test_schema1/test_table1/etl_created_at',
         'START_LABEL': 'Table', 'END_LABEL': 'Column',
         'START_KEY': 'hive://gold.test_schema1/test_table1',
         'TYPE': 'COLUMN', 'REVERSE_TYPE': 'COLUMN_OF'},
        {'END_KEY': 'hive://gold.test_schema1/test_table1/etl_created_at/_description',
         'START_LABEL': 'Column', 'END_LABEL': 'Description',
         'START_KEY': 'hive://gold.test_schema1/test_table1/etl_created_at',
         'TYPE': 'DESCRIPTION', 'REVERSE_TYPE': 'DESCRIPTION_OF'},
        {'END_KEY': 'hive://gold.test_schema1/test_table1/ds',
         'START_LABEL': 'Table', 'END_LABEL': 'Column',
         'START_KEY': 'hive://gold.test_schema1/test_table1',
         'TYPE': 'COLUMN', 'REVERSE_TYPE': 'COLUMN_OF'}]
    # First serialization also links Database->Cluster->Schema.
    self.expected_rels = copy.deepcopy(self.expected_rels_deduped)
    self.expected_rels.append({'END_KEY': 'hive://gold', 'START_LABEL': 'Database',
                               'END_LABEL': 'Cluster', 'START_KEY': 'database://hive',
                               'TYPE': 'CLUSTER', 'REVERSE_TYPE': 'CLUSTER_OF'})
    self.expected_rels.append({'END_KEY': 'hive://gold.test_schema1', 'START_LABEL': 'Cluster',
                               'END_LABEL': 'Schema', 'START_KEY': 'hive://gold',
                               'TYPE': 'SCHEMA', 'REVERSE_TYPE': 'SCHEMA_OF'})
    node_row = self.table_metadata.next_node()
    actual = []
    while node_row:
        actual.append(node_row)
        node_row = self.table_metadata.next_node()
    self.assertEqual(self.expected_nodes, actual)
    relation_row = self.table_metadata.next_relation()
    actual = []
    while relation_row:
        actual.append(relation_row)
        relation_row = self.table_metadata.next_relation()
    self.assertEqual(self.expected_rels, actual)
    # 2nd record should not show already serialized database, cluster, and schema
    node_row = self.table_metadata2.next_node()
    actual = []
    while node_row:
        actual.append(node_row)
        node_row = self.table_metadata2.next_node()
    self.assertEqual(self.expected_nodes_deduped, actual)
    relation_row = self.table_metadata2.next_relation()
    actual = []
    while relation_row:
        actual.append(relation_row)
        relation_row = self.table_metadata2.next_relation()
    self.assertEqual(self.expected_rels_deduped, actual)
def test_extraction_with_multiple_result(self):
    # type: () -> None
    """Verify the MSSQL extractor emits one TableMetadata per table, in
    result-set order, and returns None once the results are exhausted.

    Improvement over the previous version: the cluster name is looked up
    from the config once instead of repeating the key-format expression
    six times, and the mocked SQL rows are built from a (columns, table)
    spec list instead of ten hand-written _union calls.
    """
    with patch.object(SQLAlchemyExtractor, '_get_connection') as mock_connection:
        connection = MagicMock()
        mock_connection.return_value = connection
        sql_execute = MagicMock()
        connection.execute = sql_execute

        # Hoisted: the cluster value used by every table record and every
        # expected TableMetadata below.
        cluster = self.conf['extractor.mssql_metadata.{}'.format(
            MSSQLMetadataExtractor.CLUSTER_KEY)]

        table = {
            'schema_name': 'test_schema1',
            'name': 'test_table1',
            'description': 'test table 1',
            'cluster': cluster
        }
        table1 = {
            'schema_name': 'test_schema1',
            'name': 'test_table2',
            'description': 'test table 2',
            'cluster': cluster
        }
        table2 = {
            'schema_name': 'test_schema2',
            'name': 'test_table3',
            'description': 'test table 3',
            'cluster': cluster
        }

        # (column fields, table record) pairs in the order the metadata
        # query would return them; _union merges each pair into one row.
        row_specs = [
            ({'col_name': 'col_id1', 'col_type': 'bigint',
              'col_description': 'description of col_id1', 'col_sort_order': 0}, table),
            ({'col_name': 'col_id2', 'col_type': 'bigint',
              'col_description': 'description of col_id2', 'col_sort_order': 1}, table),
            ({'col_name': 'is_active', 'col_type': 'boolean',
              'col_description': None, 'col_sort_order': 2}, table),
            ({'col_name': 'source', 'col_type': 'varchar',
              'col_description': 'description of source', 'col_sort_order': 3}, table),
            ({'col_name': 'etl_created_at', 'col_type': 'timestamp',
              'col_description': 'description of etl_created_at', 'col_sort_order': 4}, table),
            ({'col_name': 'ds', 'col_type': 'varchar',
              'col_description': None, 'col_sort_order': 5}, table),
            ({'col_name': 'col_name', 'col_type': 'varchar',
              'col_description': 'description of col_name', 'col_sort_order': 0}, table1),
            ({'col_name': 'col_name2', 'col_type': 'varchar',
              'col_description': 'description of col_name2', 'col_sort_order': 1}, table1),
            ({'col_name': 'col_id3', 'col_type': 'varchar',
              'col_description': 'description of col_id3', 'col_sort_order': 0}, table2),
            ({'col_name': 'col_name3', 'col_type': 'varchar',
              'col_description': 'description of col_name3', 'col_sort_order': 1}, table2),
        ]
        sql_execute.return_value = [
            self._union(columns, tbl) for columns, tbl in row_specs
        ]

        extractor = MSSQLMetadataExtractor()
        extractor.init(self.conf)

        expected = TableMetadata(
            'mssql', cluster, 'test_schema1', 'test_table1', 'test table 1',
            [
                ColumnMetadata('col_id1', 'description of col_id1', 'bigint', 0),
                ColumnMetadata('col_id2', 'description of col_id2', 'bigint', 1),
                ColumnMetadata('is_active', None, 'boolean', 2),
                ColumnMetadata('source', 'description of source', 'varchar', 3),
                ColumnMetadata('etl_created_at', 'description of etl_created_at',
                               'timestamp', 4),
                ColumnMetadata('ds', None, 'varchar', 5),
            ],
            False, ['test_schema1'])
        self.assertEqual(expected.__repr__(), extractor.extract().__repr__())

        expected = TableMetadata(
            'mssql', cluster, 'test_schema1', 'test_table2', 'test table 2',
            [
                ColumnMetadata('col_name', 'description of col_name', 'varchar', 0),
                ColumnMetadata('col_name2', 'description of col_name2', 'varchar', 1)
            ],
            False, ['test_schema1'])
        self.assertEqual(expected.__repr__(), extractor.extract().__repr__())

        expected = TableMetadata(
            'mssql', cluster, 'test_schema2', 'test_table3', 'test table 3',
            [
                ColumnMetadata('col_id3', 'description of col_id3', 'varchar', 0),
                ColumnMetadata('col_name3', 'description of col_name3', 'varchar', 1)
            ],
            False, ['test_schema2'])
        self.assertEqual(expected.__repr__(), extractor.extract().__repr__())

        # Once exhausted, extract() must keep returning None.
        self.assertIsNone(extractor.extract())
        self.assertIsNone(extractor.extract())
def test_extraction_with_multiple_views(self) -> None:
    """Two virtual views are extracted in result-set order; a further
    extract() call yields None once the results are exhausted."""
    with patch.object(SQLAlchemyExtractor, '_get_connection') as mock_connection:
        connection = MagicMock()
        mock_connection.return_value = connection
        sql_execute = MagicMock()
        connection.execute = sql_execute

        def encode_view_text(column_spec):
            # Presto stores the view definition as base64-encoded JSON.
            return base64.b64encode(
                json.dumps(column_spec).encode()).decode("utf-8")

        columns1 = {
            'columns': [{'name': 'xyz', 'type': 'varchar'},
                        {'name': 'xyy', 'type': 'double'},
                        {'name': 'aaa', 'type': 'int'},
                        {'name': 'ab', 'type': 'varchar'}]
        }
        columns2 = {
            'columns': [{'name': 'xyy', 'type': 'varchar'},
                        {'name': 'ab', 'type': 'double'},
                        {'name': 'aaa', 'type': 'int'},
                        {'name': 'xyz', 'type': 'varchar'}]
        }

        sql_execute.return_value = [
            {'tbl_id': 2,
             'schema': 'test_schema2',
             'name': 'test_view2',
             'tbl_type': 'virtual_view',
             'view_original_text': encode_view_text(columns2)},
            {'tbl_id': 1,
             'schema': 'test_schema1',
             'name': 'test_view1',
             'tbl_type': 'virtual_view',
             'view_original_text': encode_view_text(columns1)},
        ]

        extractor = PrestoViewMetadataExtractor()
        extractor.init(self.conf)

        expected_first_view = TableMetadata(
            'presto', 'gold', 'test_schema2', 'test_view2', None,
            [ColumnMetadata(u'xyy', None, u'varchar', 0),
             ColumnMetadata(u'ab', None, u'double', 1),
             ColumnMetadata(u'aaa', None, u'int', 2),
             ColumnMetadata(u'xyz', None, u'varchar', 3)],
            True)
        self.assertEqual(expected_first_view.__repr__(),
                         extractor.extract().__repr__())

        expected_second_view = TableMetadata(
            'presto', 'gold', 'test_schema1', 'test_view1', None,
            [ColumnMetadata(u'xyz', None, u'varchar', 0),
             ColumnMetadata(u'xyy', None, u'double', 1),
             ColumnMetadata(u'aaa', None, u'int', 2),
             ColumnMetadata(u'ab', None, u'varchar', 3)],
            True)
        self.assertEqual(expected_second_view.__repr__(),
                         extractor.extract().__repr__())

        self.assertIsNone(extractor.extract())
def test_extraction_with_multiple_result(self) -> None:
    """Glue extractor: tables and a view are emitted in search order with
    descriptions resolved (Description field, or the 'comment' parameter),
    then extract() returns None repeatedly."""
    with patch.object(GlueExtractor, '_search_tables') as mock_search:
        mock_search.return_value = [
            test_table,
            {
                'Name': 'test_table2',
                'DatabaseName': 'test_schema1',
                'Description': 'test table 2',
                'StorageDescriptor': {
                    'Columns': [
                        {'Name': 'col_name',
                         'Type': 'varchar',
                         'Comment': 'description of col_name'},
                        {'Name': 'col_name2',
                         'Type': 'varchar',
                         'Comment': 'description of col_name2'},
                    ]
                },
                'TableType': 'EXTERNAL_TABLE',
            },
            {
                'Name': 'test_table3',
                'DatabaseName': 'test_schema2',
                'StorageDescriptor': {
                    'Columns': [
                        {'Name': 'col_id3',
                         'Type': 'varchar',
                         'Comment': 'description of col_id3'},
                        {'Name': 'col_name3',
                         'Type': 'varchar',
                         'Comment': 'description of col_name3'},
                    ]
                },
                'Parameters': {
                    'comment': 'description of test table 3 from comment'
                },
                'TableType': 'EXTERNAL_TABLE',
            },
            {
                'Name': 'test_view1',
                'DatabaseName': 'test_schema1',
                'Description': 'test view 1',
                'StorageDescriptor': {
                    'Columns': [
                        {'Name': 'col_id3',
                         'Type': 'varchar',
                         'Comment': 'description of col_id3'},
                        {'Name': 'col_name3',
                         'Type': 'varchar',
                         'Comment': 'description of col_name3'},
                    ]
                },
                'TableType': 'VIRTUAL_VIEW',
            },
        ]

        extractor = GlueExtractor()
        extractor.init(self.conf)

        # Expected records, in the order the extractor should emit them.
        expectations = [
            TableMetadata(
                'glue', 'gold', 'test_schema', 'test_table',
                'a table for testing',
                [ColumnMetadata('col_id1', 'description of id1', 'bigint', 0),
                 ColumnMetadata('col_id2', 'description of id2', 'bigint', 1),
                 ColumnMetadata('is_active', None, 'boolean', 2),
                 ColumnMetadata('source', 'description of source', 'varchar', 3),
                 ColumnMetadata('etl_created_at',
                                'description of etl_created_at', 'timestamp', 4),
                 ColumnMetadata('ds', None, 'varchar', 5),
                 ColumnMetadata('partition_key1',
                                'description of partition_key1', 'string', 6)],
                False),
            TableMetadata(
                'glue', 'gold', 'test_schema1', 'test_table2', 'test table 2',
                [ColumnMetadata('col_name', 'description of col_name',
                                'varchar', 0),
                 ColumnMetadata('col_name2', 'description of col_name2',
                                'varchar', 1)],
                False),
            TableMetadata(
                'glue', 'gold', 'test_schema2', 'test_table3',
                'description of test table 3 from comment',
                [ColumnMetadata('col_id3', 'description of col_id3',
                                'varchar', 0),
                 ColumnMetadata('col_name3', 'description of col_name3',
                                'varchar', 1)],
                False),
            TableMetadata(
                'glue', 'gold', 'test_schema1', 'test_view1', 'test view 1',
                [ColumnMetadata('col_id3', 'description of col_id3',
                                'varchar', 0),
                 ColumnMetadata('col_name3', 'description of col_name3',
                                'varchar', 1)],
                True),
        ]
        for expected in expectations:
            self.assertEqual(expected.__repr__(), extractor.extract().__repr__())

        # Exhausted: extract() keeps returning None.
        self.assertIsNone(extractor.extract())
        self.assertIsNone(extractor.extract())
def test_extraction_with_single_result(self, mock_connect: MagicMock) -> None:
    """Test Extraction with single table result from query."""
    mock_connection = MagicMock()
    mock_connect.return_value = mock_connection
    mock_cursor = MagicMock()
    mock_connection.cursor.return_value = mock_cursor
    mock_execute = MagicMock()
    mock_cursor.execute = mock_execute

    # Cursor description: one single-element list per result column name.
    mock_cursor.description = [[field] for field in (
        'col_name', 'col_description', 'col_type', 'col_sort_order',
        'database', 'cluster', 'schema', 'name', 'description', 'is_view')]

    # Table-level fields appended to every column row.
    # Typed as List[Any] so flake8 accepts the '+' concatenation below.
    table: List[Any] = ['DREMIO', 'Production', 'test_schema',
                        'test_table', 'a table for testing', 'false']

    column_rows: List[List[Any]] = [
        ['col_id1', 'description of id1', 'number', 0],
        ['col_id2', 'description of id2', 'number', 1],
        ['is_active', None, 'boolean', 2],
        ['source', 'description of source', 'varchar', 3],
        ['etl_created_at', 'description of etl_created_at',
         'timestamp_ltz', 4],
        ['ds', None, 'varchar', 5],
    ]
    mock_cursor.execute.return_value = [row + table for row in column_rows]

    extractor = DremioMetadataExtractor()
    extractor.init(self.conf)

    expected = TableMetadata(
        'DREMIO', 'Production', 'test_schema', 'test_table',
        'a table for testing',
        [ColumnMetadata('col_id1', 'description of id1', 'number', 0),
         ColumnMetadata('col_id2', 'description of id2', 'number', 1),
         ColumnMetadata('is_active', None, 'boolean', 2),
         ColumnMetadata('source', 'description of source', 'varchar', 3),
         ColumnMetadata('etl_created_at', 'description of etl_created_at',
                        'timestamp_ltz', 4),
         ColumnMetadata('ds', None, 'varchar', 5)])
    self.assertEqual(expected.__repr__(), extractor.extract().__repr__())

    self.assertIsNone(extractor.extract())