    def test_extraction_with_single_result(self):
        # type: () -> None
        with patch.object(SQLAlchemyExtractor, '_get_connection') as mock_connection:
            connection = MagicMock()
            mock_connection.return_value = connection
            sql_execute = MagicMock()
            connection.execute = sql_execute
            table = {'schema': 'test_schema',
                     'name': 'test_table',
                     'description': 'a table for testing',
                     'cluster':
                     self.conf['extractor.postgres_metadata.{}'.format(PostgresMetadataExtractor.CLUSTER_KEY)]
                     }

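            # One mocked row per column; _union (a helper defined on this test
            # case) merges each column dict with the shared table-level fields.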
            sql_execute.return_value = [
                self._union(
                    {'col_name': 'col_id1',
                     'col_type': 'bigint',
                     'col_description': 'description of id1',
                     'col_sort_order': 0}, table),
                self._union(
                    {'col_name': 'col_id2',
                     'col_type': 'bigint',
                     'col_description': 'description of id2',
                     'col_sort_order': 1}, table),
                self._union(
                    {'col_name': 'is_active',
                     'col_type': 'boolean',
                     'col_description': None,
                     'col_sort_order': 2}, table),
                self._union(
                    {'col_name': 'source',
                     'col_type': 'varchar',
                     'col_description': 'description of source',
                     'col_sort_order': 3}, table),
                self._union(
                    {'col_name': 'etl_created_at',
                     'col_type': 'timestamp',
                     'col_description': 'description of etl_created_at',
                     'col_sort_order': 4}, table),
                self._union(
                    {'col_name': 'ds',
                     'col_type': 'varchar',
                     'col_description': None,
                     'col_sort_order': 5}, table)
            ]

            extractor = PostgresMetadataExtractor()
            extractor.init(self.conf)
            actual = extractor.extract()
            expected = TableMetadata('postgres', 'MY_CLUSTER', 'test_schema', 'test_table', 'a table for testing',
                                     [ColumnMetadata('col_id1', 'description of id1', 'bigint', 0),
                                      ColumnMetadata('col_id2', 'description of id2', 'bigint', 1),
                                      ColumnMetadata('is_active', None, 'boolean', 2),
                                      ColumnMetadata('source', 'description of source', 'varchar', 3),
                                      ColumnMetadata('etl_created_at', 'description of etl_created_at', 'timestamp', 4),
                                      ColumnMetadata('ds', None, 'varchar', 5)])

            self.assertEqual(expected.__repr__(), actual.__repr__())
            self.assertIsNone(extractor.extract())
Example #2
    def test_col_badge_field(self) -> None:
        self.table_metadata4 = TableMetadata('hive', 'gold', 'test_schema4', 'test_table4', 'test_table4', [
            ColumnMetadata('test_id1', 'description of test_table1', 'bigint', 0, ['col-badge1', 'col-badge2'])],
            is_view=False, attr1='uri', attr2='attr2')

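        # Drain the node iterator; per the assertions below, the two column
        # badge nodes surface at indices 4 and 5.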
        node_row = self.table_metadata4.next_node()
        actual = []
        while node_row:
            serialized_node_row = neo4_serializer.serialize_node(node_row)
            actual.append(serialized_node_row)
            node_row = self.table_metadata4.next_node()

        self.assertEqual(actual[4].get('KEY'), 'col-badge1')
        self.assertEqual(actual[5].get('KEY'), 'col-badge2')

        relation_row = self.table_metadata4.next_relation()
        actual = []
        while relation_row:
            serialized_relation_row = neo4_serializer.serialize_relationship(relation_row)
            actual.append(serialized_relation_row)
            relation_row = self.table_metadata4.next_relation()

        expected_col_badge_rel1 = {'END_KEY': 'col-badge1', 'START_LABEL': 'Column',
                                   'END_LABEL': 'Badge',
                                   'START_KEY': 'hive://gold.test_schema4/test_table4/test_id1',
                                   'TYPE': 'HAS_BADGE', 'REVERSE_TYPE': 'BADGE_FOR'}
        expected_col_badge_rel2 = {'END_KEY': 'col-badge2', 'START_LABEL': 'Column',
                                   'END_LABEL': 'Badge',
                                   'START_KEY': 'hive://gold.test_schema4/test_table4/test_id1',
                                   'TYPE': 'HAS_BADGE', 'REVERSE_TYPE': 'BADGE_FOR'}

        self.assertEqual(actual[4], expected_col_badge_rel1)
        self.assertEqual(actual[5], expected_col_badge_rel2)
Example #3
    def test_tags_populated_from_str(self):
        # type: () -> None
        self.table_metadata5 = TableMetadata('hive', 'gold', 'test_schema5', 'test_table5', 'test_table5', [
            ColumnMetadata('test_id1', 'description of test_table1', 'bigint', 0)], tags="tag3, tag4")

        # Test table tag field populated from str
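        # The comma-separated tags string is split into individual Tag nodes.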
        node_row = self.table_metadata5.next_node()
        actual = []
        while node_row:
            actual.append(node_row)
            node_row = self.table_metadata5.next_node()

        self.assertEqual(actual[2].get('LABEL'), 'Tag')
        self.assertEqual(actual[2].get('KEY'), 'tag3')
        self.assertEqual(actual[3].get('KEY'), 'tag4')

        relation_row = self.table_metadata5.next_relation()
        actual = []
        while relation_row:
            actual.append(relation_row)
            relation_row = self.table_metadata5.next_relation()

        # Table tag relationship
        expected_tab_tag_rel3 = {'END_KEY': 'tag3', 'START_LABEL': 'Table', 'END_LABEL':
                                 'Tag', 'START_KEY': 'hive://gold.test_schema5/test_table5',
                                 'TYPE': 'TAGGED_BY', 'REVERSE_TYPE': 'TAG'}
        expected_tab_tag_rel4 = {'END_KEY': 'tag4', 'START_LABEL': 'Table',
                                 'END_LABEL': 'Tag', 'START_KEY': 'hive://gold.test_schema5/test_table5',
                                 'TYPE': 'TAGGED_BY', 'REVERSE_TYPE': 'TAG'}
        self.assertEqual(actual[2], expected_tab_tag_rel3)
        self.assertEqual(actual[3], expected_tab_tag_rel4)
Example #4
    def setUp(self) -> None:
        super(TestQueryJoin, self).setUp()
        # Display full diffs
        self.maxDiff = None
        self.tbl1_col = ColumnMetadata('field', '', '', 0)
        self.left_table_metadata = TableMetadata('hive', 'gold',
                                                 'test_schema1', 'test_table1',
                                                 'test_table1 desc',
                                                 [self.tbl1_col])
        self.tbl2_col = ColumnMetadata('field', '', '', 0)
        self.right_table_metadata = TableMetadata('hive', 'gold',
                                                  'test_schema1',
                                                  'test_table2',
                                                  'test_table2 desc',
                                                  [self.tbl2_col])
        self.query_metadata = QueryMetadata(
            sql="select * from table a where a.field > 3",
            tables=[self.left_table_metadata, self.right_table_metadata])

        self.query_join_metadata = QueryJoinMetadata(
            left_table=self.left_table_metadata,
            right_table=self.right_table_metadata,
            left_column=self.tbl1_col,
            right_column=self.tbl2_col,
            join_type='inner join',
            join_operator='=',
            join_sql=
            'test_table1 = join test_table2 on test_table1.field = test_table2.field',
            query_metadata=self.query_metadata)
        self._expected_key = ('inner-join-'
                              'hive://gold.test_schema1/test_table1/field-'
                              '=-'
                              'hive://gold.test_schema1/test_table2/field')
Example #5
    def test_z_custom_sources(self):
        # type: () -> None
        self.custom_source = TableMetadata(
            'hive',
            'gold',
            'test_schema3',
            'test_table4',
            'test_table4', [
                ColumnMetadata('test_id1', 'description of test_table1',
                               'bigint', 0),
                ColumnMetadata('test_id2', 'description of test_id2', 'bigint',
                               1),
                ColumnMetadata('is_active', None, 'boolean', 2),
                ColumnMetadata('source', 'description of source', 'varchar',
                               3),
                ColumnMetadata('etl_created_at',
                               'description of etl_created_at', 'timestamp',
                               4),
                ColumnMetadata('ds', None, 'varchar', 5)
            ],
            is_view=False,
            description_source="custom")

        node_row = self.custom_source.next_node()
        actual = []
        while node_row:
            actual.append(node_row)
            node_row = self.custom_source.next_node()
        expected = {
            'LABEL': 'Programmatic_Description',
            'KEY': 'hive://gold.test_schema3/test_table4/_custom_description',
            'description_source': 'custom',
            'description': 'test_table4'
        }
        self.assertEqual(actual[1], expected)
Example #6
    def test_tags_arent_populated_from_empty_list_and_str(self):
        # type: () -> None
        self.table_metadata6 = TableMetadata(
            'hive',
            'gold',
            'test_schema6',
            'test_table6',
            'test_table6', [
                ColumnMetadata('test_id1', 'description of test_table1',
                               'bigint', 0)
            ],
            tags=[])

        self.table_metadata7 = TableMetadata(
            'hive',
            'gold',
            'test_schema7',
            'test_table7',
            'test_table7', [
                ColumnMetadata('test_id1', 'description of test_table1',
                               'bigint', 0)
            ],
            tags="")

        # Test table tag fields are not populated from empty List
        node_row = self.table_metadata6.next_node()
        while node_row:
            self.assertNotEqual(node_row.get('LABEL'), 'Tag')
            node_row = self.table_metadata6.next_node()

        # Test table tag fields are not populated from empty str
        node_row = self.table_metadata7.next_node()
        while node_row:
            self.assertNotEqual(node_row.get('LABEL'), 'Tag')
            node_row = self.table_metadata7.next_node()
Example #7
    def test_table_attributes(self):
        # type: () -> None
        self.table_metadata3 = TableMetadata(
            'hive',
            'gold',
            'test_schema3',
            'test_table3',
            'test_table3', [
                ColumnMetadata('test_id1', 'description of test_table1',
                               'bigint', 0),
                ColumnMetadata('test_id2', 'description of test_id2', 'bigint',
                               1),
                ColumnMetadata('is_active', None, 'boolean', 2),
                ColumnMetadata('source', 'description of source', 'varchar',
                               3),
                ColumnMetadata('etl_created_at',
                               'description of etl_created_at', 'timestamp',
                               4),
                ColumnMetadata('ds', None, 'varchar', 5)
            ],
            is_view=False,
            attr1='uri',
            attr2='attr2')

        node_row = self.table_metadata3.next_node()
        actual = []
        while node_row:
            actual.append(node_row)
            node_row = self.table_metadata3.next_node()

        self.assertEqual(actual[0].get('attr1'), 'uri')
        self.assertEqual(actual[0].get('attr2'), 'attr2')
Example #8
    def test_extraction_with_database_specified(self):
        # type: () -> None
        with patch.object(SQLAlchemyExtractor,
                          '_get_connection') as mock_connection:
            connection = MagicMock()
            mock_connection.return_value = connection
            sql_execute = MagicMock()
            connection.execute = sql_execute

            sql_execute.return_value = [{
                'schema': 'test_schema',
                'name': 'test_table',
                'description': 'a table for testing',
                'cluster': 'MY_CLUSTER',
                'is_view': 'false',
                'col_name': 'ds',
                'col_type': 'varchar',
                'col_description': None,
                'col_sort_order': 0
            }]

            extractor = SnowflakeMetadataExtractor()
            extractor.init(self.conf)
            actual = extractor.extract()
            expected = TableMetadata(
                self.database_key, 'MY_CLUSTER', 'test_schema', 'test_table',
                'a table for testing',
                [ColumnMetadata('ds', None, 'varchar', 0)])

            self.assertEqual(expected.__repr__(), actual.__repr__())
            self.assertIsNone(extractor.extract())
Example #9
    def test_extraction_with_partition_badge(self) -> None:
        with patch.object(GlueExtractor, '_search_tables') as mock_search:
            mock_search.return_value = [test_table]
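            # test_table is a module-level fixture (defined outside this
            # excerpt) describing a Glue table whose last column is a partition key.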

            extractor = GlueExtractor()
            extractor.init(conf=ConfigFactory.from_dict({
                GlueExtractor.PARTITION_BADGE_LABEL_KEY:
                "partition_key",
            }))
            actual = extractor.extract()
            expected = TableMetadata(
                'glue', 'gold', 'test_schema', 'test_table',
                'a table for testing', [
                    ColumnMetadata('col_id1', 'description of id1', 'bigint',
                                   0),
                    ColumnMetadata('col_id2', 'description of id2', 'bigint',
                                   1),
                    ColumnMetadata('is_active', None, 'boolean', 2),
                    ColumnMetadata('source', 'description of source',
                                   'varchar', 3),
                    ColumnMetadata('etl_created_at',
                                   'description of etl_created_at',
                                   'timestamp', 4),
                    ColumnMetadata('ds', None, 'varchar', 5),
                    ColumnMetadata(
                        'partition_key1',
                        'description of partition_key1',
                        'string',
                        6,
                        ["partition_key"],
                    ),
                ], False)
            self.assertEqual(expected.__repr__(), actual.__repr__())
Example #10
    def setUp(self) -> None:
        super(TestTableMetadata, self).setUp()
        TableMetadata.serialized_nodes_keys = set()
        TableMetadata.serialized_rels_keys = set()
        self.table_metadata = TableMetadata(
            'hive', 'gold', 'test_schema1', 'test_table1', 'test_table1', [
                ColumnMetadata('test_id1', 'description of test_table1',
                               'bigint', 0),
                ColumnMetadata('test_id2', 'description of test_id2', 'bigint',
                               1),
                ColumnMetadata('is_active', None, 'boolean', 2),
                ColumnMetadata('source', 'description of source', 'varchar',
                               3),
                ColumnMetadata('etl_created_at',
                               'description of etl_created_at', 'timestamp',
                               4),
                ColumnMetadata('ds', None, 'varchar', 5)
            ])

        self.table_metadata2 = TableMetadata(
            'hive', 'gold', 'test_schema1', 'test_table1', 'test_table1', [
                ColumnMetadata('test_id1', 'description of test_table1',
                               'bigint', 0),
                ColumnMetadata('test_id2', 'description of test_id2', 'bigint',
                               1),
                ColumnMetadata('is_active', None, 'boolean', 2),
                ColumnMetadata('source', 'description of source', 'varchar',
                               3),
                ColumnMetadata('etl_created_at',
                               'description of etl_created_at', 'timestamp',
                               4),
                ColumnMetadata('ds', None, 'varchar', 5)
            ])
Example #11
    def test_extraction_one_object(self, mock_salesforce: Any) -> None:
        mock_salesforce.return_value = MockSalesForce()
        config_dict: Dict = {
            f"extractor.salesforce_metadata.{SalesForceExtractor.OBJECT_NAMES_KEY}": [
                "Account"
            ],
            **self.config,
        }
        conf = ConfigFactory.from_dict(config_dict)

        mock_salesforce.return_value = MockSalesForce()
        extractor = SalesForceExtractor()
        extractor.init(Scoped.get_scoped_conf(conf=conf, scope=extractor.get_scope()))
        result = extractor.extract()
        self.assertIsInstance(result, TableMetadata)

        expected = TableMetadata(
            "salesforce",
            "gold",
            "default",
            "Account",
            None,
            [
                ColumnMetadata("Id", "The Account Id", "id", 0, []),
                ColumnMetadata("isDeleted", "Deleted?", "bool", 1, []),
            ],
            False,
            [],
        )

        self.assertEqual(expected.__repr__(), result.__repr__())

        self.assertIsNone(extractor.extract())
Example #12
    def test_extraction_with_single_result(self,
                                           mock_connect: MagicMock) -> None:
        """
        Test extraction with a single table result from the query
        """
        mock_connection = MagicMock()
        mock_connect.return_value = mock_connection

        mock_cursor = MagicMock()
        mock_connection.cursor.return_value = mock_cursor

        mock_execute = MagicMock()
        mock_cursor.execute = mock_execute

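        # Mimic DB-API cursor.description: one sequence per result column,
        # with the column name as the first element.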
        mock_cursor.description = [['col_name'], ['col_description'],
                                   ['col_type'], ['col_sort_order'],
                                   ['database'], ['cluster'], ['schema'],
                                   ['name'], ['description'], ['is_view']]

        # Annotated as List[Any] so flake8/mypy accept the "+" concatenation below
        table: List[Any] = [
            'DREMIO', 'Production', 'test_schema', 'test_table',
            'a table for testing', 'false'
        ]

        # Same List[Any] annotation rationale as above
        expected_input: List[List[Any]] = [
            ['col_id1', 'description of id1', 'number', 0] + table,
            ['col_id2', 'description of id2', 'number', 1] + table,
            ['is_active', None, 'boolean', 2] + table,
            ['source', 'description of source', 'varchar', 3] + table,
            [
                'etl_created_at', 'description of etl_created_at',
                'timestamp_ltz', 4
            ] + table, ['ds', None, 'varchar', 5] + table
        ]

        mock_cursor.execute.return_value = expected_input

        extractor = DremioMetadataExtractor()
        extractor.init(self.conf)

        actual = extractor.extract()
        expected = TableMetadata(
            'DREMIO', 'Production', 'test_schema', 'test_table',
            'a table for testing', [
                ColumnMetadata('col_id1', 'description of id1', 'number', 0),
                ColumnMetadata('col_id2', 'description of id2', 'number', 1),
                ColumnMetadata('is_active', None, 'boolean', 2),
                ColumnMetadata('source', 'description of source', 'varchar',
                               3),
                ColumnMetadata('etl_created_at',
                               'description of etl_created_at',
                               'timestamp_ltz', 4),
                ColumnMetadata('ds', None, 'varchar', 5)
            ])

        self.assertEqual(expected.__repr__(), actual.__repr__())
        self.assertIsNone(extractor.extract())
Example #13
    def test_tags_field(self) -> None:
        self.table_metadata4 = TableMetadata(
            'hive',
            'gold',
            'test_schema4',
            'test_table4',
            'test_table4', [
                ColumnMetadata('test_id1', 'description of test_table1',
                               'bigint', 0)
            ],
            is_view=False,
            tags=['tag1', 'tag2'],
            attr1='uri',
            attr2='attr2')

        node_row = self.table_metadata4.next_node()
        actual = []
        while node_row:
            node_row_serialized = neo4_serializer.serialize_node(node_row)
            actual.append(node_row_serialized)
            node_row = self.table_metadata4.next_node()

        self.assertEqual(actual[0].get('attr1'), 'uri')
        self.assertEqual(actual[0].get('attr2'), 'attr2')

        self.assertEqual(actual[2].get('LABEL'), 'Tag')
        self.assertEqual(actual[2].get('KEY'), 'tag1')
        self.assertEqual(actual[3].get('KEY'), 'tag2')

        relation_row = self.table_metadata4.next_relation()
        actual = []
        while relation_row:
            relation_row_serialized = neo4_serializer.serialize_relationship(
                relation_row)
            actual.append(relation_row_serialized)
            relation_row = self.table_metadata4.next_relation()

        # Table tag relationship
        expected_tab_tag_rel1 = {
            'END_KEY': 'tag1',
            'START_LABEL': 'Table',
            'END_LABEL': 'Tag',
            'START_KEY': 'hive://gold.test_schema4/test_table4',
            'TYPE': 'TAGGED_BY',
            'REVERSE_TYPE': 'TAG'
        }
        expected_tab_tag_rel2 = {
            'END_KEY': 'tag2',
            'START_LABEL': 'Table',
            'END_LABEL': 'Tag',
            'START_KEY': 'hive://gold.test_schema4/test_table4',
            'TYPE': 'TAGGED_BY',
            'REVERSE_TYPE': 'TAG'
        }

        self.assertEqual(actual[2], expected_tab_tag_rel1)
        self.assertEqual(actual[3], expected_tab_tag_rel2)
Example #14
    def test_extraction_with_single_result(self):
        # type: () -> None
        with patch.object(GlueExtractor, '_search_tables') as mock_search:
            mock_search.return_value = [{
                'Name': 'test_table',
                'DatabaseName': 'test_schema',
                'Description': 'a table for testing',
                'StorageDescriptor': {
                    'Columns': [{
                        'Name': 'col_id1',
                        'Type': 'bigint',
                        'Comment': 'description of id1'
                    }, {
                        'Name': 'col_id2',
                        'Type': 'bigint',
                        'Comment': 'description of id2'
                    }, {
                        'Name': 'is_active',
                        'Type': 'boolean'
                    }, {
                        'Name': 'source',
                        'Type': 'varchar',
                        'Comment': 'description of source'
                    }, {
                        'Name': 'etl_created_at',
                        'Type': 'timestamp',
                        'Comment': 'description of etl_created_at'
                    }, {
                        'Name': 'ds',
                        'Type': 'varchar'
                    }]
                }
            }]

            extractor = GlueExtractor()
            extractor.init(self.conf)
            actual = extractor.extract()
            expected = TableMetadata(
                'glue', 'gold', 'test_schema', 'test_table',
                'a table for testing', [
                    ColumnMetadata('col_id1', 'description of id1', 'bigint',
                                   0),
                    ColumnMetadata('col_id2', 'description of id2', 'bigint',
                                   1),
                    ColumnMetadata('is_active', None, 'boolean', 2),
                    ColumnMetadata('source', 'description of source',
                                   'varchar', 3),
                    ColumnMetadata('etl_created_at',
                                   'description of etl_created_at',
                                   'timestamp', 4),
                    ColumnMetadata('ds', None, 'varchar', 5)
                ])
            self.assertEqual(expected.__repr__(), actual.__repr__())
            self.assertIsNone(extractor.extract())
Example #15
    def test_extraction_multiple_objects(self, mock_salesforce: Any) -> None:
        mock_salesforce.return_value = MockSalesForce()
        config_dict: Dict = {
            f"extractor.salesforce_metadata.{SalesForceExtractor.OBJECT_NAMES_KEY}": [
                "Account",
                "Profile",
            ],
            **self.config,
        }
        conf = ConfigFactory.from_dict(config_dict)

        mock_salesforce.return_value = MockSalesForce()
        extractor = SalesForceExtractor()
        extractor.init(Scoped.get_scoped_conf(conf=conf, scope=extractor.get_scope()))

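        # One extract() call per configured object name, in configuration order.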
        results = [extractor.extract(), extractor.extract()]
        for result in results:
            self.assertIsInstance(result, TableMetadata)

        expecteds = [
            TableMetadata(
                "salesforce",
                "gold",
                "default",
                "Account",
                None,
                [
                    ColumnMetadata("Id", "The Account Id", "id", 0, []),
                    ColumnMetadata("isDeleted", "Deleted?", "bool", 1, []),
                ],
                False,
                [],
            ),
            TableMetadata(
                "salesforce",
                "gold",
                "default",
                "Profile",
                None,
                [
                    # These columns are sorted alphabetically
                    ColumnMetadata("Business", "Important Bizness", "string", 0, []),
                    ColumnMetadata("Id", "The Profile Id", "id", 1, []),
                ],
                False,
                [],
            ),
        ]

        for result, expected in zip(results, expecteds):
            self.assertEqual(expected.__repr__(), result.__repr__())

        self.assertIsNone(extractor.extract())
Example #16
    def _retrieve_tables(self, dataset):
        # type: (Any) -> Any
        for page in self._page_table_list_results(dataset):
            if 'tables' not in page:
                continue

            for table in page['tables']:
                tableRef = table['tableReference']
                table = self.bigquery_service.tables().get(
                    projectId=tableRef['projectId'],
                    datasetId=tableRef['datasetId'],
                    tableId=tableRef['tableId']).execute(num_retries=BigQueryMetadataExtractor.NUM_RETRIES)

                # BigQuery tables also have interesting metadata about partitioning
                # data location (EU/US), mod/create time, etc... Extract that some other time?
                schema = table['schema']
                cols = []
                if 'fields' in schema:
                    total_cols = 0
                    for column in schema['fields']:
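                        # NOTE: _iterate_over_cols appends to cols as a side effect.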
                        total_cols = self._iterate_over_cols('', column, cols, total_cols + 1)

                table_meta = TableMetadata(
                    database='bigquery',
                    cluster=tableRef['projectId'],
                    schema_name=tableRef['datasetId'],
                    name=tableRef['tableId'],
                    description=table.get('description', ''),
                    columns=cols,
                    is_view=table['type'] == 'VIEW')

                yield table_meta
Example #17
    def test_hive_parser_with_failures(self) -> None:
        transformer = ComplexTypeTransformer()
        config = ConfigFactory.from_dict({
            PARSING_FUNCTION:
            'databuilder.utils.hive_complex_type_parser.parse_hive_type',
        })
        transformer.init(conf=config)

        column = ColumnMetadata('col1', 'array type', 'array<array<int>>', 0)
        table_metadata = TableMetadata('hive', 'gold', 'test_schema',
                                       'test_table', 'test_table', [column])

        default_scalar_type = ScalarTypeMetadata(name='col1',
                                                 parent=column,
                                                 type_str='array<array<int>>')

        with patch.object(transformer, '_parsing_function') as mock:
            mock.side_effect = MagicMock(
                side_effect=Exception('Could not parse'))

            result = transformer.transform(table_metadata)

            self.assertEqual(transformer.success_count, 0)
            self.assertEqual(transformer.failure_count, 1)
            for actual in result.columns:
                self.assertEqual(actual.get_type_metadata(),
                                 default_scalar_type)
Example #18
    def test_hive_parser_usage(self) -> None:
        transformer = ComplexTypeTransformer()
        config = ConfigFactory.from_dict({
            PARSING_FUNCTION:
            'databuilder.utils.hive_complex_type_parser.parse_hive_type',
        })
        transformer.init(conf=config)

        column = ColumnMetadata('col1', 'array type', 'array<array<int>>', 0)
        table_metadata = TableMetadata('hive', 'gold', 'test_schema',
                                       'test_table', 'test_table', [column])
        array_type = ArrayTypeMetadata(name='col1',
                                       parent=column,
                                       type_str='array<array<int>>')
        inner_array = ArrayTypeMetadata(name='_inner_',
                                        parent=array_type,
                                        type_str='array<int>')

        array_type.array_inner_type = inner_array

        result = transformer.transform(table_metadata)

        for actual in result.columns:
            self.assertTrue(
                isinstance(actual.get_type_metadata(), TypeMetadata))
            self.assertEqual(actual.get_type_metadata(), array_type)
            self.assertEqual(transformer.success_count, 1)
            self.assertEqual(transformer.failure_count, 0)
Example #19
    def _get_extract_iter(self):
        # type: () -> Iterator[TableMetadata]
        """
        Groups the row-level iterator by table using itertools.groupby and yields one TableMetadata per table
        :return:
        """
        for _, group in groupby(self._get_raw_extract_iter(), self._get_table_key):
            columns = []

            for row in group:
                last_row = row
                columns.append(
                    ColumnMetadata(
                        row["col_name"],
                        row["col_description"],
                        row["col_type"],
                        row["col_sort_order"],
                    )
                )

            yield TableMetadata(
                self._database,
                self._cluster,
                last_row["schema"],
                last_row["name"],
                last_row["description"],
                columns,
                is_view=bool(last_row["is_view"]),
            )
Example #20
    def _get_extract_iter(self) -> Iterator[TableMetadata]:
        """
        Retrieves all tables and yields a TableMetadata for each
        :return:
        """
        keyspaces = self._get_keyspaces()
        for keyspace in keyspaces:
            # Skip Cassandra's internal keyspaces (names starting with 'system')
            if keyspace.startswith('system'):
                continue
            for table in self._get_tables(keyspace):
                if self._filter and not self._filter(keyspace, table):
                    continue

                columns = []

                columns_dict = self._get_columns(keyspace, table)
                for idx, (column_name, column) in enumerate(columns_dict.items()):
                    columns.append(ColumnMetadata(
                        column_name,
                        None,
                        column.cql_type,
                        idx
                    ))

                yield TableMetadata(
                    'cassandra',
                    self._cluster,
                    keyspace,
                    table,
                    None,
                    columns
                )
Example #21
    def _get_extract_iter(self):
        # type: () -> Iterator[TableMetadata]
        """
        Retrieves all tables and yields a TableMetadata for each
        :return:
        """
        for row in self._get_raw_extract_iter():
            columns = []

            for i in range(len(row['StorageDescriptor']['Columns'])):
                column = row['StorageDescriptor']['Columns'][i]
                columns.append(ColumnMetadata(
                    column['Name'],
                    column['Comment'] if 'Comment' in column else None,
                    column['Type'],
                    i
                ))

            yield TableMetadata(
                'glue',
                self._cluster,
                row['DatabaseName'],
                row['Name'],
                row['Description'] if 'Description' in row else None,
                columns
            )
Example #22
    def setUp(self) -> None:
        self.maxDiff = None
        super(TestQuery, self).setUp()
        self.user = User(first_name='test_first',
                         last_name='test_last',
                         full_name='test_first test_last',
                         email='*****@*****.**',
                         github_username='******',
                         team_name='test_team',
                         employee_type='FTE',
                         manager_email='*****@*****.**',
                         slack_id='slack',
                         is_active=True,
                         profile_url='https://profile',
                         updated_at=1,
                         role_name='swe')
        self.table_metadata = TableMetadata(
            'hive', 'gold', 'test_schema1', 'test_table1', 'test_table1', [
                ColumnMetadata('test_id1', 'description of test_table1',
                               'bigint', 0),
                ColumnMetadata('test_id2', 'description of test_id2', 'bigint',
                               1),
                ColumnMetadata('is_active', None, 'boolean', 2),
                ColumnMetadata('source', 'description of source', 'varchar',
                               3),
                ColumnMetadata('etl_created_at',
                               'description of etl_created_at', 'timestamp',
                               4),
                ColumnMetadata('ds', None, 'varchar', 5)
            ])
        self.sql = "select * from table"
        self.query_metadata = QueryMetadata(sql=self.sql,
                                            tables=[self.table_metadata],
                                            user=self.user)
        self._query_hash = 'da44ff72560e593a8eca9ffcee6a2696'
Example #23
    def _get_extract_iter(self) -> Iterator[TableMetadata]:
        """
        Groups the row-level iterator by table using itertools.groupby and yields one TableMetadata per table
        :return:
        """
        for key, group in groupby(self._get_raw_extract_iter(), self._get_table_key):
            columns = []

            for row in group:
                last_row = row
                column = None
                if row['is_partition_col'] == 1:
                    # Add a badge to mark the partition column
                    column = ColumnMetadata(row['col_name'], row['col_description'],
                                            row['col_type'], row['col_sort_order'], [PARTITION_BADGE])
                else:
                    column = ColumnMetadata(row['col_name'], row['col_description'],
                                            row['col_type'], row['col_sort_order'])
                columns.append(column)
            is_view = last_row['is_view'] == 1
            yield TableMetadata('hive', self._cluster,
                                last_row['schema'],
                                last_row['name'],
                                last_row['description'],
                                columns,
                                is_view=is_view)
Example #24
    def _get_extract_iter(self) -> Iterator[TableMetadata]:
        """
        Retrieves all tables and yields a TableMetadata for each
        :return:
        """
        for row in self._get_raw_extract_iter():
            columns, i = [], 0

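            # Glue lists partition keys separately from regular columns; appending
            # them here gives every column a consecutive sort order.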
            for column in row['StorageDescriptor']['Columns'] \
                    + row.get('PartitionKeys', []):
                columns.append(ColumnMetadata(
                    column['Name'],
                    column['Comment'] if 'Comment' in column else None,
                    column['Type'],
                    i
                ))
                i += 1

            yield TableMetadata(
                'glue',
                self._cluster,
                row['DatabaseName'],
                row['Name'],
                row.get('Description') or row.get('Parameters', {}).get('comment'),
                columns,
                row.get('TableType') == 'VIRTUAL_VIEW',
            )
Example #25
    def test_create_table_metadata(self) -> None:
        scraped = ScrapedTableMetadata(schema="test_schema1", table="test_table1")
        scraped.set_columns([ScrapedColumnMetadata(name="a", description=None, data_type="string", sort_order=0),
                             ScrapedColumnMetadata(name="b", description=None, data_type="int", sort_order=1)])
        created_metadata = self.dExtractor.create_table_metadata(scraped)
        expected = TableMetadata("test_database", "test_cluster", "test_schema1", "test_table1", description=None,
                                 columns=[ColumnMetadata("a", None, "string", 0),
                                          ColumnMetadata("b", None, "int", 1)])
        self.assertEqual(str(expected), str(created_metadata))
Example #26
    def setUp(self) -> None:
        super(TestTableMetadata, self).setUp()
        TableMetadata.serialized_nodes_keys = set()
        TableMetadata.serialized_rels_keys = set()

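        # _set_up_type_metadata (a helper on this test case) builds the nested
        # array TypeMetadata for the column below.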
        column_with_type_metadata = ColumnMetadata(
            'has_nested_type', 'column with nested types',
            'array<array<array<string>>>', 6)
        column_with_type_metadata.set_column_key(
            'hive://gold.test_schema1/test_table1/has_nested_type')
        column_with_type_metadata.set_type_metadata(
            self._set_up_type_metadata(column_with_type_metadata))

        self.table_metadata = TableMetadata(
            'hive', 'gold', 'test_schema1', 'test_table1', 'test_table1', [
                ColumnMetadata('test_id1', 'description of test_table1',
                               'bigint', 0),
                ColumnMetadata('test_id2', 'description of test_id2', 'bigint',
                               1),
                ColumnMetadata('is_active', None, 'boolean', 2),
                ColumnMetadata('source', 'description of source', 'varchar',
                               3),
                ColumnMetadata('etl_created_at',
                               'description of etl_created_at', 'timestamp',
                               4),
                ColumnMetadata('ds', None, 'varchar', 5),
                column_with_type_metadata
            ])

        self.table_metadata2 = TableMetadata(
            'hive', 'gold', 'test_schema1', 'test_table1', 'test_table1', [
                ColumnMetadata('test_id1', 'description of test_table1',
                               'bigint', 0),
                ColumnMetadata('test_id2', 'description of test_id2', 'bigint',
                               1),
                ColumnMetadata('is_active', None, 'boolean', 2),
                ColumnMetadata('source', 'description of source', 'varchar',
                               3),
                ColumnMetadata('etl_created_at',
                               'description of etl_created_at', 'timestamp',
                               4),
                ColumnMetadata('ds', None, 'varchar', 5),
                column_with_type_metadata
            ])
Example #27
    def _retrieve_tables(self, dataset: DatasetRef) -> Any:
        grouped_tables: Set[str] = set([])

        for page in self._page_table_list_results(dataset):
            if 'tables' not in page:
                continue

            for table in page['tables']:
                tableRef = table['tableReference']
                table_id = tableRef['tableId']

                # BigQuery tables that have numeric suffix starting with a date string will be
                # considered date range tables.
                # ( e.g. ga_sessions_20190101, ga_sessions_20190102, etc. )
                if self._is_sharded_table(table_id):
                    # Sharded tables have numeric suffix starting with a date string
                    # and then we only need one schema definition
                    table_prefix = table_id[:-len(
                        self._get_sharded_table_suffix(table_id))]
                    if table_prefix in grouped_tables:
                        # If one table in the date range is processed, then ignore other ones
                        # (it adds too much metadata)
                        continue

                    table_id = table_prefix
                    grouped_tables.add(table_prefix)

                table = self.bigquery_service.tables().get(
                    projectId=tableRef['projectId'],
                    datasetId=tableRef['datasetId'],
                    tableId=tableRef['tableId']).execute(
                        num_retries=BigQueryMetadataExtractor.NUM_RETRIES)

                # BigQuery tables also have interesting metadata about partitioning
                # data location (EU/US), mod/create time, etc... Extract that some other time?
                cols: List[ColumnMetadata] = []
                # Not all tables have schemas
                if 'schema' in table:
                    schema = table['schema']
                    if 'fields' in schema:
                        total_cols = 0
                        for column in schema['fields']:
                            # TRICKY: this mutates :cols:
                            total_cols = self._iterate_over_cols(
                                '', column, cols, total_cols + 1)

                table_meta = TableMetadata(database='bigquery',
                                           cluster=tableRef['projectId'],
                                           schema=tableRef['datasetId'],
                                           name=table_id,
                                           description=table.get(
                                               'description', ''),
                                           columns=cols,
                                           is_view=table['type'] == 'VIEW')

                yield table_meta
Example #28
    def _retrieve_tables(self, dataset):
        # type: (Any) -> Any
        for page in self._page_table_list_results(dataset):
            if 'tables' not in page:
                continue

            for table in page['tables']:
                tableRef = table['tableReference']

                table_id = tableRef['tableId']

                # BigQuery tables that have 8 digits as last characters are
                # considered date range tables and are grouped together in the UI.
                # ( e.g. ga_sessions_20190101, ga_sessions_20190102, etc. )
                last_eight_chars = table_id[-BigQueryMetadataExtractor.
                                            DATE_LENGTH:]
                if last_eight_chars.isdigit():
                    # If the last eight characters are digits, we assume the table is of a table date range type
                    # and then we only need one schema definition
                    table_prefix = table_id[:-BigQueryMetadataExtractor.
                                            DATE_LENGTH]
                    if table_prefix in self.grouped_tables:
                        # If one table in the date range is processed, then ignore other ones
                        # (it adds too much metadata)
                        continue

                    table_id = table_prefix
                    self.grouped_tables.add(table_prefix)

                table = self.bigquery_service.tables().get(
                    projectId=tableRef['projectId'],
                    datasetId=tableRef['datasetId'],
                    tableId=tableRef['tableId']).execute(
                        num_retries=BigQueryMetadataExtractor.NUM_RETRIES)

                # BigQuery tables also have interesting metadata about partitioning
                # data location (EU/US), mod/create time, etc... Extract that some other time?
                schema = table['schema']
                cols = []
                if 'fields' in schema:
                    total_cols = 0
                    for column in schema['fields']:
                        total_cols = self._iterate_over_cols(
                            '', column, cols, total_cols + 1)

                table_meta = TableMetadata(database='bigquery',
                                           cluster=tableRef['projectId'],
                                           schema_name=tableRef['datasetId'],
                                           name=table_id,
                                           description=table.get(
                                               'description', ''),
                                           columns=cols,
                                           is_view=table['type'] == 'VIEW')

                yield table_meta
Example #29
    def _retrieve_tables(self, dataset: DatasetRef) -> Any:
        for page in self._page_table_list_results(dataset):
            if 'tables' not in page:
                continue

            for table in page['tables']:
                tableRef = table['tableReference']
                table_id = tableRef['tableId']

                # BigQuery tables that have 8 digits as last characters are
                # considered date range tables and are grouped together in the UI.
                # ( e.g. ga_sessions_20190101, ga_sessions_20190102, etc. )
                if self._is_sharded_table(table_id):
                    # If the last eight characters are digits, we assume the table is of a table date range type
                    # and then we only need one schema definition
                    table_prefix = table_id[:-BigQueryMetadataExtractor.DATE_LENGTH]
                    table_id = table_prefix
                    sharded_table_key = BigQueryMetadataExtractor.SHARDED_TABLE_KEY_FORMAT.format(
                        dataset_id=tableRef['datasetId'],
                        table_id=table_id)
                    if sharded_table_key in self.grouped_tables:
                        # If one table in the date range is processed, then ignore other ones
                        # (it adds too much metadata)
                        continue

                    self.grouped_tables.add(sharded_table_key)

                table = self.bigquery_service.tables().get(
                    projectId=tableRef['projectId'],
                    datasetId=tableRef['datasetId'],
                    tableId=tableRef['tableId']).execute(num_retries=BigQueryMetadataExtractor.NUM_RETRIES)

                # BigQuery tables also have interesting metadata about partitioning
                # data location (EU/US), mod/create time, etc... Extract that some other time?
                cols: List[ColumnMetadata] = []
                # Not all tables have schemas
                if 'schema' in table:
                    schema = table['schema']
                    if 'fields' in schema:
                        total_cols = 0
                        for column in schema['fields']:
                            # TRICKY: this mutates :cols:
                            total_cols = self._iterate_over_cols('', column, cols, total_cols + 1)

                table_meta = TableMetadata(
                    database='bigquery',
                    cluster=tableRef['projectId'],
                    schema=tableRef['datasetId'],
                    name=table_id,
                    description=table.get('description', ''),
                    columns=cols,
                    is_view=table['type'] == 'VIEW')

                yield table_meta
Example #30
    def test_extraction_with_default_conf(self, mock_columns, mock_tables,
                                          mock_keyspaces):
        # type: () -> None
        mock_keyspaces.return_value = {'test_schema': None}
        mock_tables.return_value = {'test_table': None}
        columns_dict = OrderedDict()
        columns_dict['id'] = CassandraColumnMetadata(None, 'id', 'int')
        columns_dict['txt'] = CassandraColumnMetadata(None, 'txt', 'text')
        mock_columns.return_value = columns_dict
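        # Column sort order follows the insertion order of the OrderedDict above.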

        extractor = CassandraExtractor()
        extractor.init(self.default_conf)
        actual = extractor.extract()
        expected = TableMetadata('cassandra', 'gold', 'test_schema',
                                 'test_table', None, [
                                     ColumnMetadata('id', None, 'int', 0),
                                     ColumnMetadata('txt', None, 'text', 1)
                                 ])
        self.assertEqual(expected.__repr__(), actual.__repr__())
        self.assertIsNone(extractor.extract())