def test_extraction_with_multiple_views(self) -> None:
        """
        Two view rows are served by the mocked connection; each ``extract()``
        call must hand back one view, in row order, and ``None`` once both
        rows have been consumed.
        """
        # (column name, column type) pairs, in declaration order, per view.
        view1_columns = [('xyz', 'varchar'), ('xyy', 'double'), ('aaa', 'int'), ('ab', 'varchar')]
        view2_columns = [('xyy', 'varchar'), ('ab', 'double'), ('aaa', 'int'), ('xyz', 'varchar')]

        def view_row(tbl_id: int, schema: str, name: str, pairs: list) -> dict:
            # The column list is shipped as base64-encoded JSON text.
            payload = {'columns': [{'name': col, 'type': col_type} for col, col_type in pairs]}
            encoded = base64.b64encode(json.dumps(payload).encode()).decode("utf-8")
            return {'tbl_id': tbl_id,
                    'schema': schema,
                    'name': name,
                    'tbl_type': 'virtual_view',
                    'view_original_text': encoded}

        def expected_view(schema: str, name: str, pairs: list) -> TableMetadata:
            # Sort order of each column is its position in the view definition.
            columns = [ColumnMetadata(col, None, col_type, position)
                       for position, (col, col_type) in enumerate(pairs)]
            return TableMetadata('presto', 'gold', schema, name, None, columns, True)

        connection = MagicMock()
        connection.execute.return_value = [
            view_row(2, 'test_schema2', 'test_view2', view2_columns),
            view_row(1, 'test_schema1', 'test_view1', view1_columns),
        ]

        with patch.object(SQLAlchemyExtractor, '_get_connection') as mock_connection:
            mock_connection.return_value = connection

            extractor = PrestoViewMetadataExtractor()
            extractor.init(self.conf)

            # Views are expected in the same order as the mocked rows.
            for schema, name, pairs in (('test_schema2', 'test_view2', view2_columns),
                                        ('test_schema1', 'test_view1', view1_columns)):
                self.assertEqual(repr(expected_view(schema, name, pairs)),
                                 repr(extractor.extract()))

            self.assertIsNone(extractor.extract())
# Example #2
# 0
    def test_extraction_one_object(self, mock_salesforce: Any) -> None:
        """
        Restrict the extractor to the ``Account`` object and check that exactly
        one table is produced, followed by ``None``.

        :param mock_salesforce: mock whose return value is replaced with a
            ``MockSalesForce`` instance (presumably injected by a ``patch``
            decorator outside this block -- confirm against the class).
        """
        # Configured once up front; a second identical assignment further down
        # was redundant because nothing consumes the mock in between.
        mock_salesforce.return_value = MockSalesForce()

        object_names_key = f"extractor.salesforce_metadata.{SalesForceExtractor.OBJECT_NAMES_KEY}"
        config_dict: Dict = {
            object_names_key: ["Account"],
            # Merged last, so entries in self.config win on a key clash.
            **self.config,
        }
        conf = ConfigFactory.from_dict(config_dict)

        extractor = SalesForceExtractor()
        extractor.init(Scoped.get_scoped_conf(conf=conf, scope=extractor.get_scope()))
        result = extractor.extract()
        self.assertIsInstance(result, TableMetadata)

        expected = TableMetadata(
            "salesforce",
            "gold",
            "default",
            "Account",
            None,
            [
                ColumnMetadata("Id", "The Account Id", "id", 0, []),
                ColumnMetadata("isDeleted", "Deleted?", "bool", 1, []),
            ],
            False,
            [],
        )

        self.assertEqual(repr(expected), repr(result))

        # Only one object was requested, so the extractor is now exhausted.
        self.assertIsNone(extractor.extract())
    def test_extraction_with_database_specified(self) -> None:
        """
        A single mocked query row must be extracted as one table whose
        database is ``self.database_key``; the next call returns ``None``.
        """
        row = {
            'schema': 'test_schema',
            'name': 'test_table',
            'description': 'a table for testing',
            'cluster': 'MY_CLUSTER',
            'is_view': 'false',
            'col_name': 'ds',
            'col_type': 'varchar',
            'col_description': None,
            'col_sort_order': 0
        }

        connection = MagicMock()
        connection.execute.return_value = [row]

        with patch.object(SQLAlchemyExtractor,
                          '_get_connection') as mock_connection:
            mock_connection.return_value = connection

            extractor = SnowflakeMetadataExtractor()
            extractor.init(self.conf)
            actual = extractor.extract()

            expected_columns = [ColumnMetadata('ds', None, 'varchar', 0)]
            expected = TableMetadata(
                self.database_key, 'MY_CLUSTER', 'test_schema', 'test_table',
                'a table for testing', expected_columns)

            self.assertEqual(repr(expected), repr(actual))
            self.assertIsNone(extractor.extract())
# Example #4
# 0
    def test_extraction_with_partition_badge(self) -> None:
        """
        With a partition badge label configured, the extracted table's last
        column (``partition_key1``) is expected to carry that label as a badge.
        """
        badge_label = "partition_key"
        conf = ConfigFactory.from_dict({
            GlueExtractor.PARTITION_BADGE_LABEL_KEY: badge_label,
        })

        # (name, description, type) of the non-partition columns, in sort order.
        plain_columns = [
            ('col_id1', 'description of id1', 'bigint'),
            ('col_id2', 'description of id2', 'bigint'),
            ('is_active', None, 'boolean'),
            ('source', 'description of source', 'varchar'),
            ('etl_created_at', 'description of etl_created_at', 'timestamp'),
            ('ds', None, 'varchar'),
        ]

        with patch.object(GlueExtractor, '_search_tables') as mock_search:
            mock_search.return_value = [test_table]

            extractor = GlueExtractor()
            extractor.init(conf=conf)
            actual = extractor.extract()

            expected_columns = [
                ColumnMetadata(name, description, col_type, position)
                for position, (name, description, col_type) in enumerate(plain_columns)
            ]
            # Only the partition column is given a badge list.
            expected_columns.append(
                ColumnMetadata(
                    'partition_key1',
                    'description of partition_key1',
                    'string',
                    6,
                    [badge_label],
                ))
            expected = TableMetadata(
                'glue', 'gold', 'test_schema', 'test_table',
                'a table for testing', expected_columns, False)

            self.assertEqual(repr(expected), repr(actual))
    def test_extraction_with_single_result(self) -> None:
        """
        Six column rows that share one table must be folded into a single
        ``TableMetadata``; a second ``extract()`` call returns ``None``.
        """
        cluster_conf_key = 'extractor.postgres_metadata.{}'.format(PostgresMetadataExtractor.CLUSTER_KEY)
        table_fields = {'schema': 'test_schema',
                        'name': 'test_table',
                        'description': 'a table for testing',
                        'cluster': self.conf[cluster_conf_key]}

        # (name, type, description) per column; list position is the sort order.
        column_specs = [
            ('col_id1', 'bigint', 'description of id1'),
            ('col_id2', 'bigint', 'description of id2'),
            ('is_active', 'boolean', None),
            ('source', 'varchar', 'description of source'),
            ('etl_created_at', 'timestamp', 'description of etl_created_at'),
            ('ds', 'varchar', None),
        ]
        # Each query row is a column dict combined with the shared table fields.
        rows = [
            self._union({'col_name': name,
                         'col_type': col_type,
                         'col_description': description,
                         'col_sort_order': order}, table_fields)
            for order, (name, col_type, description) in enumerate(column_specs)
        ]

        connection = MagicMock()
        connection.execute.return_value = rows

        with patch.object(SQLAlchemyExtractor, '_get_connection') as mock_connection:
            mock_connection.return_value = connection

            extractor = PostgresMetadataExtractor()
            extractor.init(self.conf)
            actual = extractor.extract()

            expected_columns = [
                ColumnMetadata('col_id1', 'description of id1', 'bigint', 0),
                ColumnMetadata('col_id2', 'description of id2', 'bigint', 1),
                ColumnMetadata('is_active', None, 'boolean', 2),
                ColumnMetadata('source', 'description of source', 'varchar', 3),
                ColumnMetadata('etl_created_at', 'description of etl_created_at', 'timestamp', 4),
                ColumnMetadata('ds', None, 'varchar', 5),
            ]
            expected = TableMetadata('postgres', 'MY_CLUSTER', 'test_schema', 'test_table',
                                     'a table for testing', expected_columns)

            self.assertEqual(repr(expected), repr(actual))
            self.assertIsNone(extractor.extract())
    def test_extraction_with_single_result(self,
                                           mock_connect: MagicMock) -> None:
        """
        Test extraction when the query returns the columns of a single table.

        :param mock_connect: mock whose return value is used as the database
            connection (presumably injected by a ``patch`` decorator outside
            this block -- confirm against the class).
        """
        # Table-level values appended to every column row. The explicit
        # annotations are kept from the original, which added them to avoid an
        # "Unsupported operand types for +" report on the concatenation below.
        table_values: List[Any] = [
            'DREMIO', 'Production', 'test_schema', 'test_table',
            'a table for testing', 'false'
        ]
        # [name, description, type, sort order] for each column.
        column_values: List[List[Any]] = [
            ['col_id1', 'description of id1', 'number', 0],
            ['col_id2', 'description of id2', 'number', 1],
            ['is_active', None, 'boolean', 2],
            ['source', 'description of source', 'varchar', 3],
            ['etl_created_at', 'description of etl_created_at', 'timestamp_ltz', 4],
            ['ds', None, 'varchar', 5],
        ]
        query_rows: List[List[Any]] = [column + table_values for column in column_values]

        cursor = MagicMock()
        # One single-element list per result column, in row order.
        cursor.description = [[label] for label in (
            'col_name', 'col_description', 'col_type', 'col_sort_order',
            'database', 'cluster', 'schema', 'name', 'description', 'is_view')]
        cursor.execute.return_value = query_rows

        connection = MagicMock()
        connection.cursor.return_value = cursor
        mock_connect.return_value = connection

        extractor = DremioMetadataExtractor()
        extractor.init(self.conf)

        actual = extractor.extract()
        expected = TableMetadata(
            'DREMIO', 'Production', 'test_schema', 'test_table',
            'a table for testing',
            [ColumnMetadata(*column) for column in column_values])

        self.assertEqual(repr(expected), repr(actual))
        self.assertIsNone(extractor.extract())
# Example #7
# 0
    def test_extraction_with_single_result(self) -> None:
        """
        One Glue table dict returned by the mocked search must be extracted as
        one ``TableMetadata``; the following call returns ``None``.
        """
        # (name, type, comment) per column. A None comment means the Glue
        # column dict has no 'Comment' key at all.
        column_specs = [
            ('col_id1', 'bigint', 'description of id1'),
            ('col_id2', 'bigint', 'description of id2'),
            ('is_active', 'boolean', None),
            ('source', 'varchar', 'description of source'),
            ('etl_created_at', 'timestamp', 'description of etl_created_at'),
            ('ds', 'varchar', None),
        ]

        glue_columns = []
        for name, col_type, comment in column_specs:
            column = {'Name': name, 'Type': col_type}
            if comment is not None:
                column['Comment'] = comment
            glue_columns.append(column)

        glue_table = {
            'Name': 'test_table',
            'DatabaseName': 'test_schema',
            'Description': 'a table for testing',
            'StorageDescriptor': {'Columns': glue_columns},
        }

        with patch.object(GlueExtractor, '_search_tables') as mock_search:
            mock_search.return_value = [glue_table]

            extractor = GlueExtractor()
            extractor.init(self.conf)
            actual = extractor.extract()

            expected_columns = [
                ColumnMetadata(name, comment, col_type, position)
                for position, (name, col_type, comment) in enumerate(column_specs)
            ]
            expected = TableMetadata(
                'glue', 'gold', 'test_schema', 'test_table',
                'a table for testing', expected_columns)

            self.assertEqual(repr(expected), repr(actual))
            self.assertIsNone(extractor.extract())
    def test_extraction_with_default_conf(self, mock_columns, mock_tables,
                                          mock_keyspaces) -> None:
        """
        With the default configuration, one keyspace holding one table with two
        columns must be extracted as a single ``TableMetadata``, then ``None``.

        The previous ``# type: () -> None`` comment declared no arguments even
        though three mocks are passed in; a return annotation replaces it so
        the signature is self-consistent.

        :param mock_columns: mock whose return value is the column mapping
        :param mock_tables: mock whose return value is the table mapping
        :param mock_keyspaces: mock whose return value is the keyspace mapping
            (all three presumably come from ``patch`` decorators outside this
            block -- confirm against the class).
        """
        mock_keyspaces.return_value = {'test_schema': None}
        mock_tables.return_value = {'test_table': None}

        # Ordered so that column sort order follows insertion order.
        columns_dict = OrderedDict()
        columns_dict['id'] = CassandraColumnMetadata(None, 'id', 'int')
        columns_dict['txt'] = CassandraColumnMetadata(None, 'txt', 'text')
        mock_columns.return_value = columns_dict

        extractor = CassandraExtractor()
        extractor.init(self.default_conf)
        actual = extractor.extract()
        expected = TableMetadata('cassandra', 'gold', 'test_schema',
                                 'test_table', None, [
                                     ColumnMetadata('id', None, 'int', 0),
                                     ColumnMetadata('txt', None, 'text', 1)
                                 ])
        self.assertEqual(repr(expected), repr(actual))
        self.assertIsNone(extractor.extract())
# Example #9
# 0
    def test_extractor_sorted_es_v7(self) -> None:
        """
        Using the sorted configuration against the v7 index fixture, draining
        the extractor must yield exactly one table with two columns.
        """
        extractor = self._get_extractor(self.config_sorted)

        # Stub out the version probe and the index lookup with v7 fixtures.
        extractor._get_es_version = lambda: self.es_version_v7
        extractor.es.indices.get = MagicMock(return_value=self.indices_v7)

        # Drain the extractor until it returns a falsy entry.
        extracted = []
        entry = extractor.extract()
        while entry:
            extracted.append(entry)
            entry = extractor.extract()

        expected_columns = [
            ColumnMetadata('keyword_property', '', 'keyword', 0, []),
            ColumnMetadata('long_property', '', 'long', 1, []),
        ]
        expected = TableMetadata('elasticsearch', 'cluster_name', 'schema_name', 'proper_index',
                                 None, expected_columns, False, [])

        self.assertEqual(1, len(extracted))
        self.assertEqual(repr(expected), repr(extracted[0]))
# Example #10
# 0
    def test_feature_view_extraction(self) -> None:
        """
        With programmatic descriptions disabled, the first extracted record
        must describe the ``driver_hourly_stats`` feature view.
        """
        self._init_extractor(programmatic_description_enabled=False)

        actual = self.extractor.extract()

        expected_columns = [
            ColumnMetadata("driver_id",
                           "Internal identifier of the driver", "INT64",
                           0),
            ColumnMetadata("conv_rate", None, "FLOAT", 1),
            ColumnMetadata("acc_rate", None, "FLOAT", 2),
            ColumnMetadata("avg_daily_trips", None, "INT64", 3),
        ]
        expected = TableMetadata(
            database="feast",
            cluster="local",
            schema="fs",
            name="driver_hourly_stats",
            description=None,
            columns=expected_columns,
        )

        self.assertEqual(repr(expected), repr(actual))
# Example #11
# 0
    def test_extraction_with_resource_link_result(self) -> None:
        """
        The mocked search returns a regular table plus a resource-link entry
        (one with a ``TargetTable``); only the regular table is expected to be
        extracted before the extractor returns ``None``.
        """
        resource_link = {
            "Name": "test_resource_link",
            "DatabaseName": "test_schema",
            "TargetTable": {
                "CatalogId": "111111111111",
                "DatabaseName": "test_schema_external",
                "Name": "test_table"
            },
            "CatalogId": "222222222222"
        }

        # (name, description, type) per expected column, in sort order.
        column_specs = [
            ('col_id1', 'description of id1', 'bigint'),
            ('col_id2', 'description of id2', 'bigint'),
            ('is_active', None, 'boolean'),
            ('source', 'description of source', 'varchar'),
            ('etl_created_at', 'description of etl_created_at', 'timestamp'),
            ('ds', None, 'varchar'),
            ('partition_key1', 'description of partition_key1', 'string'),
        ]

        with patch.object(GlueExtractor, '_search_tables') as mock_search:
            mock_search.return_value = [test_table, resource_link]

            extractor = GlueExtractor()
            extractor.init(self.conf)
            actual = extractor.extract()

            expected_columns = [
                ColumnMetadata(name, description, col_type, position)
                for position, (name, description, col_type) in enumerate(column_specs)
            ]
            expected = TableMetadata(
                'glue', 'gold', 'test_schema', 'test_table',
                'a table for testing', expected_columns, False)

            self.assertEqual(repr(expected), repr(actual))
            self.assertIsNone(extractor.extract())
    def test_feature_table_extraction(self) -> None:
        """
        With a single ``default`` project and the mocked feature table, one
        ``driver_trips`` table must be extracted, the ``driver_id`` entity must
        have been looked up in that project, and the next call returns ``None``.
        """
        self._init_extractor(programmatic_description_enabled=False)
        client = self.extractor._client
        client.list_projects.return_value = ["default"]
        self._mock_feature_table()

        actual = self.extractor.extract()
        client.get_entity.assert_called_with("driver_id", project="default")

        expected_columns = [
            ColumnMetadata("driver_id",
                           "Internal identifier of the driver", "INT64",
                           0),
            ColumnMetadata("trips_today", None, "INT32", 1),
        ]
        expected = TableMetadata(
            database="feast",
            cluster="unittest-feast-instance",
            schema="default",
            name="driver_trips",
            description=None,
            columns=expected_columns,
        )

        self.assertEqual(repr(expected), repr(actual))
        self.assertIsNone(self.extractor.extract())
    def test_extraction_with_multiple_result(self) -> None:
        """
        Column rows belonging to three tables are served in one result set;
        the extractor must return the tables one by one in row order, and
        ``None`` on every call after the third.
        """
        cluster = self.conf['extractor.mssql_metadata.{}'.format(
            MSSQLMetadataExtractor.CLUSTER_KEY)]

        # (schema, table name, table description, [(column, type, description)])
        table_specs = [
            ('test_schema1', 'test_table1', 'test table 1', [
                ('col_id1', 'bigint', 'description of col_id1'),
                ('col_id2', 'bigint', 'description of col_id2'),
                ('is_active', 'boolean', None),
                ('source', 'varchar', 'description of source'),
                ('etl_created_at', 'timestamp', 'description of etl_created_at'),
                ('ds', 'varchar', None),
            ]),
            ('test_schema1', 'test_table2', 'test table 2', [
                ('col_name', 'varchar', 'description of col_name'),
                ('col_name2', 'varchar', 'description of col_name2'),
            ]),
            ('test_schema2', 'test_table3', 'test table 3', [
                ('col_id3', 'varchar', 'description of col_id3'),
                ('col_name3', 'varchar', 'description of col_name3'),
            ]),
        ]

        # Flatten into one row per column; sort order restarts at 0 per table.
        query_rows = []
        for schema, table_name, table_description, columns in table_specs:
            table_fields = {
                'schema_name': schema,
                'name': table_name,
                'description': table_description,
                'cluster': cluster
            }
            for order, (col_name, col_type, col_description) in enumerate(columns):
                query_rows.append(self._union(
                    {
                        'col_name': col_name,
                        'col_type': col_type,
                        'col_description': col_description,
                        'col_sort_order': order
                    }, table_fields))

        connection = MagicMock()
        connection.execute.return_value = query_rows

        with patch.object(SQLAlchemyExtractor,
                          '_get_connection') as mock_connection:
            mock_connection.return_value = connection

            extractor = MSSQLMetadataExtractor()
            extractor.init(self.conf)

            for schema, table_name, table_description, columns in table_specs:
                expected = TableMetadata(
                    'mssql', cluster, schema, table_name, table_description, [
                        ColumnMetadata(col_name, col_description, col_type, order)
                        for order, (col_name, col_type, col_description)
                        in enumerate(columns)
                    ], False, [schema])
                actual = repr(extractor.extract())
                self.assertEqual(repr(expected), actual)

            # Checked twice: the exhausted extractor must keep returning None.
            self.assertIsNone(extractor.extract())
            self.assertIsNone(extractor.extract())
    def test_extraction_with_single_result(self) -> None:
        """
        Six column rows of one table, one of them flagged as a partition
        column, must be extracted as a single ``TableMetadata`` in which the
        flagged column is expected to carry a ``'partition column'`` badge.
        """
        table_fields = {
            'schema': 'test_schema',
            'name': 'test_table',
            'description': 'a table for testing',
            'is_view': 0
        }

        # (name, type, description, is_partition_col); position is sort order.
        column_specs = [
            ('col_id1', 'bigint', 'description of id1', 0),
            ('col_id2', 'bigint', 'description of id2', 0),
            ('is_active', 'boolean', None, 1),
            ('source', 'varchar', 'description of source', 0),
            ('etl_created_at', 'timestamp', 'description of etl_created_at', 0),
            ('ds', 'varchar', None, 0),
        ]
        query_rows = [
            self._union(
                {
                    'col_name': name,
                    'col_type': col_type,
                    'col_description': description,
                    'col_sort_order': order,
                    'is_partition_col': partition_flag
                }, table_fields)
            for order, (name, col_type, description, partition_flag)
            in enumerate(column_specs)
        ]

        connection = MagicMock()
        connection.execute.return_value = query_rows

        with patch.object(SQLAlchemyExtractor,
                          '_get_connection') as mock_connection:
            mock_connection.return_value = connection

            extractor = HiveTableMetadataExtractor()
            extractor.init(self.conf)
            actual = extractor.extract()

            expected_columns = [
                ColumnMetadata('col_id1', 'description of id1', 'bigint', 0),
                ColumnMetadata('col_id2', 'description of id2', 'bigint', 1),
                # The only row with is_partition_col set.
                ColumnMetadata('is_active', None, 'boolean', 2,
                               ['partition column']),
                ColumnMetadata('source', 'description of source', 'varchar', 3),
                ColumnMetadata('etl_created_at',
                               'description of etl_created_at',
                               'timestamp', 4),
                ColumnMetadata('ds', None, 'varchar', 5)
            ]
            expected = TableMetadata(
                'hive',
                'gold',
                'test_schema',
                'test_table',
                'a table for testing',
                expected_columns,
                is_view=False)

            self.assertEqual(repr(expected), repr(actual))
            self.assertIsNone(extractor.extract())
# Example #15
# 0
    def test_extraction_with_multiple_result(self) -> None:
        """Extract four Glue entries in sequence and verify each result.

        ``GlueExtractor._search_tables`` is patched to return the module-level
        ``test_table`` fixture followed by two ``EXTERNAL_TABLE`` entries and
        one ``VIRTUAL_VIEW`` entry. Every ``extract()`` call is compared, via
        ``repr``, with the ``TableMetadata`` expected at that position; two
        further calls must both return ``None``.
        """

        def glue_columns(*specs):
            # Each spec is a (Name, Type, Comment) triple.
            return [{'Name': name, 'Type': col_type, 'Comment': comment}
                    for name, col_type, comment in specs]

        second_table = {
            'Name': 'test_table2',
            'DatabaseName': 'test_schema1',
            'Description': 'test table 2',
            'StorageDescriptor': {
                'Columns': glue_columns(
                    ('col_name', 'varchar', 'description of col_name'),
                    ('col_name2', 'varchar', 'description of col_name2'),
                ),
            },
            'TableType': 'EXTERNAL_TABLE',
        }
        # This entry has no 'Description'; the description expected for it
        # below equals the 'comment' value stored under 'Parameters'.
        third_table = {
            'Name': 'test_table3',
            'DatabaseName': 'test_schema2',
            'StorageDescriptor': {
                'Columns': glue_columns(
                    ('col_id3', 'varchar', 'description of col_id3'),
                    ('col_name3', 'varchar', 'description of col_name3'),
                ),
            },
            'Parameters': {
                'comment': 'description of test table 3 from comment',
            },
            'TableType': 'EXTERNAL_TABLE',
        }
        only_view = {
            'Name': 'test_view1',
            'DatabaseName': 'test_schema1',
            'Description': 'test view 1',
            'StorageDescriptor': {
                'Columns': glue_columns(
                    ('col_id3', 'varchar', 'description of col_id3'),
                    ('col_name3', 'varchar', 'description of col_name3'),
                ),
            },
            'TableType': 'VIRTUAL_VIEW',
        }

        # Expected records, in the order the patched search returns them.
        # Only the VIRTUAL_VIEW entry carries True as its last argument.
        expected_in_order = [
            TableMetadata(
                'glue', 'gold', 'test_schema', 'test_table',
                'a table for testing',
                [
                    ColumnMetadata('col_id1', 'description of id1', 'bigint', 0),
                    ColumnMetadata('col_id2', 'description of id2', 'bigint', 1),
                    ColumnMetadata('is_active', None, 'boolean', 2),
                    ColumnMetadata('source', 'description of source', 'varchar', 3),
                    ColumnMetadata('etl_created_at',
                                   'description of etl_created_at',
                                   'timestamp', 4),
                    ColumnMetadata('ds', None, 'varchar', 5),
                    ColumnMetadata('partition_key1',
                                   'description of partition_key1',
                                   'string', 6),
                ],
                False),
            TableMetadata(
                'glue', 'gold', 'test_schema1', 'test_table2', 'test table 2',
                [
                    ColumnMetadata('col_name', 'description of col_name', 'varchar', 0),
                    ColumnMetadata('col_name2', 'description of col_name2', 'varchar', 1),
                ],
                False),
            TableMetadata(
                'glue', 'gold', 'test_schema2', 'test_table3',
                'description of test table 3 from comment',
                [
                    ColumnMetadata('col_id3', 'description of col_id3', 'varchar', 0),
                    ColumnMetadata('col_name3', 'description of col_name3', 'varchar', 1),
                ],
                False),
            TableMetadata(
                'glue', 'gold', 'test_schema1', 'test_view1', 'test view 1',
                [
                    ColumnMetadata('col_id3', 'description of col_id3', 'varchar', 0),
                    ColumnMetadata('col_name3', 'description of col_name3', 'varchar', 1),
                ],
                True),
        ]

        with patch.object(GlueExtractor, '_search_tables') as search_mock:
            search_mock.return_value = [
                test_table,
                second_table,
                third_table,
                only_view,
            ]

            glue_extractor = GlueExtractor()
            glue_extractor.init(self.conf)

            for wanted in expected_in_order:
                self.assertEqual(repr(wanted), repr(glue_extractor.extract()))

            # After the four records are consumed, two further calls must
            # each yield None.
            for _ in range(2):
                self.assertIsNone(glue_extractor.extract())
    def test_extraction_with_single_result(self):
        # type: () -> None
        """Extract a single Athena table assembled from six column rows.

        ``SQLAlchemyExtractor._get_connection`` is patched so that
        ``execute`` returns six rows sharing the same table-level fields.
        The first ``extract()`` is compared, via ``repr``, with the expected
        ``TableMetadata``; the following call must return ``None``.
        """
        catalog_key = 'extractor.athena_metadata.{}'.format(AthenaMetadataExtractor.CATALOG_KEY)
        catalog = self.conf[catalog_key]

        # Table-level fields merged into every column row.
        table_fields = {
            'schema_name': 'test_schema',
            'name': 'test_table',
            'description': '',
            'cluster': catalog,
        }

        # (col_name, col_type, col_description, col_sort_order, extras)
        column_specs = [
            ('col_id1', 'bigint', 'description of id1', 0, None),
            ('col_id2', 'bigint', 'description of id2', 1, None),
            ('is_active', 'boolean', None, 2, None),
            ('source', 'varchar', 'description of source', 3, None),
            # No description here, only extras; the expected column below
            # carries 'partition key' as its description.
            ('etl_created_at', 'timestamp', None, 4, 'partition key'),
            ('ds', 'varchar', None, 5, None),
        ]
        result_rows = [
            self._union(
                {'col_name': name,
                 'col_type': col_type,
                 'col_description': description,
                 'col_sort_order': position,
                 'extras': extras},
                table_fields)
            for name, col_type, description, position, extras in column_specs
        ]

        with patch.object(SQLAlchemyExtractor, '_get_connection') as get_connection_mock:
            fake_connection = MagicMock()
            fake_connection.execute = MagicMock(return_value=result_rows)
            get_connection_mock.return_value = fake_connection

            athena_extractor = AthenaMetadataExtractor()
            athena_extractor.init(self.conf)
            extracted = athena_extractor.extract()

            wanted_columns = [
                ColumnMetadata('col_id1', 'description of id1', 'bigint', 0),
                ColumnMetadata('col_id2', 'description of id2', 'bigint', 1),
                ColumnMetadata('is_active', None, 'boolean', 2),
                ColumnMetadata('source', 'description of source', 'varchar', 3),
                ColumnMetadata('etl_created_at', 'partition key', 'timestamp', 4),
                ColumnMetadata('ds', None, 'varchar', 5),
            ]
            wanted = TableMetadata('athena', catalog, 'test_schema', 'test_table', '',
                                   wanted_columns)

            self.assertEqual(repr(wanted), repr(extracted))
            # Only one table was supplied, so the next call must yield None.
            self.assertIsNone(athena_extractor.extract())
Beispiel #17
0
    def test_extraction_with_multiple_result(self):
        # type: () -> None
        """Extract three Hive tables from one flat list of column rows.

        ``SQLAlchemyExtractor._get_connection`` is patched so that
        ``execute`` returns ten rows: six for ``test_table1``, two for
        ``test_table2`` and two for ``test_table3``. Each ``extract()`` call
        is compared, via ``repr``, with the expected ``TableMetadata``; two
        further calls must both return ``None``.
        """
        first_table = {
            'schema': 'test_schema1',
            'name': 'test_table1',
            'description': 'test table 1',
            'is_view': 0,
        }
        second_table = {
            'schema': 'test_schema1',
            'name': 'test_table2',
            'description': 'test table 2',
            'is_view': 0,
        }
        third_table = {
            'schema': 'test_schema2',
            'name': 'test_table3',
            'description': 'test table 3',
            'is_view': 0,
        }

        # (owning table, col_name, col_type, col_description, col_sort_order)
        column_specs = [
            (first_table, 'col_id1', 'bigint', 'description of col_id1', 0),
            (first_table, 'col_id2', 'bigint', 'description of col_id2', 1),
            (first_table, 'is_active', 'boolean', None, 2),
            (first_table, 'source', 'varchar', 'description of source', 3),
            (first_table, 'etl_created_at', 'timestamp',
             'description of etl_created_at', 4),
            (first_table, 'ds', 'varchar', None, 5),
            (second_table, 'col_name', 'varchar', 'description of col_name', 0),
            (second_table, 'col_name2', 'varchar', 'description of col_name2', 1),
            (third_table, 'col_id3', 'varchar', 'description of col_id3', 0),
            (third_table, 'col_name3', 'varchar', 'description of col_name3', 1),
        ]
        result_rows = [
            self._union(
                {
                    'col_name': name,
                    'col_type': col_type,
                    'col_description': description,
                    'col_sort_order': position,
                },
                owner)
            for owner, name, col_type, description, position in column_specs
        ]

        # Expected records, in the order the rows are grouped above.
        expected_in_order = [
            TableMetadata(
                'hive', 'gold', 'test_schema1', 'test_table1', 'test table 1',
                [
                    ColumnMetadata('col_id1', 'description of col_id1', 'bigint', 0),
                    ColumnMetadata('col_id2', 'description of col_id2', 'bigint', 1),
                    ColumnMetadata('is_active', None, 'boolean', 2),
                    ColumnMetadata('source', 'description of source', 'varchar', 3),
                    ColumnMetadata('etl_created_at',
                                   'description of etl_created_at',
                                   'timestamp', 4),
                    ColumnMetadata('ds', None, 'varchar', 5),
                ],
                is_view=False),
            TableMetadata(
                'hive', 'gold', 'test_schema1', 'test_table2', 'test table 2',
                [
                    ColumnMetadata('col_name', 'description of col_name', 'varchar', 0),
                    ColumnMetadata('col_name2', 'description of col_name2', 'varchar', 1),
                ],
                is_view=False),
            TableMetadata(
                'hive', 'gold', 'test_schema2', 'test_table3', 'test table 3',
                [
                    ColumnMetadata('col_id3', 'description of col_id3', 'varchar', 0),
                    ColumnMetadata('col_name3', 'description of col_name3', 'varchar', 1),
                ],
                is_view=False),
        ]

        with patch.object(SQLAlchemyExtractor,
                          '_get_connection') as get_connection_mock:
            fake_connection = MagicMock()
            fake_connection.execute = MagicMock(return_value=result_rows)
            get_connection_mock.return_value = fake_connection

            hive_extractor = HiveTableMetadataExtractor()
            hive_extractor.init(self.conf)

            for wanted in expected_in_order:
                self.assertEqual(repr(wanted), repr(hive_extractor.extract()))

            # After the three tables are consumed, two further calls must
            # each yield None.
            for _ in range(2):
                self.assertIsNone(hive_extractor.extract())
    def test_extraction_with_multiple_result(self) -> None:
        """Extract three Snowflake tables from one flat list of column rows.

        ``SQLAlchemyExtractor._get_connection`` is patched so that
        ``execute`` returns ten rows: six for ``test_table1``, two for
        ``test_table2`` and two for ``test_table3``. The rows carry
        ``is_view`` values of ``'nottrue'``, ``'false'`` and ``'true'``
        respectively; only the last expected record is built with ``True``.
        Each ``extract()`` call is compared, via ``repr``, with the expected
        ``TableMetadata``; two further calls must both return ``None``.
        """
        cluster_key = 'extractor.snowflake_metadata.{}'.format(
            SnowflakeMetadataExtractor.CLUSTER_KEY)

        first_table = {
            'schema': 'test_schema1',
            'name': 'test_table1',
            'description': 'test table 1',
            'cluster': self.conf[cluster_key],
            'is_view': 'nottrue',
        }
        second_table = {
            'schema': 'test_schema1',
            'name': 'test_table2',
            'description': 'test table 2',
            'cluster': self.conf[cluster_key],
            'is_view': 'false',
        }
        third_table = {
            'schema': 'test_schema2',
            'name': 'test_table3',
            'description': 'test table 3',
            'cluster': self.conf[cluster_key],
            'is_view': 'true',
        }

        # (owning table, col_name, col_type, col_description, col_sort_order)
        column_specs = [
            (first_table, 'col_id1', 'number', 'description of col_id1', 0),
            (first_table, 'col_id2', 'number', 'description of col_id2', 1),
            (first_table, 'is_active', 'boolean', None, 2),
            (first_table, 'source', 'varchar', 'description of source', 3),
            (first_table, 'etl_created_at', 'timestamp_ltz',
             'description of etl_created_at', 4),
            (first_table, 'ds', 'varchar', None, 5),
            (second_table, 'col_name', 'varchar', 'description of col_name', 0),
            (second_table, 'col_name2', 'varchar', 'description of col_name2', 1),
            (third_table, 'col_id3', 'varchar', 'description of col_id3', 0),
            (third_table, 'col_name3', 'varchar', 'description of col_name3', 1),
        ]
        result_rows = [
            self._union(
                {
                    'col_name': name,
                    'col_type': col_type,
                    'col_description': description,
                    'col_sort_order': position,
                },
                owner)
            for owner, name, col_type, description, position in column_specs
        ]

        # Expected records, in the order the rows are grouped above.
        expected_in_order = [
            TableMetadata(
                'snowflake', self.conf[cluster_key], 'test_schema1',
                'test_table1', 'test table 1',
                [
                    ColumnMetadata('col_id1', 'description of col_id1', 'number', 0),
                    ColumnMetadata('col_id2', 'description of col_id2', 'number', 1),
                    ColumnMetadata('is_active', None, 'boolean', 2),
                    ColumnMetadata('source', 'description of source', 'varchar', 3),
                    ColumnMetadata('etl_created_at',
                                   'description of etl_created_at',
                                   'timestamp_ltz', 4),
                    ColumnMetadata('ds', None, 'varchar', 5),
                ]),
            TableMetadata(
                'snowflake', self.conf[cluster_key], 'test_schema1',
                'test_table2', 'test table 2',
                [
                    ColumnMetadata('col_name', 'description of col_name', 'varchar', 0),
                    ColumnMetadata('col_name2', 'description of col_name2', 'varchar', 1),
                ]),
            TableMetadata(
                'snowflake', self.conf[cluster_key], 'test_schema2',
                'test_table3', 'test table 3',
                [
                    ColumnMetadata('col_id3', 'description of col_id3', 'varchar', 0),
                    ColumnMetadata('col_name3', 'description of col_name3', 'varchar', 1),
                ],
                True),
        ]

        with patch.object(SQLAlchemyExtractor,
                          '_get_connection') as get_connection_mock:
            fake_connection = MagicMock()
            fake_connection.execute = MagicMock(return_value=result_rows)
            get_connection_mock.return_value = fake_connection

            snowflake_extractor = SnowflakeMetadataExtractor()
            snowflake_extractor.init(self.conf)

            for wanted in expected_in_order:
                self.assertEqual(repr(wanted),
                                 repr(snowflake_extractor.extract()))

            # After the three tables are consumed, two further calls must
            # each yield None.
            for _ in range(2):
                self.assertIsNone(snowflake_extractor.extract())