def test_multiple_results(self, mock_client):
        """
        Two rows returned for the same schema/table should produce a single
        result whose first two columns match the mocked rows.
        """
        # (name, data type, sort order) for each mocked column row.
        expected_cols = [
            ("col1", "int", "1"),
            ("col2", "char", "2"),
        ]

        # Walk the mocked client down to the object yielded by
        # ``with database.snapshot() as snapshot`` and stub its query result.
        instance = mock_client.return_value.instance.return_value
        database = instance.database.return_value
        snapshot = database.snapshot.return_value.__enter__.return_value
        snapshot.execute_sql.return_value = [
            [name, data_type, sort_order, self.schema, self.table]
            for name, data_type, sort_order in expected_cols
        ]

        extractor = SpannerMetadataExtractor()
        scoped_conf = Scoped.get_scoped_conf(conf=self.conf,
                                             scope=extractor.get_scope())
        extractor.init(scoped_conf)
        result = extractor.extract()

        assert result.database == self.connection_name
        assert result.cluster == self.project_id
        assert result.schema == f"{self.instance_id}.{self.database_id}"
        assert result.name == self.table

        # Columns are compared through their repr, in row order.
        for index, (name, data_type, sort_order) in enumerate(expected_cols):
            expected_column = ColumnMetadata(name, None, data_type,
                                             sort_order, None)
            self.assertEqual(repr(result.columns[index]),
                             repr(expected_column))
    def test_format_for_markdown(self):
        table_metadata = TableMetadata(
            database='test_database',
            cluster='test_cluster',
            schema='test_schema',
            name='test_table',
            columns=[
                ColumnMetadata(
                    name='test_column_1',
                    description=None,
                    data_type='INTEGER',
                    sort_order=1,
                ),
                ColumnMetadata(
                    name='test_column_2',
                    description=None,
                    data_type='BOOLEAN',
                    sort_order=2,
                ),
            ],
        )

        expected = """# `test_schema.test_table`
`test_database` | `test_cluster`

## Column details
* [INTEGER]   `test_column_1`
* [BOOLEAN]   `test_column_2`
"""

        self.assertEqual(table_metadata.format_for_markdown(), expected)
    def _get_extract_iter(self) -> Iterator[TableMetadata]:
        """
        Group the raw rows with itertools.groupby on the table key and yield
        one TableMetadata per group. Non-empty descriptions are passed through
        unidecode; empty ones become None.
        :return: iterator of TableMetadata
        """
        raw_rows = self._get_raw_extract_iter()
        for _, table_rows in groupby(raw_rows, self._get_table_key):
            columns = []
            for row in table_rows:
                raw_col_description = row["col_description"]
                if raw_col_description:
                    col_description = unidecode(raw_col_description)
                else:
                    col_description = None
                columns.append(
                    ColumnMetadata(
                        name=row["col_name"],
                        description=col_description,
                        data_type=row["data_type"],
                        sort_order=row["col_sort_order"],
                    ))
                # Table-level fields are read from the last row of the group.
                last_row = row

            raw_description = last_row["description"]
            if raw_description:
                table_description = unidecode(raw_description)
            else:
                table_description = None

            yield TableMetadata(
                database=self._database,
                cluster=last_row["cluster"],
                schema=last_row["schema"],
                name=last_row["name"],
                description=table_description,
                columns=columns,
                is_view=last_row["is_view"] == "true",
            )
Beispiel #4
0
    def _get_extract_iter(self):
        # type: () -> Iterator[TableMetadata]
        """
        Group the raw rows by table key with itertools.groupby and yield one
        TableMetadata per group, using this extractor's configured cluster.
        :return: iterator of TableMetadata
        """
        raw_rows = self._get_raw_extract_iter()
        for _, table_rows in groupby(raw_rows, self._get_table_key):
            rows = list(table_rows)
            # Table-level fields are read from the last row of the group.
            table_row = rows[-1]
            columns = [
                ColumnMetadata(
                    row["col_name"],
                    row["col_description"],
                    row["data_type"],
                    row["col_sort_order"],
                )
                for row in rows
            ]

            yield TableMetadata(
                self._database,
                self._cluster,
                table_row["schema"],
                table_row["name"],
                table_row["description"],
                columns,
                is_view=bool(table_row["is_view"]),
            )
Beispiel #5
0
    def _get_extract_iter(self) -> Iterator[TableMetadata]:
        """
        Group the raw rows by table key with itertools.groupby and yield one
        TableMetadata per group; the cluster comes from the row data.
        :return: iterator of TableMetadata
        """
        grouped = groupby(self._get_raw_extract_iter(), self._get_table_key)
        for _, table_rows in grouped:
            columns = []
            table_row = None

            for table_row in table_rows:
                column = ColumnMetadata(
                    table_row["col_name"],
                    table_row["col_description"],
                    table_row["data_type"],
                    table_row["col_sort_order"],
                )
                columns.append(column)

            # After the loop ``table_row`` is the last row of the group; it
            # supplies the table-level fields.
            # Deviating from amundsen to add `is_view`
            yield TableMetadata(
                self._database,
                table_row["cluster"],
                table_row["schema"],
                table_row["name"],
                table_row["description"],
                columns,
                table_row["is_view"],
            )
Beispiel #6
0
    def _iterate_over_cols(
        self,
        tags_dict: dict,
        parent: str,
        column: dict,
        cols: List[ColumnMetadata],
        total_cols: int,
    ) -> int:
        """
        Append ``column`` to ``cols`` and, when its type is "RECORD", recurse
        into its nested fields depth-first.

        :param tags_dict: optional mapping; when it is truthy and has a "tags"
            entry, the last tag whose "column" equals this column's dotted
            name is attached to the column.
        :param parent: dotted name of the enclosing column, or an empty string
            for a top-level column.
        :param column: mapping describing the field; reads "name", "type",
            optional "description" and, for "RECORD" columns, "fields".
        :param cols: output list; mutated in place.
        :param total_cols: sort order to assign to this column.
        :return: the sort order to use for the next column.
        """
        # Nested fields are addressed as "<parent>.<field>".
        if parent:
            col_name = "{parent}.{field}".format(parent=parent,
                                                 field=column["name"])
        else:
            col_name = column["name"]

        # Keep the last matching tag, as the original loop did.
        tags = None
        if tags_dict and "tags" in tags_dict:
            for tag in tags_dict["tags"]:
                if "column" in tag and tag["column"] == col_name:
                    tags = tag

        # The column itself is always recorded, RECORD or not.
        cols.append(
            ColumnMetadata(
                name=col_name,
                description=column.get("description", ""),
                data_type=column["type"],
                sort_order=total_cols,
                tags=tags,
            ))
        total_cols += 1

        # RECORD columns carry nested fields that continue the numbering.
        if column["type"] == "RECORD":
            for field in column["fields"]:
                total_cols = self._iterate_over_cols(tags_dict, col_name,
                                                     field, cols, total_cols)
        return total_cols
    def get_table_metadata(
        self,
        schema: str,
        table: str,
        cluster: Optional[str] = None,
        is_view_query_enabled: Optional[bool] = False,
    ):
        """
        Build a TableMetadata for one table from a ``show columns`` query.

        :param schema: schema name of the table.
        :param table: table name.
        :param cluster: optional cluster, passed to the schema-address helper
            and stored on the result.
        :param is_view_query_enabled: when truthy, an extra
            information_schema query decides ``is_view``; otherwise it is
            False.
        :return: TableMetadata with a None description and one ColumnMetadata
            per result row, numbered in result order.
        """
        # Fully qualified table address used in the column query.
        schema_address = self._get_full_schema_address(cluster, schema)
        table_address = "{}.{}".format(schema_address, table)

        # Column type + partition information; first result row is the header.
        result_rows = self.execute(
            "show columns in {}".format(table_address), has_header=True)
        field_names = next(result_rows)

        columns = []
        for position, values in enumerate(result_rows):
            record = dict(zip(field_names, values))
            column = ColumnMetadata(
                name=record["Column"],
                description=record["Comment"],
                data_type=record["Type"],
                sort_order=position,
                is_partition_column=record["Extra"] == "partition key",
            )
            columns.append(column)

        is_view = False
        if is_view_query_enabled:
            # Ask information_schema whether the table is a view.
            view_query = """
                select table_type
                from information_schema.tables
                where table_schema='{table_schema}'
                  and table_name='{table_name}'
                """.format(
                table_schema=schema, table_name=table
            )
            first_row = next(self.execute(view_query, has_header=False))
            is_view = first_row[0] == "VIEW"

        return TableMetadata(
            database=self._database,
            cluster=cluster,
            schema=schema,
            name=table,
            description=None,
            columns=columns,
            is_view=is_view,
        )
Beispiel #8
0
    def _get_extract_iter(self):
        """
        Yield one TableMetadata per query result.

        The query runs only when ``self.results`` is not set yet. A column is
        flagged as a partition column when its name appears among the
        result's watermark partition keys.
        """
        with self.driver.session() as session:
            if not hasattr(self, "results"):
                self.results = session.read_transaction(self._execute_query)

            for record in self.results:
                # Partition keys named by the watermarks.
                partition_keys = [
                    watermark["partition_key"]
                    for watermark in record["watermarks"]
                ]

                # The per-column attributes arrive as parallel lists;
                # zip_longest pads shorter lists with None.
                column_fields = zip_longest(
                    record["column_names"],
                    record["column_descriptions"],
                    record["column_types"],
                    record["column_sort_orders"],
                )
                columns = [
                    ColumnMetadata(
                        name=name,
                        description=description,
                        data_type=data_type,
                        sort_order=sort_order,
                        is_partition_column=name in partition_keys,
                    )
                    for name, description, data_type, sort_order
                    in column_fields
                ]

                yield TableMetadata(
                    database=record["database"],
                    cluster=record["cluster"],
                    schema=record["schema"],
                    name=record["name"],
                    description=record["description"],
                    columns=columns,
                    is_view=record["is_view"],
                    tags=record["tags"],
                )
Beispiel #9
0
    def test_extraction_with_single_result(self):
        """
        A single fetched row should be extracted as one table with one
        column, after which the extractor returns None.
        """
        with patch.object(
            splice_machine_metadata_extractor, "splice_connect"
        ) as mock_connect:
            expected_column = ColumnMetadata("column1", None, "int", 0)
            expected_table = TableMetadata(
                self.DATABASE,
                self.CLUSTER,
                "test_schema",
                "test_table",
                None,
                [expected_column],
            )

            # Mock chain: splice_connect(...) -> connection,
            # connection.cursor() -> cursor with execute() and fetchall().
            single_row = [
                expected_table.schema,
                expected_table.name,
                "not-a-view",
                expected_column.name,
                expected_column.sort_order,
                expected_column.type,
            ]
            cursor = MagicMock()
            cursor.execute = MagicMock()
            cursor.fetchall = MagicMock()
            cursor.fetchall.return_value = [single_row]

            connection = MagicMock()
            connection.cursor.return_value = cursor
            mock_connect.return_value = connection

            extractor = self.Extractor()
            extractor.init(self.conf)
            actual_table = extractor.extract()

            self.assertEqual(repr(expected_table), repr(actual_table))
            # Only one table was mocked, so the next extract is exhausted.
            self.assertIsNone(extractor.extract())
Beispiel #10
0
 def test_get_all_table_metadata_from_information_schema(
         self, mock_settings) -> None:
     """
     The first table yielded by the bulk information_schema call should match
     a TableMetadata built from the two mocked result rows.
     """
     # Show the full diff if the repr comparison fails.
     self.maxDiff = None
     self.engine.init(self.conf)
     self.engine.execute = MagicMock(
         side_effect=presto_engine_execute_side_effect)

     mocked_rows = (
         MOCK_INFORMATION_SCHEMA_RESULT_1,
         MOCK_INFORMATION_SCHEMA_RESULT_2,
     )
     mock_columns = [
         ColumnMetadata(
             name=row["col_name"],
             description=row["col_description"],
             data_type=row["data_type"],
             sort_order=row["col_sort_order"],
             is_partition_column=None,
         )
         for row in mocked_rows
     ]
     expected = TableMetadata(
         database=MOCK_DATABASE_NAME,
         cluster=MOCK_CLUSTER_NAME,
         schema=MOCK_SCHEMA_NAME,
         name=MOCK_TABLE_NAME,
         columns=mock_columns,
         is_view=bool(MOCK_INFORMATION_SCHEMA_RESULT_1["is_view"]),
     )

     tables = self.engine.get_all_table_metadata_from_information_schema(
         cluster=MOCK_CLUSTER_NAME)
     first_table = next(tables)
     self.assertEqual(repr(first_table), repr(expected))
Beispiel #11
0
    def _get_extract_iter(self) -> Iterator[TableMetadata]:
        """
        Yield one TableMetadata per raw row.

        Storage-descriptor columns are followed by any partition keys, and
        all of them are numbered consecutively from 0. The table is flagged
        as a view when its "TableType" is "VIRTUAL_VIEW".
        :return: iterator of TableMetadata
        """
        for row in self._get_raw_extract_iter():
            raw_columns = row["StorageDescriptor"]["Columns"] + row.get(
                "PartitionKeys", [])
            columns = [
                ColumnMetadata(
                    raw_column["Name"],
                    raw_column.get("Comment"),
                    raw_column["Type"],
                    position,
                )
                for position, raw_column in enumerate(raw_columns)
            ]

            if not self._is_location_parsing_enabled:
                catalog, schema, table = None, None, row["Name"]
            else:
                catalog, schema, table = self._parse_location(
                    location=row["StorageDescriptor"]["Location"],
                    name=row["Name"])

            # Prefix the database with the connection name when one is set.
            database = row["DatabaseName"]
            if self._connection_name:
                database = self._connection_name + "/" + database

            # Fall back to the "comment" parameter when there is no
            # description.
            description = (row.get("Description")
                           or row.get("Parameters", {}).get("comment"))
            is_view = row.get("TableType") == "VIRTUAL_VIEW"

            yield TableMetadata(
                database,
                catalog,
                schema,
                table,
                description,
                columns,
                is_view,
            )
Beispiel #12
0
    def _get_extract_iter(self):
        # type: () -> Iterator[TableMetadata]
        """
        Run the SQL statement inside a database snapshot, key each result row
        by the extractor's HEADER, group the rows by table key with
        itertools.groupby and yield one TableMetadata per group.
        :return: iterator of TableMetadata
        """

        with self.database.snapshot() as snapshot:
            field_names = SpannerMetadataExtractor.HEADER
            rows = [
                dict(zip(field_names, values))
                for values in snapshot.execute_sql(self.sql_stmt)
            ]
            # Every table is reported under "<instance id>.<database id>".
            schema = "{}.{}".format(self._instance_id, self._database_id)

            for _, table_rows in groupby(rows, self._get_table_key):
                grouped_rows = list(table_rows)
                columns = [
                    ColumnMetadata(
                        row["col_name"],
                        None,
                        row["data_type"],
                        row["col_sort_order"],
                    )
                    for row in grouped_rows
                ]

                yield TableMetadata(
                    database=self._connection_name or "spanner",
                    cluster=self._project_id,
                    schema=schema,
                    # The table name is read from the last row of the group.
                    name=grouped_rows[-1]["name"],
                    description=None,
                    columns=columns,
                )
Beispiel #13
0
    def test_extraction_with_single_result(self):
        # type: () -> None
        """
        Six rows belonging to one table should be extracted as a single
        TableMetadata with six columns, after which extract() returns None.
        """
        # (name, data type, description); sort order is the list position.
        column_specs = [
            ("col_id1", "number", "description of id1"),
            ("col_id2", "number", "description of id2"),
            ("is_active", "boolean", None),
            ("source", "varchar", "description of source"),
            ("etl_created_at", "timestamp_ltz",
             "description of etl_created_at"),
            ("ds", "varchar", None),
        ]

        with patch.object(SQLAlchemyExtractor,
                          "_get_connection") as mock_connection:
            connection = MagicMock()
            mock_connection.return_value = connection
            sql_execute = MagicMock()
            connection.execute = sql_execute

            # Table-level fields shared by every mocked row.
            table = {
                "schema": "test_schema",
                "name": "test_table",
                "description": "a table for testing",
                "cluster": self.conf[SnowflakeMetadataExtractor.CLUSTER_KEY],
                "is_view": "false",
            }
            sql_execute.return_value = [
                self._union(
                    {
                        "col_name": col_name,
                        "data_type": data_type,
                        "col_description": col_description,
                        "col_sort_order": sort_order,
                    },
                    table,
                )
                for sort_order, (col_name, data_type, col_description)
                in enumerate(column_specs)
            ]

            extractor = SnowflakeMetadataExtractor()
            extractor.init(self.conf)
            actual = extractor.extract()

            expected_columns = [
                ColumnMetadata(col_name, col_description, data_type,
                               sort_order)
                for sort_order, (col_name, data_type, col_description)
                in enumerate(column_specs)
            ]
            expected = TableMetadata(
                "prod",
                "MY_CLUSTER",
                "test_schema",
                "test_table",
                "a table for testing",
                expected_columns,
            )

            self.assertEqual(repr(expected), repr(actual))
            self.assertIsNone(extractor.extract())
    def test_extraction_with_multiple_result(self) -> None:
        """
        Rows spanning three tables should be extracted as three
        TableMetadata objects in row order, after which extract() keeps
        returning None.
        """
        # One entry per table: (schema, name, description, column specs).
        # Each column spec is (name, data type, description); its sort order
        # is its position within the table.
        fixtures = [
            ("test_schema1", "test_table1", "test table 1", [
                ("col_id1", "bigint", "description of col_id1"),
                ("col_id2", "bigint", "description of col_id2"),
                ("is_active", "boolean", None),
                ("source", "varchar", "description of source"),
                ("etl_created_at", "timestamp",
                 "description of etl_created_at"),
                ("ds", "varchar", None),
            ]),
            ("test_schema1", "test_table2", "test table 2", [
                ("col_name", "varchar", "description of col_name"),
                ("col_name2", "varchar", "description of col_name2"),
            ]),
            ("test_schema2", "test_table3", "test table 3", [
                ("col_id3", "varchar", "description of col_id3"),
                ("col_name3", "varchar", "description of col_name3"),
            ]),
        ]

        with patch.object(SQLAlchemyExtractor,
                          "_get_connection") as mock_connection:
            connection = MagicMock()
            mock_connection.return_value = connection
            sql_execute = MagicMock()
            connection.execute = sql_execute

            cluster = self.conf[PostgresMetadataExtractor.CLUSTER_KEY]

            # Flatten the fixtures into one mocked row per column.
            rows = []
            for schema, name, description, column_specs in fixtures:
                table = {
                    "schema": schema,
                    "name": name,
                    "description": description,
                    "is_view": 0,
                    "cluster": cluster,
                }
                for sort_order, spec in enumerate(column_specs):
                    col_name, data_type, col_description = spec
                    rows.append(
                        self._union(
                            {
                                "col_name": col_name,
                                "data_type": data_type,
                                "col_description": col_description,
                                "col_sort_order": sort_order,
                            },
                            table,
                        ))
            sql_execute.return_value = rows

            extractor = PostgresMetadataExtractor()
            extractor.init(self.conf)

            # Each extract() call should yield the next table, in order.
            for schema, name, description, column_specs in fixtures:
                expected = TableMetadata(
                    "postgres",
                    cluster,
                    schema,
                    name,
                    description,
                    [
                        ColumnMetadata(col_name, col_description, data_type,
                                       sort_order)
                        for sort_order, (col_name, data_type, col_description)
                        in enumerate(column_specs)
                    ],
                    0,
                )
                self.assertEqual(repr(expected), repr(extractor.extract()))

            self.assertIsNone(extractor.extract())
            self.assertIsNone(extractor.extract())
Beispiel #15
0
    def test_extraction_with_single_result(self) -> None:
        """
        One mocked Glue table with six columns and one partition key should
        be extracted as a single TableMetadata with seven columns, after
        which extract() returns None.
        """
        # (name, type, comment); a None comment means the key is omitted.
        column_specs = [
            ("col_id1", "bigint", "description of id1"),
            ("col_id2", "bigint", "description of id2"),
            ("is_active", "boolean", None),
            ("source", "varchar", "description of source"),
            ("etl_created_at", "timestamp", "description of etl_created_at"),
            ("ds", "varchar", None),
        ]
        partition_specs = [
            ("partition_key1", "string", "description of partition_key1"),
        ]

        def as_glue_column(name, data_type, comment):
            # Build the column dict, leaving "Comment" out when unset.
            glue_column = {"Name": name, "Type": data_type}
            if comment is not None:
                glue_column["Comment"] = comment
            return glue_column

        with patch.object(GlueExtractor, "_search_tables") as mock_search:
            mock_search.return_value = [{
                "Name": "test_catalog_test_schema_test_table",
                "DatabaseName": "test_database",
                "Description": "a table for testing",
                "StorageDescriptor": {
                    "Columns": [
                        as_glue_column(*spec) for spec in column_specs
                    ],
                    "Location": "test_catalog.test_schema.test_table",
                },
                "PartitionKeys": [
                    as_glue_column(*spec) for spec in partition_specs
                ],
                "TableType": "EXTERNAL_TABLE",
            }]

            extractor = GlueExtractor()
            extractor.init(self.conf)
            actual = extractor.extract()

            # Partition keys are expected after the regular columns, with
            # the numbering continuing.
            expected_columns = [
                ColumnMetadata(name, comment, data_type, sort_order)
                for sort_order, (name, data_type, comment)
                in enumerate(column_specs + partition_specs)
            ]
            expected = TableMetadata(
                "test_database",
                None,
                None,
                "test_catalog_test_schema_test_table",
                "a table for testing",
                expected_columns,
                False,
            )
            self.assertEqual(repr(expected), repr(actual))
            self.assertIsNone(extractor.extract())
    def get_all_table_metadata_from_information_schema(
        self,
        cluster: Optional[str] = None,
        where_clause_suffix: str = "",
    ):
        """
        Pull column metadata for all tables in bulk from information_schema
        and yield one TableMetadata per table.

        This is a generator: nothing is logged or executed until the first
        item is requested.

        :param cluster: optional cluster (catalog) name. When given it
            prefixes the information_schema tables in the query and is stored
            on every result; otherwise ``self._default_cluster_name`` is
            stored.
        :param where_clause_suffix: SQL text inserted verbatim at the end of
            the query. It is not escaped, so it must not come from untrusted
            input.
        :return: generator of TableMetadata.
        """

        unformatted_query = """
        SELECT
          a.table_catalog AS cluster
          , a.table_schema AS schema
          , a.table_name AS name
          , NULL AS description
          , a.column_name AS col_name
          , a.ordinal_position as col_sort_order
          , IF(a.extra_info = 'partition key', 1, 0) AS is_partition_col
          , a.comment AS col_description
          , a.data_type
          , IF(b.table_name is not null, 1, 0) AS is_view
        FROM {cluster_prefix}information_schema.columns a
        LEFT JOIN {cluster_prefix}information_schema.views b
            ON a.table_catalog = b.table_catalog
            and a.table_schema = b.table_schema
            and a.table_name = b.table_name
        {where_clause_suffix}
        """

        # The two message halves previously joined without a space and
        # logged "frominformation_schema".
        LOGGER.info(
            "Pulling all table metadata in bulk from "
            "information_schema in cluster name: %s",
            cluster,
        )

        cluster_prefix = cluster + "." if cluster is not None else ""

        formatted_query = unformatted_query.format(
            cluster_prefix=cluster_prefix, where_clause_suffix=where_clause_suffix
        )

        LOGGER.info("SQL for presto: %s", formatted_query)

        query_results = self.execute(formatted_query, is_dict_return_enabled=True)

        # NOTE(review): groupby only merges adjacent rows and the query has
        # no ORDER BY of its own, so this relies on the engine (or
        # where_clause_suffix) returning each table's rows contiguously.
        for _, group in groupby(query_results, self._get_table_key):
            columns = []
            for row in group:
                # Table-level fields are read from the last row of the group.
                last_row = row
                # NOTE(review): is_partition_col is selected above but is not
                # forwarded to ColumnMetadata here.
                columns.append(
                    ColumnMetadata(
                        row["col_name"],
                        row["col_description"],
                        row["data_type"],
                        row["col_sort_order"],
                    )
                )

            yield TableMetadata(
                self._database,
                cluster or self._default_cluster_name,
                last_row["schema"],
                last_row["name"],
                last_row["description"],
                columns,
                is_view=bool(last_row["is_view"]),
            )