Example #1
    def test_parse_relation_with_integer_owner(self):
        self.maxDiff = None  # always show the full diff on failure
        rel_type = SparkRelation.get_relation_type.Table

        relation = SparkRelation.create(
            schema='default_schema',
            identifier='mytable',
            type=rel_type
        )
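        # Spark relations are two-part (schema + identifier), so the
        # database component stays unset.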
        assert relation.database is None

        # Mimics the output of Spark with a DESCRIBE TABLE EXTENDED
        plain_rows = [
            ('col1', 'decimal(22,0)'),
            ('# Detailed Table Information', None),
            ('Owner', 1234)
        ]

        input_cols = [Row(keys=['col_name', 'data_type'], values=r)
                      for r in plain_rows]

        config = self._get_target_http(self.project_cfg)
        rows = SparkAdapter(config).parse_describe_extended(
            relation, input_cols)

        # The integer owner should be coerced to the string '1234'.
        self.assertEqual(rows[0].to_column_dict().get('table_owner'), '1234')
Example #2
    def test_parse_relation_with_statistics(self):
        self.maxDiff = None
        rel_type = SparkRelation.get_relation_type.Table

        relation = SparkRelation.create(schema='default_schema',
                                        identifier='mytable',
                                        type=rel_type)
        assert relation.database is None

        # Mimics the output of Spark with a DESCRIBE TABLE EXTENDED
        plain_rows = [
            ('col1', 'decimal(22,0)'),
            ('# Partition Information', 'data_type'),
            (None, None),
            ('# Detailed Table Information', None),
            ('Database', None),
            ('Owner', 'root'),
            ('Created Time', 'Wed Feb 04 18:15:00 UTC 1815'),
            ('Last Access', 'Wed May 20 19:25:00 UTC 1925'),
            ('Statistics', '1109049927 bytes, 14093476 rows'),
            ('Type', 'MANAGED'),
            ('Provider', 'delta'),
            ('Location', '/mnt/vo'),
            ('Serde Library',
             'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'),
            ('InputFormat',
             'org.apache.hadoop.mapred.SequenceFileInputFormat'),
            ('OutputFormat',
             'org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat'),
            ('Partition Provider', 'Catalog'),
        ]

        input_cols = [
            Row(keys=['col_name', 'data_type'], values=r) for r in plain_rows
        ]

        config = self._get_target_http(self.project_cfg)
        rows = SparkAdapter(config).parse_describe_extended(
            relation, input_cols)
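        # Only the single real column should survive parsing; the metadata
        # rows are filtered out.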
        self.assertEqual(len(rows), 1)
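        # The 'Statistics' row should be split into separate bytes and rows
        # entries in the returned column.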
        self.assertEqual(
            rows[0].to_dict(omit_none=False), {
                'table_database': None,
                'table_schema': relation.schema,
                'table_name': relation.name,
                'table_type': rel_type,
                'table_owner': 'root',
                'column': 'col1',
                'column_index': 0,
                'dtype': 'decimal(22,0)',
                'numeric_scale': None,
                'numeric_precision': None,
                'char_size': None,
                'stats:bytes:description': '',
                'stats:bytes:include': True,
                'stats:bytes:label': 'bytes',
                'stats:bytes:value': 1109049927,
                'stats:rows:description': '',
                'stats:rows:include': True,
                'stats:rows:label': 'rows',
                'stats:rows:value': 14093476,
            })
Example #3
    def test_parse_relation_with_properties(self):
        rel_type = SparkRelation.RelationType.Table

        relation = SparkRelation.create(database='default_database',
                                        schema='default_schema',
                                        identifier='mytable',
                                        type=rel_type)

        # Mimics the output of Spark with a DESCRIBE TABLE EXTENDED
        plain_rows = [
            ('col1', 'decimal(19,25)'),
            ('', ''),
            ('# Detailed Table Information', ''),
            ('Database', relation.database),
            ('Owner', 'root'),
            ('Created Time', 'Wed Feb 04 18:15:00 UTC 1815'),
            ('Last Access', 'Wed May 20 19:25:00 UTC 1925'),
            ('Type', 'MANAGED'),
            ('Provider', 'delta'),
            ('Location', '/mnt/vo'),
            ('Serde Library',
             'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'),
            ('InputFormat',
             'org.apache.hadoop.mapred.SequenceFileInputFormat'),
            ('OutputFormat',
             'org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat'),
            ('Partition Provider', 'Catalog'),
        ]

        input_cols = [
            Column(index=None,
                   name=r[0],
                   data_type=r[1],
                   rows=MappedSequence(keys=['col_name', 'data_type'],
                                       values=r)) for r in plain_rows
        ]

        rows = SparkAdapter._parse_relation(relation, input_cols, rel_type,
                                            {'Owner': 'Fokko'})
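        # The explicit properties dict should win over the 'Owner' value in
        # the parsed rows, hence 'Fokko' rather than 'root' below.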
        self.assertEqual(
            rows[0], {
                'table_database': relation.database,
                'table_schema': relation.schema,
                'table_name': relation.name,
                'table_type': rel_type,
                'stats:bytes:description': 'The size of the table in bytes',
                'stats:bytes:include': False,
                'stats:bytes:label': 'Table size',
                'stats:bytes:value': None,
                'stats:rows:description': 'The number of rows in the table',
                'stats:rows:include': False,
                'stats:rows:label': 'Number of rows',
                'stats:rows:value': None,
                'table_comment': None,
                'table_owner': 'Fokko',
                'column_name': 'col1',
                'column_index': 0,
                'column_type': 'decimal(19,25)',
                'column_comment': None
            })
Example #4
    def test_parse_columns_from_information_with_table_type_and_parquet_provider(
            self):
        self.maxDiff = None
        rel_type = SparkRelation.get_relation_type.Table

        information = (
            "Database: default_schema\n"
            "Table: mytable\n"
            "Owner: root\n"
            "Created Time: Wed Feb 04 18:15:00 UTC 1815\n"
            "Last Access: Wed May 20 19:25:00 UTC 1925\n"
            "Created By: Spark 3.0.1\n"
            "Type: MANAGED\n"
            "Provider: parquet\n"
            "Statistics: 1234567890 bytes, 12345678 rows\n"
            "Location: /mnt/vo\n"
            "Serde Library: org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe\n"
            "InputFormat: org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat\n"
            "OutputFormat: org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat\n"
            "Schema: root\n"
            " |-- col1: decimal(22,0) (nullable = true)\n"
            " |-- col2: string (nullable = true)\n"
            " |-- dt: date (nullable = true)\n"
            " |-- struct_col: struct (nullable = true)\n"
            " |    |-- struct_inner_col: string (nullable = true)\n")
        relation = SparkRelation.create(schema='default_schema',
                                        identifier='mytable',
                                        type=rel_type,
                                        information=information)

        config = self._get_target_http(self.project_cfg)
        columns = SparkAdapter(config).parse_columns_from_information(relation)
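        # Nested struct fields do not count as top-level columns, so only
        # col1, col2, dt and struct_col are returned.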
        self.assertEqual(len(columns), 4)
        self.assertEqual(
            columns[2].to_column_dict(omit_none=False), {
                'table_database': None,
                'table_schema': relation.schema,
                'table_name': relation.name,
                'table_type': rel_type,
                'table_owner': 'root',
                'column': 'dt',
                'column_index': 2,
                'dtype': 'date',
                'numeric_scale': None,
                'numeric_precision': None,
                'char_size': None,
                'stats:bytes:description': '',
                'stats:bytes:include': True,
                'stats:bytes:label': 'bytes',
                'stats:bytes:value': 1234567890,
                'stats:rows:description': '',
                'stats:rows:include': True,
                'stats:rows:label': 'rows',
                'stats:rows:value': 12345678
            })

        self.assertEqual(
            columns[3].to_column_dict(omit_none=False), {
                'table_database': None,
                'table_schema': relation.schema,
                'table_name': relation.name,
                'table_type': rel_type,
                'table_owner': 'root',
                'column': 'struct_col',
                'column_index': 3,
                'dtype': 'struct',
                'numeric_scale': None,
                'numeric_precision': None,
                'char_size': None,
                'stats:bytes:description': '',
                'stats:bytes:include': True,
                'stats:bytes:label': 'bytes',
                'stats:bytes:value': 1234567890,
                'stats:rows:description': '',
                'stats:rows:include': True,
                'stats:rows:label': 'rows',
                'stats:rows:value': 12345678
            })
Example #5
    def test_parse_columns_from_information_with_view_type(self):
        self.maxDiff = None
        rel_type = SparkRelation.get_relation_type.View
        information = (
            "Database: default_schema\n"
            "Table: myview\n"
            "Owner: root\n"
            "Created Time: Wed Feb 04 18:15:00 UTC 1815\n"
            "Last Access: UNKNOWN\n"
            "Created By: Spark 3.0.1\n"
            "Type: VIEW\n"
            "View Text: WITH base (\n"
            "    SELECT * FROM source_table\n"
            ")\n"
            "SELECT col1, col2, dt FROM base\n"
            "View Original Text: WITH base (\n"
            "    SELECT * FROM source_table\n"
            ")\n"
            "SELECT col1, col2, dt FROM base\n"
            "View Catalog and Namespace: spark_catalog.default\n"
            "View Query Output Columns: [col1, col2, dt]\n"
            "Table Properties: [view.query.out.col.1=col1, view.query.out.col.2=col2, "
            "transient_lastDdlTime=1618324324, view.query.out.col.3=dt, "
            "view.catalogAndNamespace.part.0=spark_catalog, "
            "view.catalogAndNamespace.part.1=default]\n"
            "Serde Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe\n"
            "InputFormat: org.apache.hadoop.mapred.SequenceFileInputFormat\n"
            "OutputFormat: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat\n"
            "Storage Properties: [serialization.format=1]\n"
            "Schema: root\n"
            " |-- col1: decimal(22,0) (nullable = true)\n"
            " |-- col2: string (nullable = true)\n"
            " |-- dt: date (nullable = true)\n"
            " |-- struct_col: struct (nullable = true)\n"
            " |    |-- struct_inner_col: string (nullable = true)\n")
        relation = SparkRelation.create(schema='default_schema',
                                        identifier='myview',
                                        type=rel_type,
                                        information=information)

        config = self._get_target_http(self.project_cfg)
        columns = SparkAdapter(config).parse_columns_from_information(relation)
        self.assertEqual(len(columns), 4)
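        # Views have no 'Statistics' line, so no stats:* entries are
        # expected in the column dicts below.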
        self.assertEqual(
            columns[1].to_column_dict(omit_none=False), {
                'table_database': None,
                'table_schema': relation.schema,
                'table_name': relation.name,
                'table_type': rel_type,
                'table_owner': 'root',
                'column': 'col2',
                'column_index': 1,
                'dtype': 'string',
                'numeric_scale': None,
                'numeric_precision': None,
                'char_size': None
            })

        self.assertEqual(
            columns[3].to_column_dict(omit_none=False), {
                'table_database': None,
                'table_schema': relation.schema,
                'table_name': relation.name,
                'table_type': rel_type,
                'table_owner': 'root',
                'column': 'struct_col',
                'column_index': 3,
                'dtype': 'struct',
                'numeric_scale': None,
                'numeric_precision': None,
                'char_size': None
            })