def test_parse_relation_with_integer_owner(self):
    self.maxDiff = None
    rel_type = SparkRelation.get_relation_type.Table

    relation = SparkRelation.create(
        schema='default_schema',
        identifier='mytable',
        type=rel_type
    )
    assert relation.database is None

    # Mimics the output of Spark with a DESCRIBE TABLE EXTENDED
    plain_rows = [
        ('col1', 'decimal(22,0)'),
        ('# Detailed Table Information', None),
        ('Owner', 1234)
    ]

    input_cols = [Row(keys=['col_name', 'data_type'], values=r)
                  for r in plain_rows]

    config = self._get_target_http(self.project_cfg)
    rows = SparkAdapter(config).parse_describe_extended(
        relation, input_cols)

    self.assertEqual(rows[0].to_column_dict().get('table_owner'), '1234')

def test_parse_relation_with_statistics(self):
    self.maxDiff = None
    rel_type = SparkRelation.get_relation_type.Table

    relation = SparkRelation.create(
        schema='default_schema',
        identifier='mytable',
        type=rel_type
    )
    assert relation.database is None

    # Mimics the output of Spark with a DESCRIBE TABLE EXTENDED
    plain_rows = [
        ('col1', 'decimal(22,0)'),
        ('# Partition Information', 'data_type'),
        (None, None),
        ('# Detailed Table Information', None),
        ('Database', None),
        ('Owner', 'root'),
        ('Created Time', 'Wed Feb 04 18:15:00 UTC 1815'),
        ('Last Access', 'Wed May 20 19:25:00 UTC 1925'),
        ('Statistics', '1109049927 bytes, 14093476 rows'),
        ('Type', 'MANAGED'),
        ('Provider', 'delta'),
        ('Location', '/mnt/vo'),
        ('Serde Library', 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'),
        ('InputFormat', 'org.apache.hadoop.mapred.SequenceFileInputFormat'),
        ('OutputFormat', 'org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat'),
        ('Partition Provider', 'Catalog')
    ]

    input_cols = [Row(keys=['col_name', 'data_type'], values=r)
                  for r in plain_rows]

    config = self._get_target_http(self.project_cfg)
    rows = SparkAdapter(config).parse_describe_extended(
        relation, input_cols)

    self.assertEqual(len(rows), 1)
    self.assertEqual(rows[0].to_dict(omit_none=False), {
        'table_database': None,
        'table_schema': relation.schema,
        'table_name': relation.name,
        'table_type': rel_type,
        'table_owner': 'root',
        'column': 'col1',
        'column_index': 0,
        'dtype': 'decimal(22,0)',
        'numeric_scale': None,
        'numeric_precision': None,
        'char_size': None,

        'stats:bytes:description': '',
        'stats:bytes:include': True,
        'stats:bytes:label': 'bytes',
        'stats:bytes:value': 1109049927,

        'stats:rows:description': '',
        'stats:rows:include': True,
        'stats:rows:label': 'rows',
        'stats:rows:value': 14093476,
    })
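
# A minimal sketch of how a 'Statistics' value like the one above can be
# flattened into the 'stats:<label>:*' keys asserted in this test. It assumes
# the value is a comma-separated list of "<number> <label>" pairs; this
# helper is an illustration of the expected shape only, not the adapter's
# actual implementation.
def _statistics_to_dict_sketch(raw_stats):
    stats = {}
    for part in raw_stats.split(', '):  # e.g. '1109049927 bytes'
        value, label = part.split(' ', 1)
        stats[f'stats:{label}:label'] = label
        stats[f'stats:{label}:value'] = int(value)
        stats[f'stats:{label}:description'] = ''
        stats[f'stats:{label}:include'] = True
    return stats

# _statistics_to_dict_sketch('1109049927 bytes, 14093476 rows')
# -> {..., 'stats:bytes:value': 1109049927, 'stats:rows:value': 14093476, ...}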

def test_parse_relation_with_properties(self):
    rel_type = SparkRelation.RelationType.Table

    relation = SparkRelation.create(
        database='default_database',
        schema='default_schema',
        identifier='mytable',
        type=rel_type
    )

    # Mimics the output of Spark with a DESCRIBE TABLE EXTENDED
    plain_rows = [
        ('col1', 'decimal(19,25)'),
        ('', ''),
        ('# Detailed Table Information', ''),
        ('Database', relation.database),
        ('Owner', 'root'),
        ('Created Time', 'Wed Feb 04 18:15:00 UTC 1815'),
        ('Last Access', 'Wed May 20 19:25:00 UTC 1925'),
        ('Type', 'MANAGED'),
        ('Provider', 'delta'),
        ('Location', '/mnt/vo'),
        ('Serde Library', 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'),
        ('InputFormat', 'org.apache.hadoop.mapred.SequenceFileInputFormat'),
        ('OutputFormat', 'org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat'),
        ('Partition Provider', 'Catalog')
    ]

    input_cols = [
        Column(index=None,
               name=r[0],
               data_type=r[1],
               rows=MappedSequence(keys=['col_name', 'data_type'], values=r))
        for r in plain_rows
    ]

    rows = SparkAdapter._parse_relation(
        relation, input_cols, rel_type, {'Owner': 'Fokko'})

    self.assertEqual(rows[0], {
        'table_database': relation.database,
        'table_schema': relation.schema,
        'table_name': relation.name,
        'table_type': rel_type,

        'stats:bytes:description': 'The size of the table in bytes',
        'stats:bytes:include': False,
        'stats:bytes:label': 'Table size',
        'stats:bytes:value': None,

        'stats:rows:description': 'The number of rows in the table',
        'stats:rows:include': False,
        'stats:rows:label': 'Number of rows',
        'stats:rows:value': None,

        'table_comment': None,
        'table_owner': 'Fokko',
        'column_name': 'col1',
        'column_index': 0,
        'column_type': 'decimal(19,25)',
        'column_comment': None
    })
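
# _parse_relation above receives an explicit properties dict, and the
# assertion expects 'Fokko' even though the DESCRIBE rows say 'root': the
# passed-in properties win. A minimal sketch of that precedence, assuming a
# plain dict merge (illustrative only; not the adapter's actual code):
def _merge_table_properties_sketch(described, overrides):
    # later keys win, so overrides shadow what DESCRIBE reported
    return {**described, **overrides}

# _merge_table_properties_sketch({'Owner': 'root'}, {'Owner': 'Fokko'})
# -> {'Owner': 'Fokko'}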

def test_parse_columns_from_information_with_table_type_and_parquet_provider(
        self):
    self.maxDiff = None
    rel_type = SparkRelation.get_relation_type.Table

    information = (
        "Database: default_schema\n"
        "Table: mytable\n"
        "Owner: root\n"
        "Created Time: Wed Feb 04 18:15:00 UTC 1815\n"
        "Last Access: Wed May 20 19:25:00 UTC 1925\n"
        "Created By: Spark 3.0.1\n"
        "Type: MANAGED\n"
        "Provider: parquet\n"
        "Statistics: 1234567890 bytes, 12345678 rows\n"
        "Location: /mnt/vo\n"
        "Serde Library: org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe\n"
        "InputFormat: org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat\n"
        "OutputFormat: org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat\n"
        "Schema: root\n"
        " |-- col1: decimal(22,0) (nullable = true)\n"
        " |-- col2: string (nullable = true)\n"
        " |-- dt: date (nullable = true)\n"
        " |-- struct_col: struct (nullable = true)\n"
        " | |-- struct_inner_col: string (nullable = true)\n"
    )
    relation = SparkRelation.create(
        schema='default_schema',
        identifier='mytable',
        type=rel_type,
        information=information
    )

    config = self._get_target_http(self.project_cfg)
    columns = SparkAdapter(config).parse_columns_from_information(
        relation)
    self.assertEqual(len(columns), 4)

    self.assertEqual(columns[2].to_column_dict(omit_none=False), {
        'table_database': None,
        'table_schema': relation.schema,
        'table_name': relation.name,
        'table_type': rel_type,
        'table_owner': 'root',
        'column': 'dt',
        'column_index': 2,
        'dtype': 'date',
        'numeric_scale': None,
        'numeric_precision': None,
        'char_size': None,

        'stats:bytes:description': '',
        'stats:bytes:include': True,
        'stats:bytes:label': 'bytes',
        'stats:bytes:value': 1234567890,

        'stats:rows:description': '',
        'stats:rows:include': True,
        'stats:rows:label': 'rows',
        'stats:rows:value': 12345678
    })

    self.assertEqual(columns[3].to_column_dict(omit_none=False), {
        'table_database': None,
        'table_schema': relation.schema,
        'table_name': relation.name,
        'table_type': rel_type,
        'table_owner': 'root',
        'column': 'struct_col',
        'column_index': 3,
        'dtype': 'struct',
        'numeric_scale': None,
        'numeric_precision': None,
        'char_size': None,

        'stats:bytes:description': '',
        'stats:bytes:include': True,
        'stats:bytes:label': 'bytes',
        'stats:bytes:value': 1234567890,

        'stats:rows:description': '',
        'stats:rows:include': True,
        'stats:rows:label': 'rows',
        'stats:rows:value': 12345678
    })
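
import re

# parse_columns_from_information works off the 'Schema: root' tree in the
# information blob above. A minimal regex sketch of how the four top-level
# columns can be extracted while skipping nested struct members
# (struct_inner_col is indented one level deeper and is not counted among the
# 4 columns asserted above). Illustrative only; not the adapter's actual
# regex:
_TOP_LEVEL_COLUMN_SKETCH = re.compile(
    r'^ \|-- ([^:]+): (\S+) \(nullable', re.MULTILINE)

# Given the information string from the test above:
# _TOP_LEVEL_COLUMN_SKETCH.findall(information)
# -> [('col1', 'decimal(22,0)'), ('col2', 'string'),
#     ('dt', 'date'), ('struct_col', 'struct')]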

def test_parse_columns_from_information_with_view_type(self):
    self.maxDiff = None
    rel_type = SparkRelation.get_relation_type.View

    information = (
        "Database: default_schema\n"
        "Table: myview\n"
        "Owner: root\n"
        "Created Time: Wed Feb 04 18:15:00 UTC 1815\n"
        "Last Access: UNKNOWN\n"
        "Created By: Spark 3.0.1\n"
        "Type: VIEW\n"
        "View Text: WITH base (\n"
        "    SELECT * FROM source_table\n"
        ")\n"
        "SELECT col1, col2, dt FROM base\n"
        "View Original Text: WITH base (\n"
        "    SELECT * FROM source_table\n"
        ")\n"
        "SELECT col1, col2, dt FROM base\n"
        "View Catalog and Namespace: spark_catalog.default\n"
        "View Query Output Columns: [col1, col2, dt]\n"
        "Table Properties: [view.query.out.col.1=col1, view.query.out.col.2=col2, "
        "transient_lastDdlTime=1618324324, view.query.out.col.3=dt, "
        "view.catalogAndNamespace.part.0=spark_catalog, "
        "view.catalogAndNamespace.part.1=default]\n"
        "Serde Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe\n"
        "InputFormat: org.apache.hadoop.mapred.SequenceFileInputFormat\n"
        "OutputFormat: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat\n"
        "Storage Properties: [serialization.format=1]\n"
        "Schema: root\n"
        " |-- col1: decimal(22,0) (nullable = true)\n"
        " |-- col2: string (nullable = true)\n"
        " |-- dt: date (nullable = true)\n"
        " |-- struct_col: struct (nullable = true)\n"
        " | |-- struct_inner_col: string (nullable = true)\n"
    )
    relation = SparkRelation.create(
        schema='default_schema',
        identifier='myview',
        type=rel_type,
        information=information
    )

    config = self._get_target_http(self.project_cfg)
    columns = SparkAdapter(config).parse_columns_from_information(
        relation)
    self.assertEqual(len(columns), 4)

    self.assertEqual(columns[1].to_column_dict(omit_none=False), {
        'table_database': None,
        'table_schema': relation.schema,
        'table_name': relation.name,
        'table_type': rel_type,
        'table_owner': 'root',
        'column': 'col2',
        'column_index': 1,
        'dtype': 'string',
        'numeric_scale': None,
        'numeric_precision': None,
        'char_size': None
    })

    self.assertEqual(columns[3].to_column_dict(omit_none=False), {
        'table_database': None,
        'table_schema': relation.schema,
        'table_name': relation.name,
        'table_type': rel_type,
        'table_owner': 'root',
        'column': 'struct_col',
        'column_index': 3,
        'dtype': 'struct',
        'numeric_scale': None,
        'numeric_precision': None,
        'char_size': None
    })
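
# Unlike the table cases above, the view's information blob carries no
# 'Statistics' line, so the expected column dicts here contain no 'stats:*'
# keys at all. Scalar properties such as Owner and Type come from plain
# 'Key: value' lines; a hedged sketch of pulling one out (illustrative only;
# not the adapter's actual parsing):
def _describe_property_sketch(information, key):
    match = re.search(r'^{}: (.*)$'.format(re.escape(key)),
                      information, re.MULTILINE)
    return match.group(1) if match else None

# Given the information string from the test above:
# _describe_property_sketch(information, 'Owner') -> 'root'
# _describe_property_sketch(information, 'Type')  -> 'VIEW'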