def parse_describe_extended(
    self, relation: Relation, raw_rows: List[agate.Row]
) -> List[SparkColumn]:
    """Build SparkColumn objects from DESCRIBE TABLE EXTENDED output.

    The statement returns the column rows first, then a separator row,
    then key/value metadata rows; the two sections are parsed separately.

    :param relation: the relation the rows describe
    :param raw_rows: raw agate rows from the DESCRIBE statement
    :return: one SparkColumn per real column row
    """
    # Materialize each agate.Row as a plain dict for the separator search.
    as_dicts = [dict(zip(r._keys, r._values)) for r in raw_rows]

    # Locate the boundary between the column rows and the trailing
    # metadata emitted by DESCRIBE TABLE EXTENDED.
    separator_pos = self.find_table_information_separator(as_dicts)

    # Keep only real column rows; '#'-prefixed entries are comments.
    column_rows = []
    for r in raw_rows[0:separator_pos]:
        if r['col_name'].startswith('#'):
            continue
        column_rows.append(r)

    # Everything after the separator is key/value metadata.
    metadata = {}
    for entry in raw_rows[separator_pos + 1:]:
        metadata[entry['col_name']] = entry['data_type']

    table_stats = SparkColumn.convert_table_stats(
        metadata.get(KEY_TABLE_STATISTICS)
    )

    columns = []
    for idx, row in enumerate(column_rows):
        columns.append(SparkColumn(
            table_database=relation.database,
            table_schema=relation.schema,
            table_name=relation.name,
            table_type=relation.type,
            table_owner=metadata.get(KEY_TABLE_OWNER),
            table_stats=table_stats,
            column=row['col_name'],
            column_index=idx,
            dtype=row['data_type'],
        ))
    return columns
def parse_columns_from_information(
    self, relation: SparkRelation
) -> List[SparkColumn]:
    """Extract SparkColumn objects by regex-scanning relation.information.

    :param relation: a SparkRelation whose ``information`` blob contains
        owner, statistics, and column definitions
    :return: one SparkColumn per column regex match
    """
    info = relation.information

    # Owner: first match wins; absent owner becomes None.
    owner_hits = re.findall(self.INFORMATION_OWNER_REGEX, info)
    owner = owner_hits[0] if owner_hits else None

    # Table statistics: also optional; None converts to empty stats.
    stats_hits = re.findall(self.INFORMATION_STATISTICS_REGEX, info)
    table_stats = SparkColumn.convert_table_stats(
        stats_hits[0] if stats_hits else None
    )

    columns = []
    for idx, col_match in enumerate(
        re.finditer(self.INFORMATION_COLUMNS_REGEX, info)
    ):
        # Third capture group (nullability) is not carried into the column.
        name, dtype, _nullable = col_match.groups()
        columns.append(SparkColumn(
            table_database=None,
            table_schema=relation.schema,
            table_name=relation.table,
            table_type=relation.type,
            column_index=idx,
            table_owner=owner,
            column=name,
            dtype=dtype,
            table_stats=table_stats,
        ))
    return columns
def test_convert_table_stats_with_bytes(self):
    """A raw 'N bytes' stat expands into the four stats:bytes:* entries."""
    expected = {
        'stats:bytes:description': '',
        'stats:bytes:include': True,
        'stats:bytes:label': 'bytes',
        'stats:bytes:value': 123456789,
    }
    actual = SparkColumn.convert_table_stats("123456789 bytes")
    self.assertDictEqual(actual, expected)
def _massage_column_for_catalog(self, column: SparkColumn) -> Dict[str, Any]: dct = column.to_dict() # different expectations here - Column.column is the name dct['column_name'] = dct.pop('column') dct['column_type'] = dct.pop('dtype') # table_database can't be None in core. if dct['table_database'] is None: dct['table_database'] = dct['table_schema'] return dct
def test_convert_table_stats_with_no_statistics(self):
    """Missing raw stats (None) should convert to an empty dict."""
    result = SparkColumn.convert_table_stats(None)
    self.assertDictEqual(result, {})