Example #1
    def parse_describe_extended(
            self, relation: Relation,
            raw_rows: List[agate.Row]) -> List[SparkColumn]:
        # Convert each agate.Row to a dict
        dict_rows = [dict(zip(row._keys, row._values)) for row in raw_rows]
        # Find the separator between the rows and the metadata provided
        # by the DESCRIBE TABLE EXTENDED statement
        pos = self.find_table_information_separator(dict_rows)

        # Remove rows that start with a hash; they are comments
        rows = [
            row for row in raw_rows[0:pos]
            if not row['col_name'].startswith('#')
        ]
        metadata = {
            col['col_name']: col['data_type']
            for col in raw_rows[pos + 1:]
        }

        raw_table_stats = metadata.get(KEY_TABLE_STATISTICS)
        table_stats = SparkColumn.convert_table_stats(raw_table_stats)
        return [
            SparkColumn(
                table_database=relation.database,
                table_schema=relation.schema,
                table_name=relation.name,
                table_type=relation.type,
                table_owner=metadata.get(KEY_TABLE_OWNER),
                table_stats=table_stats,
                column=column['col_name'],
                column_index=idx,
                dtype=column['data_type'],
            ) for idx, column in enumerate(rows)
        ]
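
The helper find_table_information_separator is called above but not shown. A minimal sketch, assuming DESCRIBE TABLE EXTENDED emits a blank col_name row between the column rows and the table metadata section (the '#' header rows inside the column section are filtered out separately above), and that List is imported from typing:

    @staticmethod
    def find_table_information_separator(rows: List[dict]) -> int:
        # Walk the rows until the blank entry that precedes the
        # table metadata section
        pos = 0
        for row in rows:
            if not row['col_name']:
                break
            pos += 1
        return pos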
Example #2
    def parse_columns_from_information(
            self, relation: SparkRelation) -> List[SparkColumn]:
        # Pull the table owner out of the raw information blob
        owner_match = re.findall(self.INFORMATION_OWNER_REGEX,
                                 relation.information)
        owner = owner_match[0] if owner_match else None
        # Each match is one column: (name, type, nullable)
        matches = re.finditer(self.INFORMATION_COLUMNS_REGEX,
                              relation.information)
        columns = []
        # Table-level statistics, if the information blob reports any
        stats_match = re.findall(self.INFORMATION_STATISTICS_REGEX,
                                 relation.information)
        raw_table_stats = stats_match[0] if stats_match else None
        table_stats = SparkColumn.convert_table_stats(raw_table_stats)
        for match_num, match in enumerate(matches):
            column_name, column_type, nullable = match.groups()
            column = SparkColumn(table_database=None,
                                 table_schema=relation.schema,
                                 table_name=relation.table,
                                 table_type=relation.type,
                                 column_index=match_num,
                                 table_owner=owner,
                                 column=column_name,
                                 dtype=column_type,
                                 table_stats=table_stats)
            columns.append(column)
        return columns
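
The three INFORMATION_* regex constants are referenced above but not defined in this example. Hedged definitions, assuming relation.information carries the raw DESCRIBE TABLE EXTENDED output: a schema tree of '|-- name: type (nullable = true)' lines plus 'Owner:' and 'Statistics:' lines:

    INFORMATION_OWNER_REGEX = re.compile(
        r"^Owner: (.*)$", re.MULTILINE)
    INFORMATION_COLUMNS_REGEX = re.compile(
        r"\|-- (.*): (.*) \(nullable = (.*)\b", re.MULTILINE)
    INFORMATION_STATISTICS_REGEX = re.compile(
        r"^Statistics: (.*)$", re.MULTILINE)

Each column match yields exactly the three groups unpacked in the loop above.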
Example #3
    def test_convert_table_stats_with_bytes(self):
        self.assertDictEqual(
            SparkColumn.convert_table_stats("123456789 bytes"), {
                'stats:bytes:description': '',
                'stats:bytes:include': True,
                'stats:bytes:label': 'bytes',
                'stats:bytes:value': 123456789
            })
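
convert_table_stats itself does not appear in these examples. A minimal sketch consistent with this test and with Example #5, assuming the raw stats arrive as a comma-separated list of '<value> <label>' pairs and that Optional, Dict, and Any are imported from typing:

    @classmethod
    def convert_table_stats(cls, raw_stats: Optional[str]) -> Dict[str, Any]:
        table_stats = {}
        if raw_stats:
            # e.g. "1109049927 bytes, 14093476 rows"
            stats = {
                part.split(' ')[1]: int(part.split(' ')[0])
                for part in raw_stats.split(', ')
            }
            for key, val in stats.items():
                table_stats[f'stats:{key}:label'] = key
                table_stats[f'stats:{key}:value'] = val
                table_stats[f'stats:{key}:description'] = ''
                table_stats[f'stats:{key}:include'] = True
        return table_stats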
Example #4
    def _massage_column_for_catalog(self,
                                    column: SparkColumn) -> Dict[str, Any]:
        dct = column.to_dict()
        # The catalog expects different keys: Column.column holds the name
        dct['column_name'] = dct.pop('column')
        dct['column_type'] = dct.pop('dtype')
        # table_database can't be None in dbt core, so fall back to the schema
        if dct['table_database'] is None:
            dct['table_database'] = dct['table_schema']
        return dct
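
A hypothetical usage sketch of the renaming and database fallback; the adapter instance and all concrete values below are made up for illustration:

    column = SparkColumn(
        table_database=None,
        table_schema='analytics',   # hypothetical schema
        table_name='events',        # hypothetical table
        table_type='table',
        table_owner='etl_user',
        table_stats=SparkColumn.convert_table_stats(None),
        column='event_id',
        column_index=0,
        dtype='bigint',
    )
    dct = adapter._massage_column_for_catalog(column)
    assert dct['column_name'] == 'event_id'
    assert dct['column_type'] == 'bigint'
    # table_database was None, so it falls back to the schema
    assert dct['table_database'] == 'analytics'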
Example #5
    def test_convert_table_stats_with_no_statistics(self):
        self.assertDictEqual(SparkColumn.convert_table_stats(None), {})
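
Assuming the comma-separated parsing sketched after Example #3, a string carrying both bytes and rows would expand into two stat groups. A hedged companion test:

    def test_convert_table_stats_with_bytes_and_rows(self):
        self.assertDictEqual(
            SparkColumn.convert_table_stats("1234 bytes, 5678 rows"), {
                'stats:bytes:description': '',
                'stats:bytes:include': True,
                'stats:bytes:label': 'bytes',
                'stats:bytes:value': 1234,
                'stats:rows:description': '',
                'stats:rows:include': True,
                'stats:rows:label': 'rows',
                'stats:rows:value': 5678
            })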