def fix_dtype_to_row_type(
    dc: DataContainer, row_type: "org.apache.calcite.rel.type.RelDataType"
):
    """
    Cast the columns of the data container's dataframe to the given SQL
    row type.

    To prevent unneeded conversions, a column is only converted if really
    needed: "similar" types (same general kind -- int, float, string etc. --
    but not necessarily the same size or nullability, e.g. int64 vs int32)
    are left as-is by the cast helper.

    TODO: we should check the nullability of the SQL type
    """
    frame = dc.df
    columns = dc.column_container

    # Map each field's positional index to its SQL type string.
    sql_types = {
        int(f.getIndex()): str(f.getType()) for f in row_type.getFieldList()
    }

    for position, sql_type in sql_types.items():
        target_dtype = sql_to_python_type(sql_type)
        backend_name = columns.get_backend_by_frontend_index(position)
        frame = cast_column_type(frame, backend_name, target_dtype)

    return DataContainer(frame, dc.column_container)
def try_to_infer_type_of_operation(operation, column_types, default_type=np.float64):
    """
    Tries to infer the return type for an operation passed to aggregate
    or extended_projection methods.

    In order to work with dask-sql, the return type should be a pandas
    or numpy type.

    Parameters
    ----------
    operation : Union[Callable, str]
        The operation to infer the type for
    column_types : pd.Series
        The dtypes series mapping the dtype for each column.
        Used if operation references a known column.
    default_type : Type, optional
        The return value if type cannot be infered, by default np.float64

    Returns
    -------
    Type
        An infered return type for the operation.
    """
    try:
        if isinstance(operation, (types.FunctionType, types.MethodType)):
            # A user-supplied function: read the return type from its
            # annotations.
            annotated = typing_callable_from_annotated_function(operation)
            return get_args(annotated)[1]

        if isinstance(operation, types.BuiltinFunctionType):
            # A builtin such as 'sum': use builtin type inference.
            return get_args(infer_type_builtins(operation))[1]

        if not isinstance(operation, str):
            # A plain constant: its own type is the answer.
            return type(operation)

        # From here on the operation is textual, so a failed inference
        # should fall back to str instead of the numeric default.
        default_type = str
        if hasattr(functions, operation):
            # One of SQLAlchemy's known functions, like count.
            sql_type = getattr(functions, operation).type
            if inspect.isclass(sql_type):
                sql_type = sql_type()
            return sql_to_python_type(sql_type.compile())

        # Otherwise operation is probably a str or
        # RelationalAlgebraStringExpression representing a column literal,
        # like 'col_a + 1', or a constant like '0'. Parse the expression
        # to get the type of the variable or constant.
        parsed = ast.parse(operation, mode="eval").body
        return type_of_expression(parsed, column_types)
    except (ValueError, TypeError, NotImplementedError, SyntaxError):
        LOG.warning(f"Unable to infer type of operation {operation}"
                    f", assuming default {default_type} type instead.")
        return default_type
def div(self, lhs, rhs, rex=None):
    """Divide lhs by rhs, truncating the result for non-float SQL output types."""
    quotient = lhs / rhs

    # SQL integer division drops the fractional part, so truncate unless
    # the declared output type maps to a floating point python type.
    sql_type = str(rex.getType()).upper()
    python_type = sql_to_python_type(sql_type)
    if not pd.api.types.is_float_dtype(python_type):
        quotient = da.trunc(quotient)

    return quotient
def cast(self, operand, rex=None) -> SeriesOrScalar:
    """Cast a frame operand to the python type of the SQL output type."""
    # Scalars are passed through unchanged; only frames are cast.
    if not is_frame(operand):
        return operand

    sql_type = str(rex.getType()).upper()
    python_type = sql_to_python_type(sql_type)

    casted = cast_column_to_type(operand, python_type)
    # None signals "no conversion was needed" -- keep the original operand.
    return operand if casted is None else casted
def cast(self, operand, rex=None) -> SeriesOrScalar:
    """Cast the operand to the python type of the SQL output type,
    flooring DATE values to day precision."""
    sql_type = str(rex.getType())
    python_type = sql_to_python_type(sql_type.upper())

    casted = cast_column_to_type(operand, python_type)
    if casted is None:
        # No conversion was necessary; keep the operand as-is.
        casted = operand

    # TODO: ideally we don't want to directly access the datetimes,
    # but Pandas can't truncate timezone datetimes and cuDF can't
    # truncate datetimes
    if sql_type == "DATE":
        return casted.dt.floor("D").astype(python_type)

    return casted
def fix_dtype_to_row_type(
    dc: DataContainer, row_type: "org.apache.calcite.rel.type.RelDataType"):
    """
    Cast the dtypes of the data container's dataframe to the given SQL
    row type.

    To prevent unneeded conversions, a column is only converted if really
    needed: "similar" types (same general kind -- int, float, string etc. --
    but not necessarily the same size, e.g. int64 vs int32, or the same
    nullability) are left untouched.

    TODO: we should check the nullability of the SQL type
    """
    frame = dc.df
    columns = dc.column_container

    # Positional field index -> SQL type string, taken from the row type.
    sql_types = {
        int(f.getIndex()): str(f.getType()) for f in row_type.getFieldList()
    }

    for position, sql_type in sql_types.items():
        target_dtype = sql_to_python_type(sql_type)
        column_name = columns.get_backend_by_frontend_index(position)
        source_dtype = frame[column_name].dtype

        logger.debug(
            f"Column {column_name} has type {source_dtype}, expecting {target_dtype}..."
        )
        if similar_type(source_dtype, target_dtype):
            logger.debug("...not converting.")
            continue

        # SQL semantics: drop the fractional part explicitly before
        # casting a float column to an integer type.
        source_is_float = pd.api.types.is_float_dtype(source_dtype)
        target_is_integer = pd.api.types.is_integer_dtype(target_dtype)
        if source_is_float and target_is_integer:
            logger.debug("...truncating...")
            frame[column_name] = da.trunc(frame[column_name])

        logger.debug(
            f"Need to cast {column_name} from {source_dtype} to {target_dtype}"
        )
        frame[column_name] = frame[column_name].astype(target_dtype)

    return DataContainer(frame, dc.column_container)
def to_dc(self, input_item: Any, table_name: str, format: str = None, **kwargs):  # pragma: no cover
    """
    Read a hive table into a dask dataframe.

    Queries hive ("DESCRIBE FORMATTED") for the table layout, chooses a
    dask read function matching the storage format, reads the data
    (partition by partition if the table is partitioned) and casts every
    column to the type hive reports.

    Parameters
    ----------
    input_item : Any
        The hive connection/cursor used to describe and locate the table.
    table_name : str
        Name of the table; can be overridden via ``hive_table_name``.
    format : str, optional
        Storage format; normally deduced from the table description.
    **kwargs :
        ``hive_schema_name`` (default "default") plus any options passed
        through to the dask read function.

    Returns
    -------
    dask.dataframe.DataFrame

    Raises
    ------
    RuntimeError
        If the "DESCRIBE FORMATTED" output has no input format at all.
    AttributeError
        If the input format is not one of the understood hive formats.
    """
    table_name = kwargs.pop("hive_table_name", table_name)
    schema = kwargs.pop("hive_schema_name", "default")

    parsed = self._parse_hive_table_description(input_item, schema, table_name)
    (
        column_information,
        table_information,
        storage_information,
        partition_information,
    ) = parsed

    logger.debug("Extracted hive information: ")
    logger.debug(f"column information: {column_information}")
    logger.debug(f"table information: {table_information}")
    logger.debug(f"storage information: {storage_information}")
    logger.debug(f"partition information: {partition_information}")

    # Convert column information
    column_information = {
        col: sql_to_python_type(col_type.upper())
        for col, col_type in column_information.items()
    }

    # Extract format information
    if "InputFormat" in storage_information:
        format = storage_information["InputFormat"].split(".")[-1]
    # databricks format is different, see https://github.com/nils-braun/dask-sql/issues/83
    elif "InputFormat" in table_information:
        format = table_information["InputFormat"].split(".")[-1]
    else:
        raise RuntimeError(
            "Do not understand the output of 'DESCRIBE FORMATTED <table>'")

    if format == "TextInputFormat" or format == "SequenceFileInputFormat":
        storage_description = storage_information.get(
            "Storage Desc Params", {})
        read_function = partial(
            dd.read_csv,
            sep=storage_description.get("field.delim", ","),
            header=None,
        )
    elif format == "ParquetInputFormat" or format == "MapredParquetInputFormat":
        read_function = dd.read_parquet
    elif format == "OrcInputFormat":
        read_function = dd.read_orc
    elif format == "JsonInputFormat":
        read_function = dd.read_json
    else:
        raise AttributeError(
            f"Do not understand hive's table format {format}")

    def _normalize(loc):
        # Turn a hive location into a glob dask/fsspec can read.
        # NOTE: the previous loc.lstrip("dbfs:")/loc.lstrip("file:") misused
        # str.lstrip, which removes a *character set* (any of f,i,l,e,:),
        # not a prefix -- it could eat leading path characters. We strip
        # the protocol prefixes explicitly instead.
        if loc.startswith("dbfs:/") and not loc.startswith("dbfs://"):
            # dask (or better: fsspec) needs to have the URL in a specific form
            # starting with two // after the protocol
            loc = f"dbfs://{loc[len('dbfs:'):]}"
        if loc.startswith("file:"):
            # file:// is not a known protocol
            loc = loc[len("file:"):]

        # Only allow files which do not start with . or _
        # Especially, not allow the _SUCCESS files
        return os.path.join(loc, "[A-Za-z0-9-]*")

    def wrapped_read_function(location, column_information, **kwargs):
        # Read one location and apply hive's column names and types.
        location = _normalize(location)
        logger.debug(f"Reading in hive data from {location}")
        df = read_function(location, **kwargs)

        logger.debug(f"Applying column information: {column_information}")
        df = df.rename(
            columns=dict(zip(df.columns, column_information.keys())))

        for col, expected_type in column_information.items():
            df = cast_column_type(df, col, expected_type)

        return df

    if partition_information:
        # Read every partition separately and concatenate at the end.
        partition_list = self._parse_hive_partition_description(
            input_item, schema, table_name)
        logger.debug(f"Reading in partitions from {partition_list}")

        tables = []
        for partition in partition_list:
            parsed = self._parse_hive_table_description(
                input_item, schema, table_name, partition=partition)
            (
                partition_column_information,
                partition_table_information,
                _,
                _,
            ) = parsed

            location = partition_table_information["Location"]
            table = wrapped_read_function(
                location, partition_column_information, **kwargs)

            # Now add the additional partition columns
            partition_values = ast.literal_eval(
                partition_table_information["Partition Value"])

            logger.debug(
                f"Applying additional partition information as columns: {partition_information}"
            )
            for partition_id, (partition_key, partition_type) in enumerate(
                partition_information.items()
            ):
                table[partition_key] = partition_values[partition_id]
                table = cast_column_type(table, partition_key, partition_type)

            tables.append(table)

        return dd.concat(tables)

    location = table_information["Location"]
    df = wrapped_read_function(location, column_information, **kwargs)
    return df