def spark2redshift(dtype: str, varchar_length: int = 256) -> str:
    """Pyspark to Redshift conversion."""
    dtype = dtype.lower()
    # Types with a fixed one-to-one Redshift equivalent.
    direct = {
        "smallint": "SMALLINT",
        "int": "INT",
        "bigint": "BIGINT",
        "float": "FLOAT4",
        "double": "FLOAT8",
        "bool": "BOOLEAN",
        "boolean": "BOOLEAN",
        "timestamp": "TIMESTAMP",
        "date": "DATE",
    }
    if dtype in direct:
        return direct[dtype]
    if dtype == "string":
        return f"VARCHAR({varchar_length})"
    if dtype.startswith("decimal"):
        # e.g. "decimal(10, 2)" -> "DECIMAL(10,2)"
        return dtype.replace(" ", "").upper()
    raise UnsupportedType("Unsupported Spark type: " + dtype)
def athena2pyarrow(dtype: str) -> str:
    """Athena to PyArrow conversion."""
    dtype = dtype.lower()
    # Flat lookup table; complex Athena types (array/row/map) are kept as strings.
    lookup = {
        "tinyint": "int8",
        "smallint": "int16",
        "int": "int32",
        "integer": "int32",
        "bigint": "int64",
        "float": "float32",
        "double": "float64",
        "boolean": "bool",
        "bool": "bool",
        "string": "string",
        "char": "string",
        "varchar": "string",
        "array": "string",
        "row": "string",
        "map": "string",
        "timestamp": "timestamp[ns]",
        "date": "date32",
    }
    if dtype in lookup:
        return lookup[dtype]
    raise UnsupportedType(f"Unsupported Athena type: {dtype}")
def pyarrow2mysql(dtype: pa.types, varchar_length: int = 256) -> str:
    """Pyarrow to MySQL conversion."""
    key = str(dtype).lower()
    fixed = {
        "int16": "SMALLINT",
        "int32": "INT",
        "int64": "BIGINT",
        "float": "FLOAT",
        "double": "DOUBLE",
        "bool": "BOOLEAN",
    }
    if key in fixed:
        return fixed[key]
    if key == "string":
        return f"VARCHAR({varchar_length})"
    # Prefix checks: pyarrow renders e.g. "timestamp[ns]", "date32", "decimal(10, 2)".
    if key.startswith("timestamp"):
        return "TIMESTAMP"
    if key.startswith("date"):
        return "DATE"
    if key.startswith("decimal"):
        return key.replace(" ", "").upper()
    raise UnsupportedType(f"Unsupported Pyarrow type: {dtype}")
def pyarrow2athena(dtype: pa.types) -> str:
    # Map a pyarrow DataType to the matching Athena type string.
    key = str(dtype).lower()
    scalar = {
        "int8": "tinyint",
        "int16": "smallint",
        "int32": "int",
        "int64": "bigint",
        "float": "float",
        "double": "double",
        "bool": "boolean",
        "string": "string",
    }
    if key in scalar:
        return scalar[key]
    if key.startswith("timestamp"):
        return "timestamp"
    if key.startswith("date"):
        return "date"
    if key.startswith("decimal"):
        # e.g. "decimal(10, 2)" -> "decimal(10,2)"
        return key.replace(" ", "")
    if key.startswith("list"):
        # Recurse on the element type to build the Athena array type.
        return f"array<{pyarrow2athena(dtype.value_type)}>"
    if key == "null":
        raise UndetectedType("We can't infer the data type from an entire null object column")
    raise UnsupportedType(f"Unsupported Pyarrow type: {dtype}")
def pyarrow2athena(dtype):
    # Legacy variant: pyarrow type -> Athena type string (no decimal/null support).
    name = str(dtype).lower()
    plain = {
        "int8": "tinyint",
        "int16": "smallint",
        "int32": "int",
        "int64": "bigint",
        "float": "float",
        "double": "double",
        "bool": "boolean",
        "string": "string",
    }
    if name in plain:
        return plain[name]
    if name.startswith("timestamp"):
        return "timestamp"
    if name.startswith("date"):
        return "date"
    if name.startswith("list"):
        return f"array<{pyarrow2athena(dtype.value_type)}>"
    raise UnsupportedType(f"Unsupported Pyarrow type: {dtype}")
def _type_athena2pandas(dtype): dtype = dtype.lower() if dtype in ["int", "integer", "bigint", "smallint", "tinyint"]: return "Int64" elif dtype in ["float", "double", "real"]: return "float64" elif dtype == "boolean": return "bool" elif dtype in ["string", "char", "varchar", "array", "row", "map"]: return "object" elif dtype in ["timestamp", "date"]: return "datetime64" else: raise UnsupportedType(f"Unsupported Athena type: {dtype}")
def _build_schema(
        dataframe,
        partition_cols: Optional[List[str]],
        preserve_index: bool,
        indexes_position: str,
        cast_columns: Optional[Dict[str, str]] = None
) -> Tuple[List[Tuple[str, str]], List[Tuple[str, str]]]:
    """Build the Athena schema (name, type) pairs for a pandas DataFrame.

    Extracts a pyarrow schema from the DataFrame, converts each column's
    pyarrow type to an Athena type, and splits the result into regular
    columns and partition columns.

    :param dataframe: pandas DataFrame to derive the schema from
        (presumably a pandas.DataFrame — only .dtypes is read here directly).
    :param partition_cols: Column names to treat as partition columns (or None).
    :param preserve_index: Forwarded to extract_pyarrow_schema_from_pandas.
    :param indexes_position: Forwarded to extract_pyarrow_schema_from_pandas.
    :param cast_columns: Optional explicit {column_name: athena_type} overrides;
        columns listed here skip pyarrow type inference entirely.
    :return: Tuple of (schema_built, partition_cols_schema_built), each a list
        of (column_name, athena_type) pairs; partition columns appear only in
        the second list, in the order given by partition_cols.
    :raises UndetectedType: If a column is entirely null and no cast is given.
    :raises UnsupportedType: If a column's pyarrow type has no Athena mapping.
    """
    if cast_columns is None:
        cast_columns = {}
    logger.debug(f"dataframe.dtypes:\n{dataframe.dtypes}")
    if partition_cols is None:
        partition_cols = []
    pyarrow_schema: List[Tuple[str, Any]] = data_types.extract_pyarrow_schema_from_pandas(
        dataframe=dataframe,
        preserve_index=preserve_index,
        indexes_position=indexes_position)
    schema_built: List[Tuple[str, str]] = []
    partition_cols_types: Dict[str, str] = {}
    for name, dtype in pyarrow_schema:
        # Explicit casts win over inferred types.
        # NOTE: the `cast_columns is not None` guard is redundant here (it was
        # defaulted to {} above) but is kept byte-identical.
        if (cast_columns is not None) and (name in cast_columns.keys()):
            if name in partition_cols:
                partition_cols_types[name] = cast_columns[name]
            else:
                schema_built.append((name, cast_columns[name]))
        else:
            try:
                athena_type = data_types.pyarrow2athena(dtype)
            except UndetectedType:
                # Re-raise with the column name so the caller knows which
                # column needs an explicit cast.
                raise UndetectedType(
                    f"We can't infer the data type from an entire null object column ({name}). "
                    f"Please consider pass the type of this column explicitly using the cast "
                    f"columns argument")
            except UnsupportedType:
                raise UnsupportedType(
                    f"Unsupported Pyarrow type for column {name}: {dtype}")
            if name in partition_cols:
                partition_cols_types[name] = athena_type
            else:
                schema_built.append((name, athena_type))
    # Preserve the caller's partition_cols ordering in the returned pairs.
    partition_cols_schema_built: List = [(name, partition_cols_types[name])
                                         for name in partition_cols]
    logger.debug(f"schema_built:\n{schema_built}")
    logger.debug(
        f"partition_cols_schema_built:\n{partition_cols_schema_built}")
    return schema_built, partition_cols_schema_built
def _type_spark2redshift(dtype): dtype = dtype.lower() if dtype in ["smallint", "int", "bigint"]: return "BIGINT" elif dtype == "float": return "FLOAT4" elif dtype == "double": return "FLOAT8" elif dtype == "bool": return "BOOLEAN" elif dtype == "timestamp": return "TIMESTAMP" elif dtype == "string": return "VARCHAR(256)" else: raise UnsupportedType("Unsupported Spark type: " + dtype)
def _type_spark2redshift(dtype): dtype = dtype.lower() if dtype == "int": return "INTEGER" elif dtype == "long": return "BIGINT" elif dtype == "float": return "FLOAT8" elif dtype == "bool": return "BOOLEAN" elif dtype == "string": return "VARCHAR(256)" elif dtype[:10] == "datetime.datetime": return "TIMESTAMP" else: raise UnsupportedType("Unsupported Spark type: " + dtype)
def type_python2athena(python_type):
    """Python type -> Athena type.

    Compares against str(type) representations like "<class 'int'>".

    BUG FIX: the boolean branch compared against "<class 'boll'>" (typo),
    so `bool` always raised UnsupportedType; corrected to "<class 'bool'>".

    :param python_type: A Python type object (int, float, bool, str, ...).
    :return: Athena type string.
    :raises UnsupportedType: For any unmapped Python type.
    """
    python_type = str(python_type)
    if python_type == "<class 'int'>":
        return "bigint"
    elif python_type == "<class 'float'>":
        return "double"
    elif python_type == "<class 'bool'>":
        return "boolean"
    elif python_type == "<class 'str'>":
        return "string"
    elif python_type == "<class 'datetime.datetime'>":
        return "timestamp"
    elif python_type == "<class 'datetime.date'>":
        return "date"
    else:
        raise UnsupportedType(f"Unsupported Python type: {python_type}")
def type_athena2python(dtype):
    # Athena type string -> Python type object.
    key = dtype.lower()
    if key in ("int", "integer", "bigint", "smallint", "tinyint"):
        return int
    if key in ("float", "double", "real"):
        return float
    if key == "boolean":
        return bool
    if key in ("string", "char", "varchar", "array", "row", "map"):
        return str
    if key == "timestamp":
        return datetime
    if key == "date":
        return date
    raise UnsupportedType(f"Unsupported Athena type: {key}")
def python2athena(python_type: type) -> str:
    """Python to Athena conversion.

    Compares against str(type) representations like "<class 'int'>".

    BUG FIX: the boolean branch compared against "<class 'boll'>" (typo),
    so `bool` always raised UnsupportedType; corrected to "<class 'bool'>".

    :param python_type: A Python type object (int, float, bool, str, ...).
    :return: Athena type string.
    :raises UnsupportedType: For any unmapped Python type.
    """
    python_type_str: str = str(python_type)
    if python_type_str == "<class 'int'>":
        return "bigint"
    elif python_type_str == "<class 'float'>":
        return "double"
    elif python_type_str == "<class 'bool'>":
        return "boolean"
    elif python_type_str == "<class 'str'>":
        return "string"
    elif python_type_str == "<class 'datetime.datetime'>":
        return "timestamp"
    elif python_type_str == "<class 'datetime.date'>":
        return "date"
    else:
        raise UnsupportedType(f"Unsupported Python type: {python_type_str}")
def athena2pandas(dtype):
    # Athena type -> pandas dtype name ("literal_eval" marks arrays that need parsing).
    key = dtype.lower()
    if key in ("int", "integer", "bigint", "smallint", "tinyint"):
        return "Int64"
    if key in ("float", "double", "real"):
        return "float64"
    if key == "boolean":
        return "bool"
    if key in ("string", "char", "varchar"):
        return "str"
    if key in ("timestamp", "timestamp with time zone"):
        return "datetime64"
    if key == "date":
        return "date"
    if key == "array":
        return "literal_eval"
    raise UnsupportedType(f"Unsupported Athena type: {key}")
def _type_pandas2athena(dtype): dtype = dtype.lower() if dtype == "int32": return "int" elif dtype == "int64": return "bigint" elif dtype == "float32": return "float" elif dtype == "float64": return "double" elif dtype == "bool": return "boolean" elif dtype == "object" and isinstance(dtype, string_types): return "string" elif dtype[:10] == "datetime64": return "string" else: raise UnsupportedType("Unsupported Pandas type: " + dtype)
def pandas2athena(dtype):
    """Pandas dtype string -> Athena type.

    FIX: removed dead code — the original tested `dtype in ["int64", "Int64"]`
    AFTER `dtype.lower()`, so "Int64" could never match; only "int64" can.
    Behavior is unchanged ("Int64" input still maps via its lower-cased form).

    :param dtype: pandas dtype name (case-insensitive).
    :return: Athena type string.
    :raises UnsupportedType: For any unmapped dtype.
    """
    dtype = dtype.lower()
    if dtype == "int32":
        return "int"
    elif dtype == "int64":
        return "bigint"
    elif dtype == "float32":
        return "float"
    elif dtype == "float64":
        return "double"
    elif dtype == "bool":
        return "boolean"
    elif dtype == "object":
        return "string"
    elif dtype.startswith("datetime64"):
        return "timestamp"
    else:
        raise UnsupportedType(f"Unsupported Pandas type: {dtype}")
def pandas2redshift(dtype):
    """Pandas dtype string -> Redshift type (legacy variant).

    FIX: dropped `isinstance(dtype, str)` on the "object" branch — `dtype`
    was just lower-cased, so it is always a str and the check was a tautology.

    :param dtype: pandas dtype name (case-insensitive).
    :return: Redshift column type string.
    :raises UnsupportedType: For any unmapped dtype.
    """
    dtype = dtype.lower()
    if dtype == "int32":
        return "INTEGER"
    elif dtype == "int64":
        return "BIGINT"
    elif dtype == "float32":
        return "FLOAT4"
    elif dtype == "float64":
        return "FLOAT8"
    elif dtype == "bool":
        return "BOOLEAN"
    elif dtype == "object":
        return "VARCHAR(256)"
    elif dtype[:10] == "datetime64":
        return "TIMESTAMP"
    else:
        raise UnsupportedType("Unsupported Pandas type: " + dtype)
def type_pandas2athena(dtype):
    """Pandas dtype string -> Athena type.

    FIXES: (1) removed dead "Int64" from the membership test — `dtype` is
    lower-cased first, so only "int64" can match; (2) dropped the tautological
    `isinstance(dtype, str)` guard on the "object" branch.

    :param dtype: pandas dtype name (case-insensitive).
    :return: Athena type string.
    :raises UnsupportedType: For any unmapped dtype.
    """
    dtype = dtype.lower()
    if dtype == "int32":
        return "int"
    elif dtype == "int64":
        return "bigint"
    elif dtype == "float32":
        return "float"
    elif dtype == "float64":
        return "double"
    elif dtype == "bool":
        return "boolean"
    elif dtype == "object":
        return "string"
    elif dtype[:10] == "datetime64":
        return "timestamp"
    else:
        raise UnsupportedType(f"Unsupported Pandas type: {dtype}")
def athena2python(dtype: str) -> Optional[type]:
    # Athena type string -> Python type object ("unknown" yields None).
    key = dtype.lower()
    if key in ("int", "integer", "bigint", "smallint", "tinyint"):
        return int
    if key in ("float", "double", "real"):
        return float
    if key == "boolean":
        return bool
    if key in ("string", "char", "varchar", "array", "row", "map"):
        return str
    if key == "timestamp":
        return datetime
    if key == "date":
        return date
    if key == "unknown":
        return None
    if key == "decimal":
        return Decimal
    raise UnsupportedType(f"Unsupported Athena type: {key}")
def pandas2redshift(dtype: str, varchar_length: int = 256) -> str:
    # Pandas dtype string -> Redshift type; text-like dtypes become VARCHAR(varchar_length).
    key = dtype.lower()
    numeric = {
        "int32": "INTEGER",
        "int64": "BIGINT",
        "float32": "FLOAT4",
        "float64": "FLOAT8",
        "bool": "BOOLEAN",
    }
    if key in numeric:
        return numeric[key]
    if key in ("string", "object"):
        return f"VARCHAR({varchar_length})"
    if key[:10] == "datetime64":
        return "TIMESTAMP"
    raise UnsupportedType("Unsupported Pandas type: " + key)
def athena2pandas(dtype: str) -> str:
    # Athena type -> pandas dtype name (arrays map to "list", decimals to "decimal").
    key = dtype.lower()
    if key in ("int", "integer", "bigint", "smallint", "tinyint"):
        return "Int64"
    if key in ("float", "double", "real"):
        return "float64"
    if key == "boolean":
        return "bool"
    if key in ("string", "char", "varchar"):
        return "string"
    if key in ("timestamp", "timestamp with time zone"):
        return "datetime64"
    if key == "date":
        return "date"
    if key == "array":
        return "list"
    if key == "decimal":
        return "decimal"
    raise UnsupportedType(f"Unsupported Athena type: {key}")
def pandas2athena(dtype: str) -> str:
    """Pandas to Athena conversion.

    FIXES: (1) docstring said "Pandas to Aurora conversion", but this function
    returns Athena type names (matching its name); (2) removed dead "Int64"
    from the membership test — `dtype` is lower-cased first, so only "int64"
    can ever match.

    :param dtype: pandas dtype name (case-insensitive).
    :return: Athena type string.
    :raises UnsupportedType: For any unmapped dtype.
    """
    dtype = dtype.lower()
    if dtype == "int32":
        return "int"
    elif dtype == "int64":
        return "bigint"
    elif dtype == "float32":
        return "float"
    elif dtype == "float64":
        return "double"
    elif dtype == "bool":
        return "boolean"
    elif dtype == "string":
        return "string"
    elif dtype == "object":
        return "string"
    elif dtype.startswith("datetime64"):
        return "timestamp"
    else:
        raise UnsupportedType(f"Unsupported Pandas type: {dtype}")
def athena2redshift(dtype: str, varchar_length: int = 256) -> str:
    # Athena type -> Redshift type; complex/text types become VARCHAR(varchar_length).
    key = dtype.lower()
    if key == "smallint":
        return "SMALLINT"
    if key in ("int", "integer"):
        return "INTEGER"
    if key == "bigint":
        return "BIGINT"
    if key == "float":
        return "FLOAT4"
    if key == "double":
        return "FLOAT8"
    if key in ("boolean", "bool"):
        return "BOOL"
    if key in ("string", "char", "varchar", "array", "row", "map"):
        return f"VARCHAR({varchar_length})"
    if key == "timestamp":
        return "TIMESTAMP"
    if key == "date":
        return "DATE"
    raise UnsupportedType(f"Unsupported Athena type: {key}")
def redshift2pyarrow(dtype: str) -> str:
    # Redshift type -> pyarrow type name. Comparison is case-sensitive
    # (uppercase Redshift names), matching the original behavior.
    name = str(dtype)
    groups = (
        (("SMALLINT", "INT2"), "int16"),
        (("INTEGER", "INT", "INT4"), "int32"),
        (("BIGINT", "INT8"), "int64"),
        (("REAL", "FLOAT4"), "float32"),
        (("DOUBLE PRECISION", "FLOAT8", "FLOAT"), "float64"),
        (("BOOLEAN", "BOOL"), "bool"),
        (("VARCHAR", "CHARACTER VARYING", "NVARCHAR", "TEXT"), "string"),
        (("DATE",), "date32"),
        (("TIMESTAMP",), "timestamp[ns]"),
    )
    for aliases, pyarrow_name in groups:
        if name in aliases:
            return pyarrow_name
    raise UnsupportedType(f"Unsupported Redshift type: {name}")
def redshift2athena(dtype: str) -> str:
    # Redshift type -> Athena type name. Comparison is case-sensitive
    # (uppercase Redshift names), matching the original behavior.
    name = str(dtype)
    groups = (
        (("SMALLINT", "INT2"), "smallint"),
        (("INTEGER", "INT", "INT4"), "int"),
        (("BIGINT", "INT8"), "bigint"),
        (("REAL", "FLOAT4"), "float"),
        (("DOUBLE PRECISION", "FLOAT8", "FLOAT"), "double"),
        (("BOOLEAN", "BOOL"), "boolean"),
        (("VARCHAR", "CHARACTER VARYING", "NVARCHAR", "TEXT"), "string"),
        (("DATE",), "date"),
        (("TIMESTAMP",), "timestamp"),
    )
    for aliases, athena_name in groups:
        if name in aliases:
            return athena_name
    raise UnsupportedType(f"Unsupported Redshift type: {name}")
def spark2redshift(dtype: str) -> str:
    # Spark type -> Redshift type (legacy variant: fixed VARCHAR(256) for strings).
    key = dtype.lower()
    table = {
        "smallint": "SMALLINT",
        "int": "INT",
        "bigint": "BIGINT",
        "float": "FLOAT4",
        "double": "FLOAT8",
        "bool": "BOOLEAN",
        "timestamp": "TIMESTAMP",
        "date": "DATE",
        "string": "VARCHAR(256)",
    }
    if key in table:
        return table[key]
    raise UnsupportedType("Unsupported Spark type: " + key)
def pyarrow2redshift(dtype):
    # pyarrow type -> Redshift type (legacy variant: no decimal support).
    key = str(dtype).lower()
    exact = {
        "int16": "SMALLINT",
        "int32": "INT",
        "int64": "BIGINT",
        "float": "FLOAT4",
        "double": "FLOAT8",
        "bool": "BOOLEAN",
        "string": "VARCHAR(256)",
    }
    if key in exact:
        return exact[key]
    # Prefix matches cover parameterized renderings like "timestamp[ns]" / "date32".
    if key.startswith("timestamp"):
        return "TIMESTAMP"
    if key.startswith("date"):
        return "DATE"
    raise UnsupportedType(f"Unsupported Pyarrow type: {dtype}")
def pyarrow2postgres(dtype: pa.types) -> str:
    # pyarrow type -> PostgreSQL type (fixed VARCHAR(256) for strings).
    key = str(dtype).lower()
    exact = {
        "int16": "SMALLINT",
        "int32": "INT",
        "int64": "BIGINT",
        "float": "FLOAT4",
        "double": "FLOAT8",
        "bool": "BOOLEAN",
        "string": "VARCHAR(256)",
    }
    if key in exact:
        return exact[key]
    if key.startswith("timestamp"):
        return "TIMESTAMP"
    if key.startswith("date"):
        return "DATE"
    if key.startswith("decimal"):
        # e.g. "decimal(10, 2)" -> "DECIMAL(10,2)"
        return key.replace(" ", "").upper()
    raise UnsupportedType(f"Unsupported Pyarrow type: {dtype}")
def athena2pyarrow(dtype):
    # Legacy variant: Athena type -> pyarrow type name.
    key = dtype.lower()
    lookup = {
        "tinyint": "int8",
        "smallint": "int16",
        "int": "int32",
        "integer": "int32",
        "bigint": "int64",
        "float": "float32",
        "double": "float64",
        "boolean": "bool",
        "bool": "bool",
        "string": "string",
        "char": "string",
        "varchar": "string",
        "array": "string",
        "row": "string",
        "map": "string",
        "timestamp": "timestamp[ns]",
        "date": "date32",
    }
    if key in lookup:
        return lookup[key]
    raise UnsupportedType(f"Unsupported Athena type: {key}")