def pyarrow2athena(dtype: pa.DataType) -> str:  # pylint: disable=too-many-branches,too-many-return-statements
    """Convert a Pyarrow data type into the equivalent Athena type string.

    Nested types (dictionary, list, struct) are resolved recursively.

    Parameters
    ----------
    dtype : pa.DataType
        Pyarrow data type to translate.

    Returns
    -------
    str
        Athena type name, e.g. ``"bigint"``, ``"decimal(10,2)"``,
        ``"array<string>"`` or ``"struct<a:int,b:string>"``.

    Raises
    ------
    exceptions.UnsupportedType
        If the type has no mapping (including uint64, which does not fit
        into a signed 64-bit ``bigint``).
    exceptions.UndetectedType
        If ``dtype`` is the Pyarrow null type.
    """
    if pa.types.is_int8(dtype):
        return "tinyint"
    # Each unsigned integer is widened to the next larger signed type so that
    # its full value range still fits.
    if pa.types.is_int16(dtype) or pa.types.is_uint8(dtype):
        return "smallint"
    if pa.types.is_int32(dtype) or pa.types.is_uint16(dtype):
        return "int"
    if pa.types.is_int64(dtype) or pa.types.is_uint32(dtype):
        return "bigint"
    if pa.types.is_uint64(dtype):
        # There is no signed type wide enough to hold every uint64 value.
        raise exceptions.UnsupportedType("There is no support for uint64, please consider int64 or uint32.")
    if pa.types.is_float32(dtype):
        return "float"
    if pa.types.is_float64(dtype):
        return "double"
    if pa.types.is_boolean(dtype):
        return "boolean"
    if pa.types.is_string(dtype):
        return "string"
    if pa.types.is_timestamp(dtype):
        return "timestamp"
    if pa.types.is_date(dtype):
        return "date"
    if pa.types.is_binary(dtype):
        return "binary"
    if pa.types.is_dictionary(dtype):
        # A dictionary-encoded column is described by the type of its values.
        return pyarrow2athena(dtype=dtype.value_type)
    if pa.types.is_decimal(dtype):
        return f"decimal({dtype.precision},{dtype.scale})"
    if pa.types.is_list(dtype):
        return f"array<{pyarrow2athena(dtype=dtype.value_type)}>"
    if pa.types.is_struct(dtype):  # pragma: no cover
        # Compact form without whitespace, in line with the space-free type
        # strings that athena_types_from_pandas produces for user casts.
        # NOTE(review): presumably the catalog type parser rejects embedded
        # spaces - confirm against the Athena documentation.
        fields = ",".join(f"{f.name}:{pyarrow2athena(dtype=f.type)}" for f in dtype)
        return f"struct<{fields}>"
    if dtype == pa.null():
        raise exceptions.UndetectedType("We can not infer the data type from an entire null object column")
    raise exceptions.UnsupportedType(f"Unsupported Pyarrow type: {dtype}")  # pragma: no cover
def athena_types_from_pandas(
    df: pd.DataFrame, index: bool, dtype: Optional[Dict[str, str]] = None, index_left: bool = False
) -> Dict[str, str]:
    """Map every column of a Pandas DataFrame to an Athena data type.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame whose column types are resolved.
    index : bool
        Forwarded to ``pyarrow_types_from_pandas``.
    dtype : Dict[str, str], optional
        Explicit Athena types keyed by column name. These columns are passed
        to ``pyarrow_types_from_pandas`` as ``ignore_cols``.
    index_left : bool
        Forwarded to ``pyarrow_types_from_pandas``.

    Returns
    -------
    Dict[str, str]
        Column name -> Athena type string.

    Raises
    ------
    exceptions.UndetectedType
        If a column's type cannot be inferred; the message names the column
        and suggests how to cast it.
    """
    casts: Dict[str, str] = dtype or {}
    pa_columns_types: Dict[str, Optional[pa.DataType]] = pyarrow_types_from_pandas(
        df=df, index=index, ignore_cols=list(casts), index_left=index_left
    )
    athena_columns_types: Dict[str, str] = {}
    for col_name, arrow_type in pa_columns_types.items():
        if arrow_type is None:
            # No Pyarrow type available: take the user-supplied cast, with
            # any spaces removed from the type string.
            athena_columns_types[col_name] = casts[col_name].replace(" ", "")
            continue
        try:
            athena_columns_types[col_name] = pyarrow2athena(dtype=arrow_type)
        except exceptions.UndetectedType as ex:
            # Re-raise with a message that points at the offending column.
            raise exceptions.UndetectedType(
                "Impossible to infer the equivalent Athena data type "
                f"for the {col_name} column. "
                "It is completely empty (only null values) "
                f"and has a too generic data type ({df[col_name].dtype}). "
                "Please, cast this columns with a more deterministic data type "
                f"(e.g. df['{col_name}'] = df['{col_name}'].astype('string')) or "
                "pass the column schema as argument for AWS Data Wrangler "
                f"(e.g. dtype={{'{col_name}': 'string'}}"
            ) from ex
    _logger.debug("athena_columns_types: %s", athena_columns_types)
    return athena_columns_types
def pyarrow2sqlalchemy(  # pylint: disable=too-many-branches,too-many-return-statements
    dtype: pa.DataType, db_type: str
) -> VisitableType:
    """Convert a Pyarrow data type into the equivalent SQLAlchemy type.

    Parameters
    ----------
    dtype : pa.DataType
        Pyarrow data type to translate.
    db_type : str
        Target database: ``"mysql"``, ``"postgresql"`` or ``"redshift"``.
        It is only consulted for float64, string and binary columns.

    Returns
    -------
    VisitableType
        A SQLAlchemy type class, or a configured type instance for
        Redshift strings (``VARCHAR(256)``) and decimals (``Numeric``).

    Raises
    ------
    exceptions.InvalidDatabaseType
        If ``db_type`` is not one of the supported databases for a
        float64 or string column.
    exceptions.UnsupportedType
        For binary columns on Redshift and for any unmapped Pyarrow type.
    exceptions.UndetectedType
        If ``dtype`` is the Pyarrow null type.
    """
    # int8 shares SmallInteger with int16.
    if pa.types.is_int8(dtype) or pa.types.is_int16(dtype):
        return sqlalchemy.types.SmallInteger
    if pa.types.is_int32(dtype):
        return sqlalchemy.types.Integer
    if pa.types.is_int64(dtype):
        return sqlalchemy.types.BigInteger
    if pa.types.is_float32(dtype):
        return sqlalchemy.types.Float
    if pa.types.is_float64(dtype):
        # Double precision is taken from the dialect-specific types.
        if db_type == "mysql":
            return sqlalchemy.dialects.mysql.DOUBLE
        if db_type == "postgresql":
            return sqlalchemy.dialects.postgresql.DOUBLE_PRECISION
        if db_type == "redshift":
            return sqlalchemy_redshift.dialect.DOUBLE_PRECISION
        raise exceptions.InvalidDatabaseType(
            f"{db_type} is a invalid database type, please choose between postgresql, mysql and redshift."
        )  # pragma: no cover
    if pa.types.is_boolean(dtype):
        return sqlalchemy.types.Boolean
    if pa.types.is_string(dtype):
        if db_type in ("mysql", "postgresql"):
            return sqlalchemy.types.Text
        if db_type == "redshift":
            return sqlalchemy.types.VARCHAR(length=256)
        raise exceptions.InvalidDatabaseType(
            f"{db_type} is a invalid database type. "
            "Please choose between postgresql, mysql and redshift."
        )  # pragma: no cover
    if pa.types.is_timestamp(dtype):
        return sqlalchemy.types.DateTime
    if pa.types.is_date(dtype):
        return sqlalchemy.types.Date
    if pa.types.is_binary(dtype):
        if db_type == "redshift":
            raise exceptions.UnsupportedType(
                "Binary columns are not supported for Redshift."
            )  # pragma: no cover
        # LargeBinary is the current name of the type; the old
        # sqlalchemy.types.Binary alias is deprecated and is absent from
        # newer SQLAlchemy releases.
        return sqlalchemy.types.LargeBinary
    if pa.types.is_decimal(dtype):
        return sqlalchemy.types.Numeric(precision=dtype.precision, scale=dtype.scale)
    if pa.types.is_dictionary(dtype):
        # A dictionary-encoded column is described by the type of its values.
        return pyarrow2sqlalchemy(dtype=dtype.value_type, db_type=db_type)
    if dtype == pa.null():  # pragma: no cover
        raise exceptions.UndetectedType(
            "We can not infer the data type from an entire null object column")
    raise exceptions.UnsupportedType(
        f"Unsupported Pyarrow type: {dtype}")  # pragma: no cover
def pyarrow2athena(  # pylint: disable=too-many-branches,too-many-return-statements
    dtype: pa.DataType, ignore_null: bool = False
) -> str:
    """Convert a Pyarrow data type into the equivalent Athena type string.

    Nested types (dictionary, list, struct, map) are resolved recursively
    and rendered without any whitespace.

    Parameters
    ----------
    dtype : pa.DataType
        Pyarrow data type to translate.
    ignore_null : bool
        If True, the Pyarrow null type yields an empty string instead of
        raising. The flag also applies to the value type of a dictionary.
        Element types of list/struct/map still raise, since an empty inner
        type would give a malformed string such as ``array<>``.

    Returns
    -------
    str
        Athena type name, e.g. ``"bigint"``, ``"array<string>"``,
        ``"struct<a:int,b:string>"`` or ``"map<string,int>"``; ``""`` for
        an ignored null type.

    Raises
    ------
    exceptions.UnsupportedType
        If the type has no mapping (including uint64).
    exceptions.UndetectedType
        If a null type is found and is not ignored.
    """
    if pa.types.is_int8(dtype):
        return "tinyint"
    # Each unsigned integer is widened to the next larger signed type so that
    # its full value range still fits.
    if pa.types.is_int16(dtype) or pa.types.is_uint8(dtype):
        return "smallint"
    if pa.types.is_int32(dtype) or pa.types.is_uint16(dtype):
        return "int"
    if pa.types.is_int64(dtype) or pa.types.is_uint32(dtype):
        return "bigint"
    if pa.types.is_uint64(dtype):
        raise exceptions.UnsupportedType(
            "There is no support for uint64, please consider int64 or uint32.")
    if pa.types.is_float32(dtype):
        return "float"
    if pa.types.is_float64(dtype):
        return "double"
    if pa.types.is_boolean(dtype):
        return "boolean"
    if pa.types.is_string(dtype):
        return "string"
    if pa.types.is_timestamp(dtype):
        return "timestamp"
    if pa.types.is_date(dtype):
        return "date"
    if pa.types.is_binary(dtype):
        return "binary"
    if pa.types.is_dictionary(dtype):
        # A dictionary-encoded column is described by the type of its values,
        # so the caller's null handling must carry over unchanged.
        return pyarrow2athena(dtype=dtype.value_type, ignore_null=ignore_null)
    if pa.types.is_decimal(dtype):
        return f"decimal({dtype.precision},{dtype.scale})"
    if pa.types.is_list(dtype):
        return f"array<{pyarrow2athena(dtype=dtype.value_type)}>"
    if pa.types.is_struct(dtype):
        fields = ",".join(f"{f.name}:{pyarrow2athena(dtype=f.type)}" for f in dtype)
        return f"struct<{fields}>"
    if pa.types.is_map(dtype):
        # Same compact, space-free form as struct above.
        key_type = pyarrow2athena(dtype=dtype.key_type)
        item_type = pyarrow2athena(dtype=dtype.item_type)
        return f"map<{key_type},{item_type}>"
    if dtype == pa.null():
        if ignore_null:
            return ""
        raise exceptions.UndetectedType(
            "We can not infer the data type from an entire null object column")
    raise exceptions.UnsupportedType(f"Unsupported Pyarrow type: {dtype}")