Beispiel #1
0
# PyArrow type names (lower-cased ``str(dtype)``) that map one-to-one to Athena.
_SIMPLE_ATHENA_TYPES = {
    "int8": "tinyint",
    "int16": "smallint",
    "int32": "int",
    "int64": "bigint",
    "float": "float",
    "double": "double",
    "bool": "boolean",
    "string": "string",
}


def pyarrow2athena(dtype: "pa.DataType") -> str:
    """Translate a PyArrow data type into the matching Athena type name.

    The decision is made on the lower-cased string form of ``dtype``.
    List types are translated recursively through ``dtype.value_type``.

    Args:
        dtype: The PyArrow type to translate.

    Returns:
        The Athena type name, e.g. ``"bigint"``, ``"decimal(10,2)"`` or
        ``"array<string>"``.

    Raises:
        UndetectedType: If ``dtype`` is the PyArrow ``null`` type, i.e. no
            concrete type could be inferred for the column.
        UnsupportedType: If ``dtype`` has no Athena equivalent known to this
            function.
    """
    dtype_str = str(dtype).lower()

    athena_type = _SIMPLE_ATHENA_TYPES.get(dtype_str)
    if athena_type is not None:
        return athena_type

    if dtype_str.startswith("timestamp"):
        return "timestamp"
    if dtype_str.startswith("date"):
        return "date"
    if dtype_str.startswith("decimal"):
        # PyArrow may render the type as "decimal(p, s)" or, in newer
        # releases, "decimal128(p, s)". Athena only understands
        # "decimal(p,s)", so keep just the parenthesised parameters and drop
        # whatever width suffix precedes them.
        _, paren, params = dtype_str.partition("(")
        return "decimal" + (paren + params).replace(" ", "")
    if dtype_str.startswith("list"):
        return f"array<{pyarrow2athena(dtype.value_type)}>"
    if dtype_str == "null":
        raise UndetectedType("We can't infer the data type from an entire null object column")
    raise UnsupportedType(f"Unsupported Pyarrow type: {dtype}")
Beispiel #2
0
    def _build_schema(
        dataframe,
        partition_cols: Optional[List[str]],
        preserve_index: bool,
        indexes_position: str,
        cast_columns: Optional[Dict[str, str]] = None
    ) -> Tuple[List[Tuple[str, str]], List[Tuple[str, str]]]:
        """Derive the Athena schema of a dataframe, split by partitioning.

        The PyArrow schema is obtained from
        ``data_types.extract_pyarrow_schema_from_pandas`` and every column is
        translated with ``data_types.pyarrow2athena``, unless the column is
        listed in ``cast_columns``, in which case the given type is used
        verbatim and no inference takes place.

        Args:
            dataframe: Source dataframe (presumably a pandas DataFrame; it is
                only forwarded to the schema extraction helper and logged).
            partition_cols: Names of the partition columns; ``None`` means
                there are none.
            preserve_index: Forwarded to the schema extraction helper.
            indexes_position: Forwarded to the schema extraction helper.
            cast_columns: Optional mapping of column name to an explicit
                Athena type that overrides inference for that column.

        Returns:
            A pair ``(schema_built, partition_cols_schema_built)`` of
            ``(column name, Athena type)`` lists. The first holds the
            non-partition columns in extracted-schema order, the second the
            partition columns in the order given by ``partition_cols``.

        Raises:
            UndetectedType: If a column's type could not be inferred; the
                message names the offending column.
            UnsupportedType: If a column's PyArrow type has no Athena
                equivalent; the message names the offending column.
            KeyError: If a name in ``partition_cols`` is not present in the
                extracted schema.
        """
        if cast_columns is None:
            cast_columns = {}
        if partition_cols is None:
            partition_cols = []
        # Lazy %-formatting: the dtype repr is only built when DEBUG is on.
        logger.debug("dataframe.dtypes:\n%s", dataframe.dtypes)

        pyarrow_schema: List[Tuple[str, Any]] = (
            data_types.extract_pyarrow_schema_from_pandas(
                dataframe=dataframe,
                preserve_index=preserve_index,
                indexes_position=indexes_position))

        schema_built: List[Tuple[str, str]] = []
        partition_cols_types: Dict[str, str] = {}
        for name, dtype in pyarrow_schema:
            if name in cast_columns:
                # An explicit cast wins over inference.
                athena_type = cast_columns[name]
            else:
                try:
                    athena_type = data_types.pyarrow2athena(dtype)
                except UndetectedType as err:
                    # Re-raise with the column name so the caller knows what to cast.
                    raise UndetectedType(
                        f"We can't infer the data type from an entire null object column ({name}). "
                        f"Please consider pass the type of this column explicitly using the cast "
                        f"columns argument") from err
                except UnsupportedType as err:
                    raise UnsupportedType(
                        f"Unsupported Pyarrow type for column {name}: {dtype}") from err

            if name in partition_cols:
                partition_cols_types[name] = athena_type
            else:
                schema_built.append((name, athena_type))

        # Emit partition columns in the caller-specified order, not schema order.
        partition_cols_schema_built: List[Tuple[str, str]] = [
            (name, partition_cols_types[name]) for name in partition_cols
        ]

        logger.debug("schema_built:\n%s", schema_built)
        logger.debug("partition_cols_schema_built:\n%s",
                     partition_cols_schema_built)
        return schema_built, partition_cols_schema_built