def parse_path(path: str) -> Tuple[str, str]:
    """Split a full S3 path in bucket and key strings.

    's3://bucket/key' -> ('bucket', 'key')

    Parameters
    ----------
    path : str
        S3 path (e.g. s3://bucket/key).

    Returns
    -------
    Tuple[str, str]
        Tuple of bucket and key strings

    Examples
    --------
    >>> from awswrangler._utils import parse_path
    >>> bucket, key = parse_path('s3://bucket/key')

    """
    if path.startswith("s3://") is False:
        raise exceptions.InvalidArgumentValue(f"'{path}' is not a valid path. It MUST start with 's3://'")
    parts = path.replace("s3://", "").split("/", 1)
    bucket: str = parts[0]
    if "/" in bucket:
        raise exceptions.InvalidArgumentValue(f"'{bucket}' is not a valid bucket name.")
    key: str = ""
    if len(parts) == 2:
        key = key if parts[1] is None else parts[1]
    return bucket, key
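# Minimal usage sketch for parse_path above; the bucket and key values are
# illustrative, not taken from any real account:
_example_bucket, _example_key = parse_path("s3://my-bucket/some/prefix/file.parquet")
assert (_example_bucket, _example_key) == ("my-bucket", "some/prefix/file.parquet")
assert parse_path("s3://my-bucket") == ("my-bucket", "")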
def __init__( self, path: str, s3_block_size: int, mode: str, use_threads: Union[bool, int], s3_additional_kwargs: Optional[Dict[str, str]], boto3_session: Optional[boto3.Session], newline: Optional[str], encoding: Optional[str], ) -> None: super().__init__() self._use_threads = use_threads self._newline: str = "\n" if newline is None else newline self._encoding: str = "utf-8" if encoding is None else encoding self._bucket, self._key = _utils.parse_path(path=path) self._boto3_session: boto3.Session = _utils.ensure_session(session=boto3_session) if mode not in {"rb", "wb", "r", "w"}: raise NotImplementedError("File mode must be {'rb', 'wb', 'r', 'w'}, not %s" % mode) self._mode: str = "rb" if mode is None else mode self._one_shot_download: bool = False if 0 < s3_block_size < 3: raise exceptions.InvalidArgumentValue( "s3_block_size MUST > 2 to define a valid size or " "< 1 to avoid blocks and always execute one shot downloads." ) if s3_block_size <= 0: _logger.debug("s3_block_size of %d, enabling one_shot_download.", s3_block_size) self._one_shot_download = True self._s3_block_size: int = s3_block_size self._s3_half_block_size: int = s3_block_size // 2 self._s3_additional_kwargs: Dict[str, str] = {} if s3_additional_kwargs is None else s3_additional_kwargs self._client: boto3.client = _utils.client(service_name="s3", session=self._boto3_session) self._loc: int = 0 if self.readable() is True: self._cache: bytes = b"" self._start: int = 0 self._end: int = 0 size: Optional[int] = size_objects( path=[path], use_threads=False, boto3_session=self._boto3_session, s3_additional_kwargs=self._s3_additional_kwargs, )[path] if size is None: raise exceptions.InvalidArgumentValue(f"S3 object w/o defined size: {path}") self._size: int = size _logger.debug("self._size: %s", self._size) _logger.debug("self._s3_block_size: %s", self._s3_block_size) elif self.writable() is True: self._mpu: Dict[str, Any] = {} self._buffer: io.BytesIO = io.BytesIO() self._parts_count: int = 0 self._size = 0 self._upload_proxy: _UploadProxy = _UploadProxy(use_threads=self._use_threads) else: raise RuntimeError(f"Invalid mode: {self._mode}")
def _validate_args(
    df: pd.DataFrame,
    table: Optional[str],
    database: Optional[str],
    dataset: bool,
    path: Optional[str],
    partition_cols: Optional[List[str]],
    bucketing_info: Optional[Tuple[List[str], int]],
    mode: Optional[str],
    description: Optional[str],
    parameters: Optional[Dict[str, str]],
    columns_comments: Optional[Dict[str, str]],
) -> None:
    if df.empty is True:
        raise exceptions.EmptyDataFrame("DataFrame cannot be empty.")
    if dataset is False:
        if path is None:
            raise exceptions.InvalidArgumentValue("If dataset is False, the `path` argument must be passed.")
        if path.endswith("/"):
            raise exceptions.InvalidArgumentValue(
                "If <dataset=False>, the argument <path> should be a key, not a prefix."
            )
        if partition_cols:
            raise exceptions.InvalidArgumentCombination("Please, pass dataset=True to be able to use partition_cols.")
        if bucketing_info:
            raise exceptions.InvalidArgumentCombination("Please, pass dataset=True to be able to use bucketing_info.")
        if mode is not None:
            raise exceptions.InvalidArgumentCombination("Please pass dataset=True to be able to use mode.")
        if any(arg is not None for arg in (table, description, parameters, columns_comments)):
            raise exceptions.InvalidArgumentCombination(
                "Please pass dataset=True to be able to use any one of these "
                "arguments: database, table, description, parameters, "
                "columns_comments."
            )
    elif (database is None) != (table is None):
        raise exceptions.InvalidArgumentCombination(
            "Arguments database and table must be passed together. If you want to store your dataset metadata in "
            "the Glue Catalog, please ensure you are passing both."
        )
    elif all(x is None for x in [path, database, table]):
        raise exceptions.InvalidArgumentCombination(
            "You must specify a `path` if dataset is True and database/table are not enabled."
        )
    elif bucketing_info and bucketing_info[1] <= 0:
        raise exceptions.InvalidArgumentValue(
            "Please pass a value greater than 1 for the number of buckets for bucketing."
        )
def _check_schema_changes(columns_types: Dict[str, str], table_input: Optional[Dict[str, Any]], mode: str) -> None:
    if (table_input is not None) and (mode in ("append", "overwrite_partitions")):
        catalog_cols: Dict[str, str] = {x["Name"]: x["Type"] for x in table_input["StorageDescriptor"]["Columns"]}
        for c, t in columns_types.items():
            if c not in catalog_cols:
                raise exceptions.InvalidArgumentValue(
                    f"Schema change detected: New column {c} with type {t}. "
                    "Please pass schema_evolution=True to allow new columns "
                    "behaviour."
                )
            if t != catalog_cols[c]:  # Data type change detected!
                raise exceptions.InvalidArgumentValue(
                    f"Schema change detected: Data type change on column {c} "
                    f"(Old type: {catalog_cols[c]} / New type {t})."
                )
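# Hedged illustration of the Glue table_input shape _check_schema_changes reads
# ("StorageDescriptor"/"Columns" follow the Glue GetTable response; the column
# names and types below are made up for the example):
_example_table_input = {
    "StorageDescriptor": {
        "Columns": [
            {"Name": "col0", "Type": "bigint"},
            {"Name": "col1", "Type": "string"},
        ]
    }
}
# Matching schema -> returns silently:
_check_schema_changes(
    columns_types={"col0": "bigint", "col1": "string"}, table_input=_example_table_input, mode="append"
)
# A new column (e.g. "col2") or a changed type would raise exceptions.InvalidArgumentValue.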
def _validate_args(
    df: pd.DataFrame,
    table: Optional[str],
    database: Optional[str],
    dataset: bool,
    path: str,
    partition_cols: Optional[List[str]],
    mode: Optional[str],
    description: Optional[str],
    parameters: Optional[Dict[str, str]],
    columns_comments: Optional[Dict[str, str]],
) -> None:
    if df.empty is True:
        raise exceptions.EmptyDataFrame()
    if dataset is False:
        if path.endswith("/"):
            raise exceptions.InvalidArgumentValue(
                "If <dataset=False>, the argument <path> should be a file path, not a directory."
            )
        if partition_cols:
            raise exceptions.InvalidArgumentCombination("Please, pass dataset=True to be able to use partition_cols.")
        if mode is not None:
            raise exceptions.InvalidArgumentCombination("Please pass dataset=True to be able to use mode.")
        if any(arg is not None for arg in (table, description, parameters, columns_comments)):
            raise exceptions.InvalidArgumentCombination(
                "Please pass dataset=True to be able to use any one of these "
                "arguments: database, table, description, parameters, "
                "columns_comments."
            )
    elif (database is None) != (table is None):
        raise exceptions.InvalidArgumentCombination(
            "Arguments database and table must be passed together. If you want to store your dataset metadata in "
            "the Glue Catalog, please ensure you are passing both."
        )
def _paginate_stream(
    args: Dict[str, Any], path: str, use_threads: Union[bool, int], boto3_session: Optional[boto3.Session]
) -> pd.DataFrame:
    obj_size: int = size_objects(  # type: ignore
        path=[path],
        use_threads=False,
        boto3_session=boto3_session,
    ).get(path)
    if obj_size is None:
        raise exceptions.InvalidArgumentValue(f"S3 object w/o defined size: {path}")
    dfs: List[pd.DataFrame] = []
    client_s3: boto3.client = _utils.client(service_name="s3", session=boto3_session)

    if use_threads is False:
        dfs = list(
            _select_object_content(
                args=args,
                client_s3=client_s3,
                scan_range=scan_range,
            )
            for scan_range in _gen_scan_range(obj_size=obj_size)
        )
    else:
        cpus: int = _utils.ensure_cpu_count(use_threads=use_threads)
        with concurrent.futures.ThreadPoolExecutor(max_workers=cpus) as executor:
            dfs = list(
                executor.map(
                    _select_object_content,
                    itertools.repeat(args),
                    itertools.repeat(client_s3),
                    _gen_scan_range(obj_size=obj_size),
                )
            )
    return pd.concat(dfs, ignore_index=True)
def _paginate_stream(
    args: Dict[str, Any], path: str, use_threads: Union[bool, int], boto3_session: Optional[boto3.Session]
) -> pd.DataFrame:
    obj_size: int = size_objects(  # type: ignore
        path=[path],
        use_threads=False,
        boto3_session=boto3_session,
    ).get(path)
    if obj_size is None:
        raise exceptions.InvalidArgumentValue(f"S3 object w/o defined size: {path}")
    scan_ranges = _gen_scan_range(obj_size=obj_size)

    if use_threads is False:
        stream_records = list(
            _select_object_content(
                args=args,
                boto3_session=boto3_session,
                scan_range=scan_range,
            )
            for scan_range in scan_ranges
        )
    else:
        cpus: int = _utils.ensure_cpu_count(use_threads=use_threads)
        with concurrent.futures.ThreadPoolExecutor(max_workers=cpus) as executor:
            stream_records = list(
                executor.map(
                    _select_object_content,
                    itertools.repeat(args),
                    itertools.repeat(boto3_session),
                    scan_ranges,
                )
            )
    return pd.DataFrame([item for sublist in stream_records for item in sublist])  # Flatten list of lists
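# Hypothetical sketch (not awswrangler's actual _gen_scan_range) of how an object of
# obj_size bytes could be split into contiguous scan ranges for S3 Select pagination,
# the idea the two _paginate_stream variants above rely on:
def _example_gen_scan_range(obj_size: int, chunk_size: int = 1_048_576):
    for start in range(0, obj_size, chunk_size):
        yield start, min(start + chunk_size, obj_size)

# list(_example_gen_scan_range(obj_size=2_500_000, chunk_size=1_000_000))
# -> [(0, 1000000), (1000000, 2000000), (2000000, 2500000)]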
def extract_partitions_metadata_from_paths(
    path: str, paths: List[str]
) -> Tuple[Optional[Dict[str, str]], Optional[Dict[str, List[str]]]]:
    """Extract partitions metadata from Amazon S3 paths."""
    path = path if path.endswith("/") else f"{path}/"
    partitions_types: Dict[str, str] = {}
    partitions_values: Dict[str, List[str]] = {}
    for p in paths:
        if path not in p:
            raise exceptions.InvalidArgumentValue(
                f"Object {p} is not under the root path ({path})."
            )  # pragma: no cover
        path_wo_filename: str = p.rpartition("/")[0] + "/"
        if path_wo_filename not in partitions_values:
            path_wo_prefix: str = path_wo_filename.replace(f"{path}/", "")
            dirs: List[str] = [x for x in path_wo_prefix.split("/") if (x != "") and ("=" in x)]
            if dirs:
                values_tups: List[Tuple[str, str]] = [tuple(x.split("=")[:2]) for x in dirs]  # type: ignore
                values_dics: Dict[str, str] = dict(values_tups)
                p_values: List[str] = list(values_dics.values())
                p_types: Dict[str, str] = {x: "string" for x in values_dics.keys()}
                if not partitions_types:
                    partitions_types = p_types
                if p_values:
                    partitions_types = p_types
                    partitions_values[path_wo_filename] = p_values
                elif p_types != partitions_types:  # pragma: no cover
                    raise exceptions.InvalidSchemaConvergence(
                        f"At least two different partitions schema detected: {partitions_types} and {p_types}"
                    )
    if not partitions_types:
        return None, None
    return partitions_types, partitions_values
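# Minimal sketch of the Hive-style layout this helper parses; bucket, prefix and
# partition values are illustrative:
_example_types, _example_values = extract_partitions_metadata_from_paths(
    path="s3://bucket/dataset/",
    paths=[
        "s3://bucket/dataset/year=2021/month=01/part0.parquet",
        "s3://bucket/dataset/year=2021/month=02/part0.parquet",
    ],
)
# _example_types  -> {"year": "string", "month": "string"}
# _example_values -> {"s3://bucket/dataset/year=2021/month=01/": ["2021", "01"],
#                     "s3://bucket/dataset/year=2021/month=02/": ["2021", "02"]}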
def _resolve_query_with_cache( # pylint: disable=too-many-return-statements cache_info, categories: Optional[List[str]], chunksize: Optional[Union[int, bool]], use_threads: bool, session: Optional[boto3.Session], ): """Fetch cached data and return it as a pandas Dataframe (or list of Dataframes).""" if cache_info["data_type"] == "parquet": manifest_path = cache_info["query_execution_info"]["Statistics"]["DataManifestLocation"] # this is needed just so we can access boto's modeled exceptions client_s3: boto3.client = _utils.client(service_name="s3", session=session) try: paths: List[str] = _extract_ctas_manifest_paths(path=manifest_path, boto3_session=session) except (client_s3.exceptions.NoSuchBucket, client_s3.exceptions.NoSuchKey): # pragma: no cover return None if all([s3.does_object_exist(path) for path in paths]): chunked: Union[bool, int] = False if chunksize is None else chunksize _logger.debug("chunked: %s", chunked) if not paths: # pragma: no cover if chunked is False: return pd.DataFrame() return _utils.empty_generator() ret = s3.read_parquet( path=paths, use_threads=use_threads, boto3_session=session, chunked=chunked, categories=categories ) _logger.debug(type(ret)) return ret elif cache_info["data_type"] == "csv": dtype, parse_timestamps, parse_dates, converters, binaries = _get_query_metadata( query_execution_id=cache_info["query_execution_info"]["QueryExecutionId"], categories=categories, boto3_session=session, ) path = cache_info["query_execution_info"]["ResultConfiguration"]["OutputLocation"] if s3.does_object_exist(path=path, boto3_session=session): _logger.debug("Start CSV reading from %s", path) _chunksize: Optional[int] = chunksize if isinstance(chunksize, int) else None _logger.debug("_chunksize: %s", _chunksize) ret = s3.read_csv( path=[path], dtype=dtype, parse_dates=parse_timestamps, converters=converters, quoting=csv.QUOTE_ALL, keep_default_na=False, na_values=[""], chunksize=_chunksize, skip_blank_lines=False, use_threads=False, boto3_session=session, ) _logger.debug("Start type casting...") _logger.debug(type(ret)) if chunksize is None: df = _fix_csv_types(df=ret, parse_dates=parse_dates, binaries=binaries) return df dfs = _fix_csv_types_generator(dfs=ret, parse_dates=parse_dates, binaries=binaries) return dfs raise exceptions.InvalidArgumentValue(f"Invalid data type: {cache_info['data_type']}.") # pragma: no cover
def _validate_args(
    df: pd.DataFrame,
    table: Optional[str],
    dataset: bool,
    path: str,
    partition_cols: Optional[List[str]],
    mode: Optional[str],
    description: Optional[str],
    parameters: Optional[Dict[str, str]],
    columns_comments: Optional[Dict[str, str]],
) -> None:
    if df.empty is True:
        raise exceptions.EmptyDataFrame()
    if dataset is False:
        if path.endswith("/"):
            raise exceptions.InvalidArgumentValue(
                "If <dataset=False>, the argument <path> should be an object path, not a directory."
            )
        if partition_cols:
            raise exceptions.InvalidArgumentCombination("Please, pass dataset=True to be able to use partition_cols.")
        if mode is not None:
            raise exceptions.InvalidArgumentCombination("Please pass dataset=True to be able to use mode.")
        if any(arg is not None for arg in (table, description, parameters, columns_comments)):
            raise exceptions.InvalidArgumentCombination(
                "Please pass dataset=True to be able to use any one of these "
                "arguments: database, table, description, parameters, "
                "columns_comments."
            )
def upsert_table_parameters( parameters: Dict[str, str], database: str, table: str, transaction_id: Optional[str] = None, catalog_versioning: bool = False, catalog_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None, ) -> Dict[str, str]: """Insert or Update the received parameters. Parameters ---------- parameters : Dict[str, str] e.g. {"source": "mysql", "destination": "datalake"} database : str Database name. table : str Table name. transaction_id: str, optional The ID of the transaction (i.e. used with GOVERNED tables). catalog_versioning : bool If True and `mode="overwrite"`, creates an archived version of the table catalog before updating it. catalog_id : str, optional The ID of the Data Catalog from which to retrieve Databases. If none is provided, the AWS account ID is used by default. boto3_session : boto3.Session(), optional Boto3 Session. The default boto3 session will be used if boto3_session receive None. Returns ------- Dict[str, str] All parameters after the upsert. Examples -------- >>> import awswrangler as wr >>> pars = wr.catalog.upsert_table_parameters( ... parameters={"source": "mysql", "destination": "datalake"}, ... database="...", ... table="...") """ session: boto3.Session = _utils.ensure_session(session=boto3_session) table_input: Optional[Dict[str, str]] = _get_table_input( database=database, table=table, boto3_session=session, transaction_id=transaction_id, catalog_id=catalog_id ) if table_input is None: raise exceptions.InvalidArgumentValue(f"Table {database}.{table} does not exist.") return _upsert_table_parameters( parameters=parameters, database=database, boto3_session=session, transaction_id=transaction_id, catalog_id=catalog_id, table_input=table_input, catalog_versioning=catalog_versioning, )
def _apply_type(name: str, value: Any, dtype: Type[Union[str, bool, int]], nullable: bool) -> _ConfigValueType:
    if _Config._is_null(value=value):
        if nullable is True:
            return None
        raise exceptions.InvalidArgumentValue(
            f"{name} configuration does not accept a null value. Please pass {dtype}."
        )
    try:
        return dtype(value) if isinstance(value, dtype) is False else value
    except ValueError as ex:
        raise exceptions.InvalidConfiguration(f"Config {name} must receive a {dtype} value.") from ex
def _set_config_value(self, key: str, value: Any) -> None:
    if key not in _CONFIG_ARGS:
        raise exceptions.InvalidArgumentValue(
            f"{key} is not a valid configuration. Please use: {list(_CONFIG_ARGS.keys())}"
        )
    value_casted: _ConfigValueType = self._apply_type(
        name=key, value=value, dtype=_CONFIG_ARGS[key].dtype, nullable=_CONFIG_ARGS[key].nullable
    )
    self._loaded_values[key] = value_casted
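# Hedged, standalone illustration of the dtype/nullable casting pattern used by
# _apply_type/_set_config_value above. The _ExampleConfigArg shape and the two entry
# names are assumptions made for this sketch, not the library's internal _CONFIG_ARGS:
from collections import namedtuple

_ExampleConfigArg = namedtuple("_ExampleConfigArg", ["dtype", "nullable"])
_example_config_args = {
    "database": _ExampleConfigArg(dtype=str, nullable=True),
    "max_cache_seconds": _ExampleConfigArg(dtype=int, nullable=False),
}
# With this table, "900" would be cast to int 900 for "max_cache_seconds", while a
# null value would only be accepted for the nullable "database" entry.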
def _to_parquet_dataset(
    df: pd.DataFrame,
    path: str,
    index: bool,
    compression: Optional[str],
    compression_ext: str,
    cpus: int,
    fs: s3fs.S3FileSystem,
    use_threads: bool,
    mode: str,
    dtype: Dict[str, str],
    partition_cols: Optional[List[str]] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> Tuple[List[str], Dict[str, List[str]]]:
    paths: List[str] = []
    partitions_values: Dict[str, List[str]] = {}
    path = path if path[-1] == "/" else f"{path}/"
    if mode not in ["append", "overwrite", "overwrite_partitions"]:
        raise exceptions.InvalidArgumentValue(
            f"{mode} is an invalid mode, please use append, overwrite or overwrite_partitions."
        )
    if (mode == "overwrite") or ((mode == "overwrite_partitions") and (not partition_cols)):
        delete_objects(path=path, use_threads=use_threads, boto3_session=boto3_session)
    df = _data_types.cast_pandas_with_athena_types(df=df, dtype=dtype)
    schema: pa.Schema = _data_types.pyarrow_schema_from_pandas(
        df=df, index=index, ignore_cols=partition_cols, dtype=dtype
    )
    _logger.debug("schema: \n%s", schema)
    if not partition_cols:
        file_path: str = f"{path}{uuid.uuid4().hex}{compression_ext}.parquet"
        _to_parquet_file(
            df=df, schema=schema, path=file_path, index=index, compression=compression, cpus=cpus, fs=fs, dtype=dtype
        )
        paths.append(file_path)
    else:
        for keys, subgroup in df.groupby(by=partition_cols, observed=True):
            subgroup = subgroup.drop(partition_cols, axis="columns")
            keys = (keys,) if not isinstance(keys, tuple) else keys
            subdir = "/".join([f"{name}={val}" for name, val in zip(partition_cols, keys)])
            prefix: str = f"{path}{subdir}/"
            if mode == "overwrite_partitions":
                delete_objects(path=prefix, use_threads=use_threads, boto3_session=boto3_session)
            file_path = f"{prefix}{uuid.uuid4().hex}{compression_ext}.parquet"
            _to_parquet_file(
                df=subgroup,
                schema=schema,
                path=file_path,
                index=index,
                compression=compression,
                cpus=cpus,
                fs=fs,
                dtype=dtype,
            )
            paths.append(file_path)
            partitions_values[prefix] = [str(k) for k in keys]
    return paths, partitions_values
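# Standalone sketch of how the partition subdirectories above are derived from a
# DataFrame groupby (pure pandas, no S3 involved; the column names are made up):
import pandas as pd

_example_df = pd.DataFrame({"col": [1, 2, 3], "year": [2020, 2020, 2021], "month": [1, 2, 1]})
for _keys, _subgroup in _example_df.groupby(by=["year", "month"], observed=True):
    _keys = (_keys,) if not isinstance(_keys, tuple) else _keys
    _subdir = "/".join([f"{name}={val}" for name, val in zip(["year", "month"], _keys)])
    # _subdir takes the values "year=2020/month=1", "year=2020/month=2", "year=2021/month=1"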
def extract_athena_types(
    df: pd.DataFrame,
    index: bool = False,
    partition_cols: Optional[List[str]] = None,
    dtype: Optional[Dict[str, str]] = None,
    file_format: str = "parquet",
) -> Tuple[Dict[str, str], Dict[str, str]]:
    """Extract columns and partitions types (Amazon Athena) from Pandas DataFrame.

    https://docs.aws.amazon.com/athena/latest/ug/data-types.html

    Parameters
    ----------
    df : pandas.DataFrame
        Pandas DataFrame.
    index : bool
        Should consider the DataFrame index as a column?.
    partition_cols : List[str], optional
        List of partitions names.
    dtype: Dict[str, str], optional
        Dictionary of columns names and Athena/Glue types to be casted.
        Useful when you have columns with undetermined or mixed data types.
        (e.g. {'col name': 'bigint', 'col2 name': 'int'})
    file_format : str, optional
        File format to be considered to place the index column: "parquet" | "csv".

    Returns
    -------
    Tuple[Dict[str, str], Optional[Dict[str, str]]]
        columns_types: Dictionary with keys as column names and values as
        data types (e.g. {'col0': 'bigint', 'col1': 'double'}). /
        partitions_types: Dictionary with keys as partition names
        and values as data types (e.g. {'col2': 'date'}).

    Examples
    --------
    >>> import awswrangler as wr
    >>> columns_types, partitions_types = wr.catalog.extract_athena_types(
    ...     df=df, index=False, partition_cols=["par0", "par1"], file_format="csv"
    ... )

    """
    if file_format == "parquet":
        index_left: bool = False
    elif file_format == "csv":
        index_left = True
    else:
        raise exceptions.InvalidArgumentValue("file_format argument must be parquet or csv")
    return _data_types.athena_types_from_pandas_partitioned(
        df=df, index=index, partition_cols=partition_cols, dtype=dtype, index_left=index_left
    )
def _extract_partitions_from_path(path_root: str, path: str) -> Dict[str, str]:
    """Extract partitions values and names from Amazon S3 path."""
    path_root = path_root if path_root.endswith("/") else f"{path_root}/"
    if path_root not in path:
        raise exceptions.InvalidArgumentValue(f"Object {path} is not under the root path ({path_root}).")
    path_wo_filename: str = path.rpartition("/")[0] + "/"
    path_wo_prefix: str = path_wo_filename.replace(f"{path_root}/", "")
    dirs: Tuple[str, ...] = tuple(x for x in path_wo_prefix.split("/") if (x != "") and (x.count("=") == 1))
    if not dirs:
        return {}
    values_tups = cast(Tuple[Tuple[str, str]], tuple(tuple(x.split("=")[:2]) for x in dirs))
    values_dics: Dict[str, str] = dict(values_tups)
    return values_dics
def parse_path(path: str, multipart: bool = False) -> Union[Tuple[str, str], Tuple[str, str, List[str]]]:
    """Split a full S3 path in bucket and key strings.

    If multipart is True, also returns the key split by /.

    's3://bucket/key' -> ('bucket', 'key')
    's3://bucket/keypart1/keypart2' -> ('bucket', 'keypart1/keypart2', ['keypart1', 'keypart2'])

    Parameters
    ----------
    path : str
        S3 path (e.g. s3://bucket/key).

    Returns
    -------
    Union[Tuple[str, str], Tuple[str, str, List[str]]]
        Tuple of bucket and key strings or Tuple of bucket, key string and List of key parts

    Examples
    --------
    >>> from awswrangler._utils import parse_path
    >>> bucket, key = parse_path('s3://bucket/key')
    ('bucket', 'key')
    >>> bucket, key, keyparts = parse_path('s3://bucket/keypart1/keypart2/file.csv', multipart=True)
    ('bucket', 'keypart1/keypart2', ['keypart1', 'keypart2', 'file'])

    """
    if path.startswith("s3://") is False:
        raise exceptions.InvalidArgumentValue(f"'{path}' is not a valid path. It MUST start with 's3://'")
    parts = path.replace("s3://", "").split("/", 1)
    bucket: str = parts[0]
    key: str = ""
    if multipart:
        keyparts: List[str] = []
        if len(parts) >= 2:
            key_string = key if parts[1] is None else parts[1]
            levels = key_string.count("/")
            keyparts = key_string.split("/", levels)
            keyparts[-1] = keyparts[-1].partition(".")[0]
            key = "/".join(keyparts[:-1])
        return bucket, key, keyparts
    if len(parts) == 2:
        key = key if parts[1] is None else parts[1]
    return bucket, key
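# Minimal usage sketch for the multipart variant above; the values follow the
# docstring example (note the extension is stripped from the last key part):
_mp_bucket, _mp_key, _mp_keyparts = parse_path("s3://bucket/keypart1/keypart2/file.csv", multipart=True)
assert _mp_bucket == "bucket"
assert _mp_key == "keypart1/keypart2"
assert _mp_keyparts == ["keypart1", "keypart2", "file"]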
def _to_dataset(
    func: Callable,
    concurrent_partitioning: bool,
    df: pd.DataFrame,
    path_root: str,
    index: bool,
    use_threads: bool,
    mode: str,
    partition_cols: Optional[List[str]],
    boto3_session: boto3.Session,
    **func_kwargs,
) -> Tuple[List[str], Dict[str, List[str]]]:
    path_root = path_root if path_root[-1] == "/" else f"{path_root}/"

    # Evaluate mode
    if mode not in ["append", "overwrite", "overwrite_partitions"]:
        raise exceptions.InvalidArgumentValue(
            f"{mode} is an invalid mode, please use append, overwrite or overwrite_partitions."
        )
    if (mode == "overwrite") or ((mode == "overwrite_partitions") and (not partition_cols)):
        delete_objects(path=path_root, use_threads=use_threads, boto3_session=boto3_session)

    # Writing
    partitions_values: Dict[str, List[str]] = {}
    if not partition_cols:
        paths: List[str] = [func(df=df, path_root=path_root, boto3_session=boto3_session, index=index, **func_kwargs)]
    else:
        paths, partitions_values = _to_partitions(
            func=func,
            concurrent_partitioning=concurrent_partitioning,
            df=df,
            path_root=path_root,
            use_threads=use_threads,
            mode=mode,
            partition_cols=partition_cols,
            boto3_session=boto3_session,
            index=index,
            **func_kwargs,
        )
    _logger.debug("paths: %s", paths)
    _logger.debug("partitions_values: %s", partitions_values)
    return paths, partitions_values
def list_sampling(lst: List[Any], sampling: float) -> List[Any]:
    """Random List sampling."""
    if sampling > 1.0 or sampling <= 0.0:
        raise exceptions.InvalidArgumentValue(f"Argument <sampling> must be [0.0 < value <= 1.0]. {sampling} received.")
    _len: int = len(lst)
    if _len == 0:
        return []
    num_samples: int = int(round(_len * sampling))
    num_samples = _len if num_samples > _len else num_samples
    num_samples = 1 if num_samples < 1 else num_samples
    _logger.debug("_len: %s", _len)
    _logger.debug("sampling: %s", sampling)
    _logger.debug("num_samples: %s", num_samples)
    return random.sample(population=lst, k=num_samples)
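# Usage sketch for list_sampling; which elements are returned is random, so only
# the sample length is asserted here:
_sample = list_sampling(lst=list(range(100)), sampling=0.1)
assert len(_sample) == 10
_all_items = list_sampling(lst=["a", "b", "c"], sampling=1.0)
assert sorted(_all_items) == ["a", "b", "c"]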
def extract_partitions_from_path(path_root: str, path: str) -> Dict[str, Any]:
    """Extract partitions values and names from Amazon S3 path."""
    path_root = path_root if path_root.endswith("/") else f"{path_root}/"
    if path_root not in path:
        raise exceptions.InvalidArgumentValue(
            f"Object {path} is not under the root path ({path_root})."
        )  # pragma: no cover
    path_wo_filename: str = path.rpartition("/")[0] + "/"
    path_wo_prefix: str = path_wo_filename.replace(f"{path_root}/", "")
    dirs: List[str] = [x for x in path_wo_prefix.split("/") if (x != "") and ("=" in x)]
    if not dirs:
        return {}  # pragma: no cover
    values_tups: List[Tuple[str, str]] = [tuple(x.split("=")[:2]) for x in dirs]  # type: ignore
    values_dics: Dict[str, str] = dict(values_tups)
    return values_dics
def _validate_items(
    items: Union[List[Dict[str, Any]], List[Mapping[str, Any]]], dynamodb_table: boto3.resource
) -> None:
    """Validate if all items have the required keys for the Amazon DynamoDB table.

    Parameters
    ----------
    items : Union[List[Dict[str, Any]], List[Mapping[str, Any]]]
        List which contains the items that will be validated.
    dynamodb_table : boto3.resources.dynamodb.Table
        Amazon DynamoDB Table object.

    Returns
    -------
    None
        None.

    """
    table_keys = [schema["AttributeName"] for schema in dynamodb_table.key_schema]
    if not all(key in item for item in items for key in table_keys):
        raise exceptions.InvalidArgumentValue("All items need to contain the required keys for the table.")
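# Hedged illustration: a stand-in object that only mimics the key_schema attribute of
# a boto3 DynamoDB Table resource, used to show what _validate_items checks. The key
# names ("pk", "sk") are made up for this example:
class _FakeDynamoDBTable:
    key_schema = [
        {"AttributeName": "pk", "KeyType": "HASH"},
        {"AttributeName": "sk", "KeyType": "RANGE"},
    ]

# Both table keys present -> returns silently:
_validate_items(items=[{"pk": "a", "sk": 1, "value": 10}], dynamodb_table=_FakeDynamoDBTable())
# An item missing "sk" would raise exceptions.InvalidArgumentValue.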
def _create_csv_table( database: str, table: str, path: str, columns_types: Dict[str, str], partitions_types: Optional[Dict[str, str]], bucketing_info: Optional[Tuple[List[str], int]], description: Optional[str], compression: Optional[str], parameters: Optional[Dict[str, str]], columns_comments: Optional[Dict[str, str]], mode: str, catalog_versioning: bool, sep: str, skip_header_line_count: Optional[int], boto3_session: Optional[boto3.Session], projection_enabled: bool, projection_types: Optional[Dict[str, str]], projection_ranges: Optional[Dict[str, str]], projection_values: Optional[Dict[str, str]], projection_intervals: Optional[Dict[str, str]], projection_digits: Optional[Dict[str, str]], catalog_table_input: Optional[Dict[str, Any]], catalog_id: Optional[str], ) -> None: table = sanitize_table_name(table=table) partitions_types = {} if partitions_types is None else partitions_types _logger.debug("catalog_table_input: %s", catalog_table_input) table_input: Dict[str, Any] if (catalog_table_input is not None) and (mode in ("append", "overwrite_partitions")): table_input = catalog_table_input catalog_cols: Dict[str, str] = { x["Name"]: x["Type"] for x in table_input["StorageDescriptor"]["Columns"] } for c, t in columns_types.items(): if c not in catalog_cols: _logger.debug("New column %s with type %s.", c, t) raise exceptions.InvalidArgumentValue( f"Schema change detected - New column {c}. Schema evolution is not supported for CSV tables." ) else: table_input = _csv_table_definition( table=table, path=path, columns_types=columns_types, partitions_types=partitions_types, bucketing_info=bucketing_info, compression=compression, sep=sep, skip_header_line_count=skip_header_line_count, ) table_exist: bool = catalog_table_input is not None _logger.debug("table_exist: %s", table_exist) _create_table( database=database, table=table, description=description, parameters=parameters, columns_comments=columns_comments, mode=mode, catalog_versioning=catalog_versioning, boto3_session=boto3_session, table_input=table_input, table_exist=table_exist, partitions_types=partitions_types, projection_enabled=projection_enabled, projection_types=projection_types, projection_ranges=projection_ranges, projection_values=projection_values, projection_intervals=projection_intervals, projection_digits=projection_digits, catalog_id=catalog_id, )
def to_parquet( # pylint: disable=too-many-arguments,too-many-locals,too-many-branches,too-many-statements df: pd.DataFrame, path: Optional[str] = None, index: bool = False, compression: Optional[str] = "snappy", pyarrow_additional_kwargs: Optional[Dict[str, Any]] = None, max_rows_by_file: Optional[int] = None, use_threads: Union[bool, int] = True, boto3_session: Optional[boto3.Session] = None, s3_additional_kwargs: Optional[Dict[str, Any]] = None, sanitize_columns: bool = False, dataset: bool = False, filename_prefix: Optional[str] = None, partition_cols: Optional[List[str]] = None, bucketing_info: Optional[Tuple[List[str], int]] = None, concurrent_partitioning: bool = False, mode: Optional[str] = None, catalog_versioning: bool = False, schema_evolution: bool = True, database: Optional[str] = None, table: Optional[str] = None, table_type: Optional[str] = None, transaction_id: Optional[str] = None, dtype: Optional[Dict[str, str]] = None, description: Optional[str] = None, parameters: Optional[Dict[str, str]] = None, columns_comments: Optional[Dict[str, str]] = None, regular_partitions: bool = True, projection_enabled: bool = False, projection_types: Optional[Dict[str, str]] = None, projection_ranges: Optional[Dict[str, str]] = None, projection_values: Optional[Dict[str, str]] = None, projection_intervals: Optional[Dict[str, str]] = None, projection_digits: Optional[Dict[str, str]] = None, catalog_id: Optional[str] = None, ) -> Dict[str, Union[List[str], Dict[str, List[str]]]]: """Write Parquet file or dataset on Amazon S3. The concept of Dataset goes beyond the simple idea of ordinary files and enable more complex features like partitioning and catalog integration (Amazon Athena/AWS Glue Catalog). Note ---- This operation may mutate the original pandas dataframe in-place. To avoid this behaviour please pass in a deep copy instead (i.e. `df.copy()`) Note ---- If `database` and `table` arguments are passed, the table name and all column names will be automatically sanitized using `wr.catalog.sanitize_table_name` and `wr.catalog.sanitize_column_name`. Please, pass `sanitize_columns=True` to enforce this behaviour always. Note ---- On `append` mode, the `parameters` will be upsert on an existing table. Note ---- In case of `use_threads=True` the number of threads that will be spawned will be gotten from os.cpu_count(). Parameters ---------- df: pandas.DataFrame Pandas DataFrame https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html path : str, optional S3 path (for file e.g. ``s3://bucket/prefix/filename.parquet``) (for dataset e.g. ``s3://bucket/prefix``). Required if dataset=False or when dataset=True and creating a new dataset index : bool True to store the DataFrame index in file, otherwise False to ignore it. compression: str, optional Compression style (``None``, ``snappy``, ``gzip``). pyarrow_additional_kwargs : Optional[Dict[str, Any]] Additional parameters forwarded to pyarrow. e.g. pyarrow_additional_kwargs={'coerce_timestamps': 'ns', 'use_deprecated_int96_timestamps': False, 'allow_truncated_timestamps'=False} max_rows_by_file : int Max number of rows in each file. Default is None i.e. dont split the files. (e.g. 33554432, 268435456) use_threads : bool, int True to enable concurrent requests, False to disable multiple threads. If enabled os.cpu_count() will be used as the max number of threads. If integer is provided, specified number is used. boto3_session : boto3.Session(), optional Boto3 Session. 
The default boto3 session will be used if boto3_session receive None. s3_additional_kwargs : Optional[Dict[str, Any]] Forwarded to botocore requests. e.g. s3_additional_kwargs={'ServerSideEncryption': 'aws:kms', 'SSEKMSKeyId': 'YOUR_KMS_KEY_ARN'} sanitize_columns : bool True to sanitize columns names (using `wr.catalog.sanitize_table_name` and `wr.catalog.sanitize_column_name`) or False to keep it as is. True value behaviour is enforced if `database` and `table` arguments are passed. dataset : bool If True store a parquet dataset instead of a ordinary file(s) If True, enable all follow arguments: partition_cols, mode, database, table, description, parameters, columns_comments, concurrent_partitioning, catalog_versioning, projection_enabled, projection_types, projection_ranges, projection_values, projection_intervals, projection_digits, catalog_id, schema_evolution. filename_prefix: str, optional If dataset=True, add a filename prefix to the output files. partition_cols: List[str], optional List of column names that will be used to create partitions. Only takes effect if dataset=True. bucketing_info: Tuple[List[str], int], optional Tuple consisting of the column names used for bucketing as the first element and the number of buckets as the second element. Only `str`, `int` and `bool` are supported as column data types for bucketing. concurrent_partitioning: bool If True will increase the parallelism level during the partitions writing. It will decrease the writing time and increase the memory usage. https://aws-data-wrangler.readthedocs.io/en/2.13.0/tutorials/022%20-%20Writing%20Partitions%20Concurrently.html mode: str, optional ``append`` (Default), ``overwrite``, ``overwrite_partitions``. Only takes effect if dataset=True. For details check the related tutorial: https://aws-data-wrangler.readthedocs.io/en/2.13.0/stubs/awswrangler.s3.to_parquet.html#awswrangler.s3.to_parquet catalog_versioning : bool If True and `mode="overwrite"`, creates an archived version of the table catalog before updating it. schema_evolution : bool If True allows schema evolution (new or missing columns), otherwise a exception will be raised. True by default. (Only considered if dataset=True and mode in ("append", "overwrite_partitions")) Related tutorial: https://aws-data-wrangler.readthedocs.io/en/2.13.0/tutorials/014%20-%20Schema%20Evolution.html database : str, optional Glue/Athena catalog: Database name. table : str, optional Glue/Athena catalog: Table name. table_type: str, optional The type of the Glue Table. Set to EXTERNAL_TABLE if None. transaction_id: str, optional The ID of the transaction when writing to a Governed Table. dtype : Dict[str, str], optional Dictionary of columns names and Athena/Glue types to be casted. Useful when you have columns with undetermined or mixed data types. (e.g. {'col name': 'bigint', 'col2 name': 'int'}) description : str, optional Glue/Athena catalog: Table description parameters : Dict[str, str], optional Glue/Athena catalog: Key/value pairs to tag the table. columns_comments : Dict[str, str], optional Glue/Athena catalog: Columns names and the related comments (e.g. {'col0': 'Column 0.', 'col1': 'Column 1.', 'col2': 'Partition.'}). regular_partitions : bool Create regular partitions (Non projected partitions) on Glue Catalog. Disable when you will work only with Partition Projection. Keep enabled even when working with projections is useful to keep Redshift Spectrum working with the regular partitions. 
projection_enabled : bool Enable Partition Projection on Athena (https://docs.aws.amazon.com/athena/latest/ug/partition-projection.html) projection_types : Optional[Dict[str, str]] Dictionary of partitions names and Athena projections types. Valid types: "enum", "integer", "date", "injected" https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html (e.g. {'col_name': 'enum', 'col2_name': 'integer'}) projection_ranges: Optional[Dict[str, str]] Dictionary of partitions names and Athena projections ranges. https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html (e.g. {'col_name': '0,10', 'col2_name': '-1,8675309'}) projection_values: Optional[Dict[str, str]] Dictionary of partitions names and Athena projections values. https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html (e.g. {'col_name': 'A,B,Unknown', 'col2_name': 'foo,boo,bar'}) projection_intervals: Optional[Dict[str, str]] Dictionary of partitions names and Athena projections intervals. https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html (e.g. {'col_name': '1', 'col2_name': '5'}) projection_digits: Optional[Dict[str, str]] Dictionary of partitions names and Athena projections digits. https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html (e.g. {'col_name': '1', 'col2_name': '2'}) catalog_id : str, optional The ID of the Data Catalog from which to retrieve Databases. If none is provided, the AWS account ID is used by default. Returns ------- Dict[str, Union[List[str], Dict[str, List[str]]]] Dictionary with: 'paths': List of all stored files paths on S3. 'partitions_values': Dictionary of partitions added with keys as S3 path locations and values as a list of partitions values as str. Examples -------- Writing single file >>> import awswrangler as wr >>> import pandas as pd >>> wr.s3.to_parquet( ... df=pd.DataFrame({'col': [1, 2, 3]}), ... path='s3://bucket/prefix/my_file.parquet', ... ) { 'paths': ['s3://bucket/prefix/my_file.parquet'], 'partitions_values': {} } Writing single file encrypted with a KMS key >>> import awswrangler as wr >>> import pandas as pd >>> wr.s3.to_parquet( ... df=pd.DataFrame({'col': [1, 2, 3]}), ... path='s3://bucket/prefix/my_file.parquet', ... s3_additional_kwargs={ ... 'ServerSideEncryption': 'aws:kms', ... 'SSEKMSKeyId': 'YOUR_KMS_KEY_ARN' ... } ... ) { 'paths': ['s3://bucket/prefix/my_file.parquet'], 'partitions_values': {} } Writing partitioned dataset >>> import awswrangler as wr >>> import pandas as pd >>> wr.s3.to_parquet( ... df=pd.DataFrame({ ... 'col': [1, 2, 3], ... 'col2': ['A', 'A', 'B'] ... }), ... path='s3://bucket/prefix', ... dataset=True, ... partition_cols=['col2'] ... ) { 'paths': ['s3://.../col2=A/x.parquet', 's3://.../col2=B/y.parquet'], 'partitions_values: { 's3://.../col2=A/': ['A'], 's3://.../col2=B/': ['B'] } } Writing bucketed dataset >>> import awswrangler as wr >>> import pandas as pd >>> wr.s3.to_parquet( ... df=pd.DataFrame({ ... 'col': [1, 2, 3], ... 'col2': ['A', 'A', 'B'] ... }), ... path='s3://bucket/prefix', ... dataset=True, ... bucketing_info=(["col2"], 2) ... ) { 'paths': ['s3://.../x_bucket-00000.csv', 's3://.../col2=B/x_bucket-00001.csv'], 'partitions_values: {} } Writing dataset to S3 with metadata on Athena/Glue Catalog. >>> import awswrangler as wr >>> import pandas as pd >>> wr.s3.to_parquet( ... df=pd.DataFrame({ ... 'col': [1, 2, 3], ... 'col2': ['A', 'A', 'B'] ... }), ... path='s3://bucket/prefix', ... 
dataset=True, ... partition_cols=['col2'], ... database='default', # Athena/Glue database ... table='my_table' # Athena/Glue table ... ) { 'paths': ['s3://.../col2=A/x.parquet', 's3://.../col2=B/y.parquet'], 'partitions_values: { 's3://.../col2=A/': ['A'], 's3://.../col2=B/': ['B'] } } Writing dataset to Glue governed table >>> import awswrangler as wr >>> import pandas as pd >>> wr.s3.to_parquet( ... df=pd.DataFrame({ ... 'col': [1, 2, 3], ... 'col2': ['A', 'A', 'B'], ... 'col3': [None, None, None] ... }), ... dataset=True, ... mode='append', ... database='default', # Athena/Glue database ... table='my_table', # Athena/Glue table ... table_type='GOVERNED', ... transaction_id="xxx", ... ) { 'paths': ['s3://.../x.parquet'], 'partitions_values: {} } Writing dataset casting empty column data type >>> import awswrangler as wr >>> import pandas as pd >>> wr.s3.to_parquet( ... df=pd.DataFrame({ ... 'col': [1, 2, 3], ... 'col2': ['A', 'A', 'B'], ... 'col3': [None, None, None] ... }), ... path='s3://bucket/prefix', ... dataset=True, ... database='default', # Athena/Glue database ... table='my_table' # Athena/Glue table ... dtype={'col3': 'date'} ... ) { 'paths': ['s3://.../x.parquet'], 'partitions_values: {} } """ _validate_args( df=df, table=table, database=database, dataset=dataset, path=path, partition_cols=partition_cols, bucketing_info=bucketing_info, mode=mode, description=description, parameters=parameters, columns_comments=columns_comments, ) # Evaluating compression if _COMPRESSION_2_EXT.get(compression, None) is None: raise exceptions.InvalidCompression(f"{compression} is invalid, please use None, 'snappy' or 'gzip'.") compression_ext: str = _COMPRESSION_2_EXT[compression] # Initializing defaults partition_cols = partition_cols if partition_cols else [] dtype = dtype if dtype else {} partitions_values: Dict[str, List[str]] = {} mode = "append" if mode is None else mode commit_trans: bool = False if transaction_id: table_type = "GOVERNED" filename_prefix = filename_prefix + uuid.uuid4().hex if filename_prefix else uuid.uuid4().hex cpus: int = _utils.ensure_cpu_count(use_threads=use_threads) session: boto3.Session = _utils.ensure_session(session=boto3_session) # Sanitize table to respect Athena's standards if (sanitize_columns is True) or (database is not None and table is not None): df, dtype, partition_cols = _sanitize(df=df, dtype=dtype, partition_cols=partition_cols) # Evaluating dtype catalog_table_input: Optional[Dict[str, Any]] = None if database is not None and table is not None: catalog_table_input = catalog._get_table_input( # pylint: disable=protected-access database=database, table=table, boto3_session=session, transaction_id=transaction_id, catalog_id=catalog_id ) catalog_path: Optional[str] = None if catalog_table_input: table_type = catalog_table_input["TableType"] catalog_path = catalog_table_input["StorageDescriptor"]["Location"] if path is None: if catalog_path: path = catalog_path else: raise exceptions.InvalidArgumentValue( "Glue table does not exist in the catalog. Please pass the `path` argument to create it." 
) elif path and catalog_path: if path.rstrip("/") != catalog_path.rstrip("/"): raise exceptions.InvalidArgumentValue( f"The specified path: {path}, does not match the existing Glue catalog table path: {catalog_path}" ) if (table_type == "GOVERNED") and (not transaction_id): _logger.debug("`transaction_id` not specified for GOVERNED table, starting transaction") transaction_id = lakeformation.start_transaction(read_only=False, boto3_session=boto3_session) commit_trans = True df = _apply_dtype(df=df, dtype=dtype, catalog_table_input=catalog_table_input, mode=mode) schema: pa.Schema = _data_types.pyarrow_schema_from_pandas( df=df, index=index, ignore_cols=partition_cols, dtype=dtype ) _logger.debug("schema: \n%s", schema) if dataset is False: paths = _to_parquet( df=df, path=path, schema=schema, index=index, cpus=cpus, compression=compression, compression_ext=compression_ext, pyarrow_additional_kwargs=pyarrow_additional_kwargs, boto3_session=session, s3_additional_kwargs=s3_additional_kwargs, dtype=dtype, max_rows_by_file=max_rows_by_file, use_threads=use_threads, ) else: columns_types: Dict[str, str] = {} partitions_types: Dict[str, str] = {} if (database is not None) and (table is not None): columns_types, partitions_types = _data_types.athena_types_from_pandas_partitioned( df=df, index=index, partition_cols=partition_cols, dtype=dtype ) if schema_evolution is False: _utils.check_schema_changes(columns_types=columns_types, table_input=catalog_table_input, mode=mode) if (catalog_table_input is None) and (table_type == "GOVERNED"): catalog._create_parquet_table( # pylint: disable=protected-access database=database, table=table, path=path, # type: ignore columns_types=columns_types, table_type=table_type, partitions_types=partitions_types, bucketing_info=bucketing_info, compression=compression, description=description, parameters=parameters, columns_comments=columns_comments, boto3_session=session, mode=mode, transaction_id=transaction_id, catalog_versioning=catalog_versioning, projection_enabled=projection_enabled, projection_types=projection_types, projection_ranges=projection_ranges, projection_values=projection_values, projection_intervals=projection_intervals, projection_digits=projection_digits, projection_storage_location_template=None, catalog_id=catalog_id, catalog_table_input=catalog_table_input, ) catalog_table_input = catalog._get_table_input( # pylint: disable=protected-access database=database, table=table, boto3_session=session, transaction_id=transaction_id, catalog_id=catalog_id, ) paths, partitions_values = _to_dataset( func=_to_parquet, concurrent_partitioning=concurrent_partitioning, df=df, path_root=path, # type: ignore filename_prefix=filename_prefix, index=index, compression=compression, compression_ext=compression_ext, catalog_id=catalog_id, database=database, table=table, table_type=table_type, transaction_id=transaction_id, pyarrow_additional_kwargs=pyarrow_additional_kwargs, cpus=cpus, use_threads=use_threads, partition_cols=partition_cols, partitions_types=partitions_types, bucketing_info=bucketing_info, dtype=dtype, mode=mode, boto3_session=session, s3_additional_kwargs=s3_additional_kwargs, schema=schema, max_rows_by_file=max_rows_by_file, ) if (database is not None) and (table is not None): try: catalog._create_parquet_table( # pylint: disable=protected-access database=database, table=table, path=path, # type: ignore columns_types=columns_types, table_type=table_type, partitions_types=partitions_types, bucketing_info=bucketing_info, compression=compression, 
description=description, parameters=parameters, columns_comments=columns_comments, boto3_session=session, mode=mode, transaction_id=transaction_id, catalog_versioning=catalog_versioning, projection_enabled=projection_enabled, projection_types=projection_types, projection_ranges=projection_ranges, projection_values=projection_values, projection_intervals=projection_intervals, projection_digits=projection_digits, projection_storage_location_template=None, catalog_id=catalog_id, catalog_table_input=catalog_table_input, ) if partitions_values and (regular_partitions is True) and (table_type != "GOVERNED"): _logger.debug("partitions_values:\n%s", partitions_values) catalog.add_parquet_partitions( database=database, table=table, partitions_values=partitions_values, bucketing_info=bucketing_info, compression=compression, boto3_session=session, catalog_id=catalog_id, columns_types=columns_types, ) if commit_trans: lakeformation.commit_transaction( transaction_id=transaction_id, boto3_session=boto3_session # type: ignore ) except Exception: _logger.debug("Catalog write failed, cleaning up S3 (paths: %s).", paths) delete_objects( path=paths, use_threads=use_threads, boto3_session=session, s3_additional_kwargs=s3_additional_kwargs, ) raise return {"paths": paths, "partitions_values": partitions_values}
def to_parquet( # pylint: disable=too-many-arguments,too-many-locals df: pd.DataFrame, path: str, index: bool = False, compression: Optional[str] = "snappy", use_threads: bool = True, boto3_session: Optional[boto3.Session] = None, s3_additional_kwargs: Optional[Dict[str, str]] = None, sanitize_columns: bool = False, dataset: bool = False, partition_cols: Optional[List[str]] = None, mode: Optional[str] = None, catalog_versioning: bool = False, database: Optional[str] = None, table: Optional[str] = None, dtype: Optional[Dict[str, str]] = None, description: Optional[str] = None, parameters: Optional[Dict[str, str]] = None, columns_comments: Optional[Dict[str, str]] = None, regular_partitions: bool = True, projection_enabled: bool = False, projection_types: Optional[Dict[str, str]] = None, projection_ranges: Optional[Dict[str, str]] = None, projection_values: Optional[Dict[str, str]] = None, projection_intervals: Optional[Dict[str, str]] = None, projection_digits: Optional[Dict[str, str]] = None, ) -> Dict[str, Union[List[str], Dict[str, List[str]]]]: """Write Parquet file or dataset on Amazon S3. The concept of Dataset goes beyond the simple idea of files and enable more complex features like partitioning, casting and catalog integration (Amazon Athena/AWS Glue Catalog). Note ---- If `dataset=True` The table name and all column names will be automatically sanitized using `wr.catalog.sanitize_table_name` and `wr.catalog.sanitize_column_name`. Please, pass `sanitize_columns=True` to force the same behaviour for `dataset=False`. Note ---- On `append` mode, the `parameters` will be upsert on an existing table. Note ---- In case of `use_threads=True` the number of threads that will be spawned will be get from os.cpu_count(). Parameters ---------- df: pandas.DataFrame Pandas DataFrame https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html path : str S3 path (for file e.g. ``s3://bucket/prefix/filename.parquet``) (for dataset e.g. ``s3://bucket/prefix``). index : bool True to store the DataFrame index in file, otherwise False to ignore it. compression: str, optional Compression style (``None``, ``snappy``, ``gzip``). use_threads : bool True to enable concurrent requests, False to disable multiple threads. If enabled os.cpu_count() will be used as the max number of threads. boto3_session : boto3.Session(), optional Boto3 Session. The default boto3 session will be used if boto3_session receive None. s3_additional_kwargs: Forward to s3fs, useful for server side encryption https://s3fs.readthedocs.io/en/latest/#serverside-encryption sanitize_columns : bool True to sanitize columns names or False to keep it as is. True value is forced if `dataset=True`. dataset : bool If True store a parquet dataset instead of a single file. If True, enable all follow arguments: partition_cols, mode, database, table, description, parameters, columns_comments, . partition_cols: List[str], optional List of column names that will be used to create partitions. Only takes effect if dataset=True. mode: str, optional ``append`` (Default), ``overwrite``, ``overwrite_partitions``. Only takes effect if dataset=True. catalog_versioning : bool If True and `mode="overwrite"`, creates an archived version of the table catalog before updating it. database : str, optional Glue/Athena catalog: Database name. table : str, optional Glue/Athena catalog: Table name. dtype : Dict[str, str], optional Dictionary of columns names and Athena/Glue types to be casted. 
Useful when you have columns with undetermined or mixed data types. (e.g. {'col name': 'bigint', 'col2 name': 'int'}) description : str, optional Glue/Athena catalog: Table description parameters : Dict[str, str], optional Glue/Athena catalog: Key/value pairs to tag the table. columns_comments : Dict[str, str], optional Glue/Athena catalog: Columns names and the related comments (e.g. {'col0': 'Column 0.', 'col1': 'Column 1.', 'col2': 'Partition.'}). regular_partitions : bool Create regular partitions (Non projected partitions) on Glue Catalog. Disable when you will work only with Partition Projection. Keep enabled even when working with projections is useful to keep Redshift Spectrum working with the regular partitions. projection_enabled : bool Enable Partition Projection on Athena (https://docs.aws.amazon.com/athena/latest/ug/partition-projection.html) projection_types : Optional[Dict[str, str]] Dictionary of partitions names and Athena projections types. Valid types: "enum", "integer", "date", "injected" https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html (e.g. {'col_name': 'enum', 'col2_name': 'integer'}) projection_ranges: Optional[Dict[str, str]] Dictionary of partitions names and Athena projections ranges. https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html (e.g. {'col_name': '0,10', 'col2_name': '-1,8675309'}) projection_values: Optional[Dict[str, str]] Dictionary of partitions names and Athena projections values. https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html (e.g. {'col_name': 'A,B,Unknown', 'col2_name': 'foo,boo,bar'}) projection_intervals: Optional[Dict[str, str]] Dictionary of partitions names and Athena projections intervals. https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html (e.g. {'col_name': '1', 'col2_name': '5'}) projection_digits: Optional[Dict[str, str]] Dictionary of partitions names and Athena projections digits. https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html (e.g. {'col_name': '1', 'col2_name': '2'}) Returns ------- Dict[str, Union[List[str], Dict[str, List[str]]]] Dictionary with: 'paths': List of all stored files paths on S3. 'partitions_values': Dictionary of partitions added with keys as S3 path locations and values as a list of partitions values as str. Examples -------- Writing single file >>> import awswrangler as wr >>> import pandas as pd >>> wr.s3.to_parquet( ... df=pd.DataFrame({'col': [1, 2, 3]}), ... path='s3://bucket/prefix/my_file.parquet', ... ) { 'paths': ['s3://bucket/prefix/my_file.parquet'], 'partitions_values': {} } Writing single file encrypted with a KMS key >>> import awswrangler as wr >>> import pandas as pd >>> wr.s3.to_parquet( ... df=pd.DataFrame({'col': [1, 2, 3]}), ... path='s3://bucket/prefix/my_file.parquet', ... s3_additional_kwargs={ ... 'ServerSideEncryption': 'aws:kms', ... 'SSEKMSKeyId': 'YOUR_KMY_KEY_ARN' ... } ... ) { 'paths': ['s3://bucket/prefix/my_file.parquet'], 'partitions_values': {} } Writing partitioned dataset >>> import awswrangler as wr >>> import pandas as pd >>> wr.s3.to_parquet( ... df=pd.DataFrame({ ... 'col': [1, 2, 3], ... 'col2': ['A', 'A', 'B'] ... }), ... path='s3://bucket/prefix', ... dataset=True, ... partition_cols=['col2'] ... 
) { 'paths': ['s3://.../col2=A/x.parquet', 's3://.../col2=B/y.parquet'], 'partitions_values: { 's3://.../col2=A/': ['A'], 's3://.../col2=B/': ['B'] } } Writing dataset to S3 with metadata on Athena/Glue Catalog. >>> import awswrangler as wr >>> import pandas as pd >>> wr.s3.to_parquet( ... df=pd.DataFrame({ ... 'col': [1, 2, 3], ... 'col2': ['A', 'A', 'B'] ... }), ... path='s3://bucket/prefix', ... dataset=True, ... partition_cols=['col2'], ... database='default', # Athena/Glue database ... table='my_table' # Athena/Glue table ... ) { 'paths': ['s3://.../col2=A/x.parquet', 's3://.../col2=B/y.parquet'], 'partitions_values: { 's3://.../col2=A/': ['A'], 's3://.../col2=B/': ['B'] } } Writing dataset casting empty column data type >>> import awswrangler as wr >>> import pandas as pd >>> wr.s3.to_parquet( ... df=pd.DataFrame({ ... 'col': [1, 2, 3], ... 'col2': ['A', 'A', 'B'], ... 'col3': [None, None, None] ... }), ... path='s3://bucket/prefix', ... dataset=True, ... database='default', # Athena/Glue database ... table='my_table' # Athena/Glue table ... dtype={'col3': 'date'} ... ) { 'paths': ['s3://.../x.parquet'], 'partitions_values: {} } """ if (database is None) ^ (table is None): raise exceptions.InvalidArgumentCombination( "Please pass database and table arguments to be able to store the metadata into the Athena/Glue Catalog." ) if df.empty is True: raise exceptions.EmptyDataFrame() partition_cols = partition_cols if partition_cols else [] dtype = dtype if dtype else {} partitions_values: Dict[str, List[str]] = {} # Sanitize table to respect Athena's standards if (sanitize_columns is True) or (dataset is True): df = catalog.sanitize_dataframe_columns_names(df=df) partition_cols = [catalog.sanitize_column_name(p) for p in partition_cols] dtype = {catalog.sanitize_column_name(k): v.lower() for k, v in dtype.items()} catalog.drop_duplicated_columns(df=df) session: boto3.Session = _utils.ensure_session(session=boto3_session) cpus: int = _utils.ensure_cpu_count(use_threads=use_threads) fs: s3fs.S3FileSystem = _utils.get_fs(session=session, s3_additional_kwargs=s3_additional_kwargs) compression_ext: Optional[str] = _COMPRESSION_2_EXT.get(compression, None) if compression_ext is None: raise exceptions.InvalidCompression(f"{compression} is invalid, please use None, snappy or gzip.") if dataset is False: if path.endswith("/"): # pragma: no cover raise exceptions.InvalidArgumentValue( "If <dataset=False>, the argument <path> should be a object path, not a directory." ) if partition_cols: raise exceptions.InvalidArgumentCombination("Please, pass dataset=True to be able to use partition_cols.") if mode is not None: raise exceptions.InvalidArgumentCombination("Please pass dataset=True to be able to use mode.") if any(arg is not None for arg in (database, table, description, parameters)): raise exceptions.InvalidArgumentCombination( "Please pass dataset=True to be able to use any one of these " "arguments: database, table, description, parameters, " "columns_comments." 
) df = _data_types.cast_pandas_with_athena_types(df=df, dtype=dtype) schema: pa.Schema = _data_types.pyarrow_schema_from_pandas( df=df, index=index, ignore_cols=partition_cols, dtype=dtype ) _logger.debug("schema: \n%s", schema) paths = [ _to_parquet_file( df=df, path=path, schema=schema, index=index, compression=compression, cpus=cpus, fs=fs, dtype=dtype ) ] else: mode = "append" if mode is None else mode if ( (mode in ("append", "overwrite_partitions")) and (database is not None) and (table is not None) ): # Fetching Catalog Types catalog_types: Optional[Dict[str, str]] = catalog.get_table_types( database=database, table=table, boto3_session=session ) if catalog_types is not None: for k, v in catalog_types.items(): dtype[k] = v paths, partitions_values = _to_parquet_dataset( df=df, path=path, index=index, compression=compression, compression_ext=compression_ext, cpus=cpus, fs=fs, use_threads=use_threads, partition_cols=partition_cols, dtype=dtype, mode=mode, boto3_session=session, ) if (database is not None) and (table is not None): columns_types, partitions_types = _data_types.athena_types_from_pandas_partitioned( df=df, index=index, partition_cols=partition_cols, dtype=dtype ) catalog.create_parquet_table( database=database, table=table, path=path, columns_types=columns_types, partitions_types=partitions_types, compression=compression, description=description, parameters=parameters, columns_comments=columns_comments, boto3_session=session, mode=mode, catalog_versioning=catalog_versioning, projection_enabled=projection_enabled, projection_types=projection_types, projection_ranges=projection_ranges, projection_values=projection_values, projection_intervals=projection_intervals, projection_digits=projection_digits, ) if partitions_values and (regular_partitions is True): _logger.debug("partitions_values:\n%s", partitions_values) catalog.add_parquet_partitions( database=database, table=table, partitions_values=partitions_values, compression=compression, boto3_session=session, ) return {"paths": paths, "partitions_values": partitions_values}
def _to_csv_dataset( df: pd.DataFrame, path: str, index: bool, sep: str, fs: s3fs.S3FileSystem, use_threads: bool, mode: str, dtype: Dict[str, str], partition_cols: Optional[List[str]] = None, boto3_session: Optional[boto3.Session] = None, ) -> Tuple[List[str], Dict[str, List[str]]]: paths: List[str] = [] partitions_values: Dict[str, List[str]] = {} path = path if path[-1] == "/" else f"{path}/" if mode not in ["append", "overwrite", "overwrite_partitions"]: raise exceptions.InvalidArgumentValue( f"{mode} is an invalid mode, please use append, overwrite or overwrite_partitions." ) if (mode == "overwrite") or ((mode == "overwrite_partitions") and (not partition_cols)): delete_objects(path=path, use_threads=use_threads, boto3_session=boto3_session) df = _data_types.cast_pandas_with_athena_types(df=df, dtype=dtype) _logger.debug("dtypes: %s", df.dtypes) if not partition_cols: file_path: str = f"{path}{uuid.uuid4().hex}.csv" _to_text( file_format="csv", df=df, path=file_path, fs=fs, quoting=csv.QUOTE_NONE, escapechar="\\", header=False, date_format="%Y-%m-%d %H:%M:%S.%f", index=index, sep=sep, ) paths.append(file_path) else: for keys, subgroup in df.groupby(by=partition_cols, observed=True): subgroup = subgroup.drop(partition_cols, axis="columns") keys = (keys,) if not isinstance(keys, tuple) else keys subdir = "/".join([f"{name}={val}" for name, val in zip(partition_cols, keys)]) prefix: str = f"{path}{subdir}/" if mode == "overwrite_partitions": delete_objects(path=prefix, use_threads=use_threads, boto3_session=boto3_session) file_path = f"{prefix}{uuid.uuid4().hex}.csv" _to_text( file_format="csv", df=subgroup, path=file_path, fs=fs, quoting=csv.QUOTE_NONE, escapechar="\\", header=False, date_format="%Y-%m-%d %H:%M:%S.%f", index=index, sep=sep, ) paths.append(file_path) partitions_values[prefix] = [str(k) for k in keys] return paths, partitions_values
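# --- Hedged sketch (added for illustration; not part of awswrangler) ---------
# _to_csv_dataset() above derives one Hive-style prefix per groupby key tuple
# ("col2=A/year=2020/") under the dataset root and writes a uuid-named CSV file
# inside it. The standalone helper below reproduces just that path construction;
# its name and signature are hypothetical.
def _partition_prefix_sketch(path_root: str, partition_cols: List[str], keys: Any) -> str:
    """Build e.g. 's3://bucket/prefix/col2=A/year=2020/' for keys ('A', 2020)."""
    root: str = path_root if path_root.endswith("/") else f"{path_root}/"
    keys = (keys,) if not isinstance(keys, tuple) else keys  # a single partition column yields a scalar key
    subdir: str = "/".join(f"{name}={value}" for name, value in zip(partition_cols, keys))
    return f"{root}{subdir}/"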
def merge_datasets( source_path: str, target_path: str, mode: str = "append", use_threads: bool = True, boto3_session: Optional[boto3.Session] = None, s3_additional_kwargs: Optional[Dict[str, Any]] = None, ) -> List[str]: """Merge a source dataset into a target dataset. This function accepts Unix shell-style wildcards in the source_path argument. * (matches everything), ? (matches any single character), [seq] (matches any character in seq), [!seq] (matches any character not in seq). Note ---- If you are merging tables (S3 datasets + Glue Catalog metadata), remember that you will also need to update your partitions metadata in some cases. (e.g. wr.athena.repair_table(table='...', database='...')) Note ---- In case of `use_threads=True` the number of threads that will be spawned will be gotten from os.cpu_count(). Parameters ---------- source_path : str, S3 Path for the source directory. target_path : str, S3 Path for the target directory. mode: str, optional ``append`` (Default), ``overwrite``, ``overwrite_partitions``. use_threads : bool True to enable concurrent requests, False to disable multiple threads. If enabled os.cpu_count() will be used as the max number of threads. boto3_session : boto3.Session(), optional Boto3 Session. The default boto3 session will be used if boto3_session receive None. s3_additional_kwargs : Optional[Dict[str, Any]] Forward to botocore requests. Valid parameters: "ACL", "Metadata", "ServerSideEncryption", "StorageClass", "SSECustomerAlgorithm", "SSECustomerKey", "SSEKMSKeyId", "SSEKMSEncryptionContext", "Tagging". e.g. s3_additional_kwargs={'ServerSideEncryption': 'aws:kms', 'SSEKMSKeyId': 'YOUR_KMS_KEY_ARN'} Returns ------- List[str] List of new objects paths. Examples -------- Merging >>> import awswrangler as wr >>> wr.s3.merge_datasets( ... source_path="s3://bucket0/dir0/", ... target_path="s3://bucket1/dir1/", ... mode="append" ... ) ["s3://bucket1/dir1/key0", "s3://bucket1/dir1/key1"] Merging with a KMS key >>> import awswrangler as wr >>> wr.s3.merge_datasets( ... source_path="s3://bucket0/dir0/", ... target_path="s3://bucket1/dir1/", ... mode="append", ... s3_additional_kwargs={ ... 'ServerSideEncryption': 'aws:kms', ... 'SSEKMSKeyId': 'YOUR_KMS_KEY_ARN' ... } ... 
) ["s3://bucket1/dir1/key0", "s3://bucket1/dir1/key1"] """ source_path = source_path[:-1] if source_path[-1] == "/" else source_path target_path = target_path[:-1] if target_path[-1] == "/" else target_path session: boto3.Session = _utils.ensure_session(session=boto3_session) paths: List[str] = list_objects(path=f"{source_path}/", boto3_session=session) _logger.debug("len(paths): %s", len(paths)) if len(paths) < 1: return [] if mode == "overwrite": _logger.debug("Deleting to overwrite: %s/", target_path) delete_objects(path=f"{target_path}/", use_threads=use_threads, boto3_session=session) elif mode == "overwrite_partitions": paths_wo_prefix: List[str] = [ x.replace(f"{source_path}/", "") for x in paths ] paths_wo_filename: List[str] = [ f"{x.rpartition('/')[0]}/" for x in paths_wo_prefix ] partitions_paths: List[str] = list(set(paths_wo_filename)) target_partitions_paths = [ f"{target_path}/{x}" for x in partitions_paths ] for path in target_partitions_paths: _logger.debug("Deleting to overwrite_partitions: %s", path) delete_objects(path=path, use_threads=use_threads, boto3_session=session) elif mode != "append": raise exceptions.InvalidArgumentValue( f"{mode} is a invalid mode option.") new_objects: List[str] = copy_objects( paths=paths, source_path=source_path, target_path=target_path, use_threads=use_threads, boto3_session=session, s3_additional_kwargs=s3_additional_kwargs, ) _logger.debug("len(new_objects): %s", len(new_objects)) return new_objects
def _resolve_query_without_cache( # pylint: disable=too-many-branches,too-many-locals,too-many-return-statements,too-many-statements sql: str, database: str, ctas_approach: bool, categories: Optional[List[str]], chunksize: Optional[Union[int, bool]], s3_output: Optional[str], workgroup: Optional[str], encryption: Optional[str], kms_key: Optional[str], keep_files: bool, ctas_temp_table_name: Optional[str], use_threads: bool, session: Optional[boto3.Session], ) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]: """ Execute any query in Athena and returns results as Dataframe, back to `read_sql_query`. Usually called by `read_sql_query` when using cache is not possible. """ wg_config: Dict[str, Union[bool, Optional[str]]] = _get_workgroup_config(session=session, workgroup=workgroup) _s3_output: str = _get_s3_output(s3_output=s3_output, wg_config=wg_config, boto3_session=session) _s3_output = _s3_output[:-1] if _s3_output[-1] == "/" else _s3_output name: str = "" if ctas_approach is True: if ctas_temp_table_name is not None: name = catalog.sanitize_table_name(ctas_temp_table_name) else: name = f"temp_table_{pa.compat.guid()}" path: str = f"{_s3_output}/{name}" ext_location: str = "\n" if wg_config["enforced"] is True else f",\n external_location = '{path}'\n" sql = ( f'CREATE TABLE "{name}"\n' f"WITH(\n" f" format = 'Parquet',\n" f" parquet_compression = 'SNAPPY'" f"{ext_location}" f") AS\n" f"{sql}" ) _logger.debug("sql: %s", sql) query_id: str = _start_query_execution( sql=sql, wg_config=wg_config, database=database, s3_output=_s3_output, workgroup=workgroup, encryption=encryption, kms_key=kms_key, boto3_session=session, ) _logger.debug("query_id: %s", query_id) try: query_response: Dict[str, Any] = wait_query(query_execution_id=query_id, boto3_session=session) except exceptions.QueryFailed as ex: if ctas_approach is True: if "Column name not specified" in str(ex): raise exceptions.InvalidArgumentValue( "Please, define all columns names in your query. (E.g. 'SELECT MAX(col1) AS max_col1, ...')" ) if "Column type is unknown" in str(ex): raise exceptions.InvalidArgumentValue( "Please, define all columns types in your query. " "(E.g. 
'SELECT CAST(NULL AS INTEGER) AS MY_COL, ...')" ) raise ex # pragma: no cover if query_response["QueryExecution"]["Status"]["State"] in ["FAILED", "CANCELLED"]: # pragma: no cover reason: str = query_response["QueryExecution"]["Status"]["StateChangeReason"] message_error: str = f"Query error: {reason}" raise exceptions.AthenaQueryError(message_error) ret: Union[pd.DataFrame, Iterator[pd.DataFrame]] if ctas_approach is True: catalog.delete_table_if_exists(database=database, table=name, boto3_session=session) manifest_path: str = f"{_s3_output}/tables/{query_id}-manifest.csv" metadata_path: str = f"{_s3_output}/tables/{query_id}.metadata" _logger.debug("manifest_path: %s", manifest_path) _logger.debug("metadata_path: %s", metadata_path) s3.wait_objects_exist(paths=[manifest_path, metadata_path], use_threads=False, boto3_session=session) paths: List[str] = _extract_ctas_manifest_paths(path=manifest_path, boto3_session=session) chunked: Union[bool, int] = False if chunksize is None else chunksize _logger.debug("chunked: %s", chunked) if not paths: if chunked is False: return pd.DataFrame() return _utils.empty_generator() s3.wait_objects_exist(paths=paths, use_threads=False, boto3_session=session) ret = s3.read_parquet( path=paths, use_threads=use_threads, boto3_session=session, chunked=chunked, categories=categories ) paths_delete: List[str] = paths + [manifest_path, metadata_path] _logger.debug(type(ret)) if chunked is False: if keep_files is False: s3.delete_objects(path=paths_delete, use_threads=use_threads, boto3_session=session) return ret if keep_files is False: return _delete_after_iterate(dfs=ret, paths=paths_delete, use_threads=use_threads, boto3_session=session) return ret dtype, parse_timestamps, parse_dates, converters, binaries = _get_query_metadata( query_execution_id=query_id, categories=categories, boto3_session=session ) path = f"{_s3_output}/{query_id}.csv" s3.wait_objects_exist(paths=[path], use_threads=False, boto3_session=session) _logger.debug("Start CSV reading from %s", path) _chunksize: Optional[int] = chunksize if isinstance(chunksize, int) else None _logger.debug("_chunksize: %s", _chunksize) ret = s3.read_csv( path=[path], dtype=dtype, parse_dates=parse_timestamps, converters=converters, quoting=csv.QUOTE_ALL, keep_default_na=False, na_values=[""], chunksize=_chunksize, skip_blank_lines=False, use_threads=False, boto3_session=session, ) _logger.debug("Start type casting...") _logger.debug(type(ret)) if chunksize is None: df = _fix_csv_types(df=ret, parse_dates=parse_dates, binaries=binaries) if keep_files is False: s3.delete_objects(path=[path, f"{path}.metadata"], use_threads=use_threads, boto3_session=session) return df dfs = _fix_csv_types_generator(dfs=ret, parse_dates=parse_dates, binaries=binaries) if keep_files is False: return _delete_after_iterate( dfs=dfs, paths=[path, f"{path}.metadata"], use_threads=use_threads, boto3_session=session ) return dfs
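# --- Hedged sketch (added for illustration; not part of awswrangler) ---------
# When ctas_approach=True, _resolve_query_without_cache() above wraps the user's
# SELECT in a CREATE TABLE ... AS statement so Athena materializes the result as
# SNAPPY-compressed Parquet that can be read back with s3.read_parquet(). The
# builder below mirrors that string assembly; its name is hypothetical and
# `enforced` stands in for wg_config["enforced"] (the workgroup enforces its own
# output location, so no external_location is emitted).
def _build_ctas_sql_sketch(sql: str, table_name: str, s3_output: str, enforced: bool) -> str:
    ext_location: str = "\n" if enforced else f",\n    external_location = '{s3_output}/{table_name}'\n"
    return (
        f'CREATE TABLE "{table_name}"\n'
        f"WITH(\n"
        f"    format = 'Parquet',\n"
        f"    parquet_compression = 'SNAPPY'"
        f"{ext_location}"
        f") AS\n"
        f"{sql}"
    )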
def to_csv( # pylint: disable=too-many-arguments,too-many-locals,too-many-statements,too-many-branches df: pd.DataFrame, path: Optional[str] = None, sep: str = ",", index: bool = True, columns: Optional[List[str]] = None, use_threads: Union[bool, int] = True, boto3_session: Optional[boto3.Session] = None, s3_additional_kwargs: Optional[Dict[str, Any]] = None, sanitize_columns: bool = False, dataset: bool = False, filename_prefix: Optional[str] = None, partition_cols: Optional[List[str]] = None, bucketing_info: Optional[Tuple[List[str], int]] = None, concurrent_partitioning: bool = False, mode: Optional[str] = None, catalog_versioning: bool = False, schema_evolution: bool = False, database: Optional[str] = None, table: Optional[str] = None, table_type: Optional[str] = None, transaction_id: Optional[str] = None, dtype: Optional[Dict[str, str]] = None, description: Optional[str] = None, parameters: Optional[Dict[str, str]] = None, columns_comments: Optional[Dict[str, str]] = None, regular_partitions: bool = True, projection_enabled: bool = False, projection_types: Optional[Dict[str, str]] = None, projection_ranges: Optional[Dict[str, str]] = None, projection_values: Optional[Dict[str, str]] = None, projection_intervals: Optional[Dict[str, str]] = None, projection_digits: Optional[Dict[str, str]] = None, catalog_id: Optional[str] = None, **pandas_kwargs: Any, ) -> Dict[str, Union[List[str], Dict[str, List[str]]]]: """Write CSV file or dataset on Amazon S3. The concept of Dataset goes beyond the simple idea of ordinary files and enable more complex features like partitioning and catalog integration (Amazon Athena/AWS Glue Catalog). Note ---- If database` and `table` arguments are passed, the table name and all column names will be automatically sanitized using `wr.catalog.sanitize_table_name` and `wr.catalog.sanitize_column_name`. Please, pass `sanitize_columns=True` to enforce this behaviour always. Note ---- If `table` and `database` arguments are passed, `pandas_kwargs` will be ignored due restrictive quoting, date_format, escapechar and encoding required by Athena/Glue Catalog. Note ---- Compression: The minimum acceptable version to achive it is Pandas 1.2.0 that requires Python >= 3.7.1. Note ---- On `append` mode, the `parameters` will be upsert on an existing table. Note ---- In case of `use_threads=True` the number of threads that will be spawned will be gotten from os.cpu_count(). Parameters ---------- df: pandas.DataFrame Pandas DataFrame https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html path : str, optional Amazon S3 path (e.g. s3://bucket/prefix/filename.csv) (for dataset e.g. ``s3://bucket/prefix``). Required if dataset=False or when creating a new dataset sep : str String of length 1. Field delimiter for the output file. index : bool Write row names (index). columns : Optional[List[str]] Columns to write. use_threads : bool, int True to enable concurrent requests, False to disable multiple threads. If enabled os.cpu_count() will be used as the max number of threads. If integer is provided, specified number is used. boto3_session : boto3.Session(), optional Boto3 Session. The default boto3 Session will be used if boto3_session receive None. s3_additional_kwargs : Optional[Dict[str, Any]] Forwarded to botocore requests. e.g. s3_additional_kwargs={'ServerSideEncryption': 'aws:kms', 'SSEKMSKeyId': 'YOUR_KMS_KEY_ARN'} sanitize_columns : bool True to sanitize columns names or False to keep it as is. True value is forced if `dataset=True`. 
dataset : bool If True store as a dataset instead of ordinary file(s) If True, enable all follow arguments: partition_cols, mode, database, table, description, parameters, columns_comments, concurrent_partitioning, catalog_versioning, projection_enabled, projection_types, projection_ranges, projection_values, projection_intervals, projection_digits, catalog_id, schema_evolution. filename_prefix: str, optional If dataset=True, add a filename prefix to the output files. partition_cols: List[str], optional List of column names that will be used to create partitions. Only takes effect if dataset=True. bucketing_info: Tuple[List[str], int], optional Tuple consisting of the column names used for bucketing as the first element and the number of buckets as the second element. Only `str`, `int` and `bool` are supported as column data types for bucketing. concurrent_partitioning: bool If True will increase the parallelism level during the partitions writing. It will decrease the writing time and increase the memory usage. https://aws-data-wrangler.readthedocs.io/en/2.13.0/tutorials/022%20-%20Writing%20Partitions%20Concurrently.html mode : str, optional ``append`` (Default), ``overwrite``, ``overwrite_partitions``. Only takes effect if dataset=True. For details check the related tutorial: https://aws-data-wrangler.readthedocs.io/en/2.13.0/stubs/awswrangler.s3.to_parquet.html#awswrangler.s3.to_parquet catalog_versioning : bool If True and `mode="overwrite"`, creates an archived version of the table catalog before updating it. schema_evolution : bool If True allows schema evolution (new or missing columns), otherwise a exception will be raised. (Only considered if dataset=True and mode in ("append", "overwrite_partitions")). False by default. Related tutorial: https://aws-data-wrangler.readthedocs.io/en/2.13.0/tutorials/014%20-%20Schema%20Evolution.html database : str, optional Glue/Athena catalog: Database name. table : str, optional Glue/Athena catalog: Table name. table_type: str, optional The type of the Glue Table. Set to EXTERNAL_TABLE if None transaction_id: str, optional The ID of the transaction when writing to a Governed Table. dtype : Dict[str, str], optional Dictionary of columns names and Athena/Glue types to be casted. Useful when you have columns with undetermined or mixed data types. (e.g. {'col name': 'bigint', 'col2 name': 'int'}) description : str, optional Glue/Athena catalog: Table description parameters : Dict[str, str], optional Glue/Athena catalog: Key/value pairs to tag the table. columns_comments : Dict[str, str], optional Glue/Athena catalog: Columns names and the related comments (e.g. {'col0': 'Column 0.', 'col1': 'Column 1.', 'col2': 'Partition.'}). regular_partitions : bool Create regular partitions (Non projected partitions) on Glue Catalog. Disable when you will work only with Partition Projection. Keep enabled even when working with projections is useful to keep Redshift Spectrum working with the regular partitions. projection_enabled : bool Enable Partition Projection on Athena (https://docs.aws.amazon.com/athena/latest/ug/partition-projection.html) projection_types : Optional[Dict[str, str]] Dictionary of partitions names and Athena projections types. Valid types: "enum", "integer", "date", "injected" https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html (e.g. {'col_name': 'enum', 'col2_name': 'integer'}) projection_ranges: Optional[Dict[str, str]] Dictionary of partitions names and Athena projections ranges. 
https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html (e.g. {'col_name': '0,10', 'col2_name': '-1,8675309'}) projection_values: Optional[Dict[str, str]] Dictionary of partitions names and Athena projections values. https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html (e.g. {'col_name': 'A,B,Unknown', 'col2_name': 'foo,boo,bar'}) projection_intervals: Optional[Dict[str, str]] Dictionary of partitions names and Athena projections intervals. https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html (e.g. {'col_name': '1', 'col2_name': '5'}) projection_digits: Optional[Dict[str, str]] Dictionary of partitions names and Athena projections digits. https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html (e.g. {'col_name': '1', 'col2_name': '2'}) catalog_id : str, optional The ID of the Data Catalog from which to retrieve Databases. If none is provided, the AWS account ID is used by default. pandas_kwargs : KEYWORD arguments forwarded to pandas.DataFrame.to_csv(). You can NOT pass `pandas_kwargs` explicit, just add valid Pandas arguments in the function call and Wrangler will accept it. e.g. wr.s3.to_csv(df, path, sep='|', na_rep='NULL', decimal=',') https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_csv.html Returns ------- Dict[str, Union[List[str], Dict[str, List[str]]]] Dictionary with: 'paths': List of all stored files paths on S3. 'partitions_values': Dictionary of partitions added with keys as S3 path locations and values as a list of partitions values as str. Examples -------- Writing single file >>> import awswrangler as wr >>> import pandas as pd >>> wr.s3.to_csv( ... df=pd.DataFrame({'col': [1, 2, 3]}), ... path='s3://bucket/prefix/my_file.csv', ... ) { 'paths': ['s3://bucket/prefix/my_file.csv'], 'partitions_values': {} } Writing single file with pandas_kwargs >>> import awswrangler as wr >>> import pandas as pd >>> wr.s3.to_csv( ... df=pd.DataFrame({'col': [1, 2, 3]}), ... path='s3://bucket/prefix/my_file.csv', ... sep='|', ... na_rep='NULL', ... decimal=',' ... ) { 'paths': ['s3://bucket/prefix/my_file.csv'], 'partitions_values': {} } Writing single file encrypted with a KMS key >>> import awswrangler as wr >>> import pandas as pd >>> wr.s3.to_csv( ... df=pd.DataFrame({'col': [1, 2, 3]}), ... path='s3://bucket/prefix/my_file.csv', ... s3_additional_kwargs={ ... 'ServerSideEncryption': 'aws:kms', ... 'SSEKMSKeyId': 'YOUR_KMS_KEY_ARN' ... } ... ) { 'paths': ['s3://bucket/prefix/my_file.csv'], 'partitions_values': {} } Writing partitioned dataset >>> import awswrangler as wr >>> import pandas as pd >>> wr.s3.to_csv( ... df=pd.DataFrame({ ... 'col': [1, 2, 3], ... 'col2': ['A', 'A', 'B'] ... }), ... path='s3://bucket/prefix', ... dataset=True, ... partition_cols=['col2'] ... ) { 'paths': ['s3://.../col2=A/x.csv', 's3://.../col2=B/y.csv'], 'partitions_values: { 's3://.../col2=A/': ['A'], 's3://.../col2=B/': ['B'] } } Writing bucketed dataset >>> import awswrangler as wr >>> import pandas as pd >>> wr.s3.to_csv( ... df=pd.DataFrame({ ... 'col': [1, 2, 3], ... 'col2': ['A', 'A', 'B'] ... }), ... path='s3://bucket/prefix', ... dataset=True, ... bucketing_info=(["col2"], 2) ... ) { 'paths': ['s3://.../x_bucket-00000.csv', 's3://.../col2=B/x_bucket-00001.csv'], 'partitions_values: {} } Writing dataset to S3 with metadata on Athena/Glue Catalog. >>> import awswrangler as wr >>> import pandas as pd >>> wr.s3.to_csv( ... df=pd.DataFrame({ ... 
'col': [1, 2, 3], ... 'col2': ['A', 'A', 'B'] ... }), ... path='s3://bucket/prefix', ... dataset=True, ... partition_cols=['col2'], ... database='default', # Athena/Glue database ... table='my_table' # Athena/Glue table ... ) { 'paths': ['s3://.../col2=A/x.csv', 's3://.../col2=B/y.csv'], 'partitions_values: { 's3://.../col2=A/': ['A'], 's3://.../col2=B/': ['B'] } } Writing dataset to Glue governed table >>> import awswrangler as wr >>> import pandas as pd >>> wr.s3.to_csv( ... df=pd.DataFrame({ ... 'col': [1, 2, 3], ... 'col2': ['A', 'A', 'B'], ... 'col3': [None, None, None] ... }), ... dataset=True, ... mode='append', ... database='default', # Athena/Glue database ... table='my_table', # Athena/Glue table ... table_type='GOVERNED', ... transaction_id="xxx", ... ) { 'paths': ['s3://.../x.csv'], 'partitions_values: {} } Writing dataset casting empty column data type >>> import awswrangler as wr >>> import pandas as pd >>> wr.s3.to_csv( ... df=pd.DataFrame({ ... 'col': [1, 2, 3], ... 'col2': ['A', 'A', 'B'], ... 'col3': [None, None, None] ... }), ... path='s3://bucket/prefix', ... dataset=True, ... database='default', # Athena/Glue database ... table='my_table' # Athena/Glue table ... dtype={'col3': 'date'} ... ) { 'paths': ['s3://.../x.csv'], 'partitions_values: {} } """ if "pandas_kwargs" in pandas_kwargs: raise exceptions.InvalidArgument( "You can NOT pass `pandas_kwargs` explicit, just add valid " "Pandas arguments in the function call and Wrangler will accept it." "e.g. wr.s3.to_csv(df, path, sep='|', na_rep='NULL', decimal=',', compression='gzip')" ) if pandas_kwargs.get("compression") and str( pd.__version__) < LooseVersion("1.2.0"): raise exceptions.InvalidArgument( f"CSV compression on S3 is not supported for Pandas version {pd.__version__}. " "The minimum acceptable version to achive it is Pandas 1.2.0 that requires Python >=3.7.1." ) _validate_args( df=df, table=table, database=database, dataset=dataset, path=path, partition_cols=partition_cols, bucketing_info=bucketing_info, mode=mode, description=description, parameters=parameters, columns_comments=columns_comments, ) # Initializing defaults partition_cols = partition_cols if partition_cols else [] dtype = dtype if dtype else {} partitions_values: Dict[str, List[str]] = {} mode = "append" if mode is None else mode commit_trans: bool = False if transaction_id: table_type = "GOVERNED" filename_prefix = filename_prefix + uuid.uuid4( ).hex if filename_prefix else uuid.uuid4().hex session: boto3.Session = _utils.ensure_session(session=boto3_session) # Sanitize table to respect Athena's standards if (sanitize_columns is True) or (database is not None and table is not None): df, dtype, partition_cols = _sanitize(df=df, dtype=dtype, partition_cols=partition_cols) # Evaluating dtype catalog_table_input: Optional[Dict[str, Any]] = None if database and table: catalog_table_input = catalog._get_table_input( # pylint: disable=protected-access database=database, table=table, boto3_session=session, transaction_id=transaction_id, catalog_id=catalog_id) catalog_path: Optional[str] = None if catalog_table_input: table_type = catalog_table_input["TableType"] catalog_path = catalog_table_input.get("StorageDescriptor", {}).get("Location") if path is None: if catalog_path: path = catalog_path else: raise exceptions.InvalidArgumentValue( "Glue table does not exist in the catalog. Please pass the `path` argument to create it." 
) elif path and catalog_path: if path.rstrip("/") != catalog_path.rstrip("/"): raise exceptions.InvalidArgumentValue( f"The specified path: {path}, does not match the existing Glue catalog table path: {catalog_path}" ) if pandas_kwargs.get("compression") not in ("gzip", "bz2", None): raise exceptions.InvalidArgumentCombination( "If database and table are given, you must use one of these compressions: gzip, bz2 or None." ) if (table_type == "GOVERNED") and (not transaction_id): _logger.debug( "`transaction_id` not specified for GOVERNED table, starting transaction" ) transaction_id = lakeformation.start_transaction( read_only=False, boto3_session=boto3_session) commit_trans = True df = _apply_dtype(df=df, dtype=dtype, catalog_table_input=catalog_table_input, mode=mode) paths: List[str] = [] if dataset is False: pandas_kwargs["sep"] = sep pandas_kwargs["index"] = index pandas_kwargs["columns"] = columns _to_text( file_format="csv", df=df, use_threads=use_threads, path=path, boto3_session=session, s3_additional_kwargs=s3_additional_kwargs, **pandas_kwargs, ) paths = [path] # type: ignore else: compression: Optional[str] = pandas_kwargs.get("compression", None) if database and table: quoting: Optional[int] = csv.QUOTE_NONE escapechar: Optional[str] = "\\" header: Union[bool, List[str]] = pandas_kwargs.get("header", False) date_format: Optional[str] = "%Y-%m-%d %H:%M:%S.%f" pd_kwargs: Dict[str, Any] = {} else: quoting = pandas_kwargs.get("quoting", None) escapechar = pandas_kwargs.get("escapechar", None) header = pandas_kwargs.get("header", True) date_format = pandas_kwargs.get("date_format", None) pd_kwargs = pandas_kwargs.copy() pd_kwargs.pop("quoting", None) pd_kwargs.pop("escapechar", None) pd_kwargs.pop("header", None) pd_kwargs.pop("date_format", None) pd_kwargs.pop("compression", None) df = df[columns] if columns else df columns_types: Dict[str, str] = {} partitions_types: Dict[str, str] = {} if database and table: columns_types, partitions_types = _data_types.athena_types_from_pandas_partitioned( df=df, index=index, partition_cols=partition_cols, dtype=dtype, index_left=True) if schema_evolution is False: _utils.check_schema_changes(columns_types=columns_types, table_input=catalog_table_input, mode=mode) if (catalog_table_input is None) and (table_type == "GOVERNED"): catalog._create_csv_table( # pylint: disable=protected-access database=database, table=table, path=path, columns_types=columns_types, table_type=table_type, partitions_types=partitions_types, bucketing_info=bucketing_info, description=description, parameters=parameters, columns_comments=columns_comments, boto3_session=session, mode=mode, transaction_id=transaction_id, schema_evolution=schema_evolution, catalog_versioning=catalog_versioning, sep=sep, projection_enabled=projection_enabled, projection_types=projection_types, projection_ranges=projection_ranges, projection_values=projection_values, projection_intervals=projection_intervals, projection_digits=projection_digits, projection_storage_location_template=None, catalog_table_input=catalog_table_input, catalog_id=catalog_id, compression=pandas_kwargs.get("compression"), skip_header_line_count=None, serde_library=None, serde_parameters=None, ) catalog_table_input = catalog._get_table_input( # pylint: disable=protected-access database=database, table=table, boto3_session=session, transaction_id=transaction_id, catalog_id=catalog_id, ) paths, partitions_values = _to_dataset( func=_to_text, concurrent_partitioning=concurrent_partitioning, df=df, path_root=path, # type: 
ignore index=index, sep=sep, compression=compression, catalog_id=catalog_id, database=database, table=table, table_type=table_type, transaction_id=transaction_id, filename_prefix=filename_prefix, use_threads=use_threads, partition_cols=partition_cols, partitions_types=partitions_types, bucketing_info=bucketing_info, mode=mode, boto3_session=session, s3_additional_kwargs=s3_additional_kwargs, file_format="csv", quoting=quoting, escapechar=escapechar, header=header, date_format=date_format, **pd_kwargs, ) if database and table: try: serde_info: Dict[str, Any] = {} if catalog_table_input: serde_info = catalog_table_input["StorageDescriptor"][ "SerdeInfo"] serde_library: Optional[str] = serde_info.get( "SerializationLibrary", None) serde_parameters: Optional[Dict[str, str]] = serde_info.get( "Parameters", None) catalog._create_csv_table( # pylint: disable=protected-access database=database, table=table, path=path, columns_types=columns_types, table_type=table_type, partitions_types=partitions_types, bucketing_info=bucketing_info, description=description, parameters=parameters, columns_comments=columns_comments, boto3_session=session, mode=mode, transaction_id=transaction_id, catalog_versioning=catalog_versioning, schema_evolution=schema_evolution, sep=sep, projection_enabled=projection_enabled, projection_types=projection_types, projection_ranges=projection_ranges, projection_values=projection_values, projection_intervals=projection_intervals, projection_digits=projection_digits, projection_storage_location_template=None, catalog_table_input=catalog_table_input, catalog_id=catalog_id, compression=pandas_kwargs.get("compression"), skip_header_line_count=True if header else None, serde_library=serde_library, serde_parameters=serde_parameters, ) if partitions_values and (regular_partitions is True) and (table_type != "GOVERNED"): _logger.debug("partitions_values:\n%s", partitions_values) catalog.add_csv_partitions( database=database, table=table, partitions_values=partitions_values, bucketing_info=bucketing_info, boto3_session=session, sep=sep, serde_library=serde_library, serde_parameters=serde_parameters, catalog_id=catalog_id, columns_types=columns_types, compression=pandas_kwargs.get("compression"), ) if commit_trans: lakeformation.commit_transaction( transaction_id=transaction_id, boto3_session=boto3_session # type: ignore ) except Exception: _logger.debug( "Catalog write failed, cleaning up S3 (paths: %s).", paths) delete_objects( path=paths, use_threads=use_threads, boto3_session=session, s3_additional_kwargs=s3_additional_kwargs, ) raise return {"paths": paths, "partitions_values": partitions_values}
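# --- Hedged sketch (added for illustration; not part of awswrangler) ---------
# to_csv() above (and to_json() below) resolves the dataset root against the Glue
# Catalog: a missing `path` falls back to the registered table location, and a
# `path` that disagrees with that location is rejected. The helper below condenses
# that decision; its name is hypothetical and exceptions.InvalidArgumentValue is
# the module's own error type.
def _resolve_dataset_path_sketch(path: Optional[str], catalog_path: Optional[str]) -> str:
    if path is None:
        if catalog_path is None:
            raise exceptions.InvalidArgumentValue(
                "Glue table does not exist in the catalog. Please pass the `path` argument to create it."
            )
        return catalog_path
    if catalog_path and path.rstrip("/") != catalog_path.rstrip("/"):
        raise exceptions.InvalidArgumentValue(
            f"The specified path: {path}, does not match the existing Glue catalog table path: {catalog_path}"
        )
    return path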
def to_json( # pylint: disable=too-many-arguments,too-many-locals,too-many-statements,too-many-branches df: pd.DataFrame, path: Optional[str] = None, index: bool = True, columns: Optional[List[str]] = None, use_threads: Union[bool, int] = True, boto3_session: Optional[boto3.Session] = None, s3_additional_kwargs: Optional[Dict[str, Any]] = None, sanitize_columns: bool = False, dataset: bool = False, filename_prefix: Optional[str] = None, partition_cols: Optional[List[str]] = None, bucketing_info: Optional[Tuple[List[str], int]] = None, concurrent_partitioning: bool = False, mode: Optional[str] = None, catalog_versioning: bool = False, schema_evolution: bool = True, database: Optional[str] = None, table: Optional[str] = None, table_type: Optional[str] = None, transaction_id: Optional[str] = None, dtype: Optional[Dict[str, str]] = None, description: Optional[str] = None, parameters: Optional[Dict[str, str]] = None, columns_comments: Optional[Dict[str, str]] = None, regular_partitions: bool = True, projection_enabled: bool = False, projection_types: Optional[Dict[str, str]] = None, projection_ranges: Optional[Dict[str, str]] = None, projection_values: Optional[Dict[str, str]] = None, projection_intervals: Optional[Dict[str, str]] = None, projection_digits: Optional[Dict[str, str]] = None, catalog_id: Optional[str] = None, **pandas_kwargs: Any, ) -> Union[List[str], Dict[str, Union[List[str], Dict[str, List[str]]]]]: """Write JSON file on Amazon S3. Note ---- In case of `use_threads=True` the number of threads that will be spawned will be gotten from os.cpu_count(). Note ---- Compression: The minimum acceptable version to achive it is Pandas 1.2.0 that requires Python >= 3.7.1. Parameters ---------- df: pandas.DataFrame Pandas DataFrame https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html path : str Amazon S3 path (e.g. s3://bucket/filename.json). index : bool Write row names (index). columns : Optional[List[str]] Columns to write. use_threads : bool, int True to enable concurrent requests, False to disable multiple threads. If enabled os.cpu_count() will be used as the max number of threads. If integer is provided, specified number is used. boto3_session : boto3.Session(), optional Boto3 Session. The default boto3 Session will be used if boto3_session receive None. s3_additional_kwargs : Optional[Dict[str, Any]] Forwarded to botocore requests. e.g. s3_additional_kwargs={'ServerSideEncryption': 'aws:kms', 'SSEKMSKeyId': 'YOUR_KMS_KEY_ARN'} sanitize_columns : bool True to sanitize columns names or False to keep it as is. True value is forced if `dataset=True`. dataset : bool If True store as a dataset instead of ordinary file(s) If True, enable all follow arguments: partition_cols, mode, database, table, description, parameters, columns_comments, concurrent_partitioning, catalog_versioning, projection_enabled, projection_types, projection_ranges, projection_values, projection_intervals, projection_digits, catalog_id, schema_evolution. filename_prefix: str, optional If dataset=True, add a filename prefix to the output files. partition_cols: List[str], optional List of column names that will be used to create partitions. Only takes effect if dataset=True. bucketing_info: Tuple[List[str], int], optional Tuple consisting of the column names used for bucketing as the first element and the number of buckets as the second element. Only `str`, `int` and `bool` are supported as column data types for bucketing. 
concurrent_partitioning: bool If True will increase the parallelism level during the partitions writing. It will decrease the writing time and increase the memory usage. https://aws-data-wrangler.readthedocs.io/en/2.13.0/tutorials/022%20-%20Writing%20Partitions%20Concurrently.html mode : str, optional ``append`` (Default), ``overwrite``, ``overwrite_partitions``. Only takes effect if dataset=True. For details check the related tutorial: https://aws-data-wrangler.readthedocs.io/en/2.13.0/stubs/awswrangler.s3.to_parquet.html#awswrangler.s3.to_parquet catalog_versioning : bool If True and `mode="overwrite"`, creates an archived version of the table catalog before updating it. schema_evolution : bool If True allows schema evolution (new or missing columns), otherwise a exception will be raised. (Only considered if dataset=True and mode in ("append", "overwrite_partitions")) Related tutorial: https://aws-data-wrangler.readthedocs.io/en/2.13.0/tutorials/014%20-%20Schema%20Evolution.html database : str, optional Glue/Athena catalog: Database name. table : str, optional Glue/Athena catalog: Table name. table_type: str, optional The type of the Glue Table. Set to EXTERNAL_TABLE if None transaction_id: str, optional The ID of the transaction when writing to a Governed Table. dtype : Dict[str, str], optional Dictionary of columns names and Athena/Glue types to be casted. Useful when you have columns with undetermined or mixed data types. (e.g. {'col name': 'bigint', 'col2 name': 'int'}) description : str, optional Glue/Athena catalog: Table description parameters : Dict[str, str], optional Glue/Athena catalog: Key/value pairs to tag the table. columns_comments : Dict[str, str], optional Glue/Athena catalog: Columns names and the related comments (e.g. {'col0': 'Column 0.', 'col1': 'Column 1.', 'col2': 'Partition.'}). regular_partitions : bool Create regular partitions (Non projected partitions) on Glue Catalog. Disable when you will work only with Partition Projection. Keep enabled even when working with projections is useful to keep Redshift Spectrum working with the regular partitions. projection_enabled : bool Enable Partition Projection on Athena (https://docs.aws.amazon.com/athena/latest/ug/partition-projection.html) projection_types : Optional[Dict[str, str]] Dictionary of partitions names and Athena projections types. Valid types: "enum", "integer", "date", "injected" https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html (e.g. {'col_name': 'enum', 'col2_name': 'integer'}) projection_ranges: Optional[Dict[str, str]] Dictionary of partitions names and Athena projections ranges. https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html (e.g. {'col_name': '0,10', 'col2_name': '-1,8675309'}) projection_values: Optional[Dict[str, str]] Dictionary of partitions names and Athena projections values. https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html (e.g. {'col_name': 'A,B,Unknown', 'col2_name': 'foo,boo,bar'}) projection_intervals: Optional[Dict[str, str]] Dictionary of partitions names and Athena projections intervals. https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html (e.g. {'col_name': '1', 'col2_name': '5'}) projection_digits: Optional[Dict[str, str]] Dictionary of partitions names and Athena projections digits. https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html (e.g. 
{'col_name': '1', 'col2_name': '2'}) catalog_id : str, optional The ID of the Data Catalog from which to retrieve Databases. If none is provided, the AWS account ID is used by default. pandas_kwargs: KEYWORD arguments forwarded to pandas.DataFrame.to_json(). You can NOT pass `pandas_kwargs` explicitly, just add valid Pandas arguments in the function call and Wrangler will accept it. e.g. wr.s3.to_json(df, path, lines=True, date_format='iso') https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_json.html Returns ------- List[str] List of written files. Examples -------- Writing JSON file >>> import awswrangler as wr >>> import pandas as pd >>> wr.s3.to_json( ... df=pd.DataFrame({'col': [1, 2, 3]}), ... path='s3://bucket/filename.json', ... ) Writing JSON file using pandas_kwargs >>> import awswrangler as wr >>> import pandas as pd >>> wr.s3.to_json( ... df=pd.DataFrame({'col': [1, 2, 3]}), ... path='s3://bucket/filename.json', ... lines=True, ... date_format='iso' ... ) Writing JSON file encrypted with a KMS key >>> import awswrangler as wr >>> import pandas as pd >>> wr.s3.to_json( ... df=pd.DataFrame({'col': [1, 2, 3]}), ... path='s3://bucket/filename.json', ... s3_additional_kwargs={ ... 'ServerSideEncryption': 'aws:kms', ... 'SSEKMSKeyId': 'YOUR_KMS_KEY_ARN' ... } ... ) """ if "pandas_kwargs" in pandas_kwargs: raise exceptions.InvalidArgument( "You can NOT pass `pandas_kwargs` explicitly, just add valid " "Pandas arguments in the function call and Wrangler will accept it. " "e.g. wr.s3.to_json(df, path, lines=True, date_format='iso')") if pandas_kwargs.get("compression") and str( pd.__version__) < LooseVersion("1.2.0"): raise exceptions.InvalidArgument( f"JSON compression on S3 is not supported for Pandas version {pd.__version__}. " "The minimum acceptable version to achieve it is Pandas 1.2.0, which requires Python >=3.7.1." ) _validate_args( df=df, table=table, database=database, dataset=dataset, path=path, partition_cols=partition_cols, bucketing_info=bucketing_info, mode=mode, description=description, parameters=parameters, columns_comments=columns_comments, ) # Initializing defaults partition_cols = partition_cols if partition_cols else [] dtype = dtype if dtype else {} partitions_values: Dict[str, List[str]] = {} mode = "append" if mode is None else mode commit_trans: bool = False if transaction_id: table_type = "GOVERNED" filename_prefix = filename_prefix + uuid.uuid4( ).hex if filename_prefix else uuid.uuid4().hex session: boto3.Session = _utils.ensure_session(session=boto3_session) # Sanitize table to respect Athena's standards if (sanitize_columns is True) or (database is not None and table is not None): df, dtype, partition_cols = _sanitize(df=df, dtype=dtype, partition_cols=partition_cols) # Evaluating dtype catalog_table_input: Optional[Dict[str, Any]] = None if database and table: catalog_table_input = catalog._get_table_input( # pylint: disable=protected-access database=database, table=table, boto3_session=session, transaction_id=transaction_id, catalog_id=catalog_id) catalog_path: Optional[str] = None if catalog_table_input: table_type = catalog_table_input["TableType"] catalog_path = catalog_table_input.get("StorageDescriptor", {}).get("Location") if path is None: if catalog_path: path = catalog_path else: raise exceptions.InvalidArgumentValue( "Glue table does not exist in the catalog. Please pass the `path` argument to create it."
) elif path and catalog_path: if path.rstrip("/") != catalog_path.rstrip("/"): raise exceptions.InvalidArgumentValue( f"The specified path: {path}, does not match the existing Glue catalog table path: {catalog_path}" ) if pandas_kwargs.get("compression") not in ("gzip", "bz2", None): raise exceptions.InvalidArgumentCombination( "If database and table are given, you must use one of these compressions: gzip, bz2 or None." ) if (table_type == "GOVERNED") and (not transaction_id): _logger.debug( "`transaction_id` not specified for GOVERNED table, starting transaction" ) transaction_id = lakeformation.start_transaction( read_only=False, boto3_session=boto3_session) commit_trans = True df = _apply_dtype(df=df, dtype=dtype, catalog_table_input=catalog_table_input, mode=mode) if dataset is False: return _to_text( file_format="json", df=df, path=path, use_threads=use_threads, boto3_session=session, s3_additional_kwargs=s3_additional_kwargs, **pandas_kwargs, ) compression: Optional[str] = pandas_kwargs.get("compression", None) df = df[columns] if columns else df columns_types: Dict[str, str] = {} partitions_types: Dict[str, str] = {} if database and table: columns_types, partitions_types = _data_types.athena_types_from_pandas_partitioned( df=df, index=index, partition_cols=partition_cols, dtype=dtype) if schema_evolution is False: _utils.check_schema_changes(columns_types=columns_types, table_input=catalog_table_input, mode=mode) if (catalog_table_input is None) and (table_type == "GOVERNED"): catalog._create_json_table( # pylint: disable=protected-access database=database, table=table, path=path, # type: ignore columns_types=columns_types, table_type=table_type, partitions_types=partitions_types, bucketing_info=bucketing_info, description=description, parameters=parameters, columns_comments=columns_comments, boto3_session=session, mode=mode, transaction_id=transaction_id, catalog_versioning=catalog_versioning, schema_evolution=schema_evolution, projection_enabled=projection_enabled, projection_types=projection_types, projection_ranges=projection_ranges, projection_values=projection_values, projection_intervals=projection_intervals, projection_digits=projection_digits, projection_storage_location_template=None, catalog_table_input=catalog_table_input, catalog_id=catalog_id, compression=pandas_kwargs.get("compression"), serde_library=None, serde_parameters=None, ) catalog_table_input = catalog._get_table_input( # pylint: disable=protected-access database=database, table=table, boto3_session=session, transaction_id=transaction_id, catalog_id=catalog_id, ) paths, partitions_values = _to_dataset( func=_to_text, concurrent_partitioning=concurrent_partitioning, df=df, path_root=path, # type: ignore filename_prefix=filename_prefix, index=index, compression=compression, catalog_id=catalog_id, database=database, table=table, table_type=table_type, transaction_id=transaction_id, use_threads=use_threads, partition_cols=partition_cols, partitions_types=partitions_types, bucketing_info=bucketing_info, mode=mode, boto3_session=session, s3_additional_kwargs=s3_additional_kwargs, file_format="json", ) if database and table: try: serde_info: Dict[str, Any] = {} if catalog_table_input: serde_info = catalog_table_input["StorageDescriptor"][ "SerdeInfo"] serde_library: Optional[str] = serde_info.get( "SerializationLibrary", None) serde_parameters: Optional[Dict[str, str]] = serde_info.get( "Parameters", None) catalog._create_json_table( # pylint: disable=protected-access database=database, table=table, path=path, # 
type: ignore columns_types=columns_types, table_type=table_type, partitions_types=partitions_types, bucketing_info=bucketing_info, description=description, parameters=parameters, columns_comments=columns_comments, boto3_session=session, mode=mode, transaction_id=transaction_id, catalog_versioning=catalog_versioning, schema_evolution=schema_evolution, projection_enabled=projection_enabled, projection_types=projection_types, projection_ranges=projection_ranges, projection_values=projection_values, projection_intervals=projection_intervals, projection_digits=projection_digits, projection_storage_location_template=None, catalog_table_input=catalog_table_input, catalog_id=catalog_id, compression=pandas_kwargs.get("compression"), serde_library=serde_library, serde_parameters=serde_parameters, ) if partitions_values and (regular_partitions is True) and (table_type != "GOVERNED"): _logger.debug("partitions_values:\n%s", partitions_values) catalog.add_json_partitions( database=database, table=table, partitions_values=partitions_values, bucketing_info=bucketing_info, boto3_session=session, serde_library=serde_library, serde_parameters=serde_parameters, catalog_id=catalog_id, columns_types=columns_types, compression=pandas_kwargs.get("compression"), ) if commit_trans: lakeformation.commit_transaction( transaction_id=transaction_id, boto3_session=boto3_session # type: ignore ) except Exception: _logger.debug("Catalog write failed, cleaning up S3 (paths: %s).", paths) delete_objects( path=paths, use_threads=use_threads, boto3_session=session, s3_additional_kwargs=s3_additional_kwargs, ) raise return {"paths": paths, "partitions_values": partitions_values}
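# --- Hedged sketch (added for illustration; not part of awswrangler) ---------
# Both to_csv() and to_json() above re-read the SerDe of a pre-existing catalog
# table and pass it back when the table metadata is re-created and partitions are
# added, so the registered table keeps parsing the new files the same way. A
# condensed reproduction of that lookup; the function name is hypothetical.
def _extract_serde_sketch(
    catalog_table_input: Optional[Dict[str, Any]]
) -> Tuple[Optional[str], Optional[Dict[str, str]]]:
    """Return (serde_library, serde_parameters) from an existing Glue table input, if any."""
    if not catalog_table_input:
        return None, None
    serde_info: Dict[str, Any] = catalog_table_input["StorageDescriptor"]["SerdeInfo"]
    return serde_info.get("SerializationLibrary"), serde_info.get("Parameters")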
def _create_parquet_table( database: str, table: str, path: str, columns_types: Dict[str, str], partitions_types: Optional[Dict[str, str]], bucketing_info: Optional[Tuple[List[str], int]], catalog_id: Optional[str], compression: Optional[str], description: Optional[str], parameters: Optional[Dict[str, str]], columns_comments: Optional[Dict[str, str]], mode: str, catalog_versioning: bool, projection_enabled: bool, projection_types: Optional[Dict[str, str]], projection_ranges: Optional[Dict[str, str]], projection_values: Optional[Dict[str, str]], projection_intervals: Optional[Dict[str, str]], projection_digits: Optional[Dict[str, str]], boto3_session: Optional[boto3.Session], catalog_table_input: Optional[Dict[str, Any]], ) -> None: table = sanitize_table_name(table=table) partitions_types = {} if partitions_types is None else partitions_types _logger.debug("catalog_table_input: %s", catalog_table_input) table_input: Dict[str, Any] if (catalog_table_input is not None) and (mode in ("append", "overwrite_partitions")): table_input = catalog_table_input catalog_cols: Dict[str, str] = { x["Name"]: x["Type"] for x in table_input["StorageDescriptor"]["Columns"] } for c, t in columns_types.items(): if c not in catalog_cols: _logger.debug("New column %s with type %s.", c, t) table_input["StorageDescriptor"]["Columns"].append({ "Name": c, "Type": t }) mode = "update" elif t != catalog_cols[c]: # Data type change detected! raise exceptions.InvalidArgumentValue( f"Data type change detected on column {c} (Old type: {catalog_cols[c]} / New type {t})." ) else: table_input = _parquet_table_definition( table=table, path=path, columns_types=columns_types, partitions_types=partitions_types, bucketing_info=bucketing_info, compression=compression, ) table_exist: bool = catalog_table_input is not None _logger.debug("table_exist: %s", table_exist) _create_table( database=database, table=table, description=description, parameters=parameters, columns_comments=columns_comments, mode=mode, catalog_versioning=catalog_versioning, boto3_session=boto3_session, table_input=table_input, table_exist=table_exist, partitions_types=partitions_types, projection_enabled=projection_enabled, projection_types=projection_types, projection_ranges=projection_ranges, projection_values=projection_values, projection_intervals=projection_intervals, projection_digits=projection_digits, catalog_id=catalog_id, )
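# --- Hedged sketch (added for illustration; not part of awswrangler) ---------
# The append/overwrite_partitions branch of _create_parquet_table() above allows
# schema evolution by appending brand-new columns to the existing Glue definition
# while rejecting a type change on a column that already exists. The helper below
# isolates that comparison; its name is hypothetical.
def _evolve_columns_sketch(catalog_cols: Dict[str, str], columns_types: Dict[str, str]) -> List[Dict[str, str]]:
    """Return the Column entries to append to the catalog, raising on incompatible type changes."""
    new_columns: List[Dict[str, str]] = []
    for name, athena_type in columns_types.items():
        if name not in catalog_cols:
            new_columns.append({"Name": name, "Type": athena_type})  # new column: schema evolves
        elif athena_type != catalog_cols[name]:
            raise exceptions.InvalidArgumentValue(
                f"Data type change detected on column {name} "
                f"(Old type: {catalog_cols[name]} / New type {athena_type})."
            )
    return new_columns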