def _get_path_root(path: Union[str, List[str]], dataset: bool) -> Optional[str]: if (dataset is True) and (not isinstance(path, str)): raise exceptions.InvalidArgument( "The path argument must be a string if dataset=True (Amazon S3 prefix)." ) return _prefix_cleanup(str(path)) if dataset is True else None
def _read_text(
    parser_func: Callable[..., pd.DataFrame],
    path: Union[str, List[str]],
    path_suffix: Union[str, List[str], None],
    path_ignore_suffix: Union[str, List[str], None],
    ignore_empty: bool,
    use_threads: bool,
    last_modified_begin: Optional[datetime.datetime],
    last_modified_end: Optional[datetime.datetime],
    boto3_session: Optional[boto3.Session],
    s3_additional_kwargs: Optional[Dict[str, str]],
    chunksize: Optional[int],
    dataset: bool,
    partition_filter: Optional[Callable[[Dict[str, str]], bool]],
    ignore_index: bool,
    **pandas_kwargs: Any,
) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]:
    """Resolve S3 object paths and dispatch to the chunked/single/multi-file text readers.

    `parser_func` is the pandas parser (e.g. read_csv-like) applied to each object.
    Returns a single DataFrame, or an Iterator of DataFrames when `chunksize` is set.

    Raises
    ------
    exceptions.InvalidArgument
        If the pandas `iterator` argument is passed (use `chunksize` instead).
    exceptions.NoFilesFound
        If no S3 objects match the path/suffix/date filters.
    """
    # `iterator` is pandas' own chunking switch; this API exposes it as `chunksize`.
    if "iterator" in pandas_kwargs:
        raise exceptions.InvalidArgument(
            "Please, use the chunksize argument instead of iterator.")
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    # Expand the prefix / list of paths into concrete object keys, applying
    # suffix, emptiness and last-modified filters.
    paths: List[str] = _path2list(
        path=path,
        boto3_session=session,
        suffix=path_suffix,
        ignore_suffix=_get_path_ignore_suffix(
            path_ignore_suffix=path_ignore_suffix),
        ignore_empty=ignore_empty,
        last_modified_begin=last_modified_begin,
        last_modified_end=last_modified_end,
        s3_additional_kwargs=s3_additional_kwargs,
    )
    # path_root is non-None only for dataset reads; it enables partition filtering.
    path_root: Optional[str] = _get_path_root(path=path, dataset=dataset)
    if path_root is not None:
        paths = _apply_partition_filter(path_root=path_root, paths=paths, filter_func=partition_filter)
    if len(paths) < 1:
        raise exceptions.NoFilesFound(f"No files Found on: {path}.")
    _logger.debug("paths:\n%s", paths)
    # Shared keyword arguments for every per-file read below.
    args: Dict[str, Any] = {
        "parser_func": parser_func,
        "boto3_session": session,
        "dataset": dataset,
        "path_root": path_root,
        "pandas_kwargs": pandas_kwargs,
        "s3_additional_kwargs": s3_additional_kwargs,
        "use_threads": use_threads,
    }
    _logger.debug("args:\n%s", pprint.pformat(args))
    ret: Union[pd.DataFrame, Iterator[pd.DataFrame]]
    if chunksize is not None:
        # Lazy iterator of DataFrames.
        ret = _read_text_chunked(paths=paths, chunksize=chunksize, **args)
    elif len(paths) == 1:
        ret = _read_text_file(path=paths[0], **args)
    else:
        # Eagerly read every file and concatenate.
        ret = _union(dfs=[_read_text_file(path=p, **args) for p in paths],
                     ignore_index=ignore_index)
    return ret
def _create_table(
    database: str,
    table: str,
    description: Optional[str],
    parameters: Optional[Dict[str, str]],
    columns_comments: Optional[Dict[str, str]],
    mode: str,
    catalog_versioning: bool,
    boto3_session: Optional[boto3.Session],
    table_input: Dict[str, Any],
):
    """Create or update a Glue Catalog table from a prepared TableInput structure.

    Mutates ``table_input`` in place (description, parameters, column comments)
    before sending it to Glue. ``mode`` must be 'overwrite' or 'append'.
    """
    # Enrich the table definition in place.
    if description is not None:
        table_input["Description"] = description
    if parameters is not None:
        table_input["Parameters"].update(parameters)
    if columns_comments is not None:
        # Comments may target regular columns as well as partition keys.
        for entry in table_input["StorageDescriptor"]["Columns"] + table_input["PartitionKeys"]:
            entry_name: str = entry["Name"]
            if entry_name in columns_comments:
                entry["Comment"] = columns_comments[entry_name]
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    client_glue: boto3.client = _utils.client(service_name="glue", session=session)
    exist: bool = does_table_exist(database=database, table=table, boto3_session=session)
    if mode not in ("overwrite", "append"):  # pragma: no cover
        raise exceptions.InvalidArgument(f"{mode} is not a valid mode. It must be 'overwrite' or 'append'.")
    if (exist is True) and (mode == "overwrite"):
        # catalog_versioning=True keeps archived versions; otherwise skip archiving.
        client_glue.update_table(
            DatabaseName=database, TableInput=table_input, SkipArchive=not catalog_versioning
        )
    elif exist is False:
        client_glue.create_table(DatabaseName=database, TableInput=table_input)
def __get_region_from_session(self) -> str:
    """Extract region from session."""
    # Fail fast if the underlying boto3 session carries no region.
    region_name: Optional[str] = self.boto3_session.region_name
    if region_name is None:
        raise exceptions.InvalidArgument(
            "There is no region_name defined on boto3, please configure it.")
    return region_name
def get_region_from_session(boto3_session: Optional[boto3.Session] = None, default_region: Optional[str] = None) -> str:
    """Extract region from session.

    Falls back to ``default_region`` when the session has no region configured;
    raises if neither source yields a region.
    """
    region: Optional[str] = ensure_session(session=boto3_session).region_name
    if region is None and default_region is not None:
        region = default_region
    if region is None:
        raise exceptions.InvalidArgument("There is no region_name defined on boto3, please configure it.")
    return region
def create_ingestion(
    dataset_name: Optional[str] = None,
    dataset_id: Optional[str] = None,
    ingestion_id: Optional[str] = None,
    account_id: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> str:
    """Create and starts a new SPICE ingestion on a dataset.

    Note
    ----
    You must pass ``dataset_name`` OR ``dataset_id`` argument.

    Parameters
    ----------
    dataset_name : str, optional
        Dataset name.
    dataset_id : str, optional
        Dataset ID.
    ingestion_id : str, optional
        Ingestion ID. A random one is generated when omitted.
    account_id : str, optional
        If None, the account ID will be inferred from your boto3 session.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receive None.

    Returns
    -------
    str
        Ingestion ID

    Examples
    --------
    >>> import awswrangler as wr
    >>> status = wr.quicksight.create_ingestion("my_dataset")
    """
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    if account_id is None:
        account_id = sts.get_account_id(boto3_session=session)
    # Resolve the dataset ID from its name when only the name was supplied.
    if dataset_id is None:
        if dataset_name is None:
            raise exceptions.InvalidArgument(
                "You must pass a not None dataset_name or dataset_id argument."
            )  # pragma: no cover
        dataset_id = get_dataset_id(name=dataset_name, account_id=account_id, boto3_session=session)
    if ingestion_id is None:
        ingestion_id = uuid.uuid4().hex
    client: boto3.client = _utils.client(service_name="quicksight", session=session)
    response: Dict[str, Any] = client.create_ingestion(
        DataSetId=dataset_id, IngestionId=ingestion_id, AwsAccountId=account_id
    )
    return response["IngestionId"]
def _validate_args( start_timestamp: int, end_timestamp: int, ) -> None: if start_timestamp < 0: raise exceptions.InvalidArgument( "`start_time` cannot be a negative value.") if start_timestamp >= end_timestamp: raise exceptions.InvalidArgumentCombination( "`start_time` must be inferior to `end_time`.")
def _get_id(
    name: str,
    func: Callable[..., List[Dict[str, Any]]],
    attr_name: str,
    account_id: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> str:
    """Resolve a single QuickSight ID by name, failing on zero or multiple matches."""
    matches: List[str] = _get_ids(
        name=name, func=func, attr_name=attr_name, account_id=account_id, boto3_session=boto3_session
    )
    # QuickSight allows duplicate names, so disambiguation must be explicit.
    if not matches:
        raise exceptions.InvalidArgument(f"There is no {attr_name} related with name {name}")
    if len(matches) > 1:
        raise exceptions.InvalidArgument(
            f"There is {len(matches)} {attr_name} with name {name}. "
            f"Please pass the id argument to specify "
            f"which one you would like to describe."
        )
    return matches[0]
def delete_template(
    name: Optional[str] = None,
    template_id: Optional[str] = None,
    version_number: Optional[int] = None,
    account_id: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> None:  # pragma: no cover
    """Delete a template.

    Note
    ----
    You must pass a not None ``name`` or ``template_id`` argument.

    Parameters
    ----------
    name : str, optional
        Template name.
    template_id : str, optional
        The ID of the template.
    version_number : int, optional
        Specifies the version of the template that you want to delete.
        If you don't provide a version number, it deletes all versions of the template.
    account_id : str, optional
        If None, the account ID will be inferred from your boto3 session.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receive None.

    Returns
    -------
    None
        None.

    Examples
    --------
    >>> import awswrangler as wr
    >>> wr.quicksight.delete_template(name="...")
    """
    if (name is None) and (template_id is None):
        raise exceptions.InvalidArgument(
            "You must pass a not None name or template_id argument.")
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    # Resolve the template ID from its name when only the name was supplied.
    if (template_id is None) and (name is not None):
        template_id = get_template_id(name=name, account_id=account_id, boto3_session=session)
    delete_kwargs: Dict[str, Any] = {
        "func_name": "delete_template",
        "account_id": account_id,
        "boto3_session": session,
        "TemplateId": template_id,
    }
    if version_number is not None:
        delete_kwargs["VersionNumber"] = version_number
    _delete(**delete_kwargs)
def delete_dashboard(
    name: Optional[str] = None,
    dashboard_id: Optional[str] = None,
    version_number: Optional[int] = None,
    account_id: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> None:  # pragma: no cover
    """Delete a dashboard.

    Note
    ----
    You must pass a not None ``name`` or ``dashboard_id`` argument.

    Parameters
    ----------
    name : str, optional
        Dashboard name.
    dashboard_id : str, optional
        The ID for the dashboard.
    version_number : int, optional
        The version number of the dashboard. If the version number property is provided,
        only the specified version of the dashboard is deleted.
    account_id : str, optional
        If None, the account ID will be inferred from your boto3 session.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receive None.

    Returns
    -------
    None
        None.

    Examples
    --------
    >>> import awswrangler as wr
    >>> wr.quicksight.delete_dashboard(name="...")
    """
    if (name is None) and (dashboard_id is None):
        raise exceptions.InvalidArgument(
            "You must pass a not None name or dashboard_id argument.")
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    # Resolve the dashboard ID from its name when only the name was supplied.
    if (dashboard_id is None) and (name is not None):
        dashboard_id = get_dashboard_id(name=name, account_id=account_id, boto3_session=session)
    delete_kwargs: Dict[str, Any] = {
        "func_name": "delete_dashboard",
        "account_id": account_id,
        "boto3_session": session,
        "DashboardId": dashboard_id,
    }
    if version_number is not None:
        delete_kwargs["VersionNumber"] = version_number
    _delete(**delete_kwargs)
def describe_ingestion(
    ingestion_id: str,
    dataset_name: Optional[str] = None,
    dataset_id: Optional[str] = None,
    account_id: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> Dict[str, Any]:
    """Describe a QuickSight ingestion by ID.

    Note
    ----
    You must pass a not None value for ``dataset_name`` or ``dataset_id`` argument.

    Parameters
    ----------
    ingestion_id : str
        Ingestion ID.
    dataset_name : str, optional
        Dataset name.
    dataset_id : str, optional
        Dataset ID.
    account_id : str, optional
        If None, the account ID will be inferred from your boto3 session.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receive None.

    Returns
    -------
    Dict[str, Any]
        Ingestion Description.

    Examples
    --------
    >>> import awswrangler as wr
    >>> description = wr.quicksight.describe_ingestion(ingestion_id="...", dataset_name="...")
    """
    if (dataset_name is None) and (dataset_id is None):
        # Fixed: message previously referenced a non-existent `name` parameter.
        raise exceptions.InvalidArgument(
            "You must pass a not None dataset_name or dataset_id argument.")
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    if account_id is None:
        account_id = sts.get_account_id(boto3_session=session)
    # Resolve the dataset ID from its name when only the name was supplied.
    if (dataset_id is None) and (dataset_name is not None):
        dataset_id = get_dataset_id(name=dataset_name, account_id=account_id, boto3_session=session)
    client: boto3.client = _utils.client(service_name="quicksight", session=session)
    return cast(
        Dict[str, Any],
        client.describe_ingestion(IngestionId=ingestion_id, AwsAccountId=account_id, DataSetId=dataset_id)["Ingestion"],
    )
def get_data_source_arn(name: str, account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None) -> str:
    """Get QuickSight data source ARN given a name and fails if there is more than 1 ARN associated with this name.

    Note
    ----
    QuickSight accepts duplicated data source names, so this function fails
    (instead of guessing) when the given name matches zero or multiple data sources.
    Use ``get_data_source_arns`` if you want all matching ARNs.

    Parameters
    ----------
    name : str
        Data source name.
    account_id : str, optional
        If None, the account ID will be inferred from your boto3 session.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receive None.

    Returns
    -------
    str
        Data source ARN.

    Examples
    --------
    >>> import awswrangler as wr
    >>> arn = wr.quicksight.get_data_source_arn("...")
    """
    arns: List[str] = get_data_source_arns(name=name, account_id=account_id, boto3_session=boto3_session)
    if len(arns) == 0:
        # Fixed: message previously read "There is not data source".
        raise exceptions.InvalidArgument(
            f"There is no data source with name {name}")
    if len(arns) > 1:
        raise exceptions.InvalidArgument(
            f"There is more than 1 data source with name {name}. "
            f"Please pass the data_source_arn argument to specify "
            f"which one you would like to describe.")
    return arns[0]
def extract_athena_table_columns(
        database: str, table: str, boto3_session: boto3.Session) -> List[Dict[str, str]]:
    """Return QuickSight column definitions for an Athena table, raising if it does not exist."""
    dtypes: Optional[Dict[str, str]] = catalog.get_table_types(
        database=database, table=table, boto3_session=boto3_session)
    if dtypes is None:
        raise exceptions.InvalidArgument(
            f"{database}.{table} does not exist on Athena.")
    # Map each Athena type to its QuickSight equivalent.
    columns: List[Dict[str, str]] = []
    for col_name, col_dtype in dtypes.items():
        columns.append({
            "Name": col_name,
            "Type": _data_types.athena2quicksight(dtype=col_dtype),
        })
    return columns
def _apply_partitions(df: pd.DataFrame, dataset: bool, path: str, path_root: Optional[str]) -> pd.DataFrame: if dataset is False: return df if dataset is True and path_root is None: raise exceptions.InvalidArgument("A path_root is required when dataset=True.") path_root = cast(str, path_root) partitions: Dict[str, str] = _extract_partitions_from_path(path_root=path_root, path=path) _logger.debug("partitions: %s", partitions) count: int = len(df.index) _logger.debug("count: %s", count) for name, value in partitions.items(): df[name] = pd.Categorical.from_codes(np.repeat([0], count), categories=[value]) return df
def describe_data_source_permissions(
    name: Optional[str] = None,
    data_source_id: Optional[str] = None,
    account_id: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> Dict[str, Any]:
    """Describe a QuickSight data source permissions by name or ID.

    Note
    ----
    You must pass a not None ``name`` or ``data_source_id`` argument.

    Parameters
    ----------
    name : str, optional
        Data source name.
    data_source_id : str, optional
        Data source ID.
    account_id : str, optional
        If None, the account ID will be inferred from your boto3 session.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receive None.

    Returns
    -------
    Dict[str, Any]
        Data source Permissions Description.

    Examples
    --------
    >>> import awswrangler as wr
    >>> description = wr.quicksight.describe_data_source_permissions("my-data-source")
    """
    if (name is None) and (data_source_id is None):
        raise exceptions.InvalidArgument(
            "You must pass a not None name or data_source_id argument.")
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    if account_id is None:
        account_id = sts.get_account_id(boto3_session=session)
    # Resolve the data source ID from its name when only the name was supplied.
    if (data_source_id is None) and (name is not None):
        data_source_id = get_data_source_id(name=name, account_id=account_id, boto3_session=session)
    client: boto3.client = _utils.client(service_name="quicksight", session=session)
    response = client.describe_data_source_permissions(
        AwsAccountId=account_id, DataSourceId=data_source_id)
    return cast(Dict[str, Any], response["Permissions"])
def cancel_ingestion(
    ingestion_id: Optional[str] = None,  # fixed: was annotated `str` with a None default
    dataset_name: Optional[str] = None,
    dataset_id: Optional[str] = None,
    account_id: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> None:
    """Cancel an ongoing ingestion of data into SPICE.

    Note
    ----
    You must pass a not None value for ``dataset_name`` or ``dataset_id`` argument.

    Parameters
    ----------
    ingestion_id : str, optional
        Ingestion ID.
    dataset_name : str, optional
        Dataset name.
    dataset_id : str, optional
        Dataset ID.
    account_id : str, optional
        If None, the account ID will be inferred from your boto3 session.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receive None.

    Returns
    -------
    None
        None.

    Examples
    --------
    >>> import awswrangler as wr
    >>> wr.quicksight.cancel_ingestion(ingestion_id="...", dataset_name="...")
    """
    if (dataset_name is None) and (dataset_id is None):
        # Fixed: message previously referenced a non-existent `name` parameter.
        raise exceptions.InvalidArgument(
            "You must pass a not None dataset_name or dataset_id argument.")
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    if account_id is None:
        account_id = sts.get_account_id(boto3_session=session)
    # Resolve the dataset ID from its name when only the name was supplied.
    if (dataset_id is None) and (dataset_name is not None):
        dataset_id = get_dataset_id(name=dataset_name, account_id=account_id, boto3_session=session)
    client: boto3.client = _utils.client(service_name="quicksight", session=session)
    client.cancel_ingestion(IngestionId=ingestion_id, AwsAccountId=account_id, DataSetId=dataset_id)
def delete_dataset(
    name: Optional[str] = None,
    dataset_id: Optional[str] = None,
    account_id: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> None:
    """Delete a dataset.

    Note
    ----
    You must pass a not None ``name`` or ``dataset_id`` argument.

    Parameters
    ----------
    name : str, optional
        Dashboard name.
    dataset_id : str, optional
        The ID for the dataset.
    account_id : str, optional
        If None, the account ID will be inferred from your boto3 session.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receive None.

    Returns
    -------
    None
        None.

    Examples
    --------
    >>> import awswrangler as wr
    >>> wr.quicksight.delete_dataset(name="...")
    """
    if (name is None) and (dataset_id is None):
        raise exceptions.InvalidArgument(
            "You must pass a not None name or dataset_id argument."
        )  # pragma: no cover
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    # Resolve the dataset ID from its name when only the name was supplied.
    if (dataset_id is None) and (name is not None):
        dataset_id = get_dataset_id(name=name, account_id=account_id, boto3_session=session)  # pragma: no cover
    _delete(
        func_name="delete_data_set",
        account_id=account_id,
        boto3_session=session,
        DataSetId=dataset_id,
    )
def list_ingestions(
    dataset_name: Optional[str] = None,
    dataset_id: Optional[str] = None,
    account_id: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> List[Dict[str, Any]]:
    """List the history of SPICE ingestions for a dataset.

    Parameters
    ----------
    dataset_name : str, optional
        Dataset name.
    dataset_id : str, optional
        The ID of the dataset used in the ingestion.
    account_id : str, optional
        If None, the account ID will be inferred from your boto3 session.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receive None.

    Returns
    -------
    List[Dict[str, Any]]
        IAM policy assignments.

    Examples
    --------
    >>> import awswrangler as wr
    >>> ingestions = wr.quicksight.list_ingestions()
    """
    if (dataset_name is None) and (dataset_id is None):
        # Fixed: message previously referenced a non-existent `name` parameter.
        raise exceptions.InvalidArgument(
            "You must pass a not None dataset_name or dataset_id argument."
        )  # pragma: no cover
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    if account_id is None:
        account_id = sts.get_account_id(boto3_session=session)
    # Resolve the dataset ID from its name when only the name was supplied.
    if (dataset_id is None) and (dataset_name is not None):
        dataset_id = get_dataset_id(name=dataset_name, account_id=account_id, boto3_session=session)
    return _list(
        func_name="list_ingestions",
        attr_name="Ingestions",
        account_id=account_id,
        # Fixed: pass the already-ensured session (was the raw, possibly-None
        # boto3_session), consistent with the sibling functions.
        boto3_session=session,
        DataSetId=dataset_id,
    )
def _pyarrow_chunk_generator(
    pq_file: pyarrow.parquet.ParquetFile,
    chunked: Union[bool, int],
    columns: Optional[List[str]],
    use_threads_flag: bool,
) -> Iterator[pa.RecordBatch]:
    """Yield RecordBatch chunks from a ParquetFile.

    ``chunked=True`` uses a 65_536-row default batch size; a positive int sets
    the batch size explicitly; anything else is rejected.
    """
    if isinstance(chunked, int) and chunked > 0 and chunked is not True:
        batch_size = chunked
    elif chunked is True:
        batch_size = 65_536
    else:
        raise exceptions.InvalidArgument(f"chunked: {chunked}")
    yield from pq_file.iter_batches(
        batch_size=batch_size, columns=columns, use_threads=use_threads_flag, use_pandas_metadata=False
    )
def _read_parquet_chunked(
    paths: List[str],
    chunked: Union[bool, int],
    columns: Optional[List[str]],
    categories: Optional[List[str]],
    validate_schema: bool,
    safe: bool,
    boto3_session: boto3.Session,
    dataset: bool,
    path_root: Optional[str],
    s3_additional_kwargs: Optional[Dict[str, str]],
    use_threads: bool,
) -> Iterator[pd.DataFrame]:
    """Yield DataFrames from Parquet files: one per row group (chunked=True) or
    in slices of exactly ``chunked`` rows (chunked=int), carrying the remainder
    of each row group over into the next slice.

    Raises
    ------
    exceptions.InvalidSchemaConvergence
        If validate_schema=True and two files disagree on schema.
    exceptions.InvalidArgument
        If ``chunked`` is neither True nor a positive int.
    """
    next_slice: Optional[pd.DataFrame] = None  # leftover rows (< chunked) carried between row groups
    last_schema: Optional[Dict[str, str]] = None
    last_path: str = ""
    for path in paths:
        with open_s3_object(
            path=path,
            mode="rb",
            use_threads=use_threads,
            s3_block_size=10_485_760,  # 10 MB (10 * 2**20)
            s3_additional_kwargs=s3_additional_kwargs,
            boto3_session=boto3_session,
        ) as f:
            pq_file: pyarrow.parquet.ParquetFile = pyarrow.parquet.ParquetFile(
                source=f, read_dictionary=categories)
            schema: Dict[str, str] = _data_types.athena_types_from_pyarrow_schema(
                schema=pq_file.schema.to_arrow_schema(), partitions=None)[0]
            if validate_schema is True and last_schema is not None:
                if schema != last_schema:
                    raise exceptions.InvalidSchemaConvergence(
                        f"Was detect at least 2 different schemas:\n"
                        f" - {last_path} -> {last_schema}\n"
                        f" - {path} -> {schema}")
            last_schema = schema
            last_path = path
            num_row_groups: int = pq_file.num_row_groups
            _logger.debug("num_row_groups: %s", num_row_groups)
            for i in range(num_row_groups):
                _logger.debug("Reading Row Group %s...", i)
                df: pd.DataFrame = _arrowtable2df(
                    table=pq_file.read_row_group(
                        i=i, columns=columns, use_threads=use_threads, use_pandas_metadata=False),
                    categories=categories,
                    safe=safe,
                    use_threads=use_threads,
                    dataset=dataset,
                    path=path,
                    path_root=path_root,
                )
                if chunked is True:
                    yield df
                elif isinstance(chunked, int) and chunked > 0:
                    # Prepend any carried-over remainder, then emit full slices.
                    if next_slice is not None:
                        df = _union(dfs=[next_slice, df], ignore_index=None)
                    while len(df.index) >= chunked:
                        yield df.iloc[:chunked]
                        df = df.iloc[chunked:]
                    next_slice = None if df.empty else df
                else:
                    raise exceptions.InvalidArgument(f"chunked: {chunked}")
    # BUG FIX: the final partial slice was previously dropped after the last
    # file, silently losing up to (chunked - 1) rows. Emit it here.
    if next_slice is not None:
        yield next_slice
def create_athena_dataset(
    name: str,
    database: Optional[str] = None,
    table: Optional[str] = None,
    sql: Optional[str] = None,
    sql_name: str = "CustomSQL",
    data_source_name: Optional[str] = None,
    data_source_arn: Optional[str] = None,
    import_mode: str = "DIRECT_QUERY",
    allowed_to_use: Optional[List[str]] = None,
    allowed_to_manage: Optional[List[str]] = None,
    logical_table_alias: str = "LogicalTable",
    rename_columns: Optional[Dict[str, str]] = None,
    cast_columns_types: Optional[Dict[str, str]] = None,
    tags: Optional[Dict[str, str]] = None,
    account_id: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> str:
    """Create a QuickSight dataset.

    Note
    ----
    You will not be able to see the dataset in the console
    if you not pass your user to one of the ``allowed_*`` arguments.

    Note
    ----
    You must pass ``database``/``table`` OR ``sql`` argument.

    Note
    ----
    You must pass ``data_source_name`` OR ``data_source_arn`` argument.

    Parameters
    ----------
    name : str
        Dataset name.
    database : str
        Athena's database name.
    table : str
        Athena's table name.
    sql : str
        Use a SQL query to define your table.
    sql_name : str
        Query name.
    data_source_name : str, optional
        QuickSight data source name.
    data_source_arn : str, optional
        QuickSight data source ARN.
    import_mode : str
        Indicates whether you want to import the data into SPICE.
        'SPICE'|'DIRECT_QUERY'
    tags : Dict[str, str], optional
        Key/Value collection to put on the Cluster.
        e.g. {"foo": "boo", "bar": "xoo"})
    allowed_to_use : optional
        List of principals that will be allowed to see and use the data source.
        e.g. ["john", "Mary"]
    allowed_to_manage : optional
        List of principals that will be allowed to see, use, update and delete the data source.
        e.g. ["Mary"]
    logical_table_alias : str
        A display name for the logical table.
    rename_columns : Dict[str, str], optional
        Dictionary to map column renames. e.g. {"old_name": "new_name", "old_name2": "new_name2"}
    cast_columns_types : Dict[str, str], optional
        Dictionary to map column casts. e.g. {"col_name": "STRING", "col_name2": "DECIMAL"}
        Valid types: 'STRING'|'INTEGER'|'DECIMAL'|'DATETIME'
    account_id : str, optional
        If None, the account ID will be inferred from your boto3 session.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receive None.

    Returns
    -------
    str
        Dataset ID.

    Examples
    --------
    >>> import awswrangler as wr
    >>> dataset_id = wr.quicksight.create_athena_dataset(
    ...     name="...",
    ...     database="..."
    ...     table="..."
    ...     data_source_name="..."
    ...     allowed_to_manage=["Mary"]
    ... )
    """
    # --- Argument validation -------------------------------------------------
    if (data_source_name is None) and (data_source_arn is None):
        raise exceptions.InvalidArgument("You must pass a not None data_source_name or data_source_arn argument.")
    # NOTE(review): sql is only required when BOTH database and table are
    # missing — a table-only (or database-only) call slips through to the
    # RelationalTable branch below; confirm this is intended.
    if ((database is None) and (table is None)) and (sql is None):
        raise exceptions.InvalidArgument("You must pass database/table OR sql argument.")
    if (database is not None) and (sql is not None):
        raise exceptions.InvalidArgument(
            "If you provide sql argument, please include the database name inside the sql statement."
            "Do NOT pass in with database argument."
        )
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    client: boto3.client = _utils.client(service_name="quicksight", session=session)
    if account_id is None:
        account_id = sts.get_account_id(boto3_session=session)
    # Resolve the data source ARN from its name when only the name was supplied.
    if (data_source_arn is None) and (data_source_name is not None):
        data_source_arn = get_data_source_arn(name=data_source_name, account_id=account_id, boto3_session=session)
    # --- Physical table definition: custom SQL or a direct Athena table -----
    if sql is not None:
        physical_table: Dict[str, Dict[str, Any]] = {
            "CustomSql": {
                "DataSourceArn": data_source_arn,
                "Name": sql_name,
                "SqlQuery": sql,
                "Columns": extract_athena_query_columns(
                    sql=sql,
                    data_source_arn=data_source_arn,  # type: ignore
                    account_id=account_id,
                    boto3_session=session,
                ),
            }
        }
    else:
        physical_table = {
            "RelationalTable": {
                "DataSourceArn": data_source_arn,
                "Schema": database,
                "Name": table,
                "InputColumns": extract_athena_table_columns(
                    database=database,  # type: ignore
                    table=table,  # type: ignore
                    boto3_session=session,
                ),
            }
        }
    table_uuid: str = uuid.uuid4().hex
    dataset_id: str = uuid.uuid4().hex
    # --- Assemble the create_data_set request --------------------------------
    args: Dict[str, Any] = {
        "AwsAccountId": account_id,
        "DataSetId": dataset_id,
        "Name": name,
        "ImportMode": import_mode,
        "PhysicalTableMap": {table_uuid: physical_table},
        "LogicalTableMap": {table_uuid: {"Alias": logical_table_alias, "Source": {"PhysicalTableId": table_uuid}}},
    }
    # Optional rename/cast transformations on the logical table.
    trans: List[Dict[str, Dict[str, Any]]] = _generate_transformations(
        rename_columns=rename_columns, cast_columns_types=cast_columns_types
    )
    if trans:
        args["LogicalTableMap"][table_uuid]["DataTransforms"] = trans
    # Optional use/manage permission grants.
    permissions: List[Dict[str, Union[str, List[str]]]] = _generate_permissions(
        resource="dataset",
        account_id=account_id,
        boto3_session=session,
        allowed_to_use=allowed_to_use,
        allowed_to_manage=allowed_to_manage,
    )
    if permissions:
        args["Permissions"] = permissions
    if tags is not None:
        _tags: List[Dict[str, str]] = [{"Key": k, "Value": v} for k, v in tags.items()]
        args["Tags"] = _tags
    client.create_data_set(**args)
    return dataset_id
def to_sql(df: pd.DataFrame, con: sqlalchemy.engine.Engine, **pandas_kwargs: Any) -> None:
    """Write records stored in a DataFrame to a SQL database.

    Support for **Redshift**, **PostgreSQL** and **MySQL**.

    Support for all pandas to_sql() arguments:
    https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_sql.html

    Note
    ----
    Redshift: For large DataFrames (1MM+ rows) consider the function **wr.db.copy_to_redshift()**.

    Note
    ----
    Redshift: `index=False` will be forced.

    Parameters
    ----------
    df : pandas.DataFrame
        Pandas DataFrame https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html
    con : sqlalchemy.engine.Engine
        SQLAlchemy Engine. Please use,
        wr.db.get_engine(), wr.db.get_redshift_temp_engine() or wr.catalog.get_engine()
    pandas_kwargs
        KEYWORD arguments forwarded to pandas.DataFrame.to_sql(). You can NOT pass `pandas_kwargs` explicit, just add
        valid Pandas arguments in the function call and Wrangler will accept it.
        e.g. wr.db.to_sql(df, con=con, name="table_name", schema="schema_name", if_exists="replace", index=False)
        https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_sql.html

    Returns
    -------
    None
        None.

    Examples
    --------
    Writing to Redshift with temporary credentials

    >>> import awswrangler as wr
    >>> import pandas as pd
    >>> wr.db.to_sql(
    ...     df=pd.DataFrame({'col': [1, 2, 3]}),
    ...     con=wr.db.get_redshift_temp_engine(cluster_identifier="...", user="******"),
    ...     name="table_name",
    ...     schema="schema_name"
    ... )

    Writing to Redshift with temporary credentials and using pandas_kwargs

    >>> import awswrangler as wr
    >>> import pandas as pd
    >>> wr.db.to_sql(
    ...     df=pd.DataFrame({'col': [1, 2, 3]}),
    ...     con=wr.db.get_redshift_temp_engine(cluster_identifier="...", user="******"),
    ...     name="table_name",
    ...     schema="schema_name",
    ...     if_exists="replace",
    ...     index=False,
    ... )

    Writing to Redshift from Glue Catalog Connections

    >>> import awswrangler as wr
    >>> import pandas as pd
    >>> wr.db.to_sql(
    ...     df=pd.DataFrame({'col': [1, 2, 3]}),
    ...     con=wr.catalog.get_engine(connection="..."),
    ...     name="table_name",
    ...     schema="schema_name"
    ... )

    """
    if "pandas_kwargs" in pandas_kwargs:
        raise exceptions.InvalidArgument(
            "You can NOT pass `pandas_kwargs` explicit, just add valid "
            "Pandas arguments in the function call and Wrangler will accept it."
            "e.g. wr.db.to_sql(df, con, name='...', schema='...', if_exists='replace')"
        )
    if df.empty is True:
        raise exceptions.EmptyDataFrame()
    if not isinstance(con, sqlalchemy.engine.Engine):
        raise exceptions.InvalidConnection(
            "Invalid 'con' argument, please pass a "
            "SQLAlchemy Engine. Use wr.db.get_engine(), "
            "wr.db.get_redshift_temp_engine() or wr.catalog.get_engine()")
    # Merge user-supplied dtype overrides with inferred SQLAlchemy types.
    if "dtype" in pandas_kwargs:
        cast_columns: Dict[str, VisitableType] = pandas_kwargs["dtype"]
    else:
        cast_columns = {}
    dtypes: Dict[str, VisitableType] = _data_types.sqlalchemy_types_from_pandas(
        df=df, db_type=con.name, dtype=cast_columns)
    # pandas_kwargs is mutated in place before being forwarded to DataFrame.to_sql.
    pandas_kwargs["dtype"] = dtypes
    pandas_kwargs["con"] = con
    if pandas_kwargs["con"].name.lower(
    ) == "redshift":  # Redshift does not accept index
        pandas_kwargs["index"] = False
    # Retry on transient InternalError (e.g. serialization conflicts).
    _utils.try_it(f=df.to_sql, ex=sqlalchemy.exc.InternalError, **pandas_kwargs)
def to_csv(  # pylint: disable=too-many-arguments,too-many-locals
    df: pd.DataFrame,
    path: str,
    sep: str = ",",
    index: bool = True,
    columns: Optional[List[str]] = None,
    use_threads: bool = True,
    boto3_session: Optional[boto3.Session] = None,
    s3_additional_kwargs: Optional[Dict[str, Any]] = None,
    sanitize_columns: bool = False,
    dataset: bool = False,
    partition_cols: Optional[List[str]] = None,
    concurrent_partitioning: bool = False,
    mode: Optional[str] = None,
    catalog_versioning: bool = False,
    database: Optional[str] = None,
    table: Optional[str] = None,
    dtype: Optional[Dict[str, str]] = None,
    description: Optional[str] = None,
    parameters: Optional[Dict[str, str]] = None,
    columns_comments: Optional[Dict[str, str]] = None,
    regular_partitions: bool = True,
    projection_enabled: bool = False,
    projection_types: Optional[Dict[str, str]] = None,
    projection_ranges: Optional[Dict[str, str]] = None,
    projection_values: Optional[Dict[str, str]] = None,
    projection_intervals: Optional[Dict[str, str]] = None,
    projection_digits: Optional[Dict[str, str]] = None,
    catalog_id: Optional[str] = None,
    **pandas_kwargs: Any,
) -> Dict[str, Union[List[str], Dict[str, List[str]]]]:
    """Write CSV file or dataset on Amazon S3.

    The concept of Dataset goes beyond the simple idea of ordinary files and enable more
    complex features like partitioning and catalog integration (Amazon Athena/AWS Glue Catalog).

    Note
    ----
    If `database` and `table` arguments are passed, the table name and all column names
    will be automatically sanitized using `wr.catalog.sanitize_table_name` and
    `wr.catalog.sanitize_column_name`.
    Please, pass `sanitize_columns=True` to enforce this behaviour always.

    Note
    ----
    If `dataset=True`, `pandas_kwargs` will be ignored due to the restrictive
    quoting, date_format, escapechar, encoding, etc required by Athena/Glue Catalog.

    Note
    ----
    By now Pandas does not support in-memory CSV compression.
    https://github.com/pandas-dev/pandas/issues/22555
    So the `compression` will not be supported on Wrangler too.

    Note
    ----
    On `append` mode, the `parameters` will be upsert on an existing table.

    Note
    ----
    In case of `use_threads=True` the number of threads
    that will be spawned will be gotten from os.cpu_count().

    Parameters
    ----------
    df: pandas.DataFrame
        Pandas DataFrame https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html
    path : str
        Amazon S3 path (e.g. s3://bucket/filename.csv).
    sep : str
        String of length 1. Field delimiter for the output file.
    index : bool
        Write row names (index).
    columns : List[str], optional
        Columns to write.
    use_threads : bool
        True to enable concurrent requests, False to disable multiple threads.
        If enabled os.cpu_count() will be used as the max number of threads.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 Session will be used if boto3_session receive None.
    s3_additional_kwargs : Optional[Dict[str, Any]]
        Forward to botocore requests. Valid parameters: "ACL", "Metadata", "ServerSideEncryption", "StorageClass",
        "SSECustomerAlgorithm", "SSECustomerKey", "SSEKMSKeyId", "SSEKMSEncryptionContext", "Tagging".
        e.g. s3_additional_kwargs={'ServerSideEncryption': 'aws:kms', 'SSEKMSKeyId': 'YOUR_KMY_KEY_ARN'}
    sanitize_columns : bool
        True to sanitize columns names or False to keep it as is.
        True value is forced if `dataset=True`.
    dataset : bool
        If True store a parquet dataset instead of a ordinary file(s)
        If True, enable all follow arguments:
        partition_cols, mode, database, table, description, parameters, columns_comments,
        concurrent_partitioning, catalog_versioning, projection_enabled, projection_types,
        projection_ranges, projection_values, projection_intervals, projection_digits,
        catalog_id, schema_evolution.
    partition_cols: List[str], optional
        List of column names that will be used to create partitions. Only takes effect if dataset=True.
    concurrent_partitioning: bool
        If True will increase the parallelism level during the partitions writing. It will decrease the
        writing time and increase the memory usage.
        https://github.com/awslabs/aws-data-wrangler/blob/master/tutorials/022%20-%20Writing%20Partitions%20Concurrently.ipynb
    mode : str, optional
        ``append`` (Default), ``overwrite``, ``overwrite_partitions``. Only takes effect if dataset=True.
    catalog_versioning : bool
        If True and `mode="overwrite"`, creates an archived version of the table catalog before updating it.
    database : str, optional
        Glue/Athena catalog: Database name.
    table : str, optional
        Glue/Athena catalog: Table name.
    dtype : Dict[str, str], optional
        Dictionary of columns names and Athena/Glue types to be casted.
        Useful when you have columns with undetermined or mixed data types.
        (e.g. {'col name': 'bigint', 'col2 name': 'int'})
    description : str, optional
        Glue/Athena catalog: Table description
    parameters : Dict[str, str], optional
        Glue/Athena catalog: Key/value pairs to tag the table.
    columns_comments : Dict[str, str], optional
        Glue/Athena catalog: Columns names and the related comments
        (e.g. {'col0': 'Column 0.', 'col1': 'Column 1.', 'col2': 'Partition.'}).
    regular_partitions : bool
        Create regular partitions (Non projected partitions) on Glue Catalog.
        Disable when you will work only with Partition Projection.
        Keep enabled even when working with projections is useful to keep
        Redshift Spectrum working with the regular partitions.
    projection_enabled : bool
        Enable Partition Projection on Athena
        (https://docs.aws.amazon.com/athena/latest/ug/partition-projection.html)
    projection_types : Optional[Dict[str, str]]
        Dictionary of partitions names and Athena projections types.
        Valid types: "enum", "integer", "date", "injected"
        (e.g. {'col_name': 'enum', 'col2_name': 'integer'})
    projection_ranges: Optional[Dict[str, str]]
        Dictionary of partitions names and Athena projections ranges.
        (e.g. {'col_name': '0,10', 'col2_name': '-1,8675309'})
    projection_values: Optional[Dict[str, str]]
        Dictionary of partitions names and Athena projections values.
        (e.g. {'col_name': 'A,B,Unknown', 'col2_name': 'foo,boo,bar'})
    projection_intervals: Optional[Dict[str, str]]
        Dictionary of partitions names and Athena projections intervals.
        (e.g. {'col_name': '1', 'col2_name': '5'})
    projection_digits: Optional[Dict[str, str]]
        Dictionary of partitions names and Athena projections digits.
        (e.g. {'col_name': '1', 'col2_name': '2'})
    catalog_id : str, optional
        The ID of the Data Catalog from which to retrieve Databases.
        If none is provided, the AWS account ID is used by default.
    pandas_kwargs :
        KEYWORD arguments forwarded to pandas.DataFrame.to_csv(). You can NOT pass `pandas_kwargs` explicit, just add
        valid Pandas arguments in the function call and Wrangler will accept it.
        e.g. wr.s3.to_csv(df, path, sep='|', na_rep='NULL', decimal=',')
        https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_csv.html

    Returns
    -------
    Dict[str, Union[List[str], Dict[str, List[str]]]]
        Dictionary with:
        'paths': List of all stored files paths on S3.
        'partitions_values': Dictionary of partitions added with keys as S3 path locations
        and values as a list of partitions values as str.

    Examples
    --------
    Writing single file

    >>> import awswrangler as wr
    >>> import pandas as pd
    >>> wr.s3.to_csv(
    ...     df=pd.DataFrame({'col': [1, 2, 3]}),
    ...     path='s3://bucket/prefix/my_file.csv',
    ... )
    {
        'paths': ['s3://bucket/prefix/my_file.csv'],
        'partitions_values': {}
    }

    Writing partitioned dataset with metadata on Athena/Glue Catalog

    >>> import awswrangler as wr
    >>> import pandas as pd
    >>> wr.s3.to_csv(
    ...     df=pd.DataFrame({
    ...         'col': [1, 2, 3],
    ...         'col2': ['A', 'A', 'B']
    ...     }),
    ...     path='s3://bucket/prefix',
    ...     dataset=True,
    ...     partition_cols=['col2'],
    ...     database='default',  # Athena/Glue database
    ...     table='my_table'  # Athena/Glue table
    ... )
    {
        'paths': ['s3://.../col2=A/x.csv', 's3://.../col2=B/y.csv'],
        'partitions_values': {
            's3://.../col2=A/': ['A'],
            's3://.../col2=B/': ['B']
        }
    }

    """
    if "pandas_kwargs" in pandas_kwargs:
        raise exceptions.InvalidArgument(
            "You can NOT pass `pandas_kwargs` explicit, just add valid "
            "Pandas arguments in the function call and Wrangler will accept it."
            "e.g. wr.s3.to_csv(df, path, sep='|', na_rep='NULL', decimal=',')"
        )
    _validate_args(
        df=df,
        table=table,
        database=database,
        dataset=dataset,
        path=path,
        partition_cols=partition_cols,
        mode=mode,
        description=description,
        parameters=parameters,
        columns_comments=columns_comments,
    )

    # Initializing defaults
    partition_cols = partition_cols if partition_cols else []
    dtype = dtype if dtype else {}
    partitions_values: Dict[str, List[str]] = {}
    mode = "append" if mode is None else mode
    session: boto3.Session = _utils.ensure_session(session=boto3_session)

    # Sanitize table to respect Athena's standards
    if (sanitize_columns is True) or (database is not None and table is not None):
        df, dtype, partition_cols = _sanitize(df=df, dtype=dtype, partition_cols=partition_cols)

    # Evaluating dtype (merging any catalog schema already registered for the target table)
    catalog_table_input: Optional[Dict[str, Any]] = None
    if database is not None and table is not None:
        catalog_table_input = catalog._get_table_input(  # pylint: disable=protected-access
            database=database, table=table, boto3_session=session, catalog_id=catalog_id
        )
    df = _apply_dtype(df=df, dtype=dtype, catalog_table_input=catalog_table_input, mode=mode)

    if dataset is False:
        # Plain single-file write: honor user's pandas kwargs.
        pandas_kwargs["sep"] = sep
        pandas_kwargs["index"] = index
        pandas_kwargs["columns"] = columns
        _to_text(
            file_format="csv",
            df=df,
            use_threads=use_threads,
            path=path,
            boto3_session=session,
            s3_additional_kwargs=s3_additional_kwargs,
            **pandas_kwargs,
        )
        paths = [path]
    else:
        # Dataset write: fixed CSV dialect (no quoting, backslash escape, no header)
        # so Athena/Glue can read the files.
        df = df[columns] if columns else df
        paths, partitions_values = _to_dataset(
            func=_to_text,
            concurrent_partitioning=concurrent_partitioning,
            df=df,
            path_root=path,
            index=index,
            sep=sep,
            use_threads=use_threads,
            partition_cols=partition_cols,
            mode=mode,
            boto3_session=session,
            s3_additional_kwargs=s3_additional_kwargs,
            file_format="csv",
            quoting=csv.QUOTE_NONE,
            escapechar="\\",
            header=False,
            date_format="%Y-%m-%d %H:%M:%S.%f",
        )
        if (database is not None) and (table is not None):
            try:
                columns_types, partitions_types = _data_types.athena_types_from_pandas_partitioned(
                    df=df, index=index, partition_cols=partition_cols, dtype=dtype, index_left=True
                )
                catalog._create_csv_table(  # pylint: disable=protected-access
                    database=database,
                    table=table,
                    path=path,
                    columns_types=columns_types,
                    partitions_types=partitions_types,
                    description=description,
                    parameters=parameters,
                    columns_comments=columns_comments,
                    boto3_session=session,
                    mode=mode,
                    catalog_versioning=catalog_versioning,
                    sep=sep,
                    projection_enabled=projection_enabled,
                    projection_types=projection_types,
                    projection_ranges=projection_ranges,
                    projection_values=projection_values,
                    projection_intervals=projection_intervals,
                    projection_digits=projection_digits,
                    catalog_table_input=catalog_table_input,
                    catalog_id=catalog_id,
                    compression=None,
                    skip_header_line_count=None,
                )
                if partitions_values and (regular_partitions is True):
                    _logger.debug("partitions_values:\n%s", partitions_values)
                    catalog.add_csv_partitions(
                        database=database,
                        table=table,
                        partitions_values=partitions_values,
                        boto3_session=session,
                        sep=sep,
                        catalog_id=catalog_id,
                        columns_types=columns_types,
                    )
            except Exception:
                # Keep S3 and the catalog consistent: remove the just-written objects
                # before propagating the catalog failure.
                _logger.debug("Catalog write failed, cleaning up S3 (paths: %s).", paths)
                delete_objects(path=paths, use_threads=use_threads, boto3_session=session)
                raise
    return {"paths": paths, "partitions_values": partitions_values}
def to_json(
    df: pd.DataFrame,
    path: str,
    boto3_session: Optional[boto3.Session] = None,
    s3_additional_kwargs: Optional[Dict[str, Any]] = None,
    use_threads: bool = True,
    **pandas_kwargs: Any,
) -> None:
    """Write JSON file on Amazon S3.

    Note
    ----
    In case of `use_threads=True` the number of threads
    that will be spawned will be gotten from os.cpu_count().

    Parameters
    ----------
    df: pandas.DataFrame
        Pandas DataFrame https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html
    path : str
        Amazon S3 path (e.g. s3://bucket/filename.json).
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 Session will be used if boto3_session receive None.
    s3_additional_kwargs : Optional[Dict[str, Any]]
        Forward to botocore requests. Valid parameters: "ACL", "Metadata", "ServerSideEncryption", "StorageClass",
        "SSECustomerAlgorithm", "SSECustomerKey", "SSEKMSKeyId", "SSEKMSEncryptionContext", "Tagging".
        e.g. s3_additional_kwargs={'ServerSideEncryption': 'aws:kms', 'SSEKMSKeyId': 'YOUR_KMY_KEY_ARN'}
    use_threads : bool
        True to enable concurrent requests, False to disable multiple threads.
        If enabled os.cpu_count() will be used as the max number of threads.
    pandas_kwargs:
        KEYWORD arguments forwarded to pandas.DataFrame.to_json(). You can NOT pass `pandas_kwargs` explicit, just add
        valid Pandas arguments in the function call and Wrangler will accept it.
        e.g. wr.s3.to_json(df, path, lines=True, date_format='iso')
        https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_json.html

    Returns
    -------
    None
        None.

    Examples
    --------
    Writing JSON file

    >>> import awswrangler as wr
    >>> import pandas as pd
    >>> wr.s3.to_json(
    ...     df=pd.DataFrame({'col': [1, 2, 3]}),
    ...     path='s3://bucket/filename.json',
    ... )

    Writing JSON file using pandas_kwargs

    >>> import awswrangler as wr
    >>> import pandas as pd
    >>> wr.s3.to_json(
    ...     df=pd.DataFrame({'col': [1, 2, 3]}),
    ...     path='s3://bucket/filename.json',
    ...     lines=True,
    ...     date_format='iso'
    ... )

    Writing JSON file encrypted with a KMS key

    >>> import awswrangler as wr
    >>> import pandas as pd
    >>> wr.s3.to_json(
    ...     df=pd.DataFrame({'col': [1, 2, 3]}),
    ...     path='s3://bucket/filename.json',
    ...     s3_additional_kwargs={
    ...         'ServerSideEncryption': 'aws:kms',
    ...         'SSEKMSKeyId': 'YOUR_KMY_KEY_ARN'
    ...     }
    ... )

    """
    if "pandas_kwargs" in pandas_kwargs:
        raise exceptions.InvalidArgument(
            "You can NOT pass `pandas_kwargs` explicit, just add valid "
            "Pandas arguments in the function call and Wrangler will accept it."
            "e.g. wr.s3.to_json(df, path, lines=True, date_format='iso')"
        )
    _to_text(
        file_format="json",
        df=df,
        path=path,
        use_threads=use_threads,
        boto3_session=boto3_session,
        s3_additional_kwargs=s3_additional_kwargs,
        **pandas_kwargs,
    )
def _read_parquet_chunked(  # pylint: disable=too-many-branches
    paths: List[str],
    chunked: Union[bool, int],
    validate_schema: bool,
    ignore_index: Optional[bool],
    columns: Optional[List[str]],
    categories: Optional[List[str]],
    safe: bool,
    map_types: bool,
    boto3_session: boto3.Session,
    dataset: bool,
    path_root: Optional[str],
    s3_additional_kwargs: Optional[Dict[str, str]],
    use_threads: Union[bool, int],
) -> Iterator[pd.DataFrame]:
    """Yield DataFrames read incrementally from a list of Parquet files on S3.

    If ``chunked`` is True, yields one DataFrame per Parquet batch/row-group.
    If ``chunked`` is a positive int, re-slices the stream so every yielded
    DataFrame has exactly ``chunked`` rows (except possibly the final one).
    With ``validate_schema=True``, raises InvalidSchemaConvergence when two
    files disagree on their Athena-mapped schema.
    """
    next_slice: Optional[pd.DataFrame] = None  # leftover rows (< chunked) carried across files/chunks
    last_schema: Optional[Dict[str, str]] = None
    last_path: str = ""
    for path in paths:
        with open_s3_object(
            path=path,
            mode="rb",
            use_threads=use_threads,
            s3_block_size=10_485_760,  # 10 MB (10 * 2**20)
            s3_additional_kwargs=s3_additional_kwargs,
            boto3_session=boto3_session,
        ) as f:
            pq_file: Optional[pyarrow.parquet.ParquetFile] = _pyarrow_parquet_file_wrapper(
                source=f, read_dictionary=categories
            )
            if pq_file is None:  # unreadable/empty object — skip it
                continue
            if validate_schema is True:
                schema: Dict[str, str] = _data_types.athena_types_from_pyarrow_schema(
                    schema=pq_file.schema.to_arrow_schema(), partitions=None
                )[0]
                if last_schema is not None:
                    if schema != last_schema:
                        raise exceptions.InvalidSchemaConvergence(
                            f"Was detect at least 2 different schemas:\n"
                            f" - {last_path} -> {last_schema}\n"
                            f" - {path} -> {schema}"
                        )
                last_schema = schema
                last_path = path
            num_row_groups: int = pq_file.num_row_groups
            _logger.debug("num_row_groups: %s", num_row_groups)
            use_threads_flag: bool = use_threads if isinstance(use_threads, bool) else bool(use_threads > 1)
            # iter_batches is only available for pyarrow >= 3.0.0
            if callable(getattr(pq_file, "iter_batches", None)):
                chunk_generator = _pyarrow_chunk_generator(
                    pq_file=pq_file, chunked=chunked, columns=columns, use_threads_flag=use_threads_flag
                )
            else:
                chunk_generator = _row_group_chunk_generator(
                    pq_file=pq_file, columns=columns, use_threads_flag=use_threads_flag, num_row_groups=num_row_groups
                )
            for chunk in chunk_generator:
                df: pd.DataFrame = _arrowtable2df(
                    table=chunk,
                    categories=categories,
                    safe=safe,
                    map_types=map_types,
                    use_threads=use_threads,
                    dataset=dataset,
                    path=path,
                    path_root=path_root,
                )
                if chunked is True:
                    yield df
                elif isinstance(chunked, int) and chunked > 0:
                    # Prepend leftover rows from the previous chunk, then emit fixed-size slices.
                    if next_slice is not None:
                        df = _union(dfs=[next_slice, df], ignore_index=ignore_index)
                    while len(df.index) >= chunked:
                        yield df.iloc[:chunked, :].copy()
                        df = df.iloc[chunked:, :]
                    if df.empty:
                        next_slice = None
                    else:
                        next_slice = df
                else:
                    raise exceptions.InvalidArgument(f"chunked: {chunked}")
    # BUGFIX: flush the trailing partial slice — without this, leftover rows
    # (fewer than `chunked`) after the last file were silently dropped.
    if next_slice is not None:
        yield next_slice
def _create_table(  # pylint: disable=too-many-branches,too-many-statements
    database: str,
    table: str,
    description: Optional[str],
    parameters: Optional[Dict[str, str]],
    mode: str,
    catalog_versioning: bool,
    boto3_session: Optional[boto3.Session],
    table_input: Dict[str, Any],
    table_exist: bool,
    projection_enabled: bool,
    partitions_types: Optional[Dict[str, str]],
    columns_comments: Optional[Dict[str, str]],
    projection_types: Optional[Dict[str, str]],
    projection_ranges: Optional[Dict[str, str]],
    projection_values: Optional[Dict[str, str]],
    projection_intervals: Optional[Dict[str, str]],
    projection_digits: Optional[Dict[str, str]],
    catalog_id: Optional[str],
) -> None:
    """Create or update a Glue Catalog table from a prepared ``table_input``.

    Mutates ``table_input`` in place (description, parameters, partition
    projection settings, column comments) and then calls the Glue
    create_table/update_table API according to ``mode`` and whether the table
    already exists. ``_update_if_necessary`` may escalate ``mode`` to
    "update" whenever a tracked value actually changes.
    """
    # Description
    mode = _update_if_necessary(dic=table_input, key="Description", value=description, mode=mode)

    # Parameters
    parameters = parameters if parameters else {}
    for k, v in parameters.items():
        mode = _update_if_necessary(dic=table_input["Parameters"], key=k, value=v, mode=mode)

    # Projection
    if projection_enabled is True:
        table_input["Parameters"]["projection.enabled"] = "true"
        partitions_types = partitions_types if partitions_types else {}
        projection_types = projection_types if projection_types else {}
        projection_ranges = projection_ranges if projection_ranges else {}
        projection_values = projection_values if projection_values else {}
        projection_intervals = projection_intervals if projection_intervals else {}
        projection_digits = projection_digits if projection_digits else {}
        # Keys must match the (sanitized) partition column names stored in the catalog.
        projection_types = {sanitize_column_name(k): v for k, v in projection_types.items()}
        projection_ranges = {sanitize_column_name(k): v for k, v in projection_ranges.items()}
        projection_values = {sanitize_column_name(k): v for k, v in projection_values.items()}
        projection_intervals = {sanitize_column_name(k): v for k, v in projection_intervals.items()}
        projection_digits = {sanitize_column_name(k): v for k, v in projection_digits.items()}
        for k, v in projection_types.items():
            dtype: Optional[str] = partitions_types.get(k)
            if dtype is None:
                raise exceptions.InvalidArgumentCombination(
                    f"Column {k} appears as projected column but not as partitioned column."
                )
            if dtype == "date":
                table_input["Parameters"][f"projection.{k}.format"] = "yyyy-MM-dd"
            elif dtype == "timestamp":
                table_input["Parameters"][f"projection.{k}.format"] = "yyyy-MM-dd HH:mm:ss"
                table_input["Parameters"][f"projection.{k}.interval.unit"] = "SECONDS"
                table_input["Parameters"][f"projection.{k}.interval"] = "1"
        for k, v in projection_types.items():
            mode = _update_if_necessary(
                dic=table_input["Parameters"], key=f"projection.{k}.type", value=v, mode=mode
            )
        for k, v in projection_ranges.items():
            mode = _update_if_necessary(
                dic=table_input["Parameters"], key=f"projection.{k}.range", value=v, mode=mode
            )
        for k, v in projection_values.items():
            mode = _update_if_necessary(
                dic=table_input["Parameters"], key=f"projection.{k}.values", value=v, mode=mode
            )
        for k, v in projection_intervals.items():
            mode = _update_if_necessary(
                dic=table_input["Parameters"], key=f"projection.{k}.interval", value=str(v), mode=mode
            )
        for k, v in projection_digits.items():
            mode = _update_if_necessary(
                dic=table_input["Parameters"], key=f"projection.{k}.digits", value=str(v), mode=mode
            )
    else:
        table_input["Parameters"]["projection.enabled"] = "false"

    # Column comments
    columns_comments = columns_comments if columns_comments else {}
    columns_comments = {sanitize_column_name(k): v for k, v in columns_comments.items()}
    if columns_comments:
        for col in table_input["StorageDescriptor"]["Columns"]:
            name: str = col["Name"]
            if name in columns_comments:
                mode = _update_if_necessary(dic=col, key="Comment", value=columns_comments[name], mode=mode)
        for par in table_input["PartitionKeys"]:
            name = par["Name"]
            if name in columns_comments:
                mode = _update_if_necessary(dic=par, key="Comment", value=columns_comments[name], mode=mode)

    _logger.debug("table_input: %s", table_input)

    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    client_glue: boto3.client = _utils.client(service_name="glue", session=session)
    skip_archive: bool = not catalog_versioning
    if mode not in ("overwrite", "append", "overwrite_partitions", "update"):
        # BUGFIX: the message previously omitted 'update' even though it is accepted above.
        raise exceptions.InvalidArgument(
            f"{mode} is not a valid mode. It must be 'overwrite', 'append', 'overwrite_partitions' or 'update'."
        )
    if table_exist is True and mode == "overwrite":
        delete_all_partitions(table=table, database=database, catalog_id=catalog_id, boto3_session=session)
        _logger.debug("Updating table (%s)...", mode)
        client_glue.update_table(
            **_catalog_id(
                catalog_id=catalog_id, DatabaseName=database, TableInput=table_input, SkipArchive=skip_archive
            )
        )
    elif (table_exist is True) and (mode in ("append", "overwrite_partitions", "update")):
        if mode == "update":
            _logger.debug("Updating table (%s)...", mode)
            client_glue.update_table(
                **_catalog_id(
                    catalog_id=catalog_id, DatabaseName=database, TableInput=table_input, SkipArchive=skip_archive
                )
            )
    elif table_exist is False:
        try:
            _logger.debug("Creating table (%s)...", mode)
            client_glue.create_table(
                **_catalog_id(catalog_id=catalog_id, DatabaseName=database, TableInput=table_input)
            )
        except client_glue.exceptions.AlreadyExistsException:
            # Lost a race with a concurrent writer; only retry destructively for overwrite.
            if mode == "overwrite":
                _utils.try_it(
                    f=_overwrite_table,
                    ex=client_glue.exceptions.AlreadyExistsException,
                    client_glue=client_glue,
                    catalog_id=catalog_id,
                    database=database,
                    table=table,
                    table_input=table_input,
                    boto3_session=boto3_session,
                )
    _logger.debug("Leaving table as is (%s)...", mode)
def to_excel(
    df: pd.DataFrame,
    path: str,
    boto3_session: Optional[boto3.Session] = None,
    s3_additional_kwargs: Optional[Dict[str, Any]] = None,
    use_threads: Union[bool, int] = True,
    **pandas_kwargs: Any,
) -> str:
    """Write EXCEL file on Amazon S3.

    Note
    ----
    This function accepts any Pandas's read_excel() argument.
    https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_excel.html

    Note
    ----
    Depending on the file extension ('xlsx', 'xls', 'odf'...), an additional library
    might have to be installed first (e.g. xlrd).

    Note
    ----
    In case of `use_threads=True` the number of threads
    that will be spawned will be gotten from os.cpu_count().

    Parameters
    ----------
    df: pandas.DataFrame
        Pandas DataFrame https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html
    path : str
        Amazon S3 path (e.g. s3://bucket/filename.xlsx).
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 Session will be used if boto3_session receive None.
    s3_additional_kwargs : Optional[Dict[str, Any]]
        Forwarded to botocore requests.
        e.g. s3_additional_kwargs={'ServerSideEncryption': 'aws:kms', 'SSEKMSKeyId': 'YOUR_KMS_KEY_ARN'}
    use_threads : bool, int
        True to enable concurrent requests, False to disable multiple threads.
        If enabled os.cpu_count() will be used as the max number of threads.
        If integer is provided, specified number is used.
    pandas_kwargs:
        KEYWORD arguments forwarded to pandas.DataFrame.to_excel(). You can NOT pass `pandas_kwargs` explicit, just add
        valid Pandas arguments in the function call and Wrangler will accept it.
        e.g. wr.s3.to_excel(df, path, na_rep="", index=False)
        https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_excel.html

    Returns
    -------
    str
        Written S3 path.

    Examples
    --------
    Writing EXCEL file

    >>> import awswrangler as wr
    >>> import pandas as pd
    >>> wr.s3.to_excel(df, 's3://bucket/filename.xlsx')

    """
    if "pandas_kwargs" in pandas_kwargs:
        # BUGFIX: the example in this message previously rendered as `na_rep= , index=False)`
        # because adjacent string literals swallowed the quotes around the empty string.
        raise exceptions.InvalidArgument(
            "You can NOT pass `pandas_kwargs` explicit, just add valid "
            "Pandas arguments in the function call and Wrangler will accept it."
            'e.g. wr.s3.to_excel(df, path, na_rep="", index=False)'
        )
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    with open_s3_object(
        path=path,
        mode="wb",
        use_threads=use_threads,
        s3_additional_kwargs=s3_additional_kwargs,
        boto3_session=session,
    ) as f:
        _logger.debug("pandas_kwargs: %s", pandas_kwargs)
        df.to_excel(f, **pandas_kwargs)
    return path
def to_json(  # pylint: disable=too-many-arguments,too-many-locals,too-many-statements,too-many-branches
    df: pd.DataFrame,
    path: Optional[str] = None,
    index: bool = True,
    columns: Optional[List[str]] = None,
    use_threads: Union[bool, int] = True,
    boto3_session: Optional[boto3.Session] = None,
    s3_additional_kwargs: Optional[Dict[str, Any]] = None,
    sanitize_columns: bool = False,
    dataset: bool = False,
    filename_prefix: Optional[str] = None,
    partition_cols: Optional[List[str]] = None,
    bucketing_info: Optional[Tuple[List[str], int]] = None,
    concurrent_partitioning: bool = False,
    mode: Optional[str] = None,
    catalog_versioning: bool = False,
    schema_evolution: bool = True,
    database: Optional[str] = None,
    table: Optional[str] = None,
    table_type: Optional[str] = None,
    transaction_id: Optional[str] = None,
    dtype: Optional[Dict[str, str]] = None,
    description: Optional[str] = None,
    parameters: Optional[Dict[str, str]] = None,
    columns_comments: Optional[Dict[str, str]] = None,
    regular_partitions: bool = True,
    projection_enabled: bool = False,
    projection_types: Optional[Dict[str, str]] = None,
    projection_ranges: Optional[Dict[str, str]] = None,
    projection_values: Optional[Dict[str, str]] = None,
    projection_intervals: Optional[Dict[str, str]] = None,
    projection_digits: Optional[Dict[str, str]] = None,
    catalog_id: Optional[str] = None,
    **pandas_kwargs: Any,
) -> Union[List[str], Dict[str, Union[List[str], Dict[str, List[str]]]]]:
    """Write JSON file on Amazon S3.

    Note
    ----
    In case of `use_threads=True` the number of threads
    that will be spawned will be gotten from os.cpu_count().

    Note
    ----
    Compression: The minimum acceptable version to achieve it is Pandas 1.2.0 that requires Python >= 3.7.1.

    Parameters
    ----------
    df: pandas.DataFrame
        Pandas DataFrame https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html
    path : str
        Amazon S3 path (e.g. s3://bucket/filename.json).
    index : bool
        Write row names (index).
    columns : Optional[List[str]]
        Columns to write.
    use_threads : bool, int
        True to enable concurrent requests, False to disable multiple threads.
        If enabled os.cpu_count() will be used as the max number of threads.
        If integer is provided, specified number is used.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 Session will be used if boto3_session receive None.
    s3_additional_kwargs : Optional[Dict[str, Any]]
        Forwarded to botocore requests.
        e.g. s3_additional_kwargs={'ServerSideEncryption': 'aws:kms', 'SSEKMSKeyId': 'YOUR_KMS_KEY_ARN'}
    sanitize_columns : bool
        True to sanitize columns names or False to keep it as is.
        True value is forced if `dataset=True`.
    dataset : bool
        If True store as a dataset instead of ordinary file(s)
        If True, enable all following arguments:
        partition_cols, mode, database, table, description, parameters, columns_comments, concurrent_partitioning,
        catalog_versioning, projection_enabled, projection_types, projection_ranges, projection_values,
        projection_intervals, projection_digits, catalog_id, schema_evolution.
    filename_prefix: str, optional
        If dataset=True, add a filename prefix to the output files.
    partition_cols: List[str], optional
        List of column names that will be used to create partitions. Only takes effect if dataset=True.
    bucketing_info: Tuple[List[str], int], optional
        Tuple consisting of the column names used for bucketing as the first element and the number of buckets as the
        second element.
        Only `str`, `int` and `bool` are supported as column data types for bucketing.
    concurrent_partitioning: bool
        If True will increase the parallelism level during the partitions writing. It will decrease the
        writing time and increase the memory usage.
        https://aws-data-wrangler.readthedocs.io/en/2.13.0/tutorials/022%20-%20Writing%20Partitions%20Concurrently.html
    mode : str, optional
        ``append`` (Default), ``overwrite``, ``overwrite_partitions``. Only takes effect if dataset=True.
        For details check the related tutorial:
        https://aws-data-wrangler.readthedocs.io/en/2.13.0/stubs/awswrangler.s3.to_parquet.html#awswrangler.s3.to_parquet
    catalog_versioning : bool
        If True and `mode="overwrite"`, creates an archived version of the table catalog before updating it.
    schema_evolution : bool
        If True allows schema evolution (new or missing columns), otherwise an exception will be raised.
        (Only considered if dataset=True and mode in ("append", "overwrite_partitions"))
        Related tutorial:
        https://aws-data-wrangler.readthedocs.io/en/2.13.0/tutorials/014%20-%20Schema%20Evolution.html
    database : str, optional
        Glue/Athena catalog: Database name.
    table : str, optional
        Glue/Athena catalog: Table name.
    table_type: str, optional
        The type of the Glue Table. Set to EXTERNAL_TABLE if None
    transaction_id: str, optional
        The ID of the transaction when writing to a Governed Table.
    dtype : Dict[str, str], optional
        Dictionary of columns names and Athena/Glue types to be casted.
        Useful when you have columns with undetermined or mixed data types.
        (e.g. {'col name': 'bigint', 'col2 name': 'int'})
    description : str, optional
        Glue/Athena catalog: Table description
    parameters : Dict[str, str], optional
        Glue/Athena catalog: Key/value pairs to tag the table.
    columns_comments : Dict[str, str], optional
        Glue/Athena catalog:
        Columns names and the related comments (e.g. {'col0': 'Column 0.', 'col1': 'Column 1.', 'col2': 'Partition.'}).
    regular_partitions : bool
        Create regular partitions (Non projected partitions) on Glue Catalog.
        Disable when you will work only with Partition Projection.
        Keep enabled even when working with projections is useful to keep
        Redshift Spectrum working with the regular partitions.
    projection_enabled : bool
        Enable Partition Projection on Athena (https://docs.aws.amazon.com/athena/latest/ug/partition-projection.html)
    projection_types : Optional[Dict[str, str]]
        Dictionary of partitions names and Athena projections types.
        Valid types: "enum", "integer", "date", "injected"
        https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html
        (e.g. {'col_name': 'enum', 'col2_name': 'integer'})
    projection_ranges: Optional[Dict[str, str]]
        Dictionary of partitions names and Athena projections ranges.
        https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html
        (e.g. {'col_name': '0,10', 'col2_name': '-1,8675309'})
    projection_values: Optional[Dict[str, str]]
        Dictionary of partitions names and Athena projections values.
        https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html
        (e.g. {'col_name': 'A,B,Unknown', 'col2_name': 'foo,boo,bar'})
    projection_intervals: Optional[Dict[str, str]]
        Dictionary of partitions names and Athena projections intervals.
        https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html
        (e.g. {'col_name': '1', 'col2_name': '5'})
    projection_digits: Optional[Dict[str, str]]
        Dictionary of partitions names and Athena projections digits.
        https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html
        (e.g. {'col_name': '1', 'col2_name': '2'})
    catalog_id : str, optional
        The ID of the Data Catalog from which to retrieve Databases.
        If none is provided, the AWS account ID is used by default.
    pandas_kwargs:
        KEYWORD arguments forwarded to pandas.DataFrame.to_json(). You can NOT pass `pandas_kwargs` explicit,
        just add valid Pandas arguments in the function call and Wrangler will accept it.
        e.g. wr.s3.to_json(df, path, lines=True, date_format='iso')
        https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_json.html

    Returns
    -------
    Union[List[str], Dict[str, Union[List[str], Dict[str, List[str]]]]]
        List of written files when `dataset=False`, otherwise a dictionary with
        'paths' (written files) and 'partitions_values' (partitions added).

    Examples
    --------
    Writing JSON file

    >>> import awswrangler as wr
    >>> import pandas as pd
    >>> wr.s3.to_json(
    ...     df=pd.DataFrame({'col': [1, 2, 3]}),
    ...     path='s3://bucket/filename.json',
    ... )

    Writing JSON file using pandas_kwargs

    >>> import awswrangler as wr
    >>> import pandas as pd
    >>> wr.s3.to_json(
    ...     df=pd.DataFrame({'col': [1, 2, 3]}),
    ...     path='s3://bucket/filename.json',
    ...     lines=True,
    ...     date_format='iso'
    ... )

    Writing JSON file encrypted with a KMS key

    >>> import awswrangler as wr
    >>> import pandas as pd
    >>> wr.s3.to_json(
    ...     df=pd.DataFrame({'col': [1, 2, 3]}),
    ...     path='s3://bucket/filename.json',
    ...     s3_additional_kwargs={
    ...         'ServerSideEncryption': 'aws:kms',
    ...         'SSEKMSKeyId': 'YOUR_KMS_KEY_ARN'
    ...     }
    ... )

    """
    # Reject a literal pandas_kwargs={...} dict: keyword args must be spread
    # directly into the call so they land in **pandas_kwargs.
    if "pandas_kwargs" in pandas_kwargs:
        raise exceptions.InvalidArgument(
            "You can NOT pass `pandas_kwargs` explicit, just add valid "
            "Pandas arguments in the function call and Wrangler will accept it."
            "e.g. wr.s3.to_json(df, path, lines=True, date_format='iso')")
    # JSON compression-to-S3 relies on pandas >= 1.2.0 behavior.
    if pandas_kwargs.get("compression") and str(
            pd.__version__) < LooseVersion("1.2.0"):
        raise exceptions.InvalidArgument(
            f"JSON compression on S3 is not supported for Pandas version {pd.__version__}. "
            "The minimum acceptable version to achive it is Pandas 1.2.0 that requires Python >=3.7.1."
        )
    # Validate the argument combination (dataset/catalog/partitioning options).
    _validate_args(
        df=df,
        table=table,
        database=database,
        dataset=dataset,
        path=path,
        partition_cols=partition_cols,
        bucketing_info=bucketing_info,
        mode=mode,
        description=description,
        parameters=parameters,
        columns_comments=columns_comments,
    )

    # Initializing defaults
    partition_cols = partition_cols if partition_cols else []
    dtype = dtype if dtype else {}
    partitions_values: Dict[str, List[str]] = {}
    mode = "append" if mode is None else mode
    commit_trans: bool = False
    # An explicit transaction id implies a Lake Formation GOVERNED table.
    if transaction_id:
        table_type = "GOVERNED"
    # Every call gets a unique filename component to avoid collisions.
    filename_prefix = filename_prefix + uuid.uuid4().hex if filename_prefix else uuid.uuid4().hex
    session: boto3.Session = _utils.ensure_session(session=boto3_session)

    # Sanitize table to respect Athena's standards
    if (sanitize_columns is True) or (database is not None and table is not None):
        df, dtype, partition_cols = _sanitize(df=df, dtype=dtype, partition_cols=partition_cols)

    # Evaluating dtype
    catalog_table_input: Optional[Dict[str, Any]] = None
    if database and table:
        # Fetch the existing Glue table definition (if any) to reconcile
        # table type, storage location and compression with the arguments.
        catalog_table_input = catalog._get_table_input(  # pylint: disable=protected-access
            database=database, table=table, boto3_session=session, transaction_id=transaction_id,
            catalog_id=catalog_id)
        catalog_path: Optional[str] = None
        if catalog_table_input:
            table_type = catalog_table_input["TableType"]
            catalog_path = catalog_table_input.get("StorageDescriptor", {}).get("Location")
        if path is None:
            # No path given: fall back to the catalog location of an existing table.
            if catalog_path:
                path = catalog_path
            else:
                raise exceptions.InvalidArgumentValue(
                    "Glue table does not exist in the catalog. Please pass the `path` argument to create it."
                )
        elif path and catalog_path:
            # Both given: they must point at the same S3 location.
            if path.rstrip("/") != catalog_path.rstrip("/"):
                raise exceptions.InvalidArgumentValue(
                    f"The specified path: {path}, does not match the existing Glue catalog table path: {catalog_path}"
                )
        if pandas_kwargs.get("compression") not in ("gzip", "bz2", None):
            raise exceptions.InvalidArgumentCombination(
                "If database and table are given, you must use one of these compressions: gzip, bz2 or None."
            )
        if (table_type == "GOVERNED") and (not transaction_id):
            # GOVERNED tables require a transaction; start one implicitly and
            # remember to commit it at the end of this call.
            _logger.debug("`transaction_id` not specified for GOVERNED table, starting transaction")
            transaction_id = lakeformation.start_transaction(read_only=False, boto3_session=boto3_session)
            commit_trans = True

    # Cast DataFrame columns according to dtype and the catalog definition.
    df = _apply_dtype(df=df, dtype=dtype, catalog_table_input=catalog_table_input, mode=mode)

    if dataset is False:
        # Plain single-file write: no partitioning, no catalog integration.
        return _to_text(
            file_format="json",
            df=df,
            path=path,
            use_threads=use_threads,
            boto3_session=session,
            s3_additional_kwargs=s3_additional_kwargs,
            **pandas_kwargs,
        )

    compression: Optional[str] = pandas_kwargs.get("compression", None)
    df = df[columns] if columns else df

    columns_types: Dict[str, str] = {}
    partitions_types: Dict[str, str] = {}
    if database and table:
        # Derive Athena column/partition types from the DataFrame.
        columns_types, partitions_types = _data_types.athena_types_from_pandas_partitioned(
            df=df, index=index, partition_cols=partition_cols, dtype=dtype)
        if schema_evolution is False:
            _utils.check_schema_changes(columns_types=columns_types, table_input=catalog_table_input, mode=mode)

        if (catalog_table_input is None) and (table_type == "GOVERNED"):
            # A GOVERNED table must exist in the catalog before data is written;
            # create it now and re-fetch its definition.
            catalog._create_json_table(  # pylint: disable=protected-access
                database=database,
                table=table,
                path=path,  # type: ignore
                columns_types=columns_types,
                table_type=table_type,
                partitions_types=partitions_types,
                bucketing_info=bucketing_info,
                description=description,
                parameters=parameters,
                columns_comments=columns_comments,
                boto3_session=session,
                mode=mode,
                transaction_id=transaction_id,
                catalog_versioning=catalog_versioning,
                schema_evolution=schema_evolution,
                projection_enabled=projection_enabled,
                projection_types=projection_types,
                projection_ranges=projection_ranges,
                projection_values=projection_values,
                projection_intervals=projection_intervals,
                projection_digits=projection_digits,
                projection_storage_location_template=None,
                catalog_table_input=catalog_table_input,
                catalog_id=catalog_id,
                compression=pandas_kwargs.get("compression"),
                serde_library=None,
                serde_parameters=None,
            )
            catalog_table_input = catalog._get_table_input(  # pylint: disable=protected-access
                database=database,
                table=table,
                boto3_session=session,
                transaction_id=transaction_id,
                catalog_id=catalog_id,
            )

    # Write the dataset files (optionally partitioned/bucketed) to S3.
    paths, partitions_values = _to_dataset(
        func=_to_text,
        concurrent_partitioning=concurrent_partitioning,
        df=df,
        path_root=path,  # type: ignore
        filename_prefix=filename_prefix,
        index=index,
        compression=compression,
        catalog_id=catalog_id,
        database=database,
        table=table,
        table_type=table_type,
        transaction_id=transaction_id,
        use_threads=use_threads,
        partition_cols=partition_cols,
        partitions_types=partitions_types,
        bucketing_info=bucketing_info,
        mode=mode,
        boto3_session=session,
        s3_additional_kwargs=s3_additional_kwargs,
        file_format="json",
    )
    if database and table:
        try:
            # Preserve the SerDe of an existing table when updating the catalog.
            serde_info: Dict[str, Any] = {}
            if catalog_table_input:
                serde_info = catalog_table_input["StorageDescriptor"]["SerdeInfo"]
            serde_library: Optional[str] = serde_info.get("SerializationLibrary", None)
            serde_parameters: Optional[Dict[str, str]] = serde_info.get("Parameters", None)
            catalog._create_json_table(  # pylint: disable=protected-access
                database=database,
                table=table,
                path=path,  # type: ignore
                columns_types=columns_types,
                table_type=table_type,
                partitions_types=partitions_types,
                bucketing_info=bucketing_info,
                description=description,
                parameters=parameters,
                columns_comments=columns_comments,
                boto3_session=session,
                mode=mode,
                transaction_id=transaction_id,
                catalog_versioning=catalog_versioning,
                schema_evolution=schema_evolution,
                projection_enabled=projection_enabled,
                projection_types=projection_types,
                projection_ranges=projection_ranges,
                projection_values=projection_values,
                projection_intervals=projection_intervals,
                projection_digits=projection_digits,
                projection_storage_location_template=None,
                catalog_table_input=catalog_table_input,
                catalog_id=catalog_id,
                compression=pandas_kwargs.get("compression"),
                serde_library=serde_library,
                serde_parameters=serde_parameters,
            )
            # GOVERNED tables track partitions via transactions, so regular
            # partitions are only registered for non-governed tables.
            if partitions_values and (regular_partitions is True) and (table_type != "GOVERNED"):
                _logger.debug("partitions_values:\n%s", partitions_values)
                catalog.add_json_partitions(
                    database=database,
                    table=table,
                    partitions_values=partitions_values,
                    bucketing_info=bucketing_info,
                    boto3_session=session,
                    serde_library=serde_library,
                    serde_parameters=serde_parameters,
                    catalog_id=catalog_id,
                    columns_types=columns_types,
                    compression=pandas_kwargs.get("compression"),
                )
            if commit_trans:
                # Only commit transactions started by this call.
                lakeformation.commit_transaction(
                    transaction_id=transaction_id, boto3_session=boto3_session  # type: ignore
                )
        except Exception:
            # Catalog update failed: remove the objects already written so the
            # dataset does not end up with orphaned files.
            _logger.debug("Catalog write failed, cleaning up S3 (paths: %s).", paths)
            delete_objects(
                path=paths,
                use_threads=use_threads,
                boto3_session=session,
                s3_additional_kwargs=s3_additional_kwargs,
            )
            raise
    return {"paths": paths, "partitions_values": partitions_values}
def to_csv(  # pylint: disable=too-many-arguments,too-many-locals,too-many-statements,too-many-branches
    df: pd.DataFrame,
    path: Optional[str] = None,
    sep: str = ",",
    index: bool = True,
    columns: Optional[List[str]] = None,
    use_threads: Union[bool, int] = True,
    boto3_session: Optional[boto3.Session] = None,
    s3_additional_kwargs: Optional[Dict[str, Any]] = None,
    sanitize_columns: bool = False,
    dataset: bool = False,
    filename_prefix: Optional[str] = None,
    partition_cols: Optional[List[str]] = None,
    bucketing_info: Optional[Tuple[List[str], int]] = None,
    concurrent_partitioning: bool = False,
    mode: Optional[str] = None,
    catalog_versioning: bool = False,
    schema_evolution: bool = False,
    database: Optional[str] = None,
    table: Optional[str] = None,
    table_type: Optional[str] = None,
    transaction_id: Optional[str] = None,
    dtype: Optional[Dict[str, str]] = None,
    description: Optional[str] = None,
    parameters: Optional[Dict[str, str]] = None,
    columns_comments: Optional[Dict[str, str]] = None,
    regular_partitions: bool = True,
    projection_enabled: bool = False,
    projection_types: Optional[Dict[str, str]] = None,
    projection_ranges: Optional[Dict[str, str]] = None,
    projection_values: Optional[Dict[str, str]] = None,
    projection_intervals: Optional[Dict[str, str]] = None,
    projection_digits: Optional[Dict[str, str]] = None,
    catalog_id: Optional[str] = None,
    **pandas_kwargs: Any,
) -> Dict[str, Union[List[str], Dict[str, List[str]]]]:
    """Write CSV file or dataset on Amazon S3.

    The concept of Dataset goes beyond the simple idea of ordinary files and enable more
    complex features like partitioning and catalog integration (Amazon Athena/AWS Glue Catalog).

    Note
    ----
    If `database` and `table` arguments are passed, the table name and all column names
    will be automatically sanitized using `wr.catalog.sanitize_table_name` and
    `wr.catalog.sanitize_column_name`.
    Please, pass `sanitize_columns=True` to enforce this behaviour always.

    Note
    ----
    If `table` and `database` arguments are passed, `pandas_kwargs` will be ignored due to
    restrictive quoting, date_format, escapechar and encoding required by Athena/Glue Catalog.

    Note
    ----
    Compression: The minimum acceptable version to achieve it is Pandas 1.2.0 that requires Python >= 3.7.1.

    Note
    ----
    On `append` mode, the `parameters` will be upserted on an existing table.

    Note
    ----
    In case of `use_threads=True` the number of threads
    that will be spawned will be gotten from os.cpu_count().

    Parameters
    ----------
    df: pandas.DataFrame
        Pandas DataFrame https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html
    path : str, optional
        Amazon S3 path (e.g. s3://bucket/prefix/filename.csv) (for dataset e.g. ``s3://bucket/prefix``).
        Required if dataset=False or when creating a new dataset
    sep : str
        String of length 1. Field delimiter for the output file.
    index : bool
        Write row names (index).
    columns : Optional[List[str]]
        Columns to write.
    use_threads : bool, int
        True to enable concurrent requests, False to disable multiple threads.
        If enabled os.cpu_count() will be used as the max number of threads.
        If integer is provided, specified number is used.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 Session will be used if boto3_session receive None.
    s3_additional_kwargs : Optional[Dict[str, Any]]
        Forwarded to botocore requests.
        e.g. s3_additional_kwargs={'ServerSideEncryption': 'aws:kms', 'SSEKMSKeyId': 'YOUR_KMS_KEY_ARN'}
    sanitize_columns : bool
        True to sanitize columns names or False to keep it as is.
        True value is forced if `dataset=True`.
    dataset : bool
        If True store as a dataset instead of ordinary file(s)
        If True, enable all following arguments:
        partition_cols, mode, database, table, description, parameters, columns_comments, concurrent_partitioning,
        catalog_versioning, projection_enabled, projection_types, projection_ranges, projection_values,
        projection_intervals, projection_digits, catalog_id, schema_evolution.
    filename_prefix: str, optional
        If dataset=True, add a filename prefix to the output files.
    partition_cols: List[str], optional
        List of column names that will be used to create partitions. Only takes effect if dataset=True.
    bucketing_info: Tuple[List[str], int], optional
        Tuple consisting of the column names used for bucketing as the first element and the number of buckets as the
        second element.
        Only `str`, `int` and `bool` are supported as column data types for bucketing.
    concurrent_partitioning: bool
        If True will increase the parallelism level during the partitions writing. It will decrease the
        writing time and increase the memory usage.
        https://aws-data-wrangler.readthedocs.io/en/2.13.0/tutorials/022%20-%20Writing%20Partitions%20Concurrently.html
    mode : str, optional
        ``append`` (Default), ``overwrite``, ``overwrite_partitions``. Only takes effect if dataset=True.
        For details check the related tutorial:
        https://aws-data-wrangler.readthedocs.io/en/2.13.0/stubs/awswrangler.s3.to_parquet.html#awswrangler.s3.to_parquet
    catalog_versioning : bool
        If True and `mode="overwrite"`, creates an archived version of the table catalog before updating it.
    schema_evolution : bool
        If True allows schema evolution (new or missing columns), otherwise an exception will be raised.
        (Only considered if dataset=True and mode in ("append", "overwrite_partitions")). False by default.
        Related tutorial:
        https://aws-data-wrangler.readthedocs.io/en/2.13.0/tutorials/014%20-%20Schema%20Evolution.html
    database : str, optional
        Glue/Athena catalog: Database name.
    table : str, optional
        Glue/Athena catalog: Table name.
    table_type: str, optional
        The type of the Glue Table. Set to EXTERNAL_TABLE if None
    transaction_id: str, optional
        The ID of the transaction when writing to a Governed Table.
    dtype : Dict[str, str], optional
        Dictionary of columns names and Athena/Glue types to be casted.
        Useful when you have columns with undetermined or mixed data types.
        (e.g. {'col name': 'bigint', 'col2 name': 'int'})
    description : str, optional
        Glue/Athena catalog: Table description
    parameters : Dict[str, str], optional
        Glue/Athena catalog: Key/value pairs to tag the table.
    columns_comments : Dict[str, str], optional
        Glue/Athena catalog:
        Columns names and the related comments (e.g. {'col0': 'Column 0.', 'col1': 'Column 1.', 'col2': 'Partition.'}).
    regular_partitions : bool
        Create regular partitions (Non projected partitions) on Glue Catalog.
        Disable when you will work only with Partition Projection.
        Keep enabled even when working with projections is useful to keep
        Redshift Spectrum working with the regular partitions.
    projection_enabled : bool
        Enable Partition Projection on Athena (https://docs.aws.amazon.com/athena/latest/ug/partition-projection.html)
    projection_types : Optional[Dict[str, str]]
        Dictionary of partitions names and Athena projections types.
        Valid types: "enum", "integer", "date", "injected"
        https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html
        (e.g. {'col_name': 'enum', 'col2_name': 'integer'})
    projection_ranges: Optional[Dict[str, str]]
        Dictionary of partitions names and Athena projections ranges.
        https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html
        (e.g. {'col_name': '0,10', 'col2_name': '-1,8675309'})
    projection_values: Optional[Dict[str, str]]
        Dictionary of partitions names and Athena projections values.
        https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html
        (e.g. {'col_name': 'A,B,Unknown', 'col2_name': 'foo,boo,bar'})
    projection_intervals: Optional[Dict[str, str]]
        Dictionary of partitions names and Athena projections intervals.
        https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html
        (e.g. {'col_name': '1', 'col2_name': '5'})
    projection_digits: Optional[Dict[str, str]]
        Dictionary of partitions names and Athena projections digits.
        https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html
        (e.g. {'col_name': '1', 'col2_name': '2'})
    catalog_id : str, optional
        The ID of the Data Catalog from which to retrieve Databases.
        If none is provided, the AWS account ID is used by default.
    pandas_kwargs :
        KEYWORD arguments forwarded to pandas.DataFrame.to_csv(). You can NOT pass `pandas_kwargs` explicit,
        just add valid Pandas arguments in the function call and Wrangler will accept it.
        e.g. wr.s3.to_csv(df, path, sep='|', na_rep='NULL', decimal=',')
        https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_csv.html

    Returns
    -------
    Dict[str, Union[List[str], Dict[str, List[str]]]]
        Dictionary with:
        'paths': List of all stored files paths on S3.
        'partitions_values': Dictionary of partitions added with keys as S3 path locations
        and values as a list of partitions values as str.

    Examples
    --------
    Writing single file

    >>> import awswrangler as wr
    >>> import pandas as pd
    >>> wr.s3.to_csv(
    ...     df=pd.DataFrame({'col': [1, 2, 3]}),
    ...     path='s3://bucket/prefix/my_file.csv',
    ... )
    {
        'paths': ['s3://bucket/prefix/my_file.csv'],
        'partitions_values': {}
    }

    Writing single file with pandas_kwargs

    >>> import awswrangler as wr
    >>> import pandas as pd
    >>> wr.s3.to_csv(
    ...     df=pd.DataFrame({'col': [1, 2, 3]}),
    ...     path='s3://bucket/prefix/my_file.csv',
    ...     sep='|',
    ...     na_rep='NULL',
    ...     decimal=','
    ... )
    {
        'paths': ['s3://bucket/prefix/my_file.csv'],
        'partitions_values': {}
    }

    Writing single file encrypted with a KMS key

    >>> import awswrangler as wr
    >>> import pandas as pd
    >>> wr.s3.to_csv(
    ...     df=pd.DataFrame({'col': [1, 2, 3]}),
    ...     path='s3://bucket/prefix/my_file.csv',
    ...     s3_additional_kwargs={
    ...         'ServerSideEncryption': 'aws:kms',
    ...         'SSEKMSKeyId': 'YOUR_KMS_KEY_ARN'
    ...     }
    ... )
    {
        'paths': ['s3://bucket/prefix/my_file.csv'],
        'partitions_values': {}
    }

    Writing partitioned dataset

    >>> import awswrangler as wr
    >>> import pandas as pd
    >>> wr.s3.to_csv(
    ...     df=pd.DataFrame({
    ...         'col': [1, 2, 3],
    ...         'col2': ['A', 'A', 'B']
    ...     }),
    ...     path='s3://bucket/prefix',
    ...     dataset=True,
    ...     partition_cols=['col2']
    ... )
    {
        'paths': ['s3://.../col2=A/x.csv', 's3://.../col2=B/y.csv'],
        'partitions_values': {
            's3://.../col2=A/': ['A'],
            's3://.../col2=B/': ['B']
        }
    }

    Writing bucketed dataset

    >>> import awswrangler as wr
    >>> import pandas as pd
    >>> wr.s3.to_csv(
    ...     df=pd.DataFrame({
    ...         'col': [1, 2, 3],
    ...         'col2': ['A', 'A', 'B']
    ...     }),
    ...     path='s3://bucket/prefix',
    ...     dataset=True,
    ...     bucketing_info=(["col2"], 2)
    ... )
    {
        'paths': ['s3://.../x_bucket-00000.csv', 's3://.../col2=B/x_bucket-00001.csv'],
        'partitions_values': {}
    }

    Writing dataset to S3 with metadata on Athena/Glue Catalog.

    >>> import awswrangler as wr
    >>> import pandas as pd
    >>> wr.s3.to_csv(
    ...     df=pd.DataFrame({
    ...         'col': [1, 2, 3],
    ...         'col2': ['A', 'A', 'B']
    ...     }),
    ...     path='s3://bucket/prefix',
    ...     dataset=True,
    ...     partition_cols=['col2'],
    ...     database='default',  # Athena/Glue database
    ...     table='my_table'  # Athena/Glue table
    ... )
    {
        'paths': ['s3://.../col2=A/x.csv', 's3://.../col2=B/y.csv'],
        'partitions_values': {
            's3://.../col2=A/': ['A'],
            's3://.../col2=B/': ['B']
        }
    }

    Writing dataset to Glue governed table

    >>> import awswrangler as wr
    >>> import pandas as pd
    >>> wr.s3.to_csv(
    ...     df=pd.DataFrame({
    ...         'col': [1, 2, 3],
    ...         'col2': ['A', 'A', 'B'],
    ...         'col3': [None, None, None]
    ...     }),
    ...     dataset=True,
    ...     mode='append',
    ...     database='default',  # Athena/Glue database
    ...     table='my_table',  # Athena/Glue table
    ...     table_type='GOVERNED',
    ...     transaction_id="xxx",
    ... )
    {
        'paths': ['s3://.../x.csv'],
        'partitions_values': {}
    }

    Writing dataset casting empty column data type

    >>> import awswrangler as wr
    >>> import pandas as pd
    >>> wr.s3.to_csv(
    ...     df=pd.DataFrame({
    ...         'col': [1, 2, 3],
    ...         'col2': ['A', 'A', 'B'],
    ...         'col3': [None, None, None]
    ...     }),
    ...     path='s3://bucket/prefix',
    ...     dataset=True,
    ...     database='default',  # Athena/Glue database
    ...     table='my_table',  # Athena/Glue table
    ...     dtype={'col3': 'date'}
    ... )
    {
        'paths': ['s3://.../x.csv'],
        'partitions_values': {}
    }

    """
    # Reject a literal pandas_kwargs={...} dict: keyword args must be spread
    # directly into the call so they land in **pandas_kwargs.
    if "pandas_kwargs" in pandas_kwargs:
        raise exceptions.InvalidArgument(
            "You can NOT pass `pandas_kwargs` explicit, just add valid "
            "Pandas arguments in the function call and Wrangler will accept it."
            "e.g. wr.s3.to_csv(df, path, sep='|', na_rep='NULL', decimal=',', compression='gzip')"
        )
    # CSV compression-to-S3 relies on pandas >= 1.2.0 behavior.
    if pandas_kwargs.get("compression") and str(
            pd.__version__) < LooseVersion("1.2.0"):
        raise exceptions.InvalidArgument(
            f"CSV compression on S3 is not supported for Pandas version {pd.__version__}. "
            "The minimum acceptable version to achive it is Pandas 1.2.0 that requires Python >=3.7.1."
        )
    # Validate the argument combination (dataset/catalog/partitioning options).
    _validate_args(
        df=df,
        table=table,
        database=database,
        dataset=dataset,
        path=path,
        partition_cols=partition_cols,
        bucketing_info=bucketing_info,
        mode=mode,
        description=description,
        parameters=parameters,
        columns_comments=columns_comments,
    )

    # Initializing defaults
    partition_cols = partition_cols if partition_cols else []
    dtype = dtype if dtype else {}
    partitions_values: Dict[str, List[str]] = {}
    mode = "append" if mode is None else mode
    commit_trans: bool = False
    # An explicit transaction id implies a Lake Formation GOVERNED table.
    if transaction_id:
        table_type = "GOVERNED"
    # Every call gets a unique filename component to avoid collisions.
    filename_prefix = filename_prefix + uuid.uuid4().hex if filename_prefix else uuid.uuid4().hex
    session: boto3.Session = _utils.ensure_session(session=boto3_session)

    # Sanitize table to respect Athena's standards
    if (sanitize_columns is True) or (database is not None and table is not None):
        df, dtype, partition_cols = _sanitize(df=df, dtype=dtype, partition_cols=partition_cols)

    # Evaluating dtype
    catalog_table_input: Optional[Dict[str, Any]] = None
    if database and table:
        # Fetch the existing Glue table definition (if any) to reconcile
        # table type, storage location and compression with the arguments.
        catalog_table_input = catalog._get_table_input(  # pylint: disable=protected-access
            database=database, table=table, boto3_session=session, transaction_id=transaction_id,
            catalog_id=catalog_id)
        catalog_path: Optional[str] = None
        if catalog_table_input:
            table_type = catalog_table_input["TableType"]
            catalog_path = catalog_table_input.get("StorageDescriptor", {}).get("Location")
        if path is None:
            # No path given: fall back to the catalog location of an existing table.
            if catalog_path:
                path = catalog_path
            else:
                raise exceptions.InvalidArgumentValue(
                    "Glue table does not exist in the catalog. Please pass the `path` argument to create it."
                )
        elif path and catalog_path:
            # Both given: they must point at the same S3 location.
            if path.rstrip("/") != catalog_path.rstrip("/"):
                raise exceptions.InvalidArgumentValue(
                    f"The specified path: {path}, does not match the existing Glue catalog table path: {catalog_path}"
                )
        if pandas_kwargs.get("compression") not in ("gzip", "bz2", None):
            raise exceptions.InvalidArgumentCombination(
                "If database and table are given, you must use one of these compressions: gzip, bz2 or None."
            )
        if (table_type == "GOVERNED") and (not transaction_id):
            # GOVERNED tables require a transaction; start one implicitly and
            # remember to commit it at the end of this call.
            _logger.debug("`transaction_id` not specified for GOVERNED table, starting transaction")
            transaction_id = lakeformation.start_transaction(read_only=False, boto3_session=boto3_session)
            commit_trans = True

    # Cast DataFrame columns according to dtype and the catalog definition.
    df = _apply_dtype(df=df, dtype=dtype, catalog_table_input=catalog_table_input, mode=mode)

    paths: List[str] = []
    if dataset is False:
        # Plain single-file write: pandas gets full control over formatting.
        pandas_kwargs["sep"] = sep
        pandas_kwargs["index"] = index
        pandas_kwargs["columns"] = columns
        _to_text(
            file_format="csv",
            df=df,
            use_threads=use_threads,
            path=path,
            boto3_session=session,
            s3_additional_kwargs=s3_additional_kwargs,
            **pandas_kwargs,
        )
        paths = [path]  # type: ignore
    else:
        compression: Optional[str] = pandas_kwargs.get("compression", None)
        if database and table:
            # Athena/Glue requires a fixed CSV dialect, so user pandas_kwargs
            # are dropped and replaced by these restrictive settings.
            quoting: Optional[int] = csv.QUOTE_NONE
            escapechar: Optional[str] = "\\"
            header: Union[bool, List[str]] = pandas_kwargs.get("header", False)
            date_format: Optional[str] = "%Y-%m-%d %H:%M:%S.%f"
            pd_kwargs: Dict[str, Any] = {}
        else:
            # No catalog involved: honor the user's pandas formatting options,
            # stripping the ones handled explicitly below.
            quoting = pandas_kwargs.get("quoting", None)
            escapechar = pandas_kwargs.get("escapechar", None)
            header = pandas_kwargs.get("header", True)
            date_format = pandas_kwargs.get("date_format", None)
            pd_kwargs = pandas_kwargs.copy()
            pd_kwargs.pop("quoting", None)
            pd_kwargs.pop("escapechar", None)
            pd_kwargs.pop("header", None)
            pd_kwargs.pop("date_format", None)
            pd_kwargs.pop("compression", None)

        df = df[columns] if columns else df

        columns_types: Dict[str, str] = {}
        partitions_types: Dict[str, str] = {}
        if database and table:
            # Derive Athena column/partition types from the DataFrame.
            columns_types, partitions_types = _data_types.athena_types_from_pandas_partitioned(
                df=df, index=index, partition_cols=partition_cols, dtype=dtype, index_left=True)
            if schema_evolution is False:
                _utils.check_schema_changes(columns_types=columns_types, table_input=catalog_table_input, mode=mode)

            if (catalog_table_input is None) and (table_type == "GOVERNED"):
                # A GOVERNED table must exist in the catalog before data is
                # written; create it now and re-fetch its definition.
                catalog._create_csv_table(  # pylint: disable=protected-access
                    database=database,
                    table=table,
                    path=path,
                    columns_types=columns_types,
                    table_type=table_type,
                    partitions_types=partitions_types,
                    bucketing_info=bucketing_info,
                    description=description,
                    parameters=parameters,
                    columns_comments=columns_comments,
                    boto3_session=session,
                    mode=mode,
                    transaction_id=transaction_id,
                    schema_evolution=schema_evolution,
                    catalog_versioning=catalog_versioning,
                    sep=sep,
                    projection_enabled=projection_enabled,
                    projection_types=projection_types,
                    projection_ranges=projection_ranges,
                    projection_values=projection_values,
                    projection_intervals=projection_intervals,
                    projection_digits=projection_digits,
                    projection_storage_location_template=None,
                    catalog_table_input=catalog_table_input,
                    catalog_id=catalog_id,
                    compression=pandas_kwargs.get("compression"),
                    skip_header_line_count=None,
                    serde_library=None,
                    serde_parameters=None,
                )
                catalog_table_input = catalog._get_table_input(  # pylint: disable=protected-access
                    database=database,
                    table=table,
                    boto3_session=session,
                    transaction_id=transaction_id,
                    catalog_id=catalog_id,
                )

        # Write the dataset files (optionally partitioned/bucketed) to S3.
        paths, partitions_values = _to_dataset(
            func=_to_text,
            concurrent_partitioning=concurrent_partitioning,
            df=df,
            path_root=path,  # type: ignore
            index=index,
            sep=sep,
            compression=compression,
            catalog_id=catalog_id,
            database=database,
            table=table,
            table_type=table_type,
            transaction_id=transaction_id,
            filename_prefix=filename_prefix,
            use_threads=use_threads,
            partition_cols=partition_cols,
            partitions_types=partitions_types,
            bucketing_info=bucketing_info,
            mode=mode,
            boto3_session=session,
            s3_additional_kwargs=s3_additional_kwargs,
            file_format="csv",
            quoting=quoting,
            escapechar=escapechar,
            header=header,
            date_format=date_format,
            **pd_kwargs,
        )
        if database and table:
            try:
                # Preserve the SerDe of an existing table when updating the catalog.
                serde_info: Dict[str, Any] = {}
                if catalog_table_input:
                    serde_info = catalog_table_input["StorageDescriptor"]["SerdeInfo"]
                serde_library: Optional[str] = serde_info.get("SerializationLibrary", None)
                serde_parameters: Optional[Dict[str, str]] = serde_info.get("Parameters", None)
                catalog._create_csv_table(  # pylint: disable=protected-access
                    database=database,
                    table=table,
                    path=path,
                    columns_types=columns_types,
                    table_type=table_type,
                    partitions_types=partitions_types,
                    bucketing_info=bucketing_info,
                    description=description,
                    parameters=parameters,
                    columns_comments=columns_comments,
                    boto3_session=session,
                    mode=mode,
                    transaction_id=transaction_id,
                    catalog_versioning=catalog_versioning,
                    schema_evolution=schema_evolution,
                    sep=sep,
                    projection_enabled=projection_enabled,
                    projection_types=projection_types,
                    projection_ranges=projection_ranges,
                    projection_values=projection_values,
                    projection_intervals=projection_intervals,
                    projection_digits=projection_digits,
                    projection_storage_location_template=None,
                    catalog_table_input=catalog_table_input,
                    catalog_id=catalog_id,
                    compression=pandas_kwargs.get("compression"),
                    skip_header_line_count=True if header else None,
                    serde_library=serde_library,
                    serde_parameters=serde_parameters,
                )
                # GOVERNED tables track partitions via transactions, so regular
                # partitions are only registered for non-governed tables.
                if partitions_values and (regular_partitions is True) and (table_type != "GOVERNED"):
                    _logger.debug("partitions_values:\n%s", partitions_values)
                    catalog.add_csv_partitions(
                        database=database,
                        table=table,
                        partitions_values=partitions_values,
                        bucketing_info=bucketing_info,
                        boto3_session=session,
                        sep=sep,
                        serde_library=serde_library,
                        serde_parameters=serde_parameters,
                        catalog_id=catalog_id,
                        columns_types=columns_types,
                        compression=pandas_kwargs.get("compression"),
                    )
                if commit_trans:
                    # Only commit transactions started by this call.
                    lakeformation.commit_transaction(
                        transaction_id=transaction_id, boto3_session=boto3_session  # type: ignore
                    )
            except Exception:
                # Catalog update failed: remove the objects already written so
                # the dataset does not end up with orphaned files.
                _logger.debug(
                    "Catalog write failed, cleaning up S3 (paths: %s).", paths)
                delete_objects(
                    path=paths,
                    use_threads=use_threads,
                    boto3_session=session,
                    s3_additional_kwargs=s3_additional_kwargs,
                )
                raise
    return {"paths": paths, "partitions_values": partitions_values}
def read_json(
    path: Union[str, List[str]],
    path_suffix: Union[str, List[str], None] = None,
    path_ignore_suffix: Union[str, List[str], None] = None,
    orient: str = "columns",
    use_threads: bool = True,
    last_modified_begin: Optional[datetime.datetime] = None,
    last_modified_end: Optional[datetime.datetime] = None,
    boto3_session: Optional[boto3.Session] = None,
    s3_additional_kwargs: Optional[Dict[str, str]] = None,
    chunksize: Optional[int] = None,
    dataset: bool = False,
    partition_filter: Optional[Callable[[Dict[str, str]], bool]] = None,
    ignore_empty: bool = True,
    **pandas_kwargs: Any,
) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]:
    """Read JSON file(s) from a received S3 prefix or list of S3 objects paths.

    This function accepts Unix shell-style wildcards in the path argument.
    * (matches everything), ? (matches any single character),
    [seq] (matches any character in seq), [!seq] (matches any character not in seq).

    Note
    ----
    For partial and gradual reading use the argument ``chunksize`` instead of ``iterator``.

    Note
    ----
    In case of `use_threads=True` the number of threads
    that will be spawned will be gotten from os.cpu_count().

    Note
    ----
    The filter by last_modified begin last_modified end is applied after list all S3 files

    Parameters
    ----------
    path : Union[str, List[str]]
        S3 prefix (accepts Unix shell-style wildcards)
        (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. ``[s3://bucket/key0, s3://bucket/key1]``).
    path_suffix: Union[str, List[str], None]
        Suffix or List of suffixes for filtering S3 keys.
    path_ignore_suffix: Union[str, List[str], None]
        Suffix or List of suffixes for S3 keys to be ignored.
    orient : str
        Same as Pandas: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_json.html
    use_threads : bool
        True to enable concurrent requests, False to disable multiple threads.
        If enabled os.cpu_count() will be used as the max number of threads.
    last_modified_begin
        Filter the s3 files by the Last modified date of the object.
        The filter is applied only after list all s3 files.
    last_modified_end: datetime, optional
        Filter the s3 files by the Last modified date of the object.
        The filter is applied only after list all s3 files.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receive None.
    s3_additional_kwargs:
        Forward to botocore requests, only "SSECustomerAlgorithm" and "SSECustomerKey" arguments will be considered.
    chunksize: int, optional
        If specified, return an generator where chunksize is the number of rows to include in each chunk.
    dataset: bool
        If `True` read a JSON dataset instead of simple file(s) loading all the related partitions as columns.
        If `True`, the `lines=True` will be assumed by default.
    partition_filter: Optional[Callable[[Dict[str, str]], bool]]
        Callback Function filters to apply on PARTITION columns (PUSH-DOWN filter).
        This function MUST receive a single argument (Dict[str, str]) where keys are partitions
        names and values are partitions values. Partitions values will be always strings extracted from S3.
        This function MUST return a bool, True to read the partition or False to ignore it.
        Ignored if `dataset=False`.
        E.g ``lambda x: True if x["year"] == "2020" and x["month"] == "1" else False``
        https://github.com/awslabs/aws-data-wrangler/blob/master/tutorials/023%20-%20Flexible%20Partitions%20Filter.ipynb
    ignore_empty: bool
        Ignore files with 0 bytes.
    pandas_kwargs:
        KEYWORD arguments forwarded to pandas.read_json(). You can NOT pass `pandas_kwargs` explicit, just add valid
        Pandas arguments in the function call and Wrangler will accept it.
        e.g. wr.s3.read_json('s3://bucket/prefix/', lines=True, keep_default_dates=True)
        https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_json.html

    Returns
    -------
    Union[pandas.DataFrame, Generator[pandas.DataFrame, None, None]]
        Pandas DataFrame or a Generator in case of `chunksize != None`.

    Examples
    --------
    Reading all JSON files under a prefix

    >>> import awswrangler as wr
    >>> df = wr.s3.read_json(path='s3://bucket/prefix/')

    Reading all JSON files under a prefix and using pandas_kwargs

    >>> import awswrangler as wr
    >>> df = wr.s3.read_json('s3://bucket/prefix/', lines=True, keep_default_dates=True)

    Reading all JSON files from a list

    >>> import awswrangler as wr
    >>> df = wr.s3.read_json(path=['s3://bucket/filename0.json', 's3://bucket/filename1.json'])

    Reading in chunks of 100 lines

    >>> import awswrangler as wr
    >>> dfs = wr.s3.read_json(path=['s3://bucket/0.json', 's3://bucket/1.json'], chunksize=100, lines=True)
    >>> for df in dfs:
    >>>     print(df)  # 100 lines Pandas DataFrame

    Reading JSON Dataset with PUSH-DOWN filter over partitions

    >>> import awswrangler as wr
    >>> my_filter = lambda x: True if x["city"].startswith("new") else False
    >>> df = wr.s3.read_json(path, dataset=True, partition_filter=my_filter)

    """
    if "pandas_kwargs" in pandas_kwargs:
        raise exceptions.InvalidArgument(
            "You can NOT pass `pandas_kwargs` explicit, just add valid "
            "Pandas arguments in the function call and Wrangler will accept it."
            "e.g. wr.s3.read_json(path, lines=True, keep_default_dates=True)")
    if (dataset is True) and ("lines" not in pandas_kwargs):
        # A JSON dataset is assumed to be line-delimited unless the caller says otherwise.
        pandas_kwargs["lines"] = True
    pandas_kwargs["orient"] = orient
    # For these orients the index carries information, so it must be kept when
    # concatenating chunks/files; for the others a fresh RangeIndex is fine.
    ignore_index: bool = orient not in ("split", "index", "columns")
    return _read_text(
        parser_func=pd.read_json,
        path=path,
        path_suffix=path_suffix,
        path_ignore_suffix=path_ignore_suffix,
        # BUGFIX: _read_text declares `ignore_empty` as a required keyword; the
        # original call omitted it, raising TypeError on every invocation.
        ignore_empty=ignore_empty,
        use_threads=use_threads,
        boto3_session=boto3_session,
        s3_additional_kwargs=s3_additional_kwargs,
        chunksize=chunksize,
        dataset=dataset,
        partition_filter=partition_filter,
        last_modified_begin=last_modified_begin,
        last_modified_end=last_modified_end,
        ignore_index=ignore_index,
        **pandas_kwargs,
    )