def read_parquet_metadata_internal(
    path: Union[str, List[str]],
    dtype: Optional[Dict[str, str]],
    sampling: float,
    dataset: bool,
    use_threads: bool,
    boto3_session: Optional[boto3.Session],
) -> Tuple[Dict[str, str], Optional[Dict[str, str]], Optional[Dict[str, List[str]]]]:
    """Handle wr.s3.read_parquet_metadata internally."""
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    if dataset is True:
        if isinstance(path, str):
            _path: Optional[str] = path if path.endswith("/") else f"{path}/"
            paths: List[str] = path2list(path=_path, boto3_session=session)
        else:  # pragma: no cover
            raise exceptions.InvalidArgumentType("Argument <path> must be str if dataset=True.")
    else:
        if isinstance(path, str):
            _path = None
            paths = path2list(path=path, boto3_session=session)
        elif isinstance(path, list):
            _path = None
            paths = path
        else:  # pragma: no cover
            raise exceptions.InvalidArgumentType(f"Argument path must be str or List[str] instead of {type(path)}.")
    schemas: List[Dict[str, str]] = [
        _read_parquet_metadata_file(path=x, use_threads=use_threads, boto3_session=session)
        for x in _utils.list_sampling(lst=paths, sampling=sampling)
    ]
    _logger.debug("schemas: %s", schemas)
    columns_types: Dict[str, str] = {}
    for schema in schemas:
        for column, _dtype in schema.items():
            if (column in columns_types) and (columns_types[column] != _dtype):  # pragma: no cover
                raise exceptions.InvalidSchemaConvergence(
                    f"Detected at least 2 different types in column {column} ({columns_types[column]} and {_dtype})."
                )
            columns_types[column] = _dtype
    partitions_types: Optional[Dict[str, str]] = None
    partitions_values: Optional[Dict[str, List[str]]] = None
    if (dataset is True) and (_path is not None):
        partitions_types, partitions_values = _utils.extract_partitions_metadata_from_paths(path=_path, paths=paths)
    if dtype:
        for k, v in dtype.items():
            if columns_types and k in columns_types:
                columns_types[k] = v
            if partitions_types and k in partitions_types:
                partitions_types[k] = v
    _logger.debug("columns_types: %s", columns_types)
    return columns_types, partitions_types, partitions_values
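
# Illustrative sketch (not part of the library): the tuple returned above maps column names to
# Athena types, plus partition metadata when dataset=True. The bucket, column names and values
# below are hypothetical, and the exact shape of partitions_values is an assumption.
#
# columns_types, partitions_types, partitions_values = read_parquet_metadata_internal(
#     path="s3://bucket/dataset/",
#     dtype={"col0": "bigint"},  # forces col0 to bigint, overriding the inferred type
#     sampling=1.0,
#     dataset=True,
#     use_threads=True,
#     boto3_session=None,
# )
# columns_types      -> {"col0": "bigint", "col1": "string"}
# partitions_types   -> {"year": "string"}
# partitions_values  -> {"s3://bucket/dataset/year=2020/": ["2020"]}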
def _read_parquet_init(
    path: Union[str, List[str]],
    filters: Optional[Union[List[Tuple], List[List[Tuple]]]] = None,
    categories: Optional[List[str]] = None,
    validate_schema: bool = True,
    dataset: bool = False,
    use_threads: bool = True,
    boto3_session: Optional[boto3.Session] = None,
    s3_additional_kwargs: Optional[Dict[str, str]] = None,
) -> pyarrow.parquet.ParquetDataset:
    """Encapsulate all initialization before the use of the pyarrow.parquet.ParquetDataset."""
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    if dataset is False:
        path_or_paths: Union[str, List[str]] = path2list(path=path, boto3_session=session)
    elif isinstance(path, str):
        path_or_paths = path[:-1] if path.endswith("/") else path
    else:
        path_or_paths = path
    _logger.debug("path_or_paths: %s", path_or_paths)
    fs: s3fs.S3FileSystem = _utils.get_fs(session=session, s3_additional_kwargs=s3_additional_kwargs)
    cpus: int = _utils.ensure_cpu_count(use_threads=use_threads)
    data: pyarrow.parquet.ParquetDataset = pyarrow.parquet.ParquetDataset(
        path_or_paths=path_or_paths,
        filesystem=fs,
        metadata_nthreads=cpus,
        filters=filters,
        read_dictionary=categories,
        validate_schema=validate_schema,
        split_row_groups=False,
        use_legacy_dataset=True,
    )
    return data
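
# Illustrative sketch (not part of the library): the ParquetDataset returned above can be
# materialized through the standard pyarrow legacy-dataset API. The prefix and column list
# below are hypothetical.
#
# data = _read_parquet_init(path="s3://bucket/dataset/", dataset=True, use_threads=True)
# table = data.read(columns=["col0", "col1"], use_threads=True)  # pyarrow.Table
# df = table.to_pandas()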
def describe_objects(
    path: Union[str, List[str]],
    wait_time: Optional[Union[int, float]] = None,
    use_threads: bool = True,
    boto3_session: Optional[boto3.Session] = None,
) -> Dict[str, Dict[str, Any]]:
    """Describe Amazon S3 objects from a received S3 prefix or list of S3 objects paths.

    Fetch attributes like ContentLength, DeleteMarker, LastModified, ContentType, etc.
    The full list of attributes can be explored under the boto3 head_object documentation:
    https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Client.head_object

    Note
    ----
    In case of `use_threads=True` the number of threads that will be spawned will be obtained from os.cpu_count().

    Parameters
    ----------
    path : Union[str, List[str]]
        S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]).
    wait_time : Union[int, float], optional
        How much time (in seconds) Wrangler should keep trying to reach these objects.
        Very useful to overcome eventual consistency issues. `None` means only a single try will be done.
    use_threads : bool
        True to enable concurrent requests, False to disable multiple threads.
        If enabled, os.cpu_count() will be used as the max number of threads.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    Dict[str, Dict[str, Any]]
        Return a dictionary of objects returned from head_object where the key is the object path.
        The response object can be explored here:
        https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Client.head_object

    Examples
    --------
    >>> import awswrangler as wr
    >>> descs0 = wr.s3.describe_objects(['s3://bucket/key0', 's3://bucket/key1'])  # Describe both objects
    >>> descs1 = wr.s3.describe_objects('s3://bucket/prefix')  # Describe all objects under the prefix
    >>> descs2 = wr.s3.describe_objects('s3://bucket/prefix', wait_time=30)  # Overcoming eventual consistency issues

    """
    paths: List[str] = path2list(path=path, boto3_session=boto3_session)
    if len(paths) < 1:
        return {}
    client_s3: boto3.client = _utils.client(service_name="s3", session=boto3_session)
    resp_list: List[Tuple[str, Dict[str, Any]]]
    if use_threads is False:
        resp_list = [_describe_object(path=p, wait_time=wait_time, client_s3=client_s3) for p in paths]
    else:
        cpus: int = _utils.ensure_cpu_count(use_threads=use_threads)
        with concurrent.futures.ThreadPoolExecutor(max_workers=cpus) as executor:
            resp_list = list(
                executor.map(_describe_object, paths, itertools.repeat(wait_time), itertools.repeat(client_s3))
            )
    desc_dict: Dict[str, Dict[str, Any]] = dict(resp_list)
    return desc_dict
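
# Illustrative sketch (not part of the library): each value in the returned dictionary is a raw
# head_object response, so its standard attributes can be read directly. Paths are hypothetical.
#
# descs = wr.s3.describe_objects(['s3://bucket/key0', 's3://bucket/key1'])
# size_bytes = descs['s3://bucket/key0']['ContentLength']
# last_modified = descs['s3://bucket/key0']['LastModified']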
def delete_objects(
    path: Union[str, List[str]], use_threads: bool = True, boto3_session: Optional[boto3.Session] = None
) -> None:
    """Delete Amazon S3 objects from a received S3 prefix or list of S3 objects paths.

    Note
    ----
    In case of `use_threads=True` the number of threads that will be spawned will be obtained from os.cpu_count().

    Parameters
    ----------
    path : Union[str, List[str]]
        S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]).
    use_threads : bool
        True to enable concurrent requests, False to disable multiple threads.
        If enabled, os.cpu_count() will be used as the max number of threads.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    None
        None.

    Examples
    --------
    >>> import awswrangler as wr
    >>> wr.s3.delete_objects(['s3://bucket/key0', 's3://bucket/key1'])  # Delete both objects
    >>> wr.s3.delete_objects('s3://bucket/prefix')  # Delete all objects under the received prefix

    """
    paths: List[str] = path2list(path=path, boto3_session=boto3_session)
    if len(paths) < 1:
        return
    client_s3: boto3.client = _utils.client(service_name="s3", session=boto3_session)
    buckets: Dict[str, List[str]] = _split_paths_by_bucket(paths=paths)
    for bucket, keys in buckets.items():
        chunks: List[List[str]] = _utils.chunkify(lst=keys, max_length=1_000)
        if use_threads is False:
            for chunk in chunks:
                _delete_objects(bucket=bucket, keys=chunk, client_s3=client_s3)
        else:
            cpus: int = _utils.ensure_cpu_count(use_threads=use_threads)
            with concurrent.futures.ThreadPoolExecutor(max_workers=cpus) as executor:
                list(executor.map(_delete_objects, itertools.repeat(bucket), chunks, itertools.repeat(client_s3)))
def _read_text(
    parser_func: Callable,
    path: Union[str, List[str]],
    use_threads: bool = True,
    boto3_session: Optional[boto3.Session] = None,
    s3_additional_kwargs: Optional[Dict[str, str]] = None,
    chunksize: Optional[int] = None,
    dataset: bool = False,
    **pandas_kwargs,
) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]:
    if "iterator" in pandas_kwargs:
        raise exceptions.InvalidArgument("Please, use chunksize instead of iterator.")
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    if (dataset is True) and (not isinstance(path, str)):  # pragma: no cover
        raise exceptions.InvalidArgument("The path argument must be a string Amazon S3 prefix if dataset=True.")
    if dataset is True:
        path_root: str = str(path)
    else:
        path_root = ""
    paths: List[str] = path2list(path=path, boto3_session=session)
    _logger.debug("paths:\n%s", paths)
    if chunksize is not None:
        dfs: Iterator[pd.DataFrame] = _read_text_chunksize(
            parser_func=parser_func,
            paths=paths,
            boto3_session=session,
            chunksize=chunksize,
            pandas_args=pandas_kwargs,
            s3_additional_kwargs=s3_additional_kwargs,
            dataset=dataset,
            path_root=path_root,
        )
        return dfs
    if use_threads is False:
        df: pd.DataFrame = pd.concat(
            objs=[
                _read_text_full(
                    parser_func=parser_func,
                    path=p,
                    boto3_session=session,
                    pandas_args=pandas_kwargs,
                    s3_additional_kwargs=s3_additional_kwargs,
                    dataset=dataset,
                    path_root=path_root,
                )
                for p in paths
            ],
            ignore_index=True,
            sort=False,
        )
    else:
        cpus: int = _utils.ensure_cpu_count(use_threads=use_threads)
        with concurrent.futures.ThreadPoolExecutor(max_workers=cpus) as executor:
            df = pd.concat(
                objs=executor.map(
                    _read_text_full,
                    itertools.repeat(parser_func),
                    itertools.repeat(path_root),
                    paths,
                    itertools.repeat(_utils.boto3_to_primitives(boto3_session=session)),  # Boto3.Session
                    itertools.repeat(pandas_kwargs),
                    itertools.repeat(s3_additional_kwargs),
                    itertools.repeat(dataset),
                ),
                ignore_index=True,
                sort=False,
            )
    return df
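
# Illustrative sketch (not part of the library): _read_text is parameterized by a pandas parser
# such as pd.read_csv, with extra **pandas_kwargs forwarded to that parser. The prefix, separator
# and chunk size below are hypothetical.
#
# df = _read_text(
#     parser_func=pd.read_csv,
#     path="s3://bucket/prefix/",
#     sep="|",  # forwarded to pd.read_csv via **pandas_kwargs
#     use_threads=True,
# )
# chunks = _read_text(parser_func=pd.read_csv, path="s3://bucket/prefix/", chunksize=10_000)
# for chunk_df in chunks:
#     ...  # each chunk_df is a pandas DataFrame with up to 10_000 rows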
def copy_files_to_redshift(  # pylint: disable=too-many-locals,too-many-arguments
    path: Union[str, List[str]],
    manifest_directory: str,
    con: sqlalchemy.engine.Engine,
    table: str,
    schema: str,
    iam_role: str,
    mode: str = "append",
    diststyle: str = "AUTO",
    distkey: Optional[str] = None,
    sortstyle: str = "COMPOUND",
    sortkey: Optional[List[str]] = None,
    primary_keys: Optional[List[str]] = None,
    varchar_lengths_default: int = 256,
    varchar_lengths: Optional[Dict[str, int]] = None,
    use_threads: bool = True,
    boto3_session: Optional[boto3.Session] = None,
) -> None:
    """Load Parquet files from S3 to a Table on Amazon Redshift (Through COPY command).

    https://docs.aws.amazon.com/redshift/latest/dg/r_COPY.html

    Note
    ----
    If the table does not exist yet, it will be automatically created for you
    using the Parquet metadata to infer the columns data types.

    Note
    ----
    In case of `use_threads=True` the number of threads that will be spawned will be obtained from os.cpu_count().

    Parameters
    ----------
    path : Union[str, List[str]]
        S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]).
    manifest_directory : str
        S3 prefix (e.g. s3://bucket/prefix)
    con : sqlalchemy.engine.Engine
        SQLAlchemy Engine. Please use wr.db.get_engine(), wr.db.get_redshift_temp_engine() or wr.catalog.get_engine().
    table : str
        Table name
    schema : str
        Schema name
    iam_role : str
        AWS IAM role with the related permissions.
    mode : str
        Append, overwrite or upsert.
    diststyle : str
        Redshift distribution styles. Must be in ["AUTO", "EVEN", "ALL", "KEY"].
        https://docs.aws.amazon.com/redshift/latest/dg/t_Distributing_data.html
    distkey : str, optional
        Specifies a column name or positional number for the distribution key.
    sortstyle : str
        Sorting can be "COMPOUND" or "INTERLEAVED".
        https://docs.aws.amazon.com/redshift/latest/dg/t_Sorting_data.html
    sortkey : List[str], optional
        List of columns to be sorted.
    primary_keys : List[str], optional
        Primary keys.
    varchar_lengths_default : int
        The size that will be set for all VARCHAR columns not specified with varchar_lengths.
    varchar_lengths : Dict[str, int], optional
        Dict of VARCHAR length by columns. (e.g. {"col1": 10, "col5": 200}).
    use_threads : bool
        True to enable concurrent requests, False to disable multiple threads.
        If enabled, os.cpu_count() will be used as the max number of threads.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    None
        None.

    Examples
    --------
    >>> import awswrangler as wr
    >>> wr.db.copy_files_to_redshift(
    ...     path="s3://bucket/my_parquet_files/",
    ...     con=wr.catalog.get_engine(connection="my_glue_conn_name"),
    ...     table="my_table",
    ...     schema="public",
    ...     iam_role="arn:aws:iam::XXX:role/XXX"
    ... )

    """
    _varchar_lengths: Dict[str, int] = {} if varchar_lengths is None else varchar_lengths
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    paths: List[str] = path2list(path=path, boto3_session=session)  # pylint: disable=protected-access
    manifest_directory = manifest_directory if manifest_directory.endswith("/") else f"{manifest_directory}/"
    manifest_path: str = f"{manifest_directory}manifest.json"
    write_redshift_copy_manifest(
        manifest_path=manifest_path, paths=paths, use_threads=use_threads, boto3_session=session
    )
    s3.wait_objects_exist(paths=paths + [manifest_path], use_threads=False, boto3_session=session)
    athena_types, _ = s3.read_parquet_metadata(
        path=paths, dataset=False, use_threads=use_threads, boto3_session=session
    )
    _logger.debug("athena_types: %s", athena_types)
    redshift_types: Dict[str, str] = {}
    for col_name, col_type in athena_types.items():
        length: int = _varchar_lengths[col_name] if col_name in _varchar_lengths else varchar_lengths_default
        redshift_types[col_name] = _data_types.athena2redshift(dtype=col_type, varchar_length=length)
    with con.begin() as _con:
        created_table, created_schema = _rs_create_table(
            con=_con,
            table=table,
            schema=schema,
            redshift_types=redshift_types,
            mode=mode,
            diststyle=diststyle,
            sortstyle=sortstyle,
            distkey=distkey,
            sortkey=sortkey,
            primary_keys=primary_keys,
        )
        _rs_copy(
            con=_con,
            table=created_table,
            schema=created_schema,
            manifest_path=manifest_path,
            iam_role=iam_role,
            num_files=len(paths),
        )
        if table != created_table:  # upsert
            _rs_upsert(con=_con, schema=schema, table=table, temp_table=created_table, primary_keys=primary_keys)
    s3.delete_objects(path=[manifest_path], use_threads=use_threads, boto3_session=session)