def _read_dfs_from_multiple_paths(
    read_func: Callable[..., pd.DataFrame],
    paths: List[str],
    version_ids: Optional[Dict[str, str]],
    use_threads: Union[bool, int],
    kwargs: Dict[str, Any],
) -> List[pd.DataFrame]:
    cpus = ensure_cpu_count(use_threads)
    if cpus < 2:
        return [
            read_func(path, version_id=version_ids.get(path) if version_ids else None, **kwargs)
            for path in paths
        ]
    with concurrent.futures.ThreadPoolExecutor(max_workers=cpus) as executor:
        kwargs["boto3_session"] = boto3_to_primitives(kwargs["boto3_session"])
        partial_read_func = partial(read_func, **kwargs)
        versions = [version_ids.get(p) if isinstance(version_ids, dict) else None for p in paths]
        return list(executor.map(partial_read_func, paths, versions))
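# Every snippet in this section leans on two small helpers from the package's
# _utils module: ensure_cpu_count, which maps the use_threads flag to a worker
# count, and boto3_to_primitives, which flattens a boto3.Session into plain
# values so each worker thread can rebuild its own session (boto3 sessions are
# not safe to share across threads). The sketch below only illustrates the
# assumed behavior; it is not the library's actual implementation.
import os
from typing import Any, Dict, Optional, Union

import boto3


def ensure_cpu_count(use_threads: Union[bool, int] = True) -> int:
    """Map the use_threads flag (bool or explicit int) to a number of workers."""
    if isinstance(use_threads, bool):
        return (os.cpu_count() or 1) if use_threads else 1
    return max(int(use_threads), 1)


def boto3_to_primitives(boto3_session: Optional[boto3.Session] = None) -> Dict[str, Any]:
    """Flatten a Session into picklable primitives so workers can rebuild their own."""
    session = boto3_session or boto3.Session()
    credentials = session.get_credentials()
    frozen = credentials.get_frozen_credentials() if credentials else None
    return {
        "aws_access_key_id": getattr(frozen, "access_key", None),
        "aws_secret_access_key": getattr(frozen, "secret_key", None),
        "aws_session_token": getattr(frozen, "token", None),
        "region_name": session.region_name,
        "profile_name": session.profile_name,
    }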
def _read_schemas_from_files(
    paths: List[str],
    sampling: float,
    use_threads: bool,
    boto3_session: boto3.Session,
    s3_additional_kwargs: Optional[Dict[str, str]],
) -> Tuple[Dict[str, str], ...]:
    paths = _utils.list_sampling(lst=paths, sampling=sampling)
    schemas: Tuple[Dict[str, str], ...] = tuple()
    n_paths: int = len(paths)
    if use_threads is False or n_paths == 1:
        schemas = tuple(
            _read_parquet_metadata_file(
                path=p,
                boto3_session=boto3_session,
                s3_additional_kwargs=s3_additional_kwargs,
                use_threads=use_threads,
            )
            for p in paths
        )
    elif n_paths > 1:
        cpus: int = _utils.ensure_cpu_count(use_threads=use_threads)
        with concurrent.futures.ThreadPoolExecutor(max_workers=cpus) as executor:
            schemas = tuple(
                executor.map(
                    _read_parquet_metadata_file,
                    paths,
                    itertools.repeat(_utils.boto3_to_primitives(boto3_session=boto3_session)),  # Boto3.Session
                    itertools.repeat(s3_additional_kwargs),
                    itertools.repeat(use_threads),
                )
            )
    _logger.debug("schemas: %s", schemas)
    return schemas
def _read_parquet(
    path: str,
    columns: Optional[List[str]],
    categories: Optional[List[str]],
    safe: bool,
    boto3_session: boto3.Session,
    dataset: bool,
    path_root: Optional[str],
    s3_additional_kwargs: Optional[Dict[str, str]],
    use_threads: bool,
) -> pd.DataFrame:
    if use_threads is False:
        table: pa.Table = _read_parquet_file(
            path=path,
            columns=columns,
            categories=categories,
            boto3_session=boto3_session,
            s3_additional_kwargs=s3_additional_kwargs,
            use_threads=use_threads,
        )
    else:
        cpus: int = _utils.ensure_cpu_count(use_threads=use_threads)
        num_row_groups: int = _count_row_groups(
            path=path,
            categories=categories,
            boto3_session=boto3_session,
            s3_additional_kwargs=s3_additional_kwargs,
            use_threads=use_threads,
        )
        with concurrent.futures.ThreadPoolExecutor(max_workers=cpus) as executor:
            tables: Tuple[pa.Table, ...] = tuple(
                executor.map(
                    _read_parquet_row_group,
                    range(num_row_groups),
                    itertools.repeat(path),
                    itertools.repeat(columns),
                    itertools.repeat(categories),
                    itertools.repeat(_utils.boto3_to_primitives(boto3_session=boto3_session)),
                    itertools.repeat(s3_additional_kwargs),
                    itertools.repeat(use_threads),
                )
            )
        table = pa.lib.concat_tables(tables, promote=False)
    _logger.debug("Converting PyArrow Table to Pandas DataFrame...")
    return _arrowtable2df(
        table=table,
        categories=categories,
        safe=safe,
        use_threads=use_threads,
        dataset=dataset,
        path=path,
        path_root=path_root,
    )
def _read_dfs_from_multiple_paths(
    read_func: Callable[..., pd.DataFrame],
    paths: List[str],
    use_threads: Union[bool, int],
    kwargs: Dict[str, Any],
) -> List[pd.DataFrame]:
    cpus = ensure_cpu_count(use_threads)
    if cpus < 2:
        return [read_func(path, **kwargs) for path in paths]
    with concurrent.futures.ThreadPoolExecutor(max_workers=cpus) as executor:
        kwargs["boto3_session"] = boto3_to_primitives(kwargs["boto3_session"])
        partial_read_func = partial(read_func, **kwargs)
        return list(executor.map(partial_read_func, paths))
def upload(
    self,
    bucket: str,
    key: str,
    part: int,
    upload_id: str,
    data: bytes,
    boto3_session: boto3.Session,
    boto3_kwargs: Dict[str, Any],
) -> None:
    """Upload Part."""
    if self._exec is not None:
        _utils.block_waiting_available_thread(seq=self._futures, max_workers=self._cpus)
        future = self._exec.submit(
            _UploadProxy._caller,
            bucket=bucket,
            key=key,
            part=part,
            upload_id=upload_id,
            data=data,
            boto3_primitives=_utils.boto3_to_primitives(boto3_session=boto3_session),
            boto3_kwargs=boto3_kwargs,
        )
        self._futures.append(future)
    else:
        self._results.append(
            self._caller(
                bucket=bucket,
                key=key,
                part=part,
                upload_id=upload_id,
                data=data,
                boto3_primitives=_utils.boto3_to_primitives(boto3_session=boto3_session),
                boto3_kwargs=boto3_kwargs,
            )
        )
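# The upload/write proxies above and below call _utils.block_waiting_available_thread
# before submitting new work, so no more than self._cpus part payloads are in flight
# (and held in memory) at once. The helper's real implementation is not part of this
# excerpt; what follows is a minimal sketch of that backpressure idea, assuming it
# simply waits until at least one pending future has finished.
import concurrent.futures
from typing import List, Sequence


def block_waiting_available_thread(
    seq: Sequence[concurrent.futures.Future], max_workers: int
) -> None:
    """Block until fewer than max_workers futures from seq are still pending."""
    pending: List[concurrent.futures.Future] = [f for f in seq if not f.done()]
    while len(pending) >= max_workers:
        # Wait for the first completion, then re-check how many are still running.
        concurrent.futures.wait(pending, return_when=concurrent.futures.FIRST_COMPLETED)
        pending = [f for f in pending if not f.done()]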
def write(self, func: Callable[..., List[str]], boto3_session: boto3.Session, **func_kwargs: Any) -> None:
    """Write File."""
    if self._exec is not None:
        _utils.block_waiting_available_thread(seq=self._futures, max_workers=self._cpus)
        _logger.debug("Submitting: %s", func)
        future = self._exec.submit(
            _WriteProxy._caller,
            func=func,
            boto3_primitives=_utils.boto3_to_primitives(boto3_session=boto3_session),
            func_kwargs=func_kwargs,
        )
        self._futures.append(future)
    else:
        self._results += func(boto3_session=boto3_session, **func_kwargs)
def write(self, func: Callable, boto3_session: boto3.Session, **func_kwargs) -> None:
    """Write File."""
    if self._exec is not None:
        _logger.debug("Submitting: %s", func)
        future = self._exec.submit(
            # The callable is passed positionally; ThreadPoolExecutor.submit()
            # does not accept it as the keyword argument `fn` on newer Pythons.
            _WriteProxy._caller,
            func=func,
            boto3_primitives=_utils.boto3_to_primitives(boto3_session=boto3_session),
            func_kwargs=func_kwargs,
        )
        self._futures.append(future)
    else:
        self._results.append(func(boto3_session=boto3_session, **func_kwargs))
def _read_schemas_from_files(
    paths: List[str],
    sampling: float,
    use_threads: Union[bool, int],
    boto3_session: boto3.Session,
    s3_additional_kwargs: Optional[Dict[str, str]],
    version_ids: Optional[Dict[str, str]] = None,
    ignore_null: bool = False,
    pyarrow_additional_kwargs: Optional[Dict[str, Any]] = None,
) -> Tuple[Dict[str, str], ...]:
    paths = _utils.list_sampling(lst=paths, sampling=sampling)
    schemas: Tuple[Optional[Dict[str, str]], ...] = tuple()
    n_paths: int = len(paths)
    cpus: int = _utils.ensure_cpu_count(use_threads)
    if cpus == 1 or n_paths == 1:
        schemas = tuple(
            _read_parquet_metadata_file(
                path=p,
                boto3_session=boto3_session,
                s3_additional_kwargs=s3_additional_kwargs,
                use_threads=use_threads,
                version_id=version_ids.get(p) if isinstance(version_ids, dict) else None,
                ignore_null=ignore_null,
                pyarrow_additional_kwargs=pyarrow_additional_kwargs,
            )
            for p in paths
        )
    elif n_paths > 1:
        versions = [version_ids.get(p) if isinstance(version_ids, dict) else None for p in paths]
        with concurrent.futures.ThreadPoolExecutor(max_workers=cpus) as executor:
            schemas = tuple(
                executor.map(
                    _read_parquet_metadata_file,
                    paths,
                    itertools.repeat(_utils.boto3_to_primitives(boto3_session=boto3_session)),  # Boto3.Session
                    itertools.repeat(s3_additional_kwargs),
                    itertools.repeat(use_threads),
                    versions,
                    itertools.repeat(ignore_null),
                    itertools.repeat(pyarrow_additional_kwargs),
                )
            )
    schemas = cast(Tuple[Dict[str, str], ...], tuple(x for x in schemas if x is not None))
    _logger.debug("schemas: %s", schemas)
    return schemas
def _wait_objects(
    waiter_name: str,
    paths: List[str],
    delay: Optional[Union[int, float]] = None,
    max_attempts: Optional[int] = None,
    use_threads: bool = True,
    boto3_session: Optional[boto3.Session] = None,
) -> None:
    delay = 5 if delay is None else delay
    max_attempts = 20 if max_attempts is None else max_attempts
    _delay: int = int(delay) if isinstance(delay, float) else delay
    if len(paths) < 1:
        return None
    _paths: List[Tuple[str, str]] = [_utils.parse_path(path=p) for p in paths]
    if len(_paths) == 1:
        _wait_object(
            path=_paths[0],
            waiter_name=waiter_name,
            delay=_delay,
            max_attempts=max_attempts,
            boto3_session=boto3_session,
        )
    elif use_threads is False:
        for path in _paths:
            _wait_object(
                path=path,
                waiter_name=waiter_name,
                delay=_delay,
                max_attempts=max_attempts,
                boto3_session=boto3_session,
            )
    else:
        cpus: int = _utils.ensure_cpu_count(use_threads=use_threads)
        with concurrent.futures.ThreadPoolExecutor(max_workers=cpus) as executor:
            list(
                executor.map(
                    _wait_object_concurrent,
                    _paths,
                    itertools.repeat(waiter_name),
                    itertools.repeat(_delay),
                    itertools.repeat(max_attempts),
                    itertools.repeat(_utils.boto3_to_primitives(boto3_session=boto3_session)),
                )
            )
    return None
def _read_concurrent(
    func: Callable[..., pd.DataFrame],
    paths: List[str],
    ignore_index: Optional[bool],
    boto3_session: boto3.Session,
    **func_kwargs: Any,
) -> pd.DataFrame:
    cpus: int = _utils.ensure_cpu_count(use_threads=True)
    with concurrent.futures.ThreadPoolExecutor(max_workers=cpus) as executor:
        return _union(
            dfs=list(
                executor.map(
                    _caller,
                    paths,
                    itertools.repeat(func),
                    itertools.repeat(_utils.boto3_to_primitives(boto3_session=boto3_session)),
                    itertools.repeat(func_kwargs),
                )
            ),
            ignore_index=ignore_index,
        )
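# _read_concurrent hands each worker the session as primitives plus the target
# reader and its kwargs; a worker-side _caller is then expected to rebuild a
# fresh boto3.Session from those primitives before reading. That function is
# not part of this excerpt; the sketch below assumes the positional order used
# by executor.map(...) above and the dict-shaped primitives from the earlier
# sketch, neither of which is guaranteed to match the library's own code.
from typing import Any, Callable, Dict

import boto3
import pandas as pd


def _caller(
    path: str,
    func: Callable[..., pd.DataFrame],
    boto3_primitives: Dict[str, Any],
    func_kwargs: Dict[str, Any],
) -> pd.DataFrame:
    # Each thread builds its own Session; sharing one Session object across
    # threads is not safe.
    boto3_session = boto3.Session(
        aws_access_key_id=boto3_primitives.get("aws_access_key_id"),
        aws_secret_access_key=boto3_primitives.get("aws_secret_access_key"),
        aws_session_token=boto3_primitives.get("aws_session_token"),
        region_name=boto3_primitives.get("region_name"),
    )
    return func(path=path, boto3_session=boto3_session, **func_kwargs)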
def _fetch_range_proxy(self, start: int, end: int) -> bytes:
    _logger.debug("Fetching: s3://%s/%s - Range: %s-%s", self._bucket, self._key, start, end)
    boto3_primitives: _utils.Boto3PrimitivesType = _utils.boto3_to_primitives(boto3_session=self._boto3_session)
    boto3_kwargs: Dict[str, Any] = get_botocore_valid_kwargs(
        function_name="get_object", s3_additional_kwargs=self._s3_additional_kwargs
    )
    cpus: int = _utils.ensure_cpu_count(use_threads=self._use_threads)
    range_size: int = end - start
    if cpus < 2 or range_size < (2 * _MIN_PARALLEL_READ_BLOCK):
        return _fetch_range(
            range_values=(start, end),
            bucket=self._bucket,
            key=self._key,
            boto3_primitives=boto3_primitives,
            boto3_kwargs=boto3_kwargs,
        )[1]
    sizes: Tuple[int, ...] = _utils.get_even_chunks_sizes(
        total_size=range_size, chunk_size=_MIN_PARALLEL_READ_BLOCK, upper_bound=False
    )
    ranges: List[Tuple[int, int]] = []
    chunk_start: int = start
    for size in sizes:
        ranges.append((chunk_start, chunk_start + size))
        chunk_start += size
    with concurrent.futures.ThreadPoolExecutor(max_workers=cpus) as executor:
        return self._merge_range(
            ranges=list(
                executor.map(
                    _fetch_range,
                    ranges,
                    itertools.repeat(self._bucket),
                    itertools.repeat(self._key),
                    itertools.repeat(boto3_primitives),
                    itertools.repeat(boto3_kwargs),
                )
            ),
        )
def write(
    df: pd.DataFrame,
    database: str,
    table: str,
    time_col: str,
    measure_col: str,
    dimensions_cols: List[str],
    num_threads: int = 32,
    boto3_session: Optional[boto3.Session] = None,
) -> List[Dict[str, str]]:
    """Store a Pandas DataFrame into an Amazon Timestream table.

    Parameters
    ----------
    df : pandas.DataFrame
        Pandas DataFrame https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html
    database : str
        Amazon Timestream database name.
    table : str
        Amazon Timestream table name.
    time_col : str
        DataFrame column name to be used as time. MUST be a timestamp column.
    measure_col : str
        DataFrame column name to be used as measure.
    dimensions_cols : List[str]
        List of DataFrame column names to be used as dimensions.
    num_threads : int
        Number of threads to be used for concurrent writing.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 Session will be used if boto3_session receives None.

    Returns
    -------
    List[Dict[str, str]]
        Rejected records.

    Examples
    --------
    Store a Pandas DataFrame into an Amazon Timestream table.

    >>> import awswrangler as wr
    >>> import pandas as pd
    >>> from datetime import datetime
    >>> df = pd.DataFrame(
    >>>     {
    >>>         "time": [datetime.now(), datetime.now(), datetime.now()],
    >>>         "dim0": ["foo", "boo", "bar"],
    >>>         "dim1": [1, 2, 3],
    >>>         "measure": [1.0, 1.1, 1.2],
    >>>     }
    >>> )
    >>> rejected_records = wr.timestream.write(
    >>>     df=df,
    >>>     database="sampleDB",
    >>>     table="sampleTable",
    >>>     time_col="time",
    >>>     measure_col="measure",
    >>>     dimensions_cols=["dim0", "dim1"],
    >>> )
    >>> assert len(rejected_records) == 0

    """
    measure_type: str = _data_types.timestream_type_from_pandas(df[[measure_col]])
    _logger.debug("measure_type: %s", measure_type)
    cols_names: List[str] = [time_col, measure_col] + dimensions_cols
    _logger.debug("cols_names: %s", cols_names)
    batches: List[List[Any]] = _utils.chunkify(lst=_df2list(df=df[cols_names]), max_length=100)
    _logger.debug("len(batches): %s", len(batches))
    with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
        res: List[List[Any]] = list(
            executor.map(
                _write_batch,
                itertools.repeat(database),
                itertools.repeat(table),
                itertools.repeat(cols_names),
                itertools.repeat(measure_type),
                batches,
                itertools.repeat(_utils.boto3_to_primitives(boto3_session=boto3_session)),
            )
        )
    return [item for sublist in res for item in sublist]
def describe_objects(
    path: Union[str, List[str]],
    use_threads: bool = True,
    last_modified_begin: Optional[datetime.datetime] = None,
    last_modified_end: Optional[datetime.datetime] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> Dict[str, Dict[str, Any]]:
    """Describe Amazon S3 objects from a received S3 prefix or list of S3 object paths.

    Fetch attributes like ContentLength, DeleteMarker, last_modified, ContentType, etc.
    The full list of attributes can be explored in the boto3 head_object documentation:
    https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Client.head_object

    This function accepts Unix shell-style wildcards in the path argument.
    * (matches everything), ? (matches any single character),
    [seq] (matches any character in seq), [!seq] (matches any character not in seq).

    Note
    ----
    If `use_threads=True`, the number of threads spawned is obtained from os.cpu_count().

    Note
    ----
    The filter by last_modified_begin and last_modified_end is applied only after listing all S3 files.

    Parameters
    ----------
    path : Union[str, List[str]]
        S3 prefix (accepts Unix shell-style wildcards) (e.g. s3://bucket/prefix)
        or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]).
    use_threads : bool
        True to enable concurrent requests, False to disable multiple threads.
        If enabled os.cpu_count() will be used as the max number of threads.
    last_modified_begin : datetime, optional
        Filter the S3 files by the last modified date of the object.
        The filter is applied only after listing all S3 files.
    last_modified_end : datetime, optional
        Filter the S3 files by the last modified date of the object.
        The filter is applied only after listing all S3 files.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    Dict[str, Dict[str, Any]]
        Return a dictionary of objects returned from head_object where the key is the object path.
        The response object can be explored here:
        https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Client.head_object

    Examples
    --------
    >>> import awswrangler as wr
    >>> descs0 = wr.s3.describe_objects(['s3://bucket/key0', 's3://bucket/key1'])  # Describe both objects
    >>> descs1 = wr.s3.describe_objects('s3://bucket/prefix')  # Describe all objects under the prefix

    """
    paths: List[str] = _path2list(
        path=path,
        boto3_session=boto3_session,
        last_modified_begin=last_modified_begin,
        last_modified_end=last_modified_end,
    )
    if len(paths) < 1:
        return {}
    resp_list: List[Tuple[str, Dict[str, Any]]]
    if len(paths) == 1:
        resp_list = [_describe_object(path=paths[0], boto3_session=boto3_session)]
    elif use_threads is False:
        resp_list = [_describe_object(path=p, boto3_session=boto3_session) for p in paths]
    else:
        cpus: int = _utils.ensure_cpu_count(use_threads=use_threads)
        with concurrent.futures.ThreadPoolExecutor(max_workers=cpus) as executor:
            resp_list = list(
                executor.map(
                    _describe_object_concurrent,
                    paths,
                    itertools.repeat(_utils.boto3_to_primitives(boto3_session=boto3_session)),
                )
            )
    desc_dict: Dict[str, Dict[str, Any]] = dict(resp_list)
    return desc_dict
def delete_objects(
    path: Union[str, List[str]],
    use_threads: bool = True,
    last_modified_begin: Optional[datetime.datetime] = None,
    last_modified_end: Optional[datetime.datetime] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> None:
    """Delete Amazon S3 objects from a received S3 prefix or list of S3 object paths.

    This function accepts Unix shell-style wildcards in the path argument.
    * (matches everything), ? (matches any single character),
    [seq] (matches any character in seq), [!seq] (matches any character not in seq).

    Note
    ----
    If `use_threads=True`, the number of threads spawned is obtained from os.cpu_count().

    Note
    ----
    The filter by last_modified_begin and last_modified_end is applied only after listing all S3 files.

    Parameters
    ----------
    path : Union[str, List[str]]
        S3 prefix (accepts Unix shell-style wildcards) (e.g. s3://bucket/prefix)
        or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]).
    use_threads : bool
        True to enable concurrent requests, False to disable multiple threads.
        If enabled os.cpu_count() will be used as the max number of threads.
    last_modified_begin : datetime, optional
        Filter the S3 files by the last modified date of the object.
        The filter is applied only after listing all S3 files.
    last_modified_end : datetime, optional
        Filter the S3 files by the last modified date of the object.
        The filter is applied only after listing all S3 files.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    None
        None.

    Examples
    --------
    >>> import awswrangler as wr
    >>> wr.s3.delete_objects(['s3://bucket/key0', 's3://bucket/key1'])  # Delete both objects
    >>> wr.s3.delete_objects('s3://bucket/prefix')  # Delete all objects under the received prefix

    """
    paths: List[str] = _path2list(
        path=path,
        boto3_session=boto3_session,
        last_modified_begin=last_modified_begin,
        last_modified_end=last_modified_end,
    )
    if len(paths) < 1:
        return
    buckets: Dict[str, List[str]] = _split_paths_by_bucket(paths=paths)
    for bucket, keys in buckets.items():
        chunks: List[List[str]] = _utils.chunkify(lst=keys, max_length=1_000)
        if len(chunks) == 1:
            _delete_objects(bucket=bucket, keys=chunks[0], boto3_session=boto3_session)
        elif use_threads is False:
            for chunk in chunks:
                _delete_objects(bucket=bucket, keys=chunk, boto3_session=boto3_session)
        else:
            cpus: int = _utils.ensure_cpu_count(use_threads=use_threads)
            with concurrent.futures.ThreadPoolExecutor(max_workers=cpus) as executor:
                list(
                    executor.map(
                        _delete_objects_concurrent,
                        itertools.repeat(bucket),
                        chunks,
                        itertools.repeat(_utils.boto3_to_primitives(boto3_session=boto3_session)),
                    )
                )
def _read_text(
    parser_func: Callable,
    path: Union[str, List[str]],
    use_threads: bool = True,
    boto3_session: Optional[boto3.Session] = None,
    s3_additional_kwargs: Optional[Dict[str, str]] = None,
    chunksize: Optional[int] = None,
    dataset: bool = False,
    **pandas_kwargs,
) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]:
    if "iterator" in pandas_kwargs:
        raise exceptions.InvalidArgument("Please, use chunksize instead of iterator.")
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    if (dataset is True) and (not isinstance(path, str)):  # pragma: no cover
        raise exceptions.InvalidArgument("The path argument must be a string Amazon S3 prefix if dataset=True.")
    if dataset is True:
        path_root: str = str(path)
    else:
        path_root = ""
    paths: List[str] = path2list(path=path, boto3_session=session)
    _logger.debug("paths:\n%s", paths)
    if chunksize is not None:
        dfs: Iterator[pd.DataFrame] = _read_text_chunksize(
            parser_func=parser_func,
            paths=paths,
            boto3_session=session,
            chunksize=chunksize,
            pandas_args=pandas_kwargs,
            s3_additional_kwargs=s3_additional_kwargs,
            dataset=dataset,
            path_root=path_root,
        )
        return dfs
    if use_threads is False:
        df: pd.DataFrame = pd.concat(
            objs=[
                _read_text_full(
                    parser_func=parser_func,
                    path=p,
                    boto3_session=session,
                    pandas_args=pandas_kwargs,
                    s3_additional_kwargs=s3_additional_kwargs,
                    dataset=dataset,
                    path_root=path_root,
                )
                for p in paths
            ],
            ignore_index=True,
            sort=False,
        )
    else:
        cpus: int = _utils.ensure_cpu_count(use_threads=use_threads)
        with concurrent.futures.ThreadPoolExecutor(max_workers=cpus) as executor:
            df = pd.concat(
                objs=executor.map(
                    _read_text_full,
                    itertools.repeat(parser_func),
                    itertools.repeat(path_root),
                    paths,
                    itertools.repeat(_utils.boto3_to_primitives(boto3_session=session)),  # Boto3.Session
                    itertools.repeat(pandas_kwargs),
                    itertools.repeat(s3_additional_kwargs),
                    itertools.repeat(dataset),
                ),
                ignore_index=True,
                sort=False,
            )
    return df