def _fetch_range(
    range_values: Tuple[int, int],
    bucket: str,
    key: str,
    boto3_primitives: _utils.Boto3PrimitivesType,
    boto3_kwargs: Dict[str, Any],
) -> Tuple[int, bytes]:
    """Download one byte range of an S3 object and return it with its start offset.

    Worker for concurrent ranged reads: rebuilds a boto3 session from pickleable
    primitives, then fetches ``[start, end)`` of ``s3://bucket/key`` with retries.
    """
    begin, stop = range_values
    _logger.debug("Fetching: s3://%s/%s - Range: %s-%s", bucket, key, begin, stop)
    session: boto3.Session = _utils.boto3_from_primitives(primitives=boto3_primitives)
    s3_client: boto3.client = _utils.client(service_name="s3", session=session)
    # The HTTP Range header is inclusive on both ends, hence `stop - 1`.
    response: Dict[str, Any] = _utils.try_it(
        f=s3_client.get_object,
        ex=_S3_RETRYABLE_ERRORS,
        base=0.5,
        max_num_tries=6,
        Bucket=bucket,
        Key=key,
        Range=f"bytes={begin}-{stop - 1}",
        **boto3_kwargs,
    )
    payload: bytes = cast(bytes, response["Body"].read())
    return begin, payload
def _caller(
    bucket: str,
    key: str,
    part: int,
    upload_id: str,
    data: bytes,
    boto3_primitives: _utils.Boto3PrimitivesType,
    boto3_kwargs: Dict[str, Any],
) -> Dict[str, Union[str, int]]:
    """Upload one part of an S3 multipart upload with retries.

    Returns the ``PartNumber``/``ETag`` pair needed to complete the upload.
    """
    _logger.debug("Upload part %s started.", part)
    session: boto3.Session = _utils.boto3_from_primitives(primitives=boto3_primitives)
    s3_client: boto3.client = _utils.client(service_name="s3", session=session)
    response: Dict[str, Any] = _utils.try_it(
        f=s3_client.upload_part,
        ex=_S3_RETRYABLE_ERRORS,
        base=0.5,
        max_num_tries=6,
        Bucket=bucket,
        Key=key,
        Body=data,
        PartNumber=part,
        UploadId=upload_id,
        **boto3_kwargs,
    )
    _logger.debug("Upload part %s done.", part)
    return {"PartNumber": part, "ETag": response["ETag"]}
def _read_parquet_row_group(
    row_group: int,
    path: str,
    columns: Optional[List[str]],
    categories: Optional[List[str]],
    boto3_primitives: _utils.Boto3PrimitivesType,
    s3_additional_kwargs: Optional[Dict[str, str]],
) -> pa.Table:
    """Read a single Parquet row group from S3 into a PyArrow Table.

    Worker for parallel Parquet reads: each process handles one row group.
    """
    session: boto3.Session = _utils.boto3_from_primitives(primitives=boto3_primitives)
    # 128 MB block size (128 * 2**20).
    fs: s3fs.S3FileSystem = _utils.get_fs(
        s3fs_block_size=134_217_728,
        session=session,
        s3_additional_kwargs=s3_additional_kwargs,
    )
    with _utils.open_file(fs=fs, path=path, mode="rb") as f:
        pq_file: pyarrow.parquet.ParquetFile = pyarrow.parquet.ParquetFile(source=f, read_dictionary=categories)
        total_groups: int = pq_file.num_row_groups
        _logger.debug("Reading Row Group %s/%s [multi-threaded]", row_group + 1, total_groups)
        # Threads off inside the worker: parallelism happens across row groups.
        return pq_file.read_row_group(i=row_group, columns=columns, use_threads=False, use_pandas_metadata=False)
def _delete_objects_concurrent(bucket: str, keys: List[str], boto3_primitives: _utils.Boto3PrimitivesType) -> None:
    """Worker: rebuild a boto3 session from primitives and delete *keys* from *bucket*."""
    session = _utils.boto3_from_primitives(primitives=boto3_primitives)
    return _delete_objects(bucket=bucket, keys=keys, boto3_session=session)
def _caller(func: Callable, boto3_primitives: _utils.Boto3PrimitivesType, func_kwargs: Dict[str, Any]) -> pd.DataFrame:
    """Worker: rebuild a boto3 session, inject it into the kwargs, and invoke *func*.

    Note: mutates *func_kwargs* in place by setting ``boto3_session``.
    """
    session: boto3.Session = _utils.boto3_from_primitives(primitives=boto3_primitives)
    func_kwargs["boto3_session"] = session
    _logger.debug("Calling: %s", func)
    return func(**func_kwargs)
def _wait_object_concurrent(
    path: Tuple[str, str],
    waiter_name: str,
    delay: int,
    max_attempts: int,
    boto3_primitives: _utils.Boto3PrimitivesType,
) -> None:
    """Worker: rebuild a boto3 session and wait for one object via the named waiter."""
    session = _utils.boto3_from_primitives(primitives=boto3_primitives)
    _wait_object(
        path=path,
        waiter_name=waiter_name,
        delay=delay,
        max_attempts=max_attempts,
        boto3_session=session,
    )
def _read_parquet_row_group(
    row_group: int,
    path: str,
    columns: Optional[List[str]],
    categories: Optional[List[str]],
    boto3_primitives: _utils.Boto3PrimitivesType,
    s3_additional_kwargs: Optional[Dict[str, str]],
    use_threads: bool,
) -> pa.Table:
    """Read a single Parquet row group from S3 into a PyArrow Table.

    Worker for parallel Parquet reads: each process handles one row group.
    """
    session: boto3.Session = _utils.boto3_from_primitives(primitives=boto3_primitives)
    with open_s3_object(
        path=path,
        mode="rb",
        use_threads=use_threads,
        s3_block_size=10_485_760,  # 10 MB (10 * 2**20)
        s3_additional_kwargs=s3_additional_kwargs,
        boto3_session=session,
    ) as f:
        pq_file: pyarrow.parquet.ParquetFile = pyarrow.parquet.ParquetFile(source=f, read_dictionary=categories)
        total_groups: int = pq_file.num_row_groups
        _logger.debug("Reading Row Group %s/%s [multi-threaded]", row_group + 1, total_groups)
        # Threads off for the row-group read itself: parallelism is across row groups.
        return pq_file.read_row_group(i=row_group, columns=columns, use_threads=False, use_pandas_metadata=False)
def _describe_object_concurrent(
    path: str,
    boto3_primitives: _utils.Boto3PrimitivesType,
    s3_additional_kwargs: Optional[Dict[str, Any]],
) -> Tuple[str, Dict[str, Any]]:
    """Worker: rebuild a boto3 session and describe a single S3 object."""
    session = _utils.boto3_from_primitives(primitives=boto3_primitives)
    return _describe_object(path=path, boto3_session=session, s3_additional_kwargs=s3_additional_kwargs)
def _caller(path: str, func: Callable, boto3_primitives: _utils.Boto3PrimitivesType, func_kwargs: Dict[str, Any]) -> pd.DataFrame:
    """Worker: rebuild a boto3 session and invoke *func* for a single path.

    Note: mutates *func_kwargs* in place by setting ``path`` and ``boto3_session``.
    """
    session: boto3.Session = _utils.boto3_from_primitives(primitives=boto3_primitives)
    func_kwargs["path"] = path
    func_kwargs["boto3_session"] = session
    return func(**func_kwargs)
def _delete_objects_concurrent(
    bucket: str,
    keys: List[str],
    s3_additional_kwargs: Optional[Dict[str, Any]],
    boto3_primitives: _utils.Boto3PrimitivesType,
) -> None:
    """Worker: rebuild a boto3 session and delete *keys* from *bucket*."""
    session = _utils.boto3_from_primitives(primitives=boto3_primitives)
    return _delete_objects(
        bucket=bucket,
        keys=keys,
        boto3_session=session,
        s3_additional_kwargs=s3_additional_kwargs,
    )
def _write_batch(
    database: str,
    table: str,
    cols_names: List[str],
    measure_cols_names: List[str],
    measure_types: List[str],
    version: int,
    batch: List[Any],
    boto3_primitives: _utils.Boto3PrimitivesType,
) -> List[Dict[str, str]]:
    # Worker: convert one batch of rows into Timestream records (single- or
    # multi-measure layout) for a timestream-write client.
    # NOTE(review): this block appears truncated — the `try:` below has no
    # matching `except`/`finally`, and no `client.write_records(...)` call or
    # return statement is visible despite the declared List[Dict[str, str]]
    # return type. Confirm against the complete source before relying on it.
    boto3_session: boto3.Session = _utils.boto3_from_primitives(primitives=boto3_primitives)
    client: boto3.client = _utils.client(
        service_name="timestream-write",
        session=boto3_session,
        # Generous timeout/pool sizing for highly concurrent batch writers.
        botocore_config=Config(read_timeout=20, max_pool_connections=5000, retries={"max_attempts": 10}),
    )
    try:
        # Row layout: rec[0] = time, rec[1:1+len(measures)] = measure values,
        # remainder = dimension values (matching cols_names order).
        time_loc = 0
        measure_cols_loc = 1
        dimensions_cols_loc = 1 + len(measure_cols_names)
        records: List[Dict[str, Any]] = []
        for rec in batch:
            record: Dict[str, Any] = {
                "Dimensions": [{
                    "Name": name,
                    "DimensionValueType": "VARCHAR",
                    "Value": str(value)
                } for name, value in zip(cols_names[dimensions_cols_loc:], rec[dimensions_cols_loc:])],
                # Epoch time as a string, in milliseconds.
                "Time": str(round(rec[time_loc].timestamp() * 1_000)),
                "TimeUnit": "MILLISECONDS",
                "Version": version,
            }
            if len(measure_cols_names) == 1:
                # Single measure column -> scalar measure record.
                record["MeasureName"] = measure_cols_names[0]
                record["MeasureValueType"] = measure_types[0]
                record["MeasureValue"] = str(rec[measure_cols_loc])
            else:
                # Multiple measure columns -> MULTI record with one entry per measure.
                record["MeasureName"] = measure_cols_names[0]
                record["MeasureValueType"] = "MULTI"
                record["MeasureValues"] = [{
                    "Name": measure_name,
                    "Value": str(measure_value),
                    "Type": measure_value_type
                } for measure_name, measure_value, measure_value_type in zip(
                    measure_cols_names, rec[measure_cols_loc:dimensions_cols_loc], measure_types)]
            records.append(record)
def _write_batch(
    database: str,
    table: str,
    cols_names: List[str],
    measure_type: str,
    batch: List[Any],
    boto3_primitives: _utils.Boto3PrimitivesType,
) -> List[Dict[str, str]]:
    """Write one batch of rows to Amazon Timestream.

    Row layout: ``rec[0]`` is the timestamp, ``rec[1]`` the measure value and
    ``rec[2:]`` the dimension values (matching ``cols_names`` order).
    Returns the list of rejected records, or an empty list on full success.
    """
    session: boto3.Session = _utils.boto3_from_primitives(primitives=boto3_primitives)
    write_client: boto3.client = _utils.client(
        service_name="timestream-write",
        session=session,
        # Generous timeout/pool sizing for highly concurrent batch writers.
        botocore_config=Config(read_timeout=20, max_pool_connections=5000, retries={"max_attempts": 10}),
    )
    records = [
        {
            "Dimensions": [
                {"Name": name, "DimensionValueType": "VARCHAR", "Value": str(value)}
                for name, value in zip(cols_names[2:], rec[2:])
            ],
            "MeasureName": cols_names[1],
            "MeasureValueType": measure_type,
            "MeasureValue": str(rec[1]),
            "Time": str(round(rec[0].timestamp() * 1_000)),
            "TimeUnit": "MILLISECONDS",
        }
        for rec in batch
    ]
    try:
        _utils.try_it(
            f=write_client.write_records,
            ex=(write_client.exceptions.ThrottlingException, write_client.exceptions.InternalServerException),
            max_num_tries=5,
            DatabaseName=database,
            TableName=table,
            Records=records,
        )
    except write_client.exceptions.RejectedRecordsException as ex:
        # Timestream reports per-record failures here rather than failing hard.
        return cast(List[Dict[str, str]], ex.response["RejectedRecords"])
    return []
def _describe_object_concurrent(
    path: str, boto3_primitives: _utils.Boto3PrimitivesType
) -> Tuple[str, Dict[str, Any]]:
    """Worker: rebuild a boto3 session and describe a single S3 object."""
    session = _utils.boto3_from_primitives(primitives=boto3_primitives)
    return _describe_object(path=path, boto3_session=session)