Ejemplo n.º 1
0
def _fetch_range(
    range_values: Tuple[int, int],
    bucket: str,
    key: str,
    boto3_primitives: _utils.Boto3PrimitivesType,
    boto3_kwargs: Dict[str, Any],
) -> Tuple[int, bytes]:
    """Download one byte range of an S3 object.

    Parameters
    ----------
    range_values
        ``(start, end)`` pair. The request sent to S3 is
        ``bytes={start}-{end - 1}``, so ``end`` is treated as exclusive.
    bucket, key
        Location of the object.
    boto3_primitives
        Primitives from which a boto3 session is rebuilt for this call.
    boto3_kwargs
        Extra keyword arguments forwarded to ``get_object``.

    Returns
    -------
    Tuple[int, bytes]
        ``(start, payload)`` where ``payload`` is the fully read response body.
    """
    first_byte, stop_byte = range_values
    _logger.debug("Fetching: s3://%s/%s - Range: %s-%s", bucket, key, first_byte, stop_byte)

    session: boto3.Session = _utils.boto3_from_primitives(primitives=boto3_primitives)
    s3_client: boto3.client = _utils.client(service_name="s3", session=session)

    # HTTP byte ranges are inclusive on both ends, hence the "- 1".
    byte_range = f"bytes={first_byte}-{stop_byte - 1}"
    response: Dict[str, Any] = _utils.try_it(
        f=s3_client.get_object,
        ex=_S3_RETRYABLE_ERRORS,
        base=0.5,
        max_num_tries=6,
        Bucket=bucket,
        Key=key,
        Range=byte_range,
        **boto3_kwargs,
    )
    payload = cast(bytes, response["Body"].read())
    return first_byte, payload
Ejemplo n.º 2
0
 def _caller(
     bucket: str,
     key: str,
     part: int,
     upload_id: str,
     data: bytes,
     boto3_primitives: _utils.Boto3PrimitivesType,
     boto3_kwargs: Dict[str, Any],
 ) -> Dict[str, Union[str, int]]:
     """Upload a single part of a multipart upload.

     A boto3 session is rebuilt from ``boto3_primitives`` and an S3 client is
     obtained from it; ``upload_part`` is then invoked through
     ``_utils.try_it`` with ``data`` as the body. ``boto3_kwargs`` is forwarded
     to ``upload_part`` unchanged.

     Returns a dict with the part number (``"PartNumber"``) and the ``"ETag"``
     taken from the response.
     """
     _logger.debug("Upload part %s started.", part)

     session: boto3.Session = _utils.boto3_from_primitives(primitives=boto3_primitives)
     s3_client: boto3.client = _utils.client(service_name="s3", session=session)
     response: Dict[str, Any] = _utils.try_it(
         f=s3_client.upload_part,
         ex=_S3_RETRYABLE_ERRORS,
         base=0.5,
         max_num_tries=6,
         Bucket=bucket,
         Key=key,
         Body=data,
         PartNumber=part,
         UploadId=upload_id,
         **boto3_kwargs,
     )

     _logger.debug("Upload part %s done.", part)
     part_summary: Dict[str, Union[str, int]] = {"PartNumber": part, "ETag": response["ETag"]}
     return part_summary
Ejemplo n.º 3
0
def _read_parquet_row_group(
    row_group: int,
    path: str,
    columns: Optional[List[str]],
    categories: Optional[List[str]],
    boto3_primitives: _utils.Boto3PrimitivesType,
    s3_additional_kwargs: Optional[Dict[str, str]],
) -> pa.Table:
    """Read one row group of a Parquet file through an s3fs filesystem.

    Parameters
    ----------
    row_group
        Zero-based index of the row group to read.
    path
        Path of the Parquet file, opened in ``"rb"`` mode.
    columns
        Columns to read, passed to ``read_row_group``.
    categories
        Passed to ``ParquetFile`` as ``read_dictionary``.
    boto3_primitives
        Primitives from which a boto3 session is rebuilt for this call.
    s3_additional_kwargs
        Forwarded to ``_utils.get_fs``.

    Returns
    -------
    pa.Table
        The requested row group.
    """
    block_size: int = 134_217_728  # 128 MB (128 * 2**20)
    session: boto3.Session = _utils.boto3_from_primitives(primitives=boto3_primitives)
    filesystem: s3fs.S3FileSystem = _utils.get_fs(
        s3fs_block_size=block_size,
        session=session,
        s3_additional_kwargs=s3_additional_kwargs,
    )
    with _utils.open_file(fs=filesystem, path=path, mode="rb") as handle:
        parquet_file: pyarrow.parquet.ParquetFile = pyarrow.parquet.ParquetFile(
            source=handle,
            read_dictionary=categories,
        )
        total_groups: int = parquet_file.num_row_groups
        _logger.debug("Reading Row Group %s/%s [multi-threaded]", row_group + 1, total_groups)
        # The table is materialised while the file handle is still open.
        table = parquet_file.read_row_group(
            i=row_group,
            columns=columns,
            use_threads=False,
            use_pandas_metadata=False,
        )
        return table
Ejemplo n.º 4
0
def _delete_objects_concurrent(
    bucket: str,
    keys: List[str],
    boto3_primitives: _utils.Boto3PrimitivesType,
) -> None:
    """Rebuild a boto3 session from primitives and delegate to ``_delete_objects``.

    ``bucket`` and ``keys`` are passed through unchanged; whatever
    ``_delete_objects`` returns is returned to the caller.
    """
    session = _utils.boto3_from_primitives(primitives=boto3_primitives)
    outcome = _delete_objects(bucket=bucket, keys=keys, boto3_session=session)
    return outcome
 def _caller(func: Callable, boto3_primitives: _utils.Boto3PrimitivesType,
             func_kwargs: Dict[str, Any]) -> pd.DataFrame:
     """Call ``func`` with a boto3 session rebuilt from ``boto3_primitives``.

     Parameters
     ----------
     func
         Callable to invoke; it receives ``func_kwargs`` plus a
         ``boto3_session`` keyword argument.
     boto3_primitives
         Primitives from which the boto3 session is rebuilt for this call.
     func_kwargs
         Keyword arguments for ``func``. This mapping is not modified.

     Returns
     -------
     pd.DataFrame
         Whatever ``func`` returns.
     """
     boto3_session: boto3.Session = _utils.boto3_from_primitives(
         primitives=boto3_primitives)
     # Build a fresh dict instead of writing into ``func_kwargs``: the same
     # dict may be handed to several concurrent invocations (presumably by the
     # dispatching code; confirm against the caller), and writing the session
     # into it would let one invocation run with another invocation's session.
     call_kwargs: Dict[str, Any] = {**func_kwargs, "boto3_session": boto3_session}
     _logger.debug("Calling: %s", func)
     return func(**call_kwargs)
Ejemplo n.º 6
0
def _wait_object_concurrent(
    path: Tuple[str, str], waiter_name: str, delay: int, max_attempts: int, boto3_primitives: _utils.Boto3PrimitivesType
) -> None:
    """Rebuild a boto3 session from primitives and delegate to ``_wait_object``.

    ``path``, ``waiter_name``, ``delay`` and ``max_attempts`` are forwarded
    unchanged. Nothing is returned.
    """
    session = _utils.boto3_from_primitives(primitives=boto3_primitives)
    waiter_args: Dict[str, Any] = {
        "path": path,
        "waiter_name": waiter_name,
        "delay": delay,
        "max_attempts": max_attempts,
        "boto3_session": session,
    }
    _wait_object(**waiter_args)
def _read_parquet_row_group(
    row_group: int,
    path: str,
    columns: Optional[List[str]],
    categories: Optional[List[str]],
    boto3_primitives: _utils.Boto3PrimitivesType,
    s3_additional_kwargs: Optional[Dict[str, str]],
    use_threads: bool,
) -> pa.Table:
    """Read one row group of a Parquet file via ``open_s3_object``.

    Parameters
    ----------
    row_group
        Zero-based index of the row group to read.
    path
        Path of the Parquet file, opened in ``"rb"`` mode.
    columns
        Columns to read, passed to ``read_row_group``.
    categories
        Passed to ``ParquetFile`` as ``read_dictionary``.
    boto3_primitives
        Primitives from which a boto3 session is rebuilt for this call.
    s3_additional_kwargs
        Forwarded to ``open_s3_object``.
    use_threads
        Forwarded to ``open_s3_object`` only; the Parquet read itself is
        always issued with ``use_threads=False``.

    Returns
    -------
    pa.Table
        The requested row group.
    """
    block_size: int = 10_485_760  # 10 MB (10 * 2**20)
    session: boto3.Session = _utils.boto3_from_primitives(primitives=boto3_primitives)
    with open_s3_object(
        path=path,
        mode="rb",
        use_threads=use_threads,
        s3_block_size=block_size,
        s3_additional_kwargs=s3_additional_kwargs,
        boto3_session=session,
    ) as handle:
        parquet_file: pyarrow.parquet.ParquetFile = pyarrow.parquet.ParquetFile(
            source=handle,
            read_dictionary=categories,
        )
        total_groups: int = parquet_file.num_row_groups
        _logger.debug("Reading Row Group %s/%s [multi-threaded]", row_group + 1, total_groups)
        # The table is materialised while the object handle is still open.
        table = parquet_file.read_row_group(
            i=row_group,
            columns=columns,
            use_threads=False,
            use_pandas_metadata=False,
        )
        return table
Ejemplo n.º 8
0
def _describe_object_concurrent(
    path: str,
    boto3_primitives: _utils.Boto3PrimitivesType,
    s3_additional_kwargs: Optional[Dict[str, Any]],
) -> Tuple[str, Dict[str, Any]]:
    """Rebuild a boto3 session from primitives and delegate to ``_describe_object``.

    ``path`` and ``s3_additional_kwargs`` are forwarded unchanged; the result
    of ``_describe_object`` is returned as-is.
    """
    session = _utils.boto3_from_primitives(primitives=boto3_primitives)
    description = _describe_object(
        path=path,
        boto3_session=session,
        s3_additional_kwargs=s3_additional_kwargs,
    )
    return description
Ejemplo n.º 9
0
def _caller(path: str, func: Callable,
            boto3_primitives: _utils.Boto3PrimitivesType,
            func_kwargs: Dict[str, Any]) -> pd.DataFrame:
    """Call ``func`` for ``path`` with a boto3 session rebuilt from primitives.

    Parameters
    ----------
    path
        Passed to ``func`` as the ``path`` keyword argument.
    func
        Callable to invoke; it receives ``func_kwargs`` plus ``path`` and
        ``boto3_session`` keyword arguments.
    boto3_primitives
        Primitives from which the boto3 session is rebuilt for this call.
    func_kwargs
        Remaining keyword arguments for ``func``. This mapping is not modified.

    Returns
    -------
    pd.DataFrame
        Whatever ``func`` returns.
    """
    boto3_session: boto3.Session = _utils.boto3_from_primitives(
        primitives=boto3_primitives)
    # Build a fresh dict instead of writing into ``func_kwargs``: the same
    # dict may be handed to several concurrent invocations (presumably by the
    # dispatching code; confirm against the caller). Writing ``path`` and the
    # session into it would let one invocation overwrite another's values
    # between the assignment and the call.
    call_kwargs: Dict[str, Any] = {
        **func_kwargs,
        "path": path,
        "boto3_session": boto3_session,
    }
    return func(**call_kwargs)
Ejemplo n.º 10
0
def _delete_objects_concurrent(
    bucket: str,
    keys: List[str],
    s3_additional_kwargs: Optional[Dict[str, Any]],
    boto3_primitives: _utils.Boto3PrimitivesType,
) -> None:
    """Rebuild a boto3 session from primitives and delegate to ``_delete_objects``.

    ``bucket``, ``keys`` and ``s3_additional_kwargs`` are forwarded unchanged;
    whatever ``_delete_objects`` returns is returned to the caller.
    """
    session = _utils.boto3_from_primitives(primitives=boto3_primitives)
    outcome = _delete_objects(
        bucket=bucket,
        keys=keys,
        boto3_session=session,
        s3_additional_kwargs=s3_additional_kwargs,
    )
    return outcome
Ejemplo n.º 11
0
# Builds Timestream-style record dicts from the rows in `batch`, using a
# "timestream-write" client created from a session rebuilt from
# `boto3_primitives`.
# NOTE(review): this excerpt ends inside the `try` block; the code that sends
# `records` and the matching exception handling are not visible here.
def _write_batch(
    database: str,
    table: str,
    cols_names: List[str],
    measure_cols_names: List[str],
    measure_types: List[str],
    version: int,
    batch: List[Any],
    boto3_primitives: _utils.Boto3PrimitivesType,
) -> List[Dict[str, str]]:
    boto3_session: boto3.Session = _utils.boto3_from_primitives(
        primitives=boto3_primitives)
    client: boto3.client = _utils.client(
        service_name="timestream-write",
        session=boto3_session,
        botocore_config=Config(read_timeout=20,
                               max_pool_connections=5000,
                               retries={"max_attempts": 10}),
    )
    try:
        # Positional layout of each row in `batch` (and of `cols_names`):
        # index 0 is the time value, the measure values follow from index 1,
        # and everything after the measures is treated as a dimension.
        time_loc = 0
        measure_cols_loc = 1
        dimensions_cols_loc = 1 + len(measure_cols_names)
        records: List[Dict[str, Any]] = []
        for rec in batch:
            # Dimension values are stringified and always typed as VARCHAR.
            # The time value must expose `.timestamp()`; it is converted to
            # whole milliseconds and sent as a string.
            record: Dict[str, Any] = {
                "Dimensions": [{
                    "Name": name,
                    "DimensionValueType": "VARCHAR",
                    "Value": str(value)
                } for name, value in zip(cols_names[dimensions_cols_loc:],
                                         rec[dimensions_cols_loc:])],
                "Time":
                str(round(rec[time_loc].timestamp() * 1_000)),
                "TimeUnit":
                "MILLISECONDS",
                "Version":
                version,
            }
            if len(measure_cols_names) == 1:
                # Exactly one measure: emit it as a scalar measure value.
                record["MeasureName"] = measure_cols_names[0]
                record["MeasureValueType"] = measure_types[0]
                record["MeasureValue"] = str(rec[measure_cols_loc])
            else:
                # Otherwise emit a "MULTI" record named after the first
                # measure column, with one entry per (name, value, type).
                record["MeasureName"] = measure_cols_names[0]
                record["MeasureValueType"] = "MULTI"
                record["MeasureValues"] = [{
                    "Name": measure_name,
                    "Value": str(measure_value),
                    "Type": measure_value_type
                } for measure_name, measure_value, measure_value_type in zip(
                    measure_cols_names,
                    rec[measure_cols_loc:dimensions_cols_loc], measure_types)]
            records.append(record)
Ejemplo n.º 12
0
def _write_batch(
    database: str,
    table: str,
    cols_names: List[str],
    measure_type: str,
    batch: List[Any],
    boto3_primitives: _utils.Boto3PrimitivesType,
) -> List[Dict[str, str]]:
    """Write one batch of single-measure records with a "timestream-write" client.

    Each row of ``batch`` is read positionally: index 0 is the time value
    (it must expose ``.timestamp()``), index 1 is the measure value, and the
    remaining entries are dimensions named by ``cols_names[2:]``. The measure
    name is ``cols_names[1]``.

    Parameters
    ----------
    database, table
        Passed as ``DatabaseName`` / ``TableName``.
    cols_names
        Column names aligned with the positions of each row.
    measure_type
        Sent as ``MeasureValueType`` for every record.
    batch
        Rows to write.
    boto3_primitives
        Primitives from which a boto3 session is rebuilt for this call.

    Returns
    -------
    List[Dict[str, str]]
        The ``"RejectedRecords"`` entry of the error response when the client
        raises ``RejectedRecordsException``; otherwise an empty list.
    """
    session: boto3.Session = _utils.boto3_from_primitives(primitives=boto3_primitives)
    ts_client: boto3.client = _utils.client(
        service_name="timestream-write",
        session=session,
        botocore_config=Config(
            read_timeout=20,
            max_pool_connections=5000,
            retries={"max_attempts": 10},
        ),
    )
    try:
        retryable = (
            ts_client.exceptions.ThrottlingException,
            ts_client.exceptions.InternalServerException,
        )
        records: List[Dict[str, Any]] = []
        for rec in batch:
            # Dimension values are stringified and always typed as VARCHAR.
            dimensions = [
                {"Name": name, "DimensionValueType": "VARCHAR", "Value": str(value)}
                for name, value in zip(cols_names[2:], rec[2:])
            ]
            records.append(
                {
                    "Dimensions": dimensions,
                    "MeasureName": cols_names[1],
                    "MeasureValueType": measure_type,
                    "MeasureValue": str(rec[1]),
                    # Time is sent as a string of whole milliseconds.
                    "Time": str(round(rec[0].timestamp() * 1_000)),
                    "TimeUnit": "MILLISECONDS",
                }
            )
        _utils.try_it(
            f=ts_client.write_records,
            ex=retryable,
            max_num_tries=5,
            DatabaseName=database,
            TableName=table,
            Records=records,
        )
    except ts_client.exceptions.RejectedRecordsException as rejection:
        return cast(List[Dict[str, str]], rejection.response["RejectedRecords"])
    return []
Ejemplo n.º 13
0
def _describe_object_concurrent(
    path: str,
    boto3_primitives: _utils.Boto3PrimitivesType,
) -> Tuple[str, Dict[str, Any]]:
    """Rebuild a boto3 session from primitives and delegate to ``_describe_object``.

    ``path`` is forwarded unchanged; the result of ``_describe_object`` is
    returned as-is.
    """
    session = _utils.boto3_from_primitives(primitives=boto3_primitives)
    description = _describe_object(path=path, boto3_session=session)
    return description