Example no. 1
0
def _get_path_root(path: Union[str, List[str]],
                   dataset: bool) -> Optional[str]:
    if (dataset is True) and (not isinstance(path, str)):
        raise exceptions.InvalidArgument(
            "The path argument must be a string if dataset=True (Amazon S3 prefix)."
        )
    return _prefix_cleanup(str(path)) if dataset is True else None
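The `_prefix_cleanup` helper is not shown here. A minimal sketch, assuming it only trims the prefix at the first wildcard character (hypothetical; the real helper may differ):

def _prefix_cleanup_sketch(path: str) -> str:
    """Hypothetical stand-in for _prefix_cleanup: keep the prefix up to the first wildcard."""
    for char in ("*", "?", "["):
        if char in path:
            path = path.split(char, 1)[0]
    return path


# _prefix_cleanup_sketch("s3://bucket/prefix/part-*.csv") -> "s3://bucket/prefix/part-"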
Example no. 2
0
def _read_text(
    parser_func: Callable[..., pd.DataFrame],
    path: Union[str, List[str]],
    path_suffix: Union[str, List[str], None],
    path_ignore_suffix: Union[str, List[str], None],
    ignore_empty: bool,
    use_threads: bool,
    last_modified_begin: Optional[datetime.datetime],
    last_modified_end: Optional[datetime.datetime],
    boto3_session: Optional[boto3.Session],
    s3_additional_kwargs: Optional[Dict[str, str]],
    chunksize: Optional[int],
    dataset: bool,
    partition_filter: Optional[Callable[[Dict[str, str]], bool]],
    ignore_index: bool,
    **pandas_kwargs: Any,
) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]:
    if "iterator" in pandas_kwargs:
        raise exceptions.InvalidArgument(
            "Please, use the chunksize argument instead of iterator.")
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    paths: List[str] = _path2list(
        path=path,
        boto3_session=session,
        suffix=path_suffix,
        ignore_suffix=_get_path_ignore_suffix(
            path_ignore_suffix=path_ignore_suffix),
        ignore_empty=ignore_empty,
        last_modified_begin=last_modified_begin,
        last_modified_end=last_modified_end,
        s3_additional_kwargs=s3_additional_kwargs,
    )
    path_root: Optional[str] = _get_path_root(path=path, dataset=dataset)
    if path_root is not None:
        paths = _apply_partition_filter(path_root=path_root,
                                        paths=paths,
                                        filter_func=partition_filter)
    if len(paths) < 1:
        raise exceptions.NoFilesFound(f"No files found on: {path}.")
    _logger.debug("paths:\n%s", paths)

    args: Dict[str, Any] = {
        "parser_func": parser_func,
        "boto3_session": session,
        "dataset": dataset,
        "path_root": path_root,
        "pandas_kwargs": pandas_kwargs,
        "s3_additional_kwargs": s3_additional_kwargs,
        "use_threads": use_threads,
    }
    _logger.debug("args:\n%s", pprint.pformat(args))
    ret: Union[pd.DataFrame, Iterator[pd.DataFrame]]
    if chunksize is not None:
        ret = _read_text_chunked(paths=paths, chunksize=chunksize, **args)
    elif len(paths) == 1:
        ret = _read_text_file(path=paths[0], **args)
    else:
        ret = _union(dfs=[_read_text_file(path=p, **args) for p in paths],
                     ignore_index=ignore_index)
    return ret
Example no. 3
0
def _create_table(
    database: str,
    table: str,
    description: Optional[str],
    parameters: Optional[Dict[str, str]],
    columns_comments: Optional[Dict[str, str]],
    mode: str,
    catalog_versioning: bool,
    boto3_session: Optional[boto3.Session],
    table_input: Dict[str, Any],
) -> None:
    if description is not None:
        table_input["Description"] = description
    if parameters is not None:
        for k, v in parameters.items():
            table_input["Parameters"][k] = v
    if columns_comments is not None:
        for col in table_input["StorageDescriptor"]["Columns"]:
            name: str = col["Name"]
            if name in columns_comments:
                col["Comment"] = columns_comments[name]
        for par in table_input["PartitionKeys"]:
            name = par["Name"]
            if name in columns_comments:
                par["Comment"] = columns_comments[name]
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    client_glue: boto3.client = _utils.client(service_name="glue", session=session)
    exist: bool = does_table_exist(database=database, table=table, boto3_session=session)
    if mode not in ("overwrite", "append"):  # pragma: no cover
        raise exceptions.InvalidArgument(f"{mode} is not a valid mode. It must be 'overwrite' or 'append'.")
    if (exist is True) and (mode == "overwrite"):
        skip_archive: bool = not catalog_versioning
        client_glue.update_table(DatabaseName=database, TableInput=table_input, SkipArchive=skip_archive)
    elif exist is False:
        client_glue.create_table(DatabaseName=database, TableInput=table_input)
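For reference, a hypothetical skeleton of the `table_input` dictionary this function mutates, limited to the keys it actually reads or writes (the real Glue TableInput structure carries many more fields):

from typing import Any, Dict

table_input_sketch: Dict[str, Any] = {
    "Name": "my_table",
    "Description": "",  # replaced when a description is passed
    "Parameters": {},  # extended with the parameters argument
    "StorageDescriptor": {
        "Columns": [
            {"Name": "col0", "Type": "bigint"},  # gains a "Comment" from columns_comments
        ],
    },
    "PartitionKeys": [
        {"Name": "year", "Type": "string"},  # gains a "Comment" from columns_comments
    ],
}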
Example no. 4
0
def __get_region_from_session(self) -> str:
    """Extract region from session."""
    region: Optional[str] = self.boto3_session.region_name
    if region is not None:
        return region
    raise exceptions.InvalidArgument(
        "There is no region_name defined on boto3, please configure it.")
Example no. 5
0
def get_region_from_session(boto3_session: Optional[boto3.Session] = None, default_region: Optional[str] = None) -> str:
    """Extract region from session."""
    session: boto3.Session = ensure_session(session=boto3_session)
    region: Optional[str] = session.region_name
    if region is not None:
        return region
    if default_region is not None:
        return default_region
    raise exceptions.InvalidArgument("There is no region_name defined on boto3, please configure it.")
Example no. 6
0
def create_ingestion(
    dataset_name: Optional[str] = None,
    dataset_id: Optional[str] = None,
    ingestion_id: Optional[str] = None,
    account_id: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> str:
    """Create and starts a new SPICE ingestion on a dataset.

    Note
    ----
    You must pass ``dataset_name`` OR ``dataset_id`` argument.

    Parameters
    ----------
    dataset_name : str, optional
        Dataset name.
    dataset_id : str, optional
        Dataset ID.
    ingestion_id : str, optional
        Ingestion ID.
    account_id : str, optional
        If None, the account ID will be inferred from your boto3 session.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    str
        Ingestion ID

    Examples
    --------
    >>> import awswrangler as wr
    >>> status = wr.quicksight.create_ingestion("my_dataset")
    """
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    if account_id is None:
        account_id = sts.get_account_id(boto3_session=session)
    if (dataset_name is None) and (dataset_id is None):
        raise exceptions.InvalidArgument(
            "You must pass a not None dataset_name or dataset_id argument."
        )  # pragma: no cover
    if (dataset_id is None) and (dataset_name is not None):
        dataset_id = get_dataset_id(name=dataset_name,
                                    account_id=account_id,
                                    boto3_session=session)
    if ingestion_id is None:
        ingestion_id = uuid.uuid4().hex
    client: boto3.client = _utils.client(service_name="quicksight",
                                         session=session)
    response: Dict[str,
                   Any] = client.create_ingestion(DataSetId=dataset_id,
                                                  IngestionId=ingestion_id,
                                                  AwsAccountId=account_id)
    return response["IngestionId"]
Example no. 7
0
def _validate_args(
    start_timestamp: int,
    end_timestamp: int,
) -> None:
    if start_timestamp < 0:
        raise exceptions.InvalidArgument(
            "`start_time` cannot be a negative value.")
    if start_timestamp >= end_timestamp:
        raise exceptions.InvalidArgumentCombination(
            "`start_time` must be inferior to `end_time`.")
Example no. 8
0
def _get_id(
    name: str,
    func: Callable[..., List[Dict[str, Any]]],
    attr_name: str,
    account_id: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> str:
    ids: List[str] = _get_ids(
        name=name, func=func, attr_name=attr_name, account_id=account_id, boto3_session=boto3_session
    )
    if len(ids) == 0:
        raise exceptions.InvalidArgument(f"There is no {attr_name} related with name {name}")
    if len(ids) > 1:
        raise exceptions.InvalidArgument(
            f"There is {len(ids)} {attr_name} with name {name}. "
            f"Please pass the id argument to specify "
            f"which one you would like to describe."
        )
    return ids[0]
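The companion `_get_ids` helper is not shown. An illustrative sketch under assumed semantics (call the list function and collect the requested attribute of items whose "Name" matches); the real helper may differ:

from typing import Any, Callable, Dict, List, Optional

import boto3


def _get_ids_sketch(
    name: str,
    func: Callable[..., List[Dict[str, Any]]],
    attr_name: str,
    account_id: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> List[str]:
    # Collect the attribute (e.g. "DashboardId") of every listed item named `name`.
    ids: List[str] = []
    for item in func(account_id=account_id, boto3_session=boto3_session):
        if item.get("Name") == name:
            ids.append(item[attr_name])
    return ids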
Example no. 9
0
def delete_template(
    name: Optional[str] = None,
    template_id: Optional[str] = None,
    version_number: Optional[int] = None,
    account_id: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> None:  # pragma: no cover
    """Delete a tamplate.

    Note
    ----
    You must pass a not None ``name`` or ``template_id`` argument.

    Parameters
    ----------
    name : str, optional
        Template name.
    template_id : str, optional
        The ID for the template.
    version_number : int, optional
        Specifies the version of the template that you want to delete.
        If you don't provide a version number, it deletes all versions of the template.
    account_id : str, optional
        If None, the account ID will be inferred from your boto3 session.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    None
        None.

    Examples
    --------
    >>> import awswrangler as wr
    >>> wr.quicksight.delete_template(name="...")
    """
    if (name is None) and (template_id is None):
        raise exceptions.InvalidArgument(
            "You must pass a not None name or template_id argument.")
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    if (template_id is None) and (name is not None):
        template_id = get_template_id(name=name,
                                      account_id=account_id,
                                      boto3_session=session)
    args: Dict[str, Any] = {
        "func_name": "delete_template",
        "account_id": account_id,
        "boto3_session": session,
        "TemplateId": template_id,
    }
    if version_number is not None:
        args["VersionNumber"] = version_number
    _delete(**args)
Example no. 10
0
def delete_dashboard(
    name: Optional[str] = None,
    dashboard_id: Optional[str] = None,
    version_number: Optional[int] = None,
    account_id: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> None:  # pragma: no cover
    """Delete a dashboard.

    Note
    ----
    You must pass a not None ``name`` or ``dashboard_id`` argument.

    Parameters
    ----------
    name : str, optional
        Dashboard name.
    dashboard_id : str, optional
        The ID for the dashboard.
    version_number : int, optional
        The version number of the dashboard. If the version number property is provided,
        only the specified version of the dashboard is deleted.
    account_id : str, optional
        If None, the account ID will be inferred from your boto3 session.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    None
        None.

    Examples
    --------
    >>> import awswrangler as wr
    >>> wr.quicksight.delete_dashboard(name="...")
    """
    if (name is None) and (dashboard_id is None):
        raise exceptions.InvalidArgument(
            "You must pass a not None name or dashboard_id argument.")
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    if (dashboard_id is None) and (name is not None):
        dashboard_id = get_dashboard_id(name=name,
                                        account_id=account_id,
                                        boto3_session=session)
    args: Dict[str, Any] = {
        "func_name": "delete_dashboard",
        "account_id": account_id,
        "boto3_session": session,
        "DashboardId": dashboard_id,
    }
    if version_number is not None:
        args["VersionNumber"] = version_number
    _delete(**args)
Example no. 11
0
def describe_ingestion(
    ingestion_id: str,
    dataset_name: Optional[str] = None,
    dataset_id: Optional[str] = None,
    account_id: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> Dict[str, Any]:
    """Describe a QuickSight ingestion by ID.

    Note
    ----
    You must pass a not None value for ``dataset_name`` or ``dataset_id`` argument.

    Parameters
    ----------
    ingestion_id : str
        Ingestion ID.
    dataset_name : str, optional
        Dataset name.
    dataset_id : str, optional
        Dataset ID.
    account_id : str, optional
        If None, the account ID will be inferred from your boto3 session.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    Dict[str, Any]
        Ingestion Description.

    Examples
    --------
    >>> import awswrangler as wr
    >>> description = wr.quicksight.describe_ingestion(ingestion_id="...", dataset_name="...")
    """
    if (dataset_name is None) and (dataset_id is None):
        raise exceptions.InvalidArgument(
            "You must pass a not None name or dataset_id argument.")
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    if account_id is None:
        account_id = sts.get_account_id(boto3_session=session)
    if (dataset_id is None) and (dataset_name is not None):
        dataset_id = get_dataset_id(name=dataset_name,
                                    account_id=account_id,
                                    boto3_session=session)
    client: boto3.client = _utils.client(service_name="quicksight",
                                         session=session)
    return cast(
        Dict[str, Any],
        client.describe_ingestion(IngestionId=ingestion_id,
                                  AwsAccountId=account_id,
                                  DataSetId=dataset_id)["Ingestion"],
    )
Example no. 12
0
def get_data_source_arn(name: str,
                        account_id: Optional[str] = None,
                        boto3_session: Optional[boto3.Session] = None) -> str:
    """Get QuickSight data source ARN given a name and fails if there is more than 1 ARN associated with this name.

    Note
    ----
    QuickSight accepts duplicated data source names, so more than one ARN may
    exist for a given name. This function raises an exception in that case.

    Parameters
    ----------
    name : str
        Data source name.
    account_id : str, optional
        If None, the account ID will be inferred from your boto3 session.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    str
        Data source ARN.

    Examples
    --------
    >>> import awswrangler as wr
    >>> arn = wr.quicksight.get_data_source_arn("...")
    """
    arns: List[str] = get_data_source_arns(name=name,
                                           account_id=account_id,
                                           boto3_session=boto3_session)
    if len(arns) == 0:
        raise exceptions.InvalidArgument(
            f"There is not data source with name {name}")
    if len(arns) > 1:
        raise exceptions.InvalidArgument(
            f"There is more than 1 data source with name {name}. "
            f"Please pass the data_source_arn argument to specify "
            f"which one you would like to describe.")
    return arns[0]
Example no. 13
0
def extract_athena_table_columns(
        database: str, table: str,
        boto3_session: boto3.Session) -> List[Dict[str, str]]:
    """Extract athena columns data types from table and raising an exception if not exist."""
    dtypes: Optional[Dict[str, str]] = catalog.get_table_types(
        database=database, table=table, boto3_session=boto3_session)
    if dtypes is None:
        raise exceptions.InvalidArgument(
            f"{database}.{table} does not exist on Athena.")
    return [{
        "Name": name,
        "Type": _data_types.athena2quicksight(dtype=dtype)
    } for name, dtype in dtypes.items()]
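An illustrative Athena-to-QuickSight type mapping, built only from the QuickSight input column types mentioned later in this collection ('STRING' | 'INTEGER' | 'DECIMAL' | 'DATETIME'); the real `_data_types.athena2quicksight` may cover more types and behave differently:

_ATHENA2QUICKSIGHT_SKETCH = {
    "string": "STRING",
    "varchar": "STRING",
    "char": "STRING",
    "int": "INTEGER",
    "integer": "INTEGER",
    "bigint": "INTEGER",
    "smallint": "INTEGER",
    "float": "DECIMAL",
    "double": "DECIMAL",
    "decimal": "DECIMAL",
    "timestamp": "DATETIME",
    "date": "DATETIME",
}


def athena2quicksight_sketch(dtype: str) -> str:
    # Normalize e.g. "decimal(10,2)" or "VARCHAR(64)" to the base type name before lookup.
    return _ATHENA2QUICKSIGHT_SKETCH[dtype.split("(")[0].strip().lower()]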
Example no. 14
0
def _apply_partitions(df: pd.DataFrame, dataset: bool, path: str, path_root: Optional[str]) -> pd.DataFrame:
    if dataset is False:
        return df
    if dataset is True and path_root is None:
        raise exceptions.InvalidArgument("A path_root is required when dataset=True.")
    path_root = cast(str, path_root)
    partitions: Dict[str, str] = _extract_partitions_from_path(path_root=path_root, path=path)
    _logger.debug("partitions: %s", partitions)
    count: int = len(df.index)
    _logger.debug("count: %s", count)
    for name, value in partitions.items():
        df[name] = pd.Categorical.from_codes(np.repeat([0], count), categories=[value])
    return df
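The `_extract_partitions_from_path` helper is not shown. A standalone sketch of Hive-style partition extraction, assuming partitions appear as "name=value" path segments under the dataset root:

from typing import Dict


def _extract_partitions_sketch(path_root: str, path: str) -> Dict[str, str]:
    root = path_root if path_root.endswith("/") else f"{path_root}/"
    if not path.startswith(root):
        raise ValueError(f"Object path {path} is not under the root {root}.")
    partitions: Dict[str, str] = {}
    for part in path[len(root):].split("/"):
        if "=" in part:
            name, value = part.split("=", 1)
            partitions[name] = value
    return partitions


# _extract_partitions_sketch("s3://bucket/prefix", "s3://bucket/prefix/col2=A/file0.csv")
# -> {"col2": "A"}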
Example no. 15
0
def describe_data_source_permissions(
    name: Optional[str] = None,
    data_source_id: Optional[str] = None,
    account_id: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> Dict[str, Any]:
    """Describe a QuickSight data source permissions by name or ID.

    Note
    ----
    You must pass a not None ``name`` or ``data_source_id`` argument.

    Parameters
    ----------
    name : str, optional
        Data source name.
    data_source_id : str, optional
        Data source ID.
    account_id : str, optional
        If None, the account ID will be inferred from your boto3 session.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    Dict[str, Any]
        Data source Permissions Description.

    Examples
    --------
    >>> import awswrangler as wr
    >>> description = wr.quicksight.describe_data_source_permissions("my-data-source")
    """
    if (name is None) and (data_source_id is None):
        raise exceptions.InvalidArgument(
            "You must pass a not None name or data_source_id argument.")
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    if account_id is None:
        account_id = sts.get_account_id(boto3_session=session)
    if (data_source_id is None) and (name is not None):
        data_source_id = get_data_source_id(name=name,
                                            account_id=account_id,
                                            boto3_session=session)
    client: boto3.client = _utils.client(service_name="quicksight",
                                         session=session)
    return cast(
        Dict[str, Any],
        client.describe_data_source_permissions(
            AwsAccountId=account_id,
            DataSourceId=data_source_id)["Permissions"],
    )
Example no. 16
0
def cancel_ingestion(
    ingestion_id: str,
    dataset_name: Optional[str] = None,
    dataset_id: Optional[str] = None,
    account_id: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> None:
    """Cancel an ongoing ingestion of data into SPICE.

    Note
    ----
    You must pass a not None value for ``dataset_name`` or ``dataset_id`` argument.

    Parameters
    ----------
    ingestion_id : str
        Ingestion ID.
    dataset_name : str, optional
        Dataset name.
    dataset_id : str, optional
        Dataset ID.
    account_id : str, optional
        If None, the account ID will be inferred from your boto3 session.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    None
        None.

    Examples
    --------
    >>> import awswrangler as wr
    >>> wr.quicksight.cancel_ingestion(ingestion_id="...", dataset_name="...")
    """
    if (dataset_name is None) and (dataset_id is None):
        raise exceptions.InvalidArgument(
            "You must pass a not None name or dataset_id argument.")
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    if account_id is None:
        account_id = sts.get_account_id(boto3_session=session)
    if (dataset_id is None) and (dataset_name is not None):
        dataset_id = get_dataset_id(name=dataset_name,
                                    account_id=account_id,
                                    boto3_session=session)
    client: boto3.client = _utils.client(service_name="quicksight",
                                         session=session)
    client.cancel_ingestion(IngestionId=ingestion_id,
                            AwsAccountId=account_id,
                            DataSetId=dataset_id)
Example no. 17
0
def delete_dataset(
    name: Optional[str] = None,
    dataset_id: Optional[str] = None,
    account_id: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> None:
    """Delete a dataset.

    Note
    ----
    You must pass a not None ``name`` or ``dataset_id`` argument.

    Parameters
    ----------
    name : str, optional
        Dataset name.
    dataset_id : str, optional
        The ID for the dataset.
    account_id : str, optional
        If None, the account ID will be inferred from your boto3 session.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    None
        None.

    Examples
    --------
    >>> import awswrangler as wr
    >>> wr.quicksight.delete_dataset(name="...")
    """
    if (name is None) and (dataset_id is None):
        raise exceptions.InvalidArgument(
            "You must pass a not None name or dataset_id argument."
        )  # pragma: no cover
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    if (dataset_id is None) and (name is not None):
        dataset_id = get_dataset_id(name=name,
                                    account_id=account_id,
                                    boto3_session=session)  # pragma: no cover
    args: Dict[str, Any] = {
        "func_name": "delete_data_set",
        "account_id": account_id,
        "boto3_session": session,
        "DataSetId": dataset_id,
    }
    _delete(**args)
Example no. 18
0
def list_ingestions(
    dataset_name: Optional[str] = None,
    dataset_id: Optional[str] = None,
    account_id: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> List[Dict[str, Any]]:
    """List the history of SPICE ingestions for a dataset.

    Parameters
    ----------
    dataset_name : str, optional
        Dataset name.
    dataset_id : str, optional
        The ID of the dataset used in the ingestion.
    account_id : str, optional
        If None, the account ID will be inferred from your boto3 session.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    List[Dict[str, Any]]
        List of SPICE ingestions.

    Examples
    --------
    >>> import awswrangler as wr
    >>> ingestions = wr.quicksight.list_ingestions()
    """
    if (dataset_name is None) and (dataset_id is None):
        raise exceptions.InvalidArgument(
            "You must pass a not None name or dataset_id argument."
        )  # pragma: no cover
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    if account_id is None:
        account_id = sts.get_account_id(boto3_session=session)
    if (dataset_id is None) and (dataset_name is not None):
        dataset_id = get_dataset_id(name=dataset_name,
                                    account_id=account_id,
                                    boto3_session=session)
    return _list(
        func_name="list_ingestions",
        attr_name="Ingestions",
        account_id=account_id,
        boto3_session=boto3_session,
        DataSetId=dataset_id,
    )
Example no. 19
0
def _pyarrow_chunk_generator(
    pq_file: pyarrow.parquet.ParquetFile,
    chunked: Union[bool, int],
    columns: Optional[List[str]],
    use_threads_flag: bool,
) -> Iterator[pa.RecordBatch]:
    if chunked is True:
        batch_size = 65_536
    elif isinstance(chunked, int) and chunked > 0:
        batch_size = chunked
    else:
        raise exceptions.InvalidArgument(f"chunked: {chunked}")

    chunks = pq_file.iter_batches(
        batch_size=batch_size, columns=columns, use_threads=use_threads_flag, use_pandas_metadata=False
    )

    for chunk in chunks:
        yield chunk
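A self-contained local sketch of how the generated record batches can be turned into pandas DataFrames (a temporary local file stands in for the S3 object; names here are illustrative):

import os
import tempfile

import pyarrow as pa
import pyarrow.parquet as pq

with tempfile.TemporaryDirectory() as tmp_dir:
    sample_path = os.path.join(tmp_dir, "sample.parquet")
    pq.write_table(pa.table({"col": list(range(200_000))}), sample_path)
    pq_file = pq.ParquetFile(sample_path)
    # iter_batches requires pyarrow >= 3.0, as a later example notes.
    for batch in pq_file.iter_batches(batch_size=65_536, columns=["col"], use_threads=True):
        df = batch.to_pandas()  # each record batch becomes one DataFrame chunk
        print(len(df.index))    # 65536, 65536, 65536, 3392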
Example no. 20
0
def _read_parquet_chunked(
    paths: List[str],
    chunked: Union[bool, int],
    columns: Optional[List[str]],
    categories: Optional[List[str]],
    validate_schema: bool,
    safe: bool,
    boto3_session: boto3.Session,
    dataset: bool,
    path_root: Optional[str],
    s3_additional_kwargs: Optional[Dict[str, str]],
    use_threads: bool,
) -> Iterator[pd.DataFrame]:
    next_slice: Optional[pd.DataFrame] = None
    last_schema: Optional[Dict[str, str]] = None
    last_path: str = ""
    for path in paths:
        with open_s3_object(
                path=path,
                mode="rb",
                use_threads=use_threads,
                s3_block_size=10_485_760,  # 10 MB (10 * 2**20)
                s3_additional_kwargs=s3_additional_kwargs,
                boto3_session=boto3_session,
        ) as f:
            pq_file: pyarrow.parquet.ParquetFile = pyarrow.parquet.ParquetFile(
                source=f, read_dictionary=categories)
            schema: Dict[str,
                         str] = _data_types.athena_types_from_pyarrow_schema(
                             schema=pq_file.schema.to_arrow_schema(),
                             partitions=None)[0]
            if validate_schema is True and last_schema is not None:
                if schema != last_schema:
                    raise exceptions.InvalidSchemaConvergence(
                        f"Was detect at least 2 different schemas:\n"
                        f"    - {last_path} -> {last_schema}\n"
                        f"    - {path} -> {schema}")
            last_schema = schema
            last_path = path
            num_row_groups: int = pq_file.num_row_groups
            _logger.debug("num_row_groups: %s", num_row_groups)
            for i in range(num_row_groups):
                _logger.debug("Reading Row Group %s...", i)
                df: pd.DataFrame = _arrowtable2df(
                    table=pq_file.read_row_group(i=i,
                                                 columns=columns,
                                                 use_threads=use_threads,
                                                 use_pandas_metadata=False),
                    categories=categories,
                    safe=safe,
                    use_threads=use_threads,
                    dataset=dataset,
                    path=path,
                    path_root=path_root,
                )
                if chunked is True:
                    yield df
                elif isinstance(chunked, int) and chunked > 0:
                    if next_slice is not None:
                        df = _union(dfs=[next_slice, df], ignore_index=None)
                    while len(df.index) >= chunked:
                        yield df.iloc[:chunked]
                        df = df.iloc[chunked:]
                    if df.empty:
                        next_slice = None
                    else:
                        next_slice = df
                else:
                    raise exceptions.InvalidArgument(f"chunked: {chunked}")
Example no. 21
0
def create_athena_dataset(
    name: str,
    database: Optional[str] = None,
    table: Optional[str] = None,
    sql: Optional[str] = None,
    sql_name: str = "CustomSQL",
    data_source_name: Optional[str] = None,
    data_source_arn: Optional[str] = None,
    import_mode: str = "DIRECT_QUERY",
    allowed_to_use: Optional[List[str]] = None,
    allowed_to_manage: Optional[List[str]] = None,
    logical_table_alias: str = "LogicalTable",
    rename_columns: Optional[Dict[str, str]] = None,
    cast_columns_types: Optional[Dict[str, str]] = None,
    tags: Optional[Dict[str, str]] = None,
    account_id: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> str:
    """Create a QuickSight dataset.

    Note
    ----
    You will not be able to see the dataset in the console
    if you do not pass your user to one of the ``allowed_*`` arguments.

    Note
    ----
    You must pass ``database``/``table`` OR ``sql`` argument.

    Note
    ----
    You must pass ``data_source_name`` OR ``data_source_arn`` argument.

    Parameters
    ----------
    name : str
        Dataset name.
    database : str
        Athena's database name.
    table : str
        Athena's table name.
    sql : str
        Use a SQL query to define your table.
    sql_name : str
        Query name.
    data_source_name : str, optional
        QuickSight data source name.
    data_source_arn : str, optional
        QuickSight data source ARN.
    import_mode : str
        Indicates whether you want to import the data into SPICE.
        'SPICE'|'DIRECT_QUERY'
    tags : Dict[str, str], optional
        Key/Value collection to put on the Cluster.
        e.g. {"foo": "boo", "bar": "xoo"})
    allowed_to_use : optional
        List of principals that will be allowed to see and use the data source.
        e.g. ["john", "Mary"]
    allowed_to_manage : optional
        List of principals that will be allowed to see, use, update and delete the data source.
        e.g. ["Mary"]
    logical_table_alias : str
        A display name for the logical table.
    rename_columns : Dict[str, str], optional
        Dictionary to map column renames. e.g. {"old_name": "new_name", "old_name2": "new_name2"}
    cast_columns_types : Dict[str, str], optional
        Dictionary to map column casts. e.g. {"col_name": "STRING", "col_name2": "DECIMAL"}
        Valid types: 'STRING'|'INTEGER'|'DECIMAL'|'DATETIME'
    account_id : str, optional
        If None, the account ID will be inferred from your boto3 session.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    str
        Dataset ID.

    Examples
    --------
    >>> import awswrangler as wr
    >>> dataset_id = wr.quicksight.create_athena_dataset(
    ...     name="...",
    ...     database="..."
    ...     table="..."
    ...     data_source_name="..."
    ...     allowed_to_manage=["Mary"]
    ... )
    """
    if (data_source_name is None) and (data_source_arn is None):
        raise exceptions.InvalidArgument("You must pass a not None data_source_name or data_source_arn argument.")
    if ((database is None) and (table is None)) and (sql is None):
        raise exceptions.InvalidArgument("You must pass database/table OR sql argument.")
    if (database is not None) and (sql is not None):
        raise exceptions.InvalidArgument(
            "If you provide sql argument, please include the database name inside the sql statement."
            "Do NOT pass in with database argument."
        )
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    client: boto3.client = _utils.client(service_name="quicksight", session=session)
    if account_id is None:
        account_id = sts.get_account_id(boto3_session=session)
    if (data_source_arn is None) and (data_source_name is not None):
        data_source_arn = get_data_source_arn(name=data_source_name, account_id=account_id, boto3_session=session)
    if sql is not None:
        physical_table: Dict[str, Dict[str, Any]] = {
            "CustomSql": {
                "DataSourceArn": data_source_arn,
                "Name": sql_name,
                "SqlQuery": sql,
                "Columns": extract_athena_query_columns(
                    sql=sql,
                    data_source_arn=data_source_arn,  # type: ignore
                    account_id=account_id,
                    boto3_session=session,
                ),
            }
        }
    else:
        physical_table = {
            "RelationalTable": {
                "DataSourceArn": data_source_arn,
                "Schema": database,
                "Name": table,
                "InputColumns": extract_athena_table_columns(
                    database=database,  # type: ignore
                    table=table,  # type: ignore
                    boto3_session=session,
                ),
            }
        }
    table_uuid: str = uuid.uuid4().hex
    dataset_id: str = uuid.uuid4().hex
    args: Dict[str, Any] = {
        "AwsAccountId": account_id,
        "DataSetId": dataset_id,
        "Name": name,
        "ImportMode": import_mode,
        "PhysicalTableMap": {table_uuid: physical_table},
        "LogicalTableMap": {table_uuid: {"Alias": logical_table_alias, "Source": {"PhysicalTableId": table_uuid}}},
    }
    trans: List[Dict[str, Dict[str, Any]]] = _generate_transformations(
        rename_columns=rename_columns, cast_columns_types=cast_columns_types
    )
    if trans:
        args["LogicalTableMap"][table_uuid]["DataTransforms"] = trans
    permissions: List[Dict[str, Union[str, List[str]]]] = _generate_permissions(
        resource="dataset",
        account_id=account_id,
        boto3_session=session,
        allowed_to_use=allowed_to_use,
        allowed_to_manage=allowed_to_manage,
    )
    if permissions:
        args["Permissions"] = permissions
    if tags is not None:
        _tags: List[Dict[str, str]] = [{"Key": k, "Value": v} for k, v in tags.items()]
        args["Tags"] = _tags
    client.create_data_set(**args)
    return dataset_id
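The `_generate_transformations` helper is not shown. An illustrative sketch of what it could return, assuming the QuickSight DataTransforms shapes (`RenameColumnOperation`, `CastColumnTypeOperation`) from the CreateDataSet API; the real helper may differ:

from typing import Any, Dict, List, Optional


def _generate_transformations_sketch(
    rename_columns: Optional[Dict[str, str]],
    cast_columns_types: Optional[Dict[str, str]],
) -> List[Dict[str, Dict[str, Any]]]:
    trans: List[Dict[str, Dict[str, Any]]] = []
    if rename_columns is not None:
        for old_name, new_name in rename_columns.items():
            trans.append({"RenameColumnOperation": {"ColumnName": old_name, "NewColumnName": new_name}})
    if cast_columns_types is not None:
        for col_name, col_type in cast_columns_types.items():
            trans.append({"CastColumnTypeOperation": {"ColumnName": col_name, "NewColumnType": col_type}})
    return trans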
Example no. 22
0
def to_sql(df: pd.DataFrame, con: sqlalchemy.engine.Engine,
           **pandas_kwargs: Any) -> None:
    """Write records stored in a DataFrame to a SQL database.

    Support for **Redshift**, **PostgreSQL** and **MySQL**.

    Support for all pandas to_sql() arguments:
    https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_sql.html

    Note
    ----
    Redshift: For large DataFrames (1MM+ rows) consider the function **wr.db.copy_to_redshift()**.

    Note
    ----
    Redshift: `index=False` will be forced.

    Parameters
    ----------
    df : pandas.DataFrame
        Pandas DataFrame https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html
    con : sqlalchemy.engine.Engine
        SQLAlchemy Engine. Please use,
        wr.db.get_engine(), wr.db.get_redshift_temp_engine() or wr.catalog.get_engine()
    pandas_kwargs
        KEYWORD arguments forwarded to pandas.DataFrame.to_sql(). You can NOT pass `pandas_kwargs` explicitly, just add
        valid Pandas arguments in the function call and Wrangler will accept them.
        e.g. wr.db.to_sql(df, con=con, name="table_name", schema="schema_name", if_exists="replace", index=False)
        https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_sql.html

    Returns
    -------
    None
        None.

    Examples
    --------
    Writing to Redshift with temporary credentials

    >>> import awswrangler as wr
    >>> import pandas as pd
    >>> wr.db.to_sql(
    ...     df=pd.DataFrame({'col': [1, 2, 3]}),
    ...     con=wr.db.get_redshift_temp_engine(cluster_identifier="...", user="******"),
    ...     name="table_name",
    ...     schema="schema_name"
    ... )

    Writing to Redshift with temporary credentials and using pandas_kwargs

    >>> import awswrangler as wr
    >>> import pandas as pd
    >>> wr.db.to_sql(
    ...     df=pd.DataFrame({'col': [1, 2, 3]}),
    ...     con=wr.db.get_redshift_temp_engine(cluster_identifier="...", user="******"),
    ...     name="table_name",
    ...     schema="schema_name",
    ...     if_exists="replace",
    ...     index=False,
    ... )

    Writing to Redshift from Glue Catalog Connections

    >>> import awswrangler as wr
    >>> import pandas as pd
    >>> wr.db.to_sql(
    ...     df=pd.DataFrame({'col': [1, 2, 3]}),
    ...     con=wr.catalog.get_engine(connection="..."),
    ...     name="table_name",
    ...     schema="schema_name"
    ... )

    """
    if "pandas_kwargs" in pandas_kwargs:
        raise exceptions.InvalidArgument(
            "You can NOT pass `pandas_kwargs` explicit, just add valid "
            "Pandas arguments in the function call and Wrangler will accept it."
            "e.g. wr.db.to_sql(df, con, name='...', schema='...', if_exists='replace')"
        )
    if df.empty is True:
        raise exceptions.EmptyDataFrame()
    if not isinstance(con, sqlalchemy.engine.Engine):
        raise exceptions.InvalidConnection(
            "Invalid 'con' argument, please pass a "
            "SQLAlchemy Engine. Use wr.db.get_engine(), "
            "wr.db.get_redshift_temp_engine() or wr.catalog.get_engine()")
    if "dtype" in pandas_kwargs:
        cast_columns: Dict[str, VisitableType] = pandas_kwargs["dtype"]
    else:
        cast_columns = {}
    dtypes: Dict[str,
                 VisitableType] = _data_types.sqlalchemy_types_from_pandas(
                     df=df, db_type=con.name, dtype=cast_columns)
    pandas_kwargs["dtype"] = dtypes
    pandas_kwargs["con"] = con
    if pandas_kwargs["con"].name.lower(
    ) == "redshift":  # Redshift does not accept index
        pandas_kwargs["index"] = False
    _utils.try_it(f=df.to_sql,
                  ex=sqlalchemy.exc.InternalError,
                  **pandas_kwargs)
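The retry helper `_utils.try_it` is not shown. A minimal sketch under assumed semantics (retry the callable when the given exception type is raised, with simple exponential backoff); the real helper may differ:

import time
from typing import Any, Callable, Type, TypeVar

T = TypeVar("T")


def try_it_sketch(f: Callable[..., T], ex: Type[Exception], attempts: int = 3, **kwargs: Any) -> T:
    delay: float = 1.0
    for attempt in range(attempts):
        try:
            return f(**kwargs)
        except ex:
            if attempt == attempts - 1:
                raise
            time.sleep(delay)
            delay *= 2.0
    raise RuntimeError("unreachable")  # keeps type checkers satisfied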
Example no. 23
0
def to_csv(  # pylint: disable=too-many-arguments,too-many-locals
    df: pd.DataFrame,
    path: str,
    sep: str = ",",
    index: bool = True,
    columns: Optional[List[str]] = None,
    use_threads: bool = True,
    boto3_session: Optional[boto3.Session] = None,
    s3_additional_kwargs: Optional[Dict[str, Any]] = None,
    sanitize_columns: bool = False,
    dataset: bool = False,
    partition_cols: Optional[List[str]] = None,
    concurrent_partitioning: bool = False,
    mode: Optional[str] = None,
    catalog_versioning: bool = False,
    database: Optional[str] = None,
    table: Optional[str] = None,
    dtype: Optional[Dict[str, str]] = None,
    description: Optional[str] = None,
    parameters: Optional[Dict[str, str]] = None,
    columns_comments: Optional[Dict[str, str]] = None,
    regular_partitions: bool = True,
    projection_enabled: bool = False,
    projection_types: Optional[Dict[str, str]] = None,
    projection_ranges: Optional[Dict[str, str]] = None,
    projection_values: Optional[Dict[str, str]] = None,
    projection_intervals: Optional[Dict[str, str]] = None,
    projection_digits: Optional[Dict[str, str]] = None,
    catalog_id: Optional[str] = None,
    **pandas_kwargs: Any,
) -> Dict[str, Union[List[str], Dict[str, List[str]]]]:
    """Write CSV file or dataset on Amazon S3.

    The concept of Dataset goes beyond the simple idea of ordinary files and enables more
    complex features like partitioning and catalog integration (Amazon Athena/AWS Glue Catalog).

    Note
    ----
    If `database` and `table` arguments are passed, the table name and all column names
    will be automatically sanitized using `wr.catalog.sanitize_table_name` and `wr.catalog.sanitize_column_name`.
    Please pass `sanitize_columns=True` to always enforce this behaviour.

    Note
    ----
    If `dataset=True`, `pandas_kwargs` will be ignored due to the restrictive
    quoting, date_format, escapechar, encoding, etc. required by Athena/Glue Catalog.

    Note
    ----
    Pandas does not currently support in-memory CSV compression
    (https://github.com/pandas-dev/pandas/issues/22555),
    so `compression` is not supported by Wrangler either.

    Note
    ----
    On `append` mode, the `parameters` will be upserted on an existing table.

    Note
    ----
    If `use_threads=True`, the number of threads spawned will be obtained from os.cpu_count().

    Parameters
    ----------
    df: pandas.DataFrame
        Pandas DataFrame https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html
    path : str
        Amazon S3 path (e.g. s3://bucket/filename.csv).
    sep : str
        String of length 1. Field delimiter for the output file.
    index : bool
        Write row names (index).
    columns : List[str], optional
        Columns to write.
    use_threads : bool
        True to enable concurrent requests, False to disable multiple threads.
        If enabled os.cpu_count() will be used as the max number of threads.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 Session will be used if boto3_session receives None.
    s3_additional_kwargs : Optional[Dict[str, Any]]
        Forward to botocore requests. Valid parameters: "ACL", "Metadata", "ServerSideEncryption", "StorageClass",
        "SSECustomerAlgorithm", "SSECustomerKey", "SSEKMSKeyId", "SSEKMSEncryptionContext", "Tagging".
        e.g. s3_additional_kwargs={'ServerSideEncryption': 'aws:kms', 'SSEKMSKeyId': 'YOUR_KMS_KEY_ARN'}
    sanitize_columns : bool
        True to sanitize column names or False to keep them as-is.
        True is forced if `dataset=True`.
    dataset : bool
        If True, store a CSV dataset instead of ordinary file(s).
        If True, enables the following arguments:
        partition_cols, mode, database, table, description, parameters, columns_comments, concurrent_partitioning,
        catalog_versioning, projection_enabled, projection_types, projection_ranges, projection_values,
        projection_intervals, projection_digits, catalog_id, schema_evolution.
    partition_cols: List[str], optional
        List of column names that will be used to create partitions. Only takes effect if dataset=True.
    concurrent_partitioning: bool
        If True will increase the parallelism level during the partitions writing. It will decrease the
        writing time and increase the memory usage.
        https://github.com/awslabs/aws-data-wrangler/blob/master/tutorials/022%20-%20Writing%20Partitions%20Concurrently.ipynb
    mode : str, optional
        ``append`` (Default), ``overwrite``, ``overwrite_partitions``. Only takes effect if dataset=True.
        For details check the related tutorial:
        https://aws-data-wrangler.readthedocs.io/en/stable/stubs/awswrangler.s3.to_parquet.html#awswrangler.s3.to_parquet
    catalog_versioning : bool
        If True and `mode="overwrite"`, creates an archived version of the table catalog before updating it.
    database : str, optional
        Glue/Athena catalog: Database name.
    table : str, optional
        Glue/Athena catalog: Table name.
    dtype : Dict[str, str], optional
        Dictionary of columns names and Athena/Glue types to be casted.
        Useful when you have columns with undetermined or mixed data types.
        (e.g. {'col name': 'bigint', 'col2 name': 'int'})
    description : str, optional
        Glue/Athena catalog: Table description
    parameters : Dict[str, str], optional
        Glue/Athena catalog: Key/value pairs to tag the table.
    columns_comments : Dict[str, str], optional
        Glue/Athena catalog:
        Columns names and the related comments (e.g. {'col0': 'Column 0.', 'col1': 'Column 1.', 'col2': 'Partition.'}).
    regular_partitions : bool
        Create regular partitions (Non projected partitions) on Glue Catalog.
        Disable when you will work only with Partition Projection.
        Keeping it enabled even when working with projections is useful to keep
        Redshift Spectrum working with the regular partitions.
    projection_enabled : bool
        Enable Partition Projection on Athena (https://docs.aws.amazon.com/athena/latest/ug/partition-projection.html)
    projection_types : Optional[Dict[str, str]]
        Dictionary of partitions names and Athena projections types.
        Valid types: "enum", "integer", "date", "injected"
        https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html
        (e.g. {'col_name': 'enum', 'col2_name': 'integer'})
    projection_ranges: Optional[Dict[str, str]]
        Dictionary of partitions names and Athena projections ranges.
        https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html
        (e.g. {'col_name': '0,10', 'col2_name': '-1,8675309'})
    projection_values: Optional[Dict[str, str]]
        Dictionary of partitions names and Athena projections values.
        https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html
        (e.g. {'col_name': 'A,B,Unknown', 'col2_name': 'foo,boo,bar'})
    projection_intervals: Optional[Dict[str, str]]
        Dictionary of partitions names and Athena projections intervals.
        https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html
        (e.g. {'col_name': '1', 'col2_name': '5'})
    projection_digits: Optional[Dict[str, str]]
        Dictionary of partitions names and Athena projections digits.
        https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html
        (e.g. {'col_name': '1', 'col2_name': '2'})
    catalog_id : str, optional
        The ID of the Data Catalog from which to retrieve Databases.
        If none is provided, the AWS account ID is used by default.
    pandas_kwargs :
        KEYWORD arguments forwarded to pandas.DataFrame.to_csv(). You can NOT pass `pandas_kwargs` explicitly, just add
        valid Pandas arguments in the function call and Wrangler will accept them.
        e.g. wr.s3.to_csv(df, path, sep='|', na_rep='NULL', decimal=',')
        https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_csv.html

    Returns
    -------
    Dict[str, Union[List[str], Dict[str, List[str]]]]
        Dictionary with:
        'paths': List of all stored files paths on S3.
        'partitions_values': Dictionary of partitions added with keys as S3 path locations
        and values as a list of partitions values as str.

    Examples
    --------
    Writing single file

    >>> import awswrangler as wr
    >>> import pandas as pd
    >>> wr.s3.to_csv(
    ...     df=pd.DataFrame({'col': [1, 2, 3]}),
    ...     path='s3://bucket/prefix/my_file.csv',
    ... )
    {
        'paths': ['s3://bucket/prefix/my_file.csv'],
        'partitions_values': {}
    }

    Writing single file with pandas_kwargs

    >>> import awswrangler as wr
    >>> import pandas as pd
    >>> wr.s3.to_csv(
    ...     df=pd.DataFrame({'col': [1, 2, 3]}),
    ...     path='s3://bucket/prefix/my_file.csv',
    ...     sep='|',
    ...     na_rep='NULL',
    ...     decimal=','
    ... )
    {
        'paths': ['s3://bucket/prefix/my_file.csv'],
        'partitions_values': {}
    }

    Writing single file encrypted with a KMS key

    >>> import awswrangler as wr
    >>> import pandas as pd
    >>> wr.s3.to_csv(
    ...     df=pd.DataFrame({'col': [1, 2, 3]}),
    ...     path='s3://bucket/prefix/my_file.csv',
    ...     s3_additional_kwargs={
    ...         'ServerSideEncryption': 'aws:kms',
    ...         'SSEKMSKeyId': 'YOUR_KMS_KEY_ARN'
    ...     }
    ... )
    {
        'paths': ['s3://bucket/prefix/my_file.csv'],
        'partitions_values': {}
    }

    Writing partitioned dataset

    >>> import awswrangler as wr
    >>> import pandas as pd
    >>> wr.s3.to_csv(
    ...     df=pd.DataFrame({
    ...         'col': [1, 2, 3],
    ...         'col2': ['A', 'A', 'B']
    ...     }),
    ...     path='s3://bucket/prefix',
    ...     dataset=True,
    ...     partition_cols=['col2']
    ... )
    {
        'paths': ['s3://.../col2=A/x.csv', 's3://.../col2=B/y.csv'],
        'partitions_values': {
            's3://.../col2=A/': ['A'],
            's3://.../col2=B/': ['B']
        }
    }

    Writing dataset to S3 with metadata on Athena/Glue Catalog.

    >>> import awswrangler as wr
    >>> import pandas as pd
    >>> wr.s3.to_csv(
    ...     df=pd.DataFrame({
    ...         'col': [1, 2, 3],
    ...         'col2': ['A', 'A', 'B']
    ...     }),
    ...     path='s3://bucket/prefix',
    ...     dataset=True,
    ...     partition_cols=['col2'],
    ...     database='default',  # Athena/Glue database
    ...     table='my_table'  # Athena/Glue table
    ... )
    {
        'paths': ['s3://.../col2=A/x.csv', 's3://.../col2=B/y.csv'],
        'partitions_values': {
            's3://.../col2=A/': ['A'],
            's3://.../col2=B/': ['B']
        }
    }

    Writing dataset casting empty column data type

    >>> import awswrangler as wr
    >>> import pandas as pd
    >>> wr.s3.to_csv(
    ...     df=pd.DataFrame({
    ...         'col': [1, 2, 3],
    ...         'col2': ['A', 'A', 'B'],
    ...         'col3': [None, None, None]
    ...     }),
    ...     path='s3://bucket/prefix',
    ...     dataset=True,
    ...     database='default',  # Athena/Glue database
    ...     table='my_table',  # Athena/Glue table
    ...     dtype={'col3': 'date'}
    ... )
    {
        'paths': ['s3://.../x.csv'],
        'partitions_values': {}
    }

    """
    if "pandas_kwargs" in pandas_kwargs:
        raise exceptions.InvalidArgument(
            "You can NOT pass `pandas_kwargs` explicit, just add valid "
            "Pandas arguments in the function call and Wrangler will accept it."
            "e.g. wr.s3.to_csv(df, path, sep='|', na_rep='NULL', decimal=',')")
    _validate_args(
        df=df,
        table=table,
        database=database,
        dataset=dataset,
        path=path,
        partition_cols=partition_cols,
        mode=mode,
        description=description,
        parameters=parameters,
        columns_comments=columns_comments,
    )

    # Initializing defaults
    partition_cols = partition_cols if partition_cols else []
    dtype = dtype if dtype else {}
    partitions_values: Dict[str, List[str]] = {}
    mode = "append" if mode is None else mode
    session: boto3.Session = _utils.ensure_session(session=boto3_session)

    # Sanitize table to respect Athena's standards
    if (sanitize_columns is True) or (database is not None
                                      and table is not None):
        df, dtype, partition_cols = _sanitize(df=df,
                                              dtype=dtype,
                                              partition_cols=partition_cols)

    # Evaluating dtype
    catalog_table_input: Optional[Dict[str, Any]] = None
    if database is not None and table is not None:
        catalog_table_input = catalog._get_table_input(  # pylint: disable=protected-access
            database=database,
            table=table,
            boto3_session=session,
            catalog_id=catalog_id)
    df = _apply_dtype(df=df,
                      dtype=dtype,
                      catalog_table_input=catalog_table_input,
                      mode=mode)

    if dataset is False:
        pandas_kwargs["sep"] = sep
        pandas_kwargs["index"] = index
        pandas_kwargs["columns"] = columns
        _to_text(
            file_format="csv",
            df=df,
            use_threads=use_threads,
            path=path,
            boto3_session=session,
            s3_additional_kwargs=s3_additional_kwargs,
            **pandas_kwargs,
        )
        paths = [path]
    else:
        df = df[columns] if columns else df
        paths, partitions_values = _to_dataset(
            func=_to_text,
            concurrent_partitioning=concurrent_partitioning,
            df=df,
            path_root=path,
            index=index,
            sep=sep,
            use_threads=use_threads,
            partition_cols=partition_cols,
            mode=mode,
            boto3_session=session,
            s3_additional_kwargs=s3_additional_kwargs,
            file_format="csv",
            quoting=csv.QUOTE_NONE,
            escapechar="\\",
            header=False,
            date_format="%Y-%m-%d %H:%M:%S.%f",
        )
        if (database is not None) and (table is not None):
            try:
                columns_types, partitions_types = _data_types.athena_types_from_pandas_partitioned(
                    df=df,
                    index=index,
                    partition_cols=partition_cols,
                    dtype=dtype,
                    index_left=True)
                catalog._create_csv_table(  # pylint: disable=protected-access
                    database=database,
                    table=table,
                    path=path,
                    columns_types=columns_types,
                    partitions_types=partitions_types,
                    description=description,
                    parameters=parameters,
                    columns_comments=columns_comments,
                    boto3_session=session,
                    mode=mode,
                    catalog_versioning=catalog_versioning,
                    sep=sep,
                    projection_enabled=projection_enabled,
                    projection_types=projection_types,
                    projection_ranges=projection_ranges,
                    projection_values=projection_values,
                    projection_intervals=projection_intervals,
                    projection_digits=projection_digits,
                    catalog_table_input=catalog_table_input,
                    catalog_id=catalog_id,
                    compression=None,
                    skip_header_line_count=None,
                )
                if partitions_values and (regular_partitions is True):
                    _logger.debug("partitions_values:\n%s", partitions_values)
                    catalog.add_csv_partitions(
                        database=database,
                        table=table,
                        partitions_values=partitions_values,
                        boto3_session=session,
                        sep=sep,
                        catalog_id=catalog_id,
                        columns_types=columns_types,
                    )
            except Exception:
                _logger.debug(
                    "Catalog write failed, cleaning up S3 (paths: %s).", paths)
                delete_objects(path=paths,
                               use_threads=use_threads,
                               boto3_session=session)
                raise
    return {"paths": paths, "partitions_values": partitions_values}
Example no. 24
0
def to_json(
    df: pd.DataFrame,
    path: str,
    boto3_session: Optional[boto3.Session] = None,
    s3_additional_kwargs: Optional[Dict[str, Any]] = None,
    use_threads: bool = True,
    **pandas_kwargs: Any,
) -> None:
    """Write JSON file on Amazon S3.

    Note
    ----
    If `use_threads=True`, the number of threads spawned will be obtained from os.cpu_count().

    Parameters
    ----------
    df: pandas.DataFrame
        Pandas DataFrame https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html
    path : str
        Amazon S3 path (e.g. s3://bucket/filename.json).
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 Session will be used if boto3_session receives None.
    s3_additional_kwargs : Optional[Dict[str, Any]]
        Forward to botocore requests. Valid parameters: "ACL", "Metadata", "ServerSideEncryption", "StorageClass",
        "SSECustomerAlgorithm", "SSECustomerKey", "SSEKMSKeyId", "SSEKMSEncryptionContext", "Tagging".
        e.g. s3_additional_kwargs={'ServerSideEncryption': 'aws:kms', 'SSEKMSKeyId': 'YOUR_KMS_KEY_ARN'}
    use_threads : bool
        True to enable concurrent requests, False to disable multiple threads.
        If enabled os.cpu_count() will be used as the max number of threads.
    pandas_kwargs:
        KEYWORD arguments forwarded to pandas.DataFrame.to_json(). You can NOT pass `pandas_kwargs` explicitly, just add
        valid Pandas arguments in the function call and Wrangler will accept them.
        e.g. wr.s3.to_json(df, path, lines=True, date_format='iso')
        https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_json.html

    Returns
    -------
    None
        None.

    Examples
    --------
    Writing JSON file

    >>> import awswrangler as wr
    >>> import pandas as pd
    >>> wr.s3.to_json(
    ...     df=pd.DataFrame({'col': [1, 2, 3]}),
    ...     path='s3://bucket/filename.json',
    ... )

    Writing JSON file using pandas_kwargs

    >>> import awswrangler as wr
    >>> import pandas as pd
    >>> wr.s3.to_json(
    ...     df=pd.DataFrame({'col': [1, 2, 3]}),
    ...     path='s3://bucket/filename.json',
    ...     lines=True,
    ...     date_format='iso'
    ... )

    Writing JSON file encrypted with a KMS key

    >>> import awswrangler as wr
    >>> import pandas as pd
    >>> wr.s3.to_json(
    ...     df=pd.DataFrame({'col': [1, 2, 3]}),
    ...     path='s3://bucket/filename.json',
    ...     s3_additional_kwargs={
    ...         'ServerSideEncryption': 'aws:kms',
    ...         'SSEKMSKeyId': 'YOUR_KMS_KEY_ARN'
    ...     }
    ... )

    """
    if "pandas_kwargs" in pandas_kwargs:
        raise exceptions.InvalidArgument(
            "You can NOT pass `pandas_kwargs` explicit, just add valid "
            "Pandas arguments in the function call and Wrangler will accept it."
            "e.g. wr.s3.to_json(df, path, lines=True, date_format='iso')")
    _to_text(
        file_format="json",
        df=df,
        path=path,
        use_threads=use_threads,
        boto3_session=boto3_session,
        s3_additional_kwargs=s3_additional_kwargs,
        **pandas_kwargs,
    )
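
A minimal round-trip sketch for the function above (the bucket and key are hypothetical): write JSON Lines with wr.s3.to_json and load the file back with wr.s3.read_json, which forwards the same pandas keyword arguments.

import awswrangler as wr
import pandas as pd

path = "s3://my-bucket/data/sample.json"  # hypothetical location

# One JSON object per line (JSON Lines) keeps the file easy to stream and query.
wr.s3.to_json(df=pd.DataFrame({"col": [1, 2, 3]}), path=path, orient="records", lines=True)
df_back = wr.s3.read_json(path=path, lines=True)
print(df_back)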
Example 25
def _read_parquet_chunked(  # pylint: disable=too-many-branches
    paths: List[str],
    chunked: Union[bool, int],
    validate_schema: bool,
    ignore_index: Optional[bool],
    columns: Optional[List[str]],
    categories: Optional[List[str]],
    safe: bool,
    map_types: bool,
    boto3_session: boto3.Session,
    dataset: bool,
    path_root: Optional[str],
    s3_additional_kwargs: Optional[Dict[str, str]],
    use_threads: Union[bool, int],
) -> Iterator[pd.DataFrame]:
    next_slice: Optional[pd.DataFrame] = None
    last_schema: Optional[Dict[str, str]] = None
    last_path: str = ""
    for path in paths:
        with open_s3_object(
                path=path,
                mode="rb",
                use_threads=use_threads,
                s3_block_size=10_485_760,  # 10 MB (10 * 2**20)
                s3_additional_kwargs=s3_additional_kwargs,
                boto3_session=boto3_session,
        ) as f:
            pq_file: Optional[
                pyarrow.parquet.ParquetFile] = _pyarrow_parquet_file_wrapper(
                    source=f, read_dictionary=categories)
            if pq_file is None:
                continue
            if validate_schema is True:
                schema: Dict[
                    str, str] = _data_types.athena_types_from_pyarrow_schema(
                        schema=pq_file.schema.to_arrow_schema(),
                        partitions=None)[0]
                if last_schema is not None:
                    if schema != last_schema:
                        raise exceptions.InvalidSchemaConvergence(
                            f"Was detect at least 2 different schemas:\n"
                            f"    - {last_path} -> {last_schema}\n"
                            f"    - {path} -> {schema}")
                last_schema = schema
                last_path = path
            num_row_groups: int = pq_file.num_row_groups
            _logger.debug("num_row_groups: %s", num_row_groups)
            use_threads_flag: bool = use_threads if isinstance(
                use_threads, bool) else bool(use_threads > 1)
            # iter_batches is only available for pyarrow >= 3.0.0
            if callable(getattr(pq_file, "iter_batches", None)):
                chunk_generator = _pyarrow_chunk_generator(
                    pq_file=pq_file,
                    chunked=chunked,
                    columns=columns,
                    use_threads_flag=use_threads_flag)
            else:
                chunk_generator = _row_group_chunk_generator(
                    pq_file=pq_file,
                    columns=columns,
                    use_threads_flag=use_threads_flag,
                    num_row_groups=num_row_groups)

            for chunk in chunk_generator:
                df: pd.DataFrame = _arrowtable2df(
                    table=chunk,
                    categories=categories,
                    safe=safe,
                    map_types=map_types,
                    use_threads=use_threads,
                    dataset=dataset,
                    path=path,
                    path_root=path_root,
                )
                if chunked is True:
                    yield df
                elif isinstance(chunked, int) and chunked > 0:
                    if next_slice is not None:
                        df = _union(dfs=[next_slice, df],
                                    ignore_index=ignore_index)
                    while len(df.index) >= chunked:
                        yield df.iloc[:chunked, :].copy()
                        df = df.iloc[chunked:, :]
                    if df.empty:
                        next_slice = None
                    else:
                        next_slice = df
                else:
                    raise exceptions.InvalidArgument(f"chunked: {chunked}")
    if next_slice is not None:
        yield next_slice
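
The chunked=int branch above slices fixed-size DataFrames out of a stream of row-group batches, carrying any remainder forward between files and emitting the final partial chunk at the end. A standalone sketch of that re-chunking logic (independent of S3/pyarrow; the helper name is made up):

from typing import Iterator, Optional

import pandas as pd


def rechunk(frames: Iterator[pd.DataFrame], size: int) -> Iterator[pd.DataFrame]:
    # Yield DataFrames of exactly `size` rows, carrying the remainder between
    # input frames, and emit the final partial chunk at the end.
    leftover: Optional[pd.DataFrame] = None
    for frame in frames:
        if leftover is not None:
            frame = pd.concat([leftover, frame], ignore_index=True)
        while len(frame.index) >= size:
            yield frame.iloc[:size, :].copy()
            frame = frame.iloc[size:, :]
        leftover = None if frame.empty else frame
    if leftover is not None:
        yield leftover


# Three 3-row frames re-chunked into 4-row pieces -> chunks of 4, 4 and 1 rows.
parts = (pd.DataFrame({"x": range(i, i + 3)}) for i in (0, 3, 6))
for chunk in rechunk(parts, size=4):
    print(len(chunk.index))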
Example 26
def _create_table(  # pylint: disable=too-many-branches,too-many-statements
    database: str,
    table: str,
    description: Optional[str],
    parameters: Optional[Dict[str, str]],
    mode: str,
    catalog_versioning: bool,
    boto3_session: Optional[boto3.Session],
    table_input: Dict[str, Any],
    table_exist: bool,
    projection_enabled: bool,
    partitions_types: Optional[Dict[str, str]],
    columns_comments: Optional[Dict[str, str]],
    projection_types: Optional[Dict[str, str]],
    projection_ranges: Optional[Dict[str, str]],
    projection_values: Optional[Dict[str, str]],
    projection_intervals: Optional[Dict[str, str]],
    projection_digits: Optional[Dict[str, str]],
    catalog_id: Optional[str],
) -> None:
    # Description
    mode = _update_if_necessary(dic=table_input,
                                key="Description",
                                value=description,
                                mode=mode)

    # Parameters
    parameters = parameters if parameters else {}
    for k, v in parameters.items():
        mode = _update_if_necessary(dic=table_input["Parameters"],
                                    key=k,
                                    value=v,
                                    mode=mode)

    # Projection
    if projection_enabled is True:
        table_input["Parameters"]["projection.enabled"] = "true"
        partitions_types = partitions_types if partitions_types else {}
        projection_types = projection_types if projection_types else {}
        projection_ranges = projection_ranges if projection_ranges else {}
        projection_values = projection_values if projection_values else {}
        projection_intervals = projection_intervals if projection_intervals else {}
        projection_digits = projection_digits if projection_digits else {}
        projection_types = {
            sanitize_column_name(k): v
            for k, v in projection_types.items()
        }
        projection_ranges = {
            sanitize_column_name(k): v
            for k, v in projection_ranges.items()
        }
        projection_values = {
            sanitize_column_name(k): v
            for k, v in projection_values.items()
        }
        projection_intervals = {
            sanitize_column_name(k): v
            for k, v in projection_intervals.items()
        }
        projection_digits = {
            sanitize_column_name(k): v
            for k, v in projection_digits.items()
        }
        for k, v in projection_types.items():
            dtype: Optional[str] = partitions_types.get(k)
            if dtype is None:
                raise exceptions.InvalidArgumentCombination(
                    f"Column {k} appears as projected column but not as partitioned column."
                )
            if dtype == "date":
                table_input["Parameters"][
                    f"projection.{k}.format"] = "yyyy-MM-dd"
            elif dtype == "timestamp":
                table_input["Parameters"][
                    f"projection.{k}.format"] = "yyyy-MM-dd HH:mm:ss"
                table_input["Parameters"][
                    f"projection.{k}.interval.unit"] = "SECONDS"
                table_input["Parameters"][f"projection.{k}.interval"] = "1"
        for k, v in projection_types.items():
            mode = _update_if_necessary(dic=table_input["Parameters"],
                                        key=f"projection.{k}.type",
                                        value=v,
                                        mode=mode)
        for k, v in projection_ranges.items():
            mode = _update_if_necessary(dic=table_input["Parameters"],
                                        key=f"projection.{k}.range",
                                        value=v,
                                        mode=mode)
        for k, v in projection_values.items():
            mode = _update_if_necessary(dic=table_input["Parameters"],
                                        key=f"projection.{k}.values",
                                        value=v,
                                        mode=mode)
        for k, v in projection_intervals.items():
            mode = _update_if_necessary(dic=table_input["Parameters"],
                                        key=f"projection.{k}.interval",
                                        value=str(v),
                                        mode=mode)
        for k, v in projection_digits.items():
            mode = _update_if_necessary(dic=table_input["Parameters"],
                                        key=f"projection.{k}.digits",
                                        value=str(v),
                                        mode=mode)
    else:
        table_input["Parameters"]["projection.enabled"] = "false"

    # Column comments
    columns_comments = columns_comments if columns_comments else {}
    columns_comments = {
        sanitize_column_name(k): v
        for k, v in columns_comments.items()
    }
    if columns_comments:
        for col in table_input["StorageDescriptor"]["Columns"]:
            name: str = col["Name"]
            if name in columns_comments:
                mode = _update_if_necessary(dic=col,
                                            key="Comment",
                                            value=columns_comments[name],
                                            mode=mode)
        for par in table_input["PartitionKeys"]:
            name = par["Name"]
            if name in columns_comments:
                mode = _update_if_necessary(dic=par,
                                            key="Comment",
                                            value=columns_comments[name],
                                            mode=mode)

    _logger.debug("table_input: %s", table_input)

    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    client_glue: boto3.client = _utils.client(service_name="glue",
                                              session=session)
    skip_archive: bool = not catalog_versioning
    if mode not in ("overwrite", "append", "overwrite_partitions", "update"):
        raise exceptions.InvalidArgument(
            f"{mode} is not a valid mode. It must be 'overwrite', 'append' or 'overwrite_partitions'."
        )
    if table_exist is True and mode == "overwrite":
        delete_all_partitions(table=table,
                              database=database,
                              catalog_id=catalog_id,
                              boto3_session=session)
        _logger.debug("Updating table (%s)...", mode)
        client_glue.update_table(**_catalog_id(catalog_id=catalog_id,
                                               DatabaseName=database,
                                               TableInput=table_input,
                                               SkipArchive=skip_archive))
    elif (table_exist is True) and (mode in ("append", "overwrite_partitions",
                                             "update")):
        if mode == "update":
            _logger.debug("Updating table (%s)...", mode)
            client_glue.update_table(**_catalog_id(catalog_id=catalog_id,
                                                   DatabaseName=database,
                                                   TableInput=table_input,
                                                   SkipArchive=skip_archive))
    elif table_exist is False:
        try:
            _logger.debug("Creating table (%s)...", mode)
            client_glue.create_table(**_catalog_id(catalog_id=catalog_id,
                                                   DatabaseName=database,
                                                   TableInput=table_input))
        except client_glue.exceptions.AlreadyExistsException:
            if mode == "overwrite":
                _utils.try_it(
                    f=_overwrite_table,
                    ex=client_glue.exceptions.AlreadyExistsException,
                    client_glue=client_glue,
                    catalog_id=catalog_id,
                    database=database,
                    table=table,
                    table_input=table_input,
                    boto3_session=boto3_session,
                )
    _logger.debug("Leaving table as is (%s)...", mode)
Example 27
def to_excel(
    df: pd.DataFrame,
    path: str,
    boto3_session: Optional[boto3.Session] = None,
    s3_additional_kwargs: Optional[Dict[str, Any]] = None,
    use_threads: Union[bool, int] = True,
    **pandas_kwargs: Any,
) -> str:
    """Write EXCEL file on Amazon S3.

    Note
    ----
    This function accepts any pandas.DataFrame.to_excel() argument.
    https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_excel.html

    Note
    ----
    Depending on the file extension ('xlsx', 'xls', 'odf'...), an additional library
    might have to be installed first (e.g. openpyxl for 'xlsx').

    Note
    ----
    In case of `use_threads=True` the number of threads
    that will be spawned will be obtained from os.cpu_count().

    Parameters
    ----------
    df: pandas.DataFrame
        Pandas DataFrame https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html
    path : str
        Amazon S3 path (e.g. s3://bucket/filename.xlsx).
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 Session will be used if boto3_session receives None.
    s3_additional_kwargs : Optional[Dict[str, Any]]
        Forwarded to botocore requests.
        e.g. s3_additional_kwargs={'ServerSideEncryption': 'aws:kms', 'SSEKMSKeyId': 'YOUR_KMS_KEY_ARN'}
    use_threads : bool, int
        True to enable concurrent requests, False to disable multiple threads.
        If enabled os.cpu_count() will be used as the max number of threads.
        If integer is provided, specified number is used.
    pandas_kwargs:
        KEYWORD arguments forwarded to pandas.DataFrame.to_excel(). You can NOT pass `pandas_kwargs` explicitly, just add
        valid Pandas arguments in the function call and Wrangler will accept them.
        e.g. wr.s3.to_excel(df, path, na_rep="", index=False)
        https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_excel.html

    Returns
    -------
    str
        Written S3 path.

    Examples
    --------
    Writing EXCEL file

    >>> import awswrangler as wr
    >>> import pandas as pd
    >>> wr.s3.to_excel(df, 's3://bucket/filename.xlsx')

    """
    if "pandas_kwargs" in pandas_kwargs:
        raise exceptions.InvalidArgument(
            "You can NOT pass `pandas_kwargs` explicit, just add valid "
            "Pandas arguments in the function call and Wrangler will accept it."
            "e.g. wr.s3.to_excel(df, path, na_rep="
            ", index=False)")
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    with open_s3_object(
            path=path,
            mode="wb",
            use_threads=use_threads,
            s3_additional_kwargs=s3_additional_kwargs,
            boto3_session=session,
    ) as f:
        _logger.debug("pandas_kwargs: %s", pandas_kwargs)
        df.to_excel(f, **pandas_kwargs)
    return path
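
A minimal usage sketch for the function above (bucket and key are hypothetical): writing '.xlsx' assumes openpyxl is available, and the workbook can be read back with wr.s3.read_excel.

import awswrangler as wr
import pandas as pd

path = "s3://my-bucket/reports/report.xlsx"  # hypothetical location

# Write the sheet without the DataFrame index, then load it back.
wr.s3.to_excel(pd.DataFrame({"col": [1, 2, 3]}), path, index=False)
df_back = wr.s3.read_excel(path)
print(df_back)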
Example 28
def to_json(  # pylint: disable=too-many-arguments,too-many-locals,too-many-statements,too-many-branches
    df: pd.DataFrame,
    path: Optional[str] = None,
    index: bool = True,
    columns: Optional[List[str]] = None,
    use_threads: Union[bool, int] = True,
    boto3_session: Optional[boto3.Session] = None,
    s3_additional_kwargs: Optional[Dict[str, Any]] = None,
    sanitize_columns: bool = False,
    dataset: bool = False,
    filename_prefix: Optional[str] = None,
    partition_cols: Optional[List[str]] = None,
    bucketing_info: Optional[Tuple[List[str], int]] = None,
    concurrent_partitioning: bool = False,
    mode: Optional[str] = None,
    catalog_versioning: bool = False,
    schema_evolution: bool = True,
    database: Optional[str] = None,
    table: Optional[str] = None,
    table_type: Optional[str] = None,
    transaction_id: Optional[str] = None,
    dtype: Optional[Dict[str, str]] = None,
    description: Optional[str] = None,
    parameters: Optional[Dict[str, str]] = None,
    columns_comments: Optional[Dict[str, str]] = None,
    regular_partitions: bool = True,
    projection_enabled: bool = False,
    projection_types: Optional[Dict[str, str]] = None,
    projection_ranges: Optional[Dict[str, str]] = None,
    projection_values: Optional[Dict[str, str]] = None,
    projection_intervals: Optional[Dict[str, str]] = None,
    projection_digits: Optional[Dict[str, str]] = None,
    catalog_id: Optional[str] = None,
    **pandas_kwargs: Any,
) -> Union[List[str], Dict[str, Union[List[str], Dict[str, List[str]]]]]:
    """Write JSON file on Amazon S3.

    Note
    ----
    In case of `use_threads=True` the number of threads
    that will be spawned will be obtained from os.cpu_count().

    Note
    ----
    Compression: The minimum acceptable version to achieve it is Pandas 1.2.0, which requires Python >= 3.7.1.

    Parameters
    ----------
    df: pandas.DataFrame
        Pandas DataFrame https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html
    path : str, optional
        Amazon S3 path (e.g. s3://bucket/filename.json) (for dataset e.g. ``s3://bucket/prefix``).
        Required if dataset=False or when creating a new dataset.
    index : bool
        Write row names (index).
    columns : Optional[List[str]]
        Columns to write.
    use_threads : bool, int
        True to enable concurrent requests, False to disable multiple threads.
        If enabled os.cpu_count() will be used as the max number of threads.
        If integer is provided, specified number is used.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 Session will be used if boto3_session receives None.
    s3_additional_kwargs : Optional[Dict[str, Any]]
        Forwarded to botocore requests.
        e.g. s3_additional_kwargs={'ServerSideEncryption': 'aws:kms', 'SSEKMSKeyId': 'YOUR_KMS_KEY_ARN'}
    sanitize_columns : bool
        True to sanitize column names or False to keep them as-is.
        True value is forced if `dataset=True`.
    dataset : bool
        If True store as a dataset instead of ordinary file(s).
        If True, enables the following arguments:
        partition_cols, mode, database, table, description, parameters, columns_comments, concurrent_partitioning,
        catalog_versioning, projection_enabled, projection_types, projection_ranges, projection_values,
        projection_intervals, projection_digits, catalog_id, schema_evolution.
    filename_prefix: str, optional
        If dataset=True, add a filename prefix to the output files.
    partition_cols: List[str], optional
        List of column names that will be used to create partitions. Only takes effect if dataset=True.
    bucketing_info: Tuple[List[str], int], optional
        Tuple consisting of the column names used for bucketing as the first element and the number of buckets as the
        second element.
        Only `str`, `int` and `bool` are supported as column data types for bucketing.
    concurrent_partitioning: bool
        If True will increase the parallelism level during the partitions writing. It will decrease the
        writing time and increase the memory usage.
        https://aws-data-wrangler.readthedocs.io/en/2.13.0/tutorials/022%20-%20Writing%20Partitions%20Concurrently.html
    mode : str, optional
        ``append`` (Default), ``overwrite``, ``overwrite_partitions``. Only takes effect if dataset=True.
        For details check the related tutorial:
        https://aws-data-wrangler.readthedocs.io/en/2.13.0/stubs/awswrangler.s3.to_parquet.html#awswrangler.s3.to_parquet
    catalog_versioning : bool
        If True and `mode="overwrite"`, creates an archived version of the table catalog before updating it.
    schema_evolution : bool
        If True allows schema evolution (new or missing columns), otherwise an exception will be raised.
        (Only considered if dataset=True and mode in ("append", "overwrite_partitions"))
        Related tutorial:
        https://aws-data-wrangler.readthedocs.io/en/2.13.0/tutorials/014%20-%20Schema%20Evolution.html
    database : str, optional
        Glue/Athena catalog: Database name.
    table : str, optional
        Glue/Athena catalog: Table name.
    table_type: str, optional
        The type of the Glue Table. Set to EXTERNAL_TABLE if None
    transaction_id: str, optional
        The ID of the transaction when writing to a Governed Table.
    dtype : Dict[str, str], optional
        Dictionary of columns names and Athena/Glue types to be casted.
        Useful when you have columns with undetermined or mixed data types.
        (e.g. {'col name': 'bigint', 'col2 name': 'int'})
    description : str, optional
        Glue/Athena catalog: Table description
    parameters : Dict[str, str], optional
        Glue/Athena catalog: Key/value pairs to tag the table.
    columns_comments : Dict[str, str], optional
        Glue/Athena catalog:
        Columns names and the related comments (e.g. {'col0': 'Column 0.', 'col1': 'Column 1.', 'col2': 'Partition.'}).
    regular_partitions : bool
        Create regular partitions (non-projected partitions) on Glue Catalog.
        Disable when you will work only with Partition Projection.
        Keeping it enabled even when working with projections is useful to keep
        Redshift Spectrum working with the regular partitions.
    projection_enabled : bool
        Enable Partition Projection on Athena (https://docs.aws.amazon.com/athena/latest/ug/partition-projection.html)
    projection_types : Optional[Dict[str, str]]
        Dictionary of partitions names and Athena projections types.
        Valid types: "enum", "integer", "date", "injected"
        https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html
        (e.g. {'col_name': 'enum', 'col2_name': 'integer'})
    projection_ranges: Optional[Dict[str, str]]
        Dictionary of partitions names and Athena projections ranges.
        https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html
        (e.g. {'col_name': '0,10', 'col2_name': '-1,8675309'})
    projection_values: Optional[Dict[str, str]]
        Dictionary of partitions names and Athena projections values.
        https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html
        (e.g. {'col_name': 'A,B,Unknown', 'col2_name': 'foo,boo,bar'})
    projection_intervals: Optional[Dict[str, str]]
        Dictionary of partitions names and Athena projections intervals.
        https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html
        (e.g. {'col_name': '1', 'col2_name': '5'})
    projection_digits: Optional[Dict[str, str]]
        Dictionary of partitions names and Athena projections digits.
        https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html
        (e.g. {'col_name': '1', 'col2_name': '2'})
    catalog_id : str, optional
        The ID of the Data Catalog from which to retrieve Databases.
        If none is provided, the AWS account ID is used by default.
    pandas_kwargs:
        KEYWORD arguments forwarded to pandas.DataFrame.to_json(). You can NOT pass `pandas_kwargs` explicitly, just add
        valid Pandas arguments in the function call and Wrangler will accept them.
        e.g. wr.s3.to_json(df, path, lines=True, date_format='iso')
        https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_json.html

    Returns
    -------
    Union[List[str], Dict[str, Union[List[str], Dict[str, List[str]]]]]
        List of written file paths if dataset=False, otherwise a dictionary with
        'paths' and 'partitions_values'.

    Examples
    --------
    Writing JSON file

    >>> import awswrangler as wr
    >>> import pandas as pd
    >>> wr.s3.to_json(
    ...     df=pd.DataFrame({'col': [1, 2, 3]}),
    ...     path='s3://bucket/filename.json',
    ... )

    Writing JSON file using pandas_kwargs

    >>> import awswrangler as wr
    >>> import pandas as pd
    >>> wr.s3.to_json(
    ...     df=pd.DataFrame({'col': [1, 2, 3]}),
    ...     path='s3://bucket/filename.json',
    ...     lines=True,
    ...     date_format='iso'
    ... )

    Writing JSON file encrypted with a KMS key

    >>> import awswrangler as wr
    >>> import pandas as pd
    >>> wr.s3.to_json(
    ...     df=pd.DataFrame({'col': [1, 2, 3]}),
    ...     path='s3://bucket/filename.json',
    ...     s3_additional_kwargs={
    ...         'ServerSideEncryption': 'aws:kms',
    ...         'SSEKMSKeyId': 'YOUR_KMS_KEY_ARN'
    ...     }
    ... )

    """
    if "pandas_kwargs" in pandas_kwargs:
        raise exceptions.InvalidArgument(
            "You can NOT pass `pandas_kwargs` explicit, just add valid "
            "Pandas arguments in the function call and Wrangler will accept it."
            "e.g. wr.s3.to_json(df, path, lines=True, date_format='iso')")
    if pandas_kwargs.get("compression") and str(
            pd.__version__) < LooseVersion("1.2.0"):
        raise exceptions.InvalidArgument(
            f"JSON compression on S3 is not supported for Pandas version {pd.__version__}. "
            "The minimum acceptable version to achive it is Pandas 1.2.0 that requires Python >=3.7.1."
        )

    _validate_args(
        df=df,
        table=table,
        database=database,
        dataset=dataset,
        path=path,
        partition_cols=partition_cols,
        bucketing_info=bucketing_info,
        mode=mode,
        description=description,
        parameters=parameters,
        columns_comments=columns_comments,
    )

    # Initializing defaults
    partition_cols = partition_cols if partition_cols else []
    dtype = dtype if dtype else {}
    partitions_values: Dict[str, List[str]] = {}
    mode = "append" if mode is None else mode
    commit_trans: bool = False
    if transaction_id:
        table_type = "GOVERNED"
    filename_prefix = filename_prefix + uuid.uuid4(
    ).hex if filename_prefix else uuid.uuid4().hex
    session: boto3.Session = _utils.ensure_session(session=boto3_session)

    # Sanitize table to respect Athena's standards
    if (sanitize_columns is True) or (database is not None
                                      and table is not None):
        df, dtype, partition_cols = _sanitize(df=df,
                                              dtype=dtype,
                                              partition_cols=partition_cols)

    # Evaluating dtype
    catalog_table_input: Optional[Dict[str, Any]] = None

    if database and table:
        catalog_table_input = catalog._get_table_input(  # pylint: disable=protected-access
            database=database,
            table=table,
            boto3_session=session,
            transaction_id=transaction_id,
            catalog_id=catalog_id)
        catalog_path: Optional[str] = None
        if catalog_table_input:
            table_type = catalog_table_input["TableType"]
            catalog_path = catalog_table_input.get("StorageDescriptor",
                                                   {}).get("Location")
        if path is None:
            if catalog_path:
                path = catalog_path
            else:
                raise exceptions.InvalidArgumentValue(
                    "Glue table does not exist in the catalog. Please pass the `path` argument to create it."
                )
        elif path and catalog_path:
            if path.rstrip("/") != catalog_path.rstrip("/"):
                raise exceptions.InvalidArgumentValue(
                    f"The specified path: {path}, does not match the existing Glue catalog table path: {catalog_path}"
                )
        if pandas_kwargs.get("compression") not in ("gzip", "bz2", None):
            raise exceptions.InvalidArgumentCombination(
                "If database and table are given, you must use one of these compressions: gzip, bz2 or None."
            )
        if (table_type == "GOVERNED") and (not transaction_id):
            _logger.debug(
                "`transaction_id` not specified for GOVERNED table, starting transaction"
            )
            transaction_id = lakeformation.start_transaction(
                read_only=False, boto3_session=boto3_session)
            commit_trans = True

    df = _apply_dtype(df=df,
                      dtype=dtype,
                      catalog_table_input=catalog_table_input,
                      mode=mode)

    if dataset is False:
        return _to_text(
            file_format="json",
            df=df,
            path=path,
            use_threads=use_threads,
            boto3_session=session,
            s3_additional_kwargs=s3_additional_kwargs,
            **pandas_kwargs,
        )

    compression: Optional[str] = pandas_kwargs.get("compression", None)
    df = df[columns] if columns else df

    columns_types: Dict[str, str] = {}
    partitions_types: Dict[str, str] = {}

    if database and table:
        columns_types, partitions_types = _data_types.athena_types_from_pandas_partitioned(
            df=df, index=index, partition_cols=partition_cols, dtype=dtype)
        if schema_evolution is False:
            _utils.check_schema_changes(columns_types=columns_types,
                                        table_input=catalog_table_input,
                                        mode=mode)

        if (catalog_table_input is None) and (table_type == "GOVERNED"):
            catalog._create_json_table(  # pylint: disable=protected-access
                database=database,
                table=table,
                path=path,  # type: ignore
                columns_types=columns_types,
                table_type=table_type,
                partitions_types=partitions_types,
                bucketing_info=bucketing_info,
                description=description,
                parameters=parameters,
                columns_comments=columns_comments,
                boto3_session=session,
                mode=mode,
                transaction_id=transaction_id,
                catalog_versioning=catalog_versioning,
                schema_evolution=schema_evolution,
                projection_enabled=projection_enabled,
                projection_types=projection_types,
                projection_ranges=projection_ranges,
                projection_values=projection_values,
                projection_intervals=projection_intervals,
                projection_digits=projection_digits,
                projection_storage_location_template=None,
                catalog_table_input=catalog_table_input,
                catalog_id=catalog_id,
                compression=pandas_kwargs.get("compression"),
                serde_library=None,
                serde_parameters=None,
            )
            catalog_table_input = catalog._get_table_input(  # pylint: disable=protected-access
                database=database,
                table=table,
                boto3_session=session,
                transaction_id=transaction_id,
                catalog_id=catalog_id,
            )

    paths, partitions_values = _to_dataset(
        func=_to_text,
        concurrent_partitioning=concurrent_partitioning,
        df=df,
        path_root=path,  # type: ignore
        filename_prefix=filename_prefix,
        index=index,
        compression=compression,
        catalog_id=catalog_id,
        database=database,
        table=table,
        table_type=table_type,
        transaction_id=transaction_id,
        use_threads=use_threads,
        partition_cols=partition_cols,
        partitions_types=partitions_types,
        bucketing_info=bucketing_info,
        mode=mode,
        boto3_session=session,
        s3_additional_kwargs=s3_additional_kwargs,
        file_format="json",
    )
    if database and table:
        try:
            serde_info: Dict[str, Any] = {}
            if catalog_table_input:
                serde_info = catalog_table_input["StorageDescriptor"][
                    "SerdeInfo"]
            serde_library: Optional[str] = serde_info.get(
                "SerializationLibrary", None)
            serde_parameters: Optional[Dict[str, str]] = serde_info.get(
                "Parameters", None)
            catalog._create_json_table(  # pylint: disable=protected-access
                database=database,
                table=table,
                path=path,  # type: ignore
                columns_types=columns_types,
                table_type=table_type,
                partitions_types=partitions_types,
                bucketing_info=bucketing_info,
                description=description,
                parameters=parameters,
                columns_comments=columns_comments,
                boto3_session=session,
                mode=mode,
                transaction_id=transaction_id,
                catalog_versioning=catalog_versioning,
                schema_evolution=schema_evolution,
                projection_enabled=projection_enabled,
                projection_types=projection_types,
                projection_ranges=projection_ranges,
                projection_values=projection_values,
                projection_intervals=projection_intervals,
                projection_digits=projection_digits,
                projection_storage_location_template=None,
                catalog_table_input=catalog_table_input,
                catalog_id=catalog_id,
                compression=pandas_kwargs.get("compression"),
                serde_library=serde_library,
                serde_parameters=serde_parameters,
            )
            if partitions_values and (regular_partitions is
                                      True) and (table_type != "GOVERNED"):
                _logger.debug("partitions_values:\n%s", partitions_values)
                catalog.add_json_partitions(
                    database=database,
                    table=table,
                    partitions_values=partitions_values,
                    bucketing_info=bucketing_info,
                    boto3_session=session,
                    serde_library=serde_library,
                    serde_parameters=serde_parameters,
                    catalog_id=catalog_id,
                    columns_types=columns_types,
                    compression=pandas_kwargs.get("compression"),
                )
            if commit_trans:
                lakeformation.commit_transaction(
                    transaction_id=transaction_id,
                    boto3_session=boto3_session  # type: ignore
                )
        except Exception:
            _logger.debug("Catalog write failed, cleaning up S3 (paths: %s).",
                          paths)
            delete_objects(
                path=paths,
                use_threads=use_threads,
                boto3_session=session,
                s3_additional_kwargs=s3_additional_kwargs,
            )
            raise
    return {"paths": paths, "partitions_values": partitions_values}
Example 29
def to_csv(  # pylint: disable=too-many-arguments,too-many-locals,too-many-statements,too-many-branches
    df: pd.DataFrame,
    path: Optional[str] = None,
    sep: str = ",",
    index: bool = True,
    columns: Optional[List[str]] = None,
    use_threads: Union[bool, int] = True,
    boto3_session: Optional[boto3.Session] = None,
    s3_additional_kwargs: Optional[Dict[str, Any]] = None,
    sanitize_columns: bool = False,
    dataset: bool = False,
    filename_prefix: Optional[str] = None,
    partition_cols: Optional[List[str]] = None,
    bucketing_info: Optional[Tuple[List[str], int]] = None,
    concurrent_partitioning: bool = False,
    mode: Optional[str] = None,
    catalog_versioning: bool = False,
    schema_evolution: bool = False,
    database: Optional[str] = None,
    table: Optional[str] = None,
    table_type: Optional[str] = None,
    transaction_id: Optional[str] = None,
    dtype: Optional[Dict[str, str]] = None,
    description: Optional[str] = None,
    parameters: Optional[Dict[str, str]] = None,
    columns_comments: Optional[Dict[str, str]] = None,
    regular_partitions: bool = True,
    projection_enabled: bool = False,
    projection_types: Optional[Dict[str, str]] = None,
    projection_ranges: Optional[Dict[str, str]] = None,
    projection_values: Optional[Dict[str, str]] = None,
    projection_intervals: Optional[Dict[str, str]] = None,
    projection_digits: Optional[Dict[str, str]] = None,
    catalog_id: Optional[str] = None,
    **pandas_kwargs: Any,
) -> Dict[str, Union[List[str], Dict[str, List[str]]]]:
    """Write CSV file or dataset on Amazon S3.

    The concept of Dataset goes beyond the simple idea of ordinary files and enable more
    complex features like partitioning and catalog integration (Amazon Athena/AWS Glue Catalog).

    Note
    ----
    If `database` and `table` arguments are passed, the table name and all column names
    will be automatically sanitized using `wr.catalog.sanitize_table_name` and `wr.catalog.sanitize_column_name`.
    Please pass `sanitize_columns=True` to enforce this behaviour always.

    Note
    ----
    If `table` and `database` arguments are passed, `pandas_kwargs` will be ignored due
    to the restrictive quoting, date_format, escapechar and encoding required by Athena/Glue Catalog.

    Note
    ----
    Compression: The minimum acceptable version to achieve it is Pandas 1.2.0, which requires Python >= 3.7.1.

    Note
    ----
    On `append` mode, the `parameters` will be upserted on an existing table.

    Note
    ----
    In case of `use_threads=True` the number of threads
    that will be spawned will be obtained from os.cpu_count().

    Parameters
    ----------
    df: pandas.DataFrame
        Pandas DataFrame https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html
    path : str, optional
        Amazon S3 path (e.g. s3://bucket/prefix/filename.csv) (for dataset e.g. ``s3://bucket/prefix``).
        Required if dataset=False or when creating a new dataset.
    sep : str
        String of length 1. Field delimiter for the output file.
    index : bool
        Write row names (index).
    columns : Optional[List[str]]
        Columns to write.
    use_threads : bool, int
        True to enable concurrent requests, False to disable multiple threads.
        If enabled os.cpu_count() will be used as the max number of threads.
        If integer is provided, specified number is used.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 Session will be used if boto3_session receives None.
    s3_additional_kwargs : Optional[Dict[str, Any]]
        Forwarded to botocore requests.
        e.g. s3_additional_kwargs={'ServerSideEncryption': 'aws:kms', 'SSEKMSKeyId': 'YOUR_KMS_KEY_ARN'}
    sanitize_columns : bool
        True to sanitize column names or False to keep them as-is.
        True value is forced if `dataset=True`.
    dataset : bool
        If True store as a dataset instead of ordinary file(s).
        If True, enables the following arguments:
        partition_cols, mode, database, table, description, parameters, columns_comments, concurrent_partitioning,
        catalog_versioning, projection_enabled, projection_types, projection_ranges, projection_values,
        projection_intervals, projection_digits, catalog_id, schema_evolution.
    filename_prefix: str, optional
        If dataset=True, add a filename prefix to the output files.
    partition_cols: List[str], optional
        List of column names that will be used to create partitions. Only takes effect if dataset=True.
    bucketing_info: Tuple[List[str], int], optional
        Tuple consisting of the column names used for bucketing as the first element and the number of buckets as the
        second element.
        Only `str`, `int` and `bool` are supported as column data types for bucketing.
    concurrent_partitioning: bool
        If True will increase the parallelism level during the partitions writing. It will decrease the
        writing time and increase the memory usage.
        https://aws-data-wrangler.readthedocs.io/en/2.13.0/tutorials/022%20-%20Writing%20Partitions%20Concurrently.html
    mode : str, optional
        ``append`` (Default), ``overwrite``, ``overwrite_partitions``. Only takes effect if dataset=True.
        For details check the related tutorial:
        https://aws-data-wrangler.readthedocs.io/en/2.13.0/stubs/awswrangler.s3.to_parquet.html#awswrangler.s3.to_parquet
    catalog_versioning : bool
        If True and `mode="overwrite"`, creates an archived version of the table catalog before updating it.
    schema_evolution : bool
        If True allows schema evolution (new or missing columns), otherwise an exception will be raised.
        (Only considered if dataset=True and mode in ("append", "overwrite_partitions")). False by default.
        Related tutorial:
        https://aws-data-wrangler.readthedocs.io/en/2.13.0/tutorials/014%20-%20Schema%20Evolution.html
    database : str, optional
        Glue/Athena catalog: Database name.
    table : str, optional
        Glue/Athena catalog: Table name.
    table_type: str, optional
        The type of the Glue Table. Set to EXTERNAL_TABLE if None
    transaction_id: str, optional
        The ID of the transaction when writing to a Governed Table.
    dtype : Dict[str, str], optional
        Dictionary of columns names and Athena/Glue types to be casted.
        Useful when you have columns with undetermined or mixed data types.
        (e.g. {'col name': 'bigint', 'col2 name': 'int'})
    description : str, optional
        Glue/Athena catalog: Table description
    parameters : Dict[str, str], optional
        Glue/Athena catalog: Key/value pairs to tag the table.
    columns_comments : Dict[str, str], optional
        Glue/Athena catalog:
        Columns names and the related comments (e.g. {'col0': 'Column 0.', 'col1': 'Column 1.', 'col2': 'Partition.'}).
    regular_partitions : bool
        Create regular partitions (Non projected partitions) on Glue Catalog.
        Disable when you will work only with Partition Projection.
        Keep enabled even when working with projections is useful to keep
        Redshift Spectrum working with the regular partitions.
    projection_enabled : bool
        Enable Partition Projection on Athena (https://docs.aws.amazon.com/athena/latest/ug/partition-projection.html)
    projection_types : Optional[Dict[str, str]]
        Dictionary of partitions names and Athena projections types.
        Valid types: "enum", "integer", "date", "injected"
        https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html
        (e.g. {'col_name': 'enum', 'col2_name': 'integer'})
    projection_ranges: Optional[Dict[str, str]]
        Dictionary of partitions names and Athena projections ranges.
        https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html
        (e.g. {'col_name': '0,10', 'col2_name': '-1,8675309'})
    projection_values: Optional[Dict[str, str]]
        Dictionary of partitions names and Athena projections values.
        https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html
        (e.g. {'col_name': 'A,B,Unknown', 'col2_name': 'foo,boo,bar'})
    projection_intervals: Optional[Dict[str, str]]
        Dictionary of partitions names and Athena projections intervals.
        https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html
        (e.g. {'col_name': '1', 'col2_name': '5'})
    projection_digits: Optional[Dict[str, str]]
        Dictionary of partitions names and Athena projections digits.
        https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html
        (e.g. {'col_name': '1', 'col2_name': '2'})
    catalog_id : str, optional
        The ID of the Data Catalog from which to retrieve Databases.
        If none is provided, the AWS account ID is used by default.
    pandas_kwargs :
        KEYWORD arguments forwarded to pandas.DataFrame.to_csv(). You can NOT pass `pandas_kwargs` explicitly, just add
        valid Pandas arguments in the function call and Wrangler will accept them.
        e.g. wr.s3.to_csv(df, path, sep='|', na_rep='NULL', decimal=',')
        https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_csv.html

    Returns
    -------
    Dict[str, Union[List[str], Dict[str, List[str]]]]
        Dictionary with:
        'paths': List of all stored files paths on S3.
        'partitions_values': Dictionary of partitions added with keys as S3 path locations
        and values as a list of partitions values as str.

    Examples
    --------
    Writing single file

    >>> import awswrangler as wr
    >>> import pandas as pd
    >>> wr.s3.to_csv(
    ...     df=pd.DataFrame({'col': [1, 2, 3]}),
    ...     path='s3://bucket/prefix/my_file.csv',
    ... )
    {
        'paths': ['s3://bucket/prefix/my_file.csv'],
        'partitions_values': {}
    }

    Writing single file with pandas_kwargs

    >>> import awswrangler as wr
    >>> import pandas as pd
    >>> wr.s3.to_csv(
    ...     df=pd.DataFrame({'col': [1, 2, 3]}),
    ...     path='s3://bucket/prefix/my_file.csv',
    ...     sep='|',
    ...     na_rep='NULL',
    ...     decimal=','
    ... )
    {
        'paths': ['s3://bucket/prefix/my_file.csv'],
        'partitions_values': {}
    }

    Writing single file encrypted with a KMS key

    >>> import awswrangler as wr
    >>> import pandas as pd
    >>> wr.s3.to_csv(
    ...     df=pd.DataFrame({'col': [1, 2, 3]}),
    ...     path='s3://bucket/prefix/my_file.csv',
    ...     s3_additional_kwargs={
    ...         'ServerSideEncryption': 'aws:kms',
    ...         'SSEKMSKeyId': 'YOUR_KMS_KEY_ARN'
    ...     }
    ... )
    {
        'paths': ['s3://bucket/prefix/my_file.csv'],
        'partitions_values': {}
    }

    Writing partitioned dataset

    >>> import awswrangler as wr
    >>> import pandas as pd
    >>> wr.s3.to_csv(
    ...     df=pd.DataFrame({
    ...         'col': [1, 2, 3],
    ...         'col2': ['A', 'A', 'B']
    ...     }),
    ...     path='s3://bucket/prefix',
    ...     dataset=True,
    ...     partition_cols=['col2']
    ... )
    {
        'paths': ['s3://.../col2=A/x.csv', 's3://.../col2=B/y.csv'],
        'partitions_values': {
            's3://.../col2=A/': ['A'],
            's3://.../col2=B/': ['B']
        }
    }

    Writing bucketed dataset

    >>> import awswrangler as wr
    >>> import pandas as pd
    >>> wr.s3.to_csv(
    ...     df=pd.DataFrame({
    ...         'col': [1, 2, 3],
    ...         'col2': ['A', 'A', 'B']
    ...     }),
    ...     path='s3://bucket/prefix',
    ...     dataset=True,
    ...     bucketing_info=(["col2"], 2)
    ... )
    {
        'paths': ['s3://.../x_bucket-00000.csv', 's3://.../x_bucket-00001.csv'],
        'partitions_values': {}
    }

    Writing dataset to S3 with metadata on Athena/Glue Catalog.

    >>> import awswrangler as wr
    >>> import pandas as pd
    >>> wr.s3.to_csv(
    ...     df=pd.DataFrame({
    ...         'col': [1, 2, 3],
    ...         'col2': ['A', 'A', 'B']
    ...     }),
    ...     path='s3://bucket/prefix',
    ...     dataset=True,
    ...     partition_cols=['col2'],
    ...     database='default',  # Athena/Glue database
    ...     table='my_table'  # Athena/Glue table
    ... )
    {
        'paths': ['s3://.../col2=A/x.csv', 's3://.../col2=B/y.csv'],
        'partitions_values': {
            's3://.../col2=A/': ['A'],
            's3://.../col2=B/': ['B']
        }
    }

    Writing dataset to Glue governed table

    >>> import awswrangler as wr
    >>> import pandas as pd
    >>> wr.s3.to_csv(
    ...     df=pd.DataFrame({
    ...         'col': [1, 2, 3],
    ...         'col2': ['A', 'A', 'B'],
    ...         'col3': [None, None, None]
    ...     }),
    ...     dataset=True,
    ...     mode='append',
    ...     database='default',  # Athena/Glue database
    ...     table='my_table',  # Athena/Glue table
    ...     table_type='GOVERNED',
    ...     transaction_id="xxx",
    ... )
    {
        'paths': ['s3://.../x.csv'],
        'partitions_values': {}
    }

    Writing dataset casting empty column data type

    >>> import awswrangler as wr
    >>> import pandas as pd
    >>> wr.s3.to_csv(
    ...     df=pd.DataFrame({
    ...         'col': [1, 2, 3],
    ...         'col2': ['A', 'A', 'B'],
    ...         'col3': [None, None, None]
    ...     }),
    ...     path='s3://bucket/prefix',
    ...     dataset=True,
    ...     database='default',  # Athena/Glue database
    ...     table='my_table',  # Athena/Glue table
    ...     dtype={'col3': 'date'}
    ... )
    {
        'paths': ['s3://.../x.csv'],
        'partitions_values': {}
    }

    """
    if "pandas_kwargs" in pandas_kwargs:
        raise exceptions.InvalidArgument(
            "You can NOT pass `pandas_kwargs` explicit, just add valid "
            "Pandas arguments in the function call and Wrangler will accept it."
            "e.g. wr.s3.to_csv(df, path, sep='|', na_rep='NULL', decimal=',', compression='gzip')"
        )
    if pandas_kwargs.get("compression") and str(
            pd.__version__) < LooseVersion("1.2.0"):
        raise exceptions.InvalidArgument(
            f"CSV compression on S3 is not supported for Pandas version {pd.__version__}. "
            "The minimum acceptable version to achive it is Pandas 1.2.0 that requires Python >=3.7.1."
        )
    _validate_args(
        df=df,
        table=table,
        database=database,
        dataset=dataset,
        path=path,
        partition_cols=partition_cols,
        bucketing_info=bucketing_info,
        mode=mode,
        description=description,
        parameters=parameters,
        columns_comments=columns_comments,
    )

    # Initializing defaults
    partition_cols = partition_cols if partition_cols else []
    dtype = dtype if dtype else {}
    partitions_values: Dict[str, List[str]] = {}
    mode = "append" if mode is None else mode
    commit_trans: bool = False
    if transaction_id:
        table_type = "GOVERNED"
    filename_prefix = filename_prefix + uuid.uuid4(
    ).hex if filename_prefix else uuid.uuid4().hex
    session: boto3.Session = _utils.ensure_session(session=boto3_session)

    # Sanitize table to respect Athena's standards
    if (sanitize_columns is True) or (database is not None
                                      and table is not None):
        df, dtype, partition_cols = _sanitize(df=df,
                                              dtype=dtype,
                                              partition_cols=partition_cols)

    # Evaluating dtype
    catalog_table_input: Optional[Dict[str, Any]] = None
    if database and table:
        catalog_table_input = catalog._get_table_input(  # pylint: disable=protected-access
            database=database,
            table=table,
            boto3_session=session,
            transaction_id=transaction_id,
            catalog_id=catalog_id)

        catalog_path: Optional[str] = None
        if catalog_table_input:
            table_type = catalog_table_input["TableType"]
            catalog_path = catalog_table_input.get("StorageDescriptor",
                                                   {}).get("Location")
        if path is None:
            if catalog_path:
                path = catalog_path
            else:
                raise exceptions.InvalidArgumentValue(
                    "Glue table does not exist in the catalog. Please pass the `path` argument to create it."
                )
        elif path and catalog_path:
            if path.rstrip("/") != catalog_path.rstrip("/"):
                raise exceptions.InvalidArgumentValue(
                    f"The specified path: {path}, does not match the existing Glue catalog table path: {catalog_path}"
                )
        if pandas_kwargs.get("compression") not in ("gzip", "bz2", None):
            raise exceptions.InvalidArgumentCombination(
                "If database and table are given, you must use one of these compressions: gzip, bz2 or None."
            )
        if (table_type == "GOVERNED") and (not transaction_id):
            _logger.debug(
                "`transaction_id` not specified for GOVERNED table, starting transaction"
            )
            transaction_id = lakeformation.start_transaction(
                read_only=False, boto3_session=boto3_session)
            commit_trans = True

    df = _apply_dtype(df=df,
                      dtype=dtype,
                      catalog_table_input=catalog_table_input,
                      mode=mode)

    paths: List[str] = []
    if dataset is False:
        pandas_kwargs["sep"] = sep
        pandas_kwargs["index"] = index
        pandas_kwargs["columns"] = columns
        _to_text(
            file_format="csv",
            df=df,
            use_threads=use_threads,
            path=path,
            boto3_session=session,
            s3_additional_kwargs=s3_additional_kwargs,
            **pandas_kwargs,
        )
        paths = [path]  # type: ignore
    else:
        compression: Optional[str] = pandas_kwargs.get("compression", None)
        if database and table:
            quoting: Optional[int] = csv.QUOTE_NONE
            escapechar: Optional[str] = "\\"
            header: Union[bool, List[str]] = pandas_kwargs.get("header", False)
            date_format: Optional[str] = "%Y-%m-%d %H:%M:%S.%f"
            pd_kwargs: Dict[str, Any] = {}
        else:
            quoting = pandas_kwargs.get("quoting", None)
            escapechar = pandas_kwargs.get("escapechar", None)
            header = pandas_kwargs.get("header", True)
            date_format = pandas_kwargs.get("date_format", None)
            pd_kwargs = pandas_kwargs.copy()
            pd_kwargs.pop("quoting", None)
            pd_kwargs.pop("escapechar", None)
            pd_kwargs.pop("header", None)
            pd_kwargs.pop("date_format", None)
            pd_kwargs.pop("compression", None)

        df = df[columns] if columns else df

        columns_types: Dict[str, str] = {}
        partitions_types: Dict[str, str] = {}

        if database and table:
            columns_types, partitions_types = _data_types.athena_types_from_pandas_partitioned(
                df=df,
                index=index,
                partition_cols=partition_cols,
                dtype=dtype,
                index_left=True)
            if schema_evolution is False:
                _utils.check_schema_changes(columns_types=columns_types,
                                            table_input=catalog_table_input,
                                            mode=mode)

            if (catalog_table_input is None) and (table_type == "GOVERNED"):
                catalog._create_csv_table(  # pylint: disable=protected-access
                    database=database,
                    table=table,
                    path=path,
                    columns_types=columns_types,
                    table_type=table_type,
                    partitions_types=partitions_types,
                    bucketing_info=bucketing_info,
                    description=description,
                    parameters=parameters,
                    columns_comments=columns_comments,
                    boto3_session=session,
                    mode=mode,
                    transaction_id=transaction_id,
                    schema_evolution=schema_evolution,
                    catalog_versioning=catalog_versioning,
                    sep=sep,
                    projection_enabled=projection_enabled,
                    projection_types=projection_types,
                    projection_ranges=projection_ranges,
                    projection_values=projection_values,
                    projection_intervals=projection_intervals,
                    projection_digits=projection_digits,
                    projection_storage_location_template=None,
                    catalog_table_input=catalog_table_input,
                    catalog_id=catalog_id,
                    compression=pandas_kwargs.get("compression"),
                    skip_header_line_count=None,
                    serde_library=None,
                    serde_parameters=None,
                )
                catalog_table_input = catalog._get_table_input(  # pylint: disable=protected-access
                    database=database,
                    table=table,
                    boto3_session=session,
                    transaction_id=transaction_id,
                    catalog_id=catalog_id,
                )

        paths, partitions_values = _to_dataset(
            func=_to_text,
            concurrent_partitioning=concurrent_partitioning,
            df=df,
            path_root=path,  # type: ignore
            index=index,
            sep=sep,
            compression=compression,
            catalog_id=catalog_id,
            database=database,
            table=table,
            table_type=table_type,
            transaction_id=transaction_id,
            filename_prefix=filename_prefix,
            use_threads=use_threads,
            partition_cols=partition_cols,
            partitions_types=partitions_types,
            bucketing_info=bucketing_info,
            mode=mode,
            boto3_session=session,
            s3_additional_kwargs=s3_additional_kwargs,
            file_format="csv",
            quoting=quoting,
            escapechar=escapechar,
            header=header,
            date_format=date_format,
            **pd_kwargs,
        )
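        # Once the data is written, (re)create the CSV table in the catalog and,
        # for non-GOVERNED tables, add the regular partitions; if any catalog
        # call fails, the objects written above are deleted to avoid leaving
        # orphaned files.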
        if database and table:
            try:
                serde_info: Dict[str, Any] = {}
                if catalog_table_input:
                    serde_info = catalog_table_input["StorageDescriptor"][
                        "SerdeInfo"]
                serde_library: Optional[str] = serde_info.get(
                    "SerializationLibrary", None)
                serde_parameters: Optional[Dict[str, str]] = serde_info.get(
                    "Parameters", None)
                catalog._create_csv_table(  # pylint: disable=protected-access
                    database=database,
                    table=table,
                    path=path,
                    columns_types=columns_types,
                    table_type=table_type,
                    partitions_types=partitions_types,
                    bucketing_info=bucketing_info,
                    description=description,
                    parameters=parameters,
                    columns_comments=columns_comments,
                    boto3_session=session,
                    mode=mode,
                    transaction_id=transaction_id,
                    catalog_versioning=catalog_versioning,
                    schema_evolution=schema_evolution,
                    sep=sep,
                    projection_enabled=projection_enabled,
                    projection_types=projection_types,
                    projection_ranges=projection_ranges,
                    projection_values=projection_values,
                    projection_intervals=projection_intervals,
                    projection_digits=projection_digits,
                    projection_storage_location_template=None,
                    catalog_table_input=catalog_table_input,
                    catalog_id=catalog_id,
                    compression=pandas_kwargs.get("compression"),
                    skip_header_line_count=True if header else None,
                    serde_library=serde_library,
                    serde_parameters=serde_parameters,
                )
                if partitions_values and (regular_partitions is
                                          True) and (table_type != "GOVERNED"):
                    _logger.debug("partitions_values:\n%s", partitions_values)
                    catalog.add_csv_partitions(
                        database=database,
                        table=table,
                        partitions_values=partitions_values,
                        bucketing_info=bucketing_info,
                        boto3_session=session,
                        sep=sep,
                        serde_library=serde_library,
                        serde_parameters=serde_parameters,
                        catalog_id=catalog_id,
                        columns_types=columns_types,
                        compression=pandas_kwargs.get("compression"),
                    )
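                # Commit the Lake Formation transaction only when it was started
                # by this function (commit_trans is set right after
                # start_transaction above).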
                if commit_trans:
                    lakeformation.commit_transaction(
                        transaction_id=transaction_id,
                        boto3_session=boto3_session  # type: ignore
                    )
            except Exception:
                _logger.debug(
                    "Catalog write failed, cleaning up S3 (paths: %s).", paths)
                delete_objects(
                    path=paths,
                    use_threads=use_threads,
                    boto3_session=session,
                    s3_additional_kwargs=s3_additional_kwargs,
                )
                raise
    return {"paths": paths, "partitions_values": partitions_values}
def read_json(
    path: Union[str, List[str]],
    path_suffix: Union[str, List[str], None] = None,
    path_ignore_suffix: Union[str, List[str], None] = None,
    orient: str = "columns",
    use_threads: bool = True,
    last_modified_begin: Optional[datetime.datetime] = None,
    last_modified_end: Optional[datetime.datetime] = None,
    boto3_session: Optional[boto3.Session] = None,
    s3_additional_kwargs: Optional[Dict[str, str]] = None,
    chunksize: Optional[int] = None,
    dataset: bool = False,
    partition_filter: Optional[Callable[[Dict[str, str]], bool]] = None,
    **pandas_kwargs: Any,
) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]:
    """Read JSON file(s) from from a received S3 prefix or list of S3 objects paths.

    This function accepts Unix shell-style wildcards in the path argument.
    * (matches everything), ? (matches any single character),
    [seq] (matches any character in seq), [!seq] (matches any character not in seq).

    Note
    ----
    For partial and gradual reading use the argument ``chunksize`` instead of ``iterator``.

    Note
    ----
    When `use_threads=True`, the number of threads spawned will be obtained from os.cpu_count().

    Note
    ----
    The filter by last_modified_begin and last_modified_end is applied only after listing all S3 files.

    Parameters
    ----------
    path : Union[str, List[str]]
        S3 prefix (accepts Unix shell-style wildcards)
        (e.g. s3://bucket/prefix) or list of S3 object paths (e.g. ``[s3://bucket/key0, s3://bucket/key1]``).
    path_suffix: Union[str, List[str], None]
        Suffix or List of suffixes for filtering S3 keys.
    path_ignore_suffix: Union[str, List[str], None]
        Suffix or List of suffixes for S3 keys to be ignored.
    orient : str
        Same as Pandas: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_json.html
    use_threads : bool
        True to enable concurrent requests, False to disable multiple threads.
        If enabled, os.cpu_count() will be used as the max number of threads.
    last_modified_begin: datetime, optional
        Filter the S3 files by the last modified date of the object.
        The filter is applied only after listing all S3 files.
    last_modified_end: datetime, optional
        Filter the S3 files by the last modified date of the object.
        The filter is applied only after listing all S3 files.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.
    s3_additional_kwargs:
        Forwarded to botocore requests; only the "SSECustomerAlgorithm" and "SSECustomerKey" arguments will be considered.
    chunksize: int, optional
        If specified, return a generator where chunksize is the number of rows to include in each chunk.
    dataset: bool
        If `True`, read a JSON dataset instead of simple file(s), loading all the related partitions as columns.
        In that case `lines=True` will be assumed by default.
    partition_filter: Optional[Callable[[Dict[str, str]], bool]]
        Callback Function filters to apply on PARTITION columns (PUSH-DOWN filter).
        This function MUST receive a single argument (Dict[str, str]) where keys are partition
        names and values are partition values. Partition values will always be strings extracted from S3.
        This function MUST return a bool, True to read the partition or False to ignore it.
        Ignored if `dataset=False`.
        E.g ``lambda x: True if x["year"] == "2020" and x["month"] == "1" else False``
        https://github.com/awslabs/aws-data-wrangler/blob/master/tutorials/023%20-%20Flexible%20Partitions%20Filter.ipynb
    pandas_kwargs:
        KEYWORD arguments forwarded to pandas.read_json(). You can NOT pass `pandas_kwargs` explicitly, just add valid
        Pandas arguments in the function call and Wrangler will accept them.
        e.g. wr.s3.read_json('s3://bucket/prefix/', lines=True, keep_default_dates=True)
        https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_json.html

    Returns
    -------
    Union[pandas.DataFrame, Generator[pandas.DataFrame, None, None]]
        Pandas DataFrame or a Generator in case of `chunksize != None`.

    Examples
    --------
    Reading all JSON files under a prefix

    >>> import awswrangler as wr
    >>> df = wr.s3.read_json(path='s3://bucket/prefix/')

    Reading all JSON files under a prefix and using pandas_kwargs

    >>> import awswrangler as wr
    >>> df = wr.s3.read_json('s3://bucket/prefix/', lines=True, keep_default_dates=True)

    Reading all JSON files from a list

    >>> import awswrangler as wr
    >>> df = wr.s3.read_json(path=['s3://bucket/filename0.json', 's3://bucket/filename1.json'])

    Reading in chunks of 100 lines

    >>> import awswrangler as wr
    >>> dfs = wr.s3.read_json(path=['s3://bucket/0.json', 's3://bucket/1.json'], chunksize=100, lines=True)
    >>> for df in dfs:
    >>>     print(df)  # 100 lines Pandas DataFrame

    Reading JSON Dataset with PUSH-DOWN filter over partitions

    >>> import awswrangler as wr
    >>> my_filter = lambda x: True if x["city"].startswith("new") else False
    >>> df = wr.s3.read_json(path, dataset=True, partition_filter=my_filter)

    """
    if "pandas_kwargs" in pandas_kwargs:
        raise exceptions.InvalidArgument(
            "You can NOT pass `pandas_kwargs` explicit, just add valid "
            "Pandas arguments in the function call and Wrangler will accept it."
            "e.g. wr.s3.read_json(path, lines=True, keep_default_dates=True)")
    if (dataset is True) and ("lines" not in pandas_kwargs):
        pandas_kwargs["lines"] = True
    pandas_kwargs["orient"] = orient
    ignore_index: bool = orient not in ("split", "index", "columns")
    return _read_text(
        parser_func=pd.read_json,
        path=path,
        path_suffix=path_suffix,
        path_ignore_suffix=path_ignore_suffix,
        use_threads=use_threads,
        boto3_session=boto3_session,
        s3_additional_kwargs=s3_additional_kwargs,
        chunksize=chunksize,
        dataset=dataset,
        partition_filter=partition_filter,
        last_modified_begin=last_modified_begin,
        last_modified_end=last_modified_end,
        ignore_index=ignore_index,
        **pandas_kwargs,
    )
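
A minimal usage sketch of read_json over a partitioned JSON dataset (the bucket and partition names are hypothetical). Since partition values are always strings, numeric comparisons in the filter need an explicit conversion.

import awswrangler as wr

def keep_recent(partitions):
    # partitions is a Dict[str, str], e.g. {"year": "2021", "month": "7"}
    return int(partitions["year"]) >= 2021

df = wr.s3.read_json(
    path="s3://bucket/json-dataset/",  # hypothetical prefix
    dataset=True,                      # lines=True is assumed unless overridden
    partition_filter=keep_recent,
)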