Example 1
def delete_all_datasets(account_id: Optional[str] = None,
                        boto3_session: Optional[boto3.Session] = None) -> None:
    """Delete all datasets.

    Parameters
    ----------
    account_id : str, optional
        If None, the account ID will be inferred from your boto3 session.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    None
        None.

    Examples
    --------
    >>> import awswrangler as wr
    >>> wr.quicksight.delete_all_datasets()
    """
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    if account_id is None:
        account_id = sts.get_account_id(boto3_session=session)
    for dataset in list_datasets(account_id=account_id, boto3_session=session):
        delete_dataset(dataset_id=dataset["DataSetId"],
                       account_id=account_id,
                       boto3_session=session)
Example 2
def delete_all_templates(
        account_id: Optional[str] = None,
        boto3_session: Optional[boto3.Session] = None) -> None:
    """Delete all templates.

    Parameters
    ----------
    account_id : str, optional
        If None, the account ID will be inferred from your boto3 session.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    None
        None.

    Examples
    --------
    >>> import awswrangler as wr
    >>> wr.quicksight.delete_all_templates()
    """
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    if account_id is None:
        account_id = sts.get_account_id(boto3_session=session)
    for template in list_templates(account_id=account_id,
                                   boto3_session=session):  # pragma: no cover
        delete_template(template_id=template["TemplateId"],
                        account_id=account_id,
                        boto3_session=session)
Example 3
def create_athena_bucket(boto3_session: Optional[boto3.Session] = None) -> str:
    """Create the default Athena bucket if it doesn't exist.

    Parameters
    ----------
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    str
        Bucket s3 path (E.g. s3://aws-athena-query-results-ACCOUNT-REGION/)

    Examples
    --------
    >>> import awswrangler as wr
    >>> wr.athena.create_athena_bucket()
    's3://aws-athena-query-results-ACCOUNT-REGION/'

    """
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    account_id: str = sts.get_account_id(boto3_session=session)
    region_name: str = str(session.region_name).lower()
    bucket_name = f"aws-athena-query-results-{account_id}-{region_name}"
    s3_output = f"s3://{bucket_name}/"
    s3_resource = session.resource("s3")
    # Bucket() expects a bucket name (not an s3:// path) and by itself only builds a resource
    # handle; create() is what actually creates the bucket. Outside us-east-1 a
    # LocationConstraint is also required -- see the fuller variant in Example 11.
    s3_resource.Bucket(bucket_name).create()
    return s3_output
Example 4
def _delete(
    func_name: str, account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None, **kwargs: Any
) -> None:
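    """Resolve the account ID and invoke the named QuickSight ``delete_*`` client method."""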
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    if account_id is None:
        account_id = sts.get_account_id(boto3_session=session)
    client: boto3.client = _utils.client(service_name="quicksight", session=session)
    func: Callable[..., None] = getattr(client, func_name)
    func(AwsAccountId=account_id, **kwargs)
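
_delete is a generic helper: the public delete_* wrappers in this module reduce to a single call of it. A minimal sketch of how a wrapper such as delete_dashboard could be built on top of it (the wrapper and its DashboardId keyword mirror the QuickSight DeleteDashboard API; treat this as an illustration, not the library's exact source):

def delete_dashboard(dashboard_id: str,
                     account_id: Optional[str] = None,
                     boto3_session: Optional[boto3.Session] = None) -> None:
    """Sketch: let _delete resolve the account/session, then forward the API-specific kwarg."""
    _delete(
        func_name="delete_dashboard",  # boto3 QuickSight client method name
        account_id=account_id,
        boto3_session=boto3_session,
        DashboardId=dashboard_id,  # forwarded to the API call via **kwargs
    )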
Example 5
def create_ingestion(
    dataset_name: Optional[str] = None,
    dataset_id: Optional[str] = None,
    ingestion_id: Optional[str] = None,
    account_id: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> str:
    """Create and starts a new SPICE ingestion on a dataset.

    Note
    ----
    You must pass either the ``dataset_name`` or the ``dataset_id`` argument.

    Parameters
    ----------
    dataset_name : str, optional
        Dataset name.
    dataset_id : str, optional
        Dataset ID.
    ingestion_id : str, optional
        Ingestion ID.
    account_id : str, optional
        If None, the account ID will be inferred from your boto3 session.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    str
        Ingestion ID

    Examples
    --------
    >>> import awswrangler as wr
    >>> status = wr.quicksight.create_ingestion("my_dataset")
    """
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    if account_id is None:
        account_id = sts.get_account_id(boto3_session=session)
    if (dataset_name is None) and (dataset_id is None):
        raise exceptions.InvalidArgument(
            "You must pass a not None dataset_name or dataset_id argument."
        )  # pragma: no cover
    if (dataset_id is None) and (dataset_name is not None):
        dataset_id = get_dataset_id(name=dataset_name,
                                    account_id=account_id,
                                    boto3_session=session)
    if ingestion_id is None:
        ingestion_id = uuid.uuid4().hex
    client: boto3.client = _utils.client(service_name="quicksight",
                                         session=session)
    response: Dict[str, Any] = client.create_ingestion(
        DataSetId=dataset_id, IngestionId=ingestion_id, AwsAccountId=account_id
    )
    return response["IngestionId"]
Example 6
def describe_ingestion(
    ingestion_id: str,
    dataset_name: Optional[str] = None,
    dataset_id: Optional[str] = None,
    account_id: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> Dict[str, Any]:
    """Describe a QuickSight ingestion by ID.

    Note
    ----
    You must pass a non-None value for the ``dataset_name`` or ``dataset_id`` argument.

    Parameters
    ----------
    ingestion_id : str
        Ingestion ID.
    dataset_name : str, optional
        Dataset name.
    dataset_id : str, optional
        Dataset ID.
    account_id : str, optional
        If None, the account ID will be inferred from your boto3 session.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    Dict[str, Any]
        Ingestion Description.

    Examples
    --------
    >>> import awswrangler as wr
    >>> description = wr.quicksight.describe_ingestion(ingestion_id="...", dataset_name="...")
    """
    if (dataset_name is None) and (dataset_id is None):
        raise exceptions.InvalidArgument(
            "You must pass a not None name or dataset_id argument.")
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    if account_id is None:
        account_id = sts.get_account_id(boto3_session=session)
    if (dataset_id is None) and (dataset_name is not None):
        dataset_id = get_dataset_id(name=dataset_name,
                                    account_id=account_id,
                                    boto3_session=session)
    client: boto3.client = _utils.client(service_name="quicksight",
                                         session=session)
    return cast(
        Dict[str, Any],
        client.describe_ingestion(IngestionId=ingestion_id,
                                  AwsAccountId=account_id,
                                  DataSetId=dataset_id)["Ingestion"],
    )
Example 7
def cancel_ingestion(
    ingestion_id: str,
    dataset_name: Optional[str] = None,
    dataset_id: Optional[str] = None,
    account_id: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> None:
    """Cancel an ongoing ingestion of data into SPICE.

    Note
    ----
    You must pass a non-None value for the ``dataset_name`` or ``dataset_id`` argument.

    Parameters
    ----------
    ingestion_id : str
        Ingestion ID.
    dataset_name : str, optional
        Dataset name.
    dataset_id : str, optional
        Dataset ID.
    account_id : str, optional
        If None, the account ID will be inferred from your boto3 session.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    None
        None.

    Examples
    --------
    >>> import awswrangler as wr
    >>> wr.quicksight.cancel_ingestion(ingestion_id="...", dataset_name="...")
    """
    if (dataset_name is None) and (dataset_id is None):
        raise exceptions.InvalidArgument(
            "You must pass a not None name or dataset_id argument.")
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    if account_id is None:
        account_id = sts.get_account_id(boto3_session=session)
    if (dataset_id is None) and (dataset_name is not None):
        dataset_id = get_dataset_id(name=dataset_name,
                                    account_id=account_id,
                                    boto3_session=session)
    client: boto3.client = _utils.client(service_name="quicksight",
                                         session=session)
    client.cancel_ingestion(IngestionId=ingestion_id,
                            AwsAccountId=account_id,
                            DataSetId=dataset_id)
Example 8
def describe_data_source_permissions(
    name: Optional[str] = None,
    data_source_id: Optional[str] = None,
    account_id: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> List[Dict[str, Any]]:
    """Describe a QuickSight data source permissions by name or ID.

    Note
    ----
    You must pass a non-None ``name`` or ``data_source_id`` argument.

    Parameters
    ----------
    name : str, optional
        Data source name.
    data_source_id : str, optional
        Data source ID.
    account_id : str, optional
        If None, the account ID will be inferred from your boto3 session.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    List[Dict[str, Any]]
        Data source permissions (one dict per principal).

    Examples
    --------
    >>> import awswrangler as wr
    >>> description = wr.quicksight.describe_data_source_permissions("my-data-source")
    """
    if (name is None) and (data_source_id is None):
        raise exceptions.InvalidArgument(
            "You must pass a not None name or data_source_id argument.")
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    if account_id is None:
        account_id = sts.get_account_id(boto3_session=session)
    if (data_source_id is None) and (name is not None):
        data_source_id = get_data_source_id(name=name,
                                            account_id=account_id,
                                            boto3_session=session)
    client: boto3.client = _utils.client(service_name="quicksight",
                                         session=session)
    return cast(
        List[Dict[str, Any]],
        client.describe_data_source_permissions(AwsAccountId=account_id, DataSourceId=data_source_id)["Permissions"],
    )
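
Each entry of the returned list pairs a principal ARN with its granted actions, so an audit is a simple loop (sketch; the "Principal" and "Actions" keys follow the QuickSight permissions schema, and the data source name is a placeholder):

for permission in describe_data_source_permissions(name="my-data-source"):
    print(permission["Principal"], "->", ", ".join(permission["Actions"]))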
Example 9
def _get_default_logging_path(
    subnet_id: Optional[str] = None,
    account_id: Optional[str] = None,
    region: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> str:
    """Get EMR default logging path.

    E.g. "s3://aws-logs-{account_id}-{region}/elasticmapreduce/"

    Parameters
    ----------
    subnet_id : str, optional
        Subnet ID. If not provided, you must pass `account_id` and `region` explicitly.
    account_id: str, optional
        Account ID.
    region: str, optional
        Region e.g. 'us-east-1'
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    str
        Default logging path.
        E.g. "s3://aws-logs-{account_id}-{region}/elasticmapreduce/"

    Examples
    --------
    >>> import awswrangler as wr
    >>> path = wr.emr._get_default_logging_path("subnet-id")
    's3://aws-logs-{account_id}-{region}/elasticmapreduce/'

    """
    if account_id is None:
        boto3_session = _utils.ensure_session(session=boto3_session)
        _account_id: str = sts.get_account_id(boto3_session=boto3_session)
    else:
        _account_id = account_id
    if (region is None) and (subnet_id is not None):
        _region: str = _utils.get_region_from_session(
            boto3_session=boto3_session)
    elif (region is None) and (subnet_id is None):
        raise exceptions.InvalidArgumentCombination(
            "You must pass region or subnet_id or both.")
    else:
        _region = region  # type: ignore
    return f"s3://aws-logs-{_account_id}-{_region}/elasticmapreduce/"
Example 10
def list_ingestions(
    dataset_name: Optional[str] = None,
    dataset_id: Optional[str] = None,
    account_id: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> List[Dict[str, Any]]:
    """List the history of SPICE ingestions for a dataset.

    Parameters
    ----------
    dataset_name : str, optional
        Dataset name.
    dataset_id : str, optional
        The ID of the dataset used in the ingestion.
    account_id : str, optional
        If None, the account ID will be inferred from your boto3 session.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    List[Dict[str, Any]]
        List of ingestion descriptions.

    Examples
    --------
    >>> import awswrangler as wr
    >>> ingestions = wr.quicksight.list_ingestions(dataset_name="...")
    """
    if (dataset_name is None) and (dataset_id is None):
        raise exceptions.InvalidArgument(
            "You must pass a not None name or dataset_id argument."
        )  # pragma: no cover
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    if account_id is None:
        account_id = sts.get_account_id(boto3_session=session)
    if (dataset_id is None) and (dataset_name is not None):
        dataset_id = get_dataset_id(name=dataset_name,
                                    account_id=account_id,
                                    boto3_session=session)
    return _list(
        func_name="list_ingestions",
        attr_name="Ingestions",
        account_id=account_id,
        boto3_session=boto3_session,
        DataSetId=dataset_id,
    )
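
Because every returned dict carries an "IngestionStatus", filtering the history is straightforward (sketch; the status values follow the QuickSight ListIngestions response, and the dataset name is a placeholder):

ingestions = list_ingestions(dataset_name="my_dataset")
running = [i["IngestionId"] for i in ingestions if i["IngestionStatus"] == "RUNNING"]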
Example 11
def create_athena_bucket(boto3_session: Optional[boto3.Session] = None) -> str:
    """Create the default Athena bucket if it doesn't exist.

    Parameters
    ----------
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    str
        Bucket s3 path (E.g. s3://aws-athena-query-results-ACCOUNT-REGION/)

    Examples
    --------
    >>> import awswrangler as wr
    >>> wr.athena.create_athena_bucket()
    's3://aws-athena-query-results-ACCOUNT-REGION/'

    """
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    account_id: str = sts.get_account_id(boto3_session=session)
    region_name: str = str(session.region_name).lower()
    bucket_name = f"aws-athena-query-results-{account_id}-{region_name}"
    path = f"s3://{bucket_name}/"
    resource = _utils.resource(service_name="s3", session=session)
    bucket = resource.Bucket(bucket_name)
    args = {} if region_name == "us-east-1" else {"CreateBucketConfiguration": {"LocationConstraint": region_name}}
    try:
        bucket.create(**args)
    except resource.meta.client.exceptions.BucketAlreadyOwnedByYou as err:
        _logger.debug("Bucket %s already exists.",
                      err.response["Error"]["BucketName"])
    except botocore.exceptions.ClientError as err:
        if err.response["Error"]["Code"] == "OperationAborted":
            _logger.debug("A conflicting conditional operation is currently in progress against this resource.")
        else:
            raise  # do not silently swallow unexpected client errors
    bucket.wait_until_exists()
    return path
Example 12
def _list(
    func_name: str,
    attr_name: str,
    account_id: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = None,
    **kwargs: Any,
) -> List[Dict[str, Any]]:
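    """Call a paginated QuickSight ``list_*`` client method, concatenating results across NextToken pages."""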
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    if account_id is None:
        account_id = sts.get_account_id(boto3_session=session)
    client: boto3.client = _utils.client(service_name="quicksight", session=session)
    func: Callable[..., Dict[str, Any]] = getattr(client, func_name)
    response: Dict[str, Any] = func(AwsAccountId=account_id, **kwargs)
    next_token: Optional[str] = response.get("NextToken", None)
    result: List[Dict[str, Any]] = response[attr_name]
    while next_token is not None:
        response = func(AwsAccountId=account_id, NextToken=next_token, **kwargs)
        next_token = response.get("NextToken", None)
        result += response[attr_name]
    return result
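
As with _delete, the public list_* functions reduce to a single call of this paginator. A hypothetical wrapper ("DashboardSummaryList" is the response key of the QuickSight ListDashboards API; the wrapper itself is an illustration, not the library's exact source):

def list_dashboards(account_id: Optional[str] = None,
                    boto3_session: Optional[boto3.Session] = None) -> List[Dict[str, Any]]:
    """Sketch: list all QuickSight dashboards, transparently following NextToken pages."""
    return _list(
        func_name="list_dashboards",  # boto3 QuickSight client method name
        attr_name="DashboardSummaryList",  # response key holding each page's items
        account_id=account_id,
        boto3_session=boto3_session,
    )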
Example 13
def create_athena_dataset(
    name: str,
    database: Optional[str] = None,
    table: Optional[str] = None,
    sql: Optional[str] = None,
    sql_name: str = "CustomSQL",
    data_source_name: Optional[str] = None,
    data_source_arn: Optional[str] = None,
    import_mode: str = "DIRECT_QUERY",
    allowed_to_use: Optional[List[str]] = None,
    allowed_to_manage: Optional[List[str]] = None,
    logical_table_alias: str = "LogicalTable",
    rename_columns: Optional[Dict[str, str]] = None,
    cast_columns_types: Optional[Dict[str, str]] = None,
    tags: Optional[Dict[str, str]] = None,
    account_id: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> str:
    """Create a QuickSight dataset.

    Note
    ----
    You will not be able to see the dataset in the console
    if you do not pass your user to one of the ``allowed_*`` arguments.

    Note
    ----
    You must pass ``database``/``table`` OR ``sql`` argument.

    Note
    ----
    You must pass ``data_source_name`` OR ``data_source_arn`` argument.

    Parameters
    ----------
    name : str
        Dataset name.
    database : str, optional
        Athena's database name.
    table : str, optional
        Athena's table name.
    sql : str, optional
        Use a SQL query to define your table.
    sql_name : str
        Query name.
    data_source_name : str, optional
        QuickSight data source name.
    data_source_arn : str, optional
        QuickSight data source ARN.
    import_mode : str
        Indicates whether you want to import the data into SPICE.
        'SPICE'|'DIRECT_QUERY'
    tags : Dict[str, str], optional
        Key/Value collection to put on the dataset.
        e.g. {"foo": "boo", "bar": "xoo"}
    allowed_to_use : List[str], optional
        List of principals that will be allowed to see and use the dataset.
        e.g. ["john", "Mary"]
    allowed_to_manage : List[str], optional
        List of principals that will be allowed to see, use, update and delete the dataset.
        e.g. ["Mary"]
    logical_table_alias : str
        A display name for the logical table.
    rename_columns : Dict[str, str], optional
        Dictionary to map column renames. e.g. {"old_name": "new_name", "old_name2": "new_name2"}
    cast_columns_types : Dict[str, str], optional
        Dictionary to map column casts. e.g. {"col_name": "STRING", "col_name2": "DECIMAL"}
        Valid types: 'STRING'|'INTEGER'|'DECIMAL'|'DATETIME'
    account_id : str, optional
        If None, the account ID will be inferred from your boto3 session.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    str
        Dataset ID.

    Examples
    --------
    >>> import awswrangler as wr
    >>> dataset_id = wr.quicksight.create_athena_dataset(
    ...     name="...",
    ...     database="...",
    ...     table="...",
    ...     data_source_name="...",
    ...     allowed_to_manage=["Mary"]
    ... )
    """
    if (data_source_name is None) and (data_source_arn is None):
        raise exceptions.InvalidArgument("You must pass a not None data_source_name or data_source_arn argument.")
    if ((database is None) and (table is None)) and (sql is None):
        raise exceptions.InvalidArgument("You must pass database/table OR sql argument.")
    if (database is not None) and (sql is not None):
        raise exceptions.InvalidArgument(
            "If you provide sql argument, please include the database name inside the sql statement."
            "Do NOT pass in with database argument."
        )
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    client: boto3.client = _utils.client(service_name="quicksight", session=session)
    if account_id is None:
        account_id = sts.get_account_id(boto3_session=session)
    if (data_source_arn is None) and (data_source_name is not None):
        data_source_arn = get_data_source_arn(name=data_source_name, account_id=account_id, boto3_session=session)
    if sql is not None:
        physical_table: Dict[str, Dict[str, Any]] = {
            "CustomSql": {
                "DataSourceArn": data_source_arn,
                "Name": sql_name,
                "SqlQuery": sql,
                "Columns": extract_athena_query_columns(
                    sql=sql,
                    data_source_arn=data_source_arn,  # type: ignore
                    account_id=account_id,
                    boto3_session=session,
                ),
            }
        }
    else:
        physical_table = {
            "RelationalTable": {
                "DataSourceArn": data_source_arn,
                "Schema": database,
                "Name": table,
                "InputColumns": extract_athena_table_columns(
                    database=database,  # type: ignore
                    table=table,  # type: ignore
                    boto3_session=session,
                ),
            }
        }
    table_uuid: str = uuid.uuid4().hex
    dataset_id: str = uuid.uuid4().hex
    args: Dict[str, Any] = {
        "AwsAccountId": account_id,
        "DataSetId": dataset_id,
        "Name": name,
        "ImportMode": import_mode,
        "PhysicalTableMap": {table_uuid: physical_table},
        "LogicalTableMap": {table_uuid: {"Alias": logical_table_alias, "Source": {"PhysicalTableId": table_uuid}}},
    }
    trans: List[Dict[str, Dict[str, Any]]] = _generate_transformations(
        rename_columns=rename_columns, cast_columns_types=cast_columns_types
    )
    if trans:
        args["LogicalTableMap"][table_uuid]["DataTransforms"] = trans
    permissions: List[Dict[str, Union[str, List[str]]]] = _generate_permissions(
        resource="dataset",
        account_id=account_id,
        boto3_session=session,
        allowed_to_use=allowed_to_use,
        allowed_to_manage=allowed_to_manage,
    )
    if permissions:
        args["Permissions"] = permissions
    if tags is not None:
        _tags: List[Dict[str, str]] = [{"Key": k, "Value": v} for k, v in tags.items()]
        args["Tags"] = _tags
    client.create_data_set(**args)
    return dataset_id
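
A slightly fuller call in SQL mode, showing how the optional arguments compose (all names and the query are placeholders):

dataset_id = create_athena_dataset(
    name="sales-summary",
    sql="SELECT region, amount FROM my_db.sales",  # database name lives inside the SQL
    data_source_name="my-athena-source",
    import_mode="SPICE",
    rename_columns={"region": "sales_region"},
    cast_columns_types={"amount": "DECIMAL"},
    allowed_to_manage=["Mary"],
)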
Example 14
def create_athena_data_source(
    name: str,
    workgroup: str = "primary",
    allowed_to_use: Optional[List[str]] = None,
    allowed_to_manage: Optional[List[str]] = None,
    tags: Optional[Dict[str, str]] = None,
    account_id: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> None:
    """Create a QuickSight data source pointing to an Athena/Workgroup.

    Note
    ----
    You will not be able to see the data source in the console
    if you do not pass your user to one of the ``allowed_*`` arguments.

    Parameters
    ----------
    name : str
        Data source name.
    workgroup : str
        Athena workgroup.
    tags : Dict[str, str], optional
        Key/Value collection to put on the data source.
        e.g. {"foo": "boo", "bar": "xoo"}
    allowed_to_use : List[str], optional
        List of principals that will be allowed to see and use the data source.
        e.g. ["John"]
    allowed_to_manage : List[str], optional
        List of principals that will be allowed to see, use, update and delete the data source.
        e.g. ["Mary"]
    account_id : str, optional
        If None, the account ID will be inferred from your boto3 session.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    None
        None.

    Examples
    --------
    >>> import awswrangler as wr
    >>> wr.quicksight.create_athena_data_source(
    ...     name="...",
    ...     allowed_to_manage=["john"]
    ... )
    """
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    client: boto3.client = _utils.client(service_name="quicksight", session=session)
    if account_id is None:
        account_id = sts.get_account_id(boto3_session=session)
    args: Dict[str, Any] = {
        "AwsAccountId": account_id,
        "DataSourceId": name,
        "Name": name,
        "Type": "ATHENA",
        "DataSourceParameters": {"AthenaParameters": {"WorkGroup": workgroup}},
        "SslProperties": {"DisableSsl": True},
    }
    permissions: List[Dict[str, Union[str, List[str]]]] = _generate_permissions(
        resource="data_source",
        account_id=account_id,
        boto3_session=session,
        allowed_to_use=allowed_to_use,
        allowed_to_manage=allowed_to_manage,
    )
    if permissions:
        args["Permissions"] = permissions
    if tags is not None:
        _tags: List[Dict[str, str]] = [{"Key": k, "Value": v} for k, v in tags.items()]
        args["Tags"] = _tags
    client.create_data_source(**args)
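
The two create_athena_* functions are typically chained: register the data source once, then point datasets at it by name (sketch with placeholder names):

create_athena_data_source(name="my-athena-source", workgroup="primary", allowed_to_manage=["Mary"])
dataset_id = create_athena_dataset(
    name="my-dataset",
    database="my_db",
    table="my_table",
    data_source_name="my-athena-source",
    allowed_to_manage=["Mary"],
)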
Example 15
def _build_cluster_args(**pars):  # pylint: disable=too-many-branches,too-many-statements
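    """Assemble the request dictionary for EMR's RunJobFlow API from the given keyword parameters."""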
    account_id: str = sts.get_account_id(boto3_session=pars["boto3_session"])
    region: str = _utils.get_region_from_session(
        boto3_session=pars["boto3_session"])

    # S3 Logging path
    if pars.get("logging_s3_path") is None:
        pars["logging_s3_path"] = _get_default_logging_path(
            subnet_id=None,
            account_id=account_id,
            region=region,
            boto3_session=pars["boto3_session"])

    spark_env: Optional[Dict[str, str]] = None
    yarn_env: Optional[Dict[str, str]] = None
    livy_env: Optional[Dict[str, str]] = None

    if pars["spark_pyarrow"] is True:
        if pars["spark_defaults"] is None:
            pars["spark_defaults"] = {
                "spark.sql.execution.arrow.enabled": "true"
            }
        else:
            pars["spark_defaults"][
                "spark.sql.execution.arrow.enabled"] = "true"
        spark_env = {"ARROW_PRE_0_15_IPC_FORMAT": "1"}
        yarn_env = {"ARROW_PRE_0_15_IPC_FORMAT": "1"}
        livy_env = {"ARROW_PRE_0_15_IPC_FORMAT": "1"}

    if pars["python3"] is True:
        if spark_env is None:
            spark_env = {"PYSPARK_PYTHON": "/usr/bin/python3"}
        else:
            spark_env["PYSPARK_PYTHON"] = "/usr/bin/python3"

    if pars["spark_jars_path"] is not None:
        paths: str = ",".join(pars["spark_jars_path"])
        if pars["spark_defaults"] is None:
            pars["spark_defaults"] = {"spark.jars": paths}
        else:
            pars["spark_defaults"]["spark.jars"] = paths

    args: Dict[str, Any] = {
        "Name": pars["cluster_name"],
        "LogUri": pars["logging_s3_path"],
        "ReleaseLabel": pars["emr_release"],
        "VisibleToAllUsers": pars["visible_to_all_users"],
        "JobFlowRole": pars["emr_ec2_role"],
        "ServiceRole": pars["emr_role"],
        "Instances": {
            "KeepJobFlowAliveWhenNoSteps":
            pars["keep_cluster_alive_when_no_steps"],
            "TerminationProtected": pars["termination_protected"],
            "Ec2SubnetId": pars["subnet_id"],
            "InstanceFleets": [],
        },
    }

    # EC2 Key Pair
    if pars["key_pair_name"] is not None:
        args["Instances"]["Ec2KeyName"] = pars["key_pair_name"]

    # Security groups
    if pars["security_group_master"] is not None:
        args["Instances"]["EmrManagedMasterSecurityGroup"] = pars[
            "security_group_master"]
    if pars["security_groups_master_additional"] is not None:
        args["Instances"]["AdditionalMasterSecurityGroups"] = pars[
            "security_groups_master_additional"]
    if pars["security_group_slave"] is not None:
        args["Instances"]["EmrManagedSlaveSecurityGroup"] = pars[
            "security_group_slave"]
    if pars["security_groups_slave_additional"] is not None:
        args["Instances"]["AdditionalSlaveSecurityGroups"] = pars[
            "security_groups_slave_additional"]
    if pars["security_group_service_access"] is not None:
        args["Instances"]["ServiceAccessSecurityGroup"] = pars[
            "security_group_service_access"]

    # Configurations
    args["Configurations"] = [{
        "Classification": "spark-log4j",
        "Properties": {
            "log4j.rootCategory": f"{pars['spark_log_level']}, console"
        }
    }]
    if pars["docker"] is True:
        if pars.get("extra_registries") is None:
            extra_registries: List[str] = []
        else:
            extra_registries = pars["extra_registries"]
        registries: str = f"local,centos,{account_id}.dkr.ecr.{region}.amazonaws.com,{','.join(extra_registries)}"
        registries = registries[:-1] if registries.endswith(
            ",") else registries
        args["Configurations"].append({
            "Classification":
            "container-executor",
            "Properties": {},
            "Configurations": [{
                "Classification": "docker",
                "Properties": {
                    "docker.privileged-containers.registries": registries,
                    "docker.trusted.registries": registries,
                },
                "Configurations": [],
            }],
        })

    if spark_env is not None:
        args["Configurations"].append(
            {
                "Classification": "spark-env",
                "Properties": {},
                "Configurations": [{"Classification": "export", "Properties": spark_env, "Configurations": []}],
            }
        )
    if yarn_env is not None:
        args["Configurations"].append(
            {
                "Classification": "yarn-env",
                "Properties": {},
                "Configurations": [{"Classification": "export", "Properties": yarn_env, "Configurations": []}],
            }
        )
    if livy_env is not None:
        args["Configurations"].append(
            {
                "Classification": "livy-env",
                "Properties": {},
                "Configurations": [{"Classification": "export", "Properties": livy_env, "Configurations": []}],
            }
        )
    if pars["spark_glue_catalog"] is True:
        args["Configurations"].append(
            {
                "Classification": "spark-hive-site",
                "Properties": {
                    "hive.metastore.client.factory.class": "com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory"  # noqa
                },
                "Configurations": [],
            }
        )
    if pars["hive_glue_catalog"] is True:
        args["Configurations"].append(
            {
                "Classification": "hive-site",
                "Properties": {
                    "hive.metastore.client.factory.class": "com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory"  # noqa
                },
                "Configurations": [],
            }
        )
    if pars["presto_glue_catalog"] is True:
        args["Configurations"].append(
            {
                "Classification": "presto-connector-hive",
                "Properties": {"hive.metastore.glue.datacatalog.enabled": "true"},
                "Configurations": [],
            }
        )
    if pars["consistent_view"] is True:
        args["Configurations"].append(
            {
                "Classification": "emrfs-site",
                "Properties": {
                    "fs.s3.consistent.retryPeriodSeconds": str(pars.get("consistent_view_retry_seconds", "10")),
                    "fs.s3.consistent": "true",
                    "fs.s3.consistent.retryCount": str(pars.get("consistent_view_retry_count", "5")),
                    "fs.s3.consistent.metadata.tableName": pars.get("consistent_view_table_name", "EmrFSMetadata"),
                },
            }
        )
    if pars["maximize_resource_allocation"] is True:
        args["Configurations"].append(
            {"Classification": "spark", "Properties": {"maximizeResourceAllocation": "true"}}
        )
    if pars["spark_defaults"] is not None:
        spark_defaults: Dict[str, Union[str, Dict[str, str]]] = {
            "Classification": "spark-defaults",
            "Properties": pars["spark_defaults"],
        }
        args["Configurations"].append(spark_defaults)
    if pars.get("custom_classifications") is not None:
        for c in pars["custom_classifications"]:
            args["Configurations"].append(c)

    # Applications
    if pars["applications"]:
        args["Applications"] = [{"Name": x} for x in pars["applications"]]

    # Bootstraps
    if pars["bootstraps_paths"]:
        args["BootstrapActions"] = [{
            "Name": x,
            "ScriptBootstrapAction": {
                "Path": x
            }
        } for x in pars["bootstraps_paths"]]

    # Debugging and Steps
    if (pars["debugging"] is True) or (pars["steps"] is not None):
        args["Steps"] = []
        if pars["debugging"] is True:
            args["Steps"].append({
                "Name": "Setup Hadoop Debugging",
                "ActionOnFailure": "TERMINATE_CLUSTER",
                "HadoopJarStep": {
                    "Jar": "command-runner.jar",
                    "Args": ["state-pusher-script"]
                },
            })
        if pars["steps"] is not None:
            args["Steps"] += pars["steps"]

    # Master Instance Fleet
    timeout_action_master: str = (
        "SWITCH_TO_ON_DEMAND" if pars["spot_timeout_to_on_demand_master"] else "TERMINATE_CLUSTER"
    )
    fleet_master: Dict = {
        "Name": "MASTER",
        "InstanceFleetType": "MASTER",
        "TargetOnDemandCapacity": pars["instance_num_on_demand_master"],
        "TargetSpotCapacity": pars["instance_num_spot_master"],
        "InstanceTypeConfigs": [
            {
                "InstanceType": pars["instance_type_master"],
                "WeightedCapacity": 1,
                "BidPriceAsPercentageOfOnDemandPrice": pars["spot_bid_percentage_of_on_demand_master"],
                "EbsConfiguration": {
                    "EbsBlockDeviceConfigs": [
                        {
                            "VolumeSpecification": {
                                "SizeInGB": pars["instance_ebs_size_master"],
                                "VolumeType": "gp2",
                            },
                            "VolumesPerInstance": 1,
                        }
                    ],
                    "EbsOptimized": True,
                },
            }
        ],
    }
    if pars["instance_num_spot_master"] > 0:
        fleet_master["LaunchSpecifications"] = {
            "SpotSpecification": {
                "TimeoutDurationMinutes": pars["spot_provisioning_timeout_master"],
                "TimeoutAction": timeout_action_master,
            }
        }
    args["Instances"]["InstanceFleets"].append(fleet_master)

    # Core Instance Fleet
    if (pars["instance_num_spot_core"] >
            0) or pars["instance_num_on_demand_core"] > 0:
        timeout_action_core = "SWITCH_TO_ON_DEMAND" if pars[
            "spot_timeout_to_on_demand_core"] else "TERMINATE_CLUSTER"
        fleet_core: Dict = {
            "Name":
            "CORE",
            "InstanceFleetType":
            "CORE",
            "TargetOnDemandCapacity":
            pars["instance_num_on_demand_core"],
            "TargetSpotCapacity":
            pars["instance_num_spot_core"],
            "InstanceTypeConfigs": [{
                "InstanceType":
                pars["instance_type_core"],
                "WeightedCapacity":
                1,
                "BidPriceAsPercentageOfOnDemandPrice":
                pars["spot_bid_percentage_of_on_demand_core"],
                "EbsConfiguration": {
                    "EbsBlockDeviceConfigs": [{
                        "VolumeSpecification": {
                            "SizeInGB": pars["instance_ebs_size_core"],
                            "VolumeType": "gp2",
                        },
                        "VolumesPerInstance": 1,
                    }],
                    "EbsOptimized":
                    True,
                },
            }],
        }
        if pars["instance_num_spot_core"] > 0:
            fleet_core["LaunchSpecifications"] = {
                "SpotSpecification": {
                    "TimeoutDurationMinutes":
                    pars["spot_provisioning_timeout_core"],
                    "TimeoutAction":
                    timeout_action_core,
                }
            }
        args["Instances"]["InstanceFleets"].append(fleet_core)

    # Task Instance Fleet
    if (pars["instance_num_spot_task"] >
            0) or pars["instance_num_on_demand_task"] > 0:
        timeout_action_task: str = "SWITCH_TO_ON_DEMAND" if pars[
            "spot_timeout_to_on_demand_task"] else "TERMINATE_CLUSTER"
        fleet_task: Dict = {
            "Name":
            "TASK",
            "InstanceFleetType":
            "TASK",
            "TargetOnDemandCapacity":
            pars["instance_num_on_demand_task"],
            "TargetSpotCapacity":
            pars["instance_num_spot_task"],
            "InstanceTypeConfigs": [{
                "InstanceType":
                pars["instance_type_task"],
                "WeightedCapacity":
                1,
                "BidPriceAsPercentageOfOnDemandPrice":
                pars["spot_bid_percentage_of_on_demand_task"],
                "EbsConfiguration": {
                    "EbsBlockDeviceConfigs": [{
                        "VolumeSpecification": {
                            "SizeInGB": pars["instance_ebs_size_task"],
                            "VolumeType": "gp2",
                        },
                        "VolumesPerInstance": 1,
                    }],
                    "EbsOptimized":
                    True,
                },
            }],
        }
        if pars["instance_num_spot_task"] > 0:
            fleet_task["LaunchSpecifications"] = {
                "SpotSpecification": {
                    "TimeoutDurationMinutes":
                    pars["spot_provisioning_timeout_task"],
                    "TimeoutAction":
                    timeout_action_task,
                }
            }
        args["Instances"]["InstanceFleets"].append(fleet_task)

    # Tags
    if pars["tags"] is not None:
        args["Tags"] = [{
            "Key": k,
            "Value": v
        } for k, v in pars["tags"].items()]

    _logger.debug("args: \n%s", pprint.pformat(args))
    return args
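
The dict assembled here has the shape of EMR's RunJobFlow request, so the caller can hand it straight to boto3 (a sketch of the surrounding call, assuming a create_cluster-style wrapper has collected pars, including a "boto3_session" entry):

client_emr = _utils.client(service_name="emr", session=pars["boto3_session"])
response = client_emr.run_job_flow(**_build_cluster_args(**pars))
cluster_id: str = response["JobFlowId"]  # ID of the newly launched cluster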