Example #1
def _generate_permissions(
    resource: str,
    account_id: str,
    boto3_session: boto3.Session,
    allowed_to_use: Optional[List[str]] = None,
    allowed_to_manage: Optional[List[str]] = None,
) -> List[Dict[str, Union[str, List[str]]]]:
    permissions: List[Dict[str, Union[str, List[str]]]] = []
    if (allowed_to_use is None) and (allowed_to_manage is None):
        return permissions

    # Ensure the same principal does not appear in both lists at the same time.
    if (allowed_to_use is not None) and (allowed_to_manage is not None):
        allowed_to_use = list(set(allowed_to_use) - set(allowed_to_manage))

    region: str = _utils.get_region_from_session(boto3_session=boto3_session)
    if allowed_to_use is not None:
        permissions += [
            {
                "Principal": _generate_principal(user_name=user_name, account_id=account_id, region=region),
                "Actions": _ALLOWED_ACTIONS[resource]["allowed_to_use"],
            }
            for user_name in allowed_to_use
        ]
    if allowed_to_manage is not None:
        permissions += [
            {
                "Principal": _generate_principal(user_name=user_name, account_id=account_id, region=region),
                "Actions": _ALLOWED_ACTIONS[resource]["allowed_to_manage"],
            }
            for user_name in allowed_to_manage
        ]
    return permissions
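For context, a minimal usage sketch (not from the library) showing the shape of the entries _generate_permissions returns; the resource key "data_source", the account ID, and the principal names are assumed placeholders, and _ALLOWED_ACTIONS / _generate_principal are the module-private helpers referenced above.

import boto3

session = boto3.Session()  # assumes default credentials and region are configured
permissions = _generate_permissions(
    resource="data_source",      # assumed to be a key of _ALLOWED_ACTIONS
    account_id="111111111111",
    boto3_session=session,
    allowed_to_use=["alice"],
    allowed_to_manage=["bob"],
)
# Expected shape: one entry per principal, e.g.
# [{"Principal": "<principal ARN for alice>", "Actions": [...use actions...]},
#  {"Principal": "<principal ARN for bob>", "Actions": [...manage actions...]}]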
Example #2
def build_step(
    command: str,
    name: str = "my-step",
    action_on_failure: str = "CONTINUE",
    script: bool = False,
    region: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> Dict[str, Any]:
    """Build the Step structure (dictionary).

    Parameters
    ----------
    command : str
        e.g. 'echo "Hello!"'
        e.g. for script 's3://.../script.sh arg1 arg2'
    name : str, optional
        Step name.
    action_on_failure : str
        'TERMINATE_JOB_FLOW', 'TERMINATE_CLUSTER', 'CANCEL_AND_WAIT', 'CONTINUE'
    script : bool
        False for raw command or True for script runner.
        https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-commandrunner.html
    region : str, optional
        Region name, used instead of resolving it from the boto3.Session (e.g. `us-east-1`).
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    Dict[str, Any]
        Step structure.

    Examples
    --------
    >>> import awswrangler as wr
    >>> steps = []
    >>> for cmd in ['echo "Hello"', "ls -la"]:
    ...     steps.append(wr.emr.build_step(name=cmd, command=cmd))
    >>> wr.emr.submit_steps(cluster_id="cluster-id", steps=steps)

    """
    jar: str = "command-runner.jar"
    if script is True:
        if region is not None:
            _region: str = region
        else:
            _region = _utils.get_region_from_session(
                boto3_session=boto3_session, default_region="us-east-1")
        jar = f"s3://{_region}.elasticmapreduce/libs/script-runner/script-runner.jar"
    step: Dict[str, Any] = {
        "Name": name,
        "ActionOnFailure": action_on_failure,
        "HadoopJarStep": {
            "Jar": jar,
            "Args": command.split(" ")
        },
    }
    return step
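As a quick illustration (an assumed example, not part of the library docs), this is the structure build_step produces when script=True; the S3 path is a placeholder, and the "us-east-1" region in the jar path is simply whatever the boto3 session resolves to.

import awswrangler as wr

step = wr.emr.build_step(
    name="run-my-script",
    command="s3://my-bucket/scripts/job.sh arg1 arg2",  # hypothetical script path
    script=True,
)
# step == {
#     "Name": "run-my-script",
#     "ActionOnFailure": "CONTINUE",
#     "HadoopJarStep": {
#         "Jar": "s3://us-east-1.elasticmapreduce/libs/script-runner/script-runner.jar",
#         "Args": ["s3://my-bucket/scripts/job.sh", "arg1", "arg2"],
#     },
# }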
Example #3
def submit_ecr_credentials_refresh(
        cluster_id: str,
        path: str,
        action_on_failure: str = "CONTINUE",
        boto3_session: Optional[boto3.Session] = None) -> str:
    """Update internal ECR credentials.

    Parameters
    ----------
    cluster_id : str
        Cluster ID.
    path : str
        Amazon S3 path where Wrangler will stage the script ecr_credentials_refresh.py (e.g. s3://bucket/emr/)
    action_on_failure : str
        'TERMINATE_JOB_FLOW', 'TERMINATE_CLUSTER', 'CANCEL_AND_WAIT', 'CONTINUE'
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    str
        Step ID.

    Examples
    --------
    >>> import awswrangler as wr
    >>> step_id = wr.emr.submit_ecr_credentials_refresh("cluster_id", "s3://bucket/emr/")

    """
    path = path[:-1] if path.endswith("/") else path
    path_script: str = f"{path}/ecr_credentials_refresh.py"
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    client_s3: boto3.client = _utils.client(service_name="s3", session=session)
    bucket, key = _utils.parse_path(path=path_script)
    region: str = _utils.get_region_from_session(boto3_session=boto3_session)
    client_s3.put_object(
        Body=_get_ecr_credentials_refresh_content(region).encode(
            encoding="utf-8"),
        Bucket=bucket,
        Key=key)
    command: str = f"spark-submit --deploy-mode cluster {path_script}"
    name: str = "ECR Credentials Refresh"
    step: Dict[str, Any] = build_step(name=name,
                                      command=command,
                                      action_on_failure=action_on_failure,
                                      script=False,
                                      boto3_session=session)
    client_emr: boto3.client = _utils.client(service_name="emr",
                                             session=session)
    response: Dict[str, Any] = client_emr.add_job_flow_steps(JobFlowId=cluster_id, Steps=[step])
    _logger.debug("response: \n%s", pprint.pformat(response))
    return response["StepIds"][0]
Example #4
def _get_default_logging_path(
    subnet_id: Optional[str] = None,
    account_id: Optional[str] = None,
    region: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> str:
    """Get EMR default logging path.

    E.g. "s3://aws-logs-{account_id}-{region}/elasticmapreduce/"

    Parameters
    ----------
    subnet_id : str, optional
        Subnet ID. If not provided, you must pass `account_id` and `region` explicitly.
    account_id: str, optional
        Account ID.
    region: str, optional
        Region e.g. 'us-east-1'
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    str
        Default logging path.
        E.g. "s3://aws-logs-{account_id}-{region}/elasticmapreduce/"

    Examples
    --------
    >>> import awswrangler as wr
    >>> wr.emr._get_default_logging_path("subnet-id")
    's3://aws-logs-{account_id}-{region}/elasticmapreduce/'

    """
    if account_id is None:
        boto3_session = _utils.ensure_session(session=boto3_session)
        _account_id: str = _utils.get_account_id(boto3_session=boto3_session)
    else:
        _account_id = account_id
    if (region is None) and (subnet_id is not None):
        boto3_session = _utils.ensure_session(session=boto3_session)
        _region: str = _utils.get_region_from_session(
            boto3_session=boto3_session)
    elif (region is None) and (subnet_id is None):
        raise exceptions.InvalidArgumentCombination(
            "You must pass region or subnet_id or both.")
    else:
        _region = region  # type: ignore
    return f"s3://aws-logs-{_account_id}-{_region}/elasticmapreduce/"
Example #5
def _build_cluster_args(**pars):  # pylint: disable=too-many-branches,too-many-statements
    account_id: str = sts.get_account_id(boto3_session=pars["boto3_session"])
    region: str = _utils.get_region_from_session(
        boto3_session=pars["boto3_session"])

    # S3 Logging path
    if pars.get("logging_s3_path") is None:
        pars["logging_s3_path"] = _get_default_logging_path(
            subnet_id=None,
            account_id=account_id,
            region=region,
            boto3_session=pars["boto3_session"])

    spark_env: Optional[Dict[str, str]] = None
    yarn_env: Optional[Dict[str, str]] = None
    livy_env: Optional[Dict[str, str]] = None

    if pars["spark_pyarrow"] is True:
        if pars["spark_defaults"] is None:
            pars["spark_defaults"] = {
                "spark.sql.execution.arrow.enabled": "true"
            }
        else:
            pars["spark_defaults"][
                "spark.sql.execution.arrow.enabled"] = "true"
        spark_env = {"ARROW_PRE_0_15_IPC_FORMAT": "1"}
        yarn_env = {"ARROW_PRE_0_15_IPC_FORMAT": "1"}
        livy_env = {"ARROW_PRE_0_15_IPC_FORMAT": "1"}

    if pars["python3"] is True:
        if spark_env is None:
            spark_env = {"PYSPARK_PYTHON": "/usr/bin/python3"}
        else:
            spark_env["PYSPARK_PYTHON"] = "/usr/bin/python3"

    if pars["spark_jars_path"] is not None:
        paths: str = ",".join(pars["spark_jars_path"])
        if pars["spark_defaults"] is None:
            pars["spark_defaults"] = {"spark.jars": paths}
        else:
            pars["spark_defaults"]["spark.jars"] = paths

    args: Dict[str, Any] = {
        "Name": pars["cluster_name"],
        "LogUri": pars["logging_s3_path"],
        "ReleaseLabel": pars["emr_release"],
        "VisibleToAllUsers": pars["visible_to_all_users"],
        "JobFlowRole": pars["emr_ec2_role"],
        "ServiceRole": pars["emr_role"],
        "Instances": {
            "KeepJobFlowAliveWhenNoSteps":
            pars["keep_cluster_alive_when_no_steps"],
            "TerminationProtected": pars["termination_protected"],
            "Ec2SubnetId": pars["subnet_id"],
            "InstanceFleets": [],
        },
    }

    # EC2 Key Pair
    if pars["key_pair_name"] is not None:
        args["Instances"]["Ec2KeyName"] = pars["key_pair_name"]

    # Security groups
    if pars["security_group_master"] is not None:
        args["Instances"]["EmrManagedMasterSecurityGroup"] = pars[
            "security_group_master"]
    if pars["security_groups_master_additional"] is not None:
        args["Instances"]["AdditionalMasterSecurityGroups"] = pars[
            "security_groups_master_additional"]
    if pars["security_group_slave"] is not None:
        args["Instances"]["EmrManagedSlaveSecurityGroup"] = pars[
            "security_group_slave"]
    if pars["security_groups_slave_additional"] is not None:
        args["Instances"]["AdditionalSlaveSecurityGroups"] = pars[
            "security_groups_slave_additional"]
    if pars["security_group_service_access"] is not None:
        args["Instances"]["ServiceAccessSecurityGroup"] = pars[
            "security_group_service_access"]

    # Configurations
    args["Configurations"] = [{
        "Classification": "spark-log4j",
        "Properties": {
            "log4j.rootCategory": f"{pars['spark_log_level']}, console"
        }
    }]
    if pars["docker"] is True:
        if pars.get("extra_registries") is None:
            extra_registries: List[str] = []
        else:
            extra_registries = pars["extra_registries"]
        registries: str = f"local,centos,{account_id}.dkr.ecr.{region}.amazonaws.com,{','.join(extra_registries)}"
        registries = registries[:-1] if registries.endswith(",") else registries
        args["Configurations"].append({
            "Classification":
            "container-executor",
            "Properties": {},
            "Configurations": [{
                "Classification": "docker",
                "Properties": {
                    "docker.privileged-containers.registries": registries,
                    "docker.trusted.registries": registries,
                },
                "Configurations": [],
            }],
        })

    if spark_env is not None:
        args["Configurations"].append({
            "Classification": "spark-env",
            "Properties": {},
            "Configurations": [{"Classification": "export", "Properties": spark_env, "Configurations": []}],
        })
    if yarn_env is not None:
        args["Configurations"].append({
            "Classification": "yarn-env",
            "Properties": {},
            "Configurations": [{"Classification": "export", "Properties": yarn_env, "Configurations": []}],
        })
    if livy_env is not None:
        args["Configurations"].append({
            "Classification": "livy-env",
            "Properties": {},
            "Configurations": [{"Classification": "export", "Properties": livy_env, "Configurations": []}],
        })
    if pars["spark_glue_catalog"] is True:
        args["Configurations"].append({
            "Classification": "spark-hive-site",
            "Properties": {
                "hive.metastore.client.factory.class":
                "com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory"  # noqa
            },
            "Configurations": [],
        })
    if pars["hive_glue_catalog"] is True:
        hive_conf: Optional[Dict[str, Any]] = {
            "Classification": "hive-site",
            "Properties": {},
            "Configurations": []
        }
        hive_conf["Properties"][
            "hive.metastore.client.factory.class"] = "com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory"
        args["Configurations"].append(hive_conf)
    if pars["presto_glue_catalog"] is True:
        args["Configurations"].append({
            "Classification": "presto-connector-hive",
            "Properties": {
                "hive.metastore.glue.datacatalog.enabled": "true"
            },
            "Configurations": [],
        })
    if pars["consistent_view"] is True:
        args["Configurations"].append({
            "Classification": "emrfs-site",
            "Properties": {
                "fs.s3.consistent.retryPeriodSeconds":
                str(pars.get("consistent_view_retry_seconds", "10")),
                "fs.s3.consistent":
                "true",
                "fs.s3.consistent.retryCount":
                str(pars.get("consistent_view_retry_count", "5")),
                "fs.s3.consistent.metadata.tableName":
                pars.get("consistent_view_table_name", "EmrFSMetadata"),
            },
        })
    if pars["maximize_resource_allocation"] is True:
        args["Configurations"].append({
            "Classification": "spark",
            "Properties": {
                "maximizeResourceAllocation": "true"
            }
        })
    if pars["spark_defaults"] is not None:
        spark_defaults: Dict[str, Union[str, Dict[str, str]]] = {
            "Classification": "spark-defaults",
            "Properties": pars["spark_defaults"],
        }
        args["Configurations"].append(spark_defaults)
    if pars.get("custom_classifications") is not None:
        for c in pars["custom_classifications"]:
            args["Configurations"].append(c)

    # Applications
    if pars["applications"]:
        args["Applications"] = [{"Name": x} for x in pars["applications"]]

    # Bootstraps
    if pars["bootstraps_paths"]:
        args["BootstrapActions"] = [{
            "Name": x,
            "ScriptBootstrapAction": {
                "Path": x
            }
        } for x in pars["bootstraps_paths"]]

    # Debugging and Steps
    if (pars["debugging"] is True) or (pars["steps"] is not None):
        args["Steps"] = []
        if pars["debugging"] is True:
            args["Steps"].append({
                "Name": "Setup Hadoop Debugging",
                "ActionOnFailure": "TERMINATE_CLUSTER",
                "HadoopJarStep": {
                    "Jar": "command-runner.jar",
                    "Args": ["state-pusher-script"]
                },
            })
        if pars["steps"] is not None:
            args["Steps"] += pars["steps"]

    # Master Instance Fleet
    timeout_action_master: str = (
        "SWITCH_TO_ON_DEMAND" if pars["spot_timeout_to_on_demand_master"] else "TERMINATE_CLUSTER"
    )
    fleet_master: Dict = {
        "Name": "MASTER",
        "InstanceFleetType": "MASTER",
        "TargetOnDemandCapacity": pars["instance_num_on_demand_master"],
        "TargetSpotCapacity": pars["instance_num_spot_master"],
        "InstanceTypeConfigs": [{
            "InstanceType": pars["instance_type_master"],
            "WeightedCapacity": 1,
            "BidPriceAsPercentageOfOnDemandPrice": pars["spot_bid_percentage_of_on_demand_master"],
            "EbsConfiguration": {
                "EbsBlockDeviceConfigs": [{
                    "VolumeSpecification": {"SizeInGB": pars["instance_ebs_size_master"], "VolumeType": "gp2"},
                    "VolumesPerInstance": 1,
                }],
                "EbsOptimized": True,
            },
        }],
    }
    if pars["instance_num_spot_master"] > 0:
        fleet_master["LaunchSpecifications"] = {
            "SpotSpecification": {
                "TimeoutDurationMinutes": pars["spot_provisioning_timeout_master"],
                "TimeoutAction": timeout_action_master,
            }
        }
    args["Instances"]["InstanceFleets"].append(fleet_master)

    # Core Instance Fleet
    if (pars["instance_num_spot_core"] >
            0) or pars["instance_num_on_demand_core"] > 0:
        timeout_action_core = "SWITCH_TO_ON_DEMAND" if pars[
            "spot_timeout_to_on_demand_core"] else "TERMINATE_CLUSTER"
        fleet_core: Dict = {
            "Name":
            "CORE",
            "InstanceFleetType":
            "CORE",
            "TargetOnDemandCapacity":
            pars["instance_num_on_demand_core"],
            "TargetSpotCapacity":
            pars["instance_num_spot_core"],
            "InstanceTypeConfigs": [{
                "InstanceType":
                pars["instance_type_core"],
                "WeightedCapacity":
                1,
                "BidPriceAsPercentageOfOnDemandPrice":
                pars["spot_bid_percentage_of_on_demand_core"],
                "EbsConfiguration": {
                    "EbsBlockDeviceConfigs": [{
                        "VolumeSpecification": {
                            "SizeInGB": pars["instance_ebs_size_core"],
                            "VolumeType": "gp2",
                        },
                        "VolumesPerInstance": 1,
                    }],
                    "EbsOptimized":
                    True,
                },
            }],
        }
        if pars["instance_num_spot_core"] > 0:
            fleet_core["LaunchSpecifications"] = {
                "SpotSpecification": {
                    "TimeoutDurationMinutes":
                    pars["spot_provisioning_timeout_core"],
                    "TimeoutAction":
                    timeout_action_core,
                }
            }
        args["Instances"]["InstanceFleets"].append(fleet_core)

    # Task Instance Fleet
    if (pars["instance_num_spot_task"] >
            0) or pars["instance_num_on_demand_task"] > 0:
        timeout_action_task: str = "SWITCH_TO_ON_DEMAND" if pars[
            "spot_timeout_to_on_demand_task"] else "TERMINATE_CLUSTER"
        fleet_task: Dict = {
            "Name":
            "TASK",
            "InstanceFleetType":
            "TASK",
            "TargetOnDemandCapacity":
            pars["instance_num_on_demand_task"],
            "TargetSpotCapacity":
            pars["instance_num_spot_task"],
            "InstanceTypeConfigs": [{
                "InstanceType":
                pars["instance_type_task"],
                "WeightedCapacity":
                1,
                "BidPriceAsPercentageOfOnDemandPrice":
                pars["spot_bid_percentage_of_on_demand_task"],
                "EbsConfiguration": {
                    "EbsBlockDeviceConfigs": [{
                        "VolumeSpecification": {
                            "SizeInGB": pars["instance_ebs_size_task"],
                            "VolumeType": "gp2",
                        },
                        "VolumesPerInstance": 1,
                    }],
                    "EbsOptimized":
                    True,
                },
            }],
        }
        if pars["instance_num_spot_task"] > 0:
            fleet_task["LaunchSpecifications"] = {
                "SpotSpecification": {
                    "TimeoutDurationMinutes":
                    pars["spot_provisioning_timeout_task"],
                    "TimeoutAction":
                    timeout_action_task,
                }
            }
        args["Instances"]["InstanceFleets"].append(fleet_task)

    # Tags
    if pars["tags"] is not None:
        args["Tags"] = [{
            "Key": k,
            "Value": v
        } for k, v in pars["tags"].items()]

    _logger.debug("args: \n%s", pprint.pformat(args))
    return args
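In practice this private builder is fed by the public cluster-creation API, which collects its keyword arguments into pars. A hedged sketch, assuming wr.emr.create_cluster forwards these parameter names unchanged (all values below are placeholders and only a small subset of the available options):

import awswrangler as wr

cluster_id = wr.emr.create_cluster(
    subnet_id="subnet-0123456789abcdef0",
    cluster_name="my-wrangler-cluster",
    spark_glue_catalog=True,
    instance_type_master="m5.xlarge",
    instance_num_on_demand_master=1,
)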
Example #6
def connect(
    host: str,
    port: Optional[int] = 443,
    boto3_session: Optional[boto3.Session] = boto3.Session(),
    region: Optional[str] = None,
    username: Optional[str] = None,
    password: Optional[str] = None,
) -> OpenSearch:
    """Create a secure connection to the specified Amazon OpenSearch domain.

    Note
    ----
    We use `opensearch-py <https://github.com/opensearch-project/opensearch-py>`_, an OpenSearch python client.

    The username and password are mandatory if the OS Cluster uses `Fine Grained Access Control \
<https://docs.aws.amazon.com/opensearch-service/latest/developerguide/fgac.html>`_.
    If fine-grained access control is disabled, the session's access key and secret key are used.

    Parameters
    ----------
    host : str
        Amazon OpenSearch domain, for example: my-test-domain.us-east-1.es.amazonaws.com.
    port : int, optional
        OpenSearch Service only accepts connections over port 80 (HTTP) or 443 (HTTPS).
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 Session will be used if boto3_session receives None.
    region : str, optional
        AWS region of the Amazon OS domain. If not provided, it will be extracted from boto3_session.
    username : str, optional
        Fine-grained access control username. Mandatory if the OS cluster uses fine-grained access control.
    password : str, optional
        Fine-grained access control password. Mandatory if the OS cluster uses fine-grained access control.

    Returns
    -------
    opensearchpy.OpenSearch
        OpenSearch low-level client.
        https://github.com/opensearch-project/opensearch-py/blob/main/opensearchpy/client/__init__.py
    """
    valid_ports = {80, 443}

    if port not in valid_ports:
        raise ValueError(f"results: port must be one of {valid_ports}")

    if username and password:
        http_auth = (username, password)
    else:
        if region is None:
            region = _utils.get_region_from_session(
                boto3_session=boto3_session)
        creds = _utils.get_credentials_from_session(
            boto3_session=boto3_session)
        if creds.access_key is None or creds.secret_key is None:
            raise exceptions.InvalidArgument(
                "One of IAM Role or AWS ACCESS_KEY_ID and SECRET_ACCESS_KEY must be "
                "given. Unable to find ACCESS_KEY_ID and SECRET_ACCESS_KEY in boto3 "
                "session.")
        http_auth = AWS4Auth(creds.access_key,
                             creds.secret_key,
                             region,
                             "es",
                             session_token=creds.token)
    try:
        es = OpenSearch(
            host=_strip_endpoint(host),
            port=port,
            http_auth=http_auth,
            use_ssl=True,
            verify_certs=True,
            connection_class=RequestsHttpConnection,
            timeout=30,
            max_retries=10,
            retry_on_timeout=True,
        )
    except Exception as e:
        _logger.error(
            "Error connecting to Opensearch cluster. Please verify authentication details"
        )
        raise e
    return es
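Two hedged connection sketches (the domain endpoint and credentials are placeholders): with fine-grained access control the username/password pair is passed through as HTTP basic auth; otherwise the boto3 session credentials are used to build an AWS4Auth signer for the "es" service, as the code above shows.

import awswrangler as wr

# Fine-grained access control (FGAC) enabled on the domain:
client = wr.opensearch.connect(
    host="my-test-domain.us-east-1.es.amazonaws.com",
    username="fgac-user",
    password="fgac-password",
)

# FGAC disabled: fall back to signing requests with the session credentials.
client = wr.opensearch.connect(host="my-test-domain.us-east-1.es.amazonaws.com")
print(client.info())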