def _generate_permissions(
    resource: str,
    account_id: str,
    boto3_session: boto3.Session,
    allowed_to_use: Optional[List[str]] = None,
    allowed_to_manage: Optional[List[str]] = None,
) -> List[Dict[str, Union[str, List[str]]]]:
    permissions: List[Dict[str, Union[str, List[str]]]] = []
    if (allowed_to_use is None) and (allowed_to_manage is None):
        return permissions

    # Ensure the same principal is not in both lists at the same time ("manage" takes precedence).
    if (allowed_to_use is not None) and (allowed_to_manage is not None):
        allowed_to_use = list(set(allowed_to_use) - set(allowed_to_manage))

    region: str = _utils.get_region_from_session(boto3_session=boto3_session)
    if allowed_to_use is not None:
        permissions += [
            {
                "Principal": _generate_principal(user_name=user_name, account_id=account_id, region=region),
                "Actions": _ALLOWED_ACTIONS[resource]["allowed_to_use"],
            }
            for user_name in allowed_to_use
        ]
    if allowed_to_manage is not None:
        permissions += [
            {
                "Principal": _generate_principal(user_name=user_name, account_id=account_id, region=region),
                "Actions": _ALLOWED_ACTIONS[resource]["allowed_to_manage"],
            }
            for user_name in allowed_to_manage
        ]
    return permissions
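# --- Illustrative usage sketch (not part of the module; the resource key and
# --- user names below are placeholder assumptions) ---
# A principal listed in both lists only receives the "allowed_to_manage"
# action set, because the subtraction above removes it from `allowed_to_use`:
#
# >>> perms = _generate_permissions(
# ...     resource="dashboard",            # assumed key of _ALLOWED_ACTIONS
# ...     account_id="111111111111",       # placeholder account ID
# ...     boto3_session=boto3.Session(),
# ...     allowed_to_use=["alice", "bob"],
# ...     allowed_to_manage=["bob"],
# ... )
# >>> # "bob" appears once, with the manage actions; "alice" gets the use actions.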
def build_step(
    command: str,
    name: str = "my-step",
    action_on_failure: str = "CONTINUE",
    script: bool = False,
    region: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> Dict[str, Any]:
    """Build the Step structure (dictionary).

    Parameters
    ----------
    command : str
        e.g. 'echo "Hello!"'
        e.g. for script 's3://.../script.sh arg1 arg2'
    name : str, optional
        Step name.
    action_on_failure : str
        'TERMINATE_JOB_FLOW', 'TERMINATE_CLUSTER', 'CANCEL_AND_WAIT', 'CONTINUE'
    script : bool
        False for raw command or True for script runner.
        https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-commandrunner.html
    region : str, optional
        Region name, to avoid extracting it from the boto3.Session. (e.g. `us-east-1`)
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    Dict[str, Any]
        Step structure.

    Examples
    --------
    >>> import awswrangler as wr
    >>> steps = []
    >>> for cmd in ['echo "Hello"', "ls -la"]:
    ...     steps.append(wr.emr.build_step(name=cmd, command=cmd))
    >>> wr.emr.submit_steps(cluster_id="cluster-id", steps=steps)

    """
    jar: str = "command-runner.jar"
    if script is True:
        if region is not None:
            _region: str = region
        else:
            _region = _utils.get_region_from_session(boto3_session=boto3_session, default_region="us-east-1")
        jar = f"s3://{_region}.elasticmapreduce/libs/script-runner/script-runner.jar"
    step: Dict[str, Any] = {
        "Name": name,
        "ActionOnFailure": action_on_failure,
        "HadoopJarStep": {"Jar": jar, "Args": command.split(" ")},
    }
    return step
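# Illustrative sketch (hypothetical S3 path): with script=True the jar switches
# from command-runner.jar to the regional script-runner.jar, so the result is
# fully determined by the code above:
#
# >>> step = build_step(
# ...     command="s3://my-bucket/scripts/job.sh arg1 arg2",  # placeholder script
# ...     name="run-job-sh",
# ...     script=True,
# ...     region="us-east-1",
# ... )
# >>> step["HadoopJarStep"]["Jar"]
# 's3://us-east-1.elasticmapreduce/libs/script-runner/script-runner.jar'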
def submit_ecr_credentials_refresh(
    cluster_id: str, path: str, action_on_failure: str = "CONTINUE", boto3_session: Optional[boto3.Session] = None
) -> str:
    """Update internal ECR credentials.

    Parameters
    ----------
    cluster_id : str
        Cluster ID.
    path : str
        Amazon S3 path where Wrangler will stage the script ecr_credentials_refresh.py (e.g. s3://bucket/emr/).
    action_on_failure : str
        'TERMINATE_JOB_FLOW', 'TERMINATE_CLUSTER', 'CANCEL_AND_WAIT', 'CONTINUE'
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    str
        Step ID.

    Examples
    --------
    >>> import awswrangler as wr
    >>> step_id = wr.emr.submit_ecr_credentials_refresh("cluster_id", "s3://bucket/emr/")

    """
    path = path[:-1] if path.endswith("/") else path
    path_script: str = f"{path}/ecr_credentials_refresh.py"
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    client_s3: boto3.client = _utils.client(service_name="s3", session=session)
    bucket, key = _utils.parse_path(path=path_script)
    region: str = _utils.get_region_from_session(boto3_session=boto3_session)
    client_s3.put_object(
        Body=_get_ecr_credentials_refresh_content(region).encode(encoding="utf-8"), Bucket=bucket, Key=key
    )
    command: str = f"spark-submit --deploy-mode cluster {path_script}"
    name: str = "ECR Credentials Refresh"
    step: Dict[str, Any] = build_step(
        name=name, command=command, action_on_failure=action_on_failure, script=False, boto3_session=session
    )
    client_emr: boto3.client = _utils.client(service_name="emr", session=session)
    response: Dict[str, Any] = client_emr.add_job_flow_steps(JobFlowId=cluster_id, Steps=[step])
    _logger.debug("response: \n%s", pprint.pformat(response))
    return response["StepIds"][0]
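# Illustrative sketch (placeholder IDs/paths): the function stages
# ecr_credentials_refresh.py under `path` and submits it with
# "spark-submit --deploy-mode cluster" as a regular EMR step:
#
# >>> session = boto3.Session(profile_name="analytics")  # hypothetical profile
# >>> step_id = submit_ecr_credentials_refresh(
# ...     cluster_id="j-XXXXXXXXXXXXX",   # placeholder cluster ID
# ...     path="s3://my-bucket/emr/",     # hypothetical staging prefix
# ...     boto3_session=session,
# ... )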
def _get_default_logging_path(
    subnet_id: Optional[str] = None,
    account_id: Optional[str] = None,
    region: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> str:
    """Get the EMR default logging path.

    E.g. "s3://aws-logs-{account_id}-{region}/elasticmapreduce/"

    Parameters
    ----------
    subnet_id : str, optional
        Subnet ID. If not provided, you must pass `account_id` and `region` explicitly.
    account_id : str, optional
        Account ID.
    region : str, optional
        Region e.g. 'us-east-1'
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    str
        Default logging path.
        E.g. "s3://aws-logs-{account_id}-{region}/elasticmapreduce/"

    Examples
    --------
    >>> import awswrangler as wr
    >>> state = wr.emr._get_default_logging_path("subnet-id")
    's3://aws-logs-{account_id}-{region}/elasticmapreduce/'

    """
    if account_id is None:
        boto3_session = _utils.ensure_session(session=boto3_session)
        _account_id: str = _utils.get_account_id(boto3_session=boto3_session)
    else:
        _account_id = account_id
    if (region is None) and (subnet_id is not None):
        boto3_session = _utils.ensure_session(session=boto3_session)
        _region: str = _utils.get_region_from_session(boto3_session=boto3_session)
    elif (region is None) and (subnet_id is None):
        raise exceptions.InvalidArgumentCombination("You must pass region or subnet_id or both.")
    else:
        _region = region  # type: ignore
    return f"s3://aws-logs-{_account_id}-{_region}/elasticmapreduce/"
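# Illustrative sketch (placeholder account/region): the two resolution paths.
#
# >>> _get_default_logging_path(subnet_id="subnet-0123456789abcdef0")
# 's3://aws-logs-111111111111-us-east-1/elasticmapreduce/'   # account/region from the session
# >>> _get_default_logging_path(account_id="111111111111", region="us-east-1")
# 's3://aws-logs-111111111111-us-east-1/elasticmapreduce/'
# Passing neither region nor subnet_id raises InvalidArgumentCombination.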
def _build_cluster_args(**pars):  # pylint: disable=too-many-branches,too-many-statements
    account_id: str = sts.get_account_id(boto3_session=pars["boto3_session"])
    region: str = _utils.get_region_from_session(boto3_session=pars["boto3_session"])

    # S3 Logging path
    if pars.get("logging_s3_path") is None:
        pars["logging_s3_path"] = _get_default_logging_path(
            subnet_id=None, account_id=account_id, region=region, boto3_session=pars["boto3_session"]
        )

    spark_env: Optional[Dict[str, str]] = None
    yarn_env: Optional[Dict[str, str]] = None
    livy_env: Optional[Dict[str, str]] = None

    if pars["spark_pyarrow"] is True:
        if pars["spark_defaults"] is None:
            pars["spark_defaults"] = {"spark.sql.execution.arrow.enabled": "true"}
        else:
            pars["spark_defaults"]["spark.sql.execution.arrow.enabled"] = "true"
        spark_env = {"ARROW_PRE_0_15_IPC_FORMAT": "1"}
        yarn_env = {"ARROW_PRE_0_15_IPC_FORMAT": "1"}
        livy_env = {"ARROW_PRE_0_15_IPC_FORMAT": "1"}

    if pars["python3"] is True:
        if spark_env is None:
            spark_env = {"PYSPARK_PYTHON": "/usr/bin/python3"}
        else:
            spark_env["PYSPARK_PYTHON"] = "/usr/bin/python3"

    if pars["spark_jars_path"] is not None:
        paths: str = ",".join(pars["spark_jars_path"])
        if pars["spark_defaults"] is None:
            pars["spark_defaults"] = {"spark.jars": paths}
        else:
            pars["spark_defaults"]["spark.jars"] = paths

    args: Dict[str, Any] = {
        "Name": pars["cluster_name"],
        "LogUri": pars["logging_s3_path"],
        "ReleaseLabel": pars["emr_release"],
        "VisibleToAllUsers": pars["visible_to_all_users"],
        "JobFlowRole": pars["emr_ec2_role"],
        "ServiceRole": pars["emr_role"],
        "Instances": {
            "KeepJobFlowAliveWhenNoSteps": pars["keep_cluster_alive_when_no_steps"],
            "TerminationProtected": pars["termination_protected"],
            "Ec2SubnetId": pars["subnet_id"],
            "InstanceFleets": [],
        },
    }

    # EC2 Key Pair
    if pars["key_pair_name"] is not None:
        args["Instances"]["Ec2KeyName"] = pars["key_pair_name"]

    # Security groups
    if pars["security_group_master"] is not None:
        args["Instances"]["EmrManagedMasterSecurityGroup"] = pars["security_group_master"]
    if pars["security_groups_master_additional"] is not None:
        args["Instances"]["AdditionalMasterSecurityGroups"] = pars["security_groups_master_additional"]
    if pars["security_group_slave"] is not None:
        args["Instances"]["EmrManagedSlaveSecurityGroup"] = pars["security_group_slave"]
    if pars["security_groups_slave_additional"] is not None:
        args["Instances"]["AdditionalSlaveSecurityGroups"] = pars["security_groups_slave_additional"]
    if pars["security_group_service_access"] is not None:
        args["Instances"]["ServiceAccessSecurityGroup"] = pars["security_group_service_access"]

    # Configurations
    args["Configurations"] = [
        {
            "Classification": "spark-log4j",
            "Properties": {"log4j.rootCategory": f"{pars['spark_log_level']}, console"},
        }
    ]
    if pars["docker"] is True:
        if pars.get("extra_registries") is None:
            extra_registries: List[str] = []
        else:
            extra_registries = pars["extra_registries"]
        registries: str = f"local,centos,{account_id}.dkr.ecr.{region}.amazonaws.com,{','.join(extra_registries)}"
        registries = registries[:-1] if registries.endswith(",") else registries
        args["Configurations"].append(
            {
                "Classification": "container-executor",
                "Properties": {},
                "Configurations": [
                    {
                        "Classification": "docker",
                        "Properties": {
                            "docker.privileged-containers.registries": registries,
                            "docker.trusted.registries": registries,
                        },
                        "Configurations": [],
                    }
                ],
            }
        )
    if spark_env is not None:
        args["Configurations"].append(
            {
                "Classification": "spark-env",
                "Properties": {},
                "Configurations": [{"Classification": "export", "Properties": spark_env, "Configurations": []}],
            }
        )
    if yarn_env is not None:
        args["Configurations"].append(
            {
                "Classification": "yarn-env",
                "Properties": {},
                "Configurations": [{"Classification": "export", "Properties": yarn_env, "Configurations": []}],
            }
        )
    if livy_env is not None:
        args["Configurations"].append(
            {
                "Classification": "livy-env",
                "Properties": {},
                "Configurations": [{"Classification": "export", "Properties": livy_env, "Configurations": []}],
            }
        )
    if pars["spark_glue_catalog"] is True:
        args["Configurations"].append(
            {
                "Classification": "spark-hive-site",
                "Properties": {
                    "hive.metastore.client.factory.class": "com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory"  # noqa
                },
                "Configurations": [],
            }
        )
    if pars["hive_glue_catalog"] is True:
        hive_conf: Optional[Dict[str, Any]] = {"Classification": "hive-site", "Properties": {}, "Configurations": []}
        hive_conf["Properties"][
            "hive.metastore.client.factory.class"
        ] = "com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory"
        args["Configurations"].append(hive_conf)
    if pars["presto_glue_catalog"] is True:
        args["Configurations"].append(
            {
                "Classification": "presto-connector-hive",
                "Properties": {"hive.metastore.glue.datacatalog.enabled": "true"},
                "Configurations": [],
            }
        )
    if pars["consistent_view"] is True:
        args["Configurations"].append(
            {
                "Classification": "emrfs-site",
                "Properties": {
                    "fs.s3.consistent.retryPeriodSeconds": str(pars.get("consistent_view_retry_seconds", "10")),
                    "fs.s3.consistent": "true",
                    "fs.s3.consistent.retryCount": str(pars.get("consistent_view_retry_count", "5")),
                    "fs.s3.consistent.metadata.tableName": pars.get("consistent_view_table_name", "EmrFSMetadata"),
                },
            }
        )
    if pars["maximize_resource_allocation"] is True:
        args["Configurations"].append(
            {"Classification": "spark", "Properties": {"maximizeResourceAllocation": "true"}}
        )
    if pars["spark_defaults"] is not None:
        spark_defaults: Dict[str, Union[str, Dict[str, str]]] = {
            "Classification": "spark-defaults",
            "Properties": pars["spark_defaults"],
        }
        args["Configurations"].append(spark_defaults)
    if pars.get("custom_classifications") is not None:
        for c in pars["custom_classifications"]:
            args["Configurations"].append(c)

    # Applications
    if pars["applications"]:
        args["Applications"] = [{"Name": x} for x in pars["applications"]]

    # Bootstraps
    if pars["bootstraps_paths"]:
        args["BootstrapActions"] = [
            {"Name": x, "ScriptBootstrapAction": {"Path": x}} for x in pars["bootstraps_paths"]
        ]

    # Debugging and Steps
    if (pars["debugging"] is True) or (pars["steps"] is not None):
        args["Steps"] = []
    if pars["debugging"] is True:
        args["Steps"].append(
            {
                "Name": "Setup Hadoop Debugging",
                "ActionOnFailure": "TERMINATE_CLUSTER",
                "HadoopJarStep": {"Jar": "command-runner.jar", "Args": ["state-pusher-script"]},
            }
        )
    if pars["steps"] is not None:
        args["Steps"] += pars["steps"]

    # Master Instance Fleet
    timeout_action_master: str = (
        "SWITCH_TO_ON_DEMAND" if pars["spot_timeout_to_on_demand_master"] else "TERMINATE_CLUSTER"
    )
    fleet_master: Dict = {
        "Name": "MASTER",
        "InstanceFleetType": "MASTER",
        "TargetOnDemandCapacity": pars["instance_num_on_demand_master"],
        "TargetSpotCapacity": pars["instance_num_spot_master"],
        "InstanceTypeConfigs": [
            {
                "InstanceType": pars["instance_type_master"],
                "WeightedCapacity": 1,
                "BidPriceAsPercentageOfOnDemandPrice": pars["spot_bid_percentage_of_on_demand_master"],
                "EbsConfiguration": {
                    "EbsBlockDeviceConfigs": [
                        {
                            "VolumeSpecification": {
                                "SizeInGB": pars["instance_ebs_size_master"],
                                "VolumeType": "gp2",
                            },
                            "VolumesPerInstance": 1,
                        }
                    ],
                    "EbsOptimized": True,
                },
            }
        ],
    }
    if pars["instance_num_spot_master"] > 0:
        fleet_master["LaunchSpecifications"] = {
            "SpotSpecification": {
                "TimeoutDurationMinutes": pars["spot_provisioning_timeout_master"],
                "TimeoutAction": timeout_action_master,
            }
        }
    args["Instances"]["InstanceFleets"].append(fleet_master)

    # Core Instance Fleet
    if (pars["instance_num_spot_core"] > 0) or (pars["instance_num_on_demand_core"] > 0):
        timeout_action_core = (
            "SWITCH_TO_ON_DEMAND" if pars["spot_timeout_to_on_demand_core"] else "TERMINATE_CLUSTER"
        )
        fleet_core: Dict = {
            "Name": "CORE",
            "InstanceFleetType": "CORE",
            "TargetOnDemandCapacity": pars["instance_num_on_demand_core"],
            "TargetSpotCapacity": pars["instance_num_spot_core"],
            "InstanceTypeConfigs": [
                {
                    "InstanceType": pars["instance_type_core"],
                    "WeightedCapacity": 1,
                    "BidPriceAsPercentageOfOnDemandPrice": pars["spot_bid_percentage_of_on_demand_core"],
                    "EbsConfiguration": {
                        "EbsBlockDeviceConfigs": [
                            {
                                "VolumeSpecification": {
                                    "SizeInGB": pars["instance_ebs_size_core"],
                                    "VolumeType": "gp2",
                                },
                                "VolumesPerInstance": 1,
                            }
                        ],
                        "EbsOptimized": True,
                    },
                }
            ],
        }
        if pars["instance_num_spot_core"] > 0:
            fleet_core["LaunchSpecifications"] = {
                "SpotSpecification": {
                    "TimeoutDurationMinutes": pars["spot_provisioning_timeout_core"],
                    "TimeoutAction": timeout_action_core,
                }
            }
        args["Instances"]["InstanceFleets"].append(fleet_core)

    # Task Instance Fleet
    if (pars["instance_num_spot_task"] > 0) or (pars["instance_num_on_demand_task"] > 0):
        timeout_action_task: str = (
            "SWITCH_TO_ON_DEMAND" if pars["spot_timeout_to_on_demand_task"] else "TERMINATE_CLUSTER"
        )
        fleet_task: Dict = {
            "Name": "TASK",
            "InstanceFleetType": "TASK",
            "TargetOnDemandCapacity": pars["instance_num_on_demand_task"],
            "TargetSpotCapacity": pars["instance_num_spot_task"],
            "InstanceTypeConfigs": [
                {
                    "InstanceType": pars["instance_type_task"],
                    "WeightedCapacity": 1,
                    "BidPriceAsPercentageOfOnDemandPrice": pars["spot_bid_percentage_of_on_demand_task"],
                    "EbsConfiguration": {
                        "EbsBlockDeviceConfigs": [
                            {
                                "VolumeSpecification": {
                                    "SizeInGB": pars["instance_ebs_size_task"],
                                    "VolumeType": "gp2",
                                },
                                "VolumesPerInstance": 1,
                            }
                        ],
                        "EbsOptimized": True,
                    },
                }
            ],
        }
        if pars["instance_num_spot_task"] > 0:
            fleet_task["LaunchSpecifications"] = {
                "SpotSpecification": {
                    "TimeoutDurationMinutes": pars["spot_provisioning_timeout_task"],
                    "TimeoutAction": timeout_action_task,
                }
            }
        args["Instances"]["InstanceFleets"].append(fleet_task)

    # Tags
    if pars["tags"] is not None:
        args["Tags"] = [{"Key": k, "Value": v} for k, v in pars["tags"].items()]

    _logger.debug("args: \n%s", pprint.pformat(args))
    return args
def connect(
    host: str,
    port: Optional[int] = 443,
    boto3_session: Optional[boto3.Session] = boto3.Session(),
    region: Optional[str] = None,
    username: Optional[str] = None,
    password: Optional[str] = None,
) -> OpenSearch:
    """Create a secure connection to the specified Amazon OpenSearch domain.

    Note
    ----
    We use `opensearch-py <https://github.com/opensearch-project/opensearch-py>`_, an OpenSearch python client.

    The username and password are mandatory if the OS Cluster uses `Fine Grained Access Control \
<https://docs.aws.amazon.com/opensearch-service/latest/developerguide/fgac.html>`_.
    If fine grained access control is disabled, the session access key and secret key are used.

    Parameters
    ----------
    host : str
        Amazon OpenSearch domain, for example: my-test-domain.us-east-1.es.amazonaws.com.
    port : int
        OpenSearch Service only accepts connections over port 80 (HTTP) or 443 (HTTPS).
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 Session will be used if boto3_session receives None.
    region : str, optional
        AWS region of the Amazon OS domain. If not provided will be extracted from boto3_session.
    username : str, optional
        Fine-grained access control username. Mandatory if OS Cluster uses Fine Grained Access Control.
    password : str, optional
        Fine-grained access control password. Mandatory if OS Cluster uses Fine Grained Access Control.

    Returns
    -------
    opensearchpy.OpenSearch
        OpenSearch low-level client.
        https://github.com/opensearch-project/opensearch-py/blob/main/opensearchpy/client/__init__.py
    """
    valid_ports = {80, 443}
    if port not in valid_ports:
        raise ValueError(f"port must be one of {valid_ports}")
    if username and password:
        http_auth = (username, password)
    else:
        if region is None:
            region = _utils.get_region_from_session(boto3_session=boto3_session)
        creds = _utils.get_credentials_from_session(boto3_session=boto3_session)
        if creds.access_key is None or creds.secret_key is None:
            raise exceptions.InvalidArgument(
                "One of IAM Role or AWS ACCESS_KEY_ID and SECRET_ACCESS_KEY must be "
                "given. Unable to find ACCESS_KEY_ID and SECRET_ACCESS_KEY in boto3 "
                "session."
            )
        http_auth = AWS4Auth(creds.access_key, creds.secret_key, region, "es", session_token=creds.token)
    try:
        es = OpenSearch(
            host=_strip_endpoint(host),
            port=port,
            http_auth=http_auth,
            use_ssl=True,
            verify_certs=True,
            connection_class=RequestsHttpConnection,
            timeout=30,
            max_retries=10,
            retry_on_timeout=True,
        )
    except Exception as e:
        _logger.error("Error connecting to Opensearch cluster. Please verify authentication details.")
        raise e
    return es
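# Illustrative sketch (hypothetical domain endpoint): both authentication modes
# supported by connect().
#
# >>> # Fine-grained access control (username/password):
# >>> client = connect(
# ...     host="my-test-domain.us-east-1.es.amazonaws.com",
# ...     username="admin",   # hypothetical FGAC user
# ...     password="***",
# ... )
# >>> # IAM/SigV4 signing from the default boto3 session (no username/password):
# >>> client = connect(host="my-test-domain.us-east-1.es.amazonaws.com")
# >>> client.info()  # simple smoke test against the domain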