def start(self):
    """Attach to an already-existing EMR cluster and return its keys/data.

    The AWS region is taken from the cluster settings when present,
    otherwise from the current machine's instance metadata.
    """
    region = self.config.get("awsRegionId") or dku_emr.get_current_region()
    emr_client = dku_emr.get_emr_client(self.config, region)
    cluster_id = self.config["emrClusterId"]
    logging.info("Attaching to EMR cluster id %s" % cluster_id)
    return dku_emr.make_cluster_keys_and_data(
        emr_client, cluster_id, create_user_dir=True)
def attach_cluster(self):
    """Look up a running EMR cluster by its configured id and return its
    keys/data.

    Region resolution falls back to the current machine's region when the
    cluster settings do not specify one.
    """
    cfg = self.my_cluster.config
    region = cfg.get("awsRegionId") or dku_emr.get_current_region()
    emr = boto3.client("emr", region_name=region)
    cluster_id = cfg["emrClusterId"]
    logging.info("Attaching to EMR cluster id %s" % cluster_id)
    return dku_emr.make_cluster_keys_and_data(
        emr, cluster_id, create_user_dir=True)
def start(self):
    """Create a new EMR cluster from the DSS cluster settings, wait until
    it is running, and return its keys/data.

    Returns:
        The result of dku_emr.make_cluster_keys_and_data for the new
        cluster (with user dir creation and optional database creation).

    Raises:
        Exception: if a core/task instance count is configured without a
            matching instance type.
        KeyError: if a required config key (e.g. "metastoreDBMode") is
            missing.
    """
    region = self.config.get("awsRegionId") or dku_emr.get_current_region()
    client = boto3.client('emr', region_name=region)
    release = 'emr-%s' % self.config["emrVersion"]
    name = "DSS cluster id=%s name=%s" % (self.cluster_id, self.cluster_name)
    logging.info("starting cluster, release=%s name=%s" % (release, name))

    # Optional run_job_flow keyword arguments.
    extra_args = {}
    if "logsPath" in self.config:
        extra_args['LogUri'] = self.config["logsPath"]
    if "securityConfiguration" in self.config:
        extra_args["SecurityConfiguration"] = self.config["securityConfiguration"]
    if self.config.get("ebsRootVolumeSize", 0):
        extra_args["EbsRootVolumeSize"] = self.config["ebsRootVolumeSize"]

    # Build instances first so "Missing ... instance type" errors surface
    # before any metastore config errors, as in the original flow.
    instances = self._instance_params()

    hive_props = self._hive_site_properties()
    if hive_props is not None:
        extra_args["Configurations"] = [{
            "Classification": "hive-site",
            "Properties": hive_props
        }]

    # "Name" tag first, then user-defined tags ({"from": key, "to": value}).
    tags = [{'Key': 'Name', 'Value': name}]
    for tag in self.config.get("tags", []):
        tags.append({"Key": tag["from"], "Value": tag["to"]})

    # Build the parameters once so what is logged is exactly what is sent
    # to run_job_flow (previously the argument list was duplicated and
    # could silently drift between the log line and the API call).
    job_flow_params = dict(
        Name=name,
        ReleaseLabel=release,
        Instances=instances,
        Applications=[{"Name": app} for app in
                      ("Hadoop", "Hive", "Tez", "Pig", "Spark", "Zookeeper")],
        VisibleToAllUsers=True,
        JobFlowRole=self.config["nodesRole"],
        ServiceRole=self.config["serviceRole"],
        Tags=tags,
        **extra_args)

    logging.info("Starting cluster: %s", job_flow_params)
    response = client.run_job_flow(**job_flow_params)

    clusterId = response['JobFlowId']
    logging.info("clusterId=%s" % clusterId)
    logging.info("waiting for cluster to start")
    client.get_waiter('cluster_running').wait(ClusterId=clusterId)
    return dku_emr.make_cluster_keys_and_data(
        client, clusterId, create_user_dir=True,
        create_databases=self.config.get("databasesToCreate"))

def _instance_params(self):
    """Build the 'Instances' argument for run_job_flow from the config.

    Always includes one MASTER node; CORE and TASK groups are added only
    when their instance counts are configured.

    Raises:
        Exception: if a core/task count is set without an instance type.
    """
    security_groups = []
    if "additionalSecurityGroups" in self.config:
        security_groups = [
            x.strip()
            for x in self.config["additionalSecurityGroups"].split(",")
        ]
    subnet = self.config.get("subnetId") or dku_emr.get_current_subnet()
    instances = {
        'InstanceGroups': [{
            'InstanceRole': 'MASTER',
            'InstanceType': self.config["masterInstanceType"],
            'InstanceCount': 1
        }],
        'KeepJobFlowAliveWhenNoSteps': True,
        'Ec2SubnetId': subnet,
        'AdditionalMasterSecurityGroups': security_groups,
        'AdditionalSlaveSecurityGroups': security_groups
    }
    if self.config.get("coreInstanceCount"):
        if not self.config.get("coreInstanceType"):
            raise Exception("Missing core instance type")
        instances['InstanceGroups'].append({
            'InstanceRole': 'CORE',
            'InstanceType': self.config["coreInstanceType"],
            'InstanceCount': self.config["coreInstanceCount"]
        })
    if self.config.get("taskInstanceCount"):
        if not self.config.get("taskInstanceType"):
            raise Exception("Missing task instance type")
        instances['InstanceGroups'].append({
            'InstanceRole': 'TASK',
            'InstanceType': self.config["taskInstanceType"],
            'InstanceCount': self.config["taskInstanceCount"]
        })
    if "ec2KeyName" in self.config:
        instances['Ec2KeyName'] = self.config["ec2KeyName"]
    return instances

def _hive_site_properties(self):
    """Return the hive-site Properties dict for the configured metastore
    mode, or None when no hive-site override is required.

    Modes: CUSTOM_JDBC (explicit JDBC settings), MYSQL (MariaDB driver
    against a MySQL host), AWS_GLUE_DATA_CATALOG (Glue client factory).

    Raises:
        KeyError: if "metastoreDBMode" (or a mode-specific key) is missing.
    """
    mode = self.config["metastoreDBMode"]
    if mode == "CUSTOM_JDBC":
        return {
            "javax.jdo.option.ConnectionURL": self.config["metastoreJDBCURL"],
            "javax.jdo.option.ConnectionDriverName": self.config["metastoreJDBCDriver"],
            "javax.jdo.option.ConnectionUserName": self.config["metastoreJDBCUser"],
            "javax.jdo.option.ConnectionPassword": self.config["metastoreJDBCPassword"],
        }
    if mode == "MYSQL":
        return {
            "javax.jdo.option.ConnectionURL":
                "jdbc:mysql://%s:3306/hive?createDatabaseIfNotExist=true"
                % self.config["metastoreMySQLHost"],
            "javax.jdo.option.ConnectionDriverName": "org.mariadb.jdbc.Driver",
            "javax.jdo.option.ConnectionUserName": self.config["metastoreMySQLUser"],
            "javax.jdo.option.ConnectionPassword": self.config["metastoreMySQLPassword"]
        }
    if mode == "AWS_GLUE_DATA_CATALOG":
        return {
            "hive.metastore.client.factory.class":
                "com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory"
        }
    return None
def build_cluster(self):
    """Create a new EMR cluster for the attached DSS cluster, wait until
    it reaches the 'running' state, and return its keys/data.

    Returns:
        The result of dku_emr.make_cluster_keys_and_data for the new
        cluster (with user dir creation and optional database creation).

    Raises:
        Exception: if 'logsPath' is configured but is not an s3:// URI.
    """
    config = self.my_cluster.config
    region = config.get("awsRegionId") or dku_emr.get_current_region()
    client = boto3.client('emr', region_name=region)
    release = 'emr-%s' % config["emrVersion"]

    # Cluster names carry a "dss-" prefix unless the id already has it.
    name_prefix = "dss-"
    dss_cluster_id = self.my_cluster.cluster_id
    if dss_cluster_id.startswith(name_prefix):
        name = dss_cluster_id
    else:
        name = name_prefix + dss_cluster_id
    logging.info("starting cluster, release=%s name=%s" % (release, name))

    extra_args = {}

    # Path for logs in S3.
    logs_path = config.get("logsPath")
    if logs_path is not None:
        # Must be a proper s3:// URI. The previous check only looked for
        # the substring "s3://" anywhere in the path, which wrongly
        # accepted values like "foo/s3://bar".
        if logs_path.startswith("s3://"):
            extra_args[Arg.LogUri] = logs_path
        else:
            raise Exception(
                "'{}' is not a valid S3 path".format(logs_path))

    # Use specified security config.
    if "securityConfiguration" in config:
        extra_args[Arg.SecurityConfig] = config["securityConfiguration"]

    # EBS root volume size (minimum of 10); defaults to 25 when unset.
    extra_args[Arg.EbsRootVolSize] = int(
        config.get('ebsRootVolumeSize') or 25)

    # EMR app (e.g., Spark, Hive) configs.
    extra_args[Arg.Configurations] = self._get_software_configs()

    # All args to run_job_flow(..): base params plus the optional ones.
    job_flow_params = {
        Arg.Name: name,
        Arg.Release: release,
        Arg.Instances: self._get_instance_configs(),
        Arg.Applications: self._get_apps_to_install(),
        Arg.VisibleToAllUsers: True,
        Arg.JobFlowRole: config["nodesRole"],
        Arg.ServiceRole: config["serviceRole"],
        Arg.Tags: self._get_tags(name)
    }
    job_flow_params.update(extra_args)

    logging.info("Starting cluster: %s", job_flow_params)
    response = client.run_job_flow(**job_flow_params)
    cluster_id = response[Response.JobFlowId]
    logging.info("cluster_id=%s" % cluster_id)
    logging.info("waiting for cluster to start")
    client.get_waiter('cluster_running').wait(ClusterId=cluster_id)

    cluster_keys_and_data = dku_emr.make_cluster_keys_and_data(
        client, cluster_id, create_user_dir=True,
        create_databases=config.get("databasesToCreate"))

    # TODO: install python libs ('pythonLibs'/'python_libs') and run
    # post-launch SSH commands ('extraSetup'/'extra_setup') on each node,
    # terminating the cluster if that setup fails.
    return cluster_keys_and_data