Example #1
    def start(self):
        region_name = self.config.get(
            "awsRegionId") or dku_emr.get_current_region()
        client = dku_emr.get_emr_client(self.config, region_name)
        clusterId = self.config["emrClusterId"]
        logging.info("Attaching to EMR cluster id %s" % clusterId)
        return dku_emr.make_cluster_keys_and_data(client,
                                                  clusterId,
                                                  create_user_dir=True)
Example #2
    def attach_cluster(self):
        region_name = self.my_cluster.config.get(
            "awsRegionId") or dku_emr.get_current_region()
        client = boto3.client("emr", region_name=region_name)
        cluster_id = self.my_cluster.config["emrClusterId"]
        logging.info("Attaching to EMR cluster id %s" % cluster_id)

        return dku_emr.make_cluster_keys_and_data(client,
                                                  cluster_id,
                                                  create_user_dir=True)
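
Both attach examples delegate the real work to dku_emr.make_cluster_keys_and_data. As a rough illustration only, a plain-boto3 pre-check on the cluster id could look like the sketch below; check_cluster_exists is a hypothetical helper, not part of the plugin, and uses only the standard EMR DescribeCluster call.

import boto3
import logging


def check_cluster_exists(cluster_id, region_name):
    # Hypothetical helper: confirm the EMR cluster is reachable before attaching.
    client = boto3.client("emr", region_name=region_name)
    # describe_cluster raises botocore.exceptions.ClientError for an unknown id
    cluster = client.describe_cluster(ClusterId=cluster_id)["Cluster"]
    state = cluster["Status"]["State"]
    logging.info("Cluster %s is in state %s", cluster_id, state)
    return state in ("WAITING", "RUNNING")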
Example #3
    def start(self):
        region = self.config.get("awsRegionId") or dku_emr.get_current_region()
        client = boto3.client('emr', region_name=region)
        release = 'emr-%s' % self.config["emrVersion"]

        name = "DSS cluster id=%s name=%s" % (self.cluster_id,
                                              self.cluster_name)

        logging.info("starting cluster, release=%s name=%s" % (release, name))

        extraArgs = {}
        if "logsPath" in self.config:
            extraArgs['LogUri'] = self.config["logsPath"]
        if "securityConfiguration" in self.config:
            extraArgs["SecurityConfiguration"] = self.config[
                "securityConfiguration"]
        if self.config.get("ebsRootVolumeSize", 0):
            extraArgs["EbsRootVolumeSize"] = self.config["ebsRootVolumeSize"]

        security_groups = []
        if "additionalSecurityGroups" in self.config:
            security_groups = [
                x.strip()
                for x in self.config["additionalSecurityGroups"].split(",")
            ]

        subnet = self.config.get("subnetId") or dku_emr.get_current_subnet()

        # Master instance group plus network placement for the job flow
        instances = {
            'InstanceGroups': [{
                'InstanceRole': 'MASTER',
                'InstanceType': self.config["masterInstanceType"],
                'InstanceCount': 1
            }],
            'KeepJobFlowAliveWhenNoSteps': True,
            'Ec2SubnetId': subnet,
            'AdditionalMasterSecurityGroups': security_groups,
            'AdditionalSlaveSecurityGroups': security_groups
        }

        if self.config.get("coreInstanceCount"):
            if not self.config.get("coreInstanceType"):
                raise Exception("Missing core instance type")
            instances['InstanceGroups'].append({
                'InstanceRole': 'CORE',
                'InstanceType': self.config["coreInstanceType"],
                'InstanceCount': self.config["coreInstanceCount"]
            })

        if self.config.get("taskInstanceCount"):
            if not self.config.get("taskInstanceType"):
                raise Exception("Missing task instance type")
            instances['InstanceGroups'].append({
                'InstanceRole': 'TASK',
                'InstanceType': self.config["taskInstanceType"],
                'InstanceCount': self.config["taskInstanceCount"]
            })

        if "ec2KeyName" in self.config:
            instances['Ec2KeyName'] = self.config["ec2KeyName"]

        tags = [{'Key': 'Name', 'Value': name}]
        for tag in self.config.get("tags", []):
            tags.append({"Key": tag["from"], "Value": tag["to"]})

        # Hive metastore backend: custom JDBC, external MySQL, or AWS Glue Data Catalog
        if self.config["metastoreDBMode"] == "CUSTOM_JDBC":
            props = {
                "javax.jdo.option.ConnectionURL": self.config["metastoreJDBCURL"],
                "javax.jdo.option.ConnectionDriverName": self.config["metastoreJDBCDriver"],
                "javax.jdo.option.ConnectionUserName": self.config["metastoreJDBCUser"],
                "javax.jdo.option.ConnectionPassword": self.config["metastoreJDBCPassword"],
            }
            extraArgs["Configurations"] = [{
                "Classification": "hive-site",
                "Properties": props
            }]
        elif self.config["metastoreDBMode"] == "MYSQL":
            props = {
                "javax.jdo.option.ConnectionURL":
                    "jdbc:mysql://%s:3306/hive?createDatabaseIfNotExist=true"
                    % self.config["metastoreMySQLHost"],
                "javax.jdo.option.ConnectionDriverName": "org.mariadb.jdbc.Driver",
                "javax.jdo.option.ConnectionUserName": self.config["metastoreMySQLUser"],
                "javax.jdo.option.ConnectionPassword": self.config["metastoreMySQLPassword"]
            }
            extraArgs["Configurations"] = [{
                "Classification": "hive-site",
                "Properties": props
            }]
        elif self.config["metastoreDBMode"] == "AWS_GLUE_DATA_CATALOG":
            props = {
                "hive.metastore.client.factory.class":
                    "com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory"
            }
            extraArgs["Configurations"] = [{
                "Classification": "hive-site",
                "Properties": props
            }]

        logging.info(
            "Starting cluster: %s",
            dict(Name=name,
                 ReleaseLabel=release,
                 Instances=instances,
                 Applications=[{"Name": "Hadoop"}, {"Name": "Hive"},
                               {"Name": "Tez"}, {"Name": "Pig"},
                               {"Name": "Spark"}, {"Name": "Zookeeper"}],
                 VisibleToAllUsers=True,
                 JobFlowRole=self.config["nodesRole"],
                 ServiceRole=self.config["serviceRole"],
                 Tags=tags,
                 **extraArgs))

        response = client.run_job_flow(
            Name=name,
            ReleaseLabel=release,
            Instances=instances,
            Applications=[{"Name": "Hadoop"}, {"Name": "Hive"},
                          {"Name": "Tez"}, {"Name": "Pig"},
                          {"Name": "Spark"}, {"Name": "Zookeeper"}],
            VisibleToAllUsers=True,
            JobFlowRole=self.config["nodesRole"],
            ServiceRole=self.config["serviceRole"],
            Tags=tags,
            **extraArgs)

        clusterId = response['JobFlowId']
        logging.info("clusterId=%s" % clusterId)

        logging.info("waiting for cluster to start")
        client.get_waiter('cluster_running').wait(ClusterId=clusterId)

        return dku_emr.make_cluster_keys_and_data(
            client,
            clusterId,
            create_user_dir=True,
            create_databases=self.config.get("databasesToCreate"))
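
Stripped of the plugin-specific configuration handling, the boto3 flow in Example #3 reduces to run_job_flow followed by the cluster_running waiter. A minimal self-contained sketch of that flow is shown below; the region, release label, instance type and the AWS default roles are placeholder assumptions, not values taken from the examples.

import boto3
import logging

client = boto3.client("emr", region_name="us-east-1")  # assumed region

response = client.run_job_flow(
    Name="minimal-example",                 # placeholder name
    ReleaseLabel="emr-6.10.0",              # placeholder EMR release
    Applications=[{"Name": "Hadoop"}, {"Name": "Hive"}, {"Name": "Spark"}],
    Instances={
        "InstanceGroups": [{
            "InstanceRole": "MASTER",
            "InstanceType": "m5.xlarge",    # placeholder instance type
            "InstanceCount": 1
        }],
        "KeepJobFlowAliveWhenNoSteps": True
    },
    VisibleToAllUsers=True,
    JobFlowRole="EMR_EC2_DefaultRole",      # AWS default instance profile
    ServiceRole="EMR_DefaultRole")          # AWS default service role

cluster_id = response["JobFlowId"]
logging.info("clusterId=%s", cluster_id)

# Block until the cluster reaches a running/waiting state, as the examples do
client.get_waiter("cluster_running").wait(ClusterId=cluster_id)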
Example #4
    def build_cluster(self):
        region = self.my_cluster.config.get(
            "awsRegionId") or dku_emr.get_current_region()
        client = boto3.client('emr', region_name=region)
        release = 'emr-%s' % self.my_cluster.config["emrVersion"]
        name_prefix = "dss-"
        name = (self.my_cluster.cluster_id
                if self.my_cluster.cluster_id.startswith(name_prefix)
                else name_prefix + self.my_cluster.cluster_id)
        logging.info("starting cluster, release=%s name=%s" % (release, name))

        extra_args = {}

        # Path for logs in S3
        logs_path = self.my_cluster.config.get("logsPath")
        if logs_path is not None:
            if "s3://" in logs_path:
                extra_args[Arg.LogUri] = logs_path
            else:
                raise Exception(
                    "'{}' is not a valid S3 path".format(logs_path))

        # Use specified security config
        if "securityConfiguration" in self.my_cluster.config:
            extra_args[Arg.SecurityConfig] = self.my_cluster.config[
                "securityConfiguration"]

        # EBS root volume size (minimum of 10)
        extra_args[Arg.EbsRootVolSize] = int(
            self.my_cluster.config.get('ebsRootVolumeSize') or 25)

        # EMR app (e.g., Spark, Hive) configs
        extra_args[Arg.Configurations] = self._get_software_configs()

        # EMR instance groups and configs
        instances = self._get_instance_configs()

        # EMR applications to install
        applications = self._get_apps_to_install()

        # Tags
        tags = self._get_tags(name)

        # All args to run_job_flow(..)
        job_flow_params = {
            Arg.Name: name,
            Arg.Release: release,
            Arg.Instances: instances,
            Arg.Applications: applications,
            Arg.VisibleToAllUsers: True,
            Arg.JobFlowRole: self.my_cluster.config["nodesRole"],
            Arg.ServiceRole: self.my_cluster.config["serviceRole"],
            Arg.Tags: tags
        }
        job_flow_params.update(extra_args)

        logging.info("Starting cluster: %s", job_flow_params)

        response = client.run_job_flow(**job_flow_params)
        cluster_id = response[Response.JobFlowId]
        logging.info("cluster_id=%s" % cluster_id)

        logging.info("waiting for cluster to start")
        client.get_waiter('cluster_running').wait(ClusterId=cluster_id)

        cluster_keys_and_data = dku_emr.make_cluster_keys_and_data(
            client,
            cluster_id,
            create_user_dir=True,
            create_databases=self.my_cluster.config.get("databasesToCreate"))

        # TODO: Implement install of python libs and post-launch SSH commands to run on each node
        # try:
        #     python_libs = self.my_cluster.config.get('pythonLibs') or self.my_cluster.config.get('python_libs')
        #     if str(python_libs) not in {"None", "[]"}:
        #         self._install_python_libs()
        #
        #     extra_setup_cmds = self.my_cluster.config.get('extraSetup') or self.my_cluster.config.get('extra_setup')
        #     if str(extra_setup_cmds) not in {"None", "[]"}:
        #         self._run_shell_commands()
        #
        #     return cluster_keys_and_data
        #
        # except Exception as e:
        #     logging.info(e)
        #     logging.info("Setup failed. Terminating cluster.")
        #     self.my_cluster.stop(data={'emrClusterId': cluster_id})
        return cluster_keys_and_data
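
Example #4 factors the inline logic of Example #3 into helper methods (_get_software_configs, _get_instance_configs, _get_apps_to_install, _get_tags) whose bodies are not shown. Based on the inline versions in Example #3, two of them could plausibly look like the sketch below; this is an assumption for illustration, not the plugin's actual code.

    def _get_apps_to_install(self):
        # Assumed to mirror the hard-coded application list of Example #3
        return [{"Name": app} for app in
                ("Hadoop", "Hive", "Tez", "Pig", "Spark", "Zookeeper")]

    def _get_tags(self, name):
        # Assumed to mirror the tag construction of Example #3
        tags = [{"Key": "Name", "Value": name}]
        for tag in self.my_cluster.config.get("tags", []):
            tags.append({"Key": tag["from"], "Value": tag["to"]})
        return tags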