Example #1
0
    def start_application(self, data):
        """Execute a Spark job on a Mesos cluster.

        Downloads the job binary onto the cluster over SSH, submits it
        with spark-submit, discovers the executor VMs on OpenNebula,
        and runs the monitor and controller services until the
        submission finishes. Updates the application state throughout.

        :param data: submission dict; reads 'binary_url',
            'execution_class', 'execution_parameters', 'expected_time',
            'number_of_jobs', 'starting_cap', 'app_name' and, for the
            'bulma' application only, 'days'.
        :returns: 'OK' on success, the string 'Error' if the binary
            download fails; returns None when the outer exception
            handler fires.
        :raises: nothing to the caller; unexpected exceptions are
            logged and mapped to the "Error" state.
        """
        try:
            self.update_application_state("Running")
            plugin_log.log("%s | Starting application execution" %
                           (time.strftime("%H:%M:%S")))

            # Required submission parameters.
            binary_url = str(data['binary_url'])
            execution_class = str(data['execution_class'])
            execution_parameters = str(data['execution_parameters'])
            expected_time = int(data['expected_time'])
            number_of_jobs = int(data['number_of_jobs'])
            starting_cap = int(data['starting_cap'])

            # Optimizer integration
            app_name = data['app_name']
            days = 0

            # The 'bulma' application additionally requires a 'days'
            # value for the optimizer query below.
            if app_name.lower() == 'bulma':
                if 'days' in data.keys():
                    days = data['days']
                else:
                    self._log("""%s | 'days' parameter missing""" %
                              (time.strftime("%H:%M:%S")))
                    raise ex.ConfigurationError()

            # Ask the optimizer for a core recommendation; a negative
            # core count means "no recommendation" and no extra flag is
            # passed to spark-submit.
            cores, vms = optimizer.get_info(api.optimizer_url, expected_time,
                                            app_name, days)
            optimizer_command = ''
            if cores >= 0:
                optimizer_command = ' --total-executor-cores %d ' % cores

            plugin_log.log("%s | Submission id: %s" %
                           (time.strftime("%H:%M:%S"), self.app_id))

            plugin_log.log("%s | Connecting with Mesos cluster..." %
                           (time.strftime("%H:%M:%S")))

            conn = ssh.get_connection(api.mesos_url, api.cluster_username,
                                      api.cluster_password,
                                      api.cluster_key_path)

            plugin_log.log("%s | Connected with Mesos cluster" %
                           (time.strftime("%H:%M:%S")))

            # Execute all the spark needed commands
            # to run an spark job from command line
            if execution_class != "" and execution_class is not None:
                # If the class field is empty, it means that the
                # job binary is python
                binary_path = '~/exec_bin.jar'
                spark_run = ('sudo %s --name %s ' + '--master mesos://%s:%s ' +
                             optimizer_command + '--class %s %s %s')
            else:
                binary_path = '~/exec_bin.py'
                spark_run = ('sudo %s --name %s ' + '--master mesos://%s:%s ' +
                             optimizer_command + '%s %s %s')

            plugin_log.log("%s | Download the binary to cluster" %
                           (time.strftime("%H:%M:%S")))

            try:
                stdin, stdout, stderr = conn.exec_command(
                    'wget %s -O %s' % (binary_url, binary_path))

                plugin_log.log("%s | Waiting for download the binary..." %
                               (time.strftime("%H:%M:%S")))

                # TODO: Fix possible wget error
                # stdout.read() blocks until the remote wget finishes.
                stdout.read()
                plugin_log.log("%s | Binary downloaded" %
                               (time.strftime("%H:%M:%S")))

            except Exception as e:
                plugin_log.log("%s | Error downloading binary" %
                               (time.strftime("%H:%M:%S")))
                self.update_application_state("Error")
                return "Error"

            # Submit the job; the command string built above is filled
            # in with the spark binary path, app id and Mesos endpoint.
            i, o, e = conn.exec_command(
                spark_run %
                (api.spark_path, self.app_id, api.mesos_url, api.mesos_port,
                 execution_class, binary_path, execution_parameters))

            # Discovery ips of the executors from Mesos
            # and discovery the ids on KVM using the ips
            list_vms_one = (
                'onevm list --user %s --password %s --endpoint %s' %
                (api.one_username, api.one_password, api.one_url))

            stdin, stdout, stderr = conn.exec_command(list_vms_one)

            list_response = stdout.read()

            vms_ips, master = mesos.get_executors_ip(conn, self.frameworks_url,
                                                     self.app_id)
            plugin_log.log("%s | Master: %s" %
                           (time.strftime("%H:%M:%S"), master))

            plugin_log.log("%s | Executors: %s" %
                           (time.strftime("%H:%M:%S"), vms_ips))

            vms_ids = mesos.extract_vms_ids(list_response)
            plugin_log.log("%s | Executors IDs: %s" %
                           (time.strftime("%H:%M:%S"), vms_ids))

            # Match each executor IP against the OpenNebula VM list to
            # collect the VM ids that host the executors.
            executors_vms_ids = []
            for ip in vms_ips:
                for id in vms_ids:
                    vm_info_one = (
                        'onevm show %s '
                        '--user %s '
                        '--password %s '
                        '--endpoint %s' %
                        (id, api.one_username, api.one_password, api.one_url))

                    stdin, stdout, stderr = conn.exec_command(vm_info_one)
                    if ip in stdout.read():
                        executors_vms_ids.append(id)
                        break

            plugin_log.log("%s | Executors IDs: %s" %
                           (time.strftime("%H:%M:%S"), executors_vms_ids))

            # Set up the initial configuration of cpu cap
            controller.setup_environment(api.controller_url, executors_vms_ids,
                                         starting_cap, data)

            info_plugin = {
                "spark_submisson_url": master,
                "expected_time": expected_time,
                "number_of_jobs": number_of_jobs
            }

            plugin_log.log("%s | Starting monitor" %
                           (time.strftime("%H:%M:%S")))
            monitor.start_monitor(api.monitor_url, self.app_id, 'spark-mesos',
                                  info_plugin, 2)

            plugin_log.log("%s | Starting controller" %
                           (time.strftime("%H:%M:%S")))
            controller.start_controller(api.controller_url, self.app_id,
                                        executors_vms_ids, data)

            # This command locks the plugin execution
            # until the execution be done
            print o.read()

            plugin_log.log("%s | Stopping monitor" %
                           (time.strftime("%H:%M:%S")))
            monitor.stop_monitor(api.monitor_url, self.app_id)

            plugin_log.log("%s | Stopping controller" %
                           (time.strftime("%H:%M:%S")))
            controller.stop_controller(api.controller_url, self.app_id)

            plugin_log.log("%s | Remove binaries" %
                           (time.strftime("%H:%M:%S")))
            conn.exec_command('rm -rf ~/exec_bin.*')

            plugin_log.log("%s | Finished application execution" %
                           (time.strftime("%H:%M:%S")))

            self.update_application_state("OK")
            return 'OK'

        except Exception as e:
            # Catch-all boundary: log and flag the submission as failed.
            plugin_log.log(e.message)
            print e.message
            self.update_application_state("Error")
Example #2
0
    def start_application(self, data, spark_applications_ids, app_id):
        """Create a Sahara cluster and run a Spark job on it.

        Sizes the cluster with the optimizer (optionally considering
        opportunistic instances), creates it through Sahara, dispatches
        the job via Swift or HDFS depending on the job arguments, and
        deletes the cluster afterwards. Updates the application state
        as the run progresses.

        :param data: submission dict with the user-request parameters
            read below (network, node groups, job description, etc.).
        :param spark_applications_ids: collection shared with the
            monitoring side to track running Spark applications.
        :param app_id: identifier of this submission.
        :returns: the job status on success; None when an error path
            sets the application state to "Error".
        """
        try:
            self.update_application_state("Running")

            # Broker Parameters
            # cluster_id starts as None so the catch-all handler below
            # can tell whether a cluster was actually created.
            cluster_id = None
            user = api.user
            password = api.password
            project_id = api.project_id
            auth_ip = api.auth_ip
            domain = api.domain
            public_key = api.public_key
            key_path = api.key_path
            log_path = api.log_path
            container = api.container
            hosts = api.hosts
            remote_hdfs = api.remote_hdfs
            swift_logdir = api.swift_logdir
            number_of_attempts = api.number_of_attempts
            dummy_opportunistic = api.dummy_opportunistic

            # User Request Parameters
            net_id = data['net_id']
            master_ng = data['master_ng']
            slave_ng = data['slave_ng']
            op_slave_ng = data['opportunistic_slave_ng']
            opportunism = str(data['opportunistic'])
            plugin = data['openstack_plugin']
            percentage = int(data['percentage'])
            job_type = data['job_type']
            version = data['version']
            args = data['args']
            main_class = data['main_class']
            dependencies = data['dependencies']
            job_template_name = data['job_template_name']
            job_binary_name = data['job_binary_name']
            job_binary_url = data['job_binary_url']
            image_id = data['image_id']
            monitor_plugin = data['monitor_plugin']
            expected_time = data['expected_time']
            collect_period = data['collect_period']
            number_of_jobs = data['number_of_jobs']
            starting_cap = data['starting_cap']

            # Optimizer Parameters
            app_name = data['app_name']
            days = 0

            # The 'bulma' application additionally requires a 'days'
            # value for the optimizer query below.
            if app_name.lower() == 'bulma':
                if 'days' in data:
                    days = data['days']
                else:
                    self._log("""%s | 'days' parameter missing""" %
                              (time.strftime("%H:%M:%S")))
                    raise ex.ConfigurationError()

            # Openstack Components
            connector = os_connector.OpenStackConnector(plugin_log)

            sahara = connector.get_sahara_client(user, password, project_id,
                                                 auth_ip, domain)

            swift = connector.get_swift_client(user, password, project_id,
                                               auth_ip, domain)

            nova = connector.get_nova_client(user, password, project_id,
                                             auth_ip, domain)

            # Optimizer gets the vcpu size of flavor
            cores_per_slave = connector.get_vcpus_by_nodegroup(
                nova, sahara, slave_ng)

            cores, vms = optimizer.get_info(api.optimizer_url, expected_time,
                                            app_name, days)

            # Without an optimizer recommendation (cores <= 0) the user
            # must supply the cluster size explicitly.
            if cores <= 0:
                if 'cluster_size' in data:
                    req_cluster_size = data['cluster_size']
                else:
                    self._log("""%s | 'cluster_size' parameter missing""" %
                              (time.strftime("%H:%M:%S")))
                    raise ex.ConfigurationError()
            else:
                req_cluster_size = int(
                    math.ceil(cores / float(cores_per_slave)))

            # Check Oportunism
            if opportunism == "True":
                self._log("""%s | Checking if opportunistic instances
                          are available""" % (time.strftime("%H:%M:%S")))

                pred_cluster_size = optimizer.get_cluster_size(
                    api.optimizer_url, hosts, percentage, dummy_opportunistic)
            else:
                pred_cluster_size = req_cluster_size

            # Never size the cluster below what the request requires.
            if pred_cluster_size > req_cluster_size:
                cluster_size = pred_cluster_size
            else:
                cluster_size = req_cluster_size

            self._log("%s | Cluster size: %s" %
                      (time.strftime("%H:%M:%S"), str(cluster_size)))

            self._log("%s | Creating cluster..." % (time.strftime("%H:%M:%S")))

            cluster_id = self._create_cluster(sahara, connector,
                                              req_cluster_size,
                                              pred_cluster_size, public_key,
                                              net_id, image_id, plugin,
                                              version, master_ng, slave_ng,
                                              op_slave_ng)

            self._log("%s | Cluster id: %s" %
                      (time.strftime("%H:%M:%S"), cluster_id))

            swift_path = self._is_swift_path(args)

            if cluster_id:
                master = connector.get_master_instance(
                    sahara, cluster_id)['internal_ip']

                self._log("%s | Master is %s" %
                          (time.strftime("%H:%M:%S"), master))

                workers = connector.get_worker_instances(sahara, cluster_id)
                workers_id = []

                for worker in workers:
                    workers_id.append(worker['instance_id'])

                self._log("%s | Configuring controller" %
                          (time.strftime("%H:%M:%S")))

                controller.setup_environment(api.controller_url, workers_id,
                                             starting_cap, data)

                # Dispatch via Swift or HDFS depending on where the
                # job arguments point.
                if swift_path:
                    job_status = self._swift_spark_execution(
                        master, key_path, sahara, connector, job_binary_name,
                        job_binary_url, user, password, job_template_name,
                        job_type, plugin, cluster_size, args, main_class,
                        cluster_id, spark_applications_ids, workers_id, app_id,
                        expected_time, monitor_plugin, collect_period,
                        number_of_jobs, log_path, swift, container, data,
                        number_of_attempts)
                else:
                    job_status = self._hdfs_spark_execution(
                        master, remote_hdfs, key_path, args, job_binary_url,
                        main_class, dependencies, spark_applications_ids,
                        expected_time, monitor_plugin, collect_period,
                        number_of_jobs, workers_id, data, connector, swift,
                        swift_logdir, container, number_of_attempts)

            else:
                # FIXME: exception type
                self.update_application_state("Error")
                raise ex.ClusterNotCreatedException()

            # Delete cluster
            self._log("%s | Delete cluster: %s" %
                      (time.strftime("%H:%M:%S"), cluster_id))

            connector.delete_cluster(sahara, cluster_id)

            self._log("%s | Finished application execution" %
                      (time.strftime("%H:%M:%S")))

            return job_status

        except KeyError as ke:
            self._log("%s | Parameter missing in submission: %s, "
                      "please check the config file" %
                      (time.strftime("%H:%M:%S"), str(ke)))

            self._log("%s | Finished application execution with error" %
                      (time.strftime("%H:%M:%S")))

            self.update_application_state("Error")

        except ex.ConfigurationError:
            self._log("%s | Finished application execution with error" %
                      (time.strftime("%H:%M:%S")))

            self.update_application_state("Error")

        except SaharaAPIException:
            self._log("%s | There is not enough resource to create a cluster" %
                      (time.strftime("%H:%M:%S")))

            self._log("%s | Finished application execution with error" %
                      (time.strftime("%H:%M:%S")))

            self.update_application_state("Error")

        except Exception:
            # Best-effort cleanup: only delete the cluster when one was
            # actually created before the failure.
            if cluster_id is not None:
                self._log("%s | Delete cluster: %s" %
                          (time.strftime("%H:%M:%S"), cluster_id))
                connector.delete_cluster(sahara, cluster_id)

            self._log("%s | Unknown error, please report to administrators "
                      "of WP3 infrastructure" % (time.strftime("%H:%M:%S")))

            self._log("%s | Finished application execution with error" %
                      (time.strftime("%H:%M:%S")))

            self.update_application_state("Error")
Example #3
0
    def start_application(self, data):
        try:
            self.update_application_state("Running")

            user = api.user
            password = api.password
            project_id = api.project_id
            auth_ip = api.auth_ip
            domain = api.domain
            public_key = api.public_key

            connector = os_connector.OpenStackConnector(LOG)
            nova = connector.get_nova_client(user, password, project_id,
                                             auth_ip, domain)

            monitor_plugin = data['monitor_plugin']
            expected_time = data['expected_time']
            log_path = data['log_path']
            image_id = data['image_id']
            flavor_id = data['flavor_id']
            command = data['command']
            cluster_size = data['cluster_size']
            starting_cap = data['scaling_parameters']["starting_cap"]

            app_start_time = 0
            app_end_time = 0

            LOG.log("Creating instance(s)")
            print "Creating instance(s)..."

            # Create a number of instances to run the application based on
            # cluster_size, image_id, flavor_id and public_key
            instances = self._create_instances(nova, connector, image_id,
                                               flavor_id, public_key,
                                               cluster_size)

            LOG.log("Waiting until instance become active...")
            print "Waiting until instance become active..."

            # Retrive network information from all instances when they
            # reach ACTIVE state
            instances_nets = []
            for instance_id in instances:
                instance_status = connector.get_instance_status(
                    nova, instance_id)
                while instance_status != 'ACTIVE':
                    instance_status = connector.get_instance_status(
                        nova, instance_id)

                instance_ips = connector.get_instance_networks(
                    nova, instance_id)

                instances_nets.append(instance_ips)
                time.sleep(5)

            time.sleep(30)

            LOG.log("Checking if ssh is available")
            print "Checking if ssh is available"

            # Verify if ssh is available for any ip address for each instance
            instances_ips = []
            for instance_net in instances_nets:
                for net_ip_list in instance_net.values():
                    for ip in net_ip_list:

                        attempts = 2
                        while attempts != -1:
                            try:
                                conn = self._get_ssh_connection(
                                    ip, api.key_path)
                                instances_ips.append(ip)
                                attempts = -1

                            except Exception as e:
                                LOG.log("Fail to connect")
                                LOG.log(e.message)

                                print "Fail to connect"
                                print e.message

                                attempts -= 1
                                time.sleep(30)

            LOG.log("Setting up environment")
            print "Setting up environment"

            # Set CPU cap in all instances
            controller.setup_environment(api.controller_url, instances,
                                         starting_cap, data)

            # Execute application and start monitor and controller service.
            applications = []
            for ip in instances_ips:
                LOG.log("Executing commands into the instance")
                print "Executing commands into the instance"

                # TODO Check if exec_command will work without blocking exec

                conn = self._get_ssh_connection(ip, api.key_path)

                conn.exec_command(command)
                app_start_time = time.time()

                app_id = "app-os-generic" + str(uuid.uuid4())[:8]
                applications.append(app_id)

                monitor_plugin = monitor_plugin
                info_plugin = {
                    "host_ip": ip,
                    "log_path": log_path,
                    "expected_time": expected_time
                }

                collect_period = 1
                try:
                    LOG.log("Starting monitoring")
                    print "Starting monitoring"

                    monitor.start_monitor(api.monitor_url, app_id,
                                          monitor_plugin, info_plugin,
                                          collect_period)

                    LOG.log("Starting scaling")
                    print "Starting scaling"

                    controller.start_controller(api.controller_url, app_id,
                                                instances, data)

                except Exception as e:
                    LOG.log(e.message)
                    print e.message

            # Stop monitor and controller when each application stops
            application_running = True
            while application_running:
                status_instances = []
                for instance_id in instances:
                    status = connector.get_instance_status(nova, instance_id)
                    status_instances.append(status)

                if self._instances_down(status_instances):
                    application_running = False
                    app_end_time = time.time()

                    LOG.log("Application finished")
                    print "Application finished"

                    for app_id in applications:
                        LOG.log("Stopping monitoring")
                        print "Stopping monitoring"
                        monitor.stop_monitor(api.monitor_url, app_id)

                        LOG.log("Stopping scaling")
                        print "Stopping scaling"
                        controller.stop_controller(api.controller_url, app_id)

                else:
                    instance_status = []

                time.sleep(2)

            LOG.log("Removing instances...")
            print "Removing instances..."

            # Remove instances after the end of all applications
            self._remove_instances(nova, connector, instances)

            LOG.log("Finished application execution")
            print "Finished application execution"

            application_time = app_end_time - app_start_time
            application_time_log.log(
                "%s|%.0f|%.0f" % (app_id, app_start_time, application_time))

            self.application_time = application_time
            self.start_time = app_start_time
            self.update_application_state("OK")

            return str(application_time)

        except Exception as e:
            LOG.log(e.message)
            print e.message
            self.update_application_state("Error")