Beispiel #1
0
    def start_application(self, data, spark_applications_ids, app_id):
        """Create a Sahara cluster, run the Spark job on it and delete it.

        The requested cluster size comes from the optimizer's core count
        (divided by the vcpus of the slave node group) or, when the
        optimizer reports no cores, from ``data['cluster_size']``. When
        opportunism is requested, a predicted size is fetched from the
        optimizer as well. The job then runs through the Swift or the HDFS
        execution helper, depending on ``self._is_swift_path(args)``.

        Args:
            data: Submission parameters. Besides the keys read
                unconditionally below, ``days`` is mandatory when
                ``app_name`` is ``bulma`` and ``cluster_size`` is mandatory
                when the optimizer returns no cores. The mapping is also
                forwarded as-is to the controller and execution helpers.
            spark_applications_ids: Forwarded untouched to the execution
                helpers.
            app_id: Forwarded untouched to the Swift execution helper.

        Returns:
            Whatever the execution helper returned on success. ``None`` on
            failure; the application state is then ``"Error"`` and any
            cluster that had been created has been asked to be deleted.

        Raises:
            Any exception raised by the cluster deletion performed on the
            failure path propagates to the caller (the state is still set
            to ``"Error"`` first).
        """
        # Bound before the ``try`` so that the failure path can inspect
        # them even when the very first statement raises.
        cluster_id = None
        connector = None
        sahara = None

        def log(message):
            # Every message of this method carries the same HH:MM:SS prefix.
            self._log("%s | %s" % (time.strftime("%H:%M:%S"), message))

        try:
            self.update_application_state("Running")

            # Broker Parameters
            user = api.user
            password = api.password
            project_id = api.project_id
            auth_ip = api.auth_ip
            domain = api.domain
            public_key = api.public_key
            key_path = api.key_path
            log_path = api.log_path
            container = api.container
            hosts = api.hosts
            remote_hdfs = api.remote_hdfs
            swift_logdir = api.swift_logdir
            number_of_attempts = api.number_of_attempts
            dummy_opportunistic = api.dummy_opportunistic

            # User Request Parameters
            net_id = data['net_id']
            master_ng = data['master_ng']
            slave_ng = data['slave_ng']
            op_slave_ng = data['opportunistic_slave_ng']
            opportunism = str(data['opportunistic'])
            plugin = data['openstack_plugin']
            percentage = int(data['percentage'])
            job_type = data['job_type']
            version = data['version']
            args = data['args']
            main_class = data['main_class']
            dependencies = data['dependencies']
            job_template_name = data['job_template_name']
            job_binary_name = data['job_binary_name']
            job_binary_url = data['job_binary_url']
            image_id = data['image_id']
            monitor_plugin = data['monitor_plugin']
            expected_time = data['expected_time']
            collect_period = data['collect_period']
            number_of_jobs = data['number_of_jobs']
            starting_cap = data['starting_cap']

            # Optimizer Parameters
            app_name = data['app_name']
            days = 0

            if app_name.lower() == 'bulma':
                if 'days' in data:
                    days = data['days']
                else:
                    log("'days' parameter missing")
                    raise ex.ConfigurationError()

            # Openstack Components
            connector = os_connector.OpenStackConnector(plugin_log)

            sahara = connector.get_sahara_client(user, password, project_id,
                                                 auth_ip, domain)

            swift = connector.get_swift_client(user, password, project_id,
                                               auth_ip, domain)

            nova = connector.get_nova_client(user, password, project_id,
                                             auth_ip, domain)

            # Optimizer gets the vcpu size of flavor
            cores_per_slave = connector.get_vcpus_by_nodegroup(
                nova, sahara, slave_ng)

            # The second value returned by the optimizer is not used here.
            cores, _ = optimizer.get_info(api.optimizer_url, expected_time,
                                          app_name, days)

            if cores <= 0:
                # No answer from the optimizer: the size must be supplied
                # with the submission.
                if 'cluster_size' in data:
                    req_cluster_size = data['cluster_size']
                else:
                    log("'cluster_size' parameter missing")
                    raise ex.ConfigurationError()
            else:
                req_cluster_size = int(
                    math.ceil(cores / float(cores_per_slave)))

            # Check Oportunism
            if opportunism == "True":
                log("Checking if opportunistic instances are available")

                pred_cluster_size = optimizer.get_cluster_size(
                    api.optimizer_url, hosts, percentage, dummy_opportunistic)
            else:
                pred_cluster_size = req_cluster_size

            # The prediction only counts when it exceeds the request.
            cluster_size = max(req_cluster_size, pred_cluster_size)

            log("Cluster size: %s" % str(cluster_size))
            log("Creating cluster...")

            cluster_id = self._create_cluster(sahara, connector,
                                              req_cluster_size,
                                              pred_cluster_size, public_key,
                                              net_id, image_id, plugin,
                                              version, master_ng, slave_ng,
                                              op_slave_ng)

            log("Cluster id: %s" % (cluster_id,))

            swift_path = self._is_swift_path(args)

            if not cluster_id:
                # FIXME: exception type
                self.update_application_state("Error")
                raise ex.ClusterNotCreatedException()

            master = connector.get_master_instance(
                sahara, cluster_id)['internal_ip']

            log("Master is %s" % (master,))

            workers = connector.get_worker_instances(sahara, cluster_id)
            workers_id = [worker['instance_id'] for worker in workers]

            log("Configuring controller")

            controller.setup_environment(api.controller_url, workers_id,
                                         starting_cap, data)

            if swift_path:
                job_status = self._swift_spark_execution(
                    master, key_path, sahara, connector, job_binary_name,
                    job_binary_url, user, password, job_template_name,
                    job_type, plugin, cluster_size, args, main_class,
                    cluster_id, spark_applications_ids, workers_id, app_id,
                    expected_time, monitor_plugin, collect_period,
                    number_of_jobs, log_path, swift, container, data,
                    number_of_attempts)
            else:
                job_status = self._hdfs_spark_execution(
                    master, remote_hdfs, key_path, args, job_binary_url,
                    main_class, dependencies, spark_applications_ids,
                    expected_time, monitor_plugin, collect_period,
                    number_of_jobs, workers_id, data, connector, swift,
                    swift_logdir, container, number_of_attempts)

            # Delete cluster
            log("Delete cluster: %s" % (cluster_id,))

            connector.delete_cluster(sahara, cluster_id)

            log("Finished application execution")

            return job_status

        except KeyError as ke:
            log("Parameter missing in submission: %s, "
                "please check the config file" % str(ke))

        except ex.ConfigurationError:
            # The specific cause was already logged where it was raised.
            pass

        except SaharaAPIException:
            log("There is not enough resource to create a cluster")

        except Exception:
            log("Unknown error, please report to administrators "
                "of WP3 infrastructure")

        # Only reached on failure (the success path returns inside the
        # ``try``). A cluster that was created must not outlive the failed
        # run, whatever the kind of error; the ``finally`` guarantees the
        # state is updated even if the deletion itself raises.
        try:
            if cluster_id:
                log("Delete cluster: %s" % (cluster_id,))
                connector.delete_cluster(sahara, cluster_id)
        finally:
            log("Finished application execution with error")
            self.update_application_state("Error")
Beispiel #2
0
    def start_application(self, data):
        """Run ``data['command']`` on new instances and time the execution.

        Instances are created, polled until ``ACTIVE`` and probed for ssh.
        The command is then started on every reachable address together
        with a monitor and a controller per launch. The application is
        considered finished once ``self._instances_down`` accepts the
        statuses of all instances; monitors and controllers are then
        stopped and the instances removed.

        Args:
            data: Submission parameters. Reads ``monitor_plugin``,
                ``expected_time``, ``log_path``, ``image_id``,
                ``flavor_id``, ``command``, ``cluster_size`` and
                ``scaling_parameters['starting_cap']``; the mapping is
                also forwarded as-is to the controller.

        Returns:
            The elapsed time (end minus start of the last launch) as a
            string on success. ``None`` on failure; the application state
            is then ``"Error"`` and created instances have been asked to
            be removed.
        """
        poll_interval = 2  # seconds between two instance status polls
        ssh_attempts = 3
        collect_period = 1

        # Bound before the ``try`` so the failure path can clean up.
        connector = None
        nova = None
        instances = []
        instances_removed = False

        def report(message):
            # Progress messages go both to the log and to stdout.
            LOG.log(message)
            print(message)

        def ssh_is_available(ip):
            # Try a few times, waiting 30 s after every failed attempt.
            # NOTE(review): the probe connection is never closed; confirm
            # whether the object returned by _get_ssh_connection needs it.
            for _ in range(ssh_attempts):
                try:
                    self._get_ssh_connection(ip, api.key_path)
                    return True
                except Exception as e:
                    report("Fail to connect")
                    report(e.message)
                    time.sleep(30)
            return False

        try:
            self.update_application_state("Running")

            user = api.user
            password = api.password
            project_id = api.project_id
            auth_ip = api.auth_ip
            domain = api.domain
            public_key = api.public_key

            connector = os_connector.OpenStackConnector(LOG)
            nova = connector.get_nova_client(user, password, project_id,
                                             auth_ip, domain)

            monitor_plugin = data['monitor_plugin']
            expected_time = data['expected_time']
            log_path = data['log_path']
            image_id = data['image_id']
            flavor_id = data['flavor_id']
            command = data['command']
            cluster_size = data['cluster_size']
            starting_cap = data['scaling_parameters']["starting_cap"]

            app_start_time = 0
            app_end_time = 0

            LOG.log("Creating instance(s)")
            print("Creating instance(s)...")

            # Create a number of instances to run the application based on
            # cluster_size, image_id, flavor_id and public_key
            instances = self._create_instances(nova, connector, image_id,
                                               flavor_id, public_key,
                                               cluster_size)

            report("Waiting until instance become active...")

            # Retrieve network information from all instances when they
            # reach ACTIVE state
            instances_nets = []
            for instance_id in instances:
                # Pause between polls instead of hammering the API.
                while connector.get_instance_status(
                        nova, instance_id) != 'ACTIVE':
                    time.sleep(poll_interval)

                instances_nets.append(
                    connector.get_instance_networks(nova, instance_id))
                time.sleep(5)

            time.sleep(30)

            report("Checking if ssh is available")

            # Collect every address that accepts an ssh connection.
            # NOTE(review): an instance exposing several reachable
            # addresses gets the command once per address below; confirm
            # whether one address per instance was intended.
            instances_ips = []
            for instance_net in instances_nets:
                for net_ip_list in instance_net.values():
                    for ip in net_ip_list:
                        if ssh_is_available(ip):
                            instances_ips.append(ip)

            if not instances_ips:
                # Nothing could be launched: fail now instead of waiting on
                # instances that never received the command.
                raise RuntimeError("No instance is reachable through ssh")

            report("Setting up environment")

            # Set CPU cap in all instances
            controller.setup_environment(api.controller_url, instances,
                                         starting_cap, data)

            # Execute application and start monitor and controller service.
            applications = []
            for ip in instances_ips:
                report("Executing commands into the instance")

                # TODO Check if exec_command will work without blocking exec

                conn = self._get_ssh_connection(ip, api.key_path)

                conn.exec_command(command)
                app_start_time = time.time()

                app_id = "app-os-generic" + str(uuid.uuid4())[:8]
                applications.append(app_id)

                info_plugin = {
                    "host_ip": ip,
                    "log_path": log_path,
                    "expected_time": expected_time
                }

                # A monitor/controller failure is reported but does not
                # abort the run.
                try:
                    report("Starting monitoring")

                    monitor.start_monitor(api.monitor_url, app_id,
                                          monitor_plugin, info_plugin,
                                          collect_period)

                    report("Starting scaling")

                    controller.start_controller(api.controller_url, app_id,
                                                instances, data)

                except Exception as e:
                    report(e.message)

            # Wait until the instances are reported down, which marks the
            # end of the application.
            while True:
                status_instances = [
                    connector.get_instance_status(nova, instance_id)
                    for instance_id in instances]

                if self._instances_down(status_instances):
                    break

                time.sleep(poll_interval)

            app_end_time = time.time()
            report("Application finished")

            # Stop monitor and controller of every launched application
            for launched_id in applications:
                report("Stopping monitoring")
                monitor.stop_monitor(api.monitor_url, launched_id)

                report("Stopping scaling")
                controller.stop_controller(api.controller_url, launched_id)

            report("Removing instances...")

            # Remove instances after the end of all applications
            self._remove_instances(nova, connector, instances)
            instances_removed = True

            report("Finished application execution")

            # Timing is recorded against the last launched application.
            application_time = app_end_time - app_start_time
            application_time_log.log(
                "%s|%.0f|%.0f" % (applications[-1], app_start_time,
                                  application_time))

            self.application_time = application_time
            self.start_time = app_start_time
            self.update_application_state("OK")

            return str(application_time)

        except Exception as e:
            report(e.message)

            # Do not leave instances behind when the run failed midway.
            if instances and not instances_removed:
                try:
                    report("Removing instances...")
                    self._remove_instances(nova, connector, instances)
                except Exception as cleanup_error:
                    report(cleanup_error.message)

            self.update_application_state("Error")