class MockRedis(): """ Constructor of the mock of a redis object Returns: MockRedis: The simulation of a redis object """ def __init__(self): self.map = {"job": []} self.logger = Log('redis_mock_log', 'redis_mock.log') """ Function the simulates the push of a job in the redis queue Args: metric_queue (string): Representing the metric queue metric (Object): Representing the metric to be pushed in the queue. Returns: None """ def rpush(self, metric_queue, metric): if self.map.get(metric_queue) is None: self.map[metric_queue] = [] self.map[metric_queue].append(metric) """ Function the simulates the pop of a job from the redis queue Args: metric_queue (string): Representing the metric queue Returns: Object: Representing the metric pop from the queue """ def rpop(self, metric_queue): try: return self.map.get(metric_queue).pop(0) except Exception as e: self.logger.log(e) """ Function the simulates the deletion of a redis queue Args: queue_name (string): Representing the name of the queue to be deleted. Returns: None """ def delete(self, queue_name): self.map.pop(queue_name)
def __init__(self, app_id): self.application_state = "None" self.state_lock = threading.RLock() self.application_time = -1 self.start_time = -1 self.app_id = app_id self._verify_existing_log_paths(app_id) self._clean_log_files(app_id) self.running_log = Log("Running_Application_%s" % app_id, "logs/apps/%s/execution" % app_id) self.stdout = Log("stdout_%s" % app_id, "logs/apps/%s/stdout" % app_id) self.stderr = Log("stderr_%s" % app_id, "logs/apps/%s/stderr" % app_id)
from broker.utils.openstack import connector as os_connector from broker.plugins import base from broker.service import api from broker.utils import hdfs from broker.utils.framework import monitor from broker.utils.framework import optimizer from broker.utils import remote from broker.utils.framework import controller from broker.utils import spark from broker.utils.logger import Log, configure_logging from saharaclient.api.base import APIException as SaharaAPIException from broker.utils.ids import ID_Generator from broker.plugins.base import GenericApplicationExecutor plugin_log = Log("Sahara_Plugin", "logs/sahara_plugin.log") application_time_log = Log("Application_Time", "logs/application_time.log") instances_log = Log("Instances", "logs/instances.log") configure_logging() class OpenStackSparkApplicationExecutor(GenericApplicationExecutor): def __init__(self, app_id): self.application_state = "None" self.state_lock = threading.RLock() self.application_time = -1 self.start_time = -1 self.app_id = app_id self._verify_existing_log_paths(app_id)
# Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or # implied. # See the License for the specific language governing permissions and # limitations under the License. import time import kubernetes as kube import redis from broker.service import api from influxdb import InfluxDBClient from broker.utils.logger import Log KUBEJOBS_LOG = Log("KubeJobsPlugin", "logs/kubejobs.log") def create_job( app_id, cmd, img, init_size, env_vars, config_id="", cas_addr="", scone_heap="200M", las_addr="172.17.0.1:18766", scone_hw="hw", scone_queues="4", scone_version="1",
from broker.plugins import base from broker.service import api from broker.utils.framework import optimizer from broker.utils.framework import monitor from broker.utils.framework import controller from broker.utils.plugins import mesos from broker.utils import ssh from broker.utils.logger import Log, configure_logging from broker.plugins.base import GenericApplicationExecutor from uuid import uuid4 import time import threading plugin_log = Log("Spark-Mesos_Plugin", "logs/mesos_plugin.log") configure_logging() class SparkMesosApplicationExecutor(GenericApplicationExecutor): def __init__(self, app_id, frameworks_url): self.application_state = "None" self.state_lock = threading.RLock() self.application_time = -1 self.start_time = -1 self.app_id = app_id self.frameworks_url = frameworks_url def get_application_state(self): with self.state_lock: state = self.application_state
# implied. # See the License for the specific language governing permissions and # limitations under the License. import json import requests from broker.utils.plugins.chronos import ManagerChronos from broker.plugins.base import GenericApplicationExecutor from broker.plugins import base from broker.utils.ids import ID_Generator from broker.utils.logger import Log from broker.service import api LOG = Log("ChronosPlugin", "logs/chronos_plugin.log") application_time_log = Log("Application_time", "logs/application_time.log") class ChronosApplicationExecutor(GenericApplicationExecutor): def __init__(self): self.id = ID_Generator().get_ID() def start_application(self, data): try: self.update_application_state("Running") # Credentials of framework url = api.chronos_url user = api.chronos_username
def __init__(self): self.map = {"job": []} self.logger = Log('redis_mock_log', 'redis_mock.log')
import time import threading import uuid from broker import exceptions as ex from broker.plugins import base from broker.service import api from broker.utils import hdfs from broker.utils import remote from broker.utils import spark from broker.utils.logger import Log, configure_logging from broker.utils.ids import ID_Generator from broker.plugins.base import GenericApplicationExecutor plugin_log = Log("SparkGeneric_Plugin", "logs/sparkgeneric_plugin.log") application_time_log = Log("Application_Time", "logs/application_time.log") instances_log = Log("Instances", "logs/instances.log") configure_logging() class SparkGenericApplicationExecutor(GenericApplicationExecutor): def __init__(self, app_id, master_ip): self.application_state = "None" self.state_lock = threading.RLock() self.application_time = -1 self.start_time = -1 self.app_id = app_id self.master = master_ip
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or # implied. # See the License for the specific language governing permissions and # limitations under the License. import configparser import kubernetes as kube from broker.utils.logger import Log API_LOG = Log("APIv10", "logs/APIv10.log") CONFIG_PATH = "./data/conf" try: # Conf reading config = configparser.RawConfigParser() config.read('./broker.cfg') """ Services configuration """ monitor_url = config.get('services', 'monitor_url') controller_url = config.get('services', 'controller_url') visualizer_url = config.get('services', 'visualizer_url') authorization_url = config.get('services', 'authorization_url') optimizer_url = config.get('services', 'optimizer_url') """ General configuration """
# Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or # implied. # See the License for the specific language governing permissions and # limitations under the License. from broker.plugins import base as plugin_base from broker.service import api from broker.utils.logger import Log from broker.utils.framework import authorizer from broker.utils.framework import optimizer from broker import exceptions as ex API_LOG = Log("APIv10", "logs/APIv10.log") submissions = {} def run_submission(data): if ('plugin' not in data or 'plugin_info' not in data): API_LOG.log("Missing plugin fields in request") raise ex.BadRequestException("Missing plugin fields in request") if data['enable_auth']: if ('username' not in data or 'password' not in data): API_LOG.log("Missing plugin fields in request") raise ex.BadRequestException("Missing plugin fields in request") username = data['username']
# distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or # implied. # See the License for the specific language governing permissions and # limitations under the License. import traceback import flask from werkzeug import datastructures from broker import exceptions as ex from broker.utils import serializer as u_serializer from broker.utils.logger import Log LOG = Log("UtilsAPI", "logs/utilsapi.log") class Rest(flask.Blueprint): def get(self, rule, status_code=200): return self._mroute('GET', rule, status_code) def post(self, rule, status_code=202): return self._mroute('POST', rule, status_code) def post_file(self, rule, status_code=202): return self._mroute('POST', rule, status_code, file_upload=True) def put(self, rule, status_code=204): return self._mroute('PUT', rule, status_code)
# distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or # implied. # See the License for the specific language governing permissions and # limitations under the License. import abc import six from stevedore import enabled from broker import exceptions as ex from broker.service import api from broker.utils.logger import Log LOG = Log("Servicev10", "logs/serviceAPIv10.log") def required(fun): return abc.abstractmethod(fun) def required_with_default(fun): return fun def optional(fun): fun.__not_implemented__ = True return fun
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or # implied. # See the License for the specific language governing permissions and # limitations under the License. import requests import time from broker.utils.logger import Log spark_log = Log("Spark Log", "logs/spark.log") def get_running_app(submission_url, applications, number_of_attempts): app_id = None attempts = 0 while app_id is None: try: all_app = requests.get('http://' + submission_url + ':4040/api/v1/applications?status=running') for app in all_app.json(): if app['attempts'][0]['completed'] == False: if app['id'] not in applications: print app['id'] return app['id']#, app['name']
class SparkGenericApplicationExecutor(GenericApplicationExecutor): def __init__(self, app_id, master_ip): self.application_state = "None" self.state_lock = threading.RLock() self.application_time = -1 self.start_time = -1 self.app_id = app_id self.master = master_ip self._verify_existing_log_paths(app_id) self._clean_log_files(app_id) self.running_log = Log("Running_Application_%s" % app_id, "logs/apps/%s/execution" % app_id) self.stdout = Log("stdout_%s" % app_id, "logs/apps/%s/stdout" % app_id) self.stderr = Log("stderr_%s" % app_id, "logs/apps/%s/stderr" % app_id) def get_application_state(self): with self.state_lock: state = self.application_state return state def update_application_state(self, state): with self.state_lock: self.application_state = state def get_application_execution_time(self): return self.application_time def get_application_start_time(self): return self.start_time def start_application(self, data, spark_applications_ids, app_id): try: self.update_application_state("Running") # Broker Parameters key_path = api.key_path remote_hdfs = api.remote_hdfs number_of_attempts = api.number_of_attempts master_ip = self.master # User Request Parameters args = data['args'] main_class = data['main_class'] dependencies = data['dependencies'] job_binary_url = data['job_binary_url'] self._log("%s | Master is %s" % (time.strftime("%H:%M:%S"), master_ip)) job_status = self._hdfs_spark_execution(master_ip, remote_hdfs, key_path, args, job_binary_url, main_class, dependencies, spark_applications_ids, number_of_attempts) self._log("%s | Finished application execution" % time.strftime("%H:%M:%S")) return job_status except KeyError as ke: self._log("%s | Parameter missing in submission: %s, " "please check the config file" % (time.strftime("%H:%M:%S"), str(ke))) self._log("%s | Finished application execution with error" % (time.strftime("%H:%M:%S"))) self.update_application_state("Error") except Exception: self._log("%s | Unknown error, please report to administrators " "of WP3 infrastructure" % (time.strftime("%H:%M:%S"))) self._log("%s | Finished application execution with error" % (time.strftime("%H:%M:%S"))) self.update_application_state("Error") def get_application_time(self): return self.application_time def _hdfs_spark_execution(self, master, remote_hdfs, key_path, args, job_bin_url, main_class, dependencies, spark_applications_ids, number_of_attempts): job_exec_id = str(uuid.uuid4())[0:7] self._log("%s | Job execution ID: %s" % (time.strftime("%H:%M:%S"), job_exec_id)) # Defining params local_path = '/tmp/spark-jobs/' + job_exec_id + '/' job_binary_path = hdfs.get_path(job_bin_url) # Create temporary job directories self._log("%s | Create temporary job directories" % time.strftime("%H:%M:%S")) self._mkdir(local_path) # Create cluster directories self._log("%s | Creating cluster directories" % time.strftime("%H:%M:%S")) remote.execute_command(master, key_path, 'mkdir -p %s' % local_path) # Get job binary from hdfs self._log("%s | Get job binary from hdfs" % time.strftime("%H:%M:%S")) remote.copy_from_hdfs(master, key_path, remote_hdfs, job_binary_path, local_path) # Enabling event log on cluster self._log("%s | Enabling event log on cluster" % time.strftime("%H:%M:%S")) self._enable_event_log(master, key_path, local_path) # Submit job self._log("%s | Starting job" % time.strftime("%H:%M:%S")) local_binary_file = ( local_path + remote.list_directory(key_path, master, local_path)) spark_job = self._submit_job(master, key_path, main_class, dependencies, local_binary_file, args) spark_app_id = spark.get_running_app(master, spark_applications_ids, number_of_attempts) if spark_app_id is None: self._log("%s | Error on submission of application, " "please check the config file" % time.strftime("%H:%M:%S")) (output, err) = spark_job.communicate() self.stdout.log(output) self.stderr.log(err) raise ex.ConfigurationError() spark_applications_ids.append(spark_app_id) (output, err) = spark_job.communicate() self.stdout.log(output) self.stderr.log(err) self._log("%s | Copy log from cluster" % (time.strftime("%H:%M:%S"))) event_log_path = local_path + 'eventlog/' self._mkdir(event_log_path) remote_event_log_path = 'ubuntu@%s:%s%s' % (master, local_path, spark_app_id) remote.copy(key_path, remote_event_log_path, event_log_path) spark_applications_ids.remove(spark_app_id) self.update_application_state("OK") return 'OK' def _submit_job(self, remote_instance, key_path, main_class, dependencies, job_binary_file, args): args_line = '' for arg in args: args_line += arg + ' ' spark_submit = ('/opt/spark/bin/spark-submit ' '--packages %(dependencies)s ' '--class %(main_class)s ' '--master spark://%(master)s:7077 ' '%(job_binary_file)s %(args)s ' % { 'dependencies': dependencies, 'main_class': main_class, 'master': remote_instance, 'job_binary_file': 'file://' + job_binary_file, 'args': args_line }) if main_class == '': spark_submit = spark_submit.replace('--class', '') if dependencies == '': spark_submit = spark_submit.replace('--packages', '') job = remote.execute_command_popen(remote_instance, key_path, spark_submit) return job def _enable_event_log(self, master, key_path, path): enable_event_log_command = ( "echo -e 'spark.executor.extraClassPath " "/usr/lib/hadoop-mapreduce/hadoop-openstack.jar\n" "spark.eventLog.enabled true\n" "spark.eventLog.dir " "file://%(path)s' > " "/opt/spark/conf/spark-defaults.conf" % { 'path': path }) remote.execute_command(master, key_path, enable_event_log_command) def _log(self, string): plugin_log.log(string) self.running_log.log(string) def _verify_existing_log_paths(self, app_id): if not os.path.exists('logs'): os.mkdir('logs') elif not os.path.exists('logs/apps'): os.mkdir('logs/apps') if not os.path.exists('logs/apps/%s' % app_id): os.mkdir('logs/apps/%s' % app_id) def _clean_log_files(self, app_id): open("logs/apps/%s/execution" % app_id, "w").close() open("logs/apps/%s/stdout" % app_id, "w").close() open("logs/apps/%s/stderr" % app_id, "w").close() def _mkdir(self, path): subprocess.call('mkdir -p %s' % path, shell=True)
class OpenStackSparkApplicationExecutor(GenericApplicationExecutor): def __init__(self, app_id): self.application_state = "None" self.state_lock = threading.RLock() self.application_time = -1 self.start_time = -1 self.app_id = app_id self._verify_existing_log_paths(app_id) self._clean_log_files(app_id) self.running_log = Log("Running_Application_%s" % app_id, "logs/apps/%s/execution" % app_id) self.stdout = Log("stdout_%s" % app_id, "logs/apps/%s/stdout" % app_id) self.stderr = Log("stderr_%s" % app_id, "logs/apps/%s/stderr" % app_id) def get_application_state(self): with self.state_lock: state = self.application_state return state def update_application_state(self, state): with self.state_lock: self.application_state = state def get_application_execution_time(self): return self.application_time def get_application_start_time(self): return self.start_time def start_application(self, data, spark_applications_ids, app_id): try: self.update_application_state("Running") # Broker Parameters cluster_id = None user = api.user password = api.password project_id = api.project_id auth_ip = api.auth_ip domain = api.domain public_key = api.public_key key_path = api.key_path log_path = api.log_path container = api.container hosts = api.hosts remote_hdfs = api.remote_hdfs swift_logdir = api.swift_logdir number_of_attempts = api.number_of_attempts dummy_opportunistic = api.dummy_opportunistic # User Request Parameters net_id = data['net_id'] master_ng = data['master_ng'] slave_ng = data['slave_ng'] op_slave_ng = data['opportunistic_slave_ng'] opportunism = str(data['opportunistic']) plugin = data['openstack_plugin'] percentage = int(data['percentage']) job_type = data['job_type'] version = data['version'] args = data['args'] main_class = data['main_class'] dependencies = data['dependencies'] job_template_name = data['job_template_name'] job_binary_name = data['job_binary_name'] job_binary_url = data['job_binary_url'] image_id = data['image_id'] monitor_plugin = data['monitor_plugin'] expected_time = data['expected_time'] collect_period = data['collect_period'] number_of_jobs = data['number_of_jobs'] image_id = data['image_id'] starting_cap = data['starting_cap'] # Optimizer Parameters app_name = data['app_name'] days = 0 if app_name.lower() == 'bulma': if 'days' in data.keys(): days = data['days'] else: self._log("""%s | 'days' parameter missing""" % (time.strftime("%H:%M:%S"))) raise ex.ConfigurationError() # Openstack Components connector = os_connector.OpenStackConnector(plugin_log) sahara = connector.get_sahara_client(user, password, project_id, auth_ip, domain) swift = connector.get_swift_client(user, password, project_id, auth_ip, domain) nova = connector.get_nova_client(user, password, project_id, auth_ip, domain) # Optimizer gets the vcpu size of flavor cores_per_slave = connector.get_vcpus_by_nodegroup( nova, sahara, slave_ng) cores, vms = optimizer.get_info(api.optimizer_url, expected_time, app_name, days) if cores <= 0: if 'cluster_size' in data.keys(): req_cluster_size = data['cluster_size'] else: self._log("""%s | 'cluster_size' parameter missing""" % (time.strftime("%H:%M:%S"))) raise ex.ConfigurationError() else: req_cluster_size = int( math.ceil(cores / float(cores_per_slave))) # Check Oportunism if opportunism == "True": self._log("""%s | Checking if opportunistic instances are available""" % (time.strftime("%H:%M:%S"))) pred_cluster_size = optimizer.get_cluster_size( api.optimizer_url, hosts, percentage, dummy_opportunistic) else: pred_cluster_size = req_cluster_size if pred_cluster_size > req_cluster_size: cluster_size = pred_cluster_size else: cluster_size = req_cluster_size self._log("%s | Cluster size: %s" % (time.strftime("%H:%M:%S"), str(cluster_size))) self._log("%s | Creating cluster..." % (time.strftime("%H:%M:%S"))) cluster_id = self._create_cluster(sahara, connector, req_cluster_size, pred_cluster_size, public_key, net_id, image_id, plugin, version, master_ng, slave_ng, op_slave_ng) self._log("%s | Cluster id: %s" % (time.strftime("%H:%M:%S"), cluster_id)) swift_path = self._is_swift_path(args) if cluster_id: master = connector.get_master_instance( sahara, cluster_id)['internal_ip'] self._log("%s | Master is %s" % (time.strftime("%H:%M:%S"), master)) workers = connector.get_worker_instances(sahara, cluster_id) workers_id = [] for worker in workers: workers_id.append(worker['instance_id']) self._log("%s | Configuring controller" % (time.strftime("%H:%M:%S"))) controller.setup_environment(api.controller_url, workers_id, starting_cap, data) if swift_path: job_status = self._swift_spark_execution( master, key_path, sahara, connector, job_binary_name, job_binary_url, user, password, job_template_name, job_type, plugin, cluster_size, args, main_class, cluster_id, spark_applications_ids, workers_id, app_id, expected_time, monitor_plugin, collect_period, number_of_jobs, log_path, swift, container, data, number_of_attempts) else: job_status = self._hdfs_spark_execution( master, remote_hdfs, key_path, args, job_binary_url, main_class, dependencies, spark_applications_ids, expected_time, monitor_plugin, collect_period, number_of_jobs, workers_id, data, connector, swift, swift_logdir, container, number_of_attempts) else: # FIXME: exception type self.update_application_state("Error") raise ex.ClusterNotCreatedException() # Delete cluster self._log("%s | Delete cluster: %s" % (time.strftime("%H:%M:%S"), cluster_id)) connector.delete_cluster(sahara, cluster_id) self._log("%s | Finished application execution" % (time.strftime("%H:%M:%S"))) return job_status except KeyError as ke: self._log("%s | Parameter missing in submission: %s, " "please check the config file" % (time.strftime("%H:%M:%S"), str(ke))) self._log("%s | Finished application execution with error" % (time.strftime("%H:%M:%S"))) self.update_application_state("Error") except ex.ConfigurationError: self._log("%s | Finished application execution with error" % (time.strftime("%H:%M:%S"))) self.update_application_state("Error") except SaharaAPIException: self._log("%s | There is not enough resource to create a cluster" % (time.strftime("%H:%M:%S"))) self._log("%s | Finished application execution with error" % (time.strftime("%H:%M:%S"))) self.update_application_state("Error") except Exception: if cluster_id is not None: self._log("%s | Delete cluster: %s" % (time.strftime("%H:%M:%S"), cluster_id)) connector.delete_cluster(sahara, cluster_id) self._log("%s | Unknown error, please report to administrators " "of WP3 infrastructure" % (time.strftime("%H:%M:%S"))) self._log("%s | Finished application execution with error" % (time.strftime("%H:%M:%S"))) self.update_application_state("Error") def get_application_time(self): return self.application_time def _get_job_binary_id(self, sahara, connector, job_binary_name, job_binary_url, user, password): extra = dict(user=user, password=password) job_binary_id = connector.get_job_binary(sahara, job_binary_url) if not job_binary_id: job_binary_id = connector.create_job_binary( sahara, job_binary_name, job_binary_url, extra) return job_binary_id def _get_job_template_id(self, sahara, connector, mains, job_template_name, job_type): job_template_id = connector.get_job_template(sahara, mains) if not job_template_id: job_template_id = connector.create_job_template( sahara, job_template_name, job_type, mains) return job_template_id def _wait_on_job_finish(self, sahara, connector, job_exec_id, spark_app_id): completed = failed = False start_time = datetime.datetime.now() self.start_time = time.mktime(start_time.timetuple()) while not (completed or failed): job_status = connector.get_job_status(sahara, job_exec_id) self._log("%s | Sahara current job status: %s" % (time.strftime("%H:%M:%S"), job_status)) if job_status == 'RUNNING': time.sleep(2) current_time = datetime.datetime.now() current_job_time = (current_time - start_time).total_seconds() if current_job_time > 3600: self._log("%s | Job execution killed due to inactivity" % time.strftime("%H:%M:%S")) job_status = 'TIMEOUT' completed = connector.is_job_completed(job_status) failed = connector.is_job_failed(job_status) end_time = datetime.datetime.now() total_time = end_time - start_time application_time_log.log( "%s|%.0f|%.0f" % (spark_app_id, float(time.mktime( start_time.timetuple())), float(total_time.total_seconds()))) self.application_time = total_time.total_seconds() self._log("%s | Sahara job took %s seconds to execute" % (time.strftime("%H:%M:%S"), str(total_time.total_seconds()))) return job_status def _create_cluster(self, sahara, connector, req_cluster_size, pred_cluster_size, public_key, net_id, image_id, plugin, version, master_ng, slave_ng, op_slave_ng): self._log('Creating cluster') try: cluster_id = connector.create_cluster(sahara, req_cluster_size, pred_cluster_size, public_key, net_id, image_id, plugin, version, master_ng, slave_ng, op_slave_ng) except SaharaAPIException: raise SaharaAPIException('Could not create clusters') return cluster_id def _is_swift_path(self, args): for arg in args: if arg.startswith('hdfs://') or arg.startswith('swift://'): if arg.startswith('swift://'): return True else: return False def _swift_spark_execution(self, master, key_path, sahara, connector, job_binary_name, job_binary_url, user, password, job_template_name, job_type, plugin, cluster_size, args, main_class, cluster_id, spark_applications_ids, workers_id, app_id, expected_time, monitor_plugin, collect_period, number_of_jobs, log_path, swift, container, data, number_of_attempts): # Preparing job job_binary_id = self._get_job_binary_id(sahara, connector, job_binary_name, job_binary_url, user, password) mains = [job_binary_id] job_template_id = self._get_job_template_id(sahara, connector, mains, job_template_name, job_type) self._log("%s | Starting job..." % (time.strftime("%H:%M:%S"))) # Running job # What is os_utils? # configs = os_utils.get_job_config(connector, plugin, # cluster_size, user, password, # args, main_class) configs = None job = connector.create_job_execution(sahara, job_template_id, cluster_id, configs=configs) self._log("%s | Created job" % (time.strftime("%H:%M:%S"))) spark_app_id = spark.get_running_app(master, spark_applications_ids, number_of_attempts) spark_applications_ids.append(spark_app_id) self._log("%s | Spark app id" % (time.strftime("%H:%M:%S"))) job_exec_id = job.id for worker_id in workers_id: instances_log.log("%s|%s" % (app_id, worker_id)) job_status = connector.get_job_status(sahara, job_exec_id) self._log("%s | Sahara job status: %s" % (time.strftime("%H:%M:%S"), job_status)) info_plugin = { "spark_submisson_url": "http://" + master, "expected_time": expected_time, "number_of_jobs": number_of_jobs } self._log("%s | Starting monitor" % (time.strftime("%H:%M:%S"))) monitor.start_monitor(api.monitor_url, spark_app_id, monitor_plugin, info_plugin, collect_period) self._log("%s | Starting controller" % (time.strftime("%H:%M:%S"))) controller.start_controller(api.controller_url, spark_app_id, workers_id, data) job_status = self._wait_on_job_finish(sahara, connector, job_exec_id, app_id) self._log("%s | Stopping monitor" % (time.strftime("%H:%M:%S"))) monitor.stop_monitor(api.monitor_url, spark_app_id) self._log("%s | Stopping controller" % (time.strftime("%H:%M:%S"))) controller.stop_controller(api.controller_url, spark_app_id) spark_applications_ids.remove(spark_app_id) self._log("Finished application execution") if connector.is_job_completed(job_status): self.update_application_state("OK") if connector.is_job_failed(job_status): self.update_application_state("Error") return job_status def _hdfs_spark_execution(self, master, remote_hdfs, key_path, args, job_bin_url, main_class, dependencies, spark_applications_ids, expected_time, monitor_plugin, collect_period, number_of_jobs, workers_id, data, connector, swift, swift_logdir, container, number_of_attempts): job_exec_id = str(uuid.uuid4())[0:7] self._log("%s | Job execution ID: %s" % (time.strftime("%H:%M:%S"), job_exec_id)) # Defining params local_path = '/tmp/spark-jobs/' + job_exec_id + '/' # remote_path = 'ubuntu@' + master + ':' + local_path job_input_paths, job_output_path, job_params = (hdfs.get_job_params( key_path, remote_hdfs, args)) job_binary_path = hdfs.get_path(job_bin_url) # Create temporary job directories self._log("%s | Create temporary job directories" % (time.strftime("%H:%M:%S"))) self._mkdir(local_path) # Create cluster directories self._log("%s | Creating cluster directories" % (time.strftime("%H:%M:%S"))) remote.execute_command(master, key_path, 'mkdir -p %s' % local_path) # Get job binary from hdfs self._log("%s | Get job binary from hdfs" % (time.strftime("%H:%M:%S"))) remote.copy_from_hdfs(master, key_path, remote_hdfs, job_binary_path, local_path) # Enabling event log on cluster self._log("%s | Enabling event log on cluster" % (time.strftime("%H:%M:%S"))) self._enable_event_log(master, key_path, local_path) # Submit job self._log("%s | Starting job" % (time.strftime("%H:%M:%S"))) local_binary_file = ( local_path + remote.list_directory(key_path, master, local_path)) spark_job = self._submit_job(master, key_path, main_class, dependencies, local_binary_file, args) spark_app_id = spark.get_running_app(master, spark_applications_ids, number_of_attempts) if spark_app_id is None: self._log("%s | Error on submission of application, " "please check the config file" % (time.strftime("%H:%M:%S"))) (output, err) = spark_job.communicate() self.stdout.log(output) self.stderr.log(err) raise ex.ConfigurationError() spark_applications_ids.append(spark_app_id) info_plugin = { "spark_submisson_url": "http://" + master, "expected_time": expected_time, "number_of_jobs": number_of_jobs } self._log("%s | Starting monitor" % (time.strftime("%H:%M:%S"))) monitor.start_monitor(api.monitor_url, spark_app_id, monitor_plugin, info_plugin, collect_period) self._log("%s | Starting controller" % (time.strftime("%H:%M:%S"))) controller.start_controller(api.controller_url, spark_app_id, workers_id, data) (output, err) = spark_job.communicate() self._log("%s | Stopping monitor" % (time.strftime("%H:%M:%S"))) monitor.stop_monitor(api.monitor_url, spark_app_id) self._log("%s | Stopping controller" % (time.strftime("%H:%M:%S"))) controller.stop_controller(api.controller_url, spark_app_id) self.stdout.log(output) self.stderr.log(err) self._log("%s | Copy log from cluster" % (time.strftime("%H:%M:%S"))) event_log_path = local_path + 'eventlog/' self._mkdir(event_log_path) remote_event_log_path = 'ubuntu@%s:%s%s' % (master, local_path, spark_app_id) remote.copy(key_path, remote_event_log_path, event_log_path) self._log("%s | Upload log to Swift" % (time.strftime("%H:%M:%S"))) connector.upload_directory(swift, event_log_path, swift_logdir, container) spark_applications_ids.remove(spark_app_id) self.update_application_state("OK") return 'OK' def _submit_job(self, remote_instance, key_path, main_class, dependencies, job_binary_file, args): args_line = '' for arg in args: args_line += arg + ' ' spark_submit = ('/opt/spark/bin/spark-submit ' '--packages %(dependencies)s ' '--class %(main_class)s ' '--master spark://%(master)s:7077 ' '%(job_binary_file)s %(args)s ' % { 'dependencies': dependencies, 'main_class': main_class, 'master': remote_instance, 'job_binary_file': 'file://' + job_binary_file, 'args': args_line }) if main_class == '': spark_submit = spark_submit.replace('--class', '') if dependencies == '': spark_submit = spark_submit.replace('--packages', '') self._log("%s | spark-submit: %s" % (time.strftime("%H:%M:%S"), spark_submit)) job = remote.execute_command_popen(remote_instance, key_path, spark_submit) return job def _enable_event_log(self, master, key_path, path): enable_event_log_command = ( "echo -e 'spark.executor.extraClassPath " "/usr/lib/hadoop-mapreduce/hadoop-openstack.jar\n" "spark.eventLog.enabled true\n" "spark.eventLog.dir " "file://%(path)s' > " "/opt/spark/conf/spark-defaults.conf" % { 'path': path }) remote.execute_command(master, key_path, enable_event_log_command) def _log(self, string): plugin_log.log(string) self.running_log.log(string) def _verify_existing_log_paths(self, app_id): if not os.path.exists('logs'): os.mkdir('logs') elif not os.path.exists('logs/apps'): os.mkdir('logs/apps') if not os.path.exists('logs/apps/%s' % app_id): os.mkdir('logs/apps/%s' % app_id) def _clean_log_files(self, app_id): # Commented because isn't used # running_log_file = open("logs/apps/%s/execution" \ # % app_id, "w").close() # stdout_file = open("logs/apps/%s/stdout" % app_id, "w").close() # stderr_file = open("logs/apps/%s/stderr" % app_id, "w").close() pass def _mkdir(self, path): subprocess.call('mkdir -p %s' % path, shell=True)
# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or # implied. # See the License for the specific language governing permissions and # limitations under the License. import subprocess from broker.utils.logger import Log R_PREFIX = 'Rscript ' PYTHON_PREFIX = 'python ' LOGGER = Log('utils_shell_log', 'shell.log') def execute_r_script(script, args): command = R_PREFIX + script + " " + " ".join(args) p_status = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) out, err = p_status.communicate() try: LOGGER.log("{} {}".format(out, err)) value = float(out) return value except Exception as e: LOGGER.log(e)
import time import datetime import uuid from broker.plugins.base import GenericApplicationExecutor from broker.plugins import base from broker.utils.ids import ID_Generator from broker.utils.logger import Log from broker.utils.plugins import k8s from broker.utils.framework import monitor from broker.utils.framework import controller from broker.utils.framework import visualizer from broker.service import api from broker.service.api import v10 KUBEJOBS_LOG = Log("KubeJobsPlugin", "logs/kubejobs.log") application_time_log = Log("Application_time", "logs/application_time.log") class KubeJobsExecutor(GenericApplicationExecutor): def __init__(self, app_id): self.id = ID_Generator().get_ID() self.app_id = app_id self.starting_time = None self.rds = None self.status = "created" self.waiting_time = 600 self.job_completed = False self.terminated = False self.visualizer_url = "URL not generated!"