def update_stateless_job(self, job_id, new_job_config):
    """
    Create a rolling update that moves a stateless job to a new config.

    param job_id: id of the job
    param new_job_config: new config of the job
    type job_id: str
    type new_job_config: job.JobConfig
    rtype: job.UpdateResponse
    """
    update_request = update_svc.CreateUpdateRequest(
        jobId=peloton.JobID(value=job_id),
        jobConfig=new_job_config,
        updateConfig=update_pb2.UpdateConfig(),
    )
    try:
        print_okblue("Updating Job %s" % job_id)
        return self.client.update_svc.CreateUpdate(
            update_request,
            metadata=self.client.jobmgr_metadata,
            timeout=default_timeout,
        )
    except Exception as e:
        print_fail("Exception calling Update Stateless Job: %s" % str(e))
        raise
def stop_task(self, job_id, instance_id):
    """
    Stop a single task instance of a job.

    param job_id: id of the job
    param instance_id: instance id of the task to stop
    type job_id: str
    type instance_id: int
    rtype: task.StopResponse
    """
    # `from` is a Python keyword, so the protobuf field must be set with
    # setattr rather than as a constructor kwarg.
    instance_range = task.InstanceRange(to=instance_id + 1)
    setattr(instance_range, "from", instance_id)
    stop_request = task.StopRequest(
        jobId=peloton.JobID(value=job_id),
        ranges=[instance_range],
    )
    try:
        print_okblue("Stopping task %d of Job %s" % (instance_id, job_id))
        return self.client.task_svc.Stop(
            stop_request,
            metadata=self.client.jobmgr_metadata,
            timeout=default_timeout,
        )
    except Exception as e:
        print_fail("Exception calling Stop Tasks :%s" % str(e))
        raise
def setup(self, dynamic_env, instance_number, job_name=None,
          version=None, image_path=None):
    """
    Create this module as a Peloton job and wait for it to run.

    param dynamic_env: dict of dynamic environment variables
    param instance_number: number of tasks in the job
    type dynamic_env: dict
    type instance_number: int
    return: job-id
    """
    job_name = job_name or self.label + '_' + self.name
    if version:
        self.version = version
    task_config = create_mesos_task_config(
        self.config, self.name, dynamic_env, version, image_path)
    resp = self.peloton_helper.create_job(
        label=self.label,
        name=job_name,
        default_task_config=task_config,
        num_instance=instance_number,
    )
    self.job_id = resp.jobId.value
    print_okblue('Waiting for job %s creating...' % job_name)
    if not self.peloton_helper.monitering(self.job_id,
                                          RUNNING_TARGET_STATUS):
        raise ModuleLaunchFailedException("%s can not launch" % self.name)
    return self.job_id
def teardown_peloton(self, remove=False):
    """
    Stop every Peloton application, then Cassandra, and finally remove
    the generated config file (best-effort).
    """
    print_okgreen("Step: stopping all peloton applications")
    # Tear down in reverse start order so dependent apps go down first.
    for application in reversed(self.APP_ORDER):
        print_okblue("Stopping peloton application: %s" % application)
        job_label = self.label_name + "_" + "peloton-" + application
        self.peloton.teardown(job_label, remove=remove)
    print_okgreen("Step: stopping cassandra")
    self.cassandra.teardown(remove=remove)
    try:
        os.remove(self.config_name)
    except OSError:
        # Config file may already be gone; nothing to clean up.
        pass
def create_job(
    self,
    label,
    name,
    num_instance,
    default_task_config,
    instance_config=None,
    **extra
):
    """
    Submit a job-creation request to the Peloton job service.

    :param label: the label value of the job
    :param name: the name of the job
    :param num_instance: the number of instances of the job
    :param default_task_config: the default task config of the job
    :param instance_config: instance specific task config
    :param extra: extra information of the job
    :type label: str
    :type name: str
    :type num_instance: int
    :type default_task_config: task.TaskConfig
    :type instance_config: dict<int, task.TaskConfig>
    :type extra: dict
    :rtype: job.CreateResponse
    """
    job_config = self.get_job_config_spec(
        label,
        name,
        num_instance,
        default_task_config,
        instance_config=instance_config,
        **extra
    )
    create_request = job.CreateRequest(config=job_config)
    try:
        response = self.client.job_svc.Create(
            create_request,
            metadata=self.client.jobmgr_metadata,
            timeout=default_timeout,
        )
        print_okblue("Create job response : %s" % response)
        return response
    except Exception as e:
        print_fail("Exception calling Create job :%s" % str(e))
        raise
def setup(
    self,
    dynamic_env,
    instance_number,
    job_name=None,
    version=None,
    image_path=None,
):
    """
    Overrides setup() from base-class to create hostmgr in a phased
    manner.

    hostmgr is first launched with a single instance so DB migrations
    run exactly once, then the job is updated to the requested instance
    count.

    param dynamic_env: dict of dynamic environment variables
    param instance_number: number of tasks in the job
    return: job-id
    """
    # BUG FIX: job_name defaults to None; `"hostmgr" not in None` would
    # raise TypeError. A missing name can never be the hostmgr job, so
    # fall through to the base-class behavior.
    if not job_name or "hostmgr" not in job_name:
        return super(Peloton, self).setup(
            dynamic_env,
            instance_number,
            job_name=job_name,
            version=version,
            image_path=image_path,
        )
    # create a single instance of hostmgr to avoid running DB migrations
    # concurrently.
    super(Peloton, self).setup(
        dynamic_env,
        1,
        job_name=job_name,
        version=version,
        image_path=image_path,
    )
    # Wait a little so that DB migration can complete.
    # TODO(amitbose) Find a better way to wait
    time.sleep(30)
    jobInfo = self.peloton_helper.get_job(self.job_id).jobInfo
    runtime = jobInfo.runtime
    config = jobInfo.config
    # update the job to change the instances; the changeLog version must
    # match the current runtime config version for the update to apply.
    config.instanceCount = instance_number
    cl = peloton.ChangeLog(version=runtime.configurationVersion)
    config.changeLog.MergeFrom(cl)
    self.peloton_helper.update_stateless_job(self.job_id, config)
    print_okblue("Waiting for job %s update..." % job_name)
    if not self.peloton_helper.monitering(
        self.job_id, RUNNING_TARGET_STATUS
    ):
        raise ModuleLaunchFailedException(
            "%s can not launch: update failed" % self.name
        )
    return self.job_id
def setup(self, dynamic_env, instance_number, job_name=None, version=None):
    """
    Launch this module as a Peloton job with a per-instance task config.

    param dynamic_env: dict of dynamic environment variables
    param instance_number: number of tasks in the job
    type dynamic_env: dict
    type instance_number: int
    return: job-id
    """
    job_name = job_name or self.label + "_" + self.name
    if version:
        self.version = version
    per_instance_config = {}
    for idx in range(instance_number):
        # Give each instance a unique hostname so Mesos can tell the
        # instances apart.
        dynamic_env["MESOS_HOSTNAME"] = "-".join(
            [self.label, self.name, str(idx), str(uuid.uuid4())]
        )
        per_instance_config[idx] = create_mesos_task_config(
            self.config, self.name, dynamic_env, version
        )
    resp = self.peloton_helper.create_job(
        label=self.label,
        name=job_name,
        default_task_config=per_instance_config[0],
        instance_config=per_instance_config,
        num_instance=instance_number,
    )
    self.job_id = resp.jobId.value
    print_okblue("Waiting for job %s setup..." % job_name)
    if not self.peloton_helper.monitering(
        self.job_id, RUNNING_TARGET_STATUS
    ):
        raise ModuleLaunchFailedException("%s can not launch" % self.name)
    return self.job_id
def delete_job(self, job_id):
    """
    Delete a job by id.

    param job_id: id of the job
    type job_id: str
    rtype: job.DeleteResponse
    """
    delete_request = job.DeleteRequest(id=peloton.JobID(value=job_id))
    try:
        print_okblue("Deleting job %s" % job_id)
        return self.client.job_svc.Delete(
            delete_request,
            metadata=self.client.jobmgr_metadata,
            timeout=default_timeout,
        )
    except Exception as e:
        print_fail("Exception calling delete job :%s" % str(e))
        raise
def stop_job(self, job_id):
    """
    Stop all tasks of a job.

    param job_id: id of the job
    type job_id: str
    rtype: task.StopResponse
    """
    # A StopRequest with no ranges addresses every task of the job.
    request = task.StopRequest(jobId=peloton.JobID(value=job_id))
    try:
        print_okblue("Killing all tasks of Job %s" % job_id)
        resp = self.client.task_svc.Stop(
            request,
            metadata=self.client.jobmgr_metadata,
            timeout=default_timeout,
        )
        return resp
    except Exception as e:
        # BUG FIX: message previously said "List Tasks" (copy-paste from
        # another wrapper); name the actual operation.
        print_fail("Exception calling Stop Tasks :%s" % str(e))
        raise
def create_respool_for_new_peloton(config, zk_server, agent_num,
                                   respool_name=RESPOOL_PATH):
    """
    Create a resource pool sized for the cluster.

    The pool is given 90% of the aggregate announced resources
    (CPU / memory / disk) of all Mesos agents, per the mesos-slave
    resource config.

    type config: dict
    type zk_server: string
    type agent_num: int
    type respool_name: string
    rtype: string
    """
    client = PelotonClient(name='peloton-client', zk_servers=zk_server)
    slave_resources = config.get('mesos-slave').get('resource')
    # Leave 10% headroom on the cluster.
    capacity_fraction = 0.9
    pool_config = create_pool_config(
        name=respool_name,
        cpu=agent_num * slave_resources.get('cpuLimit') * capacity_fraction,
        memory=agent_num * slave_resources.get('memLimitMb')
        * capacity_fraction,
        disk=agent_num * slave_resources.get('diskLimitMb')
        * capacity_fraction,
    )
    resp = client.respool_svc.CreateResourcePool(
        respool.CreateRequest(config=pool_config),
        metadata=client.resmgr_metadata,
        timeout=default_timeout,
    )
    if resp.HasField('error'):
        print_fail('Failed to create resource pool %s: %s'
                   % (respool_name, resp))
        raise Exception("Resource pool creation failed")
    print_okblue('Created resource pool %s' % respool_name)
    return resp.result.value
def update_job(self, job_id, new_job_config):
    """
    Update an existing job's config.

    param job_id: id of the job
    param new_job_config: new config of the job
    type job_id: str
    type new_job_config: job.JobConfig
    rtype: job.UpdateResponse
    """
    request = job.UpdateRequest(
        id=peloton.JobID(value=job_id),
        config=new_job_config,
    )
    try:
        print_okblue("Updating Job %s" % job_id)
        resp = self.client.job_svc.Update(
            request,
            metadata=self.client.jobmgr_metadata,
            timeout=default_timeout,
        )
        return resp
    # BUG FIX: was Python-2-only `except Exception, e:` — a syntax
    # error on Python 3 and inconsistent with the rest of the file.
    except Exception as e:
        print_fail('Exception calling Update Job: %s' % str(e))
        raise
def start_peloton(
    self,
    virtual_zookeeper,
    agent_num,
    version=None,
    skip_respool=False,
    peloton_image=None,
    peloton_apps_config=None,
):
    """
    param virtual_zookeeper : The zk url and port
    param agent_num : The number of mesos agents to start
    param version : The peloton version
    param skip_respool : To skip creating the default respool or not
    param peloton_image : The docker image of peloton
    param peloton_app_config : The path to the peloton apps configs
    type virtual_zookeeper : str
    type agent_num : int
    type version : str
    type skip_respool : bool
    type peloton_image : str
    type peloton_app_config : str
    """
    # Setup Cassandra
    chost, cport, keyspace = self.start_cassandra()
    # Wait a little for cassandra to start-up and create keyspace.
    # TODO(amitbose) find a better way to wait
    time.sleep(20)

    # The image tag (when present) overrides the explicit version arg.
    if peloton_image:
        parts = peloton_image.split(":")
        if len(parts) > 1:
            version = parts[-1]

    # Setup Peloton
    print_okgreen("Step: Create Peloton, version: %s, image: %s"
                  % (version, peloton_image))
    num_logs = self.config.get("peloton").get(
        "num_log_files", DEFAULT_PELOTON_NUM_LOG_FILES)
    # Start apps in dependency order; each app gets its own environment
    # built from the shared cassandra/zk endpoints plus app specifics.
    for app in self.APP_ORDER:
        print_okblue("Creating peloton application: %s" % app)
        dynamic_env_master = {
            "PRODUCTION_CONFIG": self._get_base64_prod_config(
                app, peloton_apps_config),
            "APP": app,
            "ENVIRONMENT": "production",
            "ELECTION_ZK_SERVERS": virtual_zookeeper,
            "MESOS_ZK_PATH": "zk://%s/mesos" % virtual_zookeeper,
            "CASSANDRA_STORE": keyspace,
            "CASSANDRA_HOSTS": chost,
            "CASSANDRA_PORT": str(cport),
            "CONTAINER_LOGGER_LOGROTATE_STDERR_OPTIONS":
                "rotate %s" % num_logs,
        }
        # Propagate the agents' MESOS_WORK_DIR (if configured) so apps
        # can find the agent work directory.
        mesos_slave_config = self.config.get("mesos-slave", {})
        mesos_work_dir = [
            kv["value"]
            for kv in mesos_slave_config.get("static_env", [])
            if kv.get("name") == "MESOS_WORK_DIR"
        ]
        if mesos_work_dir:
            dynamic_env_master["MESOS_AGENT_WORK_DIR"] = mesos_work_dir[0]
        # hostmgr additionally needs the resource-type knobs from config.
        if app == "hostmgr":
            dynamic_env_master["SCARCE_RESOURCE_TYPES"] = ",".join(
                self.config.get("peloton").get(app).get(
                    "scarce_resource_types"))
            dynamic_env_master["SLACK_RESOURCE_TYPES"] = ",".join(
                self.config.get("peloton").get(app).get(
                    "slack_resource_types"))
            dynamic_env_master["ENABLE_REVOCABLE_RESOURCES"] = str(
                self.config.get("peloton").get(app).get(
                    "enable_revocable_resources"))
        # placement_stateless runs the placement binary with a dedicated
        # task type; only the env APP name changes, config keys keep the
        # original app name.
        if app == "placement_stateless":
            dynamic_env_master["APP"] = "placement"
            dynamic_env_master["TASK_TYPE"] = "STATELESS"
        peloton_app_count = int(
            self.config.get("peloton").get(app).get("instance_count"))
        self.vcluster_config["job_info"][app] = self.peloton.setup(
            dynamic_env_master,
            peloton_app_count,
            self.label_name + "_" + "peloton-" + app,
            version,
            peloton_image,
        )
    self.vcluster_config.update({"Peloton Version": version})
    # create a default resource pool
    if not skip_respool:
        create_respool_for_new_peloton(self.config,
                                       zk_server=virtual_zookeeper,
                                       agent_num=agent_num)
def start_peloton(self, virtual_zookeeper, agent_num, version=None,
                  skip_respool=False, peloton_image=None,
                  peloton_apps_config=None):
    """
    Bring up all Peloton applications on the virtual cluster.

    param virtual_zookeeper : The zk url and port
    param agent_num : The number of mesos agents to start
    param version : The peloton version
    param skip_respool : To skip creating the default respool or not
    param peloton_image : The docker image of peloton
    param peloton_apps_config : The path to the peloton apps configs
    type virtual_zookeeper : str
    type agent_num : int
    type version : str
    type skip_respool : bool
    type peloton_apps_config : str
    """
    # Setup Cassandra
    chost, cport, keyspace = self.start_cassandra()
    # Wait a little for cassandra to start-up and create keyspace.
    # TODO(amitbose) find a better way to wait
    time.sleep(20)

    # The image tag (when present) overrides the explicit version arg.
    if peloton_image:
        parts = peloton_image.split(':')
        if len(parts) > 1:
            version = parts[-1]

    # Setup Peloton
    print_okgreen('Step: Create Peloton, version: %s, image: %s' %
                  (version, peloton_image))
    num_logs = self.config.get('peloton').get(
        'num_log_files', DEFAULT_PELOTON_NUM_LOG_FILES)
    for app in self.APP_ORDER:
        print_okblue('Creating peloton application: %s' % app)
        # placement_[stateless|stateful] is the placement app with a
        # different name.
        # BUG FIX: the rename below used to happen before the
        # `placement_stateless` check, making that check unreachable so
        # TASK_TYPE was never set; keep the original name for the check.
        original_app = app
        if app.startswith('placement_'):
            app = 'placement'
        prod_config_path = \
            self._get_app_path(peloton_apps_config).format(app)
        with open(prod_config_path, "rb") as config_file:
            prod_config_base64 = base64.b64encode(config_file.read())
        dynamic_env_master = {
            "PRODUCTION_CONFIG": prod_config_base64,
            'APP': app,
            'ENVIRONMENT': 'production',
            'ELECTION_ZK_SERVERS': virtual_zookeeper,
            'MESOS_ZK_PATH': 'zk://%s/mesos' % virtual_zookeeper,
            'CASSANDRA_STORE': keyspace,
            'CASSANDRA_HOSTS': chost,
            'CASSANDRA_PORT': str(cport),
            'CONTAINER_LOGGER_LOGROTATE_STDERR_OPTIONS':
                'rotate %s' % num_logs,
        }
        # Propagate the agents' MESOS_WORK_DIR (if configured).
        mesos_slave_config = self.config.get('mesos-slave', {})
        mesos_work_dir = [
            kv['value']
            for kv in mesos_slave_config.get('static_env', [])
            if kv.get('name') == 'MESOS_WORK_DIR'
        ]
        if mesos_work_dir:
            dynamic_env_master['MESOS_AGENT_WORK_DIR'] = mesos_work_dir[0]
        # hostmgr additionally needs the resource-type knobs from config.
        if app == 'hostmgr':
            dynamic_env_master['SCARCE_RESOURCE_TYPES'] = ','.join(
                self.config.get('peloton').get(app).get(
                    'scarce_resource_types'))
            dynamic_env_master['SLACK_RESOURCE_TYPES'] = ','.join(
                self.config.get('peloton').get(app).get(
                    'slack_resource_types'))
            dynamic_env_master['ENABLE_REVOCABLE_RESOURCES'] = \
                str(self.config.get('peloton').get(app).get(
                    'enable_revocable_resources'))
        if original_app == "placement_stateless":
            # Stateless placement runs the placement binary with a
            # dedicated task type.
            dynamic_env_master['APP'] = 'placement'
            dynamic_env_master['TASK_TYPE'] = 'STATELESS'
        # NOTE(review): config lookups and the job_info key use the
        # renamed app, so placement_stateless/stateful share the
        # 'placement' entry — behavior preserved as-is; confirm intended.
        peloton_app_count = int(
            self.config.get('peloton').get(app).get('instance_count'))
        self.vcluster_config['job_info'][app] = (self.peloton.setup(
            dynamic_env_master,
            peloton_app_count,
            self.label_name + '_' + 'peloton-' + app,
            version,
            peloton_image,
        ))
    self.vcluster_config.update({
        'Peloton Version': version,
    })
    # create a default resource pool
    if not skip_respool:
        create_respool_for_new_peloton(
            self.config,
            zk_server=virtual_zookeeper,
            agent_num=agent_num,
        )