def run_cluster_diagnostics(self, cluster_id, output_directory):
    try:
        output = cluster_diagnostic_helper.run(self, cluster_id, output_directory)
        return output
    except batch_error.BatchErrorException as e:
        raise error.AztkError(helpers.format_batch_exception(e))

def cluster_copy(self, cluster_id: str, source_path: str, destination_path: str):
    try:
        return self.__cluster_copy(cluster_id, 'spark', source_path, destination_path)
    except batch_error.BatchErrorException as e:
        raise error.AztkError(helpers.format_batch_exception(e))

def delete_cluster(core_cluster_operations, cluster_id: str, keep_logs: bool = False):
    try:
        return core_cluster_operations.delete(cluster_id, keep_logs)
    except BatchErrorException as e:
        raise error.AztkError(helpers.format_batch_exception(e))

def create_cluster(self, cluster_conf: models.ClusterConfiguration, wait: bool = False):
    try:
        zip_resource_files = upload_node_scripts.zip_scripts(
            self.blob_client, cluster_conf.cluster_id, cluster_conf.custom_scripts,
            cluster_conf.spark_configuration, cluster_conf.user_configuration)

        start_task = create_cluster_helper.generate_cluster_start_task(
            self, zip_resource_files, cluster_conf.gpu_enabled, cluster_conf.docker_repo,
            cluster_conf.file_shares)

        software_metadata_key = "spark"

        vm_image = models.VmImage(publisher='Canonical', offer='UbuntuServer', sku='16.04')

        cluster = self.__create_pool_and_job(cluster_conf, software_metadata_key, start_task, vm_image)

        # Wait for the master to be ready
        if wait:
            util.wait_for_master_to_be_ready(self, cluster.id)
            cluster = self.get_cluster(cluster.id)

        return cluster

    except batch_error.BatchErrorException as e:
        raise error.AztkError(helpers.format_batch_exception(e))

def get_application_state(core_cluster_operations, cluster_id: str, app_name: str):
    try:
        return ApplicationState(core_cluster_operations.get_task_state(cluster_id, app_name).value)
    except BatchErrorException as e:
        raise error.AztkError(helpers.format_batch_exception(e))

def get_application(self, job_id, application_name):
    try:
        return models.Application(job_submit_helper.get_application(self, job_id, application_name))
    except batch_error.BatchErrorException as e:
        raise error.AztkError(helpers.format_batch_exception(e))

def get_recent_job(core_job_operations, id):
    try:
        job_schedule = core_job_operations.batch_client.job_schedule.get(id)
        return core_job_operations.batch_client.job.get(job_schedule.execution_info.recent_job.id)
    except BatchErrorException as e:
        raise error.AztkError(helpers.format_batch_exception(e))

def wait_until_application_done(self, cluster_id: str, task_id: str):
    try:
        helpers.wait_for_task_to_complete(job_id=cluster_id, task_id=task_id, batch_client=self.batch_client)
    except batch_error.BatchErrorException as e:
        raise error.AztkError(helpers.format_batch_exception(e))

def get_application_status(core_cluster_operations, cluster_id: str, app_name: str):
    try:
        task = core_cluster_operations.batch_client.task.get(cluster_id, app_name)
        return task.state._value_
    except batch_error.BatchErrorException as e:
        raise error.AztkError(helpers.format_batch_exception(e))

def wait_until_cluster_is_ready(self, cluster_id: str):
    try:
        util.wait_for_master_to_be_ready(self, cluster_id)
        pool = self.batch_client.pool.get(cluster_id)
        nodes = self.batch_client.compute_node.list(pool_id=cluster_id)
        return models.Cluster(pool, nodes)
    except batch_error.BatchErrorException as e:
        raise error.AztkError(helpers.format_batch_exception(e))

def list_jobs(self):
    try:
        return [
            models.Job(cloud_job_schedule) for cloud_job_schedule in job_submit_helper.list_jobs(self)
        ]
    except batch_error.BatchErrorException as e:
        raise error.AztkError(helpers.format_batch_exception(e))

def list_jobs(core_job_operations):
    try:
        return [
            models.Job(cloud_job_schedule) for cloud_job_schedule in _list_jobs(core_job_operations)
        ]
    except BatchErrorException as e:
        raise error.AztkError(helpers.format_batch_exception(e))

def list_clusters(self):
    try:
        return [
            models.Cluster(pool) for pool in self.__list_clusters(aztk.models.Software.spark)
        ]
    except batch_error.BatchErrorException as e:
        raise error.AztkError(helpers.format_batch_exception(e))

def get_job_application_log(core_job_operations, spark_job_operations, job_id, application_name):
    try:
        return models.ApplicationLog(
            _get_application_log(core_job_operations, spark_job_operations, job_id, application_name))
    except BatchErrorException as e:
        raise error.AztkError(helpers.format_batch_exception(e))

def submit_job(self, job_configuration):
    try:
        cluster_data = self._get_cluster_data(job_configuration.id)
        node_data = NodeData(job_configuration.as_cluster_config()).add_core().done()
        zip_resource_files = cluster_data.upload_node_data(node_data).to_resource_file()

        start_task = create_cluster_helper.generate_cluster_start_task(
            self,
            zip_resource_files,
            job_configuration.gpu_enabled,
            job_configuration.docker_repo,
            worker_on_master=job_configuration.worker_on_master)

        application_tasks = []
        for application in job_configuration.applications:
            application_tasks.append(
                (application, cluster_submit_helper.generate_task(self, job_configuration.id, application)))

        job_manager_task = job_submit_helper.generate_task(self, job_configuration, application_tasks)

        software_metadata_key = "spark"

        vm_image = models.VmImage(publisher='Canonical', offer='UbuntuServer', sku='16.04')

        if job_configuration.max_dedicated_nodes and not job_configuration.max_low_pri_nodes:
            autoscale_formula = "maxNumberofVMs = {0}; targetNumberofVMs = {1};" \
                                " $TargetDedicatedNodes=min(maxNumberofVMs, targetNumberofVMs)".format(
                                    job_configuration.max_dedicated_nodes, job_configuration.max_dedicated_nodes)
        elif job_configuration.max_low_pri_nodes and not job_configuration.max_dedicated_nodes:
            autoscale_formula = "maxNumberofVMs = {0}; targetNumberofVMs = {1};" \
                                " $TargetLowPriorityNodes=min(maxNumberofVMs, targetNumberofVMs)".format(
                                    job_configuration.max_low_pri_nodes, job_configuration.max_low_pri_nodes)
        else:
            raise error.AztkError("Jobs do not support both dedicated and low priority nodes." \
                                  " JobConfiguration fields max_dedicated_nodes and max_low_pri_nodes"
                                  " are mutually exclusive values.")

        job = self.__submit_job(
            job_configuration=job_configuration,
            start_task=start_task,
            job_manager_task=job_manager_task,
            autoscale_formula=autoscale_formula,
            software_metadata_key=software_metadata_key,
            vm_image_model=vm_image,
            application_metadata='\n'.join(
                application.name for application in (job_configuration.applications or [])))

        return models.Job(job)
    except batch_error.BatchErrorException as e:
        raise error.AztkError(helpers.format_batch_exception(e))

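# Illustrative note (not part of the source): what the autoscale formula built above renders
# to for a hypothetical JobConfiguration with max_dedicated_nodes=4 and max_low_pri_nodes unset.
# Azure Batch evaluates the resulting string on the pool; the intermediate variable names are
# defined inside the formula itself.
#
#   maxNumberofVMs = 4; targetNumberofVMs = 4;
#   $TargetDedicatedNodes=min(maxNumberofVMs, targetNumberofVMs)
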
def delete(core_job_operations, spark_job_operations, job_id: str, keep_logs: bool = False):
    try:
        return _delete(core_job_operations, spark_job_operations, job_id, keep_logs)
    except batch_error.BatchErrorException as e:
        raise error.AztkError(helpers.format_batch_exception(e))

def cluster_run(self, cluster_id: str, command: str, host=False, internal: bool = False, timeout=None):
    try:
        return self.__cluster_run(
            cluster_id, command, internal, container_name='spark' if not host else None, timeout=timeout)
    except batch_error.BatchErrorException as e:
        raise error.AztkError(helpers.format_batch_exception(e))

def create_user(self, cluster_id: str, username: str, password: str = None, ssh_key: str = None) -> str:
    try:
        cluster = self.get_cluster(cluster_id)
        master_node_id = cluster.master_node_id
        if not master_node_id:
            raise error.ClusterNotReadyError("The master has not yet been picked, a user cannot be added.")
        self.__create_user(cluster.id, master_node_id, username, password, ssh_key)
    except batch_error.BatchErrorException as e:
        raise error.AztkError(helpers.format_batch_exception(e))

def list_applications(core_job_operations, job_id):
    try:
        applications = _list_applications(core_job_operations, job_id)
        for item in applications:
            if applications[item]:
                applications[item] = models.Application(applications[item])
        return applications
    except batch_error.BatchErrorException as e:
        raise error.AztkError(helpers.format_batch_exception(e))

def submit(self, cluster_id: str, application: models.ApplicationConfiguration, wait: bool = False):
    try:
        cluster_submit_helper.submit_application(self, cluster_id, application, wait)
    except batch_error.BatchErrorException as e:
        raise error.AztkError(helpers.format_batch_exception(e))

def list_clusters(core_cluster_operations):
    try:
        software_metadata_key = base_models.Software.spark
        return [
            models.Cluster(cluster) for cluster in core_cluster_operations.list(software_metadata_key)
        ]
    except batch_error.BatchErrorException as e:
        raise error.AztkError(helpers.format_batch_exception(e))

def list_applications(self, job_id):
    try:
        applications = job_submit_helper.list_applications(self, job_id)
        for item in applications:
            if applications[item]:
                applications[item] = models.Application(applications[item])
        return applications
    except batch_error.BatchErrorException as e:
        raise error.AztkError(helpers.format_batch_exception(e))

def submit_job(self, job_configuration: models.JobConfiguration):
    try:
        job_configuration = _apply_default_for_job_config(job_configuration)
        job_configuration.validate()
        cluster_data = self._get_cluster_data(job_configuration.id)
        node_data = NodeData(job_configuration.to_cluster_config()).add_core().done()
        zip_resource_files = cluster_data.upload_node_data(node_data).to_resource_file()

        start_task = create_cluster_helper.generate_cluster_start_task(
            self,
            zip_resource_files,
            job_configuration.id,
            job_configuration.gpu_enabled,
            job_configuration.get_docker_repo(),
            mixed_mode=job_configuration.mixed_mode(),
            worker_on_master=job_configuration.worker_on_master)

        application_tasks = []
        for application in job_configuration.applications:
            application_tasks.append(
                (application, cluster_submit_helper.generate_task(self, job_configuration.id, application)))

        job_manager_task = job_submit_helper.generate_task(self, job_configuration, application_tasks)

        software_metadata_key = "spark"

        vm_image = models.VmImage(publisher='Canonical', offer='UbuntuServer', sku='16.04')

        autoscale_formula = "$TargetDedicatedNodes = {0}; " \
                            "$TargetLowPriorityNodes = {1}".format(
                                job_configuration.max_dedicated_nodes, job_configuration.max_low_pri_nodes)

        job = self.__submit_job(
            job_configuration=job_configuration,
            start_task=start_task,
            job_manager_task=job_manager_task,
            autoscale_formula=autoscale_formula,
            software_metadata_key=software_metadata_key,
            vm_image_model=vm_image,
            application_metadata='\n'.join(
                application.name for application in (job_configuration.applications or [])))

        return models.Job(job)
    except batch_error.BatchErrorException as e:
        raise error.AztkError(helpers.format_batch_exception(e))

def create_cluster(core_cluster_operations,
                   spark_cluster_operations,
                   cluster_conf: models.ClusterConfiguration,
                   vm_image: base_models.VmImage,
                   wait: bool = False):
    """
    Create a new aztk spark cluster

    Args:
        cluster_conf(aztk.spark.models.models.ClusterConfiguration): Configuration for the cluster to be created
        wait(bool): If you should wait for the cluster to be ready before returning
        vm_image: models for cluster vm

    Returns:
        :obj:`aztk.spark.models.Cluster`
    """
    cluster_conf = _apply_default_for_cluster_config(cluster_conf)
    cluster_conf.validate()

    cluster_data = core_cluster_operations.get_cluster_data(cluster_conf.cluster_id)
    try:
        zip_resource_files = None
        node_data = NodeData(cluster_conf).add_core().done()
        zip_resource_files = cluster_data.upload_node_data(node_data).to_resource_file()

        start_task = spark_cluster_operations._generate_cluster_start_task(
            core_cluster_operations,
            zip_resource_files,
            cluster_conf.cluster_id,
            cluster_conf.gpu_enabled(),
            cluster_conf.get_docker_repo(),
            cluster_conf.get_docker_run_options(),
            cluster_conf.file_shares,
            cluster_conf.mixed_mode(),
            cluster_conf.worker_on_master,
        )

        software_metadata_key = base_models.Software.spark

        cluster = core_cluster_operations.create(cluster_conf, software_metadata_key, start_task, vm_image)

        # Wait for the master to be ready
        if wait:
            util.wait_for_master_to_be_ready(core_cluster_operations, spark_cluster_operations, cluster.id)
            cluster = spark_cluster_operations.get(cluster.id)

        return cluster

    except BatchErrorException as e:
        raise error.AztkError(helpers.format_batch_exception(e))

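# Minimal usage sketch (not part of the library source): one way create_cluster above might be
# called. The ClusterConfiguration field values and the VmImage arguments shown are assumptions
# for illustration; only the function signature and the wait flag come from the code above.
#
# cluster_conf = models.ClusterConfiguration(
#     cluster_id="example-cluster",      # hypothetical cluster id
#     size=2,                            # hypothetical node count
#     vm_size="standard_f2")             # hypothetical VM size
# cluster = create_cluster(
#     core_cluster_operations,
#     spark_cluster_operations,
#     cluster_conf,
#     vm_image=base_models.VmImage(publisher="Canonical", offer="UbuntuServer", sku="16.04"),
#     wait=True)
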
def get_application_log(base_operations, cluster_id: str, application_name: str, tail=False,
                        current_bytes: int = 0):
    try:
        return get_log(base_operations, cluster_id, application_name, tail, current_bytes)
    except BatchErrorException as e:
        raise error.AztkError(helpers.format_batch_exception(e))

def list_applications(core_operations, cluster_id):
    try:
        scheduling_target = core_operations.get_cluster_configuration(cluster_id).scheduling_target
        if scheduling_target is not SchedulingTarget.Any:
            tasks = core_operations.list_task_table_entries(cluster_id)
        else:
            tasks = core_operations.list_batch_tasks(cluster_id)
        return [Application(task) for task in tasks]
    except BatchErrorException as e:
        raise error.AztkError(helpers.format_batch_exception(e))

def run_cluster_diagnostics(spark_cluster_operations, cluster_id, output_directory=None, brief=False):
    try:
        output = _run(spark_cluster_operations, cluster_id, output_directory, brief)
        return output
    except BatchErrorException as e:
        raise error.AztkError(helpers.format_batch_exception(e))

def submit(core_cluster_operations,
           spark_cluster_operations,
           cluster_id: str,
           application: models.ApplicationConfiguration,
           remote: bool = False,
           wait: bool = False):
    try:
        submit_application(core_cluster_operations, spark_cluster_operations, cluster_id, application, remote, wait)
    except batch_error.BatchErrorException as e:
        raise error.AztkError(helpers.format_batch_exception(e))

def submit_job(core_job_operations,
               spark_job_operations,
               job_configuration: models.JobConfiguration,
               wait: bool = False):
    try:
        job_configuration = _apply_default_for_job_config(job_configuration)
        job_configuration.validate()
        cluster_data = core_job_operations.get_cluster_data(job_configuration.id)
        node_data = NodeData(job_configuration.to_cluster_config()).add_core().done()
        zip_resource_files = cluster_data.upload_node_data(node_data).to_resource_file()

        start_task = spark_job_operations._generate_cluster_start_task(
            core_job_operations,
            zip_resource_files,
            job_configuration.id,
            job_configuration.gpu_enabled,
            job_configuration.get_docker_repo(),
            job_configuration.get_docker_run_options(),
            mixed_mode=job_configuration.mixed_mode(),
            worker_on_master=job_configuration.worker_on_master,
        )

        application_tasks = []
        for application in job_configuration.applications:
            application_tasks.append((
                application,
                spark_job_operations._generate_application_task(core_job_operations, job_configuration.id,
                                                                application),
            ))

        job_manager_task = generate_job_manager_task(core_job_operations, job_configuration, application_tasks)

        software_metadata_key = base_models.Software.spark

        vm_image = models.VmImage(publisher="Canonical", offer="UbuntuServer", sku="16.04")

        autoscale_formula = "$TargetDedicatedNodes = {0}; " "$TargetLowPriorityNodes = {1}".format(
            job_configuration.max_dedicated_nodes, job_configuration.max_low_pri_nodes)

        job = core_job_operations.submit(
            job_configuration=job_configuration,
            start_task=start_task,
            job_manager_task=job_manager_task,
            autoscale_formula=autoscale_formula,
            software_metadata_key=software_metadata_key,
            vm_image_model=vm_image,
            application_metadata="\n".join(application.name
                                           for application in (job_configuration.applications or [])),
        )

        if wait:
            spark_job_operations.wait(id=job_configuration.id)

        return models.Job(job)
    except batch_error.BatchErrorException as e:
        raise error.AztkError(helpers.format_batch_exception(e))

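# Hedged usage sketch (not from the source): one way submit_job above might be invoked. The
# JobConfiguration field values and `app_conf` are assumptions for illustration; only the
# (core_job_operations, spark_job_operations, job_configuration, wait) signature comes from
# the function itself.
#
# job_conf = models.JobConfiguration(
#     id="example-job",                  # hypothetical job id
#     applications=[app_conf],           # ApplicationConfiguration objects to run
#     vm_size="standard_f2",             # hypothetical VM size
#     max_dedicated_nodes=3)
# job = submit_job(core_job_operations, spark_job_operations, job_conf, wait=True)
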
def get_application_log(self, cluster_id: str, application_name: str, tail=False, current_bytes: int = 0):
    try:
        return get_log_helper.get_log(self.batch_client, self.blob_client, cluster_id, application_name, tail,
                                      current_bytes)
    except batch_error.BatchErrorException as e:
        raise error.AztkError(helpers.format_batch_exception(e))