def delete(core_job_operations, spark_job_operations, job_id: str, keep_logs: bool = False):
    """Run ``_delete`` for ``job_id`` and translate Batch failures.

    Args:
        core_job_operations: forwarded unchanged to ``_delete``.
        spark_job_operations: forwarded unchanged to ``_delete``.
        job_id: identifier forwarded to ``_delete``.
        keep_logs: forwarded to ``_delete``.

    Returns:
        Whatever ``_delete`` returns.

    Raises:
        error.AztkError: when ``_delete`` raises a ``BatchErrorException``.
    """
    try:
        outcome = _delete(core_job_operations, spark_job_operations, job_id, keep_logs)
    except batch_error.BatchErrorException as batch_exc:
        message = helpers.format_batch_exception(batch_exc)
        raise error.AztkError(message)
    return outcome
def create_user(self, cluster_id: str, username: str, password: str = None, ssh_key: str = None) -> str:
    """Create a user on the master node of the given cluster.

    The cluster is looked up through ``self.get_cluster``; the user is then
    created on the node named by the cluster's ``master_node_id``.

    Args:
        cluster_id: id passed to ``self.get_cluster``.
        username: name of the user to create.
        password: optional password forwarded to the private helper.
        ssh_key: optional SSH key forwarded to the private helper.

    Raises:
        error.ClusterNotReadyError: when the cluster has no master node id yet.
        error.AztkError: when a ``BatchErrorException`` is raised.

    Note:
        No value is returned explicitly despite the ``-> str`` annotation.
    """
    try:
        cluster = self.get_cluster(cluster_id)
        master_id = cluster.master_node_id
        if master_id:
            self.__create_user(cluster.id, master_id, username, password, ssh_key)
        else:
            raise error.ClusterNotReadyError("The master has not yet been picked, a user cannot be added.")
    except batch_error.BatchErrorException as batch_exc:
        message = helpers.format_batch_exception(batch_exc)
        raise error.AztkError(message)
def list_applications(self, job_id):
    """List the applications of a job, wrapping each available entry.

    The mapping returned by ``job_submit_helper.list_applications`` is
    updated in place: every truthy value is replaced by a
    ``models.Application`` built from it; falsy values are left untouched.

    Returns:
        The same mapping object obtained from the helper.

    Raises:
        error.AztkError: when a ``BatchErrorException`` is raised.
    """
    try:
        applications = job_submit_helper.list_applications(self, job_id)
        for name in applications:
            raw_entry = applications[name]
            if raw_entry:
                applications[name] = models.Application(raw_entry)
        return applications
    except batch_error.BatchErrorException as batch_exc:
        message = helpers.format_batch_exception(batch_exc)
        raise error.AztkError(message)
def cluster_run(self, cluster_id: str, command: str, host=False, internal: bool = False, timeout=None):
    """Run ``command`` on a cluster via the private ``__cluster_run`` helper.

    Args:
        cluster_id: id of the target cluster.
        command: command string forwarded to the helper.
        host: when truthy, ``container_name`` is passed as ``None``;
            otherwise it is ``'spark'``.
        internal: forwarded positionally to the helper.
        timeout: forwarded as the ``timeout`` keyword.

    Returns:
        Whatever the private helper returns.

    Raises:
        error.AztkError: when a ``BatchErrorException`` is raised.
    """
    target_container = None if host else 'spark'
    try:
        return self.__cluster_run(
            cluster_id,
            command,
            internal,
            container_name=target_container,
            timeout=timeout,
        )
    except batch_error.BatchErrorException as batch_exc:
        message = helpers.format_batch_exception(batch_exc)
        raise error.AztkError(message)
def submit_job(self, job_configuration: models.JobConfiguration):
    """Submit a job built from ``job_configuration``.

    Steps, in order: apply defaults, validate, upload the node data, build
    the cluster start task, build one task per application, build the job
    manager task, and finally submit everything through the private
    ``__submit_job`` helper.

    Args:
        job_configuration: the job description; defaults are merged into it
            via ``_apply_default_for_job_config`` before validation.

    Returns:
        ``models.Job`` wrapping the value returned by ``__submit_job``.

    Raises:
        error.AztkError: when a ``BatchErrorException`` is raised.
    """
    try:
        job_configuration = _apply_default_for_job_config(job_configuration)
        job_configuration.validate()

        cluster_data = self._get_cluster_data(job_configuration.id)
        node_data = NodeData(job_configuration.to_cluster_config()).add_core().done()
        zip_resource_files = cluster_data.upload_node_data(node_data).to_resource_file()

        start_task = create_cluster_helper.generate_cluster_start_task(
            self,
            zip_resource_files,
            job_configuration.id,
            job_configuration.gpu_enabled,
            job_configuration.get_docker_repo(),
            mixed_mode=job_configuration.mixed_mode(),
            worker_on_master=job_configuration.worker_on_master)

        # Normalise once: the metadata join below already tolerated a missing
        # application list, but the task loop used to raise TypeError on None.
        applications = job_configuration.applications or []
        application_tasks = [
            (application, cluster_submit_helper.generate_task(self, job_configuration.id, application))
            for application in applications
        ]

        job_manager_task = job_submit_helper.generate_task(self, job_configuration, application_tasks)

        software_metadata_key = "spark"
        vm_image = models.VmImage(publisher='Canonical', offer='UbuntuServer', sku='16.04')

        autoscale_formula = (
            "$TargetDedicatedNodes = {0}; "
            "$TargetLowPriorityNodes = {1}"
        ).format(job_configuration.max_dedicated_nodes, job_configuration.max_low_pri_nodes)

        job = self.__submit_job(
            job_configuration=job_configuration,
            start_task=start_task,
            job_manager_task=job_manager_task,
            autoscale_formula=autoscale_formula,
            software_metadata_key=software_metadata_key,
            vm_image_model=vm_image,
            application_metadata='\n'.join(application.name for application in applications))
        return models.Job(job)
    except batch_error.BatchErrorException as e:
        raise error.AztkError(helpers.format_batch_exception(e))
def create_cluster(core_cluster_operations,
                   spark_cluster_operations,
                   cluster_conf: models.ClusterConfiguration,
                   vm_image: base_models.VmImage,
                   wait: bool = False):
    """
    Create a new aztk spark cluster

    Args:
        cluster_conf(aztk.spark.models.models.ClusterConfiguration): Configuration for the cluster to be created
        vm_image: model describing the VM image used for the cluster
        wait(bool): If you should wait for the cluster to be ready before returning

    Returns:
        :obj:`aztk.spark.models.Cluster`

    Raises:
        error.AztkError: when a ``BatchErrorException`` is raised after validation.
    """
    # Defaults, validation and the cluster-data lookup run outside the
    # Batch error translation below.
    cluster_conf = _apply_default_for_cluster_config(cluster_conf)
    cluster_conf.validate()
    cluster_data = core_cluster_operations.get_cluster_data(cluster_conf.cluster_id)

    try:
        node_data = NodeData(cluster_conf).add_core().done()
        uploaded = cluster_data.upload_node_data(node_data)
        zip_resource_files = uploaded.to_resource_file()

        start_task = spark_cluster_operations._generate_cluster_start_task(
            core_cluster_operations,
            zip_resource_files,
            cluster_conf.cluster_id,
            cluster_conf.gpu_enabled(),
            cluster_conf.get_docker_repo(),
            cluster_conf.get_docker_run_options(),
            cluster_conf.file_shares,
            cluster_conf.mixed_mode(),
            cluster_conf.worker_on_master,
        )

        cluster = core_cluster_operations.create(cluster_conf, base_models.Software.spark, start_task, vm_image)

        if not wait:
            return cluster

        # Wait for the master to be ready, then re-read the cluster.
        util.wait_for_master_to_be_ready(core_cluster_operations, spark_cluster_operations, cluster.id)
        return spark_cluster_operations.get(cluster.id)
    except BatchErrorException as batch_exc:
        message = helpers.format_batch_exception(batch_exc)
        raise error.AztkError(message)
def list_applications(core_operations, cluster_id):
    """Return the cluster's tasks wrapped as ``Application`` objects.

    The task source depends on the cluster's scheduling target: Batch tasks
    when it is ``SchedulingTarget.Any``, task-table entries otherwise.

    Raises:
        error.AztkError: when a ``BatchErrorException`` is raised.
    """
    try:
        configuration = core_operations.get_cluster_configuration(cluster_id)
        if configuration.scheduling_target is SchedulingTarget.Any:
            tasks = core_operations.list_batch_tasks(cluster_id)
        else:
            tasks = core_operations.list_task_table_entries(cluster_id)

        wrapped = []
        for task in tasks:
            wrapped.append(Application(task))
        return wrapped
    except BatchErrorException as batch_exc:
        message = helpers.format_batch_exception(batch_exc)
        raise error.AztkError(message)
def _download_resource_file(task_id, resource_file):
    """Stream a resource file's blob into the task working directory.

    The destination is ``resource_file.file_path`` joined onto the directory
    named by the ``AZ_BATCH_TASK_WORKING_DIR`` environment variable.

    Args:
        task_id: unused by this function; kept for interface compatibility.
        resource_file: object exposing ``blob_source`` (download URL) and
            ``file_path`` (relative destination path).

    Returns:
        None.

    Raises:
        error.AztkError: when ``resource_file.file_path`` is not set.
    """
    # Validate before touching the network: there is nowhere to write the
    # payload without a file_path, so the request would be wasted.
    if not resource_file.file_path:
        raise error.AztkError("ResourceFile file_path not set.")

    response = http_request_wrapper(requests.get, url=resource_file.blob_source, timeout=None, stream=True)
    try:
        write_path = os.path.join(os.environ.get("AZ_BATCH_TASK_WORKING_DIR"), resource_file.file_path)
        with open(write_path, 'wb') as stream:
            # 16 MiB chunks keep memory bounded for large blobs.
            for chunk in response.iter_content(chunk_size=16777216):
                stream.write(chunk)
    finally:
        # A streamed response holds its connection until it is closed.
        response.close()
    return None
def get_application(spark_client, job_id, application_name):
    """Fetch the Batch task for ``application_name`` from the most recent job run.

    Raises:
        error.AztkError: when the task lookup raises a ``BatchErrorException``.
    """
    # info about the app
    latest_job = __get_recent_job(spark_client, job_id)
    task_operations = spark_client.batch_client.task
    try:
        return task_operations.get(job_id=latest_job.id, task_id=application_name)
    except batch_models.batch_error.BatchErrorException:
        message = "The Spark application {0} is still being provisioned or does not exist.".format(application_name)
        raise error.AztkError(message)
def run_cluster_diagnostics(spark_cluster_operations, cluster_id, output_directory=None, brief=False):
    """Call ``_run`` with the given arguments and translate Batch failures.

    Returns:
        Whatever ``_run`` returns.

    Raises:
        error.AztkError: when a ``BatchErrorException`` is raised.
    """
    try:
        return _run(spark_cluster_operations, cluster_id, output_directory, brief)
    except BatchErrorException as batch_exc:
        message = helpers.format_batch_exception(batch_exc)
        raise error.AztkError(message)
def submit(core_cluster_operations,
           spark_cluster_operations,
           cluster_id: str,
           application: models.ApplicationConfiguration,
           remote: bool = False,
           wait: bool = False):
    """Submit ``application`` to a cluster through ``submit_application``.

    All arguments are forwarded positionally and unchanged. Nothing is
    returned.

    Raises:
        error.AztkError: when a ``BatchErrorException`` is raised.
    """
    forwarded = (core_cluster_operations, spark_cluster_operations, cluster_id, application, remote, wait)
    try:
        submit_application(*forwarded)
    except batch_error.BatchErrorException as batch_exc:
        message = helpers.format_batch_exception(batch_exc)
        raise error.AztkError(message)
def get_application_log(base_operations, cluster_id: str, application_name: str, tail=False, current_bytes: int = 0):
    """Return the result of ``get_log`` for an application on a cluster.

    All arguments are forwarded positionally and unchanged to ``get_log``.

    Raises:
        error.AztkError: when a ``BatchErrorException`` is raised.
    """
    try:
        log = get_log(base_operations, cluster_id, application_name, tail, current_bytes)
    except BatchErrorException as batch_exc:
        message = helpers.format_batch_exception(batch_exc)
        raise error.AztkError(message)
    return log
def submit_job(core_job_operations,
               spark_job_operations,
               job_configuration: models.JobConfiguration,
               wait: bool = False):
    """Submit a job built from ``job_configuration``.

    Steps, in order: apply defaults, validate, upload the node data, build
    the cluster start task, build one task per application, build the job
    manager task, submit through ``core_job_operations.submit`` and, when
    requested, wait on the job.

    Args:
        core_job_operations: provides cluster data access and ``submit``.
        spark_job_operations: provides task generation and ``wait``.
        job_configuration: the job description; defaults are merged in via
            ``_apply_default_for_job_config`` before validation.
        wait: when true, ``spark_job_operations.wait`` is called with the
            job id before returning.

    Returns:
        ``models.Job`` wrapping the value returned by ``submit``.

    Raises:
        error.AztkError: when a ``BatchErrorException`` is raised.
    """
    try:
        job_configuration = _apply_default_for_job_config(job_configuration)
        job_configuration.validate()

        cluster_data = core_job_operations.get_cluster_data(job_configuration.id)
        node_data = NodeData(job_configuration.to_cluster_config()).add_core().done()
        zip_resource_files = cluster_data.upload_node_data(node_data).to_resource_file()

        start_task = spark_job_operations._generate_cluster_start_task(
            core_job_operations,
            zip_resource_files,
            job_configuration.id,
            job_configuration.gpu_enabled,
            job_configuration.get_docker_repo(),
            job_configuration.get_docker_run_options(),
            mixed_mode=job_configuration.mixed_mode(),
            worker_on_master=job_configuration.worker_on_master,
        )

        # Normalise once: the metadata join below already tolerated a missing
        # application list, but the task loop used to raise TypeError on None.
        applications = job_configuration.applications or []
        application_tasks = [
            (
                application,
                spark_job_operations._generate_application_task(core_job_operations, job_configuration.id, application),
            )
            for application in applications
        ]

        job_manager_task = generate_job_manager_task(core_job_operations, job_configuration, application_tasks)

        software_metadata_key = base_models.Software.spark
        vm_image = models.VmImage(publisher="Canonical", offer="UbuntuServer", sku="16.04")

        autoscale_formula = (
            "$TargetDedicatedNodes = {0}; "
            "$TargetLowPriorityNodes = {1}"
        ).format(job_configuration.max_dedicated_nodes, job_configuration.max_low_pri_nodes)

        job = core_job_operations.submit(
            job_configuration=job_configuration,
            start_task=start_task,
            job_manager_task=job_manager_task,
            autoscale_formula=autoscale_formula,
            software_metadata_key=software_metadata_key,
            vm_image_model=vm_image,
            application_metadata="\n".join(application.name for application in applications),
        )

        if wait:
            spark_job_operations.wait(id=job_configuration.id)

        return models.Job(job)
    except batch_error.BatchErrorException as e:
        raise error.AztkError(helpers.format_batch_exception(e))
def cluster_run(core_cluster_operations,
                cluster_id: str,
                command: str,
                host=False,
                internal: bool = False,
                timeout=None):
    """Run ``command`` on a cluster through ``core_cluster_operations.run``.

    Args:
        core_cluster_operations: object whose ``run`` method does the work.
        cluster_id: id of the target cluster.
        command: command string forwarded to ``run``.
        host: when truthy, ``container_name`` is passed as ``None``;
            otherwise it is ``"spark"``.
        internal: forwarded positionally to ``run``.
        timeout: forwarded as the ``timeout`` keyword.

    Returns:
        Whatever ``run`` returns.

    Raises:
        error.AztkError: when a ``BatchErrorException`` is raised.
    """
    target_container = None if host else "spark"
    try:
        return core_cluster_operations.run(
            cluster_id,
            command,
            internal,
            container_name=target_container,
            timeout=timeout,
        )
    except BatchErrorException as batch_exc:
        message = helpers.format_batch_exception(batch_exc)
        raise error.AztkError(message)
def get_application_log(self, cluster_id: str, application_name: str, tail=False, current_bytes: int = 0):
    """Return the result of ``get_log_helper.get_log`` for an application.

    The helper receives this client's ``batch_client`` and ``blob_client``
    followed by the remaining arguments, all positionally.

    Raises:
        error.AztkError: when a ``BatchErrorException`` is raised.
    """
    try:
        log = get_log_helper.get_log(
            self.batch_client,
            self.blob_client,
            cluster_id,
            application_name,
            tail,
            current_bytes,
        )
    except batch_error.BatchErrorException as batch_exc:
        message = helpers.format_batch_exception(batch_exc)
        raise error.AztkError(message)
    return log
def create_user(self, cluster_id: str, username: str, password: str = None, ssh_key: str = None) -> str:
    """Create a user on the master node of the given cluster.

    The cluster is looked up through ``self.get_cluster``; the user is then
    created on the node named by the cluster's ``master_node_id``.

    Args:
        cluster_id: id passed to ``self.get_cluster``.
        username: name of the user to create.
        password: optional password forwarded to the private helper.
        ssh_key: optional SSH key forwarded to the private helper.

    Raises:
        error.ClusterNotReadyError: when the cluster has no master node id yet.
        error.AztkError: when a ``BatchErrorException`` is raised.

    Note:
        No value is returned explicitly despite the ``-> str`` annotation.
    """
    try:
        cluster = self.get_cluster(cluster_id)
        master_node_id = cluster.master_node_id
        # Fail clearly instead of asking the helper to create a user on a
        # node id of None when no master has been elected yet.
        if not master_node_id:
            raise error.ClusterNotReadyError("The master has not yet been picked, a user cannot be added.")
        self.__create_user(cluster.id, master_node_id, username, password, ssh_key)
    except batch_error.BatchErrorException as e:
        raise error.AztkError(helpers.format_batch_exception(e))
def _get_application_log(core_job_operations, spark_job_operations, job_id, application_name):
    """Return the log of a finished application of a job.

    The application's Batch task is looked up in the most recent job run.
    If the lookup fails, the job's application listing decides which error
    is raised. If the task exists but is still active, running or
    preparing, an error is raised instead of returning a log.

    Raises:
        error.AztkError: when the application has not been created, does not
            exist, or has not finished executing.
    """
    # TODO: change where the logs are uploaded so they aren't overwritten on scheduled runs
    #           current: job_id, application_name/output.log
    #           new: job_id, recent_run_job.id/application_name/output.log
    latest_job = get_recent_job(core_job_operations, job_id)
    try:
        task = core_job_operations.batch_client.task.get(job_id=latest_job.id, task_id=application_name)
    except batch_models.batch_error.BatchErrorException:
        # see if the application is written to metadata of pool
        known_applications = spark_job_operations.list_applications(job_id)
        for name in known_applications:
            if known_applications[name] is None and name == application_name:
                raise error.AztkError("The application {0} has not yet been created.".format(name))
        raise error.AztkError("The application {0} does not exist".format(application_name))

    unfinished_states = (
        batch_models.TaskState.active,
        batch_models.TaskState.running,
        batch_models.TaskState.preparing,
    )
    if task.state in unfinished_states:
        raise error.AztkError("The application {0} has not yet finished executing.".format(application_name))

    return core_job_operations.get_application_log(job_id, application_name)
def cluster_download(self,
                     cluster_id: str,
                     source_path: str,
                     destination_path: str,
                     host: bool = False,
                     internal: bool = False,
                     timeout=None):
    """Copy files from a cluster via the private ``__cluster_copy`` helper.

    The helper is always called with ``get=True``.

    Args:
        cluster_id: id of the cluster to copy from.
        source_path: forwarded positionally to the helper.
        destination_path: forwarded positionally to the helper.
        host: when truthy, ``container_name`` is passed as ``None``;
            otherwise it is ``'spark'``.
        internal: forwarded as the ``internal`` keyword.
        timeout: forwarded as the ``timeout`` keyword.

    Returns:
        Whatever the private helper returns.

    Raises:
        error.AztkError: when a ``BatchErrorException`` is raised.
    """
    try:
        if host:
            container_name = None
        else:
            container_name = 'spark'
        return self.__cluster_copy(
            cluster_id,
            source_path,
            destination_path,
            container_name=container_name,
            get=True,
            internal=internal,
            timeout=timeout,
        )
    except batch_error.BatchErrorException as batch_exc:
        message = helpers.format_batch_exception(batch_exc)
        raise error.AztkError(message)
def _get_application_log(core_job_operations, spark_job_operations, job_id, application_name):
    """Return the log of an application of a job.

    When the job's scheduling target is not ``SchedulingTarget.Any`` the log
    is fetched directly. Otherwise the application's Batch task is looked up
    in the most recent job run first: a failed lookup or a task that is
    still active, running or preparing results in an error.

    Raises:
        error.AztkError: when the application has not been created, does not
            exist, or has not finished executing.
    """
    configuration = core_job_operations.get_cluster_configuration(job_id)
    if configuration.scheduling_target is not models.SchedulingTarget.Any:
        return core_job_operations.get_application_log(job_id, application_name)

    # TODO: change where the logs are uploaded so they aren't overwritten on scheduled runs
    #           current: job_id, application_name/output.log
    #           new: job_id, recent_run_job.id/application_name/output.log
    latest_job = core_job_operations.get_recent_job(job_id)
    try:
        task = core_job_operations.get_batch_task(id=latest_job.id, task_id=application_name)
    except batch_models.BatchErrorException:
        # task may not exist since it may not yet be scheduled
        # see if the task is written to metadata of pool
        known_applications = spark_job_operations.list_applications(job_id)
        for name in known_applications:
            if known_applications[name] is None and name == application_name:
                raise error.AztkError("The application {0} has not yet been created.".format(name))
        raise error.AztkError("The application {0} does not exist".format(application_name))

    unfinished_states = (
        batch_models.TaskState.active,
        batch_models.TaskState.running,
        batch_models.TaskState.preparing,
    )
    if task.state in unfinished_states:
        raise error.AztkError("The application {0} has not yet finished executing.".format(application_name))

    return core_job_operations.get_application_log(job_id, application_name)
def cluster_ssh_into_master(self,
                            cluster_id,
                            node_id,
                            username,
                            ssh_key=None,
                            password=None,
                            port_forward_list=None,
                            internal=False):
    """Open an SSH session via the private ``__ssh_into_node`` helper.

    All arguments are forwarded positionally and unchanged. Nothing is
    returned.

    Raises:
        error.AztkError: when a ``BatchErrorException`` is raised.
    """
    connection_args = (cluster_id, node_id, username, ssh_key, password, port_forward_list, internal)
    try:
        self.__ssh_into_node(*connection_args)
    except batch_error.BatchErrorException as batch_exc:
        message = helpers.format_batch_exception(batch_exc)
        raise error.AztkError(message)
def validate(self) -> bool:
    """
    Validate the config at its current state.

    Checks, in order: an id is set, at least one of the two node counts is
    non-zero, a vm_size is set, and a subnet_id is present when mixed mode
    is in use.

    Raises:
        error.AztkError: on the first check that fails.

    Note:
        No value is returned explicitly despite the ``-> bool`` annotation.
    """
    if self.id is None:
        raise error.AztkError("Please supply an ID for the Job in your configuration.")

    no_nodes_requested = self.max_dedicated_nodes == 0 and self.max_low_pri_nodes == 0
    if no_nodes_requested:
        raise error.AztkError(
            "Please supply a valid (greater than 0) value for either max_dedicated_nodes or max_low_pri_nodes in your configuration."
        )

    if self.vm_size is None:
        raise error.AztkError("Please supply a vm_size in your configuration.")

    mixed_without_subnet = self.mixed_mode() and not self.subnet_id
    if mixed_without_subnet:
        raise error.AztkError(
            "You must configure a VNET to use AZTK in mixed mode (dedicated and low priority nodes) and pass the subnet_id in your configuration.."
        )
def get_task_state(core_cluster_operations, cluster_id: str, task_id: str):
    """Return the ``state`` of a task on a cluster.

    The task is read from Batch when the cluster's scheduling target is
    ``SchedulingTarget.Any``, and from the task table otherwise.

    Raises:
        error.AztkError: when a ``BatchErrorException`` is raised.
    """
    try:
        configuration = core_cluster_operations.get_cluster_configuration(cluster_id)
        if configuration.scheduling_target is SchedulingTarget.Any:
            task = core_cluster_operations.get_batch_task(cluster_id, task_id)
        else:
            task = core_cluster_operations.get_task_from_table(cluster_id, task_id)
        return task.state
    except BatchErrorException as batch_exc:
        message = helpers.format_batch_exception(batch_exc)
        raise error.AztkError(message)
def _get_application(core_operations, job_id, application_name):
    """Fetch the task that represents ``application_name`` for a job.

    The task comes from the task table when the job's scheduling target is
    not ``SchedulingTarget.Any``; otherwise it is the Batch task of the most
    recent job run.

    Raises:
        error.AztkError: when the Batch task lookup raises a
            ``BatchErrorException``.
    """
    # info about the app
    # The recent job is resolved first, before the scheduling target is read.
    latest_job = core_operations.get_recent_job(job_id)
    configuration = core_operations.get_cluster_configuration(job_id)
    if configuration.scheduling_target is not models.SchedulingTarget.Any:
        return core_operations.get_task_from_table(job_id, application_name)

    try:
        return core_operations.get_batch_task(id=latest_job.id, task_id=application_name)
    except batch_models.BatchErrorException:
        message = "The Spark application {0} is still being provisioned or does not exist.".format(application_name)
        raise error.AztkError(message)
def create_cluster(self, configuration: models.ClusterConfiguration, wait: bool = False):
    """
    Create a new aztk spark cluster

    Args:
        configuration(aztk.spark.models.models.ClusterConfiguration): Configuration for the cluster to be created;
            it is merged on top of DEFAULT_CLUSTER_CONFIG before validation
        wait(bool): If you should wait for the cluster to be ready before returning

    Returns:
        aztk.spark.models.Cluster

    Raises:
        error.AztkError: when a ``BatchErrorException`` is raised after validation.
    """
    # Start from an empty configuration, layer the defaults, then the
    # caller's values on top.
    cluster_conf = models.ClusterConfiguration()
    for layer in (DEFAULT_CLUSTER_CONFIG, configuration):
        cluster_conf.merge(layer)
    cluster_conf.validate()

    cluster_data = self._get_cluster_data(cluster_conf.cluster_id)

    try:
        node_data = NodeData(cluster_conf).add_core().done()
        uploaded = cluster_data.upload_node_data(node_data)
        zip_resource_files = uploaded.to_resource_file()

        start_task = create_cluster_helper.generate_cluster_start_task(
            self,
            zip_resource_files,
            cluster_conf.cluster_id,
            cluster_conf.gpu_enabled(),
            cluster_conf.get_docker_repo(),
            cluster_conf.file_shares,
            cluster_conf.plugins,
            cluster_conf.mixed_mode(),
            cluster_conf.worker_on_master,
        )

        software_metadata_key = "spark"
        vm_image = models.VmImage(publisher='Canonical', offer='UbuntuServer', sku='16.04')

        cluster = self.__create_pool_and_job(cluster_conf, software_metadata_key, start_task, vm_image)

        if not wait:
            return cluster

        # Wait for the master to be ready, then re-read the cluster.
        util.wait_for_master_to_be_ready(self, cluster.id)
        return self.get_cluster(cluster.id)
    except batch_error.BatchErrorException as batch_exc:
        message = helpers.format_batch_exception(batch_exc)
        raise error.AztkError(message)
def validate(self) -> bool:
    """
    Validate the config at its current state.

    Checks, in order: a cluster_id is set, at least one of the two VM counts
    is non-zero, a vm_size is set, and a subnet_id is present when mixed
    mode is in use.

    Raises:
        error.AztkError: on the first check that fails.

    Note:
        No value is returned explicitly despite the ``-> bool`` annotation.
    """
    if self.cluster_id is None:
        raise error.AztkError("Please supply an id for the cluster with a parameter (--id)")

    no_nodes_requested = self.vm_count == 0 and self.vm_low_pri_count == 0
    if no_nodes_requested:
        raise error.AztkError(
            "Please supply a valid (greater than 0) size or size_low_pri value either in the cluster.yaml configuration file or with a parameter (--size or --size-low-pri)"
        )

    if self.vm_size is None:
        raise error.AztkError(
            "Please supply a vm_size in either the cluster.yaml configuration file or with a parameter (--vm-size)"
        )

    mixed_without_subnet = self.mixed_mode() and not self.subnet_id
    if mixed_without_subnet:
        raise error.AztkError(
            "You must configure a VNET to use AZTK in mixed mode (dedicated and low priority nodes). Set the VNET's subnet_id in your cluster.yaml."
        )
def ssh_into_master(
        spark_cluster_operations,
        core_cluster_operations,
        cluster_id,
        username,
        ssh_key=None,
        password=None,
        port_forward_list=None,
        internal=False,
):
    """SSH into the master node of a cluster.

    The master node id is read from ``spark_cluster_operations.get`` and the
    session is opened through ``core_cluster_operations.ssh_into_node``.
    Nothing is returned.

    Raises:
        error.AztkError: when a ``BatchErrorException`` is raised.
    """
    try:
        cluster = spark_cluster_operations.get(cluster_id)
        master_id = cluster.master_node_id
        core_cluster_operations.ssh_into_node(
            cluster_id,
            master_id,
            username,
            ssh_key,
            password,
            port_forward_list,
            internal,
        )
    except BatchErrorException as batch_exc:
        message = helpers.format_batch_exception(batch_exc)
        raise error.AztkError(message)
def get_log_from_storage(blob_client, container_name, application_name, task):
    """Build an ``ApplicationLog`` from the application's log blob.

    The blob is read as text from ``container_name`` at
    ``<application_name>/<constants.SPARK_SUBMIT_LOGS_FILE>``.

    Args:
        blob_client: client exposing ``get_blob_to_text``.
        container_name: storage container; also used as the log's cluster_id.
        application_name: name of the application whose log is read.
        task: object supplying ``state`` and ``execution_info.exit_code``.

    Raises:
        error.AztkError: when the blob is missing from storage.
    """
    blob_name = application_name + '/' + constants.SPARK_SUBMIT_LOGS_FILE
    try:
        blob = blob_client.get_blob_to_text(container_name, blob_name)
    except azure.common.AzureMissingResourceHttpError:
        raise error.AztkError("Logs not found in your storage account. They were either deleted or never existed.")

    log_fields = dict(
        name=application_name,
        cluster_id=container_name,
        application_state=task.state._value_,
        log=blob.content,
        total_bytes=blob.properties.content_length,
        exit_code=task.execution_info.exit_code,
    )
    return models.ApplicationLog(**log_fields)
def http_request_wrapper(func, *args, timeout=None, max_execution_time=300, **kwargs):
    """Call ``func`` until it succeeds, retrying on request timeouts.

    ``func`` is invoked as ``func(*args, timeout=timeout, **kwargs)`` and its
    result must expose ``raise_for_status()``. A ``requests.Timeout`` triggers
    another attempt for as long as the elapsed wall-clock time stays within
    ``max_execution_time``; any other exception propagates immediately.

    Args:
        func: the request callable, e.g. ``requests.get``.
        *args: positional arguments forwarded to ``func``.
        timeout: per-attempt timeout forwarded to ``func``.
        max_execution_time: total retry budget in seconds.
        **kwargs: keyword arguments forwarded to ``func``.

    Returns:
        The first response whose ``raise_for_status()`` does not raise.

    Raises:
        error.AztkError: when an attempt times out after the budget is spent.
    """
    # time.clock() was removed in Python 3.8 and measured CPU time on Unix;
    # a monotonic wall clock is what a network wait budget needs.
    start_time = time.monotonic()
    while True:
        try:
            response = func(*args, timeout=timeout, **kwargs)
            response.raise_for_status()
            return response
        except requests.Timeout:
            pass

        elapsed = time.monotonic() - start_time
        if elapsed > max_execution_time:
            raise error.AztkError("Waited {} seconds for request {}, exceeded max_execution_time={}".format(
                elapsed,
                # Callables such as functools.partial have no __name__.
                getattr(func, "__name__", repr(func)),
                max_execution_time,
            ))
def cluster_copy(
        cluster_operations,
        cluster_id,
        source_path,
        destination_path=None,
        container_name=None,
        internal=False,
        get=False,
        timeout=None,
):
    """Copy files to or from every node of a cluster over SSH.

    A temporary user is generated on the cluster's nodes, the copy runs
    through ``ssh_lib.clus_copy`` on the asyncio event loop, and the user is
    deleted again whether or not the copy succeeded.

    Args:
        cluster_operations: provides cluster lookup, remote login settings
            and temporary user management.
        cluster_id: id of the cluster to copy to or from.
        source_path: forwarded to ``ssh_lib.clus_copy``.
        destination_path: forwarded to ``ssh_lib.clus_copy``.
        container_name: forwarded to ``ssh_lib.clus_copy``.
        internal: when true, nodes are reached at their ``ip_address`` on
            port "22" instead of through the remote login settings.
        get: forwarded to ``ssh_lib.clus_copy``.
        timeout: forwarded to ``ssh_lib.clus_copy``.

    Returns:
        The result of ``ssh_lib.clus_copy``.

    Raises:
        error.AztkError: when generating the temporary user raises a
            ``BatchErrorException``.
    """
    cluster = cluster_operations.get(cluster_id)
    pool = cluster.pool
    nodes = list(cluster.nodes)

    def _login_for(node):
        # Pick the login endpoint for one node.
        if internal:
            return models.RemoteLogin(ip_address=node.ip_address, port="22")
        return cluster_operations.get_remote_login_settings(pool.id, node.id)

    cluster_nodes = [(node, _login_for(node)) for node in nodes]

    try:
        generated_username, ssh_key = cluster_operations.generate_user_on_cluster(pool.id, nodes)
    except BatchErrorException as batch_exc:
        message = helpers.format_batch_exception(batch_exc)
        raise error.AztkError(message)

    try:
        copy_job = ssh_lib.clus_copy(
            container_name=container_name,
            username=generated_username,
            nodes=cluster_nodes,
            source_path=source_path,
            destination_path=destination_path,
            ssh_key=ssh_key.exportKey().decode("utf-8"),
            get=get,
            timeout=timeout,
        )
        return asyncio.get_event_loop().run_until_complete(copy_job)
    except (OSError, BatchErrorException):
        raise
    finally:
        # Always remove the temporary user, even when the copy failed.
        cluster_operations.delete_user_on_cluster(pool.id, nodes, generated_username)
def create_pool_if_not_exist(pool, batch_client):
    """
    Add the given pool through the batch client.

    :param batch_client: The batch client to use.
    :type batch_client: `batchserviceclient.BatchServiceClient`
    :param pool: The pool to create.
    :type pool: `batchserviceclient.models.PoolAddParameter`
    :return: True when the pool was added.
    :raises error.AztkError: when the service reports error code "PoolExists".
        Any other ``BatchErrorException`` is re-raised unchanged.
    """
    try:
        batch_client.pool.add(pool)
    except batch_models.BatchErrorException as batch_exc:
        if batch_exc.error.code != "PoolExists":
            raise
        raise error.AztkError(
            "A cluster with the same id already exists. Use a different id or delete the existing cluster")
    return True