def test_with_disabled_options():
    cmd = CommandBuilder("ssh")
    cmd.add_option("--verbose", enable=True)
    cmd.add_option("-p", None)
    cmd.add_option("-L", "8080:localhost:8080", enable=False)
    assert cmd.to_str() == "ssh --verbose"
def __init__(self, name: str, docker_repo: str, cmd: str, gpu_enabled=False):
    if gpu_enabled:
        self.cmd = CommandBuilder('nvidia-docker run')
    else:
        self.cmd = CommandBuilder('docker run')
    self.cmd.add_option('--net', 'host')
    self.cmd.add_option('--name', name)
    self.cmd.add_argument('-d')
    self.cmd.add_argument(docker_repo)
    self.cmd.add_argument(cmd)
def __init__(self, name: str, docker_repo: str, docker_run_options: str, cmd: str, gpu_enabled=False):
    if gpu_enabled:
        self.cmd = CommandBuilder("nvidia-docker run")
    else:
        self.cmd = CommandBuilder("docker run")
    self.cmd.add_option("--net", "host")
    self.cmd.add_option("--name", name)
    self.cmd.add_argument("-d")
    self.cmd.add_argument(docker_run_options)
    self.cmd.add_argument(docker_repo)
    self.cmd.add_argument(cmd)
def generate_task(spark_client, container_id, application, remote=False):
    resource_files = []

    # The application provided is not hosted remotely and therefore must be uploaded
    if not remote:
        app_resource_file = helpers.upload_file_to_container(
            container_name=container_id,
            application_name=application.name,
            file_path=application.application,
            blob_client=spark_client.blob_client,
            use_full_path=False,
        )

        # Upload application file
        resource_files.append(app_resource_file)

        application.application = "$AZ_BATCH_TASK_WORKING_DIR/" + os.path.basename(application.application)

    # Upload dependent JARS
    jar_resource_file_paths = []
    for jar in application.jars:
        current_jar_resource_file_path = helpers.upload_file_to_container(
            container_name=container_id,
            application_name=application.name,
            file_path=jar,
            blob_client=spark_client.blob_client,
            use_full_path=False,
        )
        jar_resource_file_paths.append(current_jar_resource_file_path)
        resource_files.append(current_jar_resource_file_path)

    # Upload dependent python files
    py_files_resource_file_paths = []
    for py_file in application.py_files:
        current_py_files_resource_file_path = helpers.upload_file_to_container(
            container_name=container_id,
            application_name=application.name,
            file_path=py_file,
            blob_client=spark_client.blob_client,
            use_full_path=False,
        )
        py_files_resource_file_paths.append(current_py_files_resource_file_path)
        resource_files.append(current_py_files_resource_file_path)

    # Upload other dependent files
    files_resource_file_paths = []
    for file in application.files:
        files_resource_file_path = helpers.upload_file_to_container(
            container_name=container_id,
            application_name=application.name,
            file_path=file,
            blob_client=spark_client.blob_client,
            use_full_path=False,
        )
        files_resource_file_paths.append(files_resource_file_path)
        resource_files.append(files_resource_file_path)

    # Upload application definition
    application.jars = [os.path.basename(jar) for jar in application.jars]
    application.py_files = [os.path.basename(py_files) for py_files in application.py_files]
    application.files = [os.path.basename(files) for files in application.files]
    application_definition_file = helpers.upload_text_to_container(
        container_name=container_id,
        application_name=application.name,
        file_path="application.yaml",
        content=yaml.dump(vars(application)),
        blob_client=spark_client.blob_client,
    )
    resource_files.append(application_definition_file)

    # create command to submit task
    task_cmd = CommandBuilder("sudo docker exec")
    task_cmd.add_argument("-i")
    task_cmd.add_option("-e", "AZ_BATCH_TASK_WORKING_DIR=$AZ_BATCH_TASK_WORKING_DIR")
    task_cmd.add_option("-e", "STORAGE_LOGS_CONTAINER={0}".format(container_id))
    task_cmd.add_argument("spark /bin/bash >> output.log 2>&1")
    task_cmd.add_argument(
        r'-c "source ~/.bashrc; '
        r"export PYTHONPATH=$PYTHONPATH:\$AZTK_WORKING_DIR; "
        r"cd \$AZ_BATCH_TASK_WORKING_DIR; "
        r'\$AZTK_WORKING_DIR/.aztk-env/.venv/bin/python \$AZTK_WORKING_DIR/aztk/node_scripts/submit.py"')

    # Create task
    task = batch_models.TaskAddParameter(
        id=application.name,
        command_line=helpers.wrap_commands_in_shell([task_cmd.to_str()]),
        resource_files=resource_files,
        constraints=batch_models.TaskConstraints(max_task_retry_count=application.max_retry_count),
        user_identity=batch_models.UserIdentity(
            auto_user=batch_models.AutoUserSpecification(
                scope=batch_models.AutoUserScope.task,
                elevation_level=batch_models.ElevationLevel.admin)),
    )

    return task
def __docker_run_cmd(docker_repo: str = None,
                     gpu_enabled: bool = False,
                     worker_on_master: bool = True,
                     file_mounts=None,
                     plugins=None,
                     mixed_mode=False) -> str:
    """
    Build the docker run command by setting up the environment variables
    """
    if gpu_enabled:
        cmd = CommandBuilder('nvidia-docker run')
    else:
        cmd = CommandBuilder('docker run')
    cmd.add_option('--net', 'host')
    cmd.add_option('--name', constants.DOCKER_SPARK_CONTAINER_NAME)
    cmd.add_option('-v', '/mnt/batch/tasks:/mnt/batch/tasks')

    if file_mounts:
        for mount in file_mounts:
            cmd.add_option('-v', '{0}:{0}'.format(mount.mount_path))

    cmd.add_option('-e', 'DOCKER_WORKING_DIR=/mnt/batch/tasks/startup/wd')
    cmd.add_option('-e', 'AZ_BATCH_ACCOUNT_NAME=$AZ_BATCH_ACCOUNT_NAME')
    cmd.add_option('-e', 'BATCH_ACCOUNT_KEY=$BATCH_ACCOUNT_KEY')
    cmd.add_option('-e', 'BATCH_SERVICE_URL=$BATCH_SERVICE_URL')
    cmd.add_option('-e', 'STORAGE_ACCOUNT_NAME=$STORAGE_ACCOUNT_NAME')
    cmd.add_option('-e', 'STORAGE_ACCOUNT_KEY=$STORAGE_ACCOUNT_KEY')
    cmd.add_option('-e', 'STORAGE_ACCOUNT_SUFFIX=$STORAGE_ACCOUNT_SUFFIX')
    cmd.add_option('-e', 'SP_TENANT_ID=$SP_TENANT_ID')
    cmd.add_option('-e', 'SP_CLIENT_ID=$SP_CLIENT_ID')
    cmd.add_option('-e', 'SP_CREDENTIAL=$SP_CREDENTIAL')
    cmd.add_option('-e', 'SP_BATCH_RESOURCE_ID=$SP_BATCH_RESOURCE_ID')
    cmd.add_option('-e', 'SP_STORAGE_RESOURCE_ID=$SP_STORAGE_RESOURCE_ID')
    cmd.add_option('-e', 'AZ_BATCH_POOL_ID=$AZ_BATCH_POOL_ID')
    cmd.add_option('-e', 'AZ_BATCH_NODE_ID=$AZ_BATCH_NODE_ID')
    cmd.add_option('-e', 'AZ_BATCH_NODE_IS_DEDICATED=$AZ_BATCH_NODE_IS_DEDICATED')
    if worker_on_master is not None:
        cmd.add_option('-e', 'WORKER_ON_MASTER={}'.format(worker_on_master))
    else:
        # default to True if not specified
        cmd.add_option('-e', 'WORKER_ON_MASTER={}'.format(True))
    cmd.add_option('-e', 'MIXED_MODE={}'.format(mixed_mode))
    cmd.add_option('-e', 'SPARK_WEB_UI_PORT=$SPARK_WEB_UI_PORT')
    cmd.add_option('-e', 'SPARK_WORKER_UI_PORT=$SPARK_WORKER_UI_PORT')
    cmd.add_option('-e', 'SPARK_CONTAINER_NAME=$SPARK_CONTAINER_NAME')
    cmd.add_option('-e', 'SPARK_SUBMIT_LOGS_FILE=$SPARK_SUBMIT_LOGS_FILE')
    cmd.add_option('-e', 'SPARK_JOB_UI_PORT=$SPARK_JOB_UI_PORT')
    cmd.add_option('-p', '8080:8080')    # Spark Master UI
    cmd.add_option('-p', '7077:7077')    # Spark Master
    cmd.add_option('-p', '7337:7337')    # Spark Shuffle Service
    cmd.add_option('-p', '4040:4040')    # Job UI
    cmd.add_option('-p', '18080:18080')  # Spark History Server UI
    cmd.add_option('-p', '3022:3022')    # Docker SSH
    if plugins:
        for plugin in plugins:
            for port in plugin.ports:
                cmd.add_option('-p', '{0}:{1}'.format(port.internal, port.internal))  # plugin ports
    cmd.add_option('-d', docker_repo)
    cmd.add_argument('/bin/bash /mnt/batch/tasks/startup/wd/docker_main.sh')
    return cmd.to_str()
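# A minimal usage sketch of the builder above, assuming it is called from the same
# module. `FileMount` is a hypothetical stand-in for aztk's file mount model and the
# image tag is illustrative only; the real objects and constants may differ.
from collections import namedtuple

FileMount = namedtuple("FileMount", ["mount_path"])

run_cmd = __docker_run_cmd(
    docker_repo="aztk/spark:v0.1.0",                   # hypothetical image tag
    gpu_enabled=False,
    worker_on_master=True,
    file_mounts=[FileMount(mount_path="/mnt/data")],   # mounted into the container as /mnt/data:/mnt/data
    plugins=None,
    mixed_mode=False,
)
print(run_cmd)  # one "docker run ..." string, ready to be wrapped in a shell command on the node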
class DockerCmd:
    """
    Class helping to write a docker command
    """

    def __init__(self, name: str, docker_repo: str, docker_run_options: str, cmd: str, gpu_enabled=False):
        if gpu_enabled:
            self.cmd = CommandBuilder("nvidia-docker run")
        else:
            self.cmd = CommandBuilder("docker run")
        self.cmd.add_option("--net", "host")
        self.cmd.add_option("--name", name)
        self.cmd.add_argument("-d")
        self.cmd.add_argument(docker_run_options)
        self.cmd.add_argument(docker_repo)
        self.cmd.add_argument(cmd)

    def add_env(self, env: str, value: str):
        self.cmd.add_option("-e", "{0}={1}".format(env, value))

    def pass_env(self, env: str):
        """
        Give the value of an environment variable in the main process to the docker image
        """
        self.cmd.add_option("-e", "{0}".format(env))

    def share_folder(self, folder: str):
        self.cmd.add_option("-v", "{0}:{0}".format(folder))

    def open_port(self, port: int):
        self.cmd.add_option("-p", "{0}:{0}".format(port))

    def to_str(self):
        return self.cmd.to_str()
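# A short usage sketch for DockerCmd. All values here (container name, image,
# extra run options, ports) are illustrative assumptions, not values taken from
# aztk itself; the resulting string follows directly from the calls made.
docker_cmd = DockerCmd(
    name="spark",                                    # assumed container name
    docker_repo="aztk/spark:v0.1.0",                 # hypothetical image
    docker_run_options="--rm",                       # extra docker run flags, passed through verbatim
    cmd="/bin/bash /mnt/batch/tasks/startup/wd/docker_main.sh",
    gpu_enabled=False,
)
docker_cmd.add_env("SPARK_WEB_UI_PORT", "8080")      # -e SPARK_WEB_UI_PORT=8080
docker_cmd.pass_env("STORAGE_ACCOUNT_KEY")           # -e STORAGE_ACCOUNT_KEY (inherited from host env)
docker_cmd.share_folder("/mnt/batch/tasks")          # -v /mnt/batch/tasks:/mnt/batch/tasks
docker_cmd.open_port(8080)                           # -p 8080:8080
print(docker_cmd.to_str())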
def __app_cmd():
    docker_exec = CommandBuilder("sudo docker exec")
    docker_exec.add_argument("-i")
    docker_exec.add_option("-e", "AZ_BATCH_TASK_WORKING_DIR=$AZ_BATCH_TASK_WORKING_DIR")
    docker_exec.add_option("-e", "AZ_BATCH_JOB_ID=$AZ_BATCH_JOB_ID")
    docker_exec.add_argument(
        "spark /bin/bash >> output.log 2>&1 -c \"python \$DOCKER_WORKING_DIR/job_submission.py\"")
    return docker_exec.to_str()
def __app_cmd(scheduling_target=None, resource_files=None):
    resource_file_sas_list = None
    if resource_files:
        resource_file_sas_list = ' '.join(
            ['\\\"{}\\\"'.format(task_def.blob_source) for task_def in resource_files])

    docker_exec = CommandBuilder("sudo docker exec")
    docker_exec.add_argument("-i")
    docker_exec.add_option("-e", "AZ_BATCH_TASK_WORKING_DIR=$AZ_BATCH_TASK_WORKING_DIR")
    docker_exec.add_option("-e", "AZ_BATCH_JOB_ID=$AZ_BATCH_JOB_ID")
    docker_exec.add_argument(
        r'spark /bin/bash >> output.log 2>&1 -c "'
        r"source ~/.bashrc; "
        r"export PYTHONPATH=$PYTHONPATH:\$AZTK_WORKING_DIR; "
        r"cd \$AZ_BATCH_TASK_WORKING_DIR; "
        r'\$AZTK_WORKING_DIR/.aztk-env/.venv/bin/python \$AZTK_WORKING_DIR/aztk/node_scripts/scheduling/job_submission.py {0} {1}"'.format(
            scheduling_target if scheduling_target else "",
            resource_file_sas_list if resource_file_sas_list else ""))
    return docker_exec.to_str()
def __docker_run_cmd(docker_repo: str = None,
                     gpu_enabled: bool = False,
                     file_mounts=[]) -> str:
    """
    Build the docker run command by setting up the environment variables
    """
    if gpu_enabled:
        cmd = CommandBuilder('nvidia-docker run')
    else:
        cmd = CommandBuilder('docker run')
    cmd.add_option('--net', 'host')
    cmd.add_option('--name', constants.DOCKER_SPARK_CONTAINER_NAME)
    cmd.add_option('-v', '/mnt/batch/tasks:/batch')

    if file_mounts:
        for mount in file_mounts:
            cmd.add_option('-v', '{0}:{0}'.format(mount.mount_path))

    cmd.add_option('-e', 'DOCKER_WORKING_DIR=/batch/startup/wd')
    cmd.add_option('-e', 'AZ_BATCH_ACCOUNT_NAME=$AZ_BATCH_ACCOUNT_NAME')
    cmd.add_option('-e', 'BATCH_ACCOUNT_KEY=$BATCH_ACCOUNT_KEY')
    cmd.add_option('-e', 'BATCH_ACCOUNT_URL=$BATCH_ACCOUNT_URL')
    cmd.add_option('-e', 'STORAGE_ACCOUNT_NAME=$STORAGE_ACCOUNT_NAME')
    cmd.add_option('-e', 'STORAGE_ACCOUNT_KEY=$STORAGE_ACCOUNT_KEY')
    cmd.add_option('-e', 'STORAGE_ACCOUNT_SUFFIX=$STORAGE_ACCOUNT_SUFFIX')
    cmd.add_option('-e', 'AZ_BATCH_POOL_ID=$AZ_BATCH_POOL_ID')
    cmd.add_option('-e', 'AZ_BATCH_NODE_ID=$AZ_BATCH_NODE_ID')
    cmd.add_option('-e', 'AZ_BATCH_NODE_IS_DEDICATED=$AZ_BATCH_NODE_IS_DEDICATED')
    cmd.add_option('-e', 'SPARK_WEB_UI_PORT=$SPARK_WEB_UI_PORT')
    cmd.add_option('-e', 'SPARK_WORKER_UI_PORT=$SPARK_WORKER_UI_PORT')
    cmd.add_option('-e', 'SPARK_JUPYTER_PORT=$SPARK_JUPYTER_PORT')
    cmd.add_option('-e', 'SPARK_JOB_UI_PORT=$SPARK_JOB_UI_PORT')
    cmd.add_option('-p', '8080:8080')    # Spark Master UI
    cmd.add_option('-p', '7077:7077')    # Spark Master
    cmd.add_option('-p', '4040:4040')    # Job UI
    cmd.add_option('-p', '8888:8888')    # Jupyter UI
    cmd.add_option('-p', '8787:8787')    # RStudio Server
    cmd.add_option('-p', '18080:18080')  # Spark History Server UI
    cmd.add_option('-p', '3022:3022')    # Docker SSH
    cmd.add_option('-p', '8020:8020')    # Namenode IPC: ClientProtocol
    cmd.add_option('-p', '9000:9000')    # Namenode IPC: ClientProtocol
    cmd.add_option('-p', '50010:50010')  # Datanode http data transfer
    cmd.add_option('-p', '50020:50020')  # Datanode IPC metadata operations
    cmd.add_option('-p', '50070:50070')  # Namenode WebUI
    cmd.add_option('-p', '50075:50075')  # DataNode WebUI
    cmd.add_option('-p', '50090:50090')  # Secondary NameNode http address
    cmd.add_option('-d', docker_repo)
    cmd.add_argument('/bin/bash /batch/startup/wd/docker_main.sh')
    return cmd.to_str()
def test_only_command():
    cmd = CommandBuilder("ssh")
    assert cmd.to_str() == "ssh"
def test_with_arg_and_option():
    cmd = CommandBuilder("ssh")
    cmd.add_argument("[email protected]")
    cmd.add_option("-p", "2020")
    assert cmd.to_str() == "ssh -p 2020 [email protected]"
def test_with_multiple_options():
    cmd = CommandBuilder("ssh")
    cmd.add_option("-L", "8080:localhost:8080")
    cmd.add_option("-p", "2020")
    assert cmd.to_str() == "ssh -L 8080:localhost:8080 -p 2020"
def __docker_run_cmd(docker_repo: str = None,
                     gpu_enabled: bool = False,
                     worker_on_master: bool = True,
                     file_mounts=None,
                     mixed_mode=False) -> str:
    """
    Build the docker run command by setting up the environment variables
    """
    if gpu_enabled:
        cmd = CommandBuilder('nvidia-docker run')
    else:
        cmd = CommandBuilder('docker run')
    cmd.add_option('--net', 'host')
    cmd.add_option('--name', constants.DOCKER_SPARK_CONTAINER_NAME)
    cmd.add_option('-v', '/mnt/batch/tasks:/mnt/batch/tasks')

    if file_mounts:
        for mount in file_mounts:
            cmd.add_option('-v', '{0}:{0}'.format(mount.mount_path))

    cmd.add_option('-e', 'DOCKER_WORKING_DIR=/mnt/batch/tasks/startup/wd')
    cmd.add_option('-e', 'AZ_BATCH_ACCOUNT_NAME=$AZ_BATCH_ACCOUNT_NAME')
    cmd.add_option('-e', 'BATCH_ACCOUNT_KEY=$BATCH_ACCOUNT_KEY')
    cmd.add_option('-e', 'BATCH_SERVICE_URL=$BATCH_SERVICE_URL')
    cmd.add_option('-e', 'STORAGE_ACCOUNT_NAME=$STORAGE_ACCOUNT_NAME')
    cmd.add_option('-e', 'STORAGE_ACCOUNT_KEY=$STORAGE_ACCOUNT_KEY')
    cmd.add_option('-e', 'STORAGE_ACCOUNT_SUFFIX=$STORAGE_ACCOUNT_SUFFIX')
    cmd.add_option('-e', 'SP_TENANT_ID=$SP_TENANT_ID')
    cmd.add_option('-e', 'SP_CLIENT_ID=$SP_CLIENT_ID')
    cmd.add_option('-e', 'SP_CREDENTIAL=$SP_CREDENTIAL')
    cmd.add_option('-e', 'SP_BATCH_RESOURCE_ID=$SP_BATCH_RESOURCE_ID')
    cmd.add_option('-e', 'SP_STORAGE_RESOURCE_ID=$SP_STORAGE_RESOURCE_ID')
    cmd.add_option('-e', 'AZ_BATCH_POOL_ID=$AZ_BATCH_POOL_ID')
    cmd.add_option('-e', 'AZ_BATCH_NODE_ID=$AZ_BATCH_NODE_ID')
    cmd.add_option('-e', 'AZ_BATCH_NODE_IS_DEDICATED=$AZ_BATCH_NODE_IS_DEDICATED')
    if worker_on_master is not None:
        cmd.add_option('-e', 'WORKER_ON_MASTER={}'.format(worker_on_master))
    else:
        # default to True if not specified
        cmd.add_option('-e', 'WORKER_ON_MASTER={}'.format(True))
    cmd.add_option('-e', 'MIXED_MODE={}'.format(mixed_mode))
    cmd.add_option('-e', 'SPARK_WEB_UI_PORT=$SPARK_WEB_UI_PORT')
    cmd.add_option('-e', 'SPARK_WORKER_UI_PORT=$SPARK_WORKER_UI_PORT')
    cmd.add_option('-e', 'SPARK_CONTAINER_NAME=$SPARK_CONTAINER_NAME')
    cmd.add_option('-e', 'SPARK_SUBMIT_LOGS_FILE=$SPARK_SUBMIT_LOGS_FILE')
    cmd.add_option('-e', 'SPARK_JUPYTER_PORT=$SPARK_JUPYTER_PORT')
    cmd.add_option('-e', 'SPARK_JOB_UI_PORT=$SPARK_JOB_UI_PORT')
    cmd.add_option('-p', '8080:8080')    # Spark Master UI
    cmd.add_option('-p', '7077:7077')    # Spark Master
    cmd.add_option('-p', '7337:7337')    # Spark Shuffle Service
    cmd.add_option('-p', '4040:4040')    # Job UI
    cmd.add_option('-p', '8888:8888')    # Jupyter UI
    cmd.add_option('-p', '8787:8787')    # RStudio Server
    cmd.add_option('-p', '18080:18080')  # Spark History Server UI
    cmd.add_option('-p', '3022:3022')    # Docker SSH
    cmd.add_option('-p', '8020:8020')    # Namenode IPC: ClientProtocol
    cmd.add_option('-p', '9000:9000')    # Namenode IPC: ClientProtocol
    cmd.add_option('-p', '50010:50010')  # Datanode http data transfer
    cmd.add_option('-p', '50020:50020')  # Datanode IPC metadata operations
    cmd.add_option('-p', '50070:50070')  # Namenode WebUI
    cmd.add_option('-p', '50075:50075')  # DataNode WebUI
    cmd.add_option('-p', '50090:50090')  # Secondary NameNode http address
    cmd.add_option('-d', docker_repo)
    cmd.add_argument('/bin/bash /mnt/batch/tasks/startup/wd/docker_main.sh')
    return cmd.to_str()
def __app_submit_cmd(
        spark_client,
        cluster_id: str,
        name: str,
        app: str,
        app_args: str,
        main_class: str,
        jars: List[str],
        py_files: List[str],
        files: List[str],
        driver_java_options: str,
        driver_library_path: str,
        driver_class_path: str,
        driver_memory: str,
        executor_memory: str,
        driver_cores: str,
        executor_cores: str):
    cluster = spark_client.get_cluster(cluster_id)
    master_id = cluster.master_node_id
    master_ip = __get_node(spark_client, master_id, cluster_id).ip_address

    spark_home = constants.DOCKER_SPARK_HOME

    # set file paths to correct path on container
    files_path = '/batch/workitems/{0}/{1}/{2}/wd/'.format(cluster_id, "job-1", name)
    jars = [files_path + jar for jar in jars]
    py_files = [files_path + py_file for py_file in py_files]
    files = [files_path + f for f in files]

    # 2>&1 redirect stdout and stderr to be in the same file
    spark_submit_cmd = CommandBuilder('{0}/bin/spark-submit'.format(spark_home))
    spark_submit_cmd.add_option('--master', 'spark://{0}:7077'.format(master_ip))
    spark_submit_cmd.add_option('--name', name)
    spark_submit_cmd.add_option('--class', main_class)
    spark_submit_cmd.add_option('--jars', jars and ','.join(jars))
    spark_submit_cmd.add_option('--py-files', py_files and ','.join(py_files))
    spark_submit_cmd.add_option('--files', files and ','.join(files))
    spark_submit_cmd.add_option('--driver-java-options', driver_java_options)
    spark_submit_cmd.add_option('--driver-library-path', driver_library_path)
    spark_submit_cmd.add_option('--driver-class-path', driver_class_path)
    spark_submit_cmd.add_option('--driver-memory', driver_memory)
    spark_submit_cmd.add_option('--executor-memory', executor_memory)
    spark_submit_cmd.add_option('--driver-cores', driver_cores)
    spark_submit_cmd.add_option('--executor-cores', executor_cores)
    spark_submit_cmd.add_argument(
        '/batch/workitems/{0}/{1}/{2}/wd/'.format(cluster_id, "job-1", name) + app + ' ' +
        ' '.join(['\'' + app_arg + '\'' for app_arg in app_args if app_args]))

    if cluster.gpu_enabled:
        docker_exec_cmd = CommandBuilder('sudo nvidia-docker exec')
    else:
        docker_exec_cmd = CommandBuilder('sudo docker exec')
    docker_exec_cmd.add_option('-i', constants.DOCKER_SPARK_CONTAINER_NAME)
    docker_exec_cmd.add_argument(
        '/bin/bash >> {0} 2>&1 -c \"cd '.format(constants.SPARK_SUBMIT_LOGS_FILE) +
        files_path + '; ' + spark_submit_cmd.to_str() + '\"')

    return [docker_exec_cmd.to_str()]
def test_with_option():
    cmd = CommandBuilder("ssh")
    cmd.add_option("-L", "8080:localhost:8080")
    assert cmd.to_str() == "ssh -L 8080:localhost:8080"
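# The tests above pin down CommandBuilder's observable behaviour: options are
# rendered before positional arguments, an option whose value is None (and not
# explicitly enabled) or whose enable flag is False is dropped, and enable=True
# keeps a bare flag. A minimal sketch consistent with those assertions follows;
# it is an illustration only, not necessarily aztk's actual implementation.
class CommandBuilder:
    def __init__(self, executable: str):
        self.executable = executable
        self.options = []    # (name, value) pairs; value may be None for bare flags
        self.arguments = []  # positional arguments, always rendered after the options

    def add_option(self, name: str, value: str = None, enable: bool = None):
        # By default an option is only kept when it has a value; `enable`
        # overrides that (True keeps a bare flag, False drops the option).
        if enable is None:
            enable = value is not None
        if enable:
            self.options.append((name, value))

    def add_argument(self, arg: str):
        self.arguments.append(arg)

    def to_str(self) -> str:
        parts = [self.executable]
        for name, value in self.options:
            parts.append(name if value is None else "{0} {1}".format(name, value))
        parts.extend(self.arguments)
        return " ".join(parts)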
def __app_cmd():
    docker_exec = CommandBuilder("sudo docker exec")
    docker_exec.add_argument("-i")
    docker_exec.add_option("-e", "AZ_BATCH_TASK_WORKING_DIR=$AZ_BATCH_TASK_WORKING_DIR")
    docker_exec.add_option("-e", "AZ_BATCH_JOB_ID=$AZ_BATCH_JOB_ID")
    docker_exec.add_argument(
        r'spark /bin/bash >> output.log 2>&1 -c "'
        r"source ~/.bashrc; "
        r"export PYTHONPATH=$PYTHONPATH:\$AZTK_WORKING_DIR; "
        r"cd \$AZ_BATCH_TASK_WORKING_DIR; "
        r'\$AZTK_WORKING_DIR/.aztk-env/.venv/bin/python \$AZTK_WORKING_DIR/aztk/node_scripts/job_submission.py"')
    return docker_exec.to_str()
def generate_task(spark_client, container_id, application):
    resource_files = []

    app_resource_file = helpers.upload_file_to_container(
        container_name=container_id,
        application_name=application.name,
        file_path=application.application,
        blob_client=spark_client.blob_client,
        use_full_path=False)

    # Upload application file
    resource_files.append(app_resource_file)

    # Upload dependent JARS
    jar_resource_file_paths = []
    for jar in application.jars:
        current_jar_resource_file_path = helpers.upload_file_to_container(
            container_name=container_id,
            application_name=application.name,
            file_path=jar,
            blob_client=spark_client.blob_client,
            use_full_path=False)
        jar_resource_file_paths.append(current_jar_resource_file_path)
        resource_files.append(current_jar_resource_file_path)

    # Upload dependent python files
    py_files_resource_file_paths = []
    for py_file in application.py_files:
        current_py_files_resource_file_path = helpers.upload_file_to_container(
            container_name=container_id,
            application_name=application.name,
            file_path=py_file,
            blob_client=spark_client.blob_client,
            use_full_path=False)
        py_files_resource_file_paths.append(current_py_files_resource_file_path)
        resource_files.append(current_py_files_resource_file_path)

    # Upload other dependent files
    files_resource_file_paths = []
    for file in application.files:
        files_resource_file_path = helpers.upload_file_to_container(
            container_name=container_id,
            application_name=application.name,
            file_path=file,
            blob_client=spark_client.blob_client,
            use_full_path=False)
        files_resource_file_paths.append(files_resource_file_path)
        resource_files.append(files_resource_file_path)

    # Upload application definition
    application.application = os.path.basename(application.application)
    application.jars = [os.path.basename(jar) for jar in application.jars]
    application.py_files = [os.path.basename(py_files) for py_files in application.py_files]
    application.files = [os.path.basename(files) for files in application.files]
    application_definition_file = helpers.upload_text_to_container(
        container_name=container_id,
        application_name=application.name,
        file_path='application.yaml',
        content=yaml.dump(vars(application)),
        blob_client=spark_client.blob_client)
    resource_files.append(application_definition_file)

    # create command to submit task
    task_cmd = CommandBuilder('sudo docker exec')
    task_cmd.add_argument('-i')
    task_cmd.add_option('-e', 'AZ_BATCH_TASK_WORKING_DIR=$AZ_BATCH_TASK_WORKING_DIR')
    task_cmd.add_option('-e', 'STORAGE_LOGS_CONTAINER={0}'.format(container_id))
    task_cmd.add_argument('spark /bin/bash >> output.log 2>&1')
    task_cmd.add_argument(
        '-c "source ~/.bashrc; '
        'cd $AZ_BATCH_TASK_WORKING_DIR; '
        '\$(pyenv root)/versions/\$AZTK_PYTHON_VERSION/bin/python '
        '\$DOCKER_WORKING_DIR/aztk/node_scripts/submit.py"')

    # Create task
    task = batch_models.TaskAddParameter(
        id=application.name,
        command_line=helpers.wrap_commands_in_shell([task_cmd.to_str()]),
        resource_files=resource_files,
        constraints=batch_models.TaskConstraints(max_task_retry_count=application.max_retry_count),
        user_identity=batch_models.UserIdentity(
            auto_user=batch_models.AutoUserSpecification(
                scope=batch_models.AutoUserScope.task,
                elevation_level=batch_models.ElevationLevel.admin)))

    return task
def __app_submit_cmd(
        name: str,
        app: str,
        app_args: List[str],
        main_class: str,
        jars: List[str],
        py_files: List[str],
        files: List[str],
        driver_java_options: str,
        driver_library_path: str,
        driver_class_path: str,
        driver_memory: str,
        executor_memory: str,
        driver_cores: int,
        executor_cores: int,
):
    spark_home = os.environ["SPARK_HOME"]
    with open(os.path.join(spark_home, "conf", "master")) as f:
        master_ip = f.read().rstrip()

    # set file paths to correct path on container
    files_path = os.environ["AZ_BATCH_TASK_WORKING_DIR"]
    jars = [os.path.join(files_path, os.path.basename(jar)) for jar in jars]
    py_files = [os.path.join(files_path, os.path.basename(py_file)) for py_file in py_files]
    files = [os.path.join(files_path, os.path.basename(f)) for f in files]

    # 2>&1 redirect stdout and stderr to be in the same file
    spark_submit_cmd = CommandBuilder("{0}/bin/spark-submit".format(spark_home))
    spark_submit_cmd.add_option("--master", "spark://{0}:7077".format(master_ip))
    spark_submit_cmd.add_option("--name", name)
    spark_submit_cmd.add_option("--class", main_class)
    spark_submit_cmd.add_option("--jars", jars and ",".join(jars))
    spark_submit_cmd.add_option("--py-files", py_files and ",".join(py_files))
    spark_submit_cmd.add_option("--files", files and ",".join(files))
    spark_submit_cmd.add_option("--driver-java-options", driver_java_options)
    spark_submit_cmd.add_option("--driver-library-path", driver_library_path)
    spark_submit_cmd.add_option("--driver-class-path", driver_class_path)
    spark_submit_cmd.add_option("--driver-memory", driver_memory)
    spark_submit_cmd.add_option("--executor-memory", executor_memory)
    if driver_cores:
        spark_submit_cmd.add_option("--driver-cores", str(driver_cores))
    if executor_cores:
        spark_submit_cmd.add_option("--executor-cores", str(executor_cores))
    spark_submit_cmd.add_argument(
        os.path.expandvars(app) + " " +
        " ".join(["'" + str(app_arg) + "'" for app_arg in (app_args or [])]))

    with open("spark-submit.txt", mode="w", encoding="UTF-8") as stream:
        stream.write(spark_submit_cmd.to_str())

    return spark_submit_cmd
def __app_submit_cmd(name: str, app: str, app_args: List[str], main_class: str, jars: List[str],
                     py_files: List[str], files: List[str], driver_java_options: str,
                     driver_library_path: str, driver_class_path: str, driver_memory: str,
                     executor_memory: str, driver_cores: int, executor_cores: int):
    cluster_id = os.environ['AZ_BATCH_POOL_ID']
    spark_home = os.environ['SPARK_HOME']
    with open(os.path.join(spark_home, 'conf', 'master')) as f:
        master_ip = f.read().rstrip()

    # set file paths to correct path on container
    files_path = os.environ['AZ_BATCH_TASK_WORKING_DIR']
    jars = [os.path.join(files_path, os.path.basename(jar)) for jar in jars]
    py_files = [os.path.join(files_path, os.path.basename(py_file)) for py_file in py_files]
    files = [os.path.join(files_path, os.path.basename(f)) for f in files]

    # 2>&1 redirect stdout and stderr to be in the same file
    spark_submit_cmd = CommandBuilder('{0}/bin/spark-submit'.format(spark_home))
    spark_submit_cmd.add_option('--master', 'spark://{0}:7077'.format(master_ip))
    spark_submit_cmd.add_option('--name', name)
    spark_submit_cmd.add_option('--class', main_class)
    spark_submit_cmd.add_option('--jars', jars and ','.join(jars))
    spark_submit_cmd.add_option('--py-files', py_files and ','.join(py_files))
    spark_submit_cmd.add_option('--files', files and ','.join(files))
    spark_submit_cmd.add_option('--driver-java-options', driver_java_options)
    spark_submit_cmd.add_option('--driver-library-path', driver_library_path)
    spark_submit_cmd.add_option('--driver-class-path', driver_class_path)
    spark_submit_cmd.add_option('--driver-memory', driver_memory)
    spark_submit_cmd.add_option('--executor-memory', executor_memory)
    if driver_cores:
        spark_submit_cmd.add_option('--driver-cores', str(driver_cores))
    if executor_cores:
        spark_submit_cmd.add_option('--executor-cores', str(executor_cores))
    spark_submit_cmd.add_argument(
        os.path.expandvars(app) + ' ' +
        ' '.join(['\'' + str(app_arg) + '\'' for app_arg in (app_args or [])]))

    with open("spark-submit.txt", mode="w", encoding="UTF-8") as stream:
        stream.write(spark_submit_cmd.to_str())

    return spark_submit_cmd