예제 #1
0
def __upload(blob_client):
    """Upload the node-scripts archive to the ``spark-node-scripts`` container.

    Args:
        blob_client: blob storage client handed straight through to
            ``helpers.upload_file_to_container``.

    Returns:
        Whatever ``helpers.upload_file_to_container`` returns for the
        uploaded archive.
    """
    logging.debug("Uploading node scripts...")

    # ``local_tmp_zipfile`` is a module-level name defined outside this
    # function; presumably the path of the zipped node scripts -- confirm.
    upload_options = {
        "container_name": "spark-node-scripts",
        "file_path": local_tmp_zipfile,
        "blob_client": blob_client,
        "use_full_path": False,
    }
    return helpers.upload_file_to_container(**upload_options)
예제 #2
0
def __upload(blob_client, cluster_id):
    """Upload the node-scripts archive into the cluster's own container.

    The container is named after ``cluster_id`` and the upload is filed under
    the ``aztk-node-scripts`` application name.

    Args:
        blob_client: blob storage client handed straight through to
            ``helpers.upload_file_to_container``.
        cluster_id: id of the cluster; used verbatim as the container name.

    Returns:
        Whatever ``helpers.upload_file_to_container`` returns for the
        uploaded archive.
    """
    logging.debug("Uploading node scripts...")

    # ``local_tmp_zipfile`` is a module-level name defined outside this
    # function; presumably the path of the zipped node scripts -- confirm.
    upload_options = {
        "container_name": cluster_id,
        "application_name": "aztk-node-scripts",
        "file_path": local_tmp_zipfile,
        "blob_client": blob_client,
        "use_full_path": False,
    }
    return helpers.upload_file_to_container(**upload_options)
예제 #3
0
def submit_application(spark_client, cluster_id, application, wait: bool = False):
    """
    Upload a Spark application with its dependencies and queue it as a task.

    The main application file, jars, python files and other files listed on
    ``application`` are each uploaded to a blob container named after
    ``application.name``.  The uploads are attached to the task as resource
    files and the task's command is built by ``__app_submit_cmd``.  The task
    is added to the Batch job whose id equals ``cluster_id``.

    Args:
        spark_client: client exposing ``blob_client``, ``batch_client``,
            ``get_cluster`` and ``get_remote_login_settings``.
        cluster_id: id of the target cluster; also used as the job id.
        application: application description.  The attributes read here are
            ``name``, ``application``, ``application_args``, ``main_class``,
            ``jars``, ``py_files``, ``files`` and the driver/executor
            settings forwarded to ``__app_submit_cmd``.
        wait: when true, block in ``helpers.wait_for_task_to_complete``
            after the task has been added.

    Returns:
        None.
    """

    def _upload(local_path):
        # Every file of this application lands in the same container.
        return helpers.upload_file_to_container(
            container_name=application.name,
            file_path=local_path,
            blob_client=spark_client.blob_client,
            use_full_path=False)

    # Upload order is kept in the resource file list:
    # application file, jars, python files, other files.
    app_upload = _upload(application.application)
    jar_uploads = [_upload(jar) for jar in application.jars]
    py_file_uploads = [_upload(py_file) for py_file in application.py_files]
    other_uploads = [_upload(path) for path in application.files]
    resource_files = [app_upload] + jar_uploads + py_file_uploads + other_uploads

    # Build the submit command from the uploaded files' ``file_path`` values.
    submit_cmd = __app_submit_cmd(
        spark_client=spark_client,
        cluster_id=cluster_id,
        name=application.name,
        app=app_upload.file_path,
        app_args=application.application_args,
        main_class=application.main_class,
        jars=[uploaded.file_path for uploaded in jar_uploads],
        py_files=[uploaded.file_path for uploaded in py_file_uploads],
        files=[uploaded.file_path for uploaded in other_uploads],
        driver_java_options=application.driver_java_options,
        driver_library_path=application.driver_library_path,
        driver_class_path=application.driver_class_path,
        driver_memory=application.driver_memory,
        executor_memory=application.executor_memory,
        driver_cores=application.driver_cores,
        executor_cores=application.executor_cores)

    # Look up the cluster to learn which node is the master.
    cluster = spark_client.get_cluster(cluster_id)
    master_node_id = cluster.master_node_id

    # NOTE(review): the remote login settings are fetched but never used
    # below.  The call is kept because it may raise or have side effects
    # that callers rely on -- confirm before removing.
    spark_client.get_remote_login_settings(cluster.id, master_node_id)

    # The task is given affinity to the master node and runs as a
    # task-scoped auto user with admin elevation.
    master_affinity = batch_models.AffinityInformation(
        affinity_id=master_node_id)
    shell_command = helpers.wrap_commands_in_shell(submit_cmd)
    admin_identity = batch_models.UserIdentity(
        auto_user=batch_models.AutoUserSpecification(
            scope=batch_models.AutoUserScope.task,
            elevation_level=batch_models.ElevationLevel.admin))
    task = batch_models.TaskAddParameter(
        id=application.name,
        affinity_info=master_affinity,
        command_line=shell_command,
        resource_files=resource_files,
        user_identity=admin_identity)

    # The Batch job shares its id with the cluster.
    job_id = cluster_id
    spark_client.batch_client.task.add(job_id=job_id, task=task)

    if not wait:
        return
    helpers.wait_for_task_to_complete(
        job_id=job_id,
        task_id=task.id,
        batch_client=spark_client.batch_client)
예제 #4
0
def generate_task(spark_client, container_id, application, remote=False):
    """Upload an application's files and build the Batch task that runs it.

    Dependencies (jars, python files, other files) are always uploaded to
    ``container_id`` under ``application.name``; the main application file is
    uploaded only when ``remote`` is false.  A YAML dump of the application's
    attributes is uploaded as ``application.yaml``.  The task itself is only
    constructed and returned; it is not added to any job here.

    Note:
        ``application`` is modified in place: ``jars``, ``py_files`` and
        ``files`` are reduced to base names, and, when ``remote`` is false,
        ``application`` is rewritten to
        ``$AZ_BATCH_TASK_WORKING_DIR/<basename>``.

    Args:
        spark_client: client exposing ``blob_client``.
        container_id: blob container receiving the uploads; also exported to
            the command as ``STORAGE_LOGS_CONTAINER``.
        application: application description; its ``name`` becomes the task
            id and ``max_retry_count`` the task's retry limit.
        remote: true when ``application.application`` is already hosted
            remotely and must not be uploaded.

    Returns:
        The ``batch_models.TaskAddParameter`` describing the task.
    """

    def _upload(local_path):
        return helpers.upload_file_to_container(
            container_name=container_id,
            application_name=application.name,
            file_path=local_path,
            blob_client=spark_client.blob_client,
            use_full_path=False,
        )

    resource_files = []

    # A locally hosted application file has to be shipped with the task, and
    # the definition must then point at its location in the task directory.
    if not remote:
        resource_files.append(_upload(application.application))
        app_file_name = os.path.basename(application.application)
        application.application = "$AZ_BATCH_TASK_WORKING_DIR/" + app_file_name

    # Dependencies, in the order: jars, python files, other files.
    resource_files += [_upload(jar) for jar in application.jars]
    resource_files += [_upload(py_file) for py_file in application.py_files]
    resource_files += [_upload(path) for path in application.files]

    # The uploaded definition refers to dependencies by base name only.
    application.jars = list(map(os.path.basename, application.jars))
    application.py_files = list(map(os.path.basename, application.py_files))
    application.files = list(map(os.path.basename, application.files))

    definition_yaml = yaml.dump(vars(application))
    resource_files.append(
        helpers.upload_text_to_container(
            container_name=container_id,
            application_name=application.name,
            file_path="application.yaml",
            content=definition_yaml,
            blob_client=spark_client.blob_client,
        ))

    # Command: run submit.py through bash inside the "spark" container,
    # appending the output of ``docker exec`` to output.log.
    # The ``\$`` escapes presumably defer expansion of those variables to
    # the shell inside the container -- confirm against
    # ``helpers.wrap_commands_in_shell``.
    container_script = (
        r'-c "source ~/.bashrc; '
        r"export PYTHONPATH=$PYTHONPATH:\$AZTK_WORKING_DIR; "
        r"cd \$AZ_BATCH_TASK_WORKING_DIR; "
        r'\$AZTK_WORKING_DIR/.aztk-env/.venv/bin/python \$AZTK_WORKING_DIR/aztk/node_scripts/submit.py"'
    )
    logs_container_env = "STORAGE_LOGS_CONTAINER={0}".format(container_id)

    task_cmd = CommandBuilder("sudo docker exec")
    task_cmd.add_argument("-i")
    task_cmd.add_option(
        "-e", "AZ_BATCH_TASK_WORKING_DIR=$AZ_BATCH_TASK_WORKING_DIR")
    task_cmd.add_option("-e", logs_container_env)
    task_cmd.add_argument("spark /bin/bash >> output.log 2>&1")
    task_cmd.add_argument(container_script)

    # The task runs as a task-scoped auto user with admin elevation.
    return batch_models.TaskAddParameter(
        id=application.name,
        command_line=helpers.wrap_commands_in_shell([task_cmd.to_str()]),
        resource_files=resource_files,
        constraints=batch_models.TaskConstraints(
            max_task_retry_count=application.max_retry_count),
        user_identity=batch_models.UserIdentity(
            auto_user=batch_models.AutoUserSpecification(
                scope=batch_models.AutoUserScope.task,
                elevation_level=batch_models.ElevationLevel.admin)),
    )
예제 #5
0
def generate_task(spark_client, container_id, application):
    """Upload an application's files and build the Batch task that runs it.

    The main application file, jars, python files and other files are
    uploaded to ``container_id`` under ``application.name``, followed by a
    YAML dump of the application's attributes as ``application.yaml``.  The
    task itself is only constructed and returned; it is not added to any job
    here.

    Note:
        ``application`` is modified in place: ``application``, ``jars``,
        ``py_files`` and ``files`` are reduced to their base names before the
        definition is dumped.

    Args:
        spark_client: client exposing ``blob_client``.
        container_id: blob container receiving the uploads; also exported to
            the command as ``STORAGE_LOGS_CONTAINER``.
        application: application description; its ``name`` becomes the task
            id and ``max_retry_count`` the task's retry limit.

    Returns:
        The ``batch_models.TaskAddParameter`` describing the task.
    """
    resource_files = []

    # Upload application file
    resource_files.append(
        helpers.upload_file_to_container(
            container_name=container_id,
            application_name=application.name,
            file_path=application.application,
            blob_client=spark_client.blob_client,
            use_full_path=False))

    # Upload dependent JARS
    for jar in application.jars:
        resource_files.append(
            helpers.upload_file_to_container(
                container_name=container_id,
                application_name=application.name,
                file_path=jar,
                blob_client=spark_client.blob_client,
                use_full_path=False))

    # Upload dependent python files
    for py_file in application.py_files:
        resource_files.append(
            helpers.upload_file_to_container(
                container_name=container_id,
                application_name=application.name,
                file_path=py_file,
                blob_client=spark_client.blob_client,
                use_full_path=False))

    # Upload other dependent files
    for file in application.files:
        resource_files.append(
            helpers.upload_file_to_container(
                container_name=container_id,
                application_name=application.name,
                file_path=file,
                blob_client=spark_client.blob_client,
                use_full_path=False))

    # Upload application definition; it refers to every file by base name.
    application.application = os.path.basename(application.application)
    application.jars = [os.path.basename(jar) for jar in application.jars]
    application.py_files = [
        os.path.basename(py_files) for py_files in application.py_files
    ]
    application.files = [
        os.path.basename(files) for files in application.files
    ]
    application_definition_file = helpers.upload_text_to_container(
        container_name=container_id,
        application_name=application.name,
        file_path='application.yaml',
        content=yaml.dump(vars(application)),
        blob_client=spark_client.blob_client)
    resource_files.append(application_definition_file)

    # create command to submit task: run submit.py through bash inside the
    # "spark" container, appending the output of ``docker exec`` to output.log
    task_cmd = CommandBuilder('sudo docker exec')
    task_cmd.add_argument('-i')
    task_cmd.add_option(
        '-e', 'AZ_BATCH_TASK_WORKING_DIR=$AZ_BATCH_TASK_WORKING_DIR')
    task_cmd.add_option('-e',
                        'STORAGE_LOGS_CONTAINER={0}'.format(container_id))
    task_cmd.add_argument('spark /bin/bash >> output.log 2>&1')
    # Raw strings keep the backslash in ``\$`` literal; ``\$`` is not a valid
    # escape sequence in a regular string literal and triggers a warning.
    # The ``\$`` escapes presumably defer expansion to the shell inside the
    # container -- confirm against ``helpers.wrap_commands_in_shell``.
    task_cmd.add_argument(
        '-c "source ~/.bashrc; '
        'cd $AZ_BATCH_TASK_WORKING_DIR; '
        r'\$(pyenv root)/versions/\$AZTK_PYTHON_VERSION/bin/python '
        r'\$DOCKER_WORKING_DIR/aztk/node_scripts/submit.py"')

    # Create task; it runs as a task-scoped auto user with admin elevation.
    task = batch_models.TaskAddParameter(
        id=application.name,
        command_line=helpers.wrap_commands_in_shell([task_cmd.to_str()]),
        resource_files=resource_files,
        constraints=batch_models.TaskConstraints(
            max_task_retry_count=application.max_retry_count),
        user_identity=batch_models.UserIdentity(
            auto_user=batch_models.AutoUserSpecification(
                scope=batch_models.AutoUserScope.task,
                elevation_level=batch_models.ElevationLevel.admin)))

    return task