Code Example #1
File: batch3.py  Project: zhekunz2/c4pp
def create_job(batch_service_client, job_id, resource_files, pool_id):
    """
    Creates a job with the specified ID, associated with the specified pool.

    :param batch_service_client: A Batch service client.
    :type batch_service_client: `azure.batch.BatchServiceClient`
    :param str job_id: The ID for the job.
    :param list resource_files: Resource files to stage for the job preparation task.
    :param str pool_id: The ID for the pool.
    """
    print('Creating job [{}]...'.format(job_id))

    job = batch.models.JobAddParameter(
        id=job_id,
        pool_info=batch.models.PoolInformation(pool_id=pool_id),
        job_preparation_task=batchmodels.JobPreparationTask(
            command_line="/bin/bash -c \"\
                sudo apt-get install -y --reinstall make; \
                sudo ./installrpy.sh; \
                tar -xf cmdstan.tar.gz; \
                cd cmdstan; \
                make build; \
                cd -; \
                if sudo test -d ../../cmdstan; then echo stanexist; exit 0; fi; \
                sudo mv * ../../; \
            \"",
            resource_files=resource_files,
            wait_for_success=True,
            user_identity=batchmodels.UserIdentity(user_name="admin"),
        ))

    batch_service_client.job.add(job)
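
A minimal wiring sketch for calling create_job above, assuming the older azure-batch SDK that this sample targets (ResourceFile with blob_source); the account name, key, endpoint, and SAS URLs below are placeholders, not values from the project:

import azure.batch as batch
import azure.batch.batch_auth as batch_auth
import azure.batch.models as batchmodels

# Placeholder credentials and endpoint -- replace with real account values.
credentials = batch_auth.SharedKeyCredentials('mybatchaccount', '<account-key>')
batch_service_client = batch.BatchServiceClient(
    credentials,
    batch_url='https://mybatchaccount.westus.batch.azure.com')  # named base_url in older SDK releases

# Files staged onto each compute node before the job preparation task runs.
resource_files = [
    batchmodels.ResourceFile(blob_source='<sas-url-for-cmdstan.tar.gz>',
                             file_path='cmdstan.tar.gz'),
    batchmodels.ResourceFile(blob_source='<sas-url-for-installrpy.sh>',
                             file_path='installrpy.sh'),
]

create_job(batch_service_client, 'cmdstan-job-1', resource_files, 'cmdstan-pool-1')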
Code Example #2
def create_job(batch_service_client, job_id, resource_files, pool_id):
    """
    Creates a job with the specified ID, associated with the specified pool.

    :param batch_service_client: A Batch service client.
    :type batch_service_client: `azure.batch.BatchServiceClient`
    :param str job_id: The ID for the job.
    :param list resource_files: Resource files to stage for the job preparation task.
    :param str pool_id: The ID for the pool.
    """
    print('Creating job [{}]...'.format(job_id))

    job = batch.models.JobAddParameter(
        id=job_id,
        pool_info=batch.models.PoolInformation(pool_id=pool_id),
        job_preparation_task=batchmodels.JobPreparationTask(
            command_line="/bin/bash -c \"sudo apt-get -y update; \
                sudo chmod o+w /usr/local/lib/R/site-library; \
                ./installrpy.sh; \
                tar -xf cmdstan.tar.gz; \
                cd cmdstan; \
                make build; \
                cd -; \
            \"",
            wait_for_success=True,
            resource_files=resource_files,
            user_identity=batchmodels.UserIdentity(user_name="admin"),
        ))

    batch_service_client.job.add(job)
Code Example #3
def add_docker_batch_task(batch_client, block_blob_client, job_id, pool_id):
    """Submits a docker task via Batch scheduler

    :param batch_client: The batch client to use.
    :type batch_client: `batchserviceclient.BatchServiceClient`
    :param block_blob_client: The storage block blob client to use.
    :type block_blob_client: `azure.storage.blob.BlockBlobService`
    :param str job_id: The id of the job to use.
    :param str pool_id: The id of the pool to use.
    :rtype: list
    :return: A list of the IDs of the tasks added.
    """

    task_resource_sas_url = common.helpers.upload_blob_and_create_sas(
        block_blob_client, _CONTAINER_NAME, _TASK_RESOURCE_FILE,
        _TASK_RESOURCE_FILE_PATH,
        datetime.datetime.utcnow() + datetime.timedelta(hours=1))

    output_container_sas_key = common.helpers.create_container_and_create_sas(
        block_blob_client=block_blob_client,
        container_name=_OUTPUT_CONTAINER_NAME,
        permission=azureblob.ContainerPermissions.WRITE
        | azureblob.ContainerPermissions.LIST,
        expiry=datetime.datetime.utcnow() + datetime.timedelta(hours=1))

    # The start task pulls docker image yidingz/ffmpeg:v3
    job = batchmodels.JobAddParameter(
        id=job_id,
        pool_info=batchmodels.PoolInformation(pool_id=pool_id),
        job_preparation_task=batchmodels.JobPreparationTask(
            command_line=_JOB_STARTTASK_CLI, run_elevated=True))
    batch_client.job.add(job)

    task_id_list = []
    index = 0
    for url in _INPUT_FILE_URLS:
        filename = urllib.parse.urlsplit(url).path.split('/')[-1]
        parameters = "'{0}' '{1}' '{2}' '{3}'".format(
            url, filename, output_container_sas_key,
            block_blob_client.account_name)
        # Each task downloads a video from Channel 9, transcodes it, and
        # uploads the result to the specified output container
        task = batchmodels.TaskAddParameter(
            id=str(index).zfill(4) + '_' + filename.split('.')[0],
            command_line=_TASK_CLI.format(_TASK_RESOURCE_FILE, _FFMPEG_IMAGE,
                                          parameters),
            run_elevated=True,
            resource_files=[
                batchmodels.ResourceFile(file_path=_TASK_RESOURCE_FILE,
                                         blob_source=task_resource_sas_url)
            ])
        task_id_list.append(task.id)
        batch_client.task.add(job_id=job_id, task=task)
        index += 1
    return task_id_list
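
add_docker_batch_task returns only the task IDs; waiting for the transcoding tasks to finish is left to the caller. A hedged polling sketch using only documented client calls (the timeout and polling interval are arbitrary choices, not part of the sample):

import time
import azure.batch.models as batchmodels

def wait_for_tasks(batch_client, job_id, task_ids, timeout_seconds=3600):
    """Poll the job until every task in task_ids reaches the completed state."""
    wanted = set(task_ids)
    deadline = time.time() + timeout_seconds
    while time.time() < deadline:
        tasks = [t for t in batch_client.task.list(job_id) if t.id in wanted]
        if len(tasks) == len(wanted) and all(
                t.state == batchmodels.TaskState.completed for t in tasks):
            return True
        time.sleep(15)
    return False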
Code Example #4
def create_job(batch_client, name_job, name_pool, cmd_prep_task=None):
    user = models.UserIdentity(auto_user=models.AutoUserSpecification(
        elevation_level=models.ElevationLevel.admin,
        scope=models.AutoUserScope.task))

    prepare_task = models.JobPreparationTask(command_line=cmd_prep_task,
                                             id=None,
                                             user_identity=user)

    job = models.JobAddParameter(
        id=name_job,
        pool_info=models.PoolInformation(pool_id=name_pool),
        job_preparation_task=prepare_task)
    batch_client.job.add(job)
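
A hypothetical call (the job name, pool name, and command are placeholders): the preparation command runs once per node, elevated through the admin auto-user defined above, before any of the job's tasks are scheduled there. Since command_line is always populated from cmd_prep_task, callers are expected to pass a real command rather than leave the default None.

create_job(batch_client,
           name_job='example-job',
           name_pool='example-pool',
           cmd_prep_task='/bin/bash -c "apt-get update && apt-get install -y build-essential"')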
Code Example #5
    def create_job(self, job_preparation_commands=None):
        """Creates a job with the specified ID, associated with the specified pool.

        Args:
            job_preparation_commands: commands as list of strings to run before the job starts

        Returns:
            success: True if job could be created successfully, False otherwise.
        """

        if job_preparation_commands is None:
            job = batch_models.JobAddParameter(
                id=self.job_id,
                pool_info=batch_models.PoolInformation(pool_id=self.pool_id))
        else:
            job_prep_task = batch_models.JobPreparationTask(
                command_line=job_preparation_commands,
                wait_for_success=True,
                rerun_on_node_reboot_after_success=True)
            job = batch_models.JobAddParameter(
                id=self.job_id,
                pool_info=batch_models.PoolInformation(pool_id=self.pool_id),
                job_preparation_task=job_prep_task)

        try:
            logging.info('Attempting to create job [{}]...'.format(
                self.job_id))
            self.batch_client.job.add(job)
            logging.info('Job [{}] created successfully...'.format(
                self.job_id))
            return True
        except batch_models.batch_error.BatchErrorException as err:
            if err.error.code == "JobExists":
                logging.info("Job [{}] already exists".format(self.job_id))
                return False
            else:
                logging.exception(
                    "Unknown error occurred while trying to create job [{}]".
                    format(self.job_id))
                raise
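
The docstring above describes job_preparation_commands as a list of strings, but JobPreparationTask.command_line accepts a single string, so callers would typically join the commands first. A hedged helper sketch (not part of the original class) showing one way to do that:

def as_command_line(commands):
    """Join a list of shell commands into a single bash command line."""
    return "/bin/bash -c '{}'".format('; '.join(commands))

# e.g. as_command_line(['apt-get update', 'apt-get install -y git'])
#   -> "/bin/bash -c 'apt-get update; apt-get install -y git'"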
Code Example #6
    def test_batch_jobs(self, **kwargs):
        client = self.create_sharedkey_client(**kwargs)
        # Test Create Job
        auto_pool = models.AutoPoolSpecification(
            pool_lifetime_option=models.PoolLifetimeOption.job,
            pool=models.PoolSpecification(
                vm_size='small',
                cloud_service_configuration=models.CloudServiceConfiguration(
                    os_family='5'
                )
            )
        )
        job_prep = models.JobPreparationTask(command_line="cmd /c \"echo hello world\"")
        job_release = models.JobReleaseTask(command_line="cmd /c \"echo goodbye world\"")
        job_param = models.JobAddParameter(
            id=self.get_resource_name('batch_job1_'),
            pool_info=models.PoolInformation(
                auto_pool_specification=auto_pool
            ),
            job_preparation_task=job_prep,
            job_release_task=job_release
        )
        response = client.job.add(job_param)
        self.assertIsNone(response)

        # Test Update Job
        constraints = models.JobConstraints(max_task_retry_count=3)
        options = models.JobUpdateParameter(
            priority=500,
            constraints=constraints,
            pool_info=models.PoolInformation(
                auto_pool_specification=auto_pool
            )
        )
        response = client.job.update(job_param.id, options)
        self.assertIsNone(response)

        # Test Patch Job
        options = models.JobPatchParameter(priority=900)
        response = client.job.patch(job_param.id, options)
        self.assertIsNone(response)

        job = client.job.get(job_param.id)
        self.assertIsInstance(job, models.CloudJob)
        self.assertEqual(job.id, job_param.id)
        self.assertEqual(job.constraints.max_task_retry_count, 3)
        self.assertEqual(job.priority, 900)

        # Test Create Job with Auto Complete
        job_auto_param = models.JobAddParameter(
            id=self.get_resource_name('batch_job2_'),
            on_all_tasks_complete=models.OnAllTasksComplete.terminate_job,
            on_task_failure=models.OnTaskFailure.perform_exit_options_job_action,
            pool_info=models.PoolInformation(
                auto_pool_specification=auto_pool
            )
        )
        response = client.job.add(job_auto_param)
        self.assertIsNone(response)
        job = client.job.get(job_auto_param.id)
        self.assertIsInstance(job, models.CloudJob)
        self.assertEqual(job.on_all_tasks_complete, models.OnAllTasksComplete.terminate_job)
        self.assertEqual(job.on_task_failure, models.OnTaskFailure.perform_exit_options_job_action)

        # Test List Jobs
        jobs = client.job.list()
        self.assertIsInstance(jobs, models.CloudJobPaged)
        self.assertEqual(len(list(jobs)), 2)

        # Test Disable Job
        response = client.job.disable(job_param.id, models.DisableJobOption.requeue)
        self.assertIsNone(response)

        # Test Enable Job
        response = client.job.enable(job_param.id)
        self.assertIsNone(response)

        # Prep and release task status
        task_status = client.job.list_preparation_and_release_task_status(job_param.id)
        self.assertIsInstance(task_status, models.JobPreparationAndReleaseTaskExecutionInformationPaged)
        self.assertEqual(list(task_status), [])

        # Test Terminate Job
        response = client.job.terminate(job_param.id)
        self.assertIsNone(response)

        # Test Delete Job
        response = client.job.delete(job_auto_param.id)
        self.assertIsNone(response)

        # Test Job Lifetime Statistics
        stats = client.job.get_all_lifetime_statistics()
        self.assertIsInstance(stats, models.JobStatistics)
        self.assertEqual(stats.num_succeeded_tasks, 0)
        self.assertEqual(stats.num_failed_tasks, 0)
Code Example #7
    def _setup_job(self, distributable_list, pool_id, name, log_writer=None):
        '''
        This is the main method for submitting to AzureBatch.
        '''

        job_id = datetime.datetime.utcnow().strftime("%Y%m%d-%H%M%S")  + "-" + name.replace("_","-").replace("/","-").replace(".","-").replace("+","-").replace("(","").replace(")","")
        job_id_etc_list = []

        if True: # Pickle the things-to-run - put them in a local directory under the current directory called "runs/[jobid]" where the jobid is based on the date.
            if log_writer is not None: log_writer("{0}: Pickle the thing to run".format(name))
            run_dir_rel = os.path.join("runs",job_id)
            pstutil.create_directory_if_necessary(run_dir_rel, isfile=False)
            for index, distributable in enumerate(distributable_list):
                distributablep_filename = os.path.join(run_dir_rel, "distributable{0}.p".format(index))
                with open(distributablep_filename, mode='wb') as f:
                    pickle.dump(distributable, f, pickle.HIGHEST_PROTOCOL)

        if True: # Copy (update) any (small) input files to the blob
            if log_writer is not None: log_writer("{0}: Upload small input files".format(name))
            data_blob_fn = "{0}-data-v{1}".format(self.container,self.data_version)
            inputOutputCopier = AzureBatchCopier(data_blob_fn, self.storage_key, self.storage_account_name)
            script_list = ["",""]  # These will be scripts for copying to and from Azure Storage and the cluster nodes.
            inputOutputCopier2 = AzureBatchCopierNodeLocal(data_blob_fn, self.container, self.data_version, self.storage_key, self.storage_account_name,
                                 script_list)
            for index, distributable in enumerate(distributable_list):
                inputOutputCopier.input(distributable)
                inputOutputCopier2.input(distributable)
                inputOutputCopier2.output(distributable)
                output_blobfn = "{0}/output{1}".format(run_dir_rel.replace("\\","/"),index) #The name of the directory of return values in Azure Storage.
                job_id_etc_list.append((job_id, inputOutputCopier, output_blobfn, run_dir_rel))

        if True: # Create the jobprep program -- sets the python path and downloads the pythonpath code. Also create node-local folder for return values.
            if log_writer is not None: log_writer("{0}: Create jobprep.bat script".format(name))
            localpythonpath = os.environ.get("PYTHONPATH") #!!should it be able to work without pythonpath being set (e.g. if there was just one file)? Also, is None really the return or is it an exception.
            jobprep_filename = os.path.join(run_dir_rel, "jobprep.bat")
            # It only copies down files that are needed, but with some probability (about 1 in 50, say) fails, so we repeat three times.
            with open(jobprep_filename, mode='w') as f2:
                f2.write(r"""set
set path=%AZ_BATCH_APP_PACKAGE_ANACONDA2%\Anaconda2;%AZ_BATCH_APP_PACKAGE_ANACONDA2%\Anaconda2\scripts\;%path%
for /l %%t in (0,1,3) do FOR /L %%i IN (0,1,{7}) DO python.exe %AZ_BATCH_TASK_WORKING_DIR%\blobxfer.py --skipskip --delete --storageaccountkey {2} --download {3} {4}-pp-v{5}-%%i %AZ_BATCH_NODE_SHARED_DIR%\{4}\pp\v{5}\%%i --remoteresource .
{6}
mkdir %AZ_BATCH_TASK_WORKING_DIR%\..\..\output{8}
exit /b 0
                """
                .format(
                    None,                                   #0 - not used
                    None,                                   #1 - not used
                    self.storage_key,                       #2
                    self.storage_account_name,              #3
                    self.container,                         #4
                    self.pp_version,                        #5
                    script_list[0],                         #6
                    len(localpythonpath.split(';'))-1,      #7
                    index,                                  #8
                ))

        if True: #Split the taskcount roughly evenly among the distributables
            subtaskcount_list = deal(len(distributable_list),self.taskcount)

        if True: # Create the map.bat and reduce.bat programs to run.
            if log_writer is not None: log_writer("{0}: Create map.bat and reduce.bat script".format(name))
            pythonpath_string = "set pythonpath=" + ";".join(r"%AZ_BATCH_NODE_SHARED_DIR%\{0}\pp\v{1}\{2}".format(self.container,self.pp_version,i) for i in range(len(localpythonpath.split(';'))))
            for index in range(len(distributable_list)):
                subtaskcount = subtaskcount_list[index]
                output_blobfn = job_id_etc_list[index][2]
                for i, bat_filename in enumerate(["map{0}.bat".format(index),"reduce{0}.bat".format(index)]):
                    bat_filename = os.path.join(run_dir_rel, bat_filename)
                    with open(bat_filename, mode='w') as f1:
                        #note that it's getting distributable.py from site-packages and never from the pythonpath
                        f1.write(r"""set path=%AZ_BATCH_APP_PACKAGE_ANACONDA2%\Anaconda2;%AZ_BATCH_APP_PACKAGE_ANACONDA2%\Anaconda2\scripts\;%path%
mkdir %AZ_BATCH_TASK_WORKING_DIR%\..\..\output{14}
{6}cd %AZ_BATCH_TASK_WORKING_DIR%\..\..\output{14}
{6}FOR /L %%i IN (0,1,{11}) DO python.exe %AZ_BATCH_JOB_PREP_WORKING_DIR%\blobxfer.py --storageaccountkey {2} --download {3} {8}/{10} . --remoteresource %%i.{0}.p
cd %AZ_BATCH_NODE_SHARED_DIR%\{8}\data\v{9}
{13}
python.exe %AZ_BATCH_APP_PACKAGE_ANACONDA2%\Anaconda2\Lib\site-packages\fastlmm\util\distributable.py %AZ_BATCH_JOB_PREP_WORKING_DIR%\distributable{14}.p LocalInParts(%1,{0},result_file=r\"{4}/result.p\",mkl_num_threads={1},temp_dir=r\"{4}\")
IF %ERRORLEVEL% NEQ 0 (EXIT /B %ERRORLEVEL%)
{6}{7}
cd %AZ_BATCH_TASK_WORKING_DIR%\..\..\output{14}
{5}for /l %%t in (0,1,3) do python.exe %AZ_BATCH_JOB_PREP_WORKING_DIR%\blobxfer.py --storageaccountkey {2} --upload {3} {8} %1.{0}.p --remoteresource {10}/%1.{0}.p
{6}for /l %%t in (0,1,3) do python.exe %AZ_BATCH_JOB_PREP_WORKING_DIR%\blobxfer.py --storageaccountkey {2} --upload {3} {8} result.p --remoteresource {10}/result.p
                        """
                        .format(
                            subtaskcount,                           #0
                            self.mkl_num_threads,                   #1
                            self.storage_key,                       #2
                            self.storage_account_name,              #3
                            "%AZ_BATCH_TASK_WORKING_DIR%/../../output{0}".format(index), #4
                            "" if i==0 else "@rem ",                #5
                            "" if i==1 else "@rem ",                #6
                            script_list[1],                         #7
                            self.container,                         #8
                            self.data_version,                      #9
                            output_blobfn,                          #10
                            subtaskcount-1,                         #11
                            self.pp_version,                        #12
                            pythonpath_string,                      #13
                            index,                                  #14
                        ))

        if True: # Upload the thing-to-run to a blob and the blobxfer program
            if log_writer is not None: log_writer("{0}: Upload the thing to run".format(name))
            block_blob_client = azureblob.BlockBlobService(account_name=self.storage_account_name,account_key=self.storage_key)
            block_blob_client.create_container(self.container, fail_on_exist=False)

            blobxfer_blobfn = "utils/v{}/blobxfer.py".format(self.utils_version)
            blobxfer_url   = commonhelpers.upload_blob_and_create_sas(block_blob_client, self.container, blobxfer_blobfn, os.path.join(os.path.dirname(__file__),"blobxfer.py"), datetime.datetime.utcnow() + datetime.timedelta(days=30))

            jobprep_blobfn = "{}/jobprep.bat".format(run_dir_rel.replace("\\","/"))
            jobprepbat_url = commonhelpers.upload_blob_and_create_sas(block_blob_client, self.container, jobprep_blobfn, os.path.join(run_dir_rel, "jobprep.bat"), datetime.datetime.utcnow() + datetime.timedelta(days=30))

            map_reduce_url_list = []
            for index in range(len(distributable_list)):
                distributablep_blobfn = "{0}/distributable{1}.p".format(run_dir_rel.replace("\\","/"),index)
                distributablep_filename = os.path.join(run_dir_rel, "distributable{0}.p".format(index))
                distributablep_url = commonhelpers.upload_blob_and_create_sas(block_blob_client, self.container, distributablep_blobfn, distributablep_filename, datetime.datetime.utcnow() + datetime.timedelta(days=30)) #!!!should there be an expiry?

                map_blobfn = "{0}/map{1}.bat".format(run_dir_rel.replace("\\","/"),index)
                map_url = commonhelpers.upload_blob_and_create_sas(block_blob_client, self.container, map_blobfn, os.path.join(run_dir_rel, "map{0}.bat".format(index)), datetime.datetime.utcnow() + datetime.timedelta(days=30))

                reduce_blobfn = "{0}/reduce{1}.bat".format(run_dir_rel.replace("\\","/"),index)
                reduce_url = commonhelpers.upload_blob_and_create_sas(block_blob_client, self.container, reduce_blobfn, os.path.join(run_dir_rel, "reduce{0}.bat".format(index)), datetime.datetime.utcnow() + datetime.timedelta(days=30))
                map_reduce_url_list.append((map_url,reduce_url,distributablep_url))

        if True: # Copy everything on PYTHONPATH to a blob
            if log_writer is not None: log_writer("{0}: Upload items on pythonpath as requested".format(name))
            if self.update_python_path == 'every_time':
                self._update_python_path_function()

        if True: # Create a job with a job prep task
            if log_writer is not None: log_writer("{0}: Create jobprep.bat".format(name))
            resource_files=[
                batchmodels.ResourceFile(blob_source=blobxfer_url, file_path="blobxfer.py"),
                batchmodels.ResourceFile(blob_source=jobprepbat_url, file_path="jobprep.bat")]
            for index in range(len(distributable_list)):
                _, _, distributablep_url = map_reduce_url_list[index]
                resource_files.append(batchmodels.ResourceFile(blob_source=distributablep_url, file_path="distributable{0}.p".format(index)))
            
            job_preparation_task = batchmodels.JobPreparationTask(
                    id="jobprep",
                    #run_elevated=True,
                    user_identity=batchmodels.UserIdentity(auto_user=batchmodels.AutoUserSpecification(elevation_level='admin')),
                    resource_files=resource_files,
                    command_line="jobprep.bat",
                    )

            job = batchmodels.JobAddParameter(
                id=job_id,
                job_preparation_task=job_preparation_task,
                pool_info=batch.models.PoolInformation(pool_id=pool_id),
                uses_task_dependencies=True,
                on_task_failure='performExitOptionsJobAction',
                )
            try:
                self.batch_client.job.add(job)
            except batchmodels.BatchErrorException as e:
                if e.inner_exception.values is not None:
                    raise Exception(e.inner_exception.values[-1].value)
                else:
                    raise Exception(e.inner_exception)

        if True: # Add regular tasks to the job
            if log_writer is not None: log_writer("{0}: Add tasks to job".format(name))
            task_factor = int(10**math.ceil(math.log(max(subtaskcount_list),10))) #When we have multiple distributables, this helps us number them e.g. 0,1,2,10,11,12,20,21,22
            task_list = []
            for index in range(len(distributable_list)):
                start = len(task_list)
                map_url, reduce_url, _ = map_reduce_url_list[index]
                subtaskcount = subtaskcount_list[index]
                for taskindex in range(subtaskcount):
                    map_task = batchmodels.TaskAddParameter(
                        id=index * task_factor + taskindex,
                        #run_elevated=True,
                        user_identity=batchmodels.UserIdentity(auto_user=batchmodels.AutoUserSpecification(elevation_level='admin')),
                        #!!! seems to exit without needing a failure exit_conditions = batchmodels.ExitConditions(default=batchmodels.ExitOptions(job_action='terminate')),
                        resource_files=[batchmodels.ResourceFile(blob_source=map_url, file_path="map{0}.bat".format(index))],
                        command_line=r"map{0}.bat {1}".format(index, taskindex),
                    )
                    task_list.append(map_task)
                end = len(task_list)-1
                reduce_task = batchmodels.TaskAddParameter(
                    id="reduce{0}".format(index),
                    #run_elevated=True,
                    user_identity=batchmodels.UserIdentity(auto_user=batchmodels.AutoUserSpecification(elevation_level='admin')),
                    resource_files=[batchmodels.ResourceFile(blob_source=reduce_url, file_path="reduce{0}.bat".format(index))],
                    command_line=r"reduce{0}.bat {1}".format(index, subtaskcount),
                    depends_on = batchmodels.TaskDependencies(task_id_ranges=[batchmodels.TaskIdRange(task_list[start].id,task_list[end].id)])
                    )
                task_list.append(reduce_task)

            try:
                for i in range(0,len(task_list),100): #The Python API only lets us add 100 at a time.
                    self.batch_client.task.add_collection(job_id, task_list[i:i+100])
            except Exception as exception:
                print(exception)
                raise exception
        return job_id_etc_list
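
deal(), used above to split self.taskcount across the distributables, comes from the surrounding package and is not shown here; a hedged reimplementation of the even split the comment describes (the real helper may differ):

def deal(number_of_groups, total_count):
    """Split total_count into number_of_groups roughly equal integer shares."""
    base, remainder = divmod(total_count, number_of_groups)
    return [base + (1 if i < remainder else 0) for i in range(number_of_groups)]

# e.g. deal(3, 10) -> [4, 3, 3]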