Code Example #1
File: ALI.py Project: haidi-ustc/dpdispatcher
    def create_apg(self):
        request = CreateAutoProvisioningGroupRequest()
        request.set_accept_format('json')
        request.set_TotalTargetCapacity(str(self.nchunks_limit))
        request.set_LaunchTemplateId(self.cloud_resources["template_id"])
        request.set_AutoProvisioningGroupName(
            self.cloud_resources["instance_name"] +
            ''.join(random.choice(string.ascii_uppercase) for _ in range(20)))
        request.set_AutoProvisioningGroupType("maintain")
        request.set_SpotAllocationStrategy("lowest-price")
        request.set_SpotInstanceInterruptionBehavior("terminate")
        request.set_SpotInstancePoolsToUseCount(1)
        request.set_ExcessCapacityTerminationPolicy("termination")
        request.set_TerminateInstances(True)
        request.set_PayAsYouGoTargetCapacity("0")
        request.set_SpotTargetCapacity(str(self.nchunks_limit))
        config = self.generate_config()
        request.set_LaunchTemplateConfigs(config)

        try:
            response = self.client.do_action_with_exception(request)
            response = json.loads(response)
            with open('apg_id.json', 'w') as fp:
                json.dump({'apg_id': response["AutoProvisioningGroupId"]},
                          fp,
                          indent=4)
            return response["AutoProvisioningGroupId"]
        except (ServerException, ClientException) as e:
            dlog.info("create apg failed, err msg: %s" % e)
            sys.exit()
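
The try/except/exit pattern here, and the count/flag retry loops in the later examples, can be factored into one helper. A minimal sketch, assuming any raised exception is worth retrying; call_with_retries is a hypothetical name, not part of dpdispatcher:

import time


def call_with_retries(fn, max_retries=10, wait=10, exceptions=(Exception,)):
    # hypothetical helper mirroring the count/flag loops in the ALI.py
    # examples: retry fn() up to max_retries times, sleeping between tries
    last_err = None
    for _ in range(max_retries):
        try:
            return fn()
        except exceptions as e:
            last_err = e
            time.sleep(wait)
    raise RuntimeError("all %d attempts failed, last error: %s" % (max_retries, last_err))


# usage sketch:
# response = call_with_retries(lambda: client.do_action_with_exception(request))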
Code Example #2
File: ALI.py Project: haidi-ustc/dpdispatcher
 def create_template(self, image_id, sg_id, vpc_id):
     request = CreateLaunchTemplateRequest()
     request.set_accept_format('json')
     request.set_LaunchTemplateName(''.join(
         random.choice(string.ascii_uppercase) for _ in range(20)))
     request.set_ImageId(image_id)
     request.set_ImageOwnerAlias("self")
     request.set_PasswordInherit(True)
     if "address" in self.cloud_resources and self.cloud_resources[
             'address'] == "public":
         request.set_InternetMaxBandwidthIn(100)
         request.set_InternetMaxBandwidthOut(100)
     request.set_InstanceType("ecs.c6.large")
     request.set_InstanceName(self.cloud_resources["instance_name"])
     request.set_SecurityGroupId(sg_id)
     request.set_VpcId(vpc_id)
     request.set_SystemDiskCategory("cloud_efficiency")
     request.set_SystemDiskSize(70)
     request.set_IoOptimized("optimized")
     request.set_InstanceChargeType("PostPaid")
     request.set_NetworkType("vpc")
     request.set_SpotStrategy("SpotWithPriceLimit")
     request.set_SpotPriceLimit(100)
     try:
         response = self.client.do_action_with_exception(request)
         response = json.loads(response)
         return response["LaunchTemplateId"]
     except (ServerException, ClientException) as e:
         dlog.info(e)
         sys.exit()
Code Example #3
 def check_status(self, job):
     job_id = job.job_id
     if job_id == "" :
         return JobStatus.unsubmitted
     ret, stdin, stdout, stderr = self.context.block_call("qstat -x " + job_id)
     err_str = stderr.read().decode('utf-8')
     if ret != 0:
         if "qstat: Unknown Job Id" in err_str or "Job has finished" in err_str:
             if self.check_finish_tag(job=job):
                 return JobStatus.finished
             else:
                 return JobStatus.terminated
         else:
             raise RuntimeError("status command qstat fails to execute. error info: %s return code %d"
                                % (err_str, ret))
     status_line = stdout.read().decode('utf-8').split('\n')[-2]
     status_word = status_line.split()[-2]
     # dlog.info(status_word)
     if status_word in ["Q", "H"]:
         return JobStatus.waiting
     elif status_word in ["R"]:
         return JobStatus.running
     elif status_word in ["C", "E", "K", "F"]:
         if self.check_finish_tag(job):
             dlog.info(f"job: {job.job_hash} {job.job_id} finished")
             return JobStatus.finished
         else:
             return JobStatus.terminated
     else:
         return JobStatus.unknown
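
check_status maps qstat's single-letter state codes onto a JobStatus enum. A sketch of such an enum, modeled on the IntEnum dpdispatcher defines; the exact member values in the project may differ:

from enum import IntEnum


class JobStatus(IntEnum):
    # sketch of the status enum the check_status examples return;
    # member values are an assumption modeled on dpdispatcher's JobStatus
    unsubmitted = 1
    waiting = 2
    running = 3
    terminated = 4
    finished = 5
    completing = 6
    unknown = 100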
Code Example #4
File: ALI.py Project: haidi-ustc/dpdispatcher
 def describe_apg_instances(self):
     request = DescribeAutoProvisioningGroupInstancesRequest()
     request.set_accept_format('json')
     request.set_AutoProvisioningGroupId(self.cloud_resources["apg_id"])
     request.set_PageSize(100)
     iteration = self.nchunks // 100
     instance_list = []
     for i in range(iteration + 1):
         request.set_PageNumber(i + 1)
          count = 0
          flag = 0
          err_msg = None
          while count < 10:
              try:
                  response = self.client.do_action_with_exception(request)
                  response = json.loads(response)
                  for ins in response["Instances"]["Instance"]:
                      instance_list.append(ins["InstanceId"])
                  flag = 1
                  break
              except (ServerException, ClientException) as e:
                  err_msg = e
                  count += 1
         if not flag:
             dlog.info("describe_apg_instances failed, err msg: %s" %
                       err_msg)
             sys.exit()
     return instance_list
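
describe_apg_instances walks pages manually with nchunks // 100 requests. The same idea as a standalone generator sketch; fetch_page is a hypothetical callable standing in for the paged SDK request:

def iterate_pages(fetch_page, page_size=100):
    # fetch_page(page_number) is a hypothetical callable returning the list
    # of items on that 1-based page; yield items until a short page appears
    page = 1
    while True:
        items = fetch_page(page)
        for item in items:
            yield item
        if len(items) < page_size:
            return
        page += 1


# usage sketch:
# instance_ids = list(iterate_pages(lambda n: query_instances(page=n)))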
Code Example #5
File: ALI.py Project: haidi-ustc/dpdispatcher
    def get_image_id(self, img_name):
        request = DescribeImagesRequest()
        request.set_accept_format('json')
        request.set_ImageOwnerAlias("self")
        request.set_PageSize(20)
        response = self.client.do_action_with_exception(request)
        response = json.loads(response)
        totalcount = response["TotalCount"]

        iteration = totalcount // 20
        if iteration * 20 < totalcount:
            iteration += 1

        flag = 0  # ensure flag is defined even when the image list is empty
        for ii in range(1, iteration + 1):
            count = 0
            flag = 0
            request.set_PageNumber(ii)
            while count < 10:
                try:
                    response = self.client.do_action_with_exception(request)
                    response = json.loads(response)
                    for img in response["Images"]["Image"]:
                        if img["ImageName"] == img_name:
                            return img["ImageId"]
                    flag = 1
                    break
                except Exception:
                    count += 1
                    time.sleep(10)
        if not flag:
            dlog.info("get image failed, exit")
            sys.exit()
Code Example #6
File: ALI.py Project: haidi-ustc/dpdispatcher
    def delete(self, ii):
        '''delete one machine'''
        request = DeleteInstancesRequest()
        request.set_accept_format('json')
        request.set_InstanceIds(
            [self.dispatcher_list[ii]["entity"].instance_id])
        request.set_Force(True)
        count = 0
        flag = 0
        while count < 10:
            try:
                response = self.client.do_action_with_exception(request)
                flag = 1
                break
            except ServerException as e:
                time.sleep(10)
                count += 1

        if flag:
            status_list = [
                item["dispatcher_status"] for item in self.dispatcher_list
            ]
            running_num = status_list.count("running")
            running_num += status_list.count("unsubmitted")
            self.change_apg_capasity(running_num)
        else:
            dlog.info("delete failed, exit")
            sys.exit()
Code Example #7
 def all_finished(self, job_handler, mark_failure, clean=True):
     task_chunks = job_handler['task_chunks']
     task_chunks_str = ['+'.join(ii) for ii in task_chunks]
     task_hashes = [
         sha1(ii.encode('utf-8')).hexdigest() for ii in task_chunks_str
     ]
     job_list = job_handler['job_list']
     job_record = job_handler['job_record']
     command = job_handler['command']
     tag_failure_list = [
         'tag_failure_%d' % ii for ii in range(len(command))
     ]
     resources = job_handler['resources']
     outlog = job_handler['outlog']
     errlog = job_handler['errlog']
     backward_task_files = job_handler['backward_task_files']
     dlog.debug('checking jobs')
     nchunks = len(task_chunks)
     for idx in range(nchunks):
         cur_hash = task_hashes[idx]
         rjob = job_list[idx]
         if not job_record.check_finished(cur_hash):
             # chunk not finished according to record
             status = rjob['batch'].check_status()
             job_uuid = rjob['context'].job_uuid
             dlog.debug('checked job %s' % job_uuid)
             if status == JobStatus.terminated:
                 job_record.increase_nfail(cur_hash)
                 if job_record.check_nfail(cur_hash) > 3:
                     raise RuntimeError(
                         'Job %s failed for more than 3 times' % job_uuid)
                 dlog.info('job %s terminated, submit again' % job_uuid)
                 dlog.debug('try %s times for %s' %
                            (job_record.check_nfail(cur_hash), job_uuid))
                 rjob['batch'].submit(task_chunks[idx],
                                      command,
                                      res=resources,
                                      outlog=outlog,
                                      errlog=errlog,
                                      restart=True)
             elif status == JobStatus.finished:
                 dlog.info('job %s finished' % job_uuid)
                 if mark_failure:
                     rjob['context'].download(task_chunks[idx],
                                              tag_failure_list,
                                              check_exists=True,
                                              mark_failure=False)
                     rjob['context'].download(task_chunks[idx],
                                              backward_task_files,
                                              check_exists=True)
                 else:
                     rjob['context'].download(task_chunks[idx],
                                              backward_task_files)
                 if clean:
                     rjob['context'].clean()
                 job_record.record_finish(cur_hash)
                 job_record.dump()
     job_record.dump()
     return job_record.check_all_finished()
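
all_finished keys each chunk by the sha1 of its '+'-joined task list; submit_jobs and prepare below build the same identifier. A self-contained demonstration:

from hashlib import sha1

# a chunk is a list of task directories; its record key is the sha1 of the
# '+'-joined names, exactly as task_hashes is built in all_finished
chunk = ['task.000', 'task.001', 'task.002']
cur_hash = sha1('+'.join(chunk).encode('utf-8')).hexdigest()
print(cur_hash)  # stable key used to look the chunk up in the job record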
Code Example #8
 def download_(self,
               job_dirs,
               remote_down_files,
               check_exists=False,
               mark_failure=True,
               back_error=False):
     cwd = os.getcwd()
     for ii in job_dirs:
         local_job = os.path.join(self.local_root, ii)
         remote_job = os.path.join(self.remote_root, ii)
         flist = list(remote_down_files)  # copy so glob('error*') does not grow the caller's list
         if back_error:
             os.chdir(remote_job)
             flist += glob('error*')
             os.chdir(cwd)
         for jj in flist:
             rfile = os.path.join(remote_job, jj)
             lfile = os.path.join(local_job, jj)
             if not os.path.realpath(rfile) == os.path.realpath(lfile):
                 if (not os.path.exists(rfile)) and (
                         not os.path.exists(lfile)):
                     if check_exists:
                         if mark_failure:
                             with open(
                                     os.path.join(
                                         self.local_root, ii,
                                         'tag_failure_download_%s' % jj),
                                     'w') as fp:
                                 pass
                         else:
                             pass
                     else:
                         raise RuntimeError('do not find download file ' +
                                            rfile)
                 elif (not os.path.exists(rfile)) and (
                         os.path.exists(lfile)):
                     # already downloaded
                     pass
                 elif (os.path.exists(rfile)) and (
                         not os.path.exists(lfile)):
                     # trivial case, download happily
                     shutil.move(rfile, lfile)
                 elif (os.path.exists(rfile)) and (os.path.exists(lfile)):
                     # both exists, replace!
                     dlog.info('find existing %s, replacing by %s' %
                               (lfile, rfile))
                     if os.path.isdir(lfile):
                         shutil.rmtree(lfile, ignore_errors=True)
                     elif os.path.isfile(lfile) or os.path.islink(lfile):
                         os.remove(lfile)
                     shutil.move(rfile, lfile)
                 else:
                     raise RuntimeError('should not reach here!')
             else:
                 # do nothing in the case of linked files
                 pass
     os.chdir(cwd)
Code Example #9
 def _rmtree(self, sftp, remotepath, level=0, verbose=False):
     for f in sftp.listdir_attr(remotepath):
         rpath = os.path.join(remotepath, f.filename)
         if stat.S_ISDIR(f.st_mode):
             # recurse, propagating verbose so nested levels are logged too
             self._rmtree(sftp, rpath, level=(level + 1), verbose=verbose)
         else:
             if verbose: dlog.info('removing %s%s' % ('    ' * level, rpath))
             sftp.remove(rpath)
     if verbose: dlog.info('removing %s%s' % ('    ' * level, remotepath))
     sftp.rmdir(remotepath)
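
The _rmtree helper above expects an already-open paramiko SFTP session. A hedged sketch of obtaining one and driving the helper; the host, user, and key path are placeholders, and `context` stands in for an instance of the class that defines _rmtree:

import os
import paramiko

ssh = paramiko.SSHClient()
ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
ssh.connect('remote.example.com', username='user',
            key_filename=os.path.expanduser('~/.ssh/id_rsa'))
sftp = ssh.open_sftp()
# context._rmtree(sftp, '/scratch/old_job', verbose=True)  # recursive remote delete
sftp.close()
ssh.close()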
Code Example #10
File: ssh_context.py Project: njzjz/dpdispatcher
 def ensure_alive(self,
                  max_check=10,
                  sleep_time=10):
     count = 1
     while not self._check_alive():
         if count == max_check:
             raise RuntimeError('cannot connect ssh after %d failures at interval %d s' %
                                 (max_check, sleep_time))
         dlog.info('connection check failed, try to reconnect to ' + self.remote_root)
         self._setup_ssh()
         count += 1
         time.sleep(sleep_time)
Code Example #11
File: ssh_context.py Project: njzjz/dpdispatcher
 def _rmtree(self, remotepath, verbose=False):
     """Remove the remote path."""
     # The original implementation removed files one by one over sftp,
     # which is very slow when the latency to the remote server is high.
     # Using the system's `rm` to remove the whole directory can save a
     # lot of time.
     if verbose:
         dlog.info('removing %s' % remotepath)
     # On some supercomputers, removing large numbers of files (e.g. a
     # directory containing trajectories) is very slow due to bad I/O
     # performance, so an asynchronous option is provided.
     self.block_checkcall('rm -rf %s' % remotepath, asynchronously=self.clean_asynchronously)
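
Interpolating remotepath straight into a shell command breaks on spaces and shell metacharacters. One possible hardening with shlex.quote; this is a suggestion on my part, not what the source does:

import shlex

remotepath = '/scratch/run dir/job 01'  # a path with a space, for illustration
cmd = 'rm -rf %s' % shlex.quote(remotepath)
print(cmd)  # rm -rf '/scratch/run dir/job 01'
# self.block_checkcall(cmd, asynchronously=self.clean_asynchronously)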
Code Example #12
 def check_status(self, job, retry=0, max_retry=3):
     job_id = job.job_id
     if job_id == '':
         return JobStatus.unsubmitted
      ret, stdin, stdout, stderr = self.context.block_call('squeue -o "%.18i %.2t" -j ' + job_id)
      if ret != 0:
          err_str = stderr.read().decode('utf-8')
          if "Invalid job id specified" in err_str:
             if self.check_finish_tag(job):
                 return JobStatus.finished
             else:
                 return JobStatus.terminated
         elif "Socket timed out on send/recv operation" in err_str:
             # retry 3 times
             if retry < max_retry:
                 dlog.warning(
                     "Get error code %d in checking status through ssh with job: %s . message: %s"
                     % (ret, job.job_hash, err_str))
                 dlog.warning("Sleep 60 s and retry checking...")
                 # rest 60s
                 time.sleep(60)
                 return self.check_status(job_id,
                                          retry=retry + 1,
                                          max_retry=max_retry)
         else:
             raise RuntimeError(
                 "status command squeue fails to execute."
                 "job_id:%s \n error message:%s\n return code %d\n" %
                 (job_id, err_str, ret))
     status_line = stdout.read().decode('utf-8').split('\n')[-2]
     status_word = status_line.split()[-1]
     if not (len(status_line.split()) == 2 and status_word.isupper()):
         raise RuntimeError("Error in getting job status, " +
                            f"status_line = {status_line}, " +
                            f"parsed status_word = {status_word}")
     if status_word in ["PD", "CF", "S"]:
         return JobStatus.waiting
     elif status_word in ["R"]:
         return JobStatus.running
     elif status_word in ["CG"]:
         return JobStatus.completing
     elif status_word in [
             "C", "E", "K", "BF", "CA", "CD", "F", "NF", "PR", "SE", "ST",
             "TO"
     ]:
         if self.check_finish_tag(job):
             dlog.info(f"job: {job.job_hash} {job.job_id} finished")
             return JobStatus.finished
         else:
             return JobStatus.terminated
     else:
         return JobStatus.unknown
Code Example #13
File: ALI.py Project: haidi-ustc/dpdispatcher
 def prepare(self):
     restart = False
     if os.path.exists('apg_id.json'):
         with open('apg_id.json') as fp:
             apg = json.load(fp)
             self.cloud_resources["apg_id"] = apg["apg_id"]
         task_chunks_str = ['+'.join(ii) for ii in self.task_chunks]
         task_hashes = [
             sha1(ii.encode('utf-8')).hexdigest() for ii in task_chunks_str
         ]
         for ii in range(self.nchunks):
             fn = 'jr.%.06d.json' % ii
             if os.path.exists(
                     os.path.join(os.path.abspath(self.work_path), fn)):
                 cur_hash = task_hashes[ii]
                 job_record = JobRecord(self.work_path,
                                        self.task_chunks[ii], fn)
                 if not job_record.check_finished(cur_hash):
                     if not self.check_spot_callback(
                             job_record.record[cur_hash]['context']
                         ['instance_id']):
                         self.dispatcher_list[ii]["entity"] = Entity(
                             job_record.record[cur_hash]['context']['ip'],
                             job_record.record[cur_hash]['context']
                             ['instance_id'], job_record)
                         self.make_dispatcher(ii)
                         self.dispatcher_list[ii][
                             "dispatcher_status"] = "unsubmitted"
                     else:
                         os.remove(
                             os.path.join(os.path.abspath(self.work_path),
                                          fn))
                 else:
                     self.dispatcher_list[ii][
                         "dispatcher_status"] = "finished"
         self.server_pool = self.get_server_pool()
         self.ip_pool = self.get_ip(self.server_pool)
         restart = True
     img_id = self.get_image_id(self.cloud_resources["img_name"])
     sg_id, vpc_id = self.get_sg_vpc_id()
     self.cloud_resources["template_id"] = self.create_template(
         img_id, sg_id, vpc_id)
     self.cloud_resources["vsw_id"] = self.get_vsw_id(vpc_id)
     if not restart:
         dlog.info("begin to create apg")
         self.cloud_resources["apg_id"] = self.create_apg()
         time.sleep(120)
         self.server_pool = self.get_server_pool()
         self.ip_pool = self.get_ip(self.server_pool)
     else:
         dlog.info("restart dispatcher")
Code Example #14
File: shell.py Project: njzjz/dpdispatcher
    def check_status(self, job):
        job_id = job.job_id
        # print('shell.check_status.job_id', job_id)
        # job_state = JobStatus.unknown
        if job_id == "":
            return JobStatus.unsubmitted

        if_job_exists = psutil.pid_exists(pid=job_id)
        if self.check_finish_tag(job=job):
            dlog.info(f"job: {job.job_hash} {job.job_id} finished")
            return JobStatus.finished

        if if_job_exists:
            return JobStatus.running
        else:
            return JobStatus.terminated
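
psutil.pid_exists takes an integer pid, while job_id is compared against the empty string above and so may be stored as a string. A small defensive sketch of that conversion; this is an assumption, and the shell backend may already store an int:

import psutil


def pid_alive(job_id):
    # sketch: accept "" / None / string pids and normalize before the lookup
    if job_id in ("", None):
        return False
    return psutil.pid_exists(int(job_id))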
Code Example #15
File: ALI.py Project: haidi-ustc/dpdispatcher
 def delete_apg(self):
     request = DeleteAutoProvisioningGroupRequest()
     request.set_accept_format('json')
     request.set_AutoProvisioningGroupId(self.cloud_resources["apg_id"])
     request.set_TerminateInstances(True)
     count = 0
     flag = 0
     while count < 10:
         try:
             response = self.client.do_action_with_exception(request)
             flag = 1
             break
         except ServerException as e:
             time.sleep(10)
             count += 1
     if not flag:
         dlog.info("delete apg failed, exit")
         sys.exit()
Code Example #16
File: ALI.py Project: haidi-ustc/dpdispatcher
 def catch_dispatcher_exception(self, ii):
     '''everything is okay: return 0
        ssh not active    : return 1
        machine callback  : return 2'''
     if self.check_spot_callback(
             self.dispatcher_list[ii]["entity"].instance_id):
         dlog.info("machine %s callback, ip: %s" %
                   (self.dispatcher_list[ii]["entity"].instance_id,
                    self.dispatcher_list[ii]["entity"].ip))
         return 2
     elif not self.dispatcher_list[ii]["dispatcher"].session._check_alive():
         try:
             self.dispatcher_list[ii]["dispatcher"].session.ensure_alive()
             return 0
         except RuntimeError:
             return 1
     else:
         return 0
Code Example #17
 def ensure_alive(self,
                  max_check=10,
                  sleep_time=10):
     count = 1
     while not self._check_alive():
         if count == max_check:
             raise RuntimeError('cannot connect ssh after %d failures at interval %d s' %
                                (max_check, sleep_time))
         dlog.info('connection check failed, try to reconnect to ' + self.remote_host)
         self._setup_ssh(hostname=self.remote_host,
                         port=self.remote_port,
                         username=self.remote_uname,
                         password=self.remote_password,
                         key_filename=self.local_key_filename,
                         timeout=self.remote_timeout,
                         passphrase=self.local_key_passphrase)
         count += 1
         time.sleep(sleep_time)
Code Example #18
File: ALI.py Project: haidi-ustc/dpdispatcher
 def change_apg_capasity(self, capasity):
     request = ModifyAutoProvisioningGroupRequest()
     request.set_accept_format('json')
     request.set_AutoProvisioningGroupId(self.cloud_resources["apg_id"])
     request.set_TotalTargetCapacity(str(capasity))
     request.set_SpotTargetCapacity(str(capasity))
     request.set_PayAsYouGoTargetCapacity("0")
     count = 0
     flag = 0
     while count < 10:
         try:
             response = self.client.do_action_with_exception(request)
             flag = 1
             break
          except (ServerException, ClientException):
             count += 1
             time.sleep(10)
     if not flag:
         dlog.info("change_apg_capasity failed, exit")
         sys.exit()
Code Example #19
File: submission.py Project: njzjz/dpdispatcher
 def try_recover_from_json(self):
     submission_file_name = "{submission_hash}.json".format(submission_hash=self.submission_hash)
     if_recover = self.machine.context.check_file_exists(submission_file_name)
     submission = None
     submission_dict = {}
      if if_recover:
         submission_dict_str = self.machine.context.read_file(fname=submission_file_name)
         submission_dict = json.loads(submission_dict_str)
         submission = Submission.deserialize(submission_dict=submission_dict)
         if self == submission:
             self.belonging_jobs = submission.belonging_jobs
             self.bind_machine(machine=self.machine)
             dlog.info(f"Find old submission; recover from json; "
                 f"submission.submission_hash:{submission.submission_hash}; "
                 f"machine.context.remote_root:{self.machine.context.remote_root}; "
                 f"submission.work_base:{submission.work_base};")
             # self = submission.bind_machine(machine=self.machine)
         else:
             print(self.serialize())
             print(submission.serialize())
             raise RuntimeError("Recover failed.")
Code Example #20
    def handle_unexpected_job_state(self):
        job_state = self.job_state

        if job_state == JobStatus.unknown:
            raise RuntimeError(
                "job_state for job {job} is unknown".format(job=self))

        if job_state == JobStatus.terminated:
            dlog.info(
                f"job: {self.job_hash} {self.job_id} terminated; restarting job"
            )
            if self.fail_count > 3:
                raise RuntimeError(
                    "job {job} failed more than 3 times".format(job=self))
            self.fail_count += 1
            self.submit_job()
            self.get_job_state()

        if job_state == JobStatus.unsubmitted:
            dlog.info(f"job: {self.job_hash} unsubmitted; submit it")
            if self.fail_count > 3:
                raise RuntimeError(
                    "job {job} failed more than 3 times".format(job=self))
            # self.fail_count += 1
            self.submit_job()
            dlog.info("job: {job_hash} submit; job_id is {job_id}".format(
                job_hash=self.job_hash, job_id=self.job_id))
Code Example #21
File: lsf.py Project: njzjz/dpdispatcher
    def check_status(self, job):
        try:
            job_id = job.job_id
        except AttributeError:
            return JobStatus.terminated
        if job_id == "":
            return JobStatus.unsubmitted
        ret, stdin, stdout, stderr \
            = self.context.block_call("bjobs " + job_id)
        err_str = stderr.read().decode('utf-8')
        if ("Job <%s> is not found" % job_id) in err_str:
            if self.check_finish_tag(job):
                return JobStatus.finished
            else:
                return JobStatus.terminated
        elif ret != 0:
            raise RuntimeError(
                "status command bjobs fails to execute.\n error info: %s \nreturn code %d\n"
                % (err_str, ret))
        status_out = stdout.read().decode('utf-8').split('\n')
        if len(status_out) < 2:
            return JobStatus.unknown
        else:
            status_line = status_out[1]
            status_word = status_line.split()[2]

        # ref: https://www.ibm.com/support/knowledgecenter/en/SSETD4_9.1.2/lsf_command_ref/bjobs.1.html
        if status_word in ["PEND", "WAIT", "PSUSP"]:
            return JobStatus.waiting
        elif status_word in ["RUN", "USUSP"]:
            return JobStatus.running
        elif status_word in ["DONE", "EXIT"]:
            if self.check_finish_tag(job):
                dlog.info(f"job: {job.job_hash} {job.job_id} finished")
                return JobStatus.finished
            else:
                return JobStatus.terminated
        else:
            return JobStatus.unknown
Code Example #22
def make_dispatcher(mdata,
                    mdata_resource=None,
                    work_path=None,
                    run_tasks=None,
                    group_size=None):
    if 'cloud_resources' in mdata:
        if mdata['cloud_resources']['cloud_platform'] == 'ali':
            from dpdispatcher.ALI import ALI
            dispatcher = ALI(mdata, mdata_resource, work_path, run_tasks,
                             group_size, mdata['cloud_resources'])
            dispatcher.init()
            return dispatcher
        elif mdata['cloud_resources']['cloud_platform'] == 'ucloud':
            pass
    else:
        hostname = mdata.get('hostname', None)
        #use_uuid = mdata.get('use_uuid', False)
        if hostname:
            context_type = 'ssh'
        else:
            context_type = 'local'
        try:
            batch_type = mdata['batch']
        except KeyError:
            dlog.info(
                'cannot find key "batch" in machine file, try to use deprecated key "machine_type"'
            )
            batch_type = mdata['machine_type']
        lazy_local = (mdata.get('lazy-local', False)) or (mdata.get(
            'lazy_local', False))
        if lazy_local and context_type == 'local':
            dlog.info('Dispatcher switches to the lazy local mode')
            context_type = 'lazy-local'
        disp = Dispatcher(mdata,
                          context_type=context_type,
                          batch_type=batch_type)
        return disp
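
A usage sketch for make_dispatcher with a minimal SSH machine dict. Only hostname and batch are keys the function actually reads here; the values are placeholders:

# placeholder machine description; a real machine file carries many more keys
mdata = {
    'hostname': 'hpc.example.com',
    'batch': 'slurm',
}
disp = make_dispatcher(mdata)  # -> Dispatcher(context_type='ssh', batch_type='slurm')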
Code Example #23
    def submit_jobs(self,
                    resources,
                    command,
                    work_path,
                    tasks,
                    group_size,
                    forward_common_files,
                    forward_task_files,
                    backward_task_files,
                    forward_task_deference=True,
                    outlog='log',
                    errlog='err'):
        self.backward_task_files = backward_task_files
        # task_chunks = [
        #     [os.path.basename(j) for j in tasks[i:i + group_size]] \
        #     for i in range(0, len(tasks), group_size)
        # ]
        task_chunks = _split_tasks(tasks, group_size)
        task_chunks_str = ['+'.join(ii) for ii in task_chunks]
        task_hashes = [
            sha1(ii.encode('utf-8')).hexdigest() for ii in task_chunks_str
        ]
        job_record = JobRecord(work_path, task_chunks, fname=self.jrname)
        job_record.dump()
        nchunks = len(task_chunks)

        job_list = []
        for ii in range(nchunks):
            cur_chunk = task_chunks[ii]
            cur_hash = task_hashes[ii]
            if not job_record.check_finished(cur_hash):
                # chunk is not finished
                # check if chunk is submitted
                submitted = job_record.check_submitted(cur_hash)
                if not submitted:
                    job_uuid = None
                else:
                    job_uuid = job_record.get_uuid(cur_hash)
                    dlog.debug("load uuid %s for chunk %s" %
                               (job_uuid, cur_hash))
                # communication context, batch system
                context = self.context(work_path, self.session, job_uuid)
                batch = self.batch(context, uuid_names=self.uuid_names)
                rjob = {'context': context, 'batch': batch}
                # upload files
                if not rjob['context'].check_file_exists(
                        rjob['batch'].upload_tag_name):
                    rjob['context'].upload('.', forward_common_files)
                    rjob['context'].upload(cur_chunk,
                                           forward_task_files,
                                           dereference=forward_task_deference)

                    rjob['context'].write_file(rjob['batch'].upload_tag_name,
                                               '')
                    dlog.debug('uploaded files for %s' % task_chunks_str[ii])
                # submit new or recover old submission
                if not submitted:
                    rjob['batch'].submit(cur_chunk,
                                         command,
                                         res=resources,
                                         outlog=outlog,
                                         errlog=errlog)
                    job_uuid = rjob['context'].job_uuid
                    dlog.debug('assigned uuid %s for %s ' %
                               (job_uuid, task_chunks_str[ii]))
                    dlog.info('new submission of %s for chunk %s' %
                              (job_uuid, cur_hash))
                else:
                    rjob['batch'].submit(cur_chunk,
                                         command,
                                         res=resources,
                                         outlog=outlog,
                                         errlog=errlog,
                                         restart=True)
                    dlog.info('restart from old submission %s for chunk %s' %
                              (job_uuid, cur_hash))
                # record job and its remote context
                job_list.append(rjob)
                ip = None
                instance_id = None
                if 'cloud_resources' in self.remote_profile:
                    ip = self.remote_profile['hostname']
                    instance_id = self.remote_profile['instance_id']
                job_record.record_remote_context(cur_hash, context.local_root,
                                                 context.remote_root, job_uuid,
                                                 ip, instance_id)
                job_record.dump()
            else:
                # finished job, append a None to list
                job_list.append(None)
        assert len(job_list) == nchunks
        job_handler = {
            'task_chunks': task_chunks,
            'job_list': job_list,
            'job_record': job_record,
            'command': command,
            'resources': resources,
            'outlog': outlog,
            'errlog': errlog,
            'backward_task_files': backward_task_files
        }
        return job_handler
Code Example #24
 def run_submission(self, *, exit_on_submit=False, clean=True):
     """main method to execute the submission.
     First, check whether old Submission exists on the remote machine, and try to recover from it.
     Second, upload the local files to the remote machine where the tasks to be executed.
     Third, run the submission defined previously.
     Forth, wait until the tasks in the submission finished and download the result file to local directory.
     if exit_on_submit is True, submission will exit.
     """
     if not self.belonging_jobs:
         self.generate_jobs()
     self.try_recover_from_json()
     if self.check_all_finished():
         dlog.info('info:check_all_finished: True')
     else:
         dlog.info('info:check_all_finished: False')
         self.upload_jobs()
         self.handle_unexpected_submission_state()
         self.submission_to_json()
     time.sleep(1)
     while not self.check_all_finished():
         if exit_on_submit is True:
             print(
                 '<<<<<<dpdispatcher<<<<<<SuccessSubmit<<<<<<exit 0<<<<<<')
             print(f"submission succeeded: {self.submission_hash}")
             print(f"at {self.machine.context.remote_root}")
             print("exit_on_submit")
             print(
                 '>>>>>>dpdispatcher>>>>>>SuccessSubmit>>>>>>exit 0>>>>>>')
             return self.serialize()
         try:
             time.sleep(40)
         except KeyboardInterrupt as e:
             self.submission_to_json()
             print(
                 '<<<<<<dpdispatcher<<<<<<KeyboardInterrupt<<<<<<exit 1<<<<<<'
             )
             print('submission: ', self.submission_hash)
             print(self.serialize())
             print(
                 '>>>>>>dpdispatcher>>>>>>KeyboardInterrupt>>>>>>exit 1>>>>>>'
             )
             exit(1)
         except SystemExit as e:
             self.submission_to_json()
             print('<<<<<<dpdispatcher<<<<<<SystemExit<<<<<<exit 2<<<<<<')
             print('submission: ', self.submission_hash)
             print(self.serialize())
             print('>>>>>>dpdispatcher>>>>>>SystemExit>>>>>>exit 2>>>>>>')
             exit(2)
         except Exception as e:
             self.submission_to_json()
             print('<<<<<<dpdispatcher<<<<<<{e}<<<<<<exit 3<<<<<<'.format(
                 e=e))
             print('submission: ', self.submission_hash)
             print(self.serialize())
             print('>>>>>>dpdispatcher>>>>>>{e}>>>>>>exit 3>>>>>>'.format(
                 e=e))
             exit(3)
         else:
             self.handle_unexpected_submission_state()
         finally:
             pass
     self.handle_unexpected_submission_state()
     self.submission_to_json()
     self.download_jobs()
     if clean:
         self.clean_jobs()
     return self.serialize()
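
For context, a hedged sketch of driving run_submission through dpdispatcher's public classes. The class names follow the project's documented API; the dict keys and values shown are illustrative and may differ between versions:

from dpdispatcher import Machine, Resources, Task, Submission

machine = Machine.load_from_dict({
    'batch_type': 'Slurm',
    'context_type': 'SSHContext',
    'local_root': './',
    'remote_root': '/scratch/work',  # placeholder remote path
    'remote_profile': {'hostname': 'hpc.example.com', 'username': 'user'},
})
resources = Resources.load_from_dict({
    'number_node': 1, 'cpu_per_node': 4, 'gpu_per_node': 0,
    'queue_name': 'normal', 'group_size': 5,
})
task = Task(command='echo hello', task_work_path='task.000/',
            forward_files=[], backward_files=['log'])
submission = Submission(work_base='./', machine=machine,
                        resources=resources, task_list=[task])
submission.run_submission(clean=True)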
Code Example #25
 def run_jobs(self,
              resources,
              command,
              work_path,
              tasks,
              group_size,
              forward_common_files,
              forward_task_files,
              backward_task_files,
              forward_task_deference=True,
              mark_failure=False,
              outlog='log',
              errlog='err'):
     ratio_failure = self.mdata_resources.get("ratio_failue", 0)
     while True:
         if self.check_all_dispatchers_finished(ratio_failure):
             self.clean()
             break
         self.exception_handling(ratio_failure)
         jj = self.nchunks - 1
         for ii in range(self.nchunks):
             dispatcher_status = self.check_dispatcher_status(ii)
             if dispatcher_status == "unsubmitted":
                 dlog.info(self.dispatcher_list[ii]["entity"].ip)
                 self.dispatcher_list[ii][
                     "entity"].job_handler = self.dispatcher_list[
                         ii]["dispatcher"].submit_jobs(
                             resources, command, work_path,
                             self.task_chunks[ii], group_size,
                             forward_common_files, forward_task_files,
                             backward_task_files, forward_task_deference,
                             outlog, errlog)
                 self.dispatcher_list[ii][
                     "entity"].job_record = self.dispatcher_list[ii][
                         "entity"].job_handler["job_record"]
                 self.dispatcher_list[ii]["dispatcher_status"] = "running"
             elif dispatcher_status == "finished" and self.dispatcher_list[
                     ii]["entity"]:
                 # no jobs in queue, delete current machine
                 # else add current machine to server_pool
                 entity = self.dispatcher_list[ii]["entity"]
                 status_list = [
                     item["dispatcher_status"]
                     for item in self.dispatcher_list
                 ]
                 flag = "unallocated" in status_list
                 if not flag:
                     self.delete(ii)
                     self.dispatcher_list[ii]["entity"] = None
                 else:
                     self.dispatcher_list[ii]["entity"] = None
                     self.server_pool.append(entity.instance_id)
                     self.ip_pool.append(entity.ip)
                     while (jj >= ii):
                         if (self.dispatcher_list[jj]["dispatcher_status"]
                                 == "unallocated"):
                             self.create(jj)
                             if (self.dispatcher_list[jj]
                                 ["dispatcher_status"] == "unsubmitted"):
                                 dlog.info(
                                     self.dispatcher_list[jj]["entity"].ip)
                                 self.dispatcher_list[jj][
                                     "entity"].job_handler = self.dispatcher_list[
                                         jj]["dispatcher"].submit_jobs(
                                             resources, command, work_path,
                                             self.task_chunks[jj],
                                             group_size,
                                             forward_common_files,
                                             forward_task_files,
                                             backward_task_files,
                                             forward_task_deference, outlog,
                                             errlog)
                                 self.dispatcher_list[jj][
                                     "entity"].job_record = self.dispatcher_list[
                                         jj]["entity"].job_handler[
                                             "job_record"]
                                 self.dispatcher_list[jj][
                                     "dispatcher_status"] = "running"
                             break
                         jj -= 1
             elif dispatcher_status == "running":
                 pass
             elif dispatcher_status == "unallocated":
                 # if len(server_pool) > 0: make_dispatcher
                 # else: pass
                 self.create(ii)
                 if self.dispatcher_list[ii][
                         "dispatcher_status"] == "unsubmitted":
                     dlog.info(self.dispatcher_list[ii]["entity"].ip)
                     self.dispatcher_list[ii][
                         "entity"].job_handler = self.dispatcher_list[ii][
                             "dispatcher"].submit_jobs(
                                 resources, command, work_path,
                                 self.task_chunks[ii], group_size,
                                 forward_common_files, forward_task_files,
                                 backward_task_files,
                                 forward_task_deference, outlog, errlog)
                     self.dispatcher_list[ii][
                         "entity"].job_record = self.dispatcher_list[ii][
                             "entity"].job_handler["job_record"]
                     self.dispatcher_list[ii][
                         "dispatcher_status"] = "running"
             elif dispatcher_status == "terminated":
                 pass
         self.update()
         time.sleep(10)
Code Example #26
 def check_finish_tag(self, job):
     job_tag_finished = job.job_hash + '_job_tag_finished'
      dlog.info('check if job finished: %s %s' % (job.job_id, job_tag_finished))
     return self.context.check_file_exists(job_tag_finished)
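
check_finish_tag relies on the convention that a completed job leaves an (empty) file named {job_hash}_job_tag_finished in the remote work directory. A sketch of the script tail that would produce such a tag; the actual script template lives elsewhere in the project:

# sketch: the generated batch script ends by touching the finish tag
job_hash = 'c0ffee'  # placeholder job hash
script_tail = 'touch %s_job_tag_finished\n' % job_hash
print(script_tail)  # appended so a later check_finish_tag call succeeds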
Code Example #27
File: local_context.py Project: njzjz/dpdispatcher
    def download(self,
                 submission,
                 check_exists=False,
                 mark_failure=True,
                 back_error=False):
        cwd = os.getcwd()

        for ii in submission.belonging_tasks:
            # for ii in job_dirs :
            local_job = os.path.join(self.local_root, ii.task_work_path)
            remote_job = os.path.join(self.remote_root, ii.task_work_path)
            # flist = remote_down_files
            flist = list(ii.backward_files)  # copy so glob('error*') does not modify the task's list
            if back_error:
                os.chdir(remote_job)
                flist += glob('error*')
                os.chdir(cwd)
            for jj in flist:
                rfile = os.path.join(remote_job, jj)
                lfile = os.path.join(local_job, jj)
                if not os.path.realpath(rfile) == os.path.realpath(lfile):
                    if (not os.path.exists(rfile)) and (
                            not os.path.exists(lfile)):
                        if check_exists:
                            if mark_failure:
                                tag_file_path = os.path.join(
                                    self.local_root, ii.task_work_path,
                                    'tag_failure_download_%s' % jj)
                                with open(tag_file_path, 'w') as fp:
                                    pass
                            else:
                                pass
                        else:
                            raise RuntimeError('do not find download file ' +
                                               rfile)
                    elif (not os.path.exists(rfile)) and (
                            os.path.exists(lfile)):
                        # already downloaded
                        pass
                    elif (os.path.exists(rfile)) and (
                            not os.path.exists(lfile)):
                        # trivial case, download happily
                        shutil.move(rfile, lfile)
                    elif (os.path.exists(rfile)) and (os.path.exists(lfile)):
                        # both exists, replace!
                        dlog.info('find existing %s, replacing by %s' %
                                  (lfile, rfile))
                        if os.path.isdir(lfile):
                            shutil.rmtree(lfile, ignore_errors=True)
                        elif os.path.isfile(lfile) or os.path.islink(lfile):
                            os.remove(lfile)
                        shutil.copyfile(rfile, lfile)
                        # shutil.move(rfile, lfile)
                    else:
                        raise RuntimeError('should not reach here!')
                else:
                    # do nothing in the case of linked files
                    pass
        os.chdir(cwd)
        # for ii in submission.belonging_tasks:
        # for ii in job_dirs :
        # local_job = os.path.join(self.local_root, ii.task_work_path)
        # remote_job = os.path.join(self.remote_root, ii.task_work_path)
        # flist = remote_down_files
        # flist = ii.backward_files
        local_job = self.local_root
        remote_job = self.remote_root
        flist = list(submission.backward_common_files)  # copy before extending with glob results
        if back_error:
            os.chdir(remote_job)
            flist += glob('error*')
            os.chdir(cwd)
        for jj in flist:
            rfile = os.path.join(remote_job, jj)
            lfile = os.path.join(local_job, jj)
            if not os.path.realpath(rfile) == os.path.realpath(lfile):
                if (not os.path.exists(rfile)) and (not os.path.exists(lfile)):
                    if check_exists:
                        if mark_failure:
                            with open(
                                    os.path.join(
                                        self.local_root,
                                        'tag_failure_download_%s' % jj),
                                    'w') as fp:
                                pass
                        else:
                            pass
                    else:
                        raise RuntimeError('do not find download file ' +
                                           rfile)
                elif (not os.path.exists(rfile)) and (os.path.exists(lfile)):
                    # already downloaded
                    pass
                elif (os.path.exists(rfile)) and (not os.path.exists(lfile)):
                    # trivial case, download happily
                    shutil.move(rfile, lfile)
                elif (os.path.exists(rfile)) and (os.path.exists(lfile)):
                    dlog.info(f"both exist rfile:{rfile}; lfile:{lfile}")
                    # both exists, replace!
                    dlog.info('find existing %s, replacing by %s' %
                              (lfile, rfile))
                    if os.path.isdir(lfile):
                        shutil.rmtree(lfile, ignore_errors=True)
                    elif os.path.isfile(lfile) or os.path.islink(lfile):
                        os.remove(lfile)
                    shutil.copyfile(rfile, lfile)
                    # shutil.move(rfile, lfile)
                else:
                    raise RuntimeError('should not reach here!')
            else:
                # do nothing in the case of linked files
                pass
        os.chdir(cwd)