def start_glidin_jobs(RE_info):
    """Start glide-in jobs (advert_job.py) on every unique host in RE_info.

    Attributes of ``RE_info`` that are read:
      remote_hosts              -- sequence of host names; a host may occur
                                   several times
      numberofprocesses         -- multiplied by a host's occurrence count
                                   to obtain the host's total node count
      number_glideins_per_host  -- number of glide-in jobs started per
                                   unique host
      remote_host_local_schedulers, projects, queues, workingdirectories
                                -- per-host settings, indexed by the
                                   position of the host's first occurrence
                                   in remote_hosts
      userproxy                 -- optional, indexed the same way
      advert_host               -- passed to advert_job.advert_glidin_job()
      advert_glidin_jobs        -- dict mapping host -> list of started job
                                   objects; every started job is appended

    The module-level flag ``CPR`` selects the URL scheme ("migol://" when it
    equals True, "gram://" otherwise).

    Returns None; progress is printed to stdout.
    """
    for host in set(RE_info.remote_hosts):
        # Look the host up once; every per-host setting below shares the
        # index of the host's first occurrence in remote_hosts.
        host_count = RE_info.remote_hosts.count(host)
        host_index = RE_info.remote_hosts.index(host)

        print("Number hosts: " + str(host_count)
              + " Number processes per job: " + str(RE_info.numberofprocesses)
              + " Number GlideIns per Host: "
              + str(RE_info.number_glideins_per_host)
              + " Index: " + str(host_index))

        nodes = int(host_count) * int(RE_info.numberofprocesses)
        lrms = RE_info.remote_host_local_schedulers[host_index]
        project = RE_info.projects[host_index]
        queue = RE_info.queues[host_index]
        workingdirectory = RE_info.workingdirectories[host_index]

        # The user proxy is optional: a missing attribute, a None value or a
        # too-short/incomplete container all mean "no proxy".  Anything else
        # (e.g. KeyboardInterrupt) is no longer swallowed.
        try:
            userproxy = RE_info.userproxy[host_index]
        except (AttributeError, IndexError, KeyError, TypeError):
            userproxy = None

        # Compared with == on purpose: CPR is defined outside this function
        # and only values equal to True select the migol scheme.
        if CPR == True:
            lrms_url = "migol://"
        else:
            lrms_url = "gram://"
        lrms_url = lrms_url + host + "/" + "jobmanager-" + lrms

        nodes_per_glidein = nodes
        num_glidein = RE_info.number_glideins_per_host
        if num_glidein is not None and num_glidein != 0:
            # Floor division: nodes must be divisible by num_glidein, any
            # remainder is dropped.
            nodes_per_glidein = nodes // num_glidein

        print("Glidin URL: " + lrms_url)
        print("hosts: " + str(host)
              + " number of replica_processes (total): " + str(nodes))
        print("number glide-ins: " + str(num_glidein)
              + " node per glidein: " + str(nodes_per_glidein))
        print("Project: " + project + " Queue: " + queue
              + " Working Dir: " + workingdirectory)

        # start job
        # NOTE(review): a None number_glideins_per_host passes the guard
        # above but makes range() raise TypeError here -- confirm whether
        # None is a legal value and what it should mean.
        for ng in range(0, num_glidein):
            advert_glidin_job = advert_job.advert_glidin_job(
                RE_info.advert_host)
            advert_glidin_job.start_glidin_job(lrms_url,
                                               None,
                                               nodes_per_glidein,
                                               queue,
                                               project,
                                               workingdirectory,
                                               userproxy,
                                               None)
            # Create the per-host job list on first use, then record the job.
            RE_info.advert_glidin_jobs.setdefault(host, []).append(
                advert_glidin_job)
            print("Started: " + str(advert_glidin_job)
                  + " Glide-In Job Number: " + str(ng))
def init_bigjobs(self):
    """Launch one bigjob for every entry returned by self.schedule_bigjobs().

    The schedule is stored in ``self.bigjob_list``.  Every entry is a dict
    from which "gram_url", "re_agent", "number_cores", "queue" and
    "allocation" are read.  Once its bigjob has been started, three keys are
    written back into the entry:

      "bigjob"      -- the advert_job.advert_glidin_job object that was
                       started, kept for later reference
      "free_cores"  -- int(entry["number_cores"])
      "lock"        -- a fresh threading.Lock, meant to guard changes to the
                       number of free nodes
    """
    self.bigjob_list = self.schedule_bigjobs()

    for entry in self.bigjob_list:
        target_url = entry["gram_url"]
        logging.debug("start bigjob at: " + target_url)

        pilot = advert_job.advert_glidin_job(self.advert_host)
        pilot.start_glidin_job(
            target_url,
            entry["re_agent"],
            entry["number_cores"],
            entry["queue"],
            entry["allocation"],
            "$(HOME)",
            None,
            None,
        )

        # Bookkeeping is written only after the job was started.
        entry["bigjob"] = pilot
        entry["free_cores"] = int(entry["number_cores"])
        entry["lock"] = threading.Lock()
def start_glidin_jobs(self):
    """Start the glide-in jobs (advert_job.py) for every resource in self.resourceMap.

    Every value of ``self.resourceMap`` is a dict describing one resource.
    Keys read: "host", "number_glide_in", "number_nodes", "scheduler",
    "allocation", "queue", "working_dir_root" and, optionally, "userproxy".

    For each resource, ``int(number_glide_in)`` jobs are created with
    ``advert_job.advert_glidin_job(self.advert_host)`` and started with
    ``self.re_agent`` as agent; the resource's nodes are split evenly among
    them.  Every started job object is appended to the resource's
    "glide_in_jobs" list, which is created on first use.  ``self.cpr``
    selects the URL scheme ("migol://" when it equals True, "gram://"
    otherwise).

    Returns None; progress is printed to stdout.
    """
    for resource_cfg in self.resourceMap.values():
        host = resource_cfg["host"]
        num_glidein = int(resource_cfg["number_glide_in"])
        nodes = int(resource_cfg["number_nodes"])
        lrms = resource_cfg["scheduler"]
        project = resource_cfg["allocation"]
        queue = resource_cfg["queue"]
        workingdirectory = resource_cfg["working_dir_root"]
        # "userproxy" is optional; a missing key means "no proxy".
        userproxy = resource_cfg.get("userproxy")

        # Compared with == on purpose: only values equal to True select the
        # migol scheme (self.cpr is set outside this method).
        if self.cpr == True:
            lrms_url = "migol://"
        else:
            lrms_url = "gram://"
        lrms_url = lrms_url + host + "/" + "jobmanager-" + lrms

        nodes_per_glidein = nodes
        if num_glidein != 0:
            # Floor division: nodes must be divisible by num_glidein, any
            # remainder is dropped.
            nodes_per_glidein = nodes // num_glidein

        print("Glidin URL: " + lrms_url)
        # Report the host name, not the whole resource dict.
        print("hosts: " + str(host)
              + " number of replica_processes (total): " + str(nodes))
        print("number glide-ins: " + str(num_glidein)
              + " node per glidein: " + str(nodes_per_glidein))
        print("Project: " + project + " Queue: " + queue
              + " Working Dir: " + workingdirectory)

        # start job
        for ng in range(num_glidein):
            advert_glidin_job = advert_job.advert_glidin_job(self.advert_host)
            advert_glidin_job.start_glidin_job(lrms_url,
                                               self.re_agent,
                                               nodes_per_glidein,
                                               queue,
                                               project,
                                               workingdirectory,
                                               userproxy,
                                               None)
            # Create the job list on first use, then record the job.
            resource_cfg.setdefault("glide_in_jobs", []).append(
                advert_glidin_job)
            print("Started: " + str(advert_glidin_job)
                  + " Glide-In Job Number: " + str(ng))
def init_bigjobs(self):
    """Start a bigjob on each resource chosen by self.schedule_bigjobs().

    The list returned by ``self.schedule_bigjobs()`` is kept in
    ``self.bigjob_list``.  Each element is a dict providing "gram_url",
    "re_agent", "number_cores", "queue" and "allocation".  After the
    element's bigjob was started, the element is extended with:

      "bigjob"      -- the started advert_job.advert_glidin_job object
                       (stored for later reference)
      "free_cores"  -- int(element["number_cores"])
      "lock"        -- a new threading.Lock, intended for modifying the
                       number of free nodes
    """
    schedule = self.schedule_bigjobs()
    self.bigjob_list = schedule

    for slot in schedule:
        gram_url = slot["gram_url"]
        logging.debug("start bigjob at: " + gram_url)

        started_job = advert_job.advert_glidin_job(self.advert_host)
        started_job.start_glidin_job(gram_url,
                                     slot["re_agent"],
                                     slot["number_cores"],
                                     slot["queue"],
                                     slot["allocation"],
                                     "$(HOME)",
                                     None,
                                     None)

        # Record the running job and its bookkeeping data in the slot.
        slot["bigjob"] = started_job
        slot["free_cores"] = int(slot["number_cores"])
        slot["lock"] = threading.Lock()
def start_glidin_jobs(RE_info):
    """Start glide-in jobs (advert_job.py) on every unique host in RE_info.

    Attributes of ``RE_info`` that are read:
      remote_hosts              -- sequence of host names; a host may occur
                                   several times
      numberofprocesses         -- multiplied by a host's occurrence count
                                   to obtain the host's total node count
      number_glideins_per_host  -- number of glide-in jobs started per
                                   unique host
      remote_host_local_schedulers, projects, queues, workingdirectories
                                -- per-host settings, indexed by the
                                   position of the host's first occurrence
                                   in remote_hosts
      userproxy                 -- optional, indexed the same way
      advert_host               -- passed to advert_job.advert_glidin_job()
      advert_glidin_jobs        -- dict mapping host -> list of started job
                                   objects; every started job is appended

    The module-level flag ``CPR`` selects the URL scheme ("migol://" when it
    equals True, "gram://" otherwise).

    Returns None; progress, including the local start time of each job, is
    printed to stdout.
    """
    for host in set(RE_info.remote_hosts):
        # Look the host up once; every per-host setting below shares the
        # index of the host's first occurrence in remote_hosts.
        host_count = RE_info.remote_hosts.count(host)
        host_index = RE_info.remote_hosts.index(host)

        print("Number hosts: " + str(host_count)
              + " Number processes per job: " + str(RE_info.numberofprocesses)
              + " Number GlideIns per Host: "
              + str(RE_info.number_glideins_per_host)
              + " Index: " + str(host_index))

        nodes = int(host_count) * int(RE_info.numberofprocesses)
        lrms = RE_info.remote_host_local_schedulers[host_index]
        project = RE_info.projects[host_index]
        queue = RE_info.queues[host_index]
        workingdirectory = RE_info.workingdirectories[host_index]

        # The user proxy is optional: a missing attribute, a None value or a
        # too-short/incomplete container all mean "no proxy".  Anything else
        # (e.g. KeyboardInterrupt) is no longer swallowed.
        try:
            userproxy = RE_info.userproxy[host_index]
        except (AttributeError, IndexError, KeyError, TypeError):
            userproxy = None

        # Compared with == on purpose: CPR is defined outside this function
        # and only values equal to True select the migol scheme.
        if CPR == True:
            lrms_url = "migol://"
        else:
            lrms_url = "gram://"
        lrms_url = lrms_url + host + "/" + "jobmanager-" + lrms

        nodes_per_glidein = nodes
        num_glidein = RE_info.number_glideins_per_host
        if num_glidein is not None and num_glidein != 0:
            # Floor division: nodes must be divisible by num_glidein, any
            # remainder is dropped.
            nodes_per_glidein = nodes // num_glidein

        print("Glidin URL: " + lrms_url)
        print("hosts: " + str(host)
              + " number of replica_processes (total): " + str(nodes))
        print("number glide-ins: " + str(num_glidein)
              + " node per glidein: " + str(nodes_per_glidein))
        print("Project: " + project + " Queue: " + queue
              + " Working Dir: " + workingdirectory)

        # start job
        # NOTE(review): a None number_glideins_per_host passes the guard
        # above but makes range() raise TypeError here -- confirm whether
        # None is a legal value and what it should mean.
        for ng in range(0, num_glidein):
            advert_glidin_job = advert_job.advert_glidin_job(
                RE_info.advert_host)
            advert_glidin_job.start_glidin_job(lrms_url,
                                               None,
                                               nodes_per_glidein,
                                               queue,
                                               project,
                                               workingdirectory,
                                               userproxy,
                                               None)
            # Create the per-host job list on first use, then record the job.
            RE_info.advert_glidin_jobs.setdefault(host, []).append(
                advert_glidin_job)
            # time.asctime() without argument formats the current local time.
            print("Started: " + str(advert_glidin_job)
                  + " Glide-In Job Number: " + str(ng)
                  + time.asctime())
def start_glidin_jobs(self):
    """Start the glide-in jobs (advert_job.py) for every resource in self.resourceMap.

    Every value of ``self.resourceMap`` is a dict describing one resource.
    Keys read: "host", "number_glide_in", "number_nodes", "scheduler",
    "allocation", "queue", "working_dir_root" and, optionally, "userproxy".

    For each resource, ``int(number_glide_in)`` jobs are created with
    ``advert_job.advert_glidin_job(self.advert_host)`` and started with
    ``self.re_agent`` as agent; the resource's nodes are split evenly among
    them.  Every started job object is appended to the resource's
    "glide_in_jobs" list, which is created on first use.  ``self.cpr``
    selects the URL scheme ("migol://" when it equals True, "gram://"
    otherwise).

    Returns None; progress is printed to stdout.
    """
    for resource_cfg in self.resourceMap.values():
        host = resource_cfg["host"]
        num_glidein = int(resource_cfg["number_glide_in"])
        nodes = int(resource_cfg["number_nodes"])
        lrms = resource_cfg["scheduler"]
        project = resource_cfg["allocation"]
        queue = resource_cfg["queue"]
        workingdirectory = resource_cfg["working_dir_root"]
        # "userproxy" is optional; a missing key means "no proxy".
        userproxy = resource_cfg.get("userproxy")

        # Compared with == on purpose: only values equal to True select the
        # migol scheme (self.cpr is set outside this method).
        if self.cpr == True:
            lrms_url = "migol://"
        else:
            lrms_url = "gram://"
        lrms_url = lrms_url + host + "/" + "jobmanager-" + lrms

        nodes_per_glidein = nodes
        if num_glidein != 0:
            # Floor division: nodes must be divisible by num_glidein, any
            # remainder is dropped.
            nodes_per_glidein = nodes // num_glidein

        print("Glidin URL: " + lrms_url)
        # Report the host name, not the whole resource dict.
        print("hosts: " + str(host)
              + " number of replica_processes (total): " + str(nodes))
        print("number glide-ins: " + str(num_glidein)
              + " node per glidein: " + str(nodes_per_glidein))
        print("Project: " + project + " Queue: " + queue
              + " Working Dir: " + workingdirectory)

        # start job
        for ng in range(num_glidein):
            advert_glidin_job = advert_job.advert_glidin_job(self.advert_host)
            advert_glidin_job.start_glidin_job(
                lrms_url,
                self.re_agent,
                nodes_per_glidein,
                queue,
                project,
                workingdirectory,
                userproxy,
                None,
            )
            # Create the job list on first use, then record the job.
            resource_cfg.setdefault("glide_in_jobs", []).append(
                advert_glidin_job)
            print("Started: " + str(advert_glidin_job)
                  + " Glide-In Job Number: " + str(ng))
""" Test Job Submission via Advert """
if __name__ == "__main__":

    # ---- settings of the BigJob (glide-in) that is launched ----
    lrms_url = "gram://qb1.loni.org/jobmanager-pbs"  # resource url
    re_agent = os.getcwd() + "/advert_launcher.sh"   # path to agent
    nodes = 64                                       # number nodes for agent
    queue = "workq"                                  # queue (PBS)
    project = "loni_jha_big"                         # allocation
    workingdirectory = "/tmp"                        # working directory
    # userproxy (not supported yet due to context issue w/ SAGA)
    userproxy = None

    # ---- start Glide-In job (Replica-Agent) ----
    print("Start Glide-In at: " + lrms_url)
    advert_glidin_job = advert_job.advert_glidin_job(advert_host)
    advert_glidin_job.start_glidin_job(
        lrms_url, re_agent, nodes, queue, project, workingdirectory,
        userproxy)
    print("BigJob URL: " + advert_glidin_job.glidin_url)

    # ---- describe the sub-job to be submitted through the big-job ----
    jd = saga.job.description()
    jd.executable = "/home/luckow/src/REMDgManager/bigjob/main"
    jd.number_of_processes = "2"
    jd.spmd_variation = "mpi"
    jd.arguments = [""]
""" Test Job Submission via Advert """
if __name__ == "__main__":

    # ---- settings of the BigJob (glide-in) that is launched ----
    lrms_url = "gram://qb1.loni.org/jobmanager-pbs"  # resource url
    re_agent = os.getcwd() + "/advert_launcher.sh"   # path to agent
    nodes = 64                                       # number nodes for agent
    queue = "workq"                                  # queue (PBS)
    project = "loni_jha_big"                         # allocation
    workingdirectory = "/tmp"                        # working directory
    # userproxy (not supported yet due to context issue w/ SAGA)
    userproxy = None

    # ---- start Glide-In job (Replica-Agent) ----
    print("Start Glide-In at: " + lrms_url)
    advert_glidin_job = advert_job.advert_glidin_job(advert_host)
    advert_glidin_job.start_glidin_job(
        lrms_url, re_agent, nodes, queue, project, workingdirectory,
        userproxy, None)
    print("BigJob URL: " + advert_glidin_job.glidin_url)

    # ---- describe the sub-job to be submitted through the big-job ----
    jd = saga.job.description()
    jd.executable = "/home/luckow/src/REMDgManager/bigjob/main"
    jd.number_of_processes = "2"
    jd.spmd_variation = "mpi"