def runner_sqlschedules(options, dbdescr, experiment, *strings):
    """
    Schedule multiple jobs from the command line to run using the sql command.

    Usage: sqlschedules <tablepath> <experiment> <parameters>

    See the sqlschedule command for <tablepath> <experiment>.
    We accept the dbidispatch syntax, where <parameters> is interpreted
    as follows:

    The parameters may contain one or many segments of the form
    {{a,b,c,d}}, which generate multiple jobs to execute. Each segment
    is replaced by one of its comma-separated values: the first job gets
    the value a, the second the value b, and so on. If there are several
    segments, the cross-product of their possible values is generated.
    """
    parser = getattr(parse, options.parser, None) or resolve(options.parser)
    db = open_db(dbdescr, serial=True)

    # resolve(experiment)  # we try to load the function associated to the
    # experiment

    verbose = not options.quiet
    (commands, choice_args) = generate_commands(strings)
    if verbose:
        print commands, choice_args

    if options.force:
        for cmd in commands:
            state = parser(*cmd)
            state['jobman.experiment'] = experiment
            sql.add_experiments_to_db([state] * (options.repeat),
                                      db, verbose=verbose, force_dup=True)
        if options.quiet:
            print "Added %d jobs to the db" % len(commands)
    else:
        # If the first insert fails, we don't force the others since the
        # --force option was not given.
        failed = 0
        for cmd in commands:
            state = parser(*cmd)
            state['jobman.experiment'] = experiment
            ret = sql.add_experiments_to_db([state], db, verbose=verbose,
                                            force_dup=options.force)
            if ret[0][0]:
                sql.add_experiments_to_db([state] * (options.repeat - 1),
                                          db, verbose=verbose, force_dup=True)
            else:
                failed += 1
                if verbose:
                    print "The last cmd failed to insert; we won't repeat it. Use --force to allow duplicate jobs in the db."
        print "Added", len(commands) - failed, "of", len(commands), "jobs"
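

# Illustrative sketch (not jobman's actual generate_commands): the {{a,b,c}}
# expansion described in the docstring above can be reproduced with
# itertools.product, roughly as follows. The helper name and its exact return
# value are assumptions made for illustration only.
def _example_expand_dbidispatch(strings):
    """Expand {{...}} segments into the cross-product of commands.

    >>> _example_expand_dbidispatch(['lr={{0.01,0.1}}', 'n=10'])
    [['lr=0.01', 'n=10'], ['lr=0.1', 'n=10']]
    """
    import itertools
    import re
    # For each argument, collect the list of values it can take.
    choices = []
    for s in strings:
        m = re.search(r'\{\{(.*?)\}\}', s)
        if m:
            values = m.group(1).split(',')
            choices.append([s[:m.start()] + v + s[m.end():] for v in values])
        else:
            choices.append([s])
    # The cross-product over all arguments gives one command per job.
    return [list(cmd) for cmd in itertools.product(*choices)]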
def db(dbstr):
    """
    DEPRECATED: call api0.open_db(dbstr), which has the same API.
    """
    import warnings
    warnings.warn("sql.db is deprecated, call api0.open_db",
                  DeprecationWarning)
    import api0
    return api0.open_db(dbstr)
def runner_sqlreload(options, dbdescr, table_dir, *ids):
    """
    Put data from the experiment directory back into the sql db.

    Useful in case you deleted the db or part of it.

    Example use:

        jobman sqlreload [--all] postgres://user:pass@host[:port]/dbname?table=tablename ~/expdir/dbname/tablename 10 11
    """
    if table_dir[-1] == os.path.sep:
        table_dir = table_dir[:-1]

    db = open_db(dbdescr, serial=True)

    assert os.path.split(table_dir)[-1] == db.tablename
    assert os.path.split(os.path.split(table_dir)[0])[-1] == db.dbname
    expdir = os.path.split(os.path.split(table_dir)[0])[0]

    if options.all:
        assert len(ids) == 0
        ids = []
        for p in os.listdir(table_dir):
            try:
                ids += [int(p)]
            except ValueError:
                print 'Skipping entry %s, as it is not a jobman id.' % p
    else:
        # Ensure that ids are all integers.
        ids = [int(d) for d in ids]

    try:
        session = db.session()
        for id in ids:
            # Get the state dict from the file
            file_name = '%s/%i/current.conf' % (table_dir, id)
            file_state = parse.filemerge(file_name)

            # Get the state dict from the DB
            db_state = db.get(id)
            if db_state is None:
                # No such dict exists, we have to insert it, with the right id
                file_state['jobman.id'] = id
                db.insert(file_state, session=session)
            else:
                db_state.update_in_session(file_state, session=session)
    finally:
        session.close()
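

# Minimal sketch of the path convention assumed by runner_sqlreload above:
# <expdir>/<dbname>/<tablename>/<job id>/current.conf. The helper below only
# splits such a path; it is an illustration, not part of jobman's API.
def _example_split_table_dir(table_dir):
    """
    >>> _example_split_table_dir('/home/me/expdir/mydb/mytable')
    ('/home/me/expdir', 'mydb', 'mytable')
    """
    table_dir = table_dir.rstrip(os.path.sep)
    head, tablename = os.path.split(table_dir)
    expdir, dbname = os.path.split(head)
    return expdir, dbname, tablename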
def runner_sqlschedule(options, dbdescr, experiment, *strings):
    """
    Schedule a job to run using the sql command.

    Usage: sqlschedule <tablepath> <experiment> <parameters>

    See the experiment and parameters topics for more information about
    these parameters.

    Assuming that a postgres database is running on port `port` of
    `host`, contains a database called `dbname` and that `user` has the
    permissions to create, read and modify tables on that database,
    tablepath should be of the following form:

        postgres://user:pass@host[:port]/dbname?table=tablename

    If no table is named `tablename`, one will be created
    automatically. The state corresponding to the experiment and
    parameters specified in the command will be saved in the database,
    but no experiment will be run.

    To run an experiment scheduled using sqlschedule, see the sql
    command.

    Example use:

        jobman sqlschedule postgres://user:pass@host[:port]/dbname?table=tablename \\
            mymodule.my_experiment \\
            stopper::pylearn.stopper.nsteps \\ # use pylearn.stopper.nsteps
            stopper.n=10000 \\ # the argument "n" of nsteps is 10000
            lr=0.03

    You can use jobman.experiments.example1 as a working
    mymodule.my_experiment.
    """
    db = open_db(dbdescr, serial=True)

    parser = getattr(parse, options.parser, None) or resolve(options.parser)
    state = parser(*strings)
    # we try to load the function associated to the experiment
    resolve(experiment)
    state['jobman.experiment'] = experiment
    sql.add_experiments_to_db([state], db, verbose=1, force_dup=options.force)
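

# A much-simplified stand-in for the standard <parameters> parser used above,
# provided only to illustrate the "key=value" form of the command line. The
# real jobman parser also supports the "key::constructor" syntax and type
# conversion, which are not reproduced here; the helper name is hypothetical.
def _example_parse_parameters(*strings):
    """
    >>> sorted(_example_parse_parameters('lr=0.03', 'stopper.n=10000').items())
    [('lr', '0.03'), ('stopper.n', '10000')]
    """
    state = {}
    for s in strings:
        key, value = s.split('=', 1)
        state[key] = value
    return state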
def runner_sqlview(options, dbdescr, viewname):
    """
    Create/drop a view of the scheduled experiments.

    Usage: jobman sqlview <tablepath> <viewname>

    The jobs should be scheduled first with the sqlschedule command.
    Also, it is more interesting to execute it after some experiments
    have finished.

    Assuming that a postgres database is running on port `port` of
    `host`, contains a database called `dbname` and that `user` has the
    permissions to create, read and modify tables on that database,
    tablepath should be of the following form:

        postgres://user:pass@host[:port]/dbname?table=tablename

    Example use:

    Suppose the following command was executed and at least one
    experiment has finished:

        jobman sqlschedule postgres://user:pass@host[:port]/dbname?table=tablename \\
            mymodule.my_experiment \\
            stopper::pylearn.stopper.nsteps \\ # use pylearn.stopper.nsteps
            stopper.n=10000 \\ # the argument "n" of nsteps is 10000
            lr=0.03

    Then the following creates a view with a column for each parameter
    and each key=value pair set in the state by the jobs:

        jobman sqlview postgres://user:pass@host[:port]/dbname?table=tablename viewname

    You can use jobman.experiments.example1 as a working
    mymodule.my_experiment.
    """
    db = open_db(dbdescr, serial=True)
    if options.drop:
        db.dropView(viewname, not options.quiet)
    else:
        db.createView(viewname, not options.quiet)
def cachesync_runner(options, dir):
    """
    Syncs the working directory of jobs with the remote cache.

    Usage: cachesync [options] <path_to_job(s)_workingdir(s)>

    (For this to work, though, you need to call channel.save() at least
    once in your job before calling cachesync; otherwise the host_name
    and host_workdir won't be set in current.conf.)

    For the purpose of this command, see below.

    It can sync a single directory, which must contain a "current.conf"
    file specifying the remote host and directory.

    Example for a single directory:

        # this syncs the current directory
        jobman cachesync .

        # this syncs another directory
        jobman cachesync myexperiment/mydbname/mytablename/5

    It can also sync all subdirectories of the directory you specify.
    You must use the -m (or --multiple) option for this. Each
    subdirectory (numbered 1, 2 ... etc. based on job number) must
    contain a "current.conf" file specifying the remote host and
    directory.

    Examples:

        # syncs all subdirectories 1, 2 ...
        jobman cachesync -m myexperiment/mydbname/mytablename

    Normally completed jobs (status = DONE) won't be synced, based on
    the "status" set in current.conf. You can still force a sync by
    using the -f or --force option.

    --sql=dbdesc is an option that allows fetching from the db info
    that is missing from the current.conf file. Same syntax as the sql
    command.

    Purpose of this command
    -----------------------

    To clarify the purpose of the cachesync command: when launching
    jobs, working directories are created for each job. For example,
    when launching:

        dbidispatch jobman sql 'postgres://user@gershwin/mydatabase?table=mytable' .

    a directory ``mydatabase`` with subdirectory ``mytable`` will be
    created, containing further subdirectories numbered 1, 2 and 3
    (based on job ids in the DB).

    These directories are the working directories of each job. They
    contain a copy of the stdout and stderr of the job, along with
    copies of the jobman state (dictionaries in .conf files) and
    further files created by the job.

    Yet the content of those directories is not updated live during the
    job. The job runs on a cluster node, and those files are first
    written to a temporary directory on the node itself. Then, when
    calling channel.save() or when the job finishes, they're rsync'ed
    over to the working directory where they should be.

    This is annoying, since one can't see how the jobs are doing unless
    one SSHes into the cluster node and finds the temporary directory.
    To alleviate this problem, the cachesync command copies the files
    over to the working directory whenever asked to, so it's easier to
    probe the state of running jobs.
    """
    force = options.force
    multiple = options.multiple
    dbdesc = options.sql
    all_jobs = None
    if dbdesc:
        import api0
        db = api0.open_db(dbdesc, serial=True)
        try:
            session = db.session()
            q = db.query(session)
            all_jobs = q.all()
        finally:
            try:
                session.close()
            except:
                pass

    if multiple:
        sync_all_directories(dir, all_jobs, force)
    else:
        sync_single_directory(dir, all_jobs, force)
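

# Hypothetical helper sketching the "skip completed jobs unless --force"
# behaviour described in the docstring above. It assumes current.conf stores
# a line of the form "jobman.status = 2" (2 == DONE); the file name and key
# come from the docstring, while the parsing below is a simplification and
# not jobman's actual implementation.
def _example_should_sync(job_dir, force=False):
    conf = os.path.join(job_dir, 'current.conf')
    if force or not os.path.exists(conf):
        return True
    for line in open(conf):
        if line.split('=')[0].strip() == 'jobman.status':
            # Don't re-sync jobs already marked DONE (status 2).
            return int(line.split('=', 1)[1]) != 2
    return True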
def check_serve(options, dbdescr):
    """Check that all jobs marked as running in the db are also marked as
    running by some cluster job scheduler.

    Print jobs that could have crashed or been killed.

    Example usage:

        jobman check <tablepath>
    """
    db = open_db(dbdescr, serial=True)
    try:
        session = db.session()
        q = db.query(session)
        idle = q.filter_eq('jobman.status', 0).all()
        running = q.filter_eq('jobman.status', 1).all()
        finished = q.filter_eq('jobman.status', 2).all()
        err_start = q.filter_eq('jobman.status', 3).all()
        err_sync = q.filter_eq('jobman.status', 4).all()
        err_run = q.filter_eq('jobman.status', 5).all()
        canceled = q.filter_eq('jobman.status', -1).all()
        info = []
        print ("I: number of jobs by status (%d:START, %d:RUNNING, %d:DONE,"
               " %d:ERR_START, %d:ERR_SYNC, %d:ERR_RUN, %d:CANCELED)"
               " in the db (%d:TOTAL)" % (
                   len(idle), len(running), len(finished), len(err_start),
                   len(err_sync), len(err_run), len(canceled), len(q.all())))
        print

        # warn about jobs in error status
        if len(err_start):
            print "E: The following jobs had an error when starting them",
            print [j.id for j in err_start]
        if len(err_sync):
            print "E: The following jobs had an error while doing the rsync",
            print [j.id for j in err_sync]
        if len(err_run):
            print "E: The following jobs had an error while running",
            print [j.id for j in err_run]
        print

        # check that no 2 jobs run in the same slot+host
        host_slot = {}
        now = time.time()

        # check jobs still running
        for idx, r in enumerate(running):
            condor_job = False
            sge_job = False
            pbs_job = False

            # find the backend used for the job.
            if ("jobman.sql.condor_slot" in r.keys() and
                    r["jobman.sql.condor_slot"] != "no_condor_slot"):
                condor_job = True
            if "jobman.sql.sge_task_id" in r.keys():
                sge_job = True
            if "jobman.sql.pbs_task_id" in r.keys():
                pbs_job = True

            if (sge_job + condor_job + pbs_job) > 1:
                print "W: Job %d has info suggesting it runs on more than one of condor, sge and pbs. We can't determine the right one." % r.id
                continue
            if not (sge_job or condor_job or pbs_job):
                print "W: Job %d doesn't have condor, sge or pbs info attached to it. We can't determine if it is still running on the cluster. Was the job started by an old version of jobman?" % r.id
                continue

            # check that the job is still running.
            if sge_job:
                check_running_sge_jobs(r, now)
                continue
            if pbs_job:
                check_running_pbs_jobs(r, now)
                continue
            if not condor_job:
                print "W: Job %d is running but doesn't have the information needed to check if it still runs on the jobs scheduler condor/pbs/torque/sge. Possible reasons: the job started with an old version of jobman or on another jobs scheduler." % r.id
                continue

            # We suppose the job started on condor.
            try:
                h = r["jobman.sql.host_name"]
                s = r["jobman.sql.condor_slot"]
            except KeyError, e:
                print "W: Job %d is running but doesn't have the info needed to check it against condor. Possible reasons: the job started with an old version of jobman or without condor." % r.id
                continue

            st = s + '@' + h
            if host_slot.has_key(st):
                try:
                    t0 = str_time(
                        now - running[host_slot[st]]["jobman.sql.start_time"])
                except KeyError:
                    t0 = 'NO_START_TIME'
                try:
                    t1 = str_time(now - r["jobman.sql.start_time"])
                except KeyError:
                    t1 = 'NO_START_TIME'
                print 'E: Job %d and Job %d are running on the same condor slot/host combination.\n running time: %s and %s' % (
                    running[host_slot[st]].id, r.id, t0, t1)
            else:
                host_slot[st] = idx

            gjid = None
            if "jobman.sql.condor_global_job_id" in r.keys():
                gjid = r["jobman.sql.condor_global_job_id"]
            elif "jobman.sql.condor_GlobalJobId" in r.keys():
                gjid = r["jobman.sql.condor_GlobalJobId"]
            if gjid is not None:
                submit_host = gjid.split('#')[0]
                # import pdb;pdb.set_trace()
                # take care of the quoting; condor requires that "" be used
                # around strings.
                cmd = "condor_q -name %s -const 'GlobalJobId==\"%s\"' -format '%%s' 'JobStatus'" % (
                    submit_host, gjid)
                p = Popen(cmd, shell=True, stdout=PIPE)
                ret = p.wait()
                lines = p.stdout.readlines()
                if ret == 127 and len(lines) == 0:
                    print "W: Job %d. condor_q failed. Is condor installed on this computer?" % r.id
                    continue
                if len(lines) == 0:
                    print "E: Job %d is marked as running in the db as condor job %s, but condor says that this job is finished" % (r.id, gjid)
                    continue
                elif len(lines) == 1:
                    if lines[0] == '0':
                        # condor unexpanded??? What should we do?
                        print "E: Job %d is marked as running in the db, but its condor submitted job is marked as unexpanded. We don't know what that means, so we use a heuristic to know if the job is still running." % r.id
                    elif lines[0] == '1':
                        # condor idle
                        print "E: Job %d is marked as running in the db, but its condor submitted job is marked as idle. This can mean that the computer that was running this job crashed." % r.id
                        continue
                    elif lines[0] == '2':
                        # condor running
                        continue
                    elif lines[0] == '3':
                        # condor removed
                        print "E: Job %d is marked as running in the db, but its condor submitted job is marked as removed." % r.id
                    elif lines[0] == '4':
                        # condor completed
                        print "E: Job %d is marked as running in the db, but its condor submitted job is marked as completed." % r.id
                    elif lines[0] == '5':
                        # condor held
                        print "E: Job %d is marked as running in the db, but its condor submitted job is marked as held." % r.id
                    elif lines[0] == '6':
                        # condor submission error
                        print "E: Job %d is marked as running in the db, but its condor submitted job is marked as submission error (should not happen, as when condor can't start a job it doesn't select one in the db)." % r.id
                else:
                    print "W: condor returned an answer we don't understand to a query. We will try some heuristic to determine if it is running. Test command `%s`. stdout returned `%s`" % (cmd, lines)
                # except KeyError:
                #     pass

            info = (r.id, r["jobman.experiment"], r["jobman.sql.condor_slot"],
                    r["jobman.sql.host_name"], r["jobman.sql.start_time"])
            run_time = str_time(now - info[4])

            if info[2] == "no_condor_slot":
                print "W: Job %d is not running on condor (should not happen...)" % info[0]
            else:
                p = Popen('''condor_status -constraint 'Name == "slot%s@%s"' -format "%%s" Name -format " %%s" State -format " %%s" Activity -format " %%s" RemoteUser -format " %%s\n" RemoteOwner''' % (info[2], info[3]),
                          shell=True, stdout=PIPE)
                p.wait()
                lines = p.stdout.readlines()
                # returned when running: slotN@host Claimed Busy user user
                # returned when the slot doesn't exist: empty
                if len(lines) == 0:
                    print "W: Job %d is running on a host (%s) that condor lost connection with. The job has been running for: %s" % (r.id, info[3], run_time)
                    continue
                elif len(lines) != 1 and not (len(lines) == 2 and lines[-1] == '\n'):
                    print "W: Job %d condor_status returned something not understood:" % r.id, lines
                    continue
                sp = lines[0].split()
                if len(sp) >= 3 and sp[1] in ["Unclaimed", "Owner"] and sp[2] == "Idle":
                    print "E: Job %d: the db says that this job is running on %s, but condor says that this host doesn't run a job.\n running time %s" % (r.id, info[3], run_time)
                elif len(sp) == 5:
                    assert sp[0] == "slot%s@%s" % (info[2], info[3])
                    if sp[3] != sp[4]:
                        print "W: Job %d condor_status returned something not understood:" % r.id, lines
                    if sp[1] == "Claimed" and sp[2] in ["Busy", "Retiring"]:
                        if sp[4].split('@')[0] == os.getenv("USER"):
                            print "W: Job %d is running on a condor host that is running a job of the same user. running time: %s" % (r.id, run_time)
                        else:
                            print "E: Job %d is running on a condor host that is running a job for user %s. running time: %s" % (r.id, sp[4].split('@')[0], run_time)
                    else:
                        print "W: Job %d condor state of host not understood" % r.id, sp
                else:
                    print "W: Job %d condor_status returned something not understood:" % r.id, lines
    finally:
        session.close()
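

# For reference, the condor JobStatus codes tested in check_serve above.
# This is only a plain mapping of the values documented by HTCondor; the
# dict itself is not used by the code in this module.
_CONDOR_JOB_STATUS_CODES = {
    '0': 'Unexpanded',
    '1': 'Idle',
    '2': 'Running',
    '3': 'Removed',
    '4': 'Completed',
    '5': 'Held',
    '6': 'Submission error',
}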
def runner_sqlstatus(options, dbdescr, *ids):
    """Show the status of jobs. Options allow changing it.

    The --reset_prio option sets the priority of the jobs back to the
    default value.

    Example use:

        jobman sqlstatus postgres://user:pass@host[:port]/dbname?table=tablename 10 11
    """
    # we don't want to remove all output when we change the db.
    if options.set_status and options.ret_nb_jobs:
        raise UsageError(
            "The options --set_status and --ret_nb_jobs are mutually exclusive.")

    db = open_db(dbdescr, serial=True)
    if options.set_status:
        try:
            new_status = to_status_number(options.set_status)
        except ValueError:
            raise UsageError(
                "The option --set_status accepts only the values START, RUNNING, DONE, ERR_START, ERR_SYNC, ERR_RUN, CANCELED or their equivalent int numbers")
    else:
        new_status = None

    have_running_jobs = False
    verbose = not options.quiet
    if options.ret_nb_jobs:
        verbose = 0
    else:
        verbose += 1
    ids = list(ids)

    try:
        session = db.session()

        if options.print_keys:
            q = db.query(session)
            job = q.first()
            print "Keys in the state of the first job:",
            for k in job.keys():
                print k,
            print
            del q, job, k

        if options.status:
            q = db.query(session)
            jobs = []
            for stat in options.status:
                jobs += q.filter_eq('jobman.status',
                                    to_status_number(stat)).all()
            ids.extend([j.id for j in jobs])
            del jobs, q

        if options.select:
            q = db.query(session)
            j = q.first()
            for param in options.select:
                k, v = param.split('=')
                if k == 'jobman.status':
                    q = q.filter_eq(k, to_status_number(v))
                elif isinstance(j[k], (str, unicode)):
                    q = q.filter_eq(k, v)
                elif isinstance(j[k], float):
                    q = q.filter_eq(k, float(v))
                elif isinstance(j[k], int):
                    q = q.filter_eq(k, int(v))
                else:
                    q = q.filter_eq(k, repr(v))
            jobs = q.all()
            ids.extend([j.id for j in jobs])
            del j, jobs, q

        if options.fselect:
            q = db.query(session)
            jobs = q.all()
            for param in options.fselect:
                k, v = param.split('=', 1)
                f = eval(v)
                for job in jobs:
                    if k in job:
                        if f(job[k]):
                            ids.append(job.id)
                    else:
                        print "job", job.id, "doesn't have the attribute", k
            del job, jobs, q

        if options.all:
            q = db.query(session)
            jobs = q.all()
            ids.extend([j.id for j in jobs])
            del q, jobs

        # Remove all dictionaries from the session
        session.expunge_all()

        ids = [int(id) for id in ids]
        ids = list(set(ids))
        ids.sort()
        nb_jobs = len(ids)

        for id in ids:
            job = db.get(id)
            if job is None:
                if verbose > 0:
                    print "Job id %s doesn't exist in the db" % (id)
                nb_jobs -= 1
                continue
            try:
                prio = job['jobman.sql.priority']
            except Exception:
                prio = 'BrokenDB_priority_DontExist'
            try:
                status = job['jobman.status']
            except KeyError:
                status = 'BrokenDB_Status_DontExist'

            if verbose > 1:
                print "Job id %s, status=%d jobman.sql.priority=%s" % (
                    id, status, str(prio)),
                for p in options.prints:
                    try:
                        print '%s=%s' % (p, job[p]),
                    except KeyError:
                        print '%s=KeyDontExist' % (p),
                print

            if status == RUNNING:
                have_running_jobs = True
            if options.set_status:
                job.__setitem__('jobman.status', new_status, session)
                job.update_in_session({}, session)
            if options.reset_prio:
                job.__setitem__('jobman.sql.priority', 1.0, session)
                job.update_in_session({}, session)

        if options.set_status:
            session.commit()
            print "Changed the status to %d for %d jobs" % (new_status, len(ids))
        if options.reset_prio:
            print "Reset the priority to the default value"
        if new_status == CANCELED and have_running_jobs:
            print "WARNING: Canceling a job only changes its status in the db. Jobs that are already running will continue to run. If a job finishes with status COMPLETE, its status will be changed to DONE; otherwise the status won't be changed"

    finally:
        session.close()

    if options.ret_nb_jobs:
        print nb_jobs
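

# Minimal sketch of what to_status_number (defined elsewhere in jobman) is
# expected to accept, based on the status numbers used by check_serve in this
# file; provided only to document the name <-> number mapping assumed here,
# not as the actual implementation.
def _example_to_status_number(status):
    """
    >>> _example_to_status_number('DONE')
    2
    >>> _example_to_status_number('-1')
    -1
    """
    names = {'START': 0, 'RUNNING': 1, 'DONE': 2, 'ERR_START': 3,
             'ERR_SYNC': 4, 'ERR_RUN': 5, 'CANCELED': -1}
    if status in names:
        return names[status]
    return int(status)  # also accept the equivalent int number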
def runner_sql(options, dbdescr, exproot):
    """
    Run jobs from a sql table.

    Usage: sql [options] <tablepath> <exproot>

    The jobs should be scheduled first with the sqlschedule command.

    Assuming that a postgres database is running on port `port` of
    `host`, contains a database called `dbname` and that `user` has the
    permissions to create, read and modify tables on that database,
    tablepath should be of the following form:

        postgres://user:pass@host[:port]/dbname?table=tablename

    exproot can be a local path or a remote path. Examples of exproots:

        /some/local/path
        ssh://some_host:/some/remote/path # relative to the filesystem root
        ssh://some_host:other/remote/path # relative to the HOME on some_host

    The exproot will contain a subdirectory hierarchy corresponding to
    the dbname, tablename and job id, which is a unique integer.

    The sql runner will pick any job in the table which is not running
    and is not done, and will terminate when that job ends. You may
    call the same command multiple times, sequentially or in parallel,
    to run as many unfinished jobs as have been scheduled in that table
    with sqlschedule.

    Example use:

        jobman sql \\
            postgres://user:pass@host[:port]/dbname?table=tablename \\
            ssh://central_host:myexperiments
    """
    if options.modules:
        modules = options.modules.split(',')
    else:
        modules = []
    for module in modules:
        __import__(module, fromlist=[])

    db = open_db(dbdescr, serial=True)

    n = options.n if options.n else -1
    nrun = 0
    try:
        while n != 0:
            if options.workdir:
                workdir = options.workdir
            else:
                if options.workdir_dir and not os.path.exists(options.workdir_dir):
                    os.mkdir(options.workdir_dir)
                workdir = tempfile.mkdtemp(dir=options.workdir_dir)
            print "The working directory is:", os.path.join(os.getcwd(), workdir)

            channel = DBRSyncChannel(db, workdir, exproot,
                                     redirect_stdout=True,
                                     redirect_stderr=True,
                                     finish_up_after=options.finish_up_after or None,
                                     save_interval=options.save_every or None)
            channel.run()

            # Useful for manual tests; leave this there, just commented.
            # cachesync_runner.manualtest_before_delete()
            with cachesync_lock(None, workdir):
                # Useful for manual tests; leave this there, just
                # commented.
                # cachesync_runner.manualtest_will_delete()
                shutil.rmtree(workdir, ignore_errors=True)

            n -= 1
            nrun += 1
    except JobError, e:
        if e.args[0] == JobError.NOJOB:
            print 'No more jobs to run (ran %i jobs)' % nrun
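

# Hypothetical sketch of how an ssh:// exproot from the docstring above can
# be decomposed into a host and a remote path (absolute when the path starts
# with '/', otherwise relative to $HOME on the host). The helper name is an
# illustration only and is not part of jobman's API.
def _example_split_exproot(exproot):
    """
    >>> _example_split_exproot('ssh://some_host:/some/remote/path')
    ('some_host', '/some/remote/path')
    >>> _example_split_exproot('/some/local/path')
    (None, '/some/local/path')
    """
    if exproot.startswith('ssh://'):
        host, path = exproot[len('ssh://'):].split(':', 1)
        return host, path
    return None, exproot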