def train_and_parse_fold(fold_dir, dev_loc, i, label_set, no_extra_features,
                         allow_reattach, allow_unshift, allow_move_top, allow_invert):
    name = fold_dir.join('name').open().read().strip()
    train_args = ['BASE_DIR', 'DEV_LOC', 'LABEL_SET', 'FEAT_STR', 'THRESH', 'REPAIR_STR']
    if no_extra_features:
        feat_str = '-x'
    else:
        feat_str = ''
    repair_str = []
    if allow_reattach:
        repair_str.append('-r')
    if allow_move_top:
        repair_str.append('-m')
    if allow_unshift:
        repair_str.append('-u')
    if allow_invert:
        repair_str.append('-v')
    repair_str = ' '.join(repair_str)
    thresh = 5 * i if i >= 1 else 5
    arg_vals = [fold_dir, dev_loc, label_set, feat_str, thresh, repair_str]
    # Pass the per-fold settings to the PBS script as NAME=value pairs via qsub -v
    env_str = ','.join('%s=%s' % (k, v) for k, v in zip(train_args, arg_vals))
    sh.qsub('pbs/train.sh', o=fold_dir.join('out'), e=fold_dir.join('err'),
            v=env_str, N=name)
def qsub(resource_file, exec_path):
    """
    :param resource_file: path to json resource file
    """
    res = _read_resource_file(resource_file)
    path = os.path.expandvars(os.path.expanduser(exec_path))
    walltime = res["resource"]["walltime"]
    max_cpus = 0
    nodes = len(res["node"])
    for node, systems in res["node"].iteritems():
        cpus = 0
        for system in systems.itervalues():
            cpus += system['cpus']
        max_cpus = max(cpus, max_cpus)
    sh.qsub(os.path.join(path, "submit_new.sh"),
            N="lambdathesis",
            l="nodes=%s:ppn=%s:walltime=%s" % (nodes, max_cpus, walltime),
            j="oe",
            o="lambda-out",
            e="lambda-err",
            m="n",
            V="True")
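# Sketch (not from the original project): the resource dict that
# _read_resource_file is assumed to return, based only on the keys the
# function above reads -- a "resource" section with a walltime, and a
# "node" mapping of node name -> systems, each system reporting its cpus.
# Names and values here are illustrative, not the project's real schema.
example_resource = {
    "resource": {"walltime": "01:00:00"},
    "node": {
        "node01": {"system_a": {"cpus": 8}, "system_b": {"cpus": 4}},
        "node02": {"system_a": {"cpus": 16}},
    },
}
# With this input, nodes == 2 and max_cpus == 16, so the generated request
# would be l="nodes=2:ppn=16:walltime=01:00:00".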
def submitJob(self, j):
    runHash = j.runHash
    prevJob = self.jobs.find_one(runHash=runHash)
    dbJob = prevJob if prevJob is not None else dict(
        runHash=runHash, name=j.name, params=j.params)
    dbJob.update(retVal='', pbsId='', time=time.time(),
                 qsubParams=j.qsubParams,
                 status=JobStatus.NotSubmitted,
                 runFunc=j.runFunc)
    if prevJob is None:
        self.jobs.insert(dbJob)
    else:
        self.jobs.update(dbJob, ['id'])
    jobStr = """#!/bin/bash
#PBS -N {0}
#PBS -o {1}
#PBS -e {2}

python -m pypalmetto run '{3}'
""".format(j.name, self.getJobOutFile(j), self.getJobErrFile(j), j.runHash)
    #print("About to run qsub with:")
    #print(jobStr)
    qsubParams = j.qsubParamsRaw.copy()
    qsubParams.update({'_in': jobStr})
    pbsId = str(sh.qsub(**qsubParams)).strip()
    dbJob = self.jobs.find_one(runHash=runHash)
    dbJob.update(status=JobStatus.Queued, pbsId=pbsId)
    self.jobs.update(dbJob, ['id'])
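# Minimal standalone sketch (not part of the class above) of the same
# submission pattern submitJob uses: sh.qsub reads the job script from
# stdin via sh's `_in` keyword, and its stdout is the PBS job id.
# The script body and job name here are made up for illustration.
import sh

demo_script = """#!/bin/bash
#PBS -N pypalmetto-demo
echo "running on $HOSTNAME"
"""

pbs_id = str(sh.qsub(_in=demo_script)).strip()
print(pbs_id)  # e.g. "123456.pbs-server", depending on the cluster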
def run_nas(yagi, num_train, gpu_num, file_to_run, id, batch_size,
            archs_per_task=5, num_min_depth=20, num_max_depth=70):
    """Submits a random-NAS job to the specified yagi host via qsub.

    Arguments:
        yagi: str, name of the yagi host; the job is sent to the queue
            main.q@{yagi}.vision.is.tohoku
        gpu_num: str, gpu id(s) exposed through CUDA_VISIBLE_DEVICES; a single
            number or a comma-separated string of multiple ids
        file_to_run: str, filename of the script to run on the yagi
        batch_size: int, reduce for machines with small GPU memory
        num_train, id, archs_per_task, num_min_depth, num_max_depth:
            forwarded to the job script as environment variables via qsub -v"""
    print(f'running random NAS: {yagi}, num_train: {num_train}')
    time = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S.%f")
    env_vars = os.environ.copy()
    env_vars["CUDA_VISIBLE_DEVICES"] = gpu_num
    sh.qsub('-q', f'main.q@{yagi}.vision.is.tohoku',
            '-v', f'time={time}',
            '-v', f'num_train={num_train}',
            '-v', f'id={id}',
            '-v', f'batch_size={batch_size}',
            '-v', f'archs_per_task={archs_per_task}',
            '-v', f'num_min_depth={num_min_depth}',
            '-v', f'num_max_depth={num_max_depth}',
            file_to_run, _env=env_vars)
def main():
    args = docopt(__doc__, version='Version 1.0')
    # copy-pasted from pipeline.py :(
    cfg_f = args['--config']
    cfg_y = yaml.load(open(cfg_f))
    cfg = pipeline.Config(cfg_y)
    # it's probably better to have separate log files.
    if args['--log']:
        _log = Path(args['--log'])
        if _log.exists():
            print "Removing old log file %s" % _log
            _log.remove()
        log = open(args['--log'], 'a')
    else:
        log = sys.stdout
    sheet = open(args['<samplesheet>'])
    rows = csv.DictReader(sheet, fieldnames=['read_dirs', 'control_dirs'],
                          delimiter='\t')
    rows = list(rows)
    base_out = args['--outdir'] or "."  # os.getcwd()?
    sampledir = args['--sampledir']

    def p_run(rows):
        weave = partial(weave_files, sampledir)
        fqs_and_controls = list(map(weave, rows))
        run2_func = partial(pipeline.run2, cfg, log, base_out)
        pool = multiprocessing.Pool(len(rows))
        print "Launching %d processes.\n==========================\n\n" % len(rows)
        pool.map(run2_func, fqs_and_controls)
        pool.close()
        pool.join()

    if False:
        p_run(rows)
    if args['--qsub']:
        for i, row in enumerate(rows):
            #outdir = os.path.join(base_out, "sheet-sample-%d" % i)
            fastqs, controls = weave_files(sampledir, row)
            import tempfile
            import sh
            # Write a throwaway submission script, then hand it to qsub
            temp = tempfile.NamedTemporaryFile(prefix='pathos_sheet',
                                               suffix='qsub', delete=False)
            template = "{script} --fastq {fastqs} -c {cfg} -o {odir} --control {controls}"
            cmd = template.format(
                script='python /u/michael.panciera/CURRENT/pathos/pipeline.py',
                fastqs=' '.join(fastqs),
                controls=' '.join(controls),
                cfg=args['--config'],
                odir=base_out)
            temp.write("module load mpi\nmodule load bowtie\nmodule load blast\n")
            temp.write(cmd)
            temp.close()
            script = temp.name
            #print "qsub {script} -q batch -l nodes={node}:ppn={cores}".format(script=temp.name, node=amedpbswrair007.amed.ds.army.mil, cores=4)
            #print " -q batch -l nodes={node}:ppn={cores}".format(script=temp.name, node=amedpbswrair007.amed.ds.army.mil, cores=4)
            sample_num = row['read_dirs'].split(SEP)[0]
            sh.qsub(script,
                    '-N', "sheet-sample-%s" % sample_num,
                    # "-M", "EMAIL HERE",
                    # '-l', "nodes=1:ppn=8:mem=80514472881")
                    '-l', "nodes=1:ppn=12",
                    '-l', "mem=80514472881")
            print "Running %s" % script
    else:
        print "No --qsub flag, didn't run anything."
def submit_jobs(jobs, args, log, timeout):
    log_message(1, ["Entering submit_jobs to schedule node tests"])
    main_dir = os.getcwd()
    for job in jobs:
        # Create job directory
        job.set_rootdir(args.path)
        cp(args.case, job.path)
        os.chdir(job.path)
        try:
            if args.batch == "LSF":
                from sh import bsub
                with open(os.path.join(job.path, "run_case.lsf"), 'r') as jf:
                    temp = bsub(_in=jf, m=' '.join(job.nodes),
                                P=args.account, q=args.queue,
                                _timeout=timeout)
                job.jobid = temp.split('<')[1].split('>')[0]
                log_message(1, ["Job {} submitted with bsub".format(job.name)])
            elif args.batch == "PBS":
                from sh import qsub
                sel_hosts = "select=" + '+'.join(
                    ["ncpus=36:mpiprocs=36:host={}".format(nid)
                     for nid in job.nodes])
                if args.force:
                    temp = qsub("-l", sel_hosts, "-A", args.account,
                                "-q", args.queue, "-h",
                                os.path.join(job.path, "run_case.pbs"),
                                _timeout=timeout)
                else:
                    temp = qsub("-l", sel_hosts, "-A", args.account,
                                "-q", args.queue,
                                os.path.join(job.path, "run_case.pbs"),
                                _timeout=timeout)
                job.jobid = temp.split('.')[0]
                log_message(1, ["Job {} submitted with qsub (hold = {})".format(
                    job.name, args.force)])
            log["num_active"] += 1
        except TimeoutException:
            log_message(1, ["Could not submit job {}, skipping".format(job.name)])
            log["errors"].append(" submit failed - " + job.name)
            log["num_errors"] += 1
        log["num_jobs"] += 1
        os.chdir(main_dir)
        # If it's been a while, check status
        if int(time.time() - log["last_time"]) >= 10:
            print_status(jobs, log)
    log_message(1, ["Finished submitting {} jobs".format(log["num_jobs"])])