def read_clust_conf():
    """Load the job-submission config for the cluster named in ~/picopili.conf.

    Returns the parsed key/value config (submission syntax, queue names,
    job holds) for the current cluster, read from the cluster_templates
    directory that sits one level above this script.
    """
    import os

    # the user's top-level picopili config names which cluster we're on
    cfg = read_conf(os.environ['HOME'] + "/picopili.conf")
    clust_name = cfg['cluster']

    # templates live in <repo_root>/cluster_templates, i.e. the parent
    # of the directory containing this script
    here = os.path.dirname(os.path.realpath(__file__))
    templ_dir = os.path.dirname(here) + '/cluster_templates'
    assert os.path.isdir(
        templ_dir
    ), "Unable to find cluster job submission template directory %s" % str(
        templ_dir)

    # per-cluster queue configuration: submission syntax, queue names, job holds
    return read_conf(str(templ_dir) + '/' + str(clust_name) + '.conf')
# assemble the shapeit phasing command for this chromosome's plink fileset
shape_call = [ shapeit_ex,
               '--input-bed', chrstem + '.bed', chrstem + '.bim', chrstem + '.fam',
               '--input-map', map_arg,
               '--input-ref', hap_arg, leg_arg, samp_arg,
               '--window', str(args.window),
               str(duo_txt),  # presumably a duo/trio-phasing flag or empty -- TODO confirm
               '--thread', str(args.threads),
               '--seed', str(args.shape_seed),
               '--output-max', outstem + '.phased.haps', outstem + '.phased.sample',
               '--output-log', outstem + '.shape.log' ]

# echo the full command for the log (Python 2 print statement)
print ' '.join(shape_call) + '\n'

# setup naming from task index
# read the cluster name from ~/picopili.conf, then that cluster's template
# config to learn the scheduler's task-index placeholder for log names
configs = read_conf(os.environ['HOME'] + '/picopili.conf')
clust_confdir = os.path.dirname(str(rp_bin)) + '/cluster_templates/'
clust_conf = read_conf(clust_confdir + str(configs['cluster'] + '.conf'))
task_id = str(clust_conf['log_task_id'])

# submit
# mem_req is given in GB and converted to MB here; njobs=22 presumably
# means one array task per autosome -- TODO confirm against caller
jobres = send_job(jobname='shape.' + str(outdot),
                  cmd=' '.join(shape_call),
                  logname='shape.' + str(outdot) + '.chr' + task_id + '.sub.log',
                  mem=int(args.mem_req) * 1000,
                  walltime=30,
                  njobs=22,
                  threads=int(args.threads),
                  sleep=str(args.sleep))
# echo remaining parsed arguments for the run log (Python 2 print statements)
print '--chr_info_file '+str(args.chr_info_file)
print '\nCluster settings:'
print '--sleep '+str(args.sleep)
if args.full_pipe:
    print '--full-pipe'

#############
print '\n...Checking dependencies...'
#############

# get cluster configuration
# needed for specifying logfile names with clust_conf['log_task_id']
conf_file = os.environ['HOME']+"/picopili.conf"
configs = read_conf(conf_file)
cluster = configs['cluster']
clust_conf = read_clust_conf()

# locate external executables via the picopili config keys
# from config
impute_ex = find_exec('impute2',key='i2loc')
shapeit_ex = find_exec('shapeit',key='shloc')

# get directory containing current script
# (to get absolute path for scripts)
rp_bin = os.path.dirname(os.path.realpath(__file__))
chunker_ex = rp_bin+'/chunk_snps.py'
test_exec(chunker_ex,'picopili chunking script')

# NOTE(review): body of this if-statement continues beyond this chunk
if args.ref_dir is not None:
    # verify exists
def send_job( jobname, arrayfile=None, cmd=None, logname=None, logloc=None, mem=None, walltime=None, # week=None, njobs=None, maxpar=10000, threads=None, wait_file=None, wait_name=None, wait_num=None, cluster=None, sleep=30, testonly=False, forcearray=False): # validate args if arrayfile is None and cmd is None: raise ValueError("Require either array file or command.") elif arrayfile is not None and cmd is not None: raise ValueError("Require either array file or command, not both.") if logloc is None: logloc = os.getcwd() if not os.path.isdir(logloc): os.mkdir(logloc) if maxpar < 1: maxpar = 10000 # get cluster queue name if cluster is None: conf_file = os.environ['HOME'] + "/picopili.conf" configs = read_conf(conf_file) cluster = configs['cluster'] # get queue template pico_bin = os.path.dirname(os.path.realpath(__file__)) clust_dir = os.path.dirname(pico_bin) + '/cluster_templates' assert os.path.isdir( clust_dir ), "Unable to find cluster job submission template directory %s" % str( clust_dir) # load queue configuration info # - submission syntax, queue names, job holds clust_conf = read_conf(str(clust_dir) + '/' + str(cluster) + '.conf') # basic template with open(str(clust_dir) + '/' + str(cluster) + '.sub.sh', 'r') as single_templ: templ = single_templ.read() # setup memory args if mem is None: mem = 2000 mem_mb = str(int(mem)) if int(mem) > 1000: mem_gb = str(int(mem) / 1000) else: mem_gb = str(1) if mem > 30000: mem_txt = str(clust_conf['big_mem_txt']) else: mem_txt = "" # multithreading arguments if threads is None: threads = 1 # queue picking from job length if walltime is None: walltime = 1 queue_name = clust_conf['hour_q'] elif walltime <= 1.0: queue_name = clust_conf['hour_q'] elif walltime <= 2.0: queue_name = clust_conf['hour2_q'] elif walltime <= 4.0: queue_name = clust_conf['hour4_q'] elif walltime <= 24.0: queue_name = clust_conf['day_q'] else: queue_name = clust_conf['long_q'] # job dependencies if wait_name is not None: hold_str = 
clust_conf['hold_flag'].format(hold_name=str(wait_name), hold_num=str(wait_num)) elif wait_file is not None: with open(wait_file, 'r') as wait_fi: wait_name = wait_fi.readline() hold_str = clust_conf['hold_flag'].format(hold_name=str(wait_name), hold_num=str(wait_num)) else: hold_str = "" # load base template # for single jobs if cmd is not None and (njobs is None or njobs <= 1) and not forcearray: njobs = 1 tot_threads = int(threads) # log name if logname is None: logname = str(jobname) + '.sub.log' # command line cmd_str = cmd # dummy task array args for dict array_jobs = njobs j_per_core = 1 # for array jobs else: # setup indexing tasks j_per_core = int(clust_conf['j_per_node']) if j_per_core == 1: task_index = str(clust_conf['task_id']) else: task_index = "$1" # cmd or array file spec if cmd is not None: cmd_line = cmd.format(task=task_index) tot_threads = int(njobs) * int(threads) else: assert os.path.isfile( arrayfile), "Job array file %s not found." % str(arrayfile) njobs = file_len(arrayfile) tot_threads = int(njobs) * int(threads) cmd_tmp = dedent("""\ cline=`head -n {task} {fi} | tail -n 1` echo $cline $cline """) cmd_line = cmd_tmp.format(task=task_index, fi=arrayfile) # parallelization of array jobs on a node if j_per_core > 1: from math import floor, ceil # max simul tasks with memory limit node_mem = float(clust_conf['array_mem_mb']) task_mem_lim = int(floor((node_mem - 1.0) / float(mem))) # max simul tasks with threading if task_mem_lim > floor(int(j_per_core) / int(threads)): task_mem_lim = floor(int(j_per_core) / int(threads)) if task_mem_lim < 1: task_mem_lim = 1 # number of jobs to cover all tasks array_jobs = int(ceil(float(njobs) / float(task_mem_lim))) # convert multi-line command to script if len(cmd_line.splitlines()) > 1: tmp_script = open('temp_cmd.' 
+ str(jobname) + '.sh', 'w') tmp_script.write(cmd_line) tmp_script.close() os.chmod(tmp_script.name, stat.S_IEXEC | stat.S_IREAD | stat.S_IWRITE) cmd_line = './' + tmp_script.name # setup to do task_mem_lim jobs on each node # note: specified above that cmd_line uses $1 (first arg) as task index # we manage that here with ${tid} par_tmp = dedent("""\ # array index for this job jj={job_index} # number of jobs to run on node nodej={nodej} # total number of jobs to run in task array maxj={njobs} # task index of first task on this node tid=$(($nodej * ($jj - 1) + 1)) # find index of last task for this node # - from either node task limit (nodej) # or total number of tasks (maxj) if [ "$tid" -le $(($maxj - $nodej + 1)) ]; then last_task=$(($tid + $nodej - 1)) else last_task=$(($maxj)) fi # start the tasks while [ "$tid" -le "$last_task" ]; do {cmd_line} $tid & tid=$(($tid+1)) done # let all tasks finish wait """) cmd_str = par_tmp.format(njobs=str(njobs), nodej=str(task_mem_lim), job_index=str(clust_conf['task_id']), cmd_line=cmd_line) else: array_jobs = njobs cmd_str = cmd_line # log name if logname is None: logname = str(jobname) + '.sub.' 
+ str( clust_conf['log_task_id']) + '.log' # fill in template jobdict = { "job_name": str(jobname), "cmd_string": cmd_str, # formatted elsewhere "log_name": str(logloc) + '/' + str(logname), "mem_in_mb": str(mem_mb), "mem_in_gb": str(mem_gb), "big_mem_txt": str(mem_txt), "threads": str(threads), "total_threads": str(tot_threads), "wall_hours": str(walltime), "njobs": str(njobs), "array_jobs": str(array_jobs), "array_max": str(maxpar), "core_par": str(j_per_core), "task_id": str(clust_conf['task_id']), "log_task_id": str(clust_conf['log_task_id']), "queue_name": str(queue_name), "sleep_time": str(sleep), "project": str(clust_conf['project']), "workdir": os.getcwd() } # write job script sub_file = open(str(jobname) + '.sub.sh', 'w') sub_file.write(templ.format(**jobdict)) sub_file.close() # finalize or remove optional lines if njobs <= 1 and not forcearray: subprocess.check_call( ['sed', '-i', '/^::PICO_ARRAY_ONLY::/d', str(sub_file.name)]) else: subprocess.check_call( ['sed', '-i', 's/^::PICO_ARRAY_ONLY:://', str(sub_file.name)]) if threads <= 1: subprocess.check_call( ['sed', '-i', '/^::PICO_THREAD_ONLY::/d', str(sub_file.name)]) else: subprocess.check_call( ['sed', '-i', 's/^::PICO_THREAD_ONLY:://', str(sub_file.name)]) if njobs <= 1 and not forcearray and threads <= 1: subprocess.check_call( ['sed', '-i', '/^::PICO_THREADARRAY_ONLY::/d', str(sub_file.name)]) else: subprocess.check_call([ 'sed', '-i', 's/^::PICO_THREADARRAY_ONLY:://', str(sub_file.name) ]) # command to run if hold_str != "": launch_str = clust_conf['sub_cmd'] + ' ' + hold_str + ' ' + str( sub_file.name) else: launch_str = clust_conf['sub_cmd'] + ' ' + str(sub_file.name) # record print launch_str # run if not testonly: p = subprocess.Popen(launch_str.split(), stderr=subprocess.STDOUT, stdout=subprocess.PIPE) out, err = p.communicate() if p.returncode is None or p.returncode == 0: return out else: raise EnvironmentError((p.returncode, err, out)) else: return 0
# build the dotted output stem, appending --addout when one was given
if str(args.addout) != '' and args.addout is not None:
    outdot = str(args.out)+'.'+str(args.addout)
else:
    outdot = str(args.out)

#############
print '\n...Reading ricopili config file...'
#############

### read plink, shapeit loc from config
# NOTE(review): this fragment reads ~/ricopili.conf, not ~/picopili.conf
# as elsewhere in the file -- presumably intentional, verify
conf_file = os.environ['HOME']+"/ricopili.conf"
configs = read_conf(conf_file)
plinkx = configs['p2loc']+"plink"
shapeit_ex = configs['shloc'] + '/bin/shapeit'

#############
print '\n...Checking dependencies...'
#############

# TODO: here