bim = open(args.bfile + '.bim', 'r')
for line in bim:
    (chrom, snp_id, cm, bp, a1, a2) = line.split()
    snps[str(snp_id)] = [str(chrom), int(bp)]
    if str(chrom) not in chroms:
        chroms.append(str(chrom))
    if int(chrom) in xrange(1, 23):
        nbimsnps_valid += 1
        # prevent later errors
        if int(bp) > chrend[str(chrom)]:
            warnings.warn('SNP %s (chr %s, bp %d) is outside expected chromosome bounds (bp <= %d).' %
                          (str(snp_id), str(chrom), int(bp), int(chrend_orig[str(chrom)])))
            chrend[str(chrom)] = int(bp)
bim.close()

nbimsnps = file_len(args.bfile + '.bim')
print 'Loaded %d autosomal SNPs (of %d total in %s).' % (nbimsnps_valid, nbimsnps, bim.name)

#############
print '\n...Generating genomic chunks...'
#############

chunks = open(outname, 'w')
chunks.write(' '.join(['CHR', 'START', 'END', 'NAME']) + '\n')

idx = 1
nsnps = 0
for ch in xrange(1, 23):
    if str(ch) not in chroms:
        continue
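# file_len() above is a package helper not shown in this excerpt. A minimal
# sketch of the assumed behavior (count the lines in a file); the actual
# implementation elsewhere in the package may differ:
def file_len(fname):
    # stream the file and count its lines
    with open(fname, 'r') as f:
        return sum(1 for _ in f)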
print '\n...Beginning LD pruning...'
#############

# init
i = 1

subprocess.check_call([str(plinkx),
                       "--bfile", filtered_out,
                       "--indep-pairwise", str(args.ld_wind), str(ld_move), str(args.ld_th),
                       "--silent",
                       "--memory", str(2000),
                       "--allow-no-sex",
                       "--out", args.out + '.prune' + str(i) + '.tmp'])

# track number of SNPs before and after latest round of pruning
nprune_old = file_len(filtered_out + '.bim')
nprune_new = file_len(args.out + '.prune' + str(i) + '.tmp.prune.in')

# loop until no additional exclusions
while nprune_old > nprune_new:

    i += 1
    #############
    print 'Pruning pass ' + str(i)
    #############
    subprocess.check_call([str(plinkx),
                           "--bfile", filtered_out,
                           "--extract", args.out + '.prune' + str(i - 1) + '.tmp.prune.in',
                           "--indep-pairwise", str(args.ld_wind), str(ld_move), str(args.ld_th),
                           "--silent",
                           "--memory", str(2000),
        out_len = 11
    elif args.model == 'gmmat-fam':
        ch_out = 'gmmatfam_score.' + str(outdot) + '.' + str(chname) + '.R.txt'
        out_len = 11
    elif args.model == 'logistic':
        ch_out = 'logis.' + str(outdot) + '.' + str(chname) + '.assoc.logistic'
        out_len = 12
    elif args.model == 'linear':
        ch_out = 'linear.' + str(outdot) + '.' + str(chname) + '.assoc.linear'
        out_len = 12

    # record chunks with no/partial/broken output
    if not os.path.isfile(ch_out):
        print 'Output not found for %s' % str(ch_out)
        mis_chunks[str(chname)] = [str(chrom), int(start), int(end)]
    elif file_len(ch_out) < file_len(str(outdot) + '.snps.' + str(chname) + '.txt'):
        print 'Output file %s is incomplete' % str(ch_out)
        mis_chunks[str(chname)] = [str(chrom), int(start), int(end)]
    else:
        ft = file_tail(ch_out)
        if len(ft.split()) != out_len:
            print 'Last line of output file %s is incomplete' % str(ch_out)
            mis_chunks[str(chname)] = [str(chrom), int(start), int(end)]

chunks_in.close()

###############
# if there are missing chunks, restart their gwas and resub agg script
###############
if len(mis_chunks) > 0:
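# file_tail() above is likewise a package helper not shown here; it is assumed
# to return the last line of a file, roughly like this sketch:
#
#     def file_tail(fname):
#         last = ''
#         with open(fname, 'r') as f:
#             for line in f:
#                 last = line
#         return last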
if args.remove is not None:
    assert os.path.isfile(args.remove), "ID exclusion file does not exist (%r)" % args.remove
if args.extract is not None:
    assert os.path.isfile(args.extract), "SNP inclusion file does not exist (%r)" % args.extract
if args.exclude is not None:
    assert os.path.isfile(args.exclude), "SNP exclusion file does not exist (%r)" % args.exclude
if args.pheno is not None:
    assert os.path.isfile(args.pheno), "Phenotype file does not exist (%r)" % args.pheno

# warn if data is large
if args.extract is not None:
    nsnp = file_len(str(args.extract))
elif args.exclude is not None:
    nsnp = file_len(str(args.bfile) + '.bim') - file_len(str(args.exclude))
else:
    nsnp = file_len(str(args.bfile) + '.bim')

if nsnp > 1000000:
    warn('Large number of SNPs present for analysis (%d). Consider splitting for efficiency.' % int(nsnp))

print '\n'
print '############'
print 'Begin!'
print '############'
print '\nUnrelated Set (IMUS) Criteria:'
print '--rel-th ' + str(args.rel_th)
print '\nPrincipal Components (PCA):'
print '--npcs ' + str(args.npcs)
print '--plot-all ' + str(args.plot_all)

#####
# check imus memory requirements
# = 6 GB + 400MB*(n/1000)^2, rounded up to nearest 4GB
# based on previous runs of PRIMUS
#####
warn_mem = False
nsamp = float(file_len(str(args.bfile) + '.fam'))
imus_mem = int(ceil((6000.0 + 400.0 * ((nsamp / 1000.0) ** 2)) / 4000.0) * 4)
if imus_mem > 16 and not args.large_mem_ok:
    warn_mem = True
    args.test_sub = True

#####
# submit strict qc
print '\n...Submitting Strict QC job...'
#####
strictqc_call = ' '.join(['strict_qc.py',
                          '--bfile', args.bfile,
                          '--out', args.out,
                          clean_txt,
                          '--mind-th', str(args.mind_th),
                          '--maf-th',
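# Worked example of the memory formula above (illustrative): for nsamp = 20000,
# imus_mem = ceil((6000 + 400 * (20000/1000)**2) / 4000) * 4
#          = ceil(166000 / 4000) * 4
#          = 42 * 4 = 168 (GB), well above the 16 GB warn threshold.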
if args.covar is not None:
    assert os.path.isfile(args.covar), "Covariate file does not exist (%r)" % args.covar
if args.keep is not None:
    assert os.path.isfile(args.keep), "ID inclusion file does not exist (%r)" % args.keep
if args.remove is not None:
    assert os.path.isfile(args.remove), "ID exclusion file does not exist (%r)" % args.remove
if args.extract is not None:
    assert os.path.isfile(args.extract), "SNP inclusion file does not exist (%r)" % args.extract
if args.exclude is not None:
    assert os.path.isfile(args.exclude), "SNP exclusion file does not exist (%r)" % args.exclude
if args.pheno is not None:
    assert os.path.isfile(args.pheno), "Phenotype file does not exist (%r)" % args.pheno

# warn if data is large
if args.extract is not None:
    nsnp = file_len(str(args.extract))
elif args.exclude is not None:
    nsnp = file_len(str(args.bfile) + '.bim') - file_len(str(args.exclude))
else:
    nsnp = file_len(str(args.bfile) + '.bim')

if nsnp > 50000:
    warn('Large number of SNPs present for analysis (%d). Consider splitting for efficiency.' % int(nsnp))

print '\n'
print '############'
print 'Begin!'
print '############'

#############
import os
import stat
import subprocess
from textwrap import dedent
# (file_len() and read_conf() are helpers defined elsewhere in the package)


def send_job(jobname,
             arrayfile=None,
             cmd=None,
             logname=None,
             logloc=None,
             mem=None,
             walltime=None,
             # week=None,
             njobs=None,
             maxpar=10000,
             threads=None,
             wait_file=None,
             wait_name=None,
             wait_num=None,
             cluster=None,
             sleep=30,
             testonly=False,
             forcearray=False):

    # validate args
    if arrayfile is None and cmd is None:
        raise ValueError("Require either array file or command.")
    elif arrayfile is not None and cmd is not None:
        raise ValueError("Require either array file or command, not both.")

    if logloc is None:
        logloc = os.getcwd()
    if not os.path.isdir(logloc):
        os.mkdir(logloc)

    if maxpar < 1:
        maxpar = 10000

    # get cluster queue name
    if cluster is None:
        conf_file = os.environ['HOME'] + "/picopili.conf"
        configs = read_conf(conf_file)
        cluster = configs['cluster']

    # get queue template
    pico_bin = os.path.dirname(os.path.realpath(__file__))
    clust_dir = os.path.dirname(pico_bin) + '/cluster_templates'
    assert os.path.isdir(clust_dir), "Unable to find cluster job submission template directory %s" % str(clust_dir)

    # load queue configuration info
    # - submission syntax, queue names, job holds
    clust_conf = read_conf(str(clust_dir) + '/' + str(cluster) + '.conf')

    # basic template
    with open(str(clust_dir) + '/' + str(cluster) + '.sub.sh', 'r') as single_templ:
        templ = single_templ.read()

    # setup memory args
    if mem is None:
        mem = 2000
    mem_mb = str(int(mem))
    if int(mem) > 1000:
        mem_gb = str(int(mem) / 1000)
    else:
        mem_gb = str(1)
    if int(mem) > 30000:
        mem_txt = str(clust_conf['big_mem_txt'])
    else:
        mem_txt = ""

    # multithreading arguments
    if threads is None:
        threads = 1

    # queue picking from job length
    if walltime is None:
        walltime = 1
        queue_name = clust_conf['hour_q']
    elif walltime <= 1.0:
        queue_name = clust_conf['hour_q']
    elif walltime <= 2.0:
        queue_name = clust_conf['hour2_q']
    elif walltime <= 4.0:
        queue_name = clust_conf['hour4_q']
    elif walltime <= 24.0:
        queue_name = clust_conf['day_q']
    else:
        queue_name = clust_conf['long_q']

    # job dependencies
    if wait_name is not None:
        hold_str = clust_conf['hold_flag'].format(hold_name=str(wait_name), hold_num=str(wait_num))
    elif wait_file is not None:
        with open(wait_file, 'r') as wait_fi:
            wait_name = wait_fi.readline()
        hold_str = clust_conf['hold_flag'].format(hold_name=str(wait_name), hold_num=str(wait_num))
    else:
        hold_str = ""

    # load base template
    # for single jobs
    if cmd is not None and (njobs is None or njobs <= 1) and not forcearray:
        njobs = 1
        tot_threads = int(threads)

        # log name
        if logname is None:
            logname = str(jobname) + '.sub.log'

        # command line
        cmd_str = cmd

        # dummy task array args for dict
        array_jobs = njobs
        j_per_core = 1

    # for array jobs
    else:
        # setup indexing tasks
        j_per_core = int(clust_conf['j_per_node'])
        if j_per_core == 1:
            task_index = str(clust_conf['task_id'])
        else:
            task_index = "$1"

        # cmd or array file spec
        if cmd is not None:
            cmd_line = cmd.format(task=task_index)
            tot_threads = int(njobs) * int(threads)
        else:
            assert os.path.isfile(arrayfile), "Job array file %s not found." % str(arrayfile)
            njobs = file_len(arrayfile)
            tot_threads = int(njobs) * int(threads)
            cmd_tmp = dedent("""\
                cline=`head -n {task} {fi} | tail -n 1`
                echo $cline
                $cline
            """)
            cmd_line = cmd_tmp.format(task=task_index, fi=arrayfile)

        # parallelization of array jobs on a node
        if j_per_core > 1:
            from math import floor, ceil

            # max simul tasks with memory limit
            node_mem = float(clust_conf['array_mem_mb'])
            task_mem_lim = int(floor((node_mem - 1.0) / float(mem)))

            # max simul tasks with threading
            if task_mem_lim > floor(int(j_per_core) / int(threads)):
                task_mem_lim = floor(int(j_per_core) / int(threads))
            if task_mem_lim < 1:
                task_mem_lim = 1

            # number of jobs to cover all tasks
            array_jobs = int(ceil(float(njobs) / float(task_mem_lim)))

            # convert multi-line command to script
            if len(cmd_line.splitlines()) > 1:
                tmp_script = open('temp_cmd.' + str(jobname) + '.sh', 'w')
                tmp_script.write(cmd_line)
                tmp_script.close()
                os.chmod(tmp_script.name, stat.S_IEXEC | stat.S_IREAD | stat.S_IWRITE)
                cmd_line = './' + tmp_script.name

            # setup to do task_mem_lim jobs on each node
            # note: specified above that cmd_line uses $1 (first arg) as task index
            # we manage that here with ${tid}
            par_tmp = dedent("""\
                # array index for this job
                jj={job_index}

                # number of jobs to run on node
                nodej={nodej}

                # total number of jobs to run in task array
                maxj={njobs}

                # task index of first task on this node
                tid=$(($nodej * ($jj - 1) + 1))

                # find index of last task for this node
                # - from either node task limit (nodej)
                #   or total number of tasks (maxj)
                if [ "$tid" -le $(($maxj - $nodej + 1)) ]; then
                    last_task=$(($tid + $nodej - 1))
                else
                    last_task=$(($maxj))
                fi

                # start the tasks
                while [ "$tid" -le "$last_task" ]; do
                    {cmd_line} $tid &
                    tid=$(($tid+1))
                done

                # let all tasks finish
                wait
            """)
            cmd_str = par_tmp.format(njobs=str(njobs),
                                     nodej=str(task_mem_lim),
                                     job_index=str(clust_conf['task_id']),
                                     cmd_line=cmd_line)

        else:
            array_jobs = njobs
            cmd_str = cmd_line

        # log name
        if logname is None:
            logname = str(jobname) + '.sub.' + str(clust_conf['log_task_id']) + '.log'

    # fill in template
    jobdict = {"job_name": str(jobname),
               "cmd_string": cmd_str,  # formatted elsewhere
               "log_name": str(logloc) + '/' + str(logname),
               "mem_in_mb": str(mem_mb),
               "mem_in_gb": str(mem_gb),
               "big_mem_txt": str(mem_txt),
               "threads": str(threads),
               "total_threads": str(tot_threads),
               "wall_hours": str(walltime),
               "njobs": str(njobs),
               "array_jobs": str(array_jobs),
               "array_max": str(maxpar),
               "core_par": str(j_per_core),
               "task_id": str(clust_conf['task_id']),
               "log_task_id": str(clust_conf['log_task_id']),
               "queue_name": str(queue_name),
               "sleep_time": str(sleep),
               "project": str(clust_conf['project']),
               "workdir": os.getcwd()}

    # write job script
    sub_file = open(str(jobname) + '.sub.sh', 'w')
    sub_file.write(templ.format(**jobdict))
    sub_file.close()

    # finalize or remove optional lines
    if njobs <= 1 and not forcearray:
        subprocess.check_call(['sed', '-i', '/^::PICO_ARRAY_ONLY::/d', str(sub_file.name)])
    else:
        subprocess.check_call(['sed', '-i', 's/^::PICO_ARRAY_ONLY:://', str(sub_file.name)])

    if threads <= 1:
        subprocess.check_call(['sed', '-i', '/^::PICO_THREAD_ONLY::/d', str(sub_file.name)])
    else:
        subprocess.check_call(['sed', '-i', 's/^::PICO_THREAD_ONLY:://', str(sub_file.name)])

    if njobs <= 1 and not forcearray and threads <= 1:
        subprocess.check_call(['sed', '-i', '/^::PICO_THREADARRAY_ONLY::/d', str(sub_file.name)])
    else:
        subprocess.check_call(['sed', '-i', 's/^::PICO_THREADARRAY_ONLY:://', str(sub_file.name)])

    # command to run
    if hold_str != "":
        launch_str = clust_conf['sub_cmd'] + ' ' + hold_str + ' ' + str(sub_file.name)
    else:
        launch_str = clust_conf['sub_cmd'] + ' ' + str(sub_file.name)

    # record
    print launch_str

    # run
    if not testonly:
        p = subprocess.Popen(launch_str.split(), stderr=subprocess.STDOUT, stdout=subprocess.PIPE)
        out, err = p.communicate()

        if p.returncode is None or p.returncode == 0:
            return out
        else:
            raise EnvironmentError((p.returncode, err, out))
    else:
        return 0
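# Example usage of send_job() (illustrative; the file and job names here are
# hypothetical): submit a single 4 GB, 2-hour job that holds until a prior
# QC job finishes:
#
#     send_job(jobname='pca.mydata',
#              cmd='imus_pca.py --bfile mydata --out mydata',
#              logname='pca.mydata.sub.log',
#              mem=4000,
#              walltime=2,
#              wait_name='strictqc.mydata')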
source /broad/software/scripts/useuse
reuse -q Anaconda
sleep {sleep}

cchr=`awk -v a=${{SGE_TASK_ID}} 'NR==a+1{{print $1}}' {cfile}`
cstart=`awk -v a=${{SGE_TASK_ID}} 'NR==a+1{{print $2}}' {cfile}`
cend=`awk -v a=${{SGE_TASK_ID}} 'NR==a+1{{print $3}}' {cfile}`
cname=`awk -v a=${{SGE_TASK_ID}} 'NR==a+1{{print $4}}' {cfile}`

{impute_ex} -use_prephased_g -known_haps_g {in_haps} -h {ref_haps} -l {ref_leg} -m {map} -int ${{cstart}} ${{cend}} -buffer {buffer} -Ne {Ne} -allow_large_regions -o_gz -o {out} {seedtxt}

# eof
"""

# get number of chunks (-1 is for header)
nchunks = file_len(outdot + '.chunks.txt') - 1

# fill in template
jobdict = {"jname": 'imp.chunks.' + str(outdot),
           "nchunk": str(nchunks),
           "outlog": str('imp.chunks.' + str(outdot) + '.$TASK_ID.qsub.log'),
           "sleep": str(args.sleep),
           "cfile": str(outdot) + '.chunks.txt',
           "impute_ex": str(impute_ex),
           "in_haps": str(shape_dir) + '/' + str(outdot) + '.chr${cchr}.phased.haps',
           "ref_haps": str(args.ref_haps).replace('###', '${cchr}'),
           "ref_leg": str(args.ref_legs).replace('###', '${cchr}'),
           "map": str(args.ref_maps).replace('###', '${cchr}'),
           "Ne": str(args.Ne),
           "buffer": str(args.buffer),
           "out": str(outdot) + '.imp.${cname}',
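# For illustration: in the template above, a task with SGE_TASK_ID=3 makes each
# awk call read line 4 of the chunks file (NR==a+1 skips the CHR/START/END/NAME
# header line), so the third chunk's chromosome, start, end, and name are
# picked up for that IMPUTE2 run.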
    if float(fhet) > args.het_th:
        id_out.write(str(fid) + ' ' + str(iid) + ' high_homozygosity_Fhet\n')
        nex += 1
    elif float(fhet) < (-1.0 * args.het_th):
        id_out.write(str(fid) + ' ' + str(iid) + ' low_homozygosity_Fhet\n')
        nex += 1
het.close()
print 'Found %d individuals to exclude for absolute Fhet homozygosity rate > %r' % (nex, args.het_th)

# filter mendel errors
if args.mendel != 'none':
    # get number of SNPs as denominator for mendel error rate
    nsnp = file_len(str(prefilter_out) + '.bim')

    imendel_nam = ind_stats + '.imendel'
    imendel = open(imendel_nam, 'r')
    nex = 0
    dumphead = imendel.readline()
    for line in imendel:
        (fid, iid, nmendel) = line.split()
        if float(nmendel) / float(nsnp) > args.id_mendel_th:
            id_out.write(str(fid) + ' ' + str(iid) + ' excessive_mendel_errors\n')
            nex += 1
    imendel.close()
    print 'Found %d individuals to exclude for mendel error rate > %r of SNPs' % (nex, args.id_mendel_th)
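# For example, with args.het_th = 0.2 an individual with Fhet = 0.25 is written
# out as high_homozygosity_Fhet and one with Fhet = -0.25 as low_homozygosity_Fhet,
# while values in [-0.2, 0.2] pass the filter.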
###
# submit next imputation task
###
if args.full_pipe:
    ######################
    print '\n...Queuing chunk aggregation script...'
    ######################

    os.chdir(wd)
    next_call = str(rp_bin) + '/agg_imp.py ' + ' '.join(sys.argv[1:])

    agg_log = 'agg_imp.' + str(outdot) + '.sub.log'

    # dynamic adjustment of memory based on sample size
    fam_n = file_len(str(shape_dir) + '/' + str(args.bfile) + '.hg19.ch.fl.fam')
    if fam_n > 3000:
        agg_mem = 32000
    elif fam_n > 1000:
        agg_mem = 16000
    else:
        agg_mem = 8000

    # empirically, afr reference panels seem to yield ~2x as many sites as eur
    # (admittedly, scanning sys.argv for "afr" is a very informal way to catch that)
    if "afr" in sys.argv[1:]:
        agg_mem = 2 * agg_mem

    # TODO: consider queue/mem for agg
    send_job(jobname='agg.imp.' + str(outdot),
             cmd=next_call,
        ch_out = 'gee.' + str(outdot) + '.' + str(chname) + '.auto.R'
        out_len = 10
    elif args.model == 'dfam':
        ch_out = 'dfam.' + str(outdot) + '.' + str(chname) + '.dfam'
        out_len = 8
    elif args.model == 'gmmat':
        ch_out = 'gmmat_score.' + str(outdot) + '.' + str(chname) + '.R.txt'
        out_len = 11
    elif args.model == 'gmmat-fam':
        ch_out = 'gmmatfam_score.' + str(outdot) + '.' + str(chname) + '.R.txt'
        out_len = 11

    # record chunks with no/partial/broken output
    if not os.path.isfile(ch_out):
        mis_chunks[str(chname)] = [str(chrom), int(start), int(end)]
    elif file_len(ch_out) != file_len(str(outdot) + '.snps.' + str(chname) + '.txt'):
        mis_chunks[str(chname)] = [str(chrom), int(start), int(end)]
    else:
        ft = file_tail(ch_out)
        if len(ft.split()) != out_len:
            mis_chunks[str(chname)] = [str(chrom), int(start), int(end)]

chunks_in.close()

###############
# if there are missing chunks, restart their gwas and resub agg script
###############
if len(mis_chunks) > 0:
    nummiss = len(mis_chunks)
    print 'Missing results for %d GWAS jobs. Preparing to resubmit...' % nummiss
merge_log = open(str(outdot) + '.cobg.filtered.merge.log', 'w')
subprocess.check_call([plink_ex,
                       '--merge-list', str(merge_list.name),
                       '--merge-mode', str(4),
                       '--make-bed',
                       '--out', str(outdot) + '.cobg.filtered'],
                      stderr=subprocess.STDOUT,
                      stdout=merge_log)
merge_log.close()

######################
print '\n...Verifying output...'
######################

assert os.path.isfile(str(outdot) + '.cobg.filtered.bed')
assert os.path.isfile(str(outdot) + '.cobg.filtered.bim')
assert os.path.isfile(str(outdot) + '.cobg.filtered.fam')
assert file_len(str(outdot) + '.cobg.filtered.bim') + 1 == file_len(str(outdot) + '.cobg.filtered.info')

# TODO: here

# finish
print '\n############'
print '\n'
print 'SUCCESS!'
exit(0)

# eof
chunks_in = open(args.chunk_file, 'r')
dumphead = chunks_in.readline()
for line in chunks_in:
    (chrom, start, end, chname) = line.split()
    chunks[str(chname)] = [str(chrom), int(start), int(end)]

    # verify output file exists
    if args.model == 'gee':
        ch_out = 'gee.' + str(outdot) + '.' + str(chname) + '.auto.R'
    elif args.model == 'dfam':
        ch_out = 'dfam.' + str(outdot) + '.' + str(chname) + '.dfam'

    # record chunks with no output
    if not os.path.isfile(ch_out):
        mis_chunks[str(chname)] = [str(chrom), int(start), int(end)]
    elif not file_len(ch_out) > 10:
        mis_chunks[str(chname)] = [str(chrom), int(start), int(end)]

chunks_in.close()

###############
# if there are missing chunks, restart their gwas and resub agg script
###############
if len(mis_chunks) > 0:
    nummiss = len(mis_chunks)
    print 'Missing results for %d GWAS jobs. Preparing to resubmit...' % nummiss

    # just missing chunks for task array
    # fail if already tried
    tmp_chunk_file_name = 'tmp_missing_' + str(nummiss) + '_chunks.' + str(outdot) + '.txt'
"impute_ex": str(impute_ex), "in_haps": str(shape_dir)+'/'+str(outdot)+'.chr${{cchr}}.phased.haps', "ref_haps": str(args.ref_haps).replace('###','${{cchr}}'), "ref_leg": str(args.ref_legs).replace('###','${{cchr}}'), "map": str(args.ref_maps).replace('###','${{cchr}}'), "Ne": str(args.Ne), "buffer": str(args.buffer), "out": str(outdot)+'.imp.${{cname}}', "seedtxt": str(seedtxt), "cbopen":'{{', "cbclose":'}}', } # get number of chunks (-1 is for header) nchunks = file_len(outdot+'.chunks.txt')-1 # store job information for possible resubs job_store_file = 'imp.chunks.'+str(outdot)+'.pkl' clust_dict = init_sendjob_dict() clust_dict['jobname'] = 'imp.chunks.'+str(outdot) clust_dict['logname'] = str('imp.chunks.'+str(outdot)+'.'+str(clust_conf['log_task_id'])+'.sub.log') clust_dict['mem'] = 8000 clust_dict['walltime'] = 2 clust_dict['njobs'] = int(nchunks) clust_dict['sleep'] = args.sleep save_job(jfile=job_store_file, cmd_templ=imp_templ, job_dict=jobdict, sendjob_dict=clust_dict)
    whichmax = props.index(max(props))
    if props[whichmax] > th:
        outpop = names[whichmax]
    else:
        outpop = '-'
    return outpop

# get list of selected pop for each individual in admixture results
ind_pops = []
admix_pops_file = str(args.unrel_bfile + '.' + str(args.npops) + '.Q')
with open(admix_pops_file, 'r') as f:
    # map() required to read probs as float instead of string
    ind_pops = [maxpop(props=map(float, line.split()), names=popnames, th=args.prop_th) for line in f]

# sanity check parsing
nfam = file_len(str(args.unrel_bfile + '.fam'))
if len(ind_pops) != nfam:
    raise ValueError(('Number of individuals parsed from admixture results (%d in %s) ' +
                      'and fam file of unrelateds (%d in %s) do not match.') %
                     (len(ind_pops), admix_pops_file, int(nfam), str(args.unrel_bfile + '.fam')))

# check have sufficient exemplars
popcounts = [ind_pops.count(popnames[i]) for i in range(args.npops)]
lackingpops = [popcounts[i] < args.min_exemplar for i in range(args.npops)]

print 'Exemplars per population:'
for i in range(args.npops):
    print str(popnames[i] + ': ' + str(popcounts[i]))
print 'Unassigned: ' + str(ind_pops.count('-'))

if any(lackingpops):
    print '\n###########\n'
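# Illustrative behavior of maxpop() above, assuming popnames = ['pop1', 'pop2']:
#   maxpop(props=[0.7, 0.3], names=popnames, th=0.5)  ->  'pop1'
#   maxpop(props=[0.5, 0.5], names=popnames, th=0.5)  ->  '-'
# (the second case is unassigned because no proportion strictly exceeds th)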