Example #1
bim = open(args.bfile + '.bim', 'r')
for line in bim:
    (chrom, snp_id, cm, bp, a1, a2) = line.split()
    snps[str(snp_id)] = [str(chrom), int(bp)]
    if str(chrom) not in chroms:
        chroms.append(str(chrom))
    if int(chrom) in xrange(1, 23):
        nbimsnps_valid += 1
    # prevent later errors
    if int(bp) > chrend[str(chrom)]:
        warnings.warn(
            'SNP %s (chr %s, bp %d) is outside expected chromosome bounds (bp <= %d).'
            % (str(snp_id), str(chrom), int(bp), int(chrend_orig[str(chrom)])))
        chrend[str(chrom)] = int(bp)
bim.close()
nbimsnps = file_len(args.bfile + '.bim')
print 'Loaded %d autosomal SNPs (of %d total in %s).' % (nbimsnps_valid,
                                                         nbimsnps, bim.name)

#############
print '\n...Generating genomic chunks...'
#############
chunks = open(outname, 'w')
chunks.write(' '.join(['CHR', 'START', 'END', 'NAME']) + '\n')
idx = 1
nsnps = 0
for ch in xrange(1, 23):

    if str(ch) not in chroms:
        continue
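
Every example on this page calls the helper file_len (and some call file_tail), which is not defined in any snippet shown here. As a rough sketch, assuming file_len counts the lines of a text file and file_tail returns its last line, they might look like the following (an assumption, not picopili's actual implementation):

# assumed helpers, not shown in the snippets on this page
def file_len(fname):
    # count the number of lines in a text file
    n = 0
    with open(fname, 'r') as f:
        for _ in f:
            n += 1
    return n


def file_tail(fname):
    # return the last line of a text file
    last = ''
    with open(fname, 'r') as f:
        for line in f:
            last = line
    return last.rstrip('\n')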
Example #2
        outpop = '-'
    return outpop


# get list of selected pop for each individual in admixture results
ind_pops = []
admix_pops_file = str(args.unrel_bfile + '.' + str(args.npops) + '.Q')
with open(admix_pops_file, 'r') as f:
    # map() required to read probs as float instead of string
    ind_pops = [
        maxpop(props=map(float, line.split()), names=popnames, th=args.prop_th)
        for line in f
    ]

# sanity check parsing
nfam = file_len(str(args.unrel_bfile + '.fam'))
if len(ind_pops) != nfam:
    raise ValueError(('Number of individuals parsed from admixture results (%d in %s) '
                      'and fam file of unrelateds (%d in %s) do not match.')
                     % (len(ind_pops), admix_pops_file, int(nfam), str(args.unrel_bfile + '.fam')))

# check have sufficient exemplars
popcounts = [ind_pops.count(popnames[i]) for i in range(args.npops)]
lackingpops = [popcounts[i] < args.min_exemplar for i in range(args.npops)]

print 'Exemplars per population:'
for i in range(args.npops):
    print str(popnames[i] + ': ' + str(popcounts[i]))
print 'Unassigned: ' + str(ind_pops.count('-'))

if any(lackingpops):
    print '\n###########\n'
Example #3
print '\n...Beginning LD pruning...'
#############

# init
i = 1

subprocess.check_call([str(plinkx), 
               "--bfile", filtered_out,
               "--indep-pairwise", str(args.ld_wind), str(ld_move), str(args.ld_th),
               "--silent",
               "--memory", str(2000),
               "--allow-no-sex",
               "--out", args.out + '.prune' + str(i) + '.tmp' ])

# track the number of SNPs before and after the latest round of pruning
nprune_old = file_len(filtered_out + '.bim')
nprune_new = file_len(args.out + '.prune' + str(i) + '.tmp.prune.in')

# loop until no additional exclusions
while nprune_old > nprune_new:

    i += 1
    #############
    print 'Pruning pass ' + str(i)
    #############
    subprocess.check_call([str(plinkx), 
               "--bfile", filtered_out,
               "--extract", args.out + '.prune' + str(i-1) + '.tmp.prune.in',
               "--indep-pairwise", str(args.ld_wind), str(ld_move), str(args.ld_th),
               "--silent",
               "--memory", str(2000),
Example #4
        out_len = 11
    elif args.model == 'gmmat-fam':
        ch_out = 'gmmatfam_score.'+str(outdot)+'.'+str(chname)+'.R.txt'
        out_len = 11
    elif args.model == 'logistic':
        ch_out = 'logis.'+str(outdot)+'.'+str(chname)+'.assoc.logistic'
        out_len = 12
    elif args.model == 'linear':
        ch_out = 'linear.'+str(outdot)+'.'+str(chname)+'.assoc.linear'
        out_len = 12

    # record chunks with no/partial/broken output
    if not os.path.isfile(ch_out):
        print 'Output not found for %s' % str(ch_out)
        mis_chunks[str(chname)] = [str(chrom), int(start), int(end)]
    elif file_len(ch_out) < file_len(str(outdot)+'.snps.'+str(chname)+'.txt'):
        print 'Output file %s is incomplete' % str(ch_out)
        mis_chunks[str(chname)] = [str(chrom), int(start), int(end)]
    else:
        ft = file_tail(ch_out)
        if len(ft.split()) != out_len:
            print 'Last line of output file %s is incomplete' % str(ch_out)
            mis_chunks[str(chname)] = [str(chrom), int(start), int(end)]
            

chunks_in.close()

###############
# if there are missing chunks, restart their gwas and resub agg script
###############
if len(mis_chunks) > 0:
Example #5
if args.remove is not None:
    assert os.path.isfile(
        args.remove), "ID exclusion file does not exist (%r)" % args.remove
if args.extract is not None:
    assert os.path.isfile(
        args.extract), "SNP inclusion file does not exist (%r)" % args.extract
if args.exclude is not None:
    assert os.path.isfile(
        args.exclude), "SNP exclusion file does not exist (%r)" % args.exclude
if args.pheno is not None:
    assert os.path.isfile(
        args.pheno), "Phenotype file does not exist (%r)" % args.pheno

# warn if data is large
if args.extract is not None:
    nsnp = file_len(str(args.extract))
elif args.exclude is not None:
    nsnp = file_len(str(args.bfile) + '.bim') - file_len(str(args.exclude))
else:
    nsnp = file_len(str(args.bfile) + '.bim')

if nsnp > 1000000:
    warn(
        'Large number of SNPs present for analysis (%d). Consider splitting for efficiency.'
        % int(nsnp))

print '\n'
print '############'
print 'Begin!'
print '############'
Example #6
print '\nUnrelated Set (IMUS) Criteria:'
print '--rel-th ' + str(args.rel_th)

print '\nPrincipal Components (PCA):'
print '--npcs ' + str(args.npcs)
print '--plot-all ' + str(args.plot_all)

#####
# check imus memory requirements
# = 6 GB + 400MB*(n/1000)^2, rounded up to nearest 4GB
# based on previous runs of PRIMUS
#####

warn_mem = False

nsamp = float(file_len(str(args.bfile) + '.fam'))

imus_mem = int(ceil((6000.0 + 400.0 * ((nsamp / 1000.0)**2)) / 4000.0) * 4)
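# e.g. nsamp = 10000: ceil((6000 + 400*(10.0**2)) / 4000) * 4 = ceil(11.5) * 4 = 48 (GB)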

if imus_mem > 16 and not args.large_mem_ok:
    warn_mem = True
    args.test_sub = True

#####
# submit strict qc
print '\n...Submitting Strict QC job...'
#####
strictqc_call = ' '.join([
    'strict_qc.py', '--bfile', args.bfile, '--out', args.out, clean_txt,
    '--mind-th',
    str(args.mind_th), '--maf-th',
Example #7
if args.covar is not None:
    assert os.path.isfile(args.covar), "Covariate file does not exist (%r)" % args.covar
if args.keep is not None:
    assert os.path.isfile(args.keep), "ID inclusion file does not exist (%r)" % args.keep
if args.remove is not None:
    assert os.path.isfile(args.remove), "ID exclusion file does not exist (%r)" % args.remove
if args.extract is not None:
    assert os.path.isfile(args.extract), "SNP inclusion file does not exist (%r)" % args.extract
if args.exclude is not None:
    assert os.path.isfile(args.exclude), "SNP exclusion file does not exist (%r)" % args.exclude
if args.pheno is not None:
    assert os.path.isfile(args.pheno), "Phenotype file does not exist (%r)" % args.pheno

# warn if data is large
if args.extract is not None:
    nsnp = file_len(str(args.extract))
elif args.exclude is not None:
    nsnp = file_len(str(args.bfile)+'.bim') - file_len(str(args.exclude))
else:
    nsnp = file_len(str(args.bfile)+'.bim')

if nsnp > 50000:
    warn('Large number of SNPs present for analysis (%d). Consider splitting for efficiency.' % int(nsnp))


print '\n'
print '############'
print 'Begin!'
print '############'

#############
Example #8
def send_job(
        jobname,
        arrayfile=None,
        cmd=None,
        logname=None,
        logloc=None,
        mem=None,
        walltime=None,
        #             week=None,
        njobs=None,
        maxpar=10000,
        threads=None,
        wait_file=None,
        wait_name=None,
        wait_num=None,
        cluster=None,
        sleep=30,
        testonly=False,
        forcearray=False):

    # validate args
    if arrayfile is None and cmd is None:
        raise ValueError("Require either array file or command.")

    elif arrayfile is not None and cmd is not None:
        raise ValueError("Require either array file or command, not both.")

    if logloc is None:
        logloc = os.getcwd()

    if not os.path.isdir(logloc):
        os.mkdir(logloc)

    if maxpar < 1:
        maxpar = 10000

    # get cluster queue name
    if cluster is None:
        conf_file = os.environ['HOME'] + "/picopili.conf"
        configs = read_conf(conf_file)
        cluster = configs['cluster']

    # get queue template
    pico_bin = os.path.dirname(os.path.realpath(__file__))
    clust_dir = os.path.dirname(pico_bin) + '/cluster_templates'

    assert os.path.isdir(
        clust_dir
    ), "Unable to find cluster job submission template directory %s" % str(
        clust_dir)

    # load queue configuration info
    # - submission syntax, queue names, job holds
    clust_conf = read_conf(str(clust_dir) + '/' + str(cluster) + '.conf')

    # basic template
    with open(str(clust_dir) + '/' + str(cluster) + '.sub.sh',
              'r') as single_templ:
        templ = single_templ.read()

    # setup memory args
    if mem is None:
        mem = 2000
    mem_mb = str(int(mem))
    if int(mem) > 1000:
        mem_gb = str(int(mem) / 1000)
    else:
        mem_gb = str(1)

    if mem > 30000:
        mem_txt = str(clust_conf['big_mem_txt'])
    else:
        mem_txt = ""

    # multithreading arguments
    if threads is None:
        threads = 1

    # queue picking from job length
    if walltime is None:
        walltime = 1
        queue_name = clust_conf['hour_q']
    elif walltime <= 1.0:
        queue_name = clust_conf['hour_q']
    elif walltime <= 2.0:
        queue_name = clust_conf['hour2_q']
    elif walltime <= 4.0:
        queue_name = clust_conf['hour4_q']
    elif walltime <= 24.0:
        queue_name = clust_conf['day_q']
    else:
        queue_name = clust_conf['long_q']

    # job dependencies
    if wait_name is not None:
        hold_str = clust_conf['hold_flag'].format(hold_name=str(wait_name),
                                                  hold_num=str(wait_num))

    elif wait_file is not None:
        with open(wait_file, 'r') as wait_fi:
            # strip the trailing newline so it doesn't corrupt the hold flag
            wait_name = wait_fi.readline().strip()
            hold_str = clust_conf['hold_flag'].format(hold_name=str(wait_name),
                                                      hold_num=str(wait_num))

    else:
        hold_str = ""

    # load base template

    # for single jobs
    if cmd is not None and (njobs is None or njobs <= 1) and not forcearray:

        njobs = 1
        tot_threads = int(threads)

        # log name
        if logname is None:
            logname = str(jobname) + '.sub.log'

        # command line
        cmd_str = cmd

        # dummy task array args for dict
        array_jobs = njobs
        j_per_core = 1

    # for array jobs
    else:

        # setup indexing tasks
        j_per_core = int(clust_conf['j_per_node'])
        if j_per_core == 1:
            task_index = str(clust_conf['task_id'])
        else:
            task_index = "$1"

        # cmd or array file spec
        if cmd is not None:
            cmd_line = cmd.format(task=task_index)
            tot_threads = int(njobs) * int(threads)

        else:
            assert os.path.isfile(
                arrayfile), "Job array file %s not found." % str(arrayfile)

            njobs = file_len(arrayfile)
            tot_threads = int(njobs) * int(threads)

            cmd_tmp = dedent("""\
                cline=`head -n {task} {fi} | tail -n 1`
                echo $cline
                $cline
            """)
            cmd_line = cmd_tmp.format(task=task_index, fi=arrayfile)

        # parallelization of array jobs on a node
        if j_per_core > 1:

            from math import floor, ceil

            # max simul tasks with memory limit
            node_mem = float(clust_conf['array_mem_mb'])
            task_mem_lim = int(floor((node_mem - 1.0) / float(mem)))

            # max simul tasks with threading
            if task_mem_lim > floor(int(j_per_core) / int(threads)):
                task_mem_lim = floor(int(j_per_core) / int(threads))

            if task_mem_lim < 1:
                task_mem_lim = 1

            # number of jobs to cover all tasks
            array_jobs = int(ceil(float(njobs) / float(task_mem_lim)))

            # convert multi-line command to script
            if len(cmd_line.splitlines()) > 1:
                tmp_script = open('temp_cmd.' + str(jobname) + '.sh', 'w')
                tmp_script.write(cmd_line)
                tmp_script.close()
                os.chmod(tmp_script.name,
                         stat.S_IEXEC | stat.S_IREAD | stat.S_IWRITE)
                cmd_line = './' + tmp_script.name

            # setup to do task_mem_lim jobs on each node
            # note: specified above that cmd_line uses $1 (first arg) as task index
            # we manage that here with ${tid}
            par_tmp = dedent("""\
                # array index for this job            
                jj={job_index}
                
                # number of jobs to run on node
                nodej={nodej}
                
                # total number of jobs to run in task array
                maxj={njobs}
                
                # task index of first task on this node
                tid=$(($nodej * ($jj - 1) + 1))
                
                # find index of last task for this node
                # - from either node task limit (nodej)
                #   or total number of tasks (maxj)
                if [ "$tid" -le $(($maxj - $nodej + 1)) ]; then
                    last_task=$(($tid + $nodej - 1))
                else
                    last_task=$(($maxj))
                fi
                
                # start the tasks
                while [ "$tid" -le "$last_task" ]; do
                    {cmd_line} $tid &
                    tid=$(($tid+1))
                done
                
                # let all tasks finish
                wait
            """)

            cmd_str = par_tmp.format(njobs=str(njobs),
                                     nodej=str(task_mem_lim),
                                     job_index=str(clust_conf['task_id']),
                                     cmd_line=cmd_line)

        else:
            array_jobs = njobs
            cmd_str = cmd_line

        # log name
        if logname is None:
            logname = str(jobname) + '.sub.' + str(
                clust_conf['log_task_id']) + '.log'

    # fill in template
    jobdict = {
        "job_name": str(jobname),
        "cmd_string": cmd_str,  # formatted elsewhere
        "log_name": str(logloc) + '/' + str(logname),
        "mem_in_mb": str(mem_mb),
        "mem_in_gb": str(mem_gb),
        "big_mem_txt": str(mem_txt),
        "threads": str(threads),
        "total_threads": str(tot_threads),
        "wall_hours": str(walltime),
        "njobs": str(njobs),
        "array_jobs": str(array_jobs),
        "array_max": str(maxpar),
        "core_par": str(j_per_core),
        "task_id": str(clust_conf['task_id']),
        "log_task_id": str(clust_conf['log_task_id']),
        "queue_name": str(queue_name),
        "sleep_time": str(sleep),
        "project": str(clust_conf['project']),
        "workdir": os.getcwd()
    }

    # write job script
    sub_file = open(str(jobname) + '.sub.sh', 'w')
    sub_file.write(templ.format(**jobdict))
    sub_file.close()

    # finalize or remove optional lines
    if njobs <= 1 and not forcearray:
        subprocess.check_call(
            ['sed', '-i', '/^::PICO_ARRAY_ONLY::/d',
             str(sub_file.name)])
    else:
        subprocess.check_call(
            ['sed', '-i', 's/^::PICO_ARRAY_ONLY:://',
             str(sub_file.name)])

    if threads <= 1:
        subprocess.check_call(
            ['sed', '-i', '/^::PICO_THREAD_ONLY::/d',
             str(sub_file.name)])
    else:
        subprocess.check_call(
            ['sed', '-i', 's/^::PICO_THREAD_ONLY:://',
             str(sub_file.name)])

    if njobs <= 1 and not forcearray and threads <= 1:
        subprocess.check_call(
            ['sed', '-i', '/^::PICO_THREADARRAY_ONLY::/d',
             str(sub_file.name)])
    else:
        subprocess.check_call([
            'sed', '-i', 's/^::PICO_THREADARRAY_ONLY:://',
            str(sub_file.name)
        ])

    # command to run
    if hold_str != "":
        launch_str = clust_conf['sub_cmd'] + ' ' + hold_str + ' ' + str(
            sub_file.name)
    else:
        launch_str = clust_conf['sub_cmd'] + ' ' + str(sub_file.name)

    # record
    print launch_str

    # run
    if not testonly:
        p = subprocess.Popen(launch_str.split(),
                             stderr=subprocess.STDOUT,
                             stdout=subprocess.PIPE)
        out, err = p.communicate()
        if p.returncode is None or p.returncode == 0:
            return out
        else:
            raise EnvironmentError((p.returncode, err, out))

    else:
        return 0
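
For reference, a hypothetical dry-run call to send_job as defined above; the script name and resource values are illustrative only, and testonly=True writes the job script and prints the launch command without submitting (a cluster template found via ~/picopili.conf is still required):

# dry-run submission of a single job (illustrative values, not from the source)
send_job(jobname='example.qc',
         cmd='strict_qc.py --bfile mydata --out myrun',
         mem=4000,        # 4 GB
         walltime=2,      # hours; selects the 2-hour queue
         testonly=True)   # print the submit command instead of running it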
Example #9
source /broad/software/scripts/useuse
reuse -q Anaconda
sleep {sleep}

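# each awk call pulls one field (chr/start/end/name) of this task's chunk; NR==a+1 skips the chunk file header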
cchr=`awk -v a=${{SGE_TASK_ID}} 'NR==a+1{{print $1}}' {cfile}`
cstart=`awk -v a=${{SGE_TASK_ID}} 'NR==a+1{{print $2}}' {cfile}`
cend=`awk -v a=${{SGE_TASK_ID}} 'NR==a+1{{print $3}}' {cfile}`
cname=`awk -v a=${{SGE_TASK_ID}} 'NR==a+1{{print $4}}' {cfile}`

{impute_ex} -use_prephased_g -known_haps_g {in_haps} -h {ref_haps} -l {ref_leg} -m {map} -int ${{cstart}} ${{cend}} -buffer {buffer} -Ne {Ne} -allow_large_regions -o_gz -o {out} {seedtxt}

# eof
"""
    
# get number of chunks (-1 is for header)
nchunks = file_len(outdot+'.chunks.txt')-1

# fill in template
jobdict = {"jname": 'imp.chunks.'+str(outdot),
           "nchunk": str(nchunks),
           "outlog": str('imp.chunks.'+str(outdot)+'.$TASK_ID.qsub.log'),
           "sleep": str(args.sleep),
           "cfile": str(outdot)+'.chunks.txt',
           "impute_ex": str(impute_ex),
           "in_haps": str(shape_dir)+'/'+str(outdot)+'.chr${cchr}.phased.haps',
           "ref_haps": str(args.ref_haps).replace('###','${cchr}'),
           "ref_leg": str(args.ref_legs).replace('###','${cchr}'),
           "map": str(args.ref_maps).replace('###','${cchr}'),
           "Ne": str(args.Ne),
           "buffer": str(args.buffer),
           "out": str(outdot)+'.imp.${cname}',
Example #10
    if float(fhet) > args.het_th:
        id_out.write(str(fid) + ' ' + str(iid) + ' high_homozygosity_Fhet\n')
        nex += 1
    elif float(fhet) < (-1.0*args.het_th):
        id_out.write(str(fid) + ' ' + str(iid) + ' low_homozygosity_Fhet\n')
        nex += 1

het.close()
print 'Found %d individuals to exclude for absolute Fhet homozygosity rate > %r' % (nex, args.het_th)


# filter mendel errors
if args.mendel != 'none':
    
    # get number of SNPs as denominator for mendel error rate
    nsnp = file_len(str(prefilter_out) + '.bim')

    imendel_nam = ind_stats + '.imendel'
    imendel = open(imendel_nam, 'r')
    nex = 0
    
    dumphead = imendel.readline()
    for line in imendel:
        (fid, iid, nmendel) = line.split()
        
        if float(nmendel) / float(nsnp) > args.id_mendel_th:
            id_out.write(str(fid) + ' ' + str(iid) + ' excessive_mendel_errors\n')
            nex += 1

    imendel.close()
    print 'Found %d individuals to exclude for excessive mendel errors > %r of SNPs' % (nex, args.id_mendel_th)
Example #11
                       '--merge-list', str(merge_list.name),
                       '--merge-mode',str(4),
                       '--make-bed',
                       '--out', str(outdot)+'.cobg.filtered'],
                       stderr=subprocess.STDOUT, 
                       stdout=merge_log) 
merge_log.close()



######################
print '\n...Verifying output...'
######################

assert os.path.isfile(str(outdot)+'.cobg.filtered.bed')
assert os.path.isfile(str(outdot)+'.cobg.filtered.bim')
assert os.path.isfile(str(outdot)+'.cobg.filtered.fam')
assert file_len(str(outdot)+'.cobg.filtered.bim')+1 == file_len(str(outdot)+'.cobg.filtered.info')
# TODO: here




# finish
print '\n############'
print '\n'
print 'SUCCESS!'
exit(0)

# eof
Example #12
print '\nPrincipal Components (PCA):'
print '--npcs '+str(args.npcs) 
print '--plot-all '+str(args.plot_all)



#####
# check imus memory requirements
# = 6 GB + 400MB*(n/1000)^2, rounded up to nearest 4GB
# based on previous runs of PRIMUS
#####

warn_mem = False

nsamp = float(file_len(str(args.bfile)+'.fam'))

imus_mem = int(ceil( (6000.0+400.0*( (nsamp/1000.0)**2) )/4000.0 ) * 4)

if imus_mem > 16 and not args.large_mem_ok:
    warn_mem = True
    args.test_sub = True


#####
# submit strict qc
print '\n...Submitting Strict QC job...'
#####
strictqc_call = ' '.join(['strict_qc.py', 
                         '--bfile', args.bfile,
                         '--out', args.out,
Example #13
###
# submit next imputation task
###
if args.full_pipe:
    ######################
    print '\n...Queuing chunk aggregation script...'
    ######################

    os.chdir(wd)
    next_call = str(rp_bin) + '/agg_imp.py ' + ' '.join(sys.argv[1:])

    agg_log = 'agg_imp.' + str(outdot) + '.sub.log'

    # some dynamic adjustment of mem based on sample size and population
    fam_n = file_len(
        str(shape_dir) + '/' + str(args.bfile) + '.hg19.ch.fl.fam')
    if fam_n > 3000:
        agg_mem = 32000
    elif fam_n > 1000:
        agg_mem = 16000
    else:
        agg_mem = 8000

    # (empirically, seem to get ~2x sites from afr vs eur)
    # (admittedly this method of catching/handling it is _very_ informal)
    if "afr" in sys.argv[1:]:
        agg_mem = 2 * agg_mem

    # TODO: consider queue/mem for agg
    send_job(jobname='agg.imp.' + str(outdot),
             cmd=next_call,
Example #14
        ch_out = 'gee.'+str(outdot)+'.'+str(chname)+'.auto.R'
        out_len = 10
    elif args.model == 'dfam':
        ch_out = 'dfam.'+str(outdot)+'.'+str(chname)+'.dfam'
        out_len = 8
    elif args.model == 'gmmat':
        ch_out = 'gmmat_score.'+str(outdot)+'.'+str(chname)+'.R.txt'
        out_len = 11
    elif args.model == 'gmmat-fam':
        ch_out = 'gmmatfam_score.'+str(outdot)+'.'+str(chname)+'.R.txt'
        out_len = 11
    
    # record chunks with no/partial/broken output
    if not os.path.isfile(ch_out):
        mis_chunks[str(chname)] = [str(chrom), int(start), int(end)]
    elif file_len(ch_out) != file_len(str(outdot)+'.snps.'+str(chname)+'.txt'):
        mis_chunks[str(chname)] = [str(chrom), int(start), int(end)]
    else:
        ft = file_tail(ch_out)
        if len(ft.split()) != out_len:
            mis_chunks[str(chname)] = [str(chrom), int(start), int(end)]
            

chunks_in.close()

###############
# if there are missing chunks, restart their gwas and resub agg script
###############
if len(mis_chunks) > 0:
    nummiss = len(mis_chunks)
    print 'Missing results for %d GWAS jobs. Preparing to resubmit...' % nummiss
Example #15
merge_log = open(str(outdot) + '.cobg.filtered.merge.log', 'w')
subprocess.check_call([
    plink_ex, '--merge-list',
    str(merge_list.name), '--merge-mode',
    str(4), '--make-bed', '--out',
    str(outdot) + '.cobg.filtered'
],
                      stderr=subprocess.STDOUT,
                      stdout=merge_log)
merge_log.close()

######################
print '\n...Verifying output...'
######################

assert os.path.isfile(str(outdot) + '.cobg.filtered.bed')
assert os.path.isfile(str(outdot) + '.cobg.filtered.bim')
assert os.path.isfile(str(outdot) + '.cobg.filtered.fam')
assert file_len(str(outdot) + '.cobg.filtered.bim') + 1 == file_len(
    str(outdot) + '.cobg.filtered.info')
# TODO: here

# finish
print '\n############'
print '\n'
print 'SUCCESS!'
exit(0)

# eof
Example #16
for line in bim:
    (chrom, snp_id, cm, bp, a1, a2) = line.split()
    snps[str(snp_id)] = [str(chrom), int(bp)]
    if str(chrom) not in chroms:
        chroms.append(str(chrom))
    if int(chrom) in xrange(1, 23):
        nbimsnps_valid += 1
    # prevent later errors
    if int(bp) > chrend[str(chrom)]:
        warnings.warn(
            "SNP %s (chr %s, bp %d) is outside expected chromosome bounds (bp <= %d)."
            % (str(snp_id), str(chrom), int(bp), int(chrend_orig[str(chrom)]))
        )
        chrend[str(chrom)] = int(bp)
bim.close()
nbimsnps = file_len(args.bfile + ".bim")
print "Loaded %d autosomal SNPs (of %d total in %s)." % (nbimsnps_valid, nbimsnps, bim.name)


#############
print "\n...Generating genomic chunks..."
#############
chunks = open(outname, "w")
chunks.write(" ".join(["CHR", "START", "END", "NAME"]) + "\n")
idx = 1
nsnps = 0
for ch in xrange(1, 23):

    if str(ch) not in chroms:
        continue
Example #17
chunks_in = open(args.chunk_file, 'r')
dumphead = chunks_in.readline()
for line in chunks_in:
    (chrom, start, end, chname) = line.split()
    chunks[str(chname)] = [str(chrom), int(start), int(end)]

    # verify output file exists
    if args.model == 'gee':
        ch_out = 'gee.'+str(outdot)+'.'+str(chname)+'.auto.R'
    elif args.model == 'dfam':
        ch_out = 'dfam.'+str(outdot)+'.'+str(chname)+'.dfam'
    
    # record chunks with no output
    if not os.path.isfile(ch_out):
        mis_chunks[str(chname)] = [str(chrom), int(start), int(end)]
    elif file_len(ch_out) <= 10:
        mis_chunks[str(chname)] = [str(chrom), int(start), int(end)]

chunks_in.close()

###############
# if there are missing chunks, restart their gwas and resub agg script
###############
if len(mis_chunks) > 0:
    nummiss = len(mis_chunks)
    print 'Missing results for %d GWAS jobs. Preparing to resubmit...' % nummiss
    
    # just missing chunks for task array
    # fail if already tried
    tmp_chunk_file_name = 'tmp_missing_'+str(nummiss)+'_chunks.'+str(outdot)+'.txt'
Example #18
    if float(fhet) > args.het_th:
        id_out.write(str(fid) + ' ' + str(iid) + ' high_homozygosity_Fhet\n')
        nex += 1
    elif float(fhet) < (-1.0*args.het_th):
        id_out.write(str(fid) + ' ' + str(iid) + ' low_homozygosity_Fhet\n')
        nex += 1

het.close()
print 'Found %d individuals to exclude for absolute Fhet homozygosity rate > %r' % (nex, args.het_th)


# filter mendel errors
if args.mendel != 'none':
    
    # get number of SNPs as denominator for mendel error rate
    nsnp = file_len(str(prefilter_out) + '.bim')

    imendel_nam = ind_stats + '.imendel'
    imendel = open(imendel_nam, 'r')
    nex = 0
    
    dumphead = imendel.readline()
    for line in imendel:
        (fid, iid, nmendel) = line.split()
        
        if float(nmendel) / float(nsnp) > args.id_mendel_th:
            id_out.write(str(fid) + ' ' + str(iid) + ' excessive_mendel_errors\n')
            nex += 1

    imendel.close()
    print 'Found %d individuals to exclude for excessive mendel errors > %r of SNPs' % (nex, args.id_mendel_th)
Example #19
print '\n...Beginning LD pruning...'
#############

# init
i = 1

subprocess.check_call([
    str(plinkx), "--bfile", filtered_out, "--indep-pairwise",
    str(args.ld_wind),
    str(ld_move),
    str(args.ld_th), "--silent", "--memory",
    str(2000), "--allow-no-sex", "--out", args.out + '.prune' + str(i) + '.tmp'
])

# track the number of SNPs before and after the latest round of pruning
nprune_old = file_len(filtered_out + '.bim')
nprune_new = file_len(args.out + '.prune' + str(i) + '.tmp.prune.in')

# loop until no additional exclusions
while nprune_old > nprune_new:

    i += 1
    #############
    print 'Pruning pass ' + str(i)
    #############
    subprocess.check_call([
        str(plinkx), "--bfile", filtered_out, "--extract",
        args.out + '.prune' + str(i - 1) + '.tmp.prune.in', "--indep-pairwise",
        str(args.ld_wind),
        str(ld_move),
        str(args.ld_th), "--silent", "--memory",
Example #20
           "impute_ex": str(impute_ex),
           "in_haps": str(shape_dir)+'/'+str(outdot)+'.chr${{cchr}}.phased.haps',
           "ref_haps": str(args.ref_haps).replace('###','${{cchr}}'),
           "ref_leg": str(args.ref_legs).replace('###','${{cchr}}'),
           "map": str(args.ref_maps).replace('###','${{cchr}}'),
           "Ne": str(args.Ne),
           "buffer": str(args.buffer),
           "out": str(outdot)+'.imp.${{cname}}',
           "seedtxt": str(seedtxt),
	   "cbopen":'{{',
	   "cbclose":'}}',
           }


# get number of chunks (-1 is for header)
nchunks = file_len(outdot+'.chunks.txt')-1


# store job information for possible resubs
job_store_file = 'imp.chunks.'+str(outdot)+'.pkl'

clust_dict = init_sendjob_dict()
clust_dict['jobname'] = 'imp.chunks.'+str(outdot)
clust_dict['logname'] = str('imp.chunks.'+str(outdot)+'.'+str(clust_conf['log_task_id'])+'.sub.log')
clust_dict['mem'] = 8000
clust_dict['walltime'] = 2
clust_dict['njobs'] = int(nchunks)
clust_dict['sleep'] = args.sleep

save_job(jfile=job_store_file, cmd_templ=imp_templ, job_dict=jobdict, sendjob_dict=clust_dict)
Example #21
    whichmax = props.index(max(props))
    if props[whichmax] > th:
        outpop = names[whichmax]
    else:
        outpop = '-'
    return outpop

# get list of selected pop for each individual in admixture results
ind_pops = []
admix_pops_file = str(args.unrel_bfile+'.'+str(args.npops)+'.Q')
with open(admix_pops_file, 'r') as f:
    # map() required to read probs as float instead of string
    ind_pops = [maxpop(props=map(float,line.split()), names=popnames, th=args.prop_th) for line in f]

# sanity check parsing
nfam = file_len(str(args.unrel_bfile+'.fam'))
if len(ind_pops) != nfam:
    raise ValueError(('Number of individuals parsed from admixture results (%d in %s) '
                      'and fam file of unrelateds (%d in %s) do not match.')
                     % (len(ind_pops), admix_pops_file, int(nfam), str(args.unrel_bfile+'.fam')))

# check have sufficient exemplars
popcounts = [ind_pops.count(popnames[i]) for i in range(args.npops)]
lackingpops = [popcounts[i] < args.min_exemplar for i in range(args.npops)]

print 'Exemplars per population:'
for i in range(args.npops):
    print str(popnames[i] + ': ' + str(popcounts[i]))
print 'Unassigned: '+str(ind_pops.count('-'))

if any(lackingpops):
    print '\n###########\n'