def permer(perm,randind):
    # First we permute the genotypes and write them out to a file (bimbam format)
    random.seed(perm + (int(blocknum)*100))
    shuffle(randind)
    snper = currfiles + '.snps'
    aer = currfiles + '.as'
    ger = currfiles + '.gs'
    shuffler = 'zcat ' + genodir + 'ByChr/*.all*.gz | grep -f ' + snper + ' - | cut -f' + ','.join(randind) + ' > ' + currfiles + '_perm_sub.bimbam; paste ' + snper + ' ' + aer + ' ' + ger + ' ' + currfiles + '_perm_sub.bimbam > ' + currfiles + '_perm.bimbam'
    ifier(shuffler)
    # This runs gemma on the permuted genotypes
    gemmer = (hmdir + 'Programs/gemma0.94 -g ' + currfiles + '_perm.bimbam -p ' + currfiles + '.pheno -k ' + currfiles + '.square.txt -c ' + currfiles + '.covariates.txt' + ' -lmm 4 -maf 0.05 -o ' + blocknum + '.' + pheno)
    ifier(gemmer)
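# A minimal sketch (not part of the original script) of the idea behind permer() above:
# the 1-based genotype columns handed to `cut -f` are shuffled, so individuals are
# re-paired with phenotypes at random while each SNP row stays intact. It assumes the
# script's existing `random`/`shuffle` imports; the column list passed in is hypothetical.
def example_shuffled_cut(perm, blocknum, genocols):
    random.seed(perm + (int(blocknum) * 100))    # same seeding scheme as permer()
    cols = list(genocols)                        # e.g. ['4', '5', ..., '13']
    shuffle(cols)
    return 'cut -f' + ','.join(cols)             # reordered columns = permuted individuals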
def permer(perm):
    #for perm in xrange(0,100):
    #    if actives.count(0) == 0:
    #        continue
    # First we permute the genotypes and write them out to a file (bimbam format)
    random.seed(perm + (int(blocknum)*100))
    shuffle(randind)
    updateind = [0,1,2] + randind
    permbimbam = open(genodir + 'perms/' + blocknum + '_perm_curr.bimbam','w')
    for snp in masterdic[gene]:
        yrand = y[snp][updateind]
        print >> permbimbam, ", ".join(yrand)
    permbimbam.close()
    # This runs gemma on the permuted genotypes
    gemmer = (hmdir + 'Programs/gemma0.94 -g ' + genodir + 'perms/' + blocknum + '_perm_curr.bimbam -p ' + currfiles + '.pheno -k ' + currfiles + '.square.txt -c ' + currfiles + '.covs.txt' + ' -lmm 4 -maf 0.05 -o perm_curr_' + blocknum)
    ifier(gemmer)
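# Sketch of the row permutation used above, assuming each BIMBAM mean-genotype row is laid
# out as [rsID, allele1, allele0, g1, g2, ...]: the three annotation fields stay fixed
# while the genotype fields are reordered by the shuffled index vector (all indices >= 3).
def permute_bimbam_row(row, randind):
    updateind = [0, 1, 2] + randind        # keep rsID and alleles, shuffle genotypes
    return [row[i] for i in updateind]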
def permer(gene):
    winnerperms = 0
    for perm in xrange(0,10000):
        shuffle(randind)
        updateind = [0,1,2] + randind
        permgenos = [", ".join([genodic[x][index] for index in updateind]) for x in masterdic[gene]]
        currbimbam = open(genodir + 'perm_curr_' + chrm + '_pc' + str(pcs) + '.bimbam','w')
        print >> currbimbam, "\n".join(permgenos)
        currbimbam.close()
        print 'Gene No. ' + str(len(winnerdic.keys()) + 1) + ' on chrm ' + chrm + '.'
        print str(winnerperms) + ' of ' + str(perm) + ' permutations lost.'
        gemmer = (hmdir + 'Programs/gemma0.94 -g ' + genodir + 'perm_curr_' + chrm + '_pc' + str(pcs) + '.bimbam -p ' + currfiles + '.pheno -k ' + currfiles + '.square.txt -c ' + currfiles + '.pcs.txt' + ' -lmm 4 -maf 0.05 -o perm_curr_' + chrm + '_pc' + str(pcs))
        ifier(gemmer)
        permering = open(genodir + 'output/perm_curr_' + chrm + '_pc' + str(pcs) + '.assoc.txt','r')
        permers = [x.strip().split()[12] for x in permering.readlines()]
        permers = [float(x) for x in permers if x != 'nan' and x != 'p_lrt']
        permering.close()
        permlow = min(permers)
        if permlow <= pmin:
            winnerperms += 1
        if winnerperms == 10:
            return 11/uniform(perm+2,perm+3)
    return (winnerperms + 1)/float(10001)
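# A simplified sketch of the adaptive permutation p-value scheme used in permer() above:
# stop once 10 permutations beat the observed minimum p-value (pmin) and report roughly
# 10 / (permutations run); otherwise fall back to the standard (k+1)/(N+1) estimate.
# The original adds a small random jitter to the early-stopped estimate; this version
# keeps it deterministic for clarity.
def adaptive_perm_p(winnerperms, perms_run, max_perms=10000, early_stop=10):
    if winnerperms >= early_stop:
        return early_stop / float(perms_run)         # early-stopped estimate
    return (winnerperms + 1) / float(max_perms + 1)  # add-one permutation p-value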
#raws = numpy.loadtxt(genodir + 'ByChr/hutt.imputed.chr' + chrm + '.raw',dtype='str')
print "Transposing genotypes..."
sys.stdout.flush()
genorfile = open(genor,'w')
for line in range(6,raws.shape[1]):
    print >> genorfile, "\t".join(list(raws[1:raws.shape[0],line]))
genorfile.close()
#traws = numpy.transpose(raws)
#trawsu = traws[6:traws.shape[0],1:traws.shape[1]]
#numpy.savetxt(genor,trawsu,delimiter="\t",fmt='%s')
print "Annotating SNPs..."
sys.stdout.flush()
traws1 = numpy.column_stack(([0]*len(snpids),['.']*len(snpids)))
traws2 = numpy.column_stack((snpids,traws1))
traws3 = numpy.column_stack((snppos,traws2))
traws4 = numpy.column_stack(([str(int(x)-1) for x in snppos],traws3))
traws5 = numpy.column_stack((['chr' + str(chrm)]*len(snpids),traws4))
numpy.savetxt(anoter,traws5,delimiter="\t",fmt='%s')
print "Finalizing files..."
sys.stdout.flush()
paster = '/bin/bash -c "paste <(cat ' + anoter + ') <(cat ' + genor + ') > ' + genodir + 'ByChr/' + outname + '.chr' + chrm + '.txt; rm ' + anoter + '; rm ' + genor + '"'
ifier(paster)
doner = open(genodir + 'ByChr/chr' + chrm + '.done','w')
doner.close()
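# Note on the output layout (a descriptive comment, not original code): each line of the
# pasted ByChr/<outname>.chr<chrm>.txt file built above is tab-delimited as
#   chr<chrm>  <pos-1>  <pos>  <snpid>  0  .  <one genotype dosage per individual ...>
# following the column order assembled by the column_stack calls, i.e. the BED-like
# layout that the tabix-based association scripts later fetch from.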
        chrm = numpy.array(chrm)
        if x == 0:
            genecounting = numpy.column_stack((numpy.array(chrm)[:,numpy.newaxis],numpy.array(start)[:,numpy.newaxis],
                                               numpy.array(end)[:,numpy.newaxis],numpy.array(gene)[:,numpy.newaxis],
                                               numpy.array(length)[:,numpy.newaxis],numpy.array(counts)[:,numpy.newaxis],))
        else:
            if z == 0:
                genecounting = numpy.array(counts)[:,numpy.newaxis]
                z = 1
            else:
                genecounting = numpy.column_stack((genecounting,numpy.array(counts)[:,numpy.newaxis]))
        x = 1
    #Code to remove corrupted exoncounts files and regenerate...
    except IOError:
        print 'Fixing ' + samp + '...'
        ifier('rm ' + sample + '.exoncounts.txt')
        cleanup = '/mnt/lustre/home/cusanovich/Programs/samtools/samtools merge ' + sample + '.quality.merged.bam ' + sample + '.quality.sort.bam ' + sample + '.saved.quality.sort.bam; \
            /mnt/lustre/home/cusanovich/Programs/BEDTools/bin/bamToBed -i ' + sample + '.quality.merged.bam > ' + sample + '.bed; \
            /mnt/lustre/home/cusanovich/Programs/samtools/samtools merge ' + sample + '.junction.quality.merged.bam ' + sample + '.junction.quality.sort.bam ' + sample + '.saved.junction.quality.sort.bam; \
            /mnt/lustre/home/cusanovich/Programs/samtools/samtools view ' + sample + '.junction.quality.merged.bam > ' + sample + '.junction.quality.merged.sam; \
            python /mnt/lustre/data/users/cusanovich/RNAseq_scripts/junctionreformatter.py ' + sample + '.junction.quality.merged.sam ' + sample + '.junction.bed; \
            cat ' + sample + '.junction.bed >> ' + sample + '.bed; \
            /mnt/lustre/home/cusanovich/Programs/BEDTools/bin/coverageBed -a ' + sample + '.bed -b /mnt/lustre/data/users/cusanovich/References/hg19ProteinCodingEnsemblExonsMergedNonoverlapping.bed > ' + sample + '.exoncounts.txt; \
            python /mnt/lustre/data/users/cusanovich/RNAseq_scripts/exoncombiner.py ' + sample + '.exoncounts.txt ' + sample + '.genecounts.txt; \
            rm ' + sample + '.quality.merged.bam; \
            rm ' + sample + '.junction.quality.merged.bam; \
            rm ' + sample + '.junction.quality.merged.sam; \
            rm ' + sample + '.junction.bed; \
            python 500HT/Scripts/RNAseq/exonmatrixmaker.py;'
        ifier(cleanup)
        print samp + ' fixed.'
#step 2 - chrom raws
#step 3 - chrom bim (snpid + position)
#step 4 - load raw
#step 5 - transpose raw
#step 6 - add chrom, start, end, snpid, "0", "."
#step 7 - save table
#step 8 - compress with bgzip
#step 9 - index with tabix
genodir = '/mnt/lustre/home/cusanovich/500HT/Imputed1415/'
outname = 'hutt.all.imputed'
print 'Creating raw files...'
for j in range(1,23):
    #plinker = 'echo "plink --noweb --nonfounders --maf 0.05 --geno 0.05 --bfile ' + genodir + 'imputed_cgi --chr ' + str(j) + ' --make-bed --out ' + genodir + 'ByChr/hutt.imputed.chr' + str(j) + '; plink --bfile ' + genodir + 'ByChr/hutt.imputed.chr' + str(j) + ' --recodeA --out ' + genodir + 'ByChr/hutt.imputed.chr' + str(j) + '; touch ' + genodir + 'ByChr/chr' + str(j) + '.done" | qsub -l h_vmem=2g -o ~/dump/ -e ~/dump/'
    plinker = 'echo "plink --noweb --nonfounders --bfile ' + genodir + 'hutt.imputed.rename --chr ' + str(j) + ' --make-bed --out ' + genodir + 'ByChr/' + outname + '.chr' + str(j) + '; plink --bfile ' + genodir + 'ByChr/' + outname + '.chr' + str(j) + ' --recodeA --out ' + genodir + 'ByChr/' + outname + '.chr' + str(j) + '; touch ' + genodir + 'ByChr/chr' + str(j) + '.done" | qsub -l h_vmem=2g -o ~/dump/ -e ~/dump/'
    ifier(plinker)
while len(glob.glob(genodir + 'ByChr/*.done')) < 22:
    time.sleep(5)
cleanup = "rm " + genodir + "ByChr/*.done"
ifier(cleanup)
print 'Creating bed files...'
for j in range(1,23):
    converter = 'echo "python /mnt/lustre/home/cusanovich/500HT/Scripts/raw2txt.py ' + str(j) + ' ' + genodir + ' ' + outname + '" | qsub -l h_vmem=8g -o ~/dump/ -e ~/dump/'
    ifier(converter)
while len(glob.glob(genodir + 'ByChr/*.done')) < 22:
    time.sleep(5)
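# Steps 8-9 from the checklist above (bgzip compression and tabix indexing) are not shown
# in this chunk; a minimal sketch of how they could be driven through the same ifier()
# helper, assuming bgzip/tabix are on the PATH and the per-chromosome files are BED-like
# (chrom, pos-1, pos in the first three columns):
for j in range(1, 23):
    perchrom = genodir + 'ByChr/' + outname + '.chr' + str(j) + '.txt'
    ifier('bgzip -f ' + perchrom)               # writes <file>.txt.gz
    ifier('tabix -p bed ' + perchrom + '.gz')   # index on the chrom/start/end columns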
tabixer = Tabixfile('/mnt/lustre/home/cusanovich/500HT/Imputed1415/ByChr/hutt.all.imputed.' + chrm + '.txt.gz')
tempgenos = [x.split('\t') for x in tabixer.fetch(chrm,int(masterdic[snp][1])-1,int(masterdic[snp][2]))][0]
genos = [tempgenos[x] for x in range(0,6) + genoinds]
tabixer.close()
y[snp] = [genos[3], 'A', 'G'] + genos[6:]
print >> currbimbam, ", ".join(y[snp])
#t1 = time.time()
#print t1-t0
currbimbam.close()
#genomat = matrix_reader(genodir + 'hutt.imputed.dhssnps.bimbam',sep=",")
print "Running GEMMA..."
gemmer = (hmdir + 'Programs/gemma0.94 -g ' + currfiles + '.bimbam -p ' + currfiles + '.pheno -k ' + currfiles + '.square.txt -c ' + currfiles + '.covariates -lmm 4 -maf 0.05 -o curr_' + pheno)
t0 = time.time()
ifier(gemmer)
t1 = time.time()
print t1-t0
#currresults = open(genodir + 'output/curr_' + pheno + '.assoc.txt','r')
currresults = matrix_reader(genodir + 'output/curr_' + pheno + '.assoc.txt',dtype='f8')
currsort = currresults[currresults[:,12].argsort()]
currwins = currsort[0:100,]
currscores = [0]*len(dhsdic[dhsdic.keys()[0]])
for snp in currwins[:,1]:
    currscores = currscores + dhsdic[snp]
currperms = [0]*len(dhsdic[dhsdic.keys()[0]])
currpermwins = [0]*len(dhsdic[dhsdic.keys()[0]])
curractive = [0]*len(dhsdic[dhsdic.keys()[0]])
print "Running permutations..."
for perm in xrange(0,100):
        masterdic[mastercols[i,0]].append(mastercols[i,1])
    except KeyError:
        masterdic[mastercols[i,0]] = [mastercols[i,1]]
        exprcoldic[mastercols[i,0]] = mastercols[i,2]
        chrmdic[mastercols[i,0]] = mastercols[i,4]

####Build a dictionary to reference the genomic coordinates of each SNP
print "Loading SNP annotations..."
snpdic = {}
snpbed = open('/mnt/lustre/home/cusanovich/500HT/hutt.imputed.coord.bed','r')
for line in snpbed:
    liner = line.strip().split()
    snpdic[liner[3]] = liner[0:3]
cover = ('cp ' + hmdir + '500HT/addSNP.500ht.ordered.square.txt ' + currfiles + '.square.txt')
ifier(cover)
if regressPCs:
    expressed = matrix_reader(hmdir + '500HT/qqnorm.500ht' + gccor + covcor + '.ordered.bimbam',dtype='float')
    pcmat = matrix_reader(hmdir + '500HT/Exprs/qqnorm.500ht' + gccor + covcor + '.ordered.pc' + str(pcs),dtype='float')
if not regressPCs:
    cover = ('cp ' + hmdir + '500HT/Exprs/qqnorm.500ht' + gccor + covcor + '.ordered.pc' + str(pcs) + ' ' + currfiles + '.pcs.txt')
    ifier(cover)
if regressPCs and int(pcs) != 0:
    mod1 = pcmat
    mod2 = mod1.T
    Y = expressed.T
    W = pcmat[:,1:(int(pcs)+1)]
    mods = mod2.dot(mod1)
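# A hedged sketch of what the regressPCs branch is setting up: regressing the first `pcs`
# expression PCs (columns of W) out of every phenotype by ordinary least squares and
# keeping the residuals. This is a generic residualization, not necessarily the exact
# matrix algebra the original script continues with; it assumes numpy is imported and
# that Y is samples x genes while W is samples x PCs.
def residualize(Y, W):
    beta = numpy.linalg.lstsq(W, Y)[0]   # least-squares fit of each gene on the PCs
    return Y - W.dot(beta)               # residual expression with PCs regressed out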
for snp in masterdic.keys():
#for snp in masterdic.keys()[0:1000]:
    print >> snplist, snp
    print >> alist, 'A'
    print >> glist, 'G'
snplist.close()
alist.close()
glist.close()
print "Running permutations..."
blocker = open(genodir + 'Block_' + blocknum + 'permwins.txt','w')
for perm in xrange(0,100):
    permer(perm,randind = genoinds)
    permresults = matrix_reader(genodir + 'output/perm_curr_' + blocknum + '.assoc.txt',dtype='f8')
    permsort = permresults[permresults[:,12].argsort()]
    permwins = permsort[0:100,]
    print >> blocker, '\t'.join(permwins)
blocker.close()
cleanup = 'rm ' + genodir + '*curr_' + pheno + '.*'
ifier(cleanup)
print "Writing results..."
aller = open('/mnt/lustre/home/cusanovich/500HT/Tissues/' + pheno + '.enrichmentps.txt','w')
for i in xrange(0,len(tissueps)):
    print >> aller, '{0}\t{1:.4g}'.format(dhsdic['rsID'][i],tissueps[i])
aller.close()
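# Sketch (an assumption, not the original computation) of how per-tissue enrichment
# p-values like `tissueps` could be derived from the permutation loop above: count how
# often a permutation's top-100 SNPs overlap each tissue's DHS annotation at least as
# strongly as the observed top-100 scores, then apply the add-one permutation estimate.
def enrichment_ps(obs_scores, perm_scores_list):
    nperm = len(perm_scores_list)
    wins = [0] * len(obs_scores)
    for perm_scores in perm_scores_list:
        for t in range(len(obs_scores)):
            if perm_scores[t] >= obs_scores[t]:
                wins[t] += 1
    return [(w + 1) / float(nperm + 1) for w in wins]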