def _library_span(path, tokens=('A', 'HS', 'E')):
    """Return ``(start, end)`` of the library id inside *path*, or ``None``.

    The id starts at the first occurrence of the first token (tried in order)
    that appears in *path* and ends just before the next '.'.
    """
    for token in tokens:
        start = path.find(token)
        if start < 0:
            continue
        end = path.find('.', start)
        # The search stops at the first token found, even if no '.' follows.
        return (start, end) if end >= 0 else None
    return None


def read_dir(dir, mark, loc, ext, *spl):
    """Read every file matching ``dir/*mark*ext`` whose path contains *loc*.

    Parameters:
        dir:  directory handed to ``ls`` (not escaped; trusted input only).
        mark: substring that must appear in the file name.
        loc:  substring that must appear in the listed path.
        ext:  file-name suffix used in the ``ls`` pattern.
        *spl: optional; ``spl[0]`` is passed to ``read.read_dat`` as separator.

    Returns:
        ``(files, data)`` - library ids cut out of each path and the matching
        ``read.read_dat`` results, in ``ls`` order.

    Raises:
        ValueError: when no library id can be located in a path.  Previously
        this raised ``NameError`` for the first file, or silently reused the
        slice bounds of the previous file.
    """
    # os.popen does not raise when ``ls`` matches nothing, so the old
    # try/except fallback pattern could never run; one call is enough.
    listing = os.popen('ls ' + dir + '/*' + mark + '*' + ext)
    data = []
    files = []
    for fl in listing.readlines():
        if loc not in fl:
            continue
        path = fl[:-1]  # drop the trailing newline from ``ls``
        if len(spl) > 0:
            data.append(read.read_dat(path, str(spl[0])))
        else:
            data.append(read.read_dat(path))
        span = _library_span(fl)
        if span is None:
            raise ValueError('cannot locate a library id in %r' % path)
        start, end = span
        # Bounds are computed on the raw line and applied to the output of
        # read.tabless, as before.
        # NOTE(review): assumes tabless keeps character positions - confirm.
        files.append(read.tabless(fl)[start:end])
    return files, data
def readall_bedintersect(dir, mark, *flagstat):
    """Read all ``dir/*mark*.bed`` files and optionally normalise them.

    Parameters:
        dir:       directory handed to ``ls`` (not escaped; trusted input only).
        mark:      substring that must appear in the file name.
        *flagstat: optional; ``flagstat[0]`` is the path of a tab-separated
                   table whose first column is matched against the file names
                   and whose last column is passed to ``norm_bed``.

    Returns:
        ``(files, data)`` - file names up to their first '.', and the
        ``read.read_bed_intersect`` result for each file.
    """
    listing = os.popen('ls ' + dir + '/*' + mark + '*.bed')
    data = []
    files = []
    for line in listing.readlines():
        path = line.strip()
        data.append(read.read_bed_intersect(path))
        # Take the name from the basename only.  The old slice used the first
        # '.' of the whole path and broke when a directory contained a dot.
        name = os.path.basename(path).split('.')[0]
        print(name)
        files.append(name)
    if len(flagstat) > 0:
        normalization = read.read_dat(str(flagstat[0]), '\t')
        for idx, name in enumerate(files):
            for row in normalization:
                if name in row[0]:
                    data[idx] = norm_bed(data[idx], row[-1])
                    break
            else:
                # No table row mentions this file: leave it unnormalised.
                print('%s %s' % (name, 'not found'))
    return files, data
def CPG_RPKM(RPKM, CPG, lim):
    """Average expression per library, overall and for listed genes only.

    Parameters:
        RPKM: table whose first row is ``[<label>, gene, gene, ...]`` and whose
              other rows are ``[library, value, value, ...]``.
        CPG:  path of a tab-separated file; its first column lists gene ids.
        lim:  only values strictly greater than ``lim`` are averaged.

    Returns:
        ``(allave, cgiave, libs)`` - per-library mean of values above ``lim``,
        the same mean restricted to genes listed in ``CPG``, and the library
        names.  A library with no qualifying value yields ``nan``.
    """
    # A set gives O(1) membership; the old list was scanned for every gene.
    cgi_genes = set(row[0] for row in read.read_dat(CPG, '\t'))
    genes = np.array(RPKM[0][1:])
    libs = [row[0] for row in RPKM[1:]]
    # NOTE(review): assumes analyse.data2arr returns a 2-D numeric ndarray.
    mat = analyse.data2arr([row[1:] for row in RPKM[1:]])
    allave = []
    cgiave = []
    print(len(mat))
    for i in range(len(mat)):
        keep = mat[i, :] > lim
        kept_genes = genes[keep]
        kept_values = mat[i, :][keep]
        allave.append(np.mean(kept_values))
        cgirpkm = [value for gene, value in zip(kept_genes, kept_values)
                   if gene in cgi_genes]
        cgiave.append(np.mean(cgirpkm))
        print('%s %s' % (len(kept_genes), len(cgirpkm)))
    return np.array(allave), np.array(cgiave), libs
def CPG_RPKM(RPKM, CPG, lim):
    """Average expression per library, overall and for listed genes only.

    Parameters:
        RPKM: table whose first row is ``[<label>, gene, gene, ...]`` and whose
              other rows are ``[library, value, value, ...]``.
        CPG:  path of a tab-separated file; its first column lists gene ids.
        lim:  only values strictly greater than ``lim`` are averaged.

    Returns:
        ``(allave, cgiave, libs)`` - per-library mean of values above ``lim``,
        the same mean restricted to genes listed in ``CPG``, and the library
        names.  A library with no qualifying value yields ``nan``.
    """
    # A set gives O(1) membership; the old list was scanned for every gene.
    cgi_genes = set(row[0] for row in read.read_dat(CPG, '\t'))
    genes = np.array(RPKM[0][1:])
    libs = [row[0] for row in RPKM[1:]]
    # NOTE(review): assumes analyse.data2arr returns a 2-D numeric ndarray.
    mat = analyse.data2arr([row[1:] for row in RPKM[1:]])
    allave = []
    cgiave = []
    print(len(mat))
    for i in range(len(mat)):
        keep = mat[i, :] > lim
        kept_genes = genes[keep]
        kept_values = mat[i, :][keep]
        allave.append(np.mean(kept_values))
        cgirpkm = [value for gene, value in zip(kept_genes, kept_values)
                   if gene in cgi_genes]
        cgiave.append(np.mean(cgirpkm))
        print('%s %s' % (len(kept_genes), len(cgirpkm)))
    return np.array(allave), np.array(cgiave), libs
def gene_rpkm_compare(thersh, rpkmdata, rpkmfiles, libs, *enslist):
    """Select genes whose RPKM differs between 'RT' and normal samples.

    Parameters:
        thersh:    p-value threshold; divided by 20000 before the comparison.
        rpkmdata:  one table per sample; ``rpkmdata[s][g][0]`` is the gene id
                   and ``rpkmdata[s][g][2]`` the value compared.
        rpkmfiles: file name per sample; the part before the first '.' is
                   looked up in the key table.
        libs:      path of a tab-separated key table; a row whose first column
                   contains the file stem relabels the sample as
                   ``row[1] + '-' + row[-1]``.
        *enslist:  optional; ``enslist[0]`` is read and passed to ``ens_genes``
                   (the result is not returned).

    Returns:
        ``(genelist, foldchange, pval)`` for genes passing the two-sample KS
        test with summed group means above 1: gene id, log of the
        normal/RT median ratio, and ``-log(p * 20000)``.
    """
    n_tests = 20000.0  # was np.float(20000); that alias is gone in NumPy 1.24

    files = [rpkmfiles[i][:rpkmfiles[i].index('.')]
             for i in range(len(rpkmdata))]
    key_rows = read.read_dat(libs, '\t')
    for i in range(len(files)):
        for key in key_rows:
            if files[i] in key[0]:
                files[i] = key[1] + '-' + key[-1]
                break

    # Sample groups do not depend on the gene, so classify them once.
    rt_cols = [i for i in range(len(rpkmdata)) if 'RT' in files[i]]
    normal_cols = [i for i in range(len(rpkmdata))
                   if 'RT' not in files[i] and 'Cancer' not in files[i]]

    genelist = []
    foldchange = []
    pval = []
    for igene in range(len(rpkmdata[0])):
        rpkm_rt = np.array([rpkmdata[i][igene][2] for i in rt_cols])
        rpkm_normal = np.array([rpkmdata[i][igene][2] for i in normal_cols])
        pvalue = ss.ks_2samp(rpkm_rt, rpkm_normal)[1]
        if (pvalue < thersh / n_tests
                and np.mean(rpkm_normal) + np.mean(rpkm_rt) > 1.):
            genelist.append(rpkmdata[0][igene][0])
            pval.append(-np.log(pvalue * n_tests))
            # NOTE(review): a zero RT median gives inf/nan here - confirm
            # callers tolerate that.
            foldchange.append(np.log(np.median(rpkm_normal)
                                     / np.median(rpkm_rt)))
    print('RT sample: %s , Normal sample: %s'
          % (len(rt_cols), len(normal_cols)))
    if len(enslist) > 0:
        # Result is unused; the call is kept so a bad gene list still fails.
        ens_genes(genelist, read.read_dat(enslist[0]))
    return genelist, foldchange, pval
def gene_rpkm_compare(thersh, rpkmdata, rpkmfiles, libs, *enslist):
    """Select genes whose RPKM differs between 'RT' and normal samples.

    Parameters:
        thersh:    p-value threshold; divided by 20000 before the comparison.
        rpkmdata:  one table per sample; ``rpkmdata[s][g][0]`` is the gene id
                   and ``rpkmdata[s][g][2]`` the value compared.
        rpkmfiles: file name per sample; the part before the first '.' is
                   looked up in the key table.
        libs:      path of a tab-separated key table; a row whose first column
                   contains the file stem relabels the sample as
                   ``row[1] + '-' + row[-1]``.
        *enslist:  optional; ``enslist[0]`` is read and passed to ``ens_genes``
                   (the result is not returned).

    Returns:
        ``(genelist, foldchange, pval)`` for genes passing the two-sample KS
        test with summed group means above 1: gene id, log of the
        normal/RT median ratio, and ``-log(p * 20000)``.
    """
    n_tests = 20000.0  # was np.float(20000); that alias is gone in NumPy 1.24

    files = [rpkmfiles[i][:rpkmfiles[i].index('.')]
             for i in range(len(rpkmdata))]
    key_rows = read.read_dat(libs, '\t')
    for i in range(len(files)):
        for key in key_rows:
            if files[i] in key[0]:
                files[i] = key[1] + '-' + key[-1]
                break

    # Sample groups do not depend on the gene, so classify them once.
    rt_cols = [i for i in range(len(rpkmdata)) if 'RT' in files[i]]
    normal_cols = [i for i in range(len(rpkmdata))
                   if 'RT' not in files[i] and 'Cancer' not in files[i]]

    genelist = []
    foldchange = []
    pval = []
    for igene in range(len(rpkmdata[0])):
        rpkm_rt = np.array([rpkmdata[i][igene][2] for i in rt_cols])
        rpkm_normal = np.array([rpkmdata[i][igene][2] for i in normal_cols])
        pvalue = ss.ks_2samp(rpkm_rt, rpkm_normal)[1]
        if (pvalue < thersh / n_tests
                and np.mean(rpkm_normal) + np.mean(rpkm_rt) > 1.):
            genelist.append(rpkmdata[0][igene][0])
            pval.append(-np.log(pvalue * n_tests))
            # NOTE(review): a zero RT median gives inf/nan here - confirm
            # callers tolerate that.
            foldchange.append(np.log(np.median(rpkm_normal)
                                     / np.median(rpkm_rt)))
    print('RT sample: %s , Normal sample: %s'
          % (len(rt_cols), len(normal_cols)))
    if len(enslist) > 0:
        # Result is unused; the call is kept so a bad gene list still fails.
        ens_genes(genelist, read.read_dat(enslist[0]))
    return genelist, foldchange, pval
def pl_PET(dr, table):
    """Plot normalised ``.dist`` profiles, one figure per histone mark.

    ``table`` is read with ``read.read_dat``; every header cell containing a
    mark name contributes a group of library ids taken from rows 1..16 of that
    column.  Each file listed by ``ls <dr>*A*.dist`` is plotted (positions
    50..549, scaled by the window maximum) on the figure of every group that
    has a library id occurring in the file path.  Figures are saved as
    ``<mark>.pdf``; the list of plotted profiles is returned.
    """
    # NOTE(review): nesting of the legend/print calls was reconstructed from a
    # whitespace-collapsed source - confirm against the original file.
    def second_column(rows):
        return np.array([float(row[1]) for row in rows])

    marks = ['H3K4me1', 'H3K4me3', 'H3K9me3', 'H3K27me3', 'H3K36me3',
             'H3K27ac']
    tab = read.read_dat(table)
    header = tab[0]
    libs = []
    for mark in marks:
        for col in range(len(header)):
            if mark in header[col]:
                group = []
                libs.append(group)
                for row_no in range(1, 17):
                    group.append(tab[row_no][col])
    num = len(libs[0])  # unused, but fails early when no column matched

    data = []
    listing = os.popen('ls ' + dr + '*' + 'A' + '*.dist')
    for line in listing.readlines():
        path = line[:-1]
        for fig_no in range(len(marks)):
            plt.figure(fig_no)
            plt.title(marks[fig_no])
            for lib in libs[fig_no]:
                if lib in path:
                    profile = second_column(read.read_dat(path))
                    data.append(profile)
                    plt.plot(range(50, 550),
                             profile[50:550] / np.max(profile[50:550]),
                             's-', label=lib)
            plt.legend(prop={'size': 14})
            print(fig_no)
    for fig_no in range(len(marks)):
        figure = plt.figure(fig_no)
        figure.savefig(marks[fig_no] + '.pdf', bbox_inches='tight')
    return data
def readall_coverage(dir, mark, libs):
    """Build a per-file, per-gene coverage matrix from ``.coverage`` files.

    Parameters:
        dir:  directory handed to ``ls`` (not escaped; trusted input only).
        mark: substring that must appear in the file name.
        libs: path of a tab-separated key table; a file whose name contains a
              row's last column is relabelled with that row's first column.

    Returns:
        ``(files, mat, genes)`` - one label per file, an array with one row
        per file and one column per gene (sum of column 4 over the rows naming
        that gene in column 3), and the gene names taken from the first file.
    """
    ext = '.coverage'
    prefix = 'hg19v69_genes.TSS_2000.pc.'
    # os.popen does not raise when ``ls`` matches nothing, so the old
    # try/except fallback pattern could never run.
    listing = os.popen('ls ' + dir + '/*' + mark + '*' + ext)
    data = []
    files = []
    for line in listing.readlines():
        data.append(read.read_dat(line[:-1], '\t'))
        # basename without the trailing '.coverage' and newline
        files.append(line.split('/')[-1][0:-len(ext) - 1])

    genes = [row[3] for row in data[0]] if data else []
    # First occurrence wins, matching list.index, but in O(1) per lookup.
    gene_column = {}
    for idx, gene in enumerate(genes):
        gene_column.setdefault(gene, idx)

    mat = []
    for rows in data:
        totals = np.zeros(len(genes), float)
        for row in rows:
            try:
                totals[gene_column[row[3]]] += float(row[4])
            except (KeyError, IndexError, ValueError, TypeError):
                # Best effort, as before: skip unknown genes and rows that
                # are too short or not numeric.
                continue
        mat.append(totals)
    mat = np.array(mat)
    print('ending')

    for key in read.read_dat(libs, '\t'):
        for j in range(len(files)):
            if key[-1] in files[j]:
                files[j] = key[0]
            if prefix in files[j]:
                # Remove the annotation prefix.  The old expression kept
                # files[j][ind] (a single character) instead of
                # files[j][:ind], leaving a stray 'h' in front of the name.
                files[j] = files[j].replace(prefix, '', 1)
    return files, mat, genes
def pl_PET(dr, table):
    """Plot normalised ``.dist`` profiles, one figure per histone mark.

    ``table`` is read with ``read.read_dat``; every header cell containing a
    mark name contributes a group of library ids taken from rows 1..16 of that
    column.  Each file listed by ``ls <dr>*A*.dist`` is plotted (positions
    50..549, scaled by the window maximum) on the figure of every group that
    has a library id occurring in the file path.  Figures are saved as
    ``<mark>.pdf``; the list of plotted profiles is returned.
    """
    # NOTE(review): nesting of the legend/print calls was reconstructed from a
    # whitespace-collapsed source - confirm against the original file.
    def second_column(rows):
        return np.array([float(row[1]) for row in rows])

    marks = ['H3K4me1', 'H3K4me3', 'H3K9me3', 'H3K27me3', 'H3K36me3',
             'H3K27ac']
    tab = read.read_dat(table)
    header = tab[0]
    libs = []
    for mark in marks:
        for col in range(len(header)):
            if mark in header[col]:
                group = []
                libs.append(group)
                for row_no in range(1, 17):
                    group.append(tab[row_no][col])
    num = len(libs[0])  # unused, but fails early when no column matched

    data = []
    listing = os.popen('ls ' + dr + '*' + 'A' + '*.dist')
    for line in listing.readlines():
        path = line[:-1]
        for fig_no in range(len(marks)):
            plt.figure(fig_no)
            plt.title(marks[fig_no])
            for lib in libs[fig_no]:
                if lib in path:
                    profile = second_column(read.read_dat(path))
                    data.append(profile)
                    plt.plot(range(50, 550),
                             profile[50:550] / np.max(profile[50:550]),
                             's-', label=lib)
            plt.legend(prop={'size': 14})
            print(fig_no)
    for fig_no in range(len(marks)):
        figure = plt.figure(fig_no)
        figure.savefig(marks[fig_no] + '.pdf', bbox_inches='tight')
    return data
def write_QC1(fl,ordered,targetid,ref):
    """Annotate a QC table with mark, target and pass/fail, write table.txt.

    fl       -- path of the input table, read with read.read_dat
    ordered  -- library ids; their position selects the histone-mark label
    targetid -- rows searched for the last four characters of a library id
    ref      -- rows used to translate the target name (first -> last column)

    Returns the table rows; matched rows gain three extra columns.
    NOTE(review): block nesting was reconstructed from a whitespace-collapsed
    source - confirm against the original file.
    """
    #fread=open(fl,'r')
    lable=['H3K4me1','H3K4me3','H3K9me3','H3K27me3','H3K36me3','H3K27ac','Input DNA']
    data=read.read_dat(fl)
    # Libraries per label; integer division under Python 2.
    num=len(ordered)/len(lable)
    ordered=list(ordered)
    ln=data[0]
    i=0
    for tmpi in range(1,len(data)):
        tmpdata=data[tmpi]
        libid=str(tmpdata[2])
        if libid in ordered:
            ind=ordered.index(libid)
            # NOTE(review): raises ZeroDivisionError when num is 0 and
            # IndexError when ind/num reaches len(lable) - confirm inputs.
            tmpdata.append(lable[int(ind/float(num))])
            idi=0
            # The last targetid row mentioning the id's last 4 characters wins.
            for idtmp in range(1,len(targetid)):
                for libtmp in targetid[idtmp]:
                    if str(libid[-4:]) in str(libtmp):
                        idi=idtmp
            tmp=targetid[idi][-1]
            for itm in ref:
                if tmp in itm[0]:
                    tmp=itm[-1]
            # idi==0 means no targetid row matched; row 0 was used as fallback.
            if idi==0:
                print 'aha'
            tmpdata.append(str(tmp))
            # Column 5 is compared against a fixed 20,000,000 threshold.
            if float(tmpdata[5])<20000000:
                tmpdata.append('failed')
            else:
                tmpdata.append('passed')
        #data[tmpi]=tmpdata
    fwrite=open('table.txt','w')
    for i in ln:
        print >> fwrite, str(i),
    # NOTE(review): this newline goes to stdout, not to fwrite, and the loop
    # below writes data[0] again - confirm the header layout is intended.
    print '\n'
    for tmpdata in data:
        for i in tmpdata:
            # Numeric cells are written as floats, everything else as text.
            try:
                print >> fwrite, float(i),'\t',
            except:
                print >> fwrite, str(i),'\t',
        print >> fwrite, '\n',
    fwrite.close()
    return data
def readall_bed(dir,mark,libs,*flagstat):
    """Read the dir/*mark*.coverage files that belong to the given libraries.

    dir, mark -- used to build the `ls` pattern
    libs      -- library ids; entries containing '#' are ignored
    flagstat  -- optional; flagstat[0] is a tab-separated table whose first
                 column is matched against the file labels and whose last
                 column is passed to norm_bed

    Returns (files, data): a label per matched file and its read.read_bed
    result.
    NOTE(review): block nesting was reconstructed from a whitespace-collapsed
    source - confirm against the original file.
    """
    direct=os.popen('ls '+dir +'/*'+mark+'*.coverage')
    data=[]
    files=[]
    for fl in direct.readlines():
        fl=fl.strip()
        for alib in libs:
            if alib in fl and '#' not in alib:
                tmpdata=read.read_bed(fl)
                data.append(tmpdata)
                #location=fl[::-1].index('/')
                # Locate the library id: first 'A', 'HS' or 'E' in the path,
                # up to the next '.'.
                libst=['A','HS','E']
                ind=0
                run=True
                while run and ind<len(libst):
                    try:
                        Ai=fl.index(libst[ind])
                        run=False
                        Aend=Ai+fl[Ai:].index('.')
                    except:
                        # Token (or the following '.') not found: fall back to
                        # everything from the first '/' onwards, try next token.
                        ind+=1
                        Ai=fl.index('/')
                        Aend=len(fl)
                        #print libst[ind-1] , 'tryed';
                # fl is overwritten here, so later entries of libs are tested
                # against the shortened label, not the full path.
                fl = fl[Ai:Aend]
                #print fl
                if mark in fl:
                    # Keep from the mark up to the first '.' found after it.
                    fl=fl[fl.index(mark):]
                    fl=fl[:len(mark)+1+fl[len(mark)+1:].index('.')]
                    #print fl
                files.append(fl.strip())
    if len(flagstat)>0:
        flagstat=str(flagstat[0])
        normalization=read.read_dat(flagstat,'\t')
        for fl in range(len(files)):
            bool=False
            for i in normalization:
                if files[fl] in i[0]:
                    data[fl]=norm_bed(data[fl],i[-1])
                    #print i[-1]
                    bool=True
                    break
            if not bool:
                print files[fl], 'not found'
    return files, data
def write_QC1(fl, ordered, targetid, ref):
    """Annotate a QC table with mark, target and pass/fail, write table.txt.

    fl       -- path of the input table, read with read.read_dat
    ordered  -- library ids; their position selects the histone-mark label
    targetid -- rows searched for the last four characters of a library id
    ref      -- rows used to translate the target name (first -> last column)

    Returns the table rows; matched rows gain three extra columns.
    NOTE(review): block nesting was reconstructed from a whitespace-collapsed
    source - confirm against the original file.
    """
    # fread=open(fl,'r')
    lable = ["H3K4me1", "H3K4me3", "H3K9me3", "H3K27me3", "H3K36me3", "H3K27ac", "Input DNA"]
    data = read.read_dat(fl)
    # Libraries per label; integer division under Python 2.
    num = len(ordered) / len(lable)
    ordered = list(ordered)
    ln = data[0]
    i = 0
    for tmpi in range(1, len(data)):
        tmpdata = data[tmpi]
        libid = str(tmpdata[2])
        if libid in ordered:
            ind = ordered.index(libid)
            # NOTE(review): raises ZeroDivisionError when num is 0 and
            # IndexError when ind/num reaches len(lable) - confirm inputs.
            tmpdata.append(lable[int(ind / float(num))])
            idi = 0
            # The last targetid row mentioning the id's last 4 characters wins.
            for idtmp in range(1, len(targetid)):
                for libtmp in targetid[idtmp]:
                    if str(libid[-4:]) in str(libtmp):
                        idi = idtmp
            tmp = targetid[idi][-1]
            for itm in ref:
                if tmp in itm[0]:
                    tmp = itm[-1]
            # idi == 0 means no targetid row matched; row 0 was used instead.
            if idi == 0:
                print "aha"
            tmpdata.append(str(tmp))
            # Column 5 is compared against a fixed 20,000,000 threshold.
            if float(tmpdata[5]) < 20000000:
                tmpdata.append("failed")
            else:
                tmpdata.append("passed")
        # data[tmpi]=tmpdata
    fwrite = open("table.txt", "w")
    for i in ln:
        print >> fwrite, str(i),
    # NOTE(review): this newline goes to stdout, not to fwrite, and the loop
    # below writes data[0] again - confirm the header layout is intended.
    print "\n"
    for tmpdata in data:
        for i in tmpdata:
            # Numeric cells are written as floats, everything else as text.
            try:
                print >> fwrite, float(i), "\t",
            except:
                print >> fwrite, str(i), "\t",
        print >> fwrite, "\n",
    fwrite.close()
    return data
def read_dist(dr):
    """Load every ``<dr>/*.dist`` file.

    Returns ``(lib, dist)``: for each file listed by ``ls``, the six
    characters of the listed path that follow ``dr`` and one separator
    character, and an array of the second-to-last column of its rows.
    """
    listing = os.popen('ls ' + dr + '/*.dist')
    prefix_len = len(dr)
    names = []
    profiles = []
    for path_line in listing.readlines():
        rows = read.read_dat(path_line[:-1])  # strip the newline from ls
        profiles.append(np.array([float(row[-2]) for row in rows]))
        names.append(path_line[prefix_len + 1:prefix_len + 7])
    return names, profiles
def read_alldir(dir, ext, *include):
    """Read every tab-separated file in ``dir`` whose name ends in ``ext``.

    When ``include`` is given, only names containing ``include[0]`` are
    listed.  Returns ``(files, data)``: base names and ``read.read_dat``
    results, in ``ls`` order.
    """
    #h3k27files, allh3k27=read.read_alldir('rhabdoid/coverage/TSS_2000_all','coverage','H3K27me3')
    pattern = '*' + str(include[0]) + '*' if include else '*'
    listing = os.popen('ls ' + dir + '/' + pattern + ext)
    files = []
    data = []
    for line in listing.readlines():
        data.append(read.read_dat(line[:-1], str('\t')))
        # Everything after the last '/', then drop the trailing newline.
        tail = line[-line[::-1].index('/'):]
        files.append(tail[:-1])
    return files, data
def readall_genescore(dr, mark, lib, genes):
    """Collect one score per (gene, library) from ``.coverage`` files.

    Parameters:
        dr:    path prefix for the ``ls`` pattern ``<dr>*<mark>*.coverage``
               (no '/' is inserted).
        mark:  substring that must appear in the file name.
        lib:   library ids; entries containing '#' are skipped and anything
               after the first '-' is dropped.
        genes: gene names, matched as substrings of column 3 of each file.

    Returns:
        Array of shape ``(len(genes), number of libraries)`` holding the
        second-to-last column of the first matching row; 0 where no row
        matches.

    Raises:
        KeyError: when a requested library has no matching file.  The
        exception type is unchanged; the message now names the libraries.
    """
    listing = os.popen('ls ' + dr + '*' + mark + '*.coverage')
    # split('-')[0] returns the whole id when there is no '-'.
    mylib = [name.split('-')[0] for name in lib if '#' not in name]

    tables = {}
    for line in listing.readlines():
        for alib in mylib:
            if alib in line:
                tables[alib] = read.read_dat(line[:-1], '\t')
                break
    missing = [alib for alib in mylib if alib not in tables]
    if missing:
        raise KeyError('no coverage file matched library id(s): %s'
                       % ', '.join(missing))

    # float replaces np.float, an alias removed in NumPy 1.24.
    mat = np.zeros((len(genes), len(mylib)), float)
    for libi, alib in enumerate(mylib):
        rows = tables[alib]
        for genei, gene in enumerate(genes):
            for row in rows:
                if gene in row[3]:
                    mat[genei, libi] = float(row[-2])
                    break
    return mat
def write_bed_loc(projlist, tresh, coverfile):
    """Write the regions whose projection exceeds a threshold to ``out.bed``.

    Parameters:
        projlist:  one number per row of ``coverfile``.
        tresh:     rows with ``abs(projlist[i]) > tresh`` are kept.
        coverfile: tab-separated file; columns 0-2 are written to
                   ``out.bed`` and column 3 is collected.

    Returns:
        Column 3 (as strings) of every kept row.
    """
    data = read.read_dat(coverfile, '\t')
    genes = []
    # ``with`` closes out.bed even when a row is too short or missing.
    with open('out.bed', 'w') as out:
        print('%s %s' % (len(data), len(data[0])))
        for i in range(len(projlist)):
            if abs(projlist[i]) > tresh:
                row = data[i]
                genes.append(str(row[3]))
                out.write('\t'.join(str(row[j]) for j in range(3)) + '\n')
    print('%s %s' % ('num of peaks: ', len(genes)))
    return genes
def write_bed_loc(projlist, tresh, coverfile):
    """Write the regions whose projection exceeds a threshold to ``out.bed``.

    Parameters:
        projlist:  one number per row of ``coverfile``.
        tresh:     rows with ``abs(projlist[i]) > tresh`` are kept.
        coverfile: tab-separated file; columns 0-2 are written to
                   ``out.bed`` and column 3 is collected.

    Returns:
        Column 3 (as strings) of every kept row.
    """
    data = read.read_dat(coverfile, '\t')
    genes = []
    # ``with`` closes out.bed even when a row is too short or missing.
    with open('out.bed', 'w') as out:
        print('%s %s' % (len(data), len(data[0])))
        for i in range(len(projlist)):
            if abs(projlist[i]) > tresh:
                row = data[i]
                genes.append(str(row[3]))
                out.write('\t'.join(str(row[j]) for j in range(3)) + '\n')
    print('%s %s' % ('num of peaks: ', len(genes)))
    return genes
def mk_cluster(dr,mark,loc,*perc):
    """Write a clustering table built from dr/*mark*.coverage files.

    dr, mark -- used to build the `ls` pattern
    loc      -- only files whose path contains loc contribute data
    perc     -- optional; perc[0] is a percentile below which values are
                zeroed (see the review note at that loop)

    Writes cluster-<mark>-<loc>[-<perc>].txt and returns (lable, header,
    table): row labels, library ids from the header line, and the value
    matrix (one row per region, one column per data file).
    NOTE(review): block nesting was reconstructed from a whitespace-collapsed
    source - confirm against the original file.
    """
    #writes the table for clustering
    header=[]
    if len(perc)>0:
        f=open('cluster-'+mark+'-'+loc+'-'+str(perc[0])+'.txt','w')
    else:
        f=open('cluster-'+mark+'-'+loc+'.txt','w')
    direct=os.popen('ls '+dr +'/*'+mark+'*.coverage')
    # Header line: one library id per listed file (not filtered by loc).
    for fl in direct.readlines():
        #print fl
        libst=['A','HS','E']
        ind=0
        run=True
        # NOTE(review): `<=` lets ind reach len(libst); the resulting
        # IndexError is swallowed by the bare except, and Ai is then unbound
        # or left over from the previous file.
        while run and ind<=len(libst):
            try:
                Ai=fl.index(libst[ind])
                run=False
            except:
                ind+=1
        Aend=Ai+fl[Ai:].index('.')
        print >> f, '\t',fl[Ai:Aend],
        header.append(fl[Ai:Aend])
    print >> f, '\n',
    direct=os.popen('ls '+dr +'/*'+mark+'*.coverage')
    data=[]
    for fl in direct.readlines():
        if loc in fl:
            tmpdata=read.read_dat(fl[:-1],'\t')
            # Empty files are reported and left out.
            if len(tmpdata)<1:
                print len(tmpdata),fl[:-1]
            else:
                data.append(tmpdata)
    print len(data),len(data[0])
    enrich=[]
    lable=[]
    table=[]
    for line in range(len(data[0])):
        table.append([])
        #lable.append([])
        for tmpdata in data:
            #print (tmpdata[0])
            table[-1].append(float(tmpdata[line][-2]))
        table[-1]=np.array(table[-1])
        # Row label: first four columns of the last data file, '_'-joined,
        # with '_1' appended.
        tmp2=''
        for i in range(4):
            tmp2=tmp2+str(tmpdata[line][i])+'_'
        tmp2=tmp2[:-1]+'_1'
        lable.append(tmp2)
    table=np.array(table)
    print len(table[0,:]), len(table[:,0])
    if len(perc)>0:
        # NOTE(review): the range is the number of columns but the index
        # selects rows, so only the first len(table[0,:]) rows are
        # thresholded - confirm the intended axis.
        for row in range(len(table[0,:])):
            tmp=np.percentile(table[row,:],float(perc[0]))
            print tmp,
            table[row,:][table[row,:]<tmp]=0.
    enrich=[]
    for i in range(len(table[:,0])):
        enrich.append('')
        for j in range(len(table[0,:])):
            enrich[-1]+=str(table[i,j])+'\t'
    for line in range(len(data[0])):
        print >> f, lable[line],enrich[line]
    #print lable[:3],enrich[:3]
    f.close()
    return lable,header,table
def read_wig(file, *flagstat):
    """Run-length encode a fixed-step wiggle file, per chromosome.

    Declaration lines are expected to look like
    ``<word> chrom=<name> start=<int> step=<int>``; every other line must be
    an integer coverage value.  The second line of the file must be a value.

    Parameters:
        file:      path of the wiggle file; opened with ``gzip`` when the
                   path contains '.gz'.
        *flagstat: optional; ``flagstat[0]`` is a tab-separated table.  The
                   row whose first column contains ``file[0:6]`` supplies
                   (last column) the factor passed to ``norm_wig``.

    Returns:
        dict mapping chromosome name to an array of ``[begin, end, cover]``
        rows.  A run is emitted only when the value changes, so the run still
        open at the end of the file or at the next declaration line is not
        included.

    Raises:
        ValueError: when the file has fewer than two lines or the second line
        is not an integer.
    """
    # Read once and always close.  The old code leaked its first handle and
    # read the file twice.
    handle = gzip.open(file) if '.gz' in file else open(file, 'r')
    try:
        lines = handle.read().splitlines()
    finally:
        handle.close()
    if len(lines) < 2:
        raise ValueError('%s: expected a declaration line followed by data'
                         % file)
    print(lines[0])
    cover1 = int(lines[1])  # value of the run currently being extended
    print('reading done')

    elts = {}
    for line in lines:
        try:
            cover = int(line)
        except ValueError:
            # Declaration line: strip the 'chrom=', 'start=', 'step=' tags.
            fields = line.split()
            chrom = fields[1][6:]
            start = int(fields[2][6:])
            step = int(fields[3][5:])
            reg = step
            if chrom not in elts:
                elts[chrom] = []
            continue
        if cover == cover1:
            # NOTE(review): reg already starts at ``step``, so the first run
            # after a declaration is one step longer than its number of
            # lines - kept as is, confirm whether that is intended.
            reg += step
        else:
            end = start + reg
            elts[chrom].append([start, end, cover1])
            start = end
            cover1 = cover
            reg = step
    for key in list(elts):
        elts[key] = np.array(elts[key])

    if len(flagstat) > 0:
        # Normalisation stays best effort, but a failure is now reported
        # instead of being silently swallowed.
        try:
            normalization = read.read_dat(str(flagstat[0]), '\t')
            fl = file[0:6]
            print(fl)
            for row in normalization:
                if fl in row[0]:
                    elts = norm_wig(elts, row[-1])
                    break
            else:
                print('%s %s' % (fl, 'not found'))
        except Exception as err:
            print('normalization skipped: %s' % err)
    return elts
# Script fragment: draw the RNA heat map for one sample pair and save it as
# <dirin><sample1>-<sample2>-rpkm.pdf.
# NOTE(review): `sys` and `os` are used below but not imported in the visible
# part of this fragment - confirm they are imported earlier in the file.
import read, write, bedtools, analyse
import matplotlib.pyplot as plt
# Positional command-line arguments.
dirPD = str(sys.argv[1])
dirin = str(sys.argv[2])  # prefix for the input file and the output PDF
fl = str(sys.argv[3])
fl = dirin + fl  # input path = dirin + file name
mark = str(sys.argv[4])
sample1 = str(sys.argv[5])
sample2 = str(sys.argv[6])
field = int(sys.argv[7])  # passed through to analyse.heat_rna
libsfile = str(sys.argv[8])
hugo = str(sys.argv[9])  # path of the gene list
libsdata = read.read_dat(libsfile, '\t')
genelist = []
genelist = read.read_dat(hugo)
# Only plot when the input file exists and is not empty.
if os.path.isfile(fl) and os.path.getsize(fl) > 0:
    print fl, field, libsfile
    nm, libsRNA, gene, genex, libs = analyse.heat_rna(fl, genelist, field, libsfile)
    rpkmmat = nm * 1.  # multiplying by 1. yields a separate float copy
    plt.xlabel(sample1 + ' vs. ' + sample2 + ' rpkm')
    plt.savefig(dirin + sample1 + '-' + sample2 + '-rpkm.pdf', bbox_inches='tight')
    print dirin + sample1 + '-' + sample2 + '-rpkm.pdf'
    plt.close()
# Script fragment: look up the cell line of a peaks file in peaks.txt and
# open table-WT_all_vectors.txt for appending.
# NOTE(review): nesting was reconstructed from a whitespace-collapsed source,
# and the fragment stops before the output file is written or closed.
import os, sys
import commands
import read
import write
import numpy as np
tmp=sys.argv
fl=tmp[1]  # path of the peaks file
table=read.read_dat("peaks.txt",'\t')
data=read.read_dat(fl,'\t')
# Mark = last '_'-separated piece of the second argument.
mark=map(str,tmp[2].split('_'))
#mark=mark[mark.index('_')+1:]
mark=mark[-1]
vecs=['pcDNA','K4E','Y69H','D83V']
#print data[1]D83V
for row in table:
    for tmp in row:
        # Cells containing "bwa-0." are compared with column 5 of row 1.
        if "bwa-0." in tmp:
            if tmp in data[1][5]:
                #print data[1][5]
                # Cell line = last cell of the last row, up to the first '_'.
                cellline = data[-1][-1]
                cellline = cellline [:cellline.index('_')]
                print cellline
vs='WT'
out='table'+'-'+vs+'_all_vectors.txt'
f=open(out,'a')
tmp=fl[fl.index('/')+1:]  # path after the first '/'
# Replicate number: the single character 11 positions from the end.
rep=int(tmp[-len('2_peaks.xls'):-len('2_peaks.xls')+1])
# Script fragment: draw the RNA heat map for one sample pair and save it as
# <dirin><sample1>-<sample2>-rpkm.pdf.
# Positional command-line arguments.
dirPD=str(sys.argv[1])
dirin=str(sys.argv[2])  # prefix for the input file and the output PDF
fl=str(sys.argv[3])
fl=dirin+fl  # input path = dirin + file name
mark=str(sys.argv[4])
sample1=str(sys.argv[5])
sample2=str(sys.argv[6])
field=int(sys.argv[7])  # passed through to analyse.heat_rna
libsfile=str(sys.argv[8])
hugo=str(sys.argv[9])  # path of the gene list
libsdata=read.read_dat(libsfile,'\t')
genelist=[]
genelist=read.read_dat(hugo)
# Only plot when the input file exists and is not empty.
if os.path.isfile(fl) and os.path.getsize(fl) > 0:
    print fl,field,libsfile
    nm,libsRNA,gene,genex,libs=analyse.heat_rna(fl,genelist,field,libsfile)
    rpkmmat=nm*1.  # multiplying by 1. yields a separate float copy
    plt.xlabel(sample1+' vs. '+sample2+' rpkm')
    plt.savefig(dirin+sample1+'-'+sample2+'-rpkm.pdf', bbox_inches='tight')
    print dirin+sample1+'-'+sample2+'-rpkm.pdf'
    plt.close()
# try:
import os, sys import commands import read import write import numpy as np import time import subprocess table = read.read_dat("peaks.txt") data = table cells = ["D83V", "K4E", "pcDNA", "WT", "Y69H"] marks = ["H3K27ac", "H3K4me3", "H3K9me3", "V5", "input"] ran = 0 for irow in xrange(1, len(data)): row = data[irow] vector = row[-3] solution = row[-1][:-2] mymark = "" if "input" in row[-2]: mymark = "input" else: mymark = row[-2] if '1' in row[-4]: rep = '1' else: rep = '2' bam = row[5] if 'WT' not in vector and 'input' not in mymark and 'H3K4' not in mymark: for ind in xrange(-1, 4): try: ii = data[irow + ind][-3]
import sys, os, numpy as np sys.path.append('/Users/ssaberim/epigenomics/code') import read,write,bedtools,analyse ID=str(sys.argv[1]) drrna=str(sys.argv[2]) sample=str(sys.argv[3]) outfile=str(sys.argv[4]) field=int(sys.argv[5]) libsfile=str(sys.argv[6]) mark='RNA' libsdata=read.read_dat(libsfile,'\t') indmark=libsdata[0].index(mark) list1=[] for i in range(1,len(libsdata)): if libsdata[i][field] in sample and len(libsdata[i][field])>0: list1.append(libsdata[i][indmark]) hugo="/Users/ssaberim/epigenomics/resources/list.genes2.txt" hugo=read.read_dat(hugo) #print hugo[0] ID=read.read_dat(ID,'\t') for i in range(len(ID)): for j in range(len(hugo)):
def mk_cluster(dr, mark, loc, *perc):
    """Write a clustering table built from dr/*mark*.coverage files.

    dr, mark -- used to build the `ls` pattern
    loc      -- only files whose path contains loc contribute data
    perc     -- optional; perc[0] is a percentile below which values are
                zeroed (see the review note at that loop)

    Writes cluster-<mark>-<loc>[-<perc>].txt and returns (lable, header,
    table): row labels, library ids from the header line, and the value
    matrix (one row per region, one column per data file).
    NOTE(review): block nesting was reconstructed from a whitespace-collapsed
    source - confirm against the original file.
    """
    #writes the table for clustering
    header = []
    if len(perc) > 0:
        f = open('cluster-' + mark + '-' + loc + '-' + str(perc[0]) + '.txt', 'w')
    else:
        f = open('cluster-' + mark + '-' + loc + '.txt', 'w')
    direct = os.popen('ls ' + dr + '/*' + mark + '*.coverage')
    # Header line: one library id per listed file (not filtered by loc).
    for fl in direct.readlines():
        #print fl
        libst = ['A', 'HS', 'E']
        ind = 0
        run = True
        # NOTE(review): `<=` lets ind reach len(libst); the resulting
        # IndexError is swallowed by the bare except, and Ai is then unbound
        # or left over from the previous file.
        while run and ind <= len(libst):
            try:
                Ai = fl.index(libst[ind])
                run = False
            except:
                ind += 1
        Aend = Ai + fl[Ai:].index('.')
        print >> f, '\t', fl[Ai:Aend],
        header.append(fl[Ai:Aend])
    print >> f, '\n',
    direct = os.popen('ls ' + dr + '/*' + mark + '*.coverage')
    data = []
    for fl in direct.readlines():
        if loc in fl:
            tmpdata = read.read_dat(fl[:-1], '\t')
            # Empty files are reported and left out.
            if len(tmpdata) < 1:
                print len(tmpdata), fl[:-1]
            else:
                data.append(tmpdata)
    print len(data), len(data[0])
    enrich = []
    lable = []
    table = []
    for line in range(len(data[0])):
        table.append([])
        #lable.append([])
        for tmpdata in data:
            #print (tmpdata[0])
            table[-1].append(float(tmpdata[line][-2]))
        table[-1] = np.array(table[-1])
        # Row label: first four columns of the last data file, '_'-joined,
        # with '_1' appended.
        tmp2 = ''
        for i in range(4):
            tmp2 = tmp2 + str(tmpdata[line][i]) + '_'
        tmp2 = tmp2[:-1] + '_1'
        lable.append(tmp2)
    table = np.array(table)
    print len(table[0, :]), len(table[:, 0])
    if len(perc) > 0:
        # NOTE(review): the range is the number of columns but the index
        # selects rows, so only the first len(table[0, :]) rows are
        # thresholded - confirm the intended axis.
        for row in range(len(table[0, :])):
            tmp = np.percentile(table[row, :], float(perc[0]))
            print tmp,
            table[row, :][table[row, :] < tmp] = 0.
    enrich = []
    for i in range(len(table[:, 0])):
        enrich.append('')
        for j in range(len(table[0, :])):
            enrich[-1] += str(table[i, j]) + '\t'
    for line in range(len(data[0])):
        print >> f, lable[line], enrich[line]
    #print lable[:3],enrich[:3]
    f.close()
    return lable, header, table
def gene_rpkm_corr_compare(thegene, thersh, rpkmdata, rpkmfiles, libs, *enslist):
    """Find genes whose correlation with ``thegene`` differs between groups.

    For every gene, the Pearson correlation with ``thegene`` is computed
    separately over 'RT' samples and over the other samples.  A gene is kept
    when the non-RT p-value is below ``thersh`` and the RT/non-RT p-value
    ratio is above it.  Returns ``(genelist, geneimp, pvals)``: kept gene
    ids, their RT/non-RT mean ratio and the log p-value ratio (the last two
    only when the mean ratio is not exactly 1).
    """
    #usage:
    #genescompared,genesfc,pvals=\
    #analyse.gene_rpkm_corr_compare('ENSG00000108799',0.000001,rpkm,files,'rhabdoid/RNA-ALL-key.txt','resources/list.genes2.txt')
    # NOTE(review): nesting of the final two appends was reconstructed from a
    # whitespace-collapsed source - confirm against the original file.
    target_rpkm = []
    files = []
    for ind in range(len(rpkmdata)):
        for row in rpkmdata[ind]:
            if thegene in row[0]:
                target_rpkm.append(row[2])
                fname = rpkmfiles[ind]
                files.append(fname[:fname.index('.')])
                break

    # Relabel each sample from the key table.
    key_rows = read.read_dat(libs, '\t')
    for ind in range(len(files)):
        for key in key_rows:
            if files[ind] in key[0]:
                files[ind] = key[1] + '-' + key[-1]
                break

    target_rt = np.array([value for value, label in zip(target_rpkm, files)
                          if 'RT' in label])
    target_normal = np.array([value for value, label in zip(target_rpkm, files)
                              if 'RT' not in label])
    print('%s %s' % (len(target_rt), len(target_normal)))

    genelist = []
    geneimp = []
    pvals = []
    for igene in range(len(rpkmdata[0])):
        rt_values = []
        normal_values = []
        for ind in range(len(rpkmdata)):
            label = files[ind]
            if 'Blood' in label:
                continue
            if 'RT' in label:
                rt_values.append(rpkmdata[ind][igene][2])
            else:
                normal_values.append(rpkmdata[ind][igene][2])
        rt_values = np.array(rt_values)
        normal_values = np.array(normal_values)
        corr_rt = ss.pearsonr(rt_values, target_rt)
        corr_normal = ss.pearsonr(normal_values, target_normal)
        mn = np.mean(rt_values) / np.mean(normal_values)
        if not (corr_normal[1] < thersh
                and (corr_rt[1] / corr_normal[1]) > thersh):
            continue
        genelist.append(rpkmdata[0][igene][0])
        if mn < 1. or mn > 1.:
            #print rpkmdata[0][igene][0], 'correlation for normal=%.3f, pvalue normal=%.5f, pvalue RT=%.5f, foldchange=%.3f'%(b[0],b[1],a[1],mn)
            geneimp.append(mn)
            pvals.append(np.log(corr_rt[1] / corr_normal[1]))
    return genelist, geneimp, pvals
import os, sys import commands import read import numpy as np import time import subprocess import write table=read.read_dat("peaks.txt",'\t') data=table cells=["D83V","K4E", "pcDNA","WT","Y69H"] marks=["H3K27ac","H3K4me3","H3K9me3","V5","input"] ran=0 for irow in xrange(2,len(data)): row=data[irow] vector=row[-3] solution=row[-1][:-2] mymark="" #print row[-2] if "input" in row: mymark="input" else: mymark=row[-2] if '1' in row[-4]: rep='1' else: rep='2' bam=row[5] if 'WT' not in vector and 'input' not in mymark and mymark in marks: for ind in xrange(-1,4): try: ii=data[irow+ind][-3]
def gene_rpkm_corr_compare(thegene, thersh, rpkmdata, rpkmfiles, libs, *enslist):
    """Find genes whose correlation with ``thegene`` differs between groups.

    For every gene, the Pearson correlation with ``thegene`` is computed
    separately over 'RT' samples and over the other samples.  A gene is kept
    when the non-RT p-value is below ``thersh`` and the RT/non-RT p-value
    ratio is above it.  Returns ``(genelist, geneimp, pvals)``: kept gene
    ids, their RT/non-RT mean ratio and the log p-value ratio (the last two
    only when the mean ratio is not exactly 1).
    """
    #usage:
    #genescompared,genesfc,pvals=\
    #analyse.gene_rpkm_corr_compare('ENSG00000108799',0.000001,rpkm,files,'rhabdoid/RNA-ALL-key.txt','resources/list.genes2.txt')
    # NOTE(review): nesting of the final two appends was reconstructed from a
    # whitespace-collapsed source - confirm against the original file.
    target_rpkm = []
    files = []
    for ind in range(len(rpkmdata)):
        for row in rpkmdata[ind]:
            if thegene in row[0]:
                target_rpkm.append(row[2])
                fname = rpkmfiles[ind]
                files.append(fname[:fname.index('.')])
                break

    # Relabel each sample from the key table.
    key_rows = read.read_dat(libs, '\t')
    for ind in range(len(files)):
        for key in key_rows:
            if files[ind] in key[0]:
                files[ind] = key[1] + '-' + key[-1]
                break

    target_rt = np.array([value for value, label in zip(target_rpkm, files)
                          if 'RT' in label])
    target_normal = np.array([value for value, label in zip(target_rpkm, files)
                              if 'RT' not in label])
    print('%s %s' % (len(target_rt), len(target_normal)))

    genelist = []
    geneimp = []
    pvals = []
    for igene in range(len(rpkmdata[0])):
        rt_values = []
        normal_values = []
        for ind in range(len(rpkmdata)):
            label = files[ind]
            if 'Blood' in label:
                continue
            if 'RT' in label:
                rt_values.append(rpkmdata[ind][igene][2])
            else:
                normal_values.append(rpkmdata[ind][igene][2])
        rt_values = np.array(rt_values)
        normal_values = np.array(normal_values)
        corr_rt = ss.pearsonr(rt_values, target_rt)
        corr_normal = ss.pearsonr(normal_values, target_normal)
        mn = np.mean(rt_values) / np.mean(normal_values)
        if not (corr_normal[1] < thersh
                and (corr_rt[1] / corr_normal[1]) > thersh):
            continue
        genelist.append(rpkmdata[0][igene][0])
        if mn < 1. or mn > 1.:
            #print rpkmdata[0][igene][0], 'correlation for normal=%.3f, pvalue normal=%.5f, pvalue RT=%.5f, foldchange=%.3f'%(b[0],b[1],a[1],mn)
            geneimp.append(mn)
            pvals.append(np.log(corr_rt[1] / corr_normal[1]))
    return genelist, geneimp, pvals
def gene_rpkm_lineage(hugo, rpkmdata, rpkmfiles, allgenes, libs, *plotbool):
    """Plot log10 RPKM of selected genes across samples and test RT vs normal.

    Parameters:
        hugo:      gene symbols; translated with ``ens_genes(hugo, allgenes)``.
        rpkmdata:  one table per sample; ``row[0]`` is the gene id and
                   ``row[2]`` the value used.
        rpkmfiles: file name per sample; the part before the first '.' is
                   looked up in the key table.
        allgenes:  gene table for ``ens_genes``; its length also scales the
                   printed p-values.
        libs:      path of a tab-separated key table; a row whose first column
                   contains the file stem relabels the sample as
                   ``row[1] + '-' + row[-1]``.
        *plotbool: optional; when ``plotbool[0]`` is true, three correlation
                   heat maps are drawn.

    Returns:
        ``(rpkmls, files, normal_rpkms, normal_files, genelist)`` - log10
        values per gene (samples ordered by descending value of the first
        gene), sample labels, the normal-sample subset and its labels, and
        the translated gene list.
    """
    # NOTE(review): nesting was reconstructed from a whitespace-collapsed
    # source - confirm against the original file.
    def heatmap(rpkmls, hugo, txt):
        """Draw the gene-by-gene correlation matrix for one sample subset."""
        rpkmls = data2arr(rpkmls)
        norm, matrix, dist = analyse.all_corr(rpkmls.T)
        plt.matshow((matrix), cmap='RdYlBu', vmax=1, vmin=-1)
        plt.yticks(range(len(hugo)), hugo)
        plt.xticks(range(len(hugo)), hugo)
        plt.xticks(rotation=90)
        plt.xlabel('Correlation-' + txt)
        plt.colorbar()

    genelist = ens_genes(list(hugo), allgenes)
    rpkmls = []
    files = []
    for gene in genelist:
        rpkmls.append([])
        for ind in range(len(rpkmdata)):
            for row in rpkmdata[ind]:
                if gene in row[0]:
                    rpkmls[-1].append(row[2])
                    # Sample names are collected once, for the first gene.
                    if len(rpkmls) == 1:
                        files.append(
                            rpkmfiles[ind][:rpkmfiles[ind].index('.')])
                    break
    # dtype=object is required: a fixed-width string array would silently
    # truncate the longer labels assigned below, which breaks the
    # 'Cancer' / 'ES' / 'RT' substring tests.
    files = np.array(files, dtype=object)
    rpkmls = data2arr(rpkmls)
    # Order samples by descending value of the first gene.
    sortedinds = rpkmls[0].argsort()
    files = files[sortedinds][::-1]
    for i in range(len(genelist)):
        rpkmls[i] = rpkmls[i][sortedinds][::-1]

    key_rows = read.read_dat(libs, '\t')
    for ind in range(len(files)):
        for key in key_rows:
            if files[ind] in key[0]:
                files[ind] = key[1] + '-' + key[-1]
                break
    for i in range(len(genelist)):
        rpkmls[i] = np.log(rpkmls[i] + 0.001) / np.log(10)

    normalfiles = []
    RTfiles = []
    normalrpkms = [[] for _ in range(len(genelist))]
    RTrpkms = [[] for _ in range(len(genelist))]
    for ind in range(len(files)):
        if 'Cancer' not in files[ind] and 'ES' not in files[ind]:
            print(files[ind])
            normalfiles.append(files[ind])
            for i in range(len(genelist)):
                normalrpkms[i].append(rpkmls[i][ind])
        elif 'RT' in files[ind]:
            RTfiles.append(files[ind])
            for i in range(len(genelist)):
                RTrpkms[i].append(rpkmls[i][ind])

    plt.figure()
    for i in range(len(genelist)):
        # Marker changes after the seventh gene.
        marker = 's-' if i < 7 else '^-'
        plt.plot(range(len(files)), rpkmls[i], marker, label=hugo[i])
    plt.legend(loc=1)
    plt.xticks(range(len(files)), files)
    plt.xticks(rotation=90)
    plt.xticks(fontsize=8)
    plt.ylabel('$log_{10}$ $RPKM$')
    plt.grid()
    plt.show()

    if len(plotbool) > 0 and plotbool[0]:
        heatmap(rpkmls, hugo, 'All Samples')
        heatmap(normalrpkms, hugo, 'Normal Samples')
        heatmap(RTrpkms, hugo, 'RT Samples')

    # p-values are multiplied by the size of the gene table.
    n_genes = float(len(allgenes))  # float replaces the removed np.float
    for i in range(len(hugo)):
        print('%s %s %s' % (
            hugo[i], ss.ks_2samp(normalrpkms[i], RTrpkms[i])[1] * n_genes,
            'KS'))
        print('%s %s %s' % (
            hugo[i], ss.ttest_ind(normalrpkms[i], RTrpkms[i])[1] * n_genes,
            'T-test'))
    return rpkmls, files, data2arr(normalrpkms), np.array(
        normalfiles), genelist
# Script: append one summary line for a peaks file to table-input.txt.
# Usage (from sys.argv): <peaks file path> <name containing '-'>
import os, sys
import commands
import read
import write
import numpy as np
tmp=sys.argv
fl=tmp[1]  # path of the peaks file; must contain a '/'
#vs=str(tmp[3])
# Mark = second '-'-separated piece of the second argument.
mark=map(str,tmp[2].split('-'))
mark=mark[1]
vs='input'
data=read.read_dat(fl)
out='table'+'-'+vs+'.txt'
f=open(out,'a')
# Solution and replicate come from the '_'-separated path after the first '/'.
tmp=fl[fl.index('/')+1:]
solution=tmp.split('_')[1]
rep=tmp.split('_')[2]
# The last cell of the last row is split on '_' into vector, rank, coverage.
tmp = str(data[-1][-1])
tmp=tmp.split('_')
rank=tmp[2]
coverage=tmp[-1]
vec=tmp[0]
# NOTE(review): the bare except hides any write error and only prints vec.
try:
    print >> f, rep,mark,vec,solution,vs,coverage
except:
    print vec
f.close()
def heat_rna(fl, genelist, field, libfile):
    """Build (and optionally plot) a gene-by-library expression heat map.

    Parameters:
        fl:       path of a table with rows ``[library, group, gene, value]``;
                  the last row is ignored.
        genelist: gene table for ``ens_genes``; when empty, nothing is drawn.
        field:    column of ``libfile`` used in the translated library label.
        libfile:  path of a tab-separated key table; a row containing the
                  library id relabels it as ``row[0] + '-' + row[field]``.

    Returns:
        ``(nm, lib2, gene, genex, lib)`` - the ``norm_max``-normalised matrix
        (all-zero genes removed, rows ordered by their projection on the
        group index), translated library labels, gene ids, gene labels used
        on the y axis, and the raw ``library-group`` labels.
    """
    rows = []
    for row in read.read_dat(fl)[:-1]:
        if row == ['']:
            # Blank line.  The old code called list.delete, which does not
            # exist, and crashed here.
            continue
        if len(row) < 4:
            # Malformed row: report once and skip.  It used to be printed and
            # then crash the matrix fill below.
            print(row)
            continue
        rows.append(row)

    libstable = read.read_dat(libfile, '\t')
    # Dict indexes replace repeated list.index scans; order is first-seen.
    gene = []
    gene_row = {}
    lib = []
    lib2 = []
    lib_col = {}
    for row in rows:
        if row[2] not in gene_row:
            gene_row[row[2]] = len(gene)
            gene.append(row[2])
        key = row[0] + '-' + row[1]
        if key in lib_col:
            continue
        lib_col[key] = len(lib)
        lib.append(key)
        label = key
        if len(libfile) > 0:
            try:
                for ID in libstable:
                    if row[0] in ID:
                        label = ID[0] + '-' + ID[field]
                        break
            except IndexError:
                # Key row too short for ``field``: keep the raw label.
                print(row)
        lib2.append(label)

    # Group score: starts at -0.5 and rises by 1 whenever the group part of
    # consecutive library labels changes.
    pheno = []
    pnum = -1.5
    hist = ''
    for key in lib:
        group = key.split('-')[1]
        if group != hist:
            pnum += 1
        pheno.append(pnum)
        hist = group

    # float replaces np.float, an alias removed in NumPy 1.24.
    mat = np.zeros((len(gene), len(lib)), float)
    for row in rows:
        mat[gene_row[row[2]], lib_col[row[0] + '-' + row[1]]] = float(row[3])

    # Project each gene on the group score, scaled by the gene's norm.
    proj = np.dot(mat, pheno)
    for i in range(len(proj)):
        length = np.sqrt(np.dot(mat[i, :], mat[i, :]))
        if length != 0.:
            proj[i] /= length
    order = proj.argsort()
    mat = mat[order]
    gene = np.array(gene)[order]
    nonzero = np.any(mat != 0, axis=1)
    mat = mat[nonzero]
    gene = gene[nonzero]

    nm = norm_max(mat)
    if len(genelist) == 0:
        return nm, lib2, gene, gene, lib
    genex = ens_genes(gene, genelist)

    plt.matshow(nm, aspect='auto', cmap='RdYlBu')
    if len(libfile) > 0:
        plt.xticks(range(len(lib2)), lib2)
    else:
        plt.xticks(range(len(lib)), lib)
    plt.colorbar()
    plt.xticks(rotation=90)
    plt.yticks(range(len(genex)), genex)
    plt.yticks(fontsize=8)
    mytemplate(nm)
    # The label is the input file name minus its last five characters.  It
    # used to be computed after plt.xlabel had already been given the gene
    # list.
    plt.xlabel(fl.split('/')[-1][:-5])
    return nm, lib2, gene, genex, lib
def heat_rna(fl, genelist, field, libfile):
    """Build (and optionally plot) a gene-by-library expression heat map.

    Parameters:
        fl:       path of a table with rows ``[library, group, gene, value]``;
                  the last row is ignored.
        genelist: gene table for ``ens_genes``; when empty, nothing is drawn.
        field:    column of ``libfile`` used in the translated library label.
        libfile:  path of a tab-separated key table; a row containing the
                  library id relabels it as ``row[0] + '-' + row[field]``.

    Returns:
        ``(nm, lib2, gene, genex, lib)`` - the ``norm_max``-normalised matrix
        (all-zero genes removed, rows ordered by their projection on the
        group index), translated library labels, gene ids, gene labels used
        on the y axis, and the raw ``library-group`` labels.
    """
    rows = []
    for row in read.read_dat(fl)[:-1]:
        if row == ['']:
            # Blank line.  The old code called list.delete, which does not
            # exist, and crashed here.
            continue
        if len(row) < 4:
            # Malformed row: report once and skip.  It used to be printed and
            # then crash the matrix fill below.
            print(row)
            continue
        rows.append(row)

    libstable = read.read_dat(libfile, '\t')
    # Dict indexes replace repeated list.index scans; order is first-seen.
    gene = []
    gene_row = {}
    lib = []
    lib2 = []
    lib_col = {}
    for row in rows:
        if row[2] not in gene_row:
            gene_row[row[2]] = len(gene)
            gene.append(row[2])
        key = row[0] + '-' + row[1]
        if key in lib_col:
            continue
        lib_col[key] = len(lib)
        lib.append(key)
        label = key
        if len(libfile) > 0:
            try:
                for ID in libstable:
                    if row[0] in ID:
                        label = ID[0] + '-' + ID[field]
                        break
            except IndexError:
                # Key row too short for ``field``: keep the raw label.
                print(row)
        lib2.append(label)

    # Group score: starts at -0.5 and rises by 1 whenever the group part of
    # consecutive library labels changes.
    pheno = []
    pnum = -1.5
    hist = ''
    for key in lib:
        group = key.split('-')[1]
        if group != hist:
            pnum += 1
        pheno.append(pnum)
        hist = group

    # float replaces np.float, an alias removed in NumPy 1.24.
    mat = np.zeros((len(gene), len(lib)), float)
    for row in rows:
        mat[gene_row[row[2]], lib_col[row[0] + '-' + row[1]]] = float(row[3])

    # Project each gene on the group score, scaled by the gene's norm.
    proj = np.dot(mat, pheno)
    for i in range(len(proj)):
        length = np.sqrt(np.dot(mat[i, :], mat[i, :]))
        if length != 0.:
            proj[i] /= length
    order = proj.argsort()
    mat = mat[order]
    gene = np.array(gene)[order]
    nonzero = np.any(mat != 0, axis=1)
    mat = mat[nonzero]
    gene = gene[nonzero]

    nm = norm_max(mat)
    if len(genelist) == 0:
        return nm, lib2, gene, gene, lib
    genex = ens_genes(gene, genelist)

    plt.matshow(nm, aspect='auto', cmap='RdYlBu')
    if len(libfile) > 0:
        plt.xticks(range(len(lib2)), lib2)
    else:
        plt.xticks(range(len(lib)), lib)
    plt.colorbar()
    plt.xticks(rotation=90)
    plt.yticks(range(len(genex)), genex)
    plt.yticks(fontsize=8)
    mytemplate(nm)
    # The label is the input file name minus its last five characters.  It
    # used to be computed after plt.xlabel had already been given the gene
    # list.
    plt.xlabel(fl.split('/')[-1][:-5])
    return nm, lib2, gene, genex, lib
def gene_rpkm_lineage(hugo, rpkmdata, rpkmfiles, allgenes, libs, *plotbool):
    """Plot log10 RPKM of selected genes across samples and test RT vs normal.

    Parameters:
        hugo:      gene symbols; translated with ``ens_genes(hugo, allgenes)``.
        rpkmdata:  one table per sample; ``row[0]`` is the gene id and
                   ``row[2]`` the value used.
        rpkmfiles: file name per sample; the part before the first '.' is
                   looked up in the key table.
        allgenes:  gene table for ``ens_genes``; its length also scales the
                   printed p-values.
        libs:      path of a tab-separated key table; a row whose first column
                   contains the file stem relabels the sample as
                   ``row[1] + '-' + row[-1]``.
        *plotbool: optional; when ``plotbool[0]`` is true, three correlation
                   heat maps are drawn.

    Returns:
        ``(rpkmls, files, normal_rpkms, normal_files, genelist)`` - log10
        values per gene (samples ordered by descending value of the first
        gene), sample labels, the normal-sample subset and its labels, and
        the translated gene list.
    """
    # NOTE(review): nesting was reconstructed from a whitespace-collapsed
    # source - confirm against the original file.
    def heatmap(rpkmls, hugo, txt):
        """Draw the gene-by-gene correlation matrix for one sample subset."""
        rpkmls = data2arr(rpkmls)
        norm, matrix, dist = analyse.all_corr(rpkmls.T)
        plt.matshow((matrix), cmap='RdYlBu', vmax=1, vmin=-1)
        plt.yticks(range(len(hugo)), hugo)
        plt.xticks(range(len(hugo)), hugo)
        plt.xticks(rotation=90)
        plt.xlabel('Correlation-' + txt)
        plt.colorbar()

    genelist = ens_genes(list(hugo), allgenes)
    rpkmls = []
    files = []
    for gene in genelist:
        rpkmls.append([])
        for ind in range(len(rpkmdata)):
            for row in rpkmdata[ind]:
                if gene in row[0]:
                    rpkmls[-1].append(row[2])
                    # Sample names are collected once, for the first gene.
                    if len(rpkmls) == 1:
                        files.append(
                            rpkmfiles[ind][:rpkmfiles[ind].index('.')])
                    break
    # dtype=object is required: a fixed-width string array would silently
    # truncate the longer labels assigned below, which breaks the
    # 'Cancer' / 'ES' / 'RT' substring tests.
    files = np.array(files, dtype=object)
    rpkmls = data2arr(rpkmls)
    # Order samples by descending value of the first gene.
    sortedinds = rpkmls[0].argsort()
    files = files[sortedinds][::-1]
    for i in range(len(genelist)):
        rpkmls[i] = rpkmls[i][sortedinds][::-1]

    key_rows = read.read_dat(libs, '\t')
    for ind in range(len(files)):
        for key in key_rows:
            if files[ind] in key[0]:
                files[ind] = key[1] + '-' + key[-1]
                break
    for i in range(len(genelist)):
        rpkmls[i] = np.log(rpkmls[i] + 0.001) / np.log(10)

    normalfiles = []
    RTfiles = []
    normalrpkms = [[] for _ in range(len(genelist))]
    RTrpkms = [[] for _ in range(len(genelist))]
    for ind in range(len(files)):
        if 'Cancer' not in files[ind] and 'ES' not in files[ind]:
            print(files[ind])
            normalfiles.append(files[ind])
            for i in range(len(genelist)):
                normalrpkms[i].append(rpkmls[i][ind])
        elif 'RT' in files[ind]:
            RTfiles.append(files[ind])
            for i in range(len(genelist)):
                RTrpkms[i].append(rpkmls[i][ind])

    plt.figure()
    for i in range(len(genelist)):
        # Marker changes after the seventh gene.
        marker = 's-' if i < 7 else '^-'
        plt.plot(range(len(files)), rpkmls[i], marker, label=hugo[i])
    plt.legend(loc=1)
    plt.xticks(range(len(files)), files)
    plt.xticks(rotation=90)
    plt.xticks(fontsize=8)
    plt.ylabel('$log_{10}$ $RPKM$')
    plt.grid()
    plt.show()

    if len(plotbool) > 0 and plotbool[0]:
        heatmap(rpkmls, hugo, 'All Samples')
        heatmap(normalrpkms, hugo, 'Normal Samples')
        heatmap(RTrpkms, hugo, 'RT Samples')

    # p-values are multiplied by the size of the gene table.
    n_genes = float(len(allgenes))  # float replaces the removed np.float
    for i in range(len(hugo)):
        print('%s %s %s' % (
            hugo[i], ss.ks_2samp(normalrpkms[i], RTrpkms[i])[1] * n_genes,
            'KS'))
        print('%s %s %s' % (
            hugo[i], ss.ttest_ind(normalrpkms[i], RTrpkms[i])[1] * n_genes,
            'T-test'))
    return rpkmls, files, data2arr(normalrpkms), np.array(
        normalfiles), genelist
# Script fragment: load the coverage tables for two sample groups of one mark.
import sys,read, write,bedtools, analyse
# Positional command-line arguments.
mark=sys.argv[1]  # must be a column name in the header row of libsfile
sample1=sys.argv[2]
sample2=sys.argv[3]
num=int(sys.argv[4])
T=float(sys.argv[5])
dirout=sys.argv[6]
field=int(sys.argv[7])  # libsfile column matched against the sample names
libsfile=str(sys.argv[8])
TSSdir=str(sys.argv[9])  # directory passed to read.readall_bed
beds={}
libsdata=read.read_dat(libsfile,'\t')
indmark=libsdata[0].index(mark)
list1=[]
list2=[]
# Collect the library ids of each sample; ids of length <= 1 are skipped.
for i in xrange(1,len(libsdata)):
    if sample1 in libsdata[i][field] and len(libsdata[i][indmark])>1:
        list1.append(libsdata[i][indmark])
    elif sample2 in libsdata[i][field] and len(libsdata[i][indmark])>1:
        list2.append(libsdata[i][indmark])
beds={}
files1,beds[sample1]=read.readall_bed(TSSdir,mark,list1)
files2,beds[sample2]=read.readall_bed(TSSdir,mark,list2)
# Gene positions always come from this one hard-coded coverage file.
genes=read.read_gene_pos('rhabdoid/coverage/TSS_2000_all/hg19v69_genes.TSS_2000.pc.A03480.H3K27me3.GE02.coverage')
#!/usr/local/bin/env python #coding: utf8 import sys, read, write, bedtools, analyse mark = sys.argv[1] sample1 = sys.argv[2] sample2 = sys.argv[3] num = int(sys.argv[4]) T = float(sys.argv[5]) dirout = sys.argv[6] field = int(sys.argv[7]) libsfile = str(sys.argv[8]) TSSdir = str(sys.argv[9]) beds = {} libsdata = read.read_dat(libsfile, '\t') indmark = libsdata[0].index(mark) list1 = [] list2 = [] for i in xrange(1, len(libsdata)): if sample1 in libsdata[i][field] and len(libsdata[i][indmark]) > 1: list1.append(libsdata[i][indmark]) elif sample2 in libsdata[i][field] and len(libsdata[i][indmark]) > 1: list2.append(libsdata[i][indmark]) beds = {} files1, beds[sample1] = read.readall_bed(TSSdir, mark, list1) files2, beds[sample2] = read.readall_bed(TSSdir, mark, list2) genes = read.read_gene_pos( 'rhabdoid/coverage/TSS_2000_all/hg19v69_genes.TSS_2000.pc.A03480.H3K27me3.GE02.coverage'