Exemple #1
0
def read_dir(dir,mark,loc,ext,*spl):
    """Load every ``*mark*ext`` file in ``dir`` whose path contains ``loc``.

    Args:
        dir: Directory to search (the name shadows the builtin but is kept so
            keyword callers keep working).
        mark: Substring that must appear in the file name.
        loc: Substring that must appear in the path for the file to be read.
        ext: File-name suffix appended to the search pattern.
        *spl: Optional; when given, ``str(spl[0])`` is forwarded to
            ``read.read_dat`` as its second argument.

    Returns:
        ``(files, data)``: two parallel lists holding, per accepted file, a
        label cut from its path and the value returned by ``read.read_dat``.
    """
    import glob

    # The original shelled out to ``ls`` inside a try/except; os.popen does
    # not raise when ``ls`` fails, so the fallback pattern (``dir/mark*ext``,
    # a subset of this one anyway) could never run.  sorted() keeps the
    # alphabetical order ``ls`` produced.
    paths = sorted(glob.glob(dir + '/*' + mark + '*' + ext))

    data = []
    files = []
    for path in paths:
        if loc not in path:
            continue
        if len(spl) > 0:
            data.append(read.read_dat(path, str(spl[0])))
        else:
            data.append(read.read_dat(path))

        # The label starts at the first library prefix found in the path and
        # stops before the next '.'.  Previously a path without prefix or dot
        # left the indices unbound (NameError) or silently reused the ones
        # computed for the previous file; now the whole name is used instead.
        start, end = 0, len(path)
        for prefix in ('A', 'HS', 'E'):
            pos = path.find(prefix)
            if pos >= 0:
                start = pos
                dot = path.find('.', pos)
                if dot >= 0:
                    end = dot
                break
        # NOTE(review): read.tabless used to receive the line with its
        # trailing newline; it now gets the bare path -- confirm it does not
        # depend on that newline.
        cleaned = read.tabless(path)
        files.append(cleaned[start:end])

    return files, data
Exemple #2
0
def read_dir(dir,mark,loc,ext,*spl):
    """Load every ``*mark*ext`` file in ``dir`` whose path contains ``loc``.

    Args:
        dir: Directory to search (the name shadows the builtin but is kept so
            keyword callers keep working).
        mark: Substring that must appear in the file name.
        loc: Substring that must appear in the path for the file to be read.
        ext: File-name suffix appended to the search pattern.
        *spl: Optional; when given, ``str(spl[0])`` is forwarded to
            ``read.read_dat`` as its second argument.

    Returns:
        ``(files, data)``: two parallel lists holding, per accepted file, a
        label cut from its path and the value returned by ``read.read_dat``.
    """
    import glob

    # The original shelled out to ``ls`` inside a try/except; os.popen does
    # not raise when ``ls`` fails, so the fallback pattern (``dir/mark*ext``,
    # a subset of this one anyway) could never run.  sorted() keeps the
    # alphabetical order ``ls`` produced.
    paths = sorted(glob.glob(dir + '/*' + mark + '*' + ext))

    data = []
    files = []
    for path in paths:
        if loc not in path:
            continue
        if len(spl) > 0:
            data.append(read.read_dat(path, str(spl[0])))
        else:
            data.append(read.read_dat(path))

        # The label starts at the first library prefix found in the path and
        # stops before the next '.'.  Previously a path without prefix or dot
        # left the indices unbound (NameError) or silently reused the ones
        # computed for the previous file; now the whole name is used instead.
        start, end = 0, len(path)
        for prefix in ('A', 'HS', 'E'):
            pos = path.find(prefix)
            if pos >= 0:
                start = pos
                dot = path.find('.', pos)
                if dot >= 0:
                    end = dot
                break
        # NOTE(review): read.tabless used to receive the line with its
        # trailing newline; it now gets the bare path -- confirm it does not
        # depend on that newline.
        cleaned = read.tabless(path)
        files.append(cleaned[start:end])

    return files, data
Exemple #3
0
def readall_bedintersect(dir,mark,*flagstat):
    """Read all ``*mark*.bed`` intersect files in ``dir``, optionally normalised.

    Args:
        dir: Directory to search.
        mark: Substring that must appear in the file name.
        *flagstat: Optional; ``str(flagstat[0])`` is the path of a
            tab-separated table read with ``read.read_dat``.  For each file
            the first table row whose first column contains the file label
            supplies (as its last column) the factor passed to ``norm_bed``.

    Returns:
        ``(files, data)``: parallel lists of file labels (base name up to its
        first '.') and the values returned by ``read.read_bed_intersect``.
    """
    import glob

    data = []
    files = []
    # glob replaces the unclosed ``ls`` shell pipe; sorted() keeps ls order.
    for path in sorted(glob.glob(dir + '/*' + mark + '*.bed')):
        data.append(read.read_bed_intersect(path))
        # Cut the label from the base name only: the original looked for the
        # first '.' in the whole path, which breaks when a directory name
        # contains a dot.
        name = os.path.basename(path).split('.')[0]
        print(name)
        files.append(name)

    if len(flagstat) > 0:
        normalization = read.read_dat(str(flagstat[0]), '\t')
        for idx, name in enumerate(files):
            for row in normalization:
                if name in row[0]:
                    data[idx] = norm_bed(data[idx], row[-1])
                    break
            else:
                # No row of the normalisation table mentions this file.
                print('%s not found' % name)

    return files, data
Exemple #4
0
def readall_bedintersect(dir,mark,*flagstat):
    """Read all ``*mark*.bed`` intersect files in ``dir``, optionally normalised.

    Args:
        dir: Directory to search.
        mark: Substring that must appear in the file name.
        *flagstat: Optional; ``str(flagstat[0])`` is the path of a
            tab-separated table read with ``read.read_dat``.  For each file
            the first table row whose first column contains the file label
            supplies (as its last column) the factor passed to ``norm_bed``.

    Returns:
        ``(files, data)``: parallel lists of file labels (base name up to its
        first '.') and the values returned by ``read.read_bed_intersect``.
    """
    import glob

    data = []
    files = []
    # glob replaces the unclosed ``ls`` shell pipe; sorted() keeps ls order.
    for path in sorted(glob.glob(dir + '/*' + mark + '*.bed')):
        data.append(read.read_bed_intersect(path))
        # Cut the label from the base name only: the original looked for the
        # first '.' in the whole path, which breaks when a directory name
        # contains a dot.
        name = os.path.basename(path).split('.')[0]
        print(name)
        files.append(name)

    if len(flagstat) > 0:
        normalization = read.read_dat(str(flagstat[0]), '\t')
        for idx, name in enumerate(files):
            for row in normalization:
                if name in row[0]:
                    data[idx] = norm_bed(data[idx], row[-1])
                    break
            else:
                # No row of the normalisation table mentions this file.
                print('%s not found' % name)

    return files, data
Exemple #5
0
def CPG_RPKM(RPKM, CPG, lim):
    """Average expression per library over all genes and over CpG-listed genes.

    Args:
        RPKM: Table as a list of rows.  Row 0 is ``[<corner>, gene, gene, ...]``;
            every following row is ``[library, value, value, ...]``.
        CPG: Path of a tab-separated file (read with ``read.read_dat``) whose
            first column lists the gene names of interest.
        lim: Only values strictly greater than ``lim`` enter the averages.

    Returns:
        ``(allave, cgiave, libs)``: two NumPy arrays with, per library, the
        mean over all retained genes and the mean over retained genes that are
        in the CPG list, plus the list of library names.  A library with no
        retained gene yields ``nan`` (NumPy's mean of an empty sequence).
    """
    # A set makes the membership test below O(1); the original scanned a list
    # once per gene and per library.
    cgi_genes = set(row[0] for row in read.read_dat(CPG, '\t'))

    genes = np.array(RPKM[0][1:])
    libs = [row[0] for row in RPKM[1:]]
    mat = analyse.data2arr([row[1:] for row in RPKM[1:]])

    allave = []
    cgiave = []
    print(len(mat))
    for i in range(len(mat)):
        expressed = mat[i, :] > lim
        gntmp = genes[expressed]
        tmp = mat[i, :][expressed]
        allave.append(np.mean(tmp))
        cgirpkm = [value for gene, value in zip(gntmp, tmp)
                   if gene in cgi_genes]
        cgiave.append(np.mean(cgirpkm))
        print('%d %d' % (len(gntmp), len(cgirpkm)))

    return np.array(allave), np.array(cgiave), libs
Exemple #6
0
def CPG_RPKM(RPKM, CPG, lim):
    """Average expression per library over all genes and over CpG-listed genes.

    Args:
        RPKM: Table as a list of rows.  Row 0 is ``[<corner>, gene, gene, ...]``;
            every following row is ``[library, value, value, ...]``.
        CPG: Path of a tab-separated file (read with ``read.read_dat``) whose
            first column lists the gene names of interest.
        lim: Only values strictly greater than ``lim`` enter the averages.

    Returns:
        ``(allave, cgiave, libs)``: two NumPy arrays with, per library, the
        mean over all retained genes and the mean over retained genes that are
        in the CPG list, plus the list of library names.  A library with no
        retained gene yields ``nan`` (NumPy's mean of an empty sequence).
    """
    # A set makes the membership test below O(1); the original scanned a list
    # once per gene and per library.
    cgi_genes = set(row[0] for row in read.read_dat(CPG, '\t'))

    genes = np.array(RPKM[0][1:])
    libs = [row[0] for row in RPKM[1:]]
    mat = analyse.data2arr([row[1:] for row in RPKM[1:]])

    allave = []
    cgiave = []
    print(len(mat))
    for i in range(len(mat)):
        expressed = mat[i, :] > lim
        gntmp = genes[expressed]
        tmp = mat[i, :][expressed]
        allave.append(np.mean(tmp))
        cgirpkm = [value for gene, value in zip(gntmp, tmp)
                   if gene in cgi_genes]
        cgiave.append(np.mean(cgirpkm))
        print('%d %d' % (len(gntmp), len(cgirpkm)))

    return np.array(allave), np.array(cgiave), libs
Exemple #7
0
def gene_rpkm_compare(thersh, rpkmdata, rpkmfiles, libs, *enslist):
    """Select genes whose RPKM differs between 'RT' and non-cancer samples.

    Args:
        thersh: Significance level; it is divided by 20000 before being
            compared with the two-sample KS p-value.
        rpkmdata: One table per sample; ``rpkmdata[s][g][0]`` is the gene name
            and ``rpkmdata[s][g][2]`` the value compared.
        rpkmfiles: File name per sample; the part before the first '.' is
            looked up in ``libs``.
        libs: Path of a tab-separated table read with ``read.read_dat``.  The
            first row whose column 0 contains a sample's name renames that
            sample to ``row[1] + '-' + row[-1]``.
        *enslist: Optional; ``enslist[0]`` is read with ``read.read_dat`` and
            passed to ``ens_genes`` together with the selected genes.  The
            result is not returned (kept only for any side effects).

    Returns:
        ``(genelist, foldchange, pval)``: selected gene names, natural log of
        median(normal)/median(RT), and ``-log(p * 20000)`` for each.
    """
    n_tests = float(20000)  # np.float was removed from NumPy; plain float is identical

    files = [rpkmfiles[ind][:rpkmfiles[ind].index('.')]
             for ind in range(len(rpkmdata))]

    libs = read.read_dat(libs, '\t')
    for ind in range(len(files)):
        for lib in libs:
            if files[ind] in lib[0]:
                files[ind] = lib[1] + '-' + lib[-1]
                break

    # The sample classes do not depend on the gene, so compute them once
    # instead of once per gene.
    rt_idx = [ind for ind, name in enumerate(files) if 'RT' in name]
    normal_idx = [ind for ind, name in enumerate(files)
                  if 'RT' not in name and 'Cancer' not in name]

    genelist = []
    pval = []
    foldchange = []
    n_genes = len(rpkmdata[0]) if len(rpkmdata) > 0 else 0
    for igene in range(n_genes):
        rpkmRT = np.array([rpkmdata[ind][igene][2] for ind in rt_idx])
        rpkmnormal = np.array([rpkmdata[ind][igene][2] for ind in normal_idx])
        a = ss.ks_2samp(rpkmRT, rpkmnormal)

        if a[1] < thersh / n_tests and np.mean(rpkmnormal) + np.mean(
                rpkmRT) > 1.:
            genelist.append(rpkmdata[0][igene][0])
            pval.append(-np.log(a[1] * n_tests))
            foldchange.append(np.log(
                np.median(rpkmnormal) / np.median(rpkmRT)))

    # Uses the class sizes directly: the original read the per-gene arrays
    # here and raised NameError when there were no genes.
    print('RT sample: %d , Normal sample: %d' % (len(rt_idx), len(normal_idx)))
    if len(enslist) > 0:
        ens_genes(genelist, read.read_dat(enslist[0]))

    return genelist, foldchange, pval
Exemple #8
0
def gene_rpkm_compare(thersh,rpkmdata,rpkmfiles,libs,*enslist):
    """Select genes whose RPKM differs between 'RT' and non-cancer samples.

    Args:
        thersh: Significance level; it is divided by 20000 before being
            compared with the two-sample KS p-value.
        rpkmdata: One table per sample; ``rpkmdata[s][g][0]`` is the gene name
            and ``rpkmdata[s][g][2]`` the value compared.
        rpkmfiles: File name per sample; the part before the first '.' is
            looked up in ``libs``.
        libs: Path of a tab-separated table read with ``read.read_dat``.  The
            first row whose column 0 contains a sample's name renames that
            sample to ``row[1] + '-' + row[-1]``.
        *enslist: Optional; ``enslist[0]`` is read with ``read.read_dat`` and
            passed to ``ens_genes`` together with the selected genes.  The
            result is not returned (kept only for any side effects).

    Returns:
        ``(genelist, foldchange, pval)``: selected gene names, natural log of
        median(normal)/median(RT), and ``-log(p * 20000)`` for each.
    """
    n_tests = float(20000)  # np.float was removed from NumPy; plain float is identical

    files = [rpkmfiles[ind][:rpkmfiles[ind].index('.')]
             for ind in range(len(rpkmdata))]

    libs = read.read_dat(libs, '\t')
    for ind in range(len(files)):
        for lib in libs:
            if files[ind] in lib[0]:
                files[ind] = lib[1] + '-' + lib[-1]
                break

    # The sample classes do not depend on the gene, so compute them once
    # instead of once per gene.
    rt_idx = [ind for ind, name in enumerate(files) if 'RT' in name]
    normal_idx = [ind for ind, name in enumerate(files)
                  if 'RT' not in name and 'Cancer' not in name]

    genelist = []
    pval = []
    foldchange = []
    n_genes = len(rpkmdata[0]) if len(rpkmdata) > 0 else 0
    for igene in range(n_genes):
        rpkmRT = np.array([rpkmdata[ind][igene][2] for ind in rt_idx])
        rpkmnormal = np.array([rpkmdata[ind][igene][2] for ind in normal_idx])
        a = ss.ks_2samp(rpkmRT, rpkmnormal)

        if a[1] < thersh / n_tests and np.mean(rpkmnormal) + np.mean(
                rpkmRT) > 1.:
            genelist.append(rpkmdata[0][igene][0])
            pval.append(-np.log(a[1] * n_tests))
            foldchange.append(np.log(
                np.median(rpkmnormal) / np.median(rpkmRT)))

    # Uses the class sizes directly: the original read the per-gene arrays
    # here and raised NameError when there were no genes.
    print('RT sample: %d , Normal sample: %d' % (len(rt_idx), len(normal_idx)))
    if len(enslist) > 0:
        ens_genes(genelist, read.read_dat(enslist[0]))

    return genelist, foldchange, pval
Exemple #9
0
def pl_PET(dr, table):
    """Plot normalised ``.dist`` profiles per histone mark and save one PDF each.

    Args:
        dr: Path prefix; files matching ``dr + '*A*.dist'`` are considered.
        table: Path of a table read with ``read.read_dat``.  Row 0 holds column
            titles; a column whose title contains a mark name lists, in rows
            1-16, the library ids belonging to that mark.

    Returns:
        List of NumPy arrays (column 1 of every plotted file, as floats), in
        plotting order.

    Side effects:
        Draws into matplotlib figures 0-5 and writes ``<mark>.pdf`` for each
        mark into the current directory.
    """
    import glob

    def pl(tmpdata):
        # Second column of every row, as floats.
        return np.array([float(row[1]) for row in tmpdata])

    lable = [
        'H3K4me1', 'H3K4me3', 'H3K9me3', 'H3K27me3', 'H3K36me3', 'H3K27ac'
    ]
    tab = read.read_dat(table)

    # One library list per mark, kept aligned with ``lable``.  The original
    # appended a list per matching column, so a mark absent from the table (or
    # present twice) shifted every later mark onto the wrong libraries.
    libs = [[] for _ in lable]
    last_row = min(17, len(tab))  # rows 1..16, fewer if the table is shorter
    for i, mark in enumerate(lable):
        for j in range(len(tab[0])):
            if mark in tab[0][j]:
                for k in range(1, last_row):
                    libs[i].append(tab[k][j])

    data = []
    # glob replaces the unclosed ``ls`` shell pipe; sorted() keeps ls order.
    for path in sorted(glob.glob(dr + '*' + 'A' + '*.dist')):
        for i, mark in enumerate(lable):
            plt.figure(i)
            plt.title(mark)
            for lib in libs[i]:
                # An empty table cell would match every path; skip it.
                if lib and lib in path:
                    tmpdata = pl(read.read_dat(path))
                    data.append(tmpdata)
                    plt.plot(range(50, 550),
                             tmpdata[50:550] / np.max(tmpdata[50:550]),
                             's-',
                             label=lib)
            plt.legend(prop={'size': 14})

    for i, mark in enumerate(lable):
        fig = plt.figure(i)
        fig.savefig(mark + '.pdf', bbox_inches='tight')

    return data
Exemple #10
0
def readall_coverage(dir,mark,libs):
    """Build a (file x gene) coverage matrix from ``*mark*.coverage`` files.

    Args:
        dir: Directory to search.
        mark: Substring that must appear in the file name.
        libs: Path of a tab-separated table read with ``read.read_dat``; a file
            whose label contains a row's last column is renamed to that row's
            first column.

    Returns:
        ``(files, mat, genes)``: file labels, a NumPy array with one row per
        file and one column per gene, and the gene names (column 3 of the first
        file, in file order).  Column 4 of every row is summed into the cell of
        the first gene entry with the same name.
    """
    import glob

    ext = '.coverage'
    data = []
    files = []
    # glob replaces ``ls`` via os.popen; the old try/except fallback could not
    # trigger (popen does not raise) and its pattern was a subset of this one.
    for path in sorted(glob.glob(dir + '/*' + mark + '*' + ext)):
        data.append(read.read_dat(path, '\t'))
        files.append(os.path.basename(path)[:-len(ext)])

    # No matching file used to raise IndexError on data[0].
    genes = [row[3] for row in data[0]] if data else []

    # First position of each gene name, i.e. what genes.index() returned, but
    # without rescanning the list for every row of every file.
    first_pos = {}
    for pos, gene in enumerate(genes):
        first_pos.setdefault(gene, pos)

    mat = []
    for rows in data:
        totals = np.zeros((len(genes)), float)
        for row in rows:
            try:
                totals[first_pos[row[3]]] += float(row[4])
            except (KeyError, IndexError, ValueError, TypeError):
                # Unknown gene, short row or non-numeric value: skipped, as
                # the original bare except did.
                continue
        mat.append(totals)
    mat = np.array(mat)
    print('ending')

    prefix = 'hg19v69_genes.TSS_2000.pc.'
    libs = read.read_dat(libs, '\t')
    for row in libs:
        for j in range(len(files)):
            if row[-1] in files[j]:
                files[j] = row[0]
            if prefix in files[j]:
                ind = files[j].index(prefix)
                # NOTE(review): files[j][ind] keeps only the single character
                # at ``ind`` (the prefix's leading 'h'); ``files[j][:ind]``
                # may have been intended.  Behaviour left unchanged.
                files[j] = files[j][ind] + files[j][ind + len(prefix):]

    return files, mat, genes
Exemple #11
0
def readall_coverage(dir,mark,libs):
    """Build a (file x gene) coverage matrix from ``*mark*.coverage`` files.

    Args:
        dir: Directory to search.
        mark: Substring that must appear in the file name.
        libs: Path of a tab-separated table read with ``read.read_dat``; a file
            whose label contains a row's last column is renamed to that row's
            first column.

    Returns:
        ``(files, mat, genes)``: file labels, a NumPy array with one row per
        file and one column per gene, and the gene names (column 3 of the first
        file, in file order).  Column 4 of every row is summed into the cell of
        the first gene entry with the same name.
    """
    import glob

    ext = '.coverage'
    data = []
    files = []
    # glob replaces ``ls`` via os.popen; the old try/except fallback could not
    # trigger (popen does not raise) and its pattern was a subset of this one.
    for path in sorted(glob.glob(dir + '/*' + mark + '*' + ext)):
        data.append(read.read_dat(path, '\t'))
        files.append(os.path.basename(path)[:-len(ext)])

    # No matching file used to raise IndexError on data[0].
    genes = [row[3] for row in data[0]] if data else []

    # First position of each gene name, i.e. what genes.index() returned, but
    # without rescanning the list for every row of every file.
    first_pos = {}
    for pos, gene in enumerate(genes):
        first_pos.setdefault(gene, pos)

    mat = []
    for rows in data:
        totals = np.zeros((len(genes)), float)
        for row in rows:
            try:
                totals[first_pos[row[3]]] += float(row[4])
            except (KeyError, IndexError, ValueError, TypeError):
                # Unknown gene, short row or non-numeric value: skipped, as
                # the original bare except did.
                continue
        mat.append(totals)
    mat = np.array(mat)
    print('ending')

    prefix = 'hg19v69_genes.TSS_2000.pc.'
    libs = read.read_dat(libs, '\t')
    for row in libs:
        for j in range(len(files)):
            if row[-1] in files[j]:
                files[j] = row[0]
            if prefix in files[j]:
                ind = files[j].index(prefix)
                # NOTE(review): files[j][ind] keeps only the single character
                # at ``ind`` (the prefix's leading 'h'); ``files[j][:ind]``
                # may have been intended.  Behaviour left unchanged.
                files[j] = files[j][ind] + files[j][ind + len(prefix):]

    return files, mat, genes
Exemple #12
0
def pl_PET(dr,table):
    """Plot normalised ``.dist`` profiles per histone mark and save one PDF each.

    Args:
        dr: Path prefix; files matching ``dr + '*A*.dist'`` are considered.
        table: Path of a table read with ``read.read_dat``.  Row 0 holds column
            titles; a column whose title contains a mark name lists, in rows
            1-16, the library ids belonging to that mark.

    Returns:
        List of NumPy arrays (column 1 of every plotted file, as floats), in
        plotting order.

    Side effects:
        Draws into matplotlib figures 0-5 and writes ``<mark>.pdf`` for each
        mark into the current directory.
    """
    import glob

    def pl(tmpdata):
        # Second column of every row, as floats.
        return np.array([float(row[1]) for row in tmpdata])

    lable = ['H3K4me1', 'H3K4me3', 'H3K9me3', 'H3K27me3', 'H3K36me3', 'H3K27ac']
    tab = read.read_dat(table)

    # One library list per mark, kept aligned with ``lable``.  The original
    # appended a list per matching column, so a mark absent from the table (or
    # present twice) shifted every later mark onto the wrong libraries.
    libs = [[] for _ in lable]
    last_row = min(17, len(tab))  # rows 1..16, fewer if the table is shorter
    for i, mark in enumerate(lable):
        for j in range(len(tab[0])):
            if mark in tab[0][j]:
                for k in range(1, last_row):
                    libs[i].append(tab[k][j])

    data = []
    # glob replaces the unclosed ``ls`` shell pipe; sorted() keeps ls order.
    for path in sorted(glob.glob(dr + '*' + 'A' + '*.dist')):
        for i, mark in enumerate(lable):
            plt.figure(i)
            plt.title(mark)
            for lib in libs[i]:
                # An empty table cell would match every path; skip it.
                if lib and lib in path:
                    tmpdata = pl(read.read_dat(path))
                    data.append(tmpdata)
                    plt.plot(range(50, 550),
                             tmpdata[50:550] / np.max(tmpdata[50:550]),
                             's-',
                             label=lib)
            plt.legend(prop={'size': 14})

    for i, mark in enumerate(lable):
        fig = plt.figure(i)
        fig.savefig(mark + '.pdf', bbox_inches='tight')

    return data
Exemple #13
0
def write_QC1(fl,ordered,targetid,ref):
	"""Annotate a QC table in place and dump it to ``table.txt``.

	The table is read with ``read.read_dat(fl)``; its first row is treated as
	the header.  Every later row whose column 2 (as a string) occurs in
	``ordered`` gets two extra fields: a mark label picked from the row's
	position in ``ordered``, and a target name looked up through ``targetid``
	and ``ref``.  Every later row then gets 'failed' or 'passed' depending on
	whether column 5 is below 20000000.

	Returns the list of rows (the same row objects, extended in place).
	"""
	
	#fread=open(fl,'r')
	lable=['H3K4me1','H3K4me3','H3K9me3','H3K27me3','H3K36me3','H3K27ac','Input DNA']
	data=read.read_dat(fl)
	# Number of consecutive entries of `ordered` per label (integer division
	# under Python 2).
	num=len(ordered)/len(lable)
	ordered=list(ordered)
	ln=data[0]
	i=0
	for tmpi in range(1,len(data)):
		tmpdata=data[tmpi]
		libid=str(tmpdata[2])
		if libid in ordered:
			ind=ordered.index(libid)
			# NOTE(review): assumes `ordered` is grouped by mark in blocks of
			# `num`; if len(ordered) is not a multiple of len(lable) the index
			# can run past the end of `lable` -- confirm with callers.
			tmpdata.append(lable[int(ind/float(num))])
			# Index of the last `targetid` row (from row 1 on) with an entry
			# containing the final four characters of the library id;
			# 0 means no row matched.
			idi=0
			for idtmp in range(1,len(targetid)):
				for libtmp in targetid[idtmp]:
					if str(libid[-4:]) in str(libtmp):
						idi=idtmp
			tmp=targetid[idi][-1] 
			# Replace the target by the last column of each `ref` row whose
			# first column contains the current value.
			for itm in ref:
				if tmp in itm[0]:
					tmp=itm[-1]
			if idi==0:
				# No match: the value came from targetid[0].
				print 'aha'
			tmpdata.append(str(tmp))
		if float(tmpdata[5])<20000000:
			tmpdata.append('failed')
		else:
			tmpdata.append('passed')
				
				
		#data[tmpi]=tmpdata
	fwrite=open('table.txt','w')
	# Header fields; the trailing comma suppresses the newline.
	for i in ln:
		print >> fwrite, str(i),
	# NOTE(review): this newline goes to stdout, not to fwrite, so the header
	# written above is not terminated in the file -- confirm whether
	# `print >> fwrite` was intended.
	print '\n'
	# All rows, data[0] (the header) included, one tab after each field.
	for tmpdata in data:
		for i in tmpdata:
			
			# Numeric-looking fields are written as floats, the rest as text.
			try:
				print >> fwrite, float(i),'\t',
			except:
				print >> fwrite, str(i),'\t',
		print >> fwrite, '\n',

	fwrite.close()
	return data
Exemple #14
0
def readall_bed(dir,mark,libs,*flagstat):
    """Read the ``*mark*.coverage`` files of selected libraries, optionally normalised.

    Args:
        dir: Directory to search.
        mark: Substring that must appear in the file name.
        libs: Library ids; a file is read when its path contains one of them.
            Ids containing '#' are ignored.
        *flagstat: Optional; ``str(flagstat[0])`` is the path of a
            tab-separated table read with ``read.read_dat``.  For each file
            the first row whose first column contains the file label supplies
            (as its last column) the factor passed to ``norm_bed``.

    Returns:
        ``(files, data)``: parallel lists of file labels and the values
        returned by ``read.read_bed``.
    """
    import glob

    data = []
    files = []
    # glob replaces the unclosed ``ls`` shell pipe; sorted() keeps ls order.
    for path in sorted(glob.glob(dir + '/*' + mark + '*.coverage')):
        for alib in libs:
            if alib not in path or '#' in alib:
                continue
            data.append(read.read_bed(path))

            # Label: from the first library prefix present in the path up to
            # the next '.'; otherwise everything from the first '/'.
            slash = path.find('/')
            label = path[slash:] if slash >= 0 else path
            for prefix in ('A', 'HS', 'E'):
                pos = path.find(prefix)
                if pos >= 0:
                    dot = path.find('.', pos)
                    if dot >= 0:
                        label = path[pos:dot]
                    break

            if mark in label:
                # Keep "<mark>" plus the following character and everything
                # up to the next '.'.  The original called .index('.') here,
                # which raised ValueError whenever the label had no dot left.
                label = label[label.index(mark):]
                dot = label.find('.', len(mark) + 1)
                if dot >= 0:
                    label = label[:dot]
            files.append(label.strip())
            # Stop at the first matching library.  The original overwrote the
            # path with the label and kept looping, so a second match tried
            # to read the label as if it were a file.
            break

    if len(flagstat) > 0:
        normalization = read.read_dat(str(flagstat[0]), '\t')
        for idx, name in enumerate(files):
            for row in normalization:
                if name in row[0]:
                    data[idx] = norm_bed(data[idx], row[-1])
                    break
            else:
                # No row of the normalisation table mentions this file.
                print('%s not found' % name)

    return files, data
Exemple #15
0
def write_QC1(fl, ordered, targetid, ref):
    """Annotate a QC table in place and dump it to ``table.txt``.

    The table is read with ``read.read_dat(fl)``; its first row is treated as
    the header.  Every later row whose column 2 (as a string) occurs in
    ``ordered`` gets two extra fields: a mark label picked from the row's
    position in ``ordered``, and a target name looked up through ``targetid``
    and ``ref``.  Every later row then gets "failed" or "passed" depending on
    whether column 5 is below 20000000.

    Returns the list of rows (the same row objects, extended in place).
    """

    # fread=open(fl,'r')
    lable = ["H3K4me1", "H3K4me3", "H3K9me3", "H3K27me3", "H3K36me3", "H3K27ac", "Input DNA"]
    data = read.read_dat(fl)
    # Number of consecutive entries of `ordered` per label (integer division
    # under Python 2).
    num = len(ordered) / len(lable)
    ordered = list(ordered)
    ln = data[0]
    i = 0
    for tmpi in range(1, len(data)):
        tmpdata = data[tmpi]
        libid = str(tmpdata[2])
        if libid in ordered:
            ind = ordered.index(libid)
            # NOTE(review): assumes `ordered` is grouped by mark in blocks of
            # `num`; if len(ordered) is not a multiple of len(lable) the index
            # can run past the end of `lable` -- confirm with callers.
            tmpdata.append(lable[int(ind / float(num))])
            # Index of the last `targetid` row (from row 1 on) with an entry
            # containing the final four characters of the library id;
            # 0 means no row matched.
            idi = 0
            for idtmp in range(1, len(targetid)):
                for libtmp in targetid[idtmp]:
                    if str(libid[-4:]) in str(libtmp):
                        idi = idtmp
            tmp = targetid[idi][-1]
            # Replace the target by the last column of each `ref` row whose
            # first column contains the current value.
            for itm in ref:
                if tmp in itm[0]:
                    tmp = itm[-1]
            if idi == 0:
                # No match: the value came from targetid[0].
                print "aha"
            tmpdata.append(str(tmp))
        if float(tmpdata[5]) < 20000000:
            tmpdata.append("failed")
        else:
            tmpdata.append("passed")

            # data[tmpi]=tmpdata
    fwrite = open("table.txt", "w")
    # Header fields; the trailing comma suppresses the newline.
    for i in ln:
        print >> fwrite, str(i),
    # NOTE(review): this newline goes to stdout, not to fwrite, so the header
    # written above is not terminated in the file -- confirm whether
    # `print >> fwrite` was intended.
    print "\n"
    # All rows, data[0] (the header) included, one tab after each field.
    for tmpdata in data:
        for i in tmpdata:

            # Numeric-looking fields are written as floats, the rest as text.
            try:
                print >> fwrite, float(i), "\t",
            except:
                print >> fwrite, str(i), "\t",
        print >> fwrite, "\n",

    fwrite.close()
    return data
Exemple #16
0
def readall_bed(dir,mark,libs,*flagstat):
    """Read the ``*mark*.coverage`` files of selected libraries, optionally normalised.

    Args:
        dir: Directory to search.
        mark: Substring that must appear in the file name.
        libs: Library ids; a file is read when its path contains one of them.
            Ids containing '#' are ignored.
        *flagstat: Optional; ``str(flagstat[0])`` is the path of a
            tab-separated table read with ``read.read_dat``.  For each file
            the first row whose first column contains the file label supplies
            (as its last column) the factor passed to ``norm_bed``.

    Returns:
        ``(files, data)``: parallel lists of file labels and the values
        returned by ``read.read_bed``.
    """
    import glob

    data = []
    files = []
    # glob replaces the unclosed ``ls`` shell pipe; sorted() keeps ls order.
    for path in sorted(glob.glob(dir + '/*' + mark + '*.coverage')):
        for alib in libs:
            if alib not in path or '#' in alib:
                continue
            data.append(read.read_bed(path))

            # Label: from the first library prefix present in the path up to
            # the next '.'; otherwise everything from the first '/'.
            slash = path.find('/')
            label = path[slash:] if slash >= 0 else path
            for prefix in ('A', 'HS', 'E'):
                pos = path.find(prefix)
                if pos >= 0:
                    dot = path.find('.', pos)
                    if dot >= 0:
                        label = path[pos:dot]
                    break

            if mark in label:
                # Keep "<mark>" plus the following character and everything
                # up to the next '.'.  The original called .index('.') here,
                # which raised ValueError whenever the label had no dot left.
                label = label[label.index(mark):]
                dot = label.find('.', len(mark) + 1)
                if dot >= 0:
                    label = label[:dot]
            files.append(label.strip())
            # Stop at the first matching library.  The original overwrote the
            # path with the label and kept looping, so a second match tried
            # to read the label as if it were a file.
            break

    if len(flagstat) > 0:
        normalization = read.read_dat(str(flagstat[0]), '\t')
        for idx, name in enumerate(files):
            for row in normalization:
                if name in row[0]:
                    data[idx] = norm_bed(data[idx], row[-1])
                    break
            else:
                # No row of the normalisation table mentions this file.
                print('%s not found' % name)

    return files, data
Exemple #17
0
def read_dist(dr):
    """Read every ``*.dist`` file directly inside directory ``dr``.

    Args:
        dr: Directory path, without a trailing '/'.

    Returns:
        ``(lib, dist)``: parallel lists with, per file, the first six
        characters of its base name and a NumPy array holding the
        second-to-last column of the table returned by ``read.read_dat``.
    """
    import glob

    lib = []
    dist = []
    # glob replaces ``ls`` through an os.popen pipe that was never closed;
    # sorted() keeps the alphabetical order ls produced.
    for path in sorted(glob.glob(dr + '/*.dist')):
        rows = read.read_dat(path)
        dist.append(np.array([float(row[-2]) for row in rows]))
        # Six characters following "<dr>/".
        lib.append(path[1 + len(dr):len(dr) + 7])

    return lib, dist
Exemple #18
0
def read_dist(dr):
    """Read every ``*.dist`` file directly inside directory ``dr``.

    Args:
        dr: Directory path, without a trailing '/'.

    Returns:
        ``(lib, dist)``: parallel lists with, per file, the first six
        characters of its base name and a NumPy array holding the
        second-to-last column of the table returned by ``read.read_dat``.
    """
    import glob

    lib = []
    dist = []
    # glob replaces ``ls`` through an os.popen pipe that was never closed;
    # sorted() keeps the alphabetical order ls produced.
    for path in sorted(glob.glob(dr + '/*.dist')):
        rows = read.read_dat(path)
        dist.append(np.array([float(row[-2]) for row in rows]))
        # Six characters following "<dr>/".
        lib.append(path[1 + len(dr):len(dr) + 7])

    return lib, dist
Exemple #19
0
def read_alldir(dir,ext,*include):
    """Read all tab-separated files in ``dir`` whose name ends with ``ext``.

    Example::

        h3k27files, allh3k27 = read.read_alldir(
            'rhabdoid/coverage/TSS_2000_all', 'coverage', 'H3K27me3')

    Args:
        dir: Directory to search.
        ext: Suffix the file name must end with.
        *include: Optional; when given, ``str(include[0])`` must also occur in
            the file name.

    Returns:
        ``(files, data)``: parallel lists of base names and the values
        returned by ``read.read_dat(path, '\\t')``.
    """
    import glob

    if len(include) > 0:
        pattern = '*' + str(include[0]) + '*'
    else:
        pattern = '*'

    data = []
    files = []
    # glob replaces the unclosed ``ls`` shell pipe; sorted() keeps ls order.
    for path in sorted(glob.glob(dir + '/' + pattern + ext)):
        data.append(read.read_dat(path, '\t'))
        files.append(os.path.basename(path))

    return files, data
Exemple #20
0
def read_alldir(dir,ext,*include):
    """Read all tab-separated files in ``dir`` whose name ends with ``ext``.

    Example::

        h3k27files, allh3k27 = read.read_alldir(
            'rhabdoid/coverage/TSS_2000_all', 'coverage', 'H3K27me3')

    Args:
        dir: Directory to search.
        ext: Suffix the file name must end with.
        *include: Optional; when given, ``str(include[0])`` must also occur in
            the file name.

    Returns:
        ``(files, data)``: parallel lists of base names and the values
        returned by ``read.read_dat(path, '\\t')``.
    """
    import glob

    if len(include) > 0:
        pattern = '*' + str(include[0]) + '*'
    else:
        pattern = '*'

    data = []
    files = []
    # glob replaces the unclosed ``ls`` shell pipe; sorted() keeps ls order.
    for path in sorted(glob.glob(dir + '/' + pattern + ext)):
        data.append(read.read_dat(path, '\t'))
        files.append(os.path.basename(path))

    return files, data
Exemple #21
0
def readall_genescore(dr,mark,lib,genes):
    """Collect per-gene scores of the given libraries into a matrix.

    Args:
        dr: Path prefix; files matching ``dr + '*' + mark + '*.coverage'`` are
            considered.
        mark: Substring that must appear in the file name.
        lib: Library names.  Entries containing '#' are skipped; of the others
            only the part before the first '-' is used.
        genes: Gene names; a gene matches the first row of a file whose
            column 3 contains the name as a substring.

    Returns:
        NumPy float array of shape ``(len(genes), n_libraries)`` holding the
        second-to-last column of the matching row, or 0 when no row matches.

    Raises:
        KeyError: if a requested library has no matching coverage file.
    """
    import glob

    mylib = [name.split('-')[0] for name in lib if '#' not in name]

    # One table per library: each file goes to the first library id contained
    # in its path, and a later file for the same library replaces an earlier
    # one.
    data = {}
    for path in sorted(glob.glob(dr + '*' + mark + '*.coverage')):
        for alib in mylib:
            if alib in path:
                data[alib] = read.read_dat(path, '\t')
                break

    missing = [alib for alib in mylib if alib not in data]
    if missing:
        # Same exception type as the bare dictionary lookup raised before,
        # with a message that names the libraries.
        raise KeyError('no coverage file found for: %s' % missing)
    mydata = [data[alib] for alib in mylib]

    mat = np.zeros((len(genes), len(mylib)), float)
    for genei, gene in enumerate(genes):
        for libi, rows in enumerate(mydata):
            for line in rows:
                if gene in line[3]:
                    mat[genei, libi] = float(line[-2])
                    break
    return mat
Exemple #22
0
def readall_genescore(dr,mark,lib,genes):
    """Collect per-gene scores of the given libraries into a matrix.

    Args:
        dr: Path prefix; files matching ``dr + '*' + mark + '*.coverage'`` are
            considered.
        mark: Substring that must appear in the file name.
        lib: Library names.  Entries containing '#' are skipped; of the others
            only the part before the first '-' is used.
        genes: Gene names; a gene matches the first row of a file whose
            column 3 contains the name as a substring.

    Returns:
        NumPy float array of shape ``(len(genes), n_libraries)`` holding the
        second-to-last column of the matching row, or 0 when no row matches.

    Raises:
        KeyError: if a requested library has no matching coverage file.
    """
    import glob

    mylib = [name.split('-')[0] for name in lib if '#' not in name]

    # One table per library: each file goes to the first library id contained
    # in its path, and a later file for the same library replaces an earlier
    # one.
    data = {}
    for path in sorted(glob.glob(dr + '*' + mark + '*.coverage')):
        for alib in mylib:
            if alib in path:
                data[alib] = read.read_dat(path, '\t')
                break

    missing = [alib for alib in mylib if alib not in data]
    if missing:
        # Same exception type as the bare dictionary lookup raised before,
        # with a message that names the libraries.
        raise KeyError('no coverage file found for: %s' % missing)
    mydata = [data[alib] for alib in mylib]

    mat = np.zeros((len(genes), len(mylib)), float)
    for genei, gene in enumerate(genes):
        for libi, rows in enumerate(mydata):
            for line in rows:
                if gene in line[3]:
                    mat[genei, libi] = float(line[-2])
                    break
    return mat
Exemple #23
0
def write_bed_loc(projlist, tresh, coverfile):
    """Write the regions with a large projection to ``out.bed``.

    Args:
        projlist: One number per row of ``coverfile``.
        tresh: A row is kept when ``abs(projlist[i]) > tresh``.
        coverfile: Path of a tab-separated table read with ``read.read_dat``;
            columns 0-2 are written to the BED file, column 3 is returned.

    Returns:
        List with column 3 (as ``str``) of every kept row, in order.

    Side effects:
        Overwrites ``out.bed`` in the current directory with one
        tab-separated three-column line per kept row.
    """
    data = read.read_dat(coverfile, "\t")
    print("%d %d" % (len(data), len(data[0])))

    genes = []
    # ``with`` closes the file even if a row lookup fails part-way through.
    with open("out.bed", "w") as f:
        for i in range(len(projlist)):
            if abs(projlist[i]) > tresh:
                genes.append(str(data[i][3]))
                f.write("\t".join(str(data[i][j]) for j in range(3)) + "\n")
    print("num of peaks:  %d" % len(genes))

    return genes
Exemple #24
0
def write_bed_loc(projlist,tresh,coverfile):
    """Write the regions with a large projection to ``out.bed``.

    Args:
        projlist: One number per row of ``coverfile``.
        tresh: A row is kept when ``abs(projlist[i]) > tresh``.
        coverfile: Path of a tab-separated table read with ``read.read_dat``;
            columns 0-2 are written to the BED file, column 3 is returned.

    Returns:
        List with column 3 (as ``str``) of every kept row, in order.

    Side effects:
        Overwrites ``out.bed`` in the current directory with one
        tab-separated three-column line per kept row.
    """
    data = read.read_dat(coverfile, '\t')
    print('%d %d' % (len(data), len(data[0])))

    genes = []
    # ``with`` closes the file even if a row lookup fails part-way through.
    with open('out.bed', 'w') as f:
        for i in range(len(projlist)):
            if abs(projlist[i]) > tresh:
                genes.append(str(data[i][3]))
                f.write('\t'.join(str(data[i][j]) for j in range(3)) + '\n')
    print('num of peaks:  %d' % len(genes))

    return genes
Exemple #25
0
def mk_cluster(dr,mark,loc,*perc):
	"""Write a clustering table built from the ``*mark*.coverage`` files in ``dr``.

	The output file is ``cluster-<mark>-<loc>.txt`` (with ``-<perc[0]>``
	inserted before the extension when ``perc`` is given).  Its first line
	holds one label per matching file; each following line holds a row label
	and the second-to-last column of that row from every file whose path
	contains ``loc``.

	Returns ``(lable, header, table)``: the row labels, the file labels and
	the value table as a NumPy array (rows x files).
	"""
	#writes the table for clustering
	
	header=[]
	if len(perc)>0:
		f=open('cluster-'+mark+'-'+loc+'-'+str(perc[0])+'.txt','w')
	else:
		f=open('cluster-'+mark+'-'+loc+'.txt','w')
	# First pass: header labels, one per file matching `mark`.
	# NOTE(review): this pass does not filter on `loc`, while the data pass
	# below does, so the header can have more columns than the rows.
	direct=os.popen('ls '+dr +'/*'+mark+'*.coverage')
	for fl in direct.readlines():
		#print fl
		# The label starts at the first of these prefixes found in the line.
		libst=['A','HS','E']
		ind=0
		run=True
		# NOTE(review): `<=` lets ind reach len(libst); libst[ind] then raises
		# IndexError, which the bare except swallows.  When no prefix is
		# found, Ai below is unbound (first file) or left over from the
		# previous file.
		while run and ind<=len(libst):
			try:
				Ai=fl.index(libst[ind])
				run=False
			except:
				ind+=1
		Aend=Ai+fl[Ai:].index('.')
		print >> f, '\t',fl[Ai:Aend],
		header.append(fl[Ai:Aend])
		
	print >> f, '\n',
	# Second pass: read the tables of the files whose path contains `loc`.
	direct=os.popen('ls '+dr +'/*'+mark+'*.coverage')
	data=[]
	for fl in direct.readlines():
		if loc in fl:
			tmpdata=read.read_dat(fl[:-1],'\t')
			if  len(tmpdata)<1:
				# Empty table: report it and leave the file out.
				print len(tmpdata),fl[:-1]
			else:
				data.append(tmpdata)
	print len(data),len(data[0])
	enrich=[]
	lable=[]
	table=[]
	
	# The row count is taken from the first table; every table is indexed
	# with the same line numbers.
	for line in range(len(data[0])):

		table.append([])
		#lable.append([])
		for tmpdata in data:
			#print  (tmpdata[0])
			table[-1].append(float(tmpdata[line][-2]))
		table[-1]=np.array(table[-1])
		# Row label: first four columns joined by '_' plus a '_1' suffix,
		# taken from the last table of the loop above.
		tmp2=''
		for i in range(4):
				tmp2=tmp2+str(tmpdata[line][i])+'_'
		tmp2=tmp2[:-1]+'_1'
		lable.append(tmp2)
	table=np.array(table)
	print len(table[0,:]), len(table[:,0])
	if len(perc)>0:
		# NOTE(review): the loop counts columns (len(table[0,:])) but indexes
		# rows (table[row,:]), so only the first n_files rows are thresholded;
		# table[:,row] (per-file percentile) may have been intended -- confirm.
		for row in range(len(table[0,:])):
			tmp=np.percentile(table[row,:],float(perc[0]))
			print tmp,
			table[row,:][table[row,:]<tmp]=0.
	# One tab-terminated string of values per table row.
	enrich=[]
	for i in range(len(table[:,0])):
		enrich.append('')
		for j in range(len(table[0,:])):
			enrich[-1]+=str(table[i,j])+'\t'
	for line in range(len(data[0])):
			print >> f, lable[line],enrich[line]
	#print lable[:3],enrich[:3]
	f.close()
	
	return lable,header,table
Exemple #26
0
def read_wig(file,*flagstat):
    """Parse a fixedStep wiggle file into runs of equal coverage.

    The file must consist of declaration lines of the form
    ``fixedStep chrom=<name> start=<int> step=<int>`` (fields in that order),
    each followed by one integer coverage value per line.  Consecutive equal
    values are merged into one run.

    Fixes over the previous version: the first run is no longer one step too
    long, the final run of every block is emitted instead of dropped, runs no
    longer leak across declaration lines, and the file is opened only once.

    Args:
        file: Path of the wiggle file; opened with ``gzip`` when the path
            contains '.gz'.
        *flagstat: Optional; ``str(flagstat[0])`` is the path of a
            tab-separated table read with ``read.read_dat``.  The first row
            whose first column contains ``file[0:6]`` supplies (as its last
            column) the factor passed to ``norm_wig``.

    Returns:
        Dict mapping chromosome name to a NumPy array with one
        ``[begin, end, coverage]`` row per run.

    Raises:
        ValueError: if a coverage value appears before any declaration line,
            or a declaration line has non-integer start/step.
        IndexError: if a declaration line has fewer than four fields.
    """
    opener = gzip.open if '.gz' in file else open
    handle = opener(file)
    try:
        content = handle.read()
    finally:
        handle.close()
    if not isinstance(content, str):
        # gzip.open yields bytes on Python 3.
        content = content.decode()
    lines = content.splitlines()
    if lines:
        print(lines[0])
    print('reading done')

    elts = {}
    chrom = None
    start = 0        # begin of the run currently being accumulated
    step = 0
    run_cover = None  # coverage of the current run; None = no run open
    run_len = 0

    for raw in lines:
        text = raw.strip()
        if not text:
            continue
        try:
            cover = int(text)
        except ValueError:
            # Declaration line: close the open run, then start a new block.
            if run_cover is not None:
                elts[chrom].append([start, start + run_len, run_cover])
            fields = text.split()
            chrom = fields[1][6:]        # strip "chrom="
            start = int(fields[2][6:])   # strip "start="
            step = int(fields[3][5:])    # strip "step="
            run_cover = None
            run_len = 0
            if chrom not in elts:
                elts[chrom] = []
            continue

        if chrom is None:
            raise ValueError('coverage value before any fixedStep declaration')
        if run_cover is None:
            run_cover = cover
            run_len = step
        elif cover == run_cover:
            run_len += step
        else:
            end = start + run_len
            elts[chrom].append([start, end, run_cover])
            start = end
            run_cover = cover
            run_len = step

    if run_cover is not None:
        elts[chrom].append([start, start + run_len, run_cover])

    for key in list(elts):
        elts[key] = np.array(elts[key])

    if len(flagstat) > 0:
        # Normalisation stays best-effort as before, but a failure is now
        # reported instead of being silently discarded.
        try:
            normalization = read.read_dat(str(flagstat[0]), '\t')
            fl = file[0:6]
            print(fl)
            for row in normalization:
                if fl in row[0]:
                    elts = norm_wig(elts, row[-1])
                    break
            else:
                print('%s not found' % fl)
        except Exception as err:
            print('normalization skipped: %s' % err)
    return elts
Exemple #27
0
import read, write, bedtools, analyse
import matplotlib.pyplot as plt

# Nine positional command-line arguments are required.
# NOTE(review): `sys` and `os` are used below but are not imported in the
# lines visible here -- confirm they are imported elsewhere.
dirPD = str(sys.argv[1])
dirin = str(sys.argv[2])
fl = str(sys.argv[3])
# The input file name is taken relative to dirin (plain concatenation, so
# dirin must already end with a path separator).
fl = dirin + fl
mark = str(sys.argv[4])

sample1 = str(sys.argv[5])
sample2 = str(sys.argv[6])
field = int(sys.argv[7])
libsfile = str(sys.argv[8])
hugo = str(sys.argv[9])

libsdata = read.read_dat(libsfile, '\t')

genelist = []
genelist = read.read_dat(hugo)

# Only proceed when the input file exists and is not empty.
if os.path.isfile(fl) and os.path.getsize(fl) > 0:
    print fl, field, libsfile
    nm, libsRNA, gene, genex, libs = analyse.heat_rna(fl, genelist, field,
                                                      libsfile)
    # Multiplying by 1. gives a float result separate from nm.
    rpkmmat = nm * 1.

    # Label and save whatever figure is current at this point (presumably
    # drawn by analyse.heat_rna -- TODO confirm).
    plt.xlabel(sample1 + ' vs. ' + sample2 + ' rpkm')
    plt.savefig(dirin + sample1 + '-' + sample2 + '-rpkm.pdf',
                bbox_inches='tight')
    print dirin + sample1 + '-' + sample2 + '-rpkm.pdf'
    plt.close()
Exemple #28
0
import os, sys 
import commands
import read
import write
import numpy as np

# argv[1]: path of the input table; argv[2]: underscore-separated name whose
# last piece is kept as the mark.
tmp=sys.argv
fl=tmp[1]
table=read.read_dat("peaks.txt",'\t')
data=read.read_dat(fl,'\t')
mark=map(str,tmp[2].split('_'))
#mark=mark[mark.index('_')+1:]
mark=mark[-1]
vecs=['pcDNA','K4E','Y69H','D83V']
#print data[1]D83V
# For every cell of peaks.txt that contains "bwa-0." and also occurs in
# data[1][5], take the cell-line name from the last field of the last row of
# the input table (text before its first '_') and print it.
# Note: the inner loop variable reuses the name `tmp` (the argv alias above).
for row in table:
	for tmp in row:
		if "bwa-0." in tmp:
			if tmp in data[1][5]:
				#print data[1][5]
				
				cellline = data[-1][-1]
				cellline = cellline [:cellline.index('_')]
				print cellline

vs='WT'

# Output table, opened in append mode.
out='table'+'-'+vs+'_all_vectors.txt'
f=open(out,'a')
# Part of the input path after its first '/'.
tmp=fl[fl.index('/')+1:]
# Single character located len('2_peaks.xls') characters from the end of the
# name, parsed as the replicate number.
rep=int(tmp[-len('2_peaks.xls'):-len('2_peaks.xls')+1])
Exemple #29
0
# Nine positional command-line arguments are required.
dirPD=str(sys.argv[1])
dirin=str(sys.argv[2])
fl=str(sys.argv[3])
# The input file name is taken relative to dirin (plain concatenation, so
# dirin must already end with a path separator).
fl=dirin+fl
mark=str(sys.argv[4])

sample1=str(sys.argv[5])
sample2=str(sys.argv[6])
field=int(sys.argv[7])
libsfile=str(sys.argv[8])
hugo=str(sys.argv[9])



libsdata=read.read_dat(libsfile,'\t')

genelist=[]
genelist=read.read_dat(hugo)

# Only proceed when the input file exists and is not empty.
if os.path.isfile(fl) and os.path.getsize(fl) > 0:
	print fl,field,libsfile
	nm,libsRNA,gene,genex,libs=analyse.heat_rna(fl,genelist,field,libsfile)
	# Multiplying by 1. gives a float result separate from nm.
	rpkmmat=nm*1.
	
	# Label and save whatever figure is current at this point (presumably
	# drawn by analyse.heat_rna -- TODO confirm).
	plt.xlabel(sample1+' vs. '+sample2+' rpkm')
	plt.savefig(dirin+sample1+'-'+sample2+'-rpkm.pdf', bbox_inches='tight')
	print dirin+sample1+'-'+sample2+'-rpkm.pdf'
	plt.close()
#	try:
	
Exemple #30
0
import os, sys
import commands
import read
import write
import numpy as np
import time
import subprocess

table = read.read_dat("peaks.txt")
data = table
cells = ["D83V", "K4E", "pcDNA", "WT", "Y69H"]
marks = ["H3K27ac", "H3K4me3", "H3K9me3", "V5", "input"]
ran = 0
for irow in xrange(1, len(data)):
    row = data[irow]
    vector = row[-3]
    solution = row[-1][:-2]
    mymark = ""
    if "input" in row[-2]:
        mymark = "input"
    else:
        mymark = row[-2]
    if '1' in row[-4]:
        rep = '1'
    else:
        rep = '2'
    bam = row[5]
    if 'WT' not in vector and 'input' not in mymark and 'H3K4' not in mymark:
        for ind in xrange(-1, 4):
            try:
                ii = data[irow + ind][-3]
Exemple #31
0
import sys, os, numpy as np
sys.path.append('/Users/ssaberim/epigenomics/code')
import read,write,bedtools,analyse



ID=str(sys.argv[1])
drrna=str(sys.argv[2])
sample=str(sys.argv[3])
outfile=str(sys.argv[4])
field=int(sys.argv[5])

libsfile=str(sys.argv[6])

mark='RNA'
libsdata=read.read_dat(libsfile,'\t')

indmark=libsdata[0].index(mark)

list1=[]
for i in range(1,len(libsdata)):

	if libsdata[i][field] in sample and len(libsdata[i][field])>0:
		list1.append(libsdata[i][indmark])

hugo="/Users/ssaberim/epigenomics/resources/list.genes2.txt"
hugo=read.read_dat(hugo)
#print hugo[0]
ID=read.read_dat(ID,'\t')
for i in range(len(ID)):
	for j  in range(len(hugo)):
Exemple #32
0
def mk_cluster(dr, mark, loc, *perc):
    #writes the tanle for clustering

    header = []
    if len(perc) > 0:
        f = open('cluster-' + mark + '-' + loc + '-' + str(perc[0]) + '.txt',
                 'w')
    else:
        f = open('cluster-' + mark + '-' + loc + '.txt', 'w')
    direct = os.popen('ls ' + dr + '/*' + mark + '*.coverage')
    for fl in direct.readlines():
        #print fl
        libst = ['A', 'HS', 'E']
        ind = 0
        run = True
        while run and ind <= len(libst):
            try:
                Ai = fl.index(libst[ind])
                run = False
            except:
                ind += 1
        Aend = Ai + fl[Ai:].index('.')
        print >> f, '\t', fl[Ai:Aend],
        header.append(fl[Ai:Aend])

    print >> f, '\n',
    direct = os.popen('ls ' + dr + '/*' + mark + '*.coverage')
    data = []
    for fl in direct.readlines():
        if loc in fl:
            tmpdata = read.read_dat(fl[:-1], '\t')
            if len(tmpdata) < 1:
                print len(tmpdata), fl[:-1]
            else:
                data.append(tmpdata)
    print len(data), len(data[0])
    enrich = []
    lable = []
    table = []

    for line in range(len(data[0])):

        table.append([])
        #lable.append([])
        for tmpdata in data:
            #print  (tmpdata[0])
            table[-1].append(float(tmpdata[line][-2]))
        table[-1] = np.array(table[-1])
        tmp2 = ''
        for i in range(4):
            tmp2 = tmp2 + str(tmpdata[line][i]) + '_'
        tmp2 = tmp2[:-1] + '_1'
        lable.append(tmp2)
    table = np.array(table)
    print len(table[0, :]), len(table[:, 0])
    if len(perc) > 0:
        for row in range(len(table[0, :])):
            tmp = np.percentile(table[row, :], float(perc[0]))
            print tmp,
            table[row, :][table[row, :] < tmp] = 0.
    enrich = []
    for i in range(len(table[:, 0])):
        enrich.append('')
        for j in range(len(table[0, :])):
            enrich[-1] += str(table[i, j]) + '\t'
    for line in range(len(data[0])):
        print >> f, lable[line], enrich[line]
    #print lable[:3],enrich[:3]
    f.close()

    return lable, header, table
Exemple #33
0
def gene_rpkm_corr_compare(thegene, thersh, rpkmdata, rpkmfiles, libs,
                           *enslist):
    """Compare each gene's correlation with ``thegene`` in RT vs other samples.

    Usage:
        genescompared, genesfc, pvals = analyse.gene_rpkm_corr_compare(
            'ENSG00000108799', 0.000001, rpkm, files,
            'rhabdoid/RNA-ALL-key.txt', 'resources/list.genes2.txt')

    Parameters:
        thegene:   identifier searched (substring test) in field 0 of every
                   record to find the reference gene.
        thersh:    threshold applied to the p-value in the non-RT group and to
                   the ratio of the two p-values.
        rpkmdata:  one sequence of records per sample; field 0 is the gene
                   identifier and field 2 the value that is correlated.
                   Assumes every sample lists its genes in the same order as
                   ``rpkmdata[0]`` -- TODO confirm.
        rpkmfiles: file name of each sample; the part before the first '.' is
                   used as the library identifier.
        libs:      path of a tab-separated key file read with
                   ``read.read_dat``; a matching row renames the sample to
                   ``row[1] + '-' + row[-1]``.
        enslist:   accepted for backward compatibility; not used.

    Returns:
        (genelist, geneimp, pvals): identifiers of the selected genes, the
        ratio of mean values (RT / other) and ``log(p_RT / p_other)``.
        ``geneimp`` and ``pvals`` skip genes whose ratio is exactly 1 (or
        NaN), so they can be shorter than ``genelist``.

    Samples whose label contains 'Blood' are excluded from BOTH the reference
    vectors and the per-gene vectors.  Previously only the per-gene vectors
    dropped them, which handed ``pearsonr`` arrays of different length.
    """
    genelist = []
    geneimp = []
    pvals = []

    rpkmgene = []
    files = []
    datasets = []  # index into rpkmdata for every entry of `files`
    for ind in range(len(rpkmdata)):
        for record in rpkmdata[ind]:
            if thegene in record[0]:
                rpkmgene.append(record[2])
                files.append(rpkmfiles[ind][:rpkmfiles[ind].index('.')])
                datasets.append(ind)
                break

    libs = read.read_dat(libs, '\t')
    for ind in range(len(files)):
        for lib in libs:
            if files[ind] in lib[0]:
                files[ind] = lib[1] + '-' + lib[-1]
                break

    # Split the usable samples once so that the reference gene and every
    # candidate gene are measured on exactly the same samples.
    usable = [k for k in range(len(files)) if 'Blood' not in files[k]]
    rt_samples = [k for k in usable if 'RT' in files[k]]
    normal_samples = [k for k in usable if 'RT' not in files[k]]

    rpkmgeneRT = np.array([rpkmgene[k] for k in rt_samples])
    rpkmgenenormal = np.array([rpkmgene[k] for k in normal_samples])
    print('%d %d' % (len(rpkmgeneRT), len(rpkmgenenormal)))

    for igene in range(len(rpkmdata[0])):
        # `datasets` maps a sample position back to its rpkmdata index, so
        # samples in which `thegene` was not found no longer shift the rest.
        rpkmRT = np.array(
            [rpkmdata[datasets[k]][igene][2] for k in rt_samples])
        rpkmnormal = np.array(
            [rpkmdata[datasets[k]][igene][2] for k in normal_samples])

        a = ss.pearsonr(rpkmRT, rpkmgeneRT)
        b = ss.pearsonr(rpkmnormal, rpkmgenenormal)
        mn = np.mean(rpkmRT) / np.mean(rpkmnormal)

        if b[1] < thersh and (a[1] / b[1]) > thersh:
            genelist.append(rpkmdata[0][igene][0])
            # NOTE(review): this only filters out a ratio of exactly 1 (and
            # NaN); it looks like the remains of a fold-change cut-off.
            if mn > 1. or mn < 1.:
                geneimp.append(mn)
                pvals.append(np.log(a[1] / b[1]))

    return genelist, geneimp, pvals
import os, sys 
import commands
import read
import numpy as np
import time
import subprocess
import write
# Load the tab-separated peaks table; `data` is a second name for the same object.
table=read.read_dat("peaks.txt",'\t')
data=table
cells=["D83V","K4E", "pcDNA","WT","Y69H"]
marks=["H3K27ac","H3K4me3","H3K9me3","V5","input"]
ran=0 
# Rows 0 and 1 are skipped -- presumably header lines (TODO confirm against peaks.txt).
for irow in xrange(2,len(data)):
	row=data[irow]
	# Fields are addressed from the end of the row: [-4] replicate, [-3] vector,
	# [-2] mark, [-1] solution (its last two characters are dropped).
	vector=row[-3]
	solution=row[-1][:-2]
	mymark=""
	#print row[-2]
	# NOTE(review): this tests `row` itself, not row[-2]; if `row` is a list it
	# only matches a field that equals "input" exactly -- confirm that a
	# substring test on the mark field was not intended.
	if "input" in row:
				mymark="input"
	else:
		mymark=row[-2]
	# Replicate is '1' when the field contains the character '1', otherwise '2'.
	if '1' in row[-4]:
		rep='1'
	else:
		rep='2'
	bam=row[5]
	# Only non-WT, non-input rows are processed further; `mymark in marks` is
	# list membership if mymark is a string, i.e. an exact match against `marks`.
	if 'WT' not in vector and 'input' not in mymark and mymark in marks:
		# Look at the rows from one before to three after the current one.
		for ind in xrange(-1,4):
			try:
				ii=data[irow+ind][-3]
Exemple #35
0
def gene_rpkm_corr_compare(thegene, thersh, rpkmdata, rpkmfiles, libs,
                           *enslist):
    """Compare each gene's correlation with ``thegene`` in RT vs other samples.

    Usage:
        genescompared, genesfc, pvals = analyse.gene_rpkm_corr_compare(
            'ENSG00000108799', 0.000001, rpkm, files,
            'rhabdoid/RNA-ALL-key.txt', 'resources/list.genes2.txt')

    Parameters:
        thegene:   identifier searched (substring test) in field 0 of every
                   record to find the reference gene.
        thersh:    threshold applied to the p-value in the non-RT group and to
                   the ratio of the two p-values.
        rpkmdata:  one sequence of records per sample; field 0 is the gene
                   identifier and field 2 the value that is correlated.
                   Assumes every sample lists its genes in the same order as
                   ``rpkmdata[0]`` -- TODO confirm.
        rpkmfiles: file name of each sample; the part before the first '.' is
                   used as the library identifier.
        libs:      path of a tab-separated key file read with
                   ``read.read_dat``; a matching row renames the sample to
                   ``row[1] + '-' + row[-1]``.
        enslist:   accepted for backward compatibility; not used.

    Returns:
        (genelist, geneimp, pvals): identifiers of the selected genes, the
        ratio of mean values (RT / other) and ``log(p_RT / p_other)``.
        ``geneimp`` and ``pvals`` skip genes whose ratio is exactly 1 (or
        NaN), so they can be shorter than ``genelist``.

    Samples whose label contains 'Blood' are excluded from BOTH the reference
    vectors and the per-gene vectors.  Previously only the per-gene vectors
    dropped them, which handed ``pearsonr`` arrays of different length.
    """
    genelist = []
    geneimp = []
    pvals = []

    rpkmgene = []
    files = []
    datasets = []  # index into rpkmdata for every entry of `files`
    for ind in range(len(rpkmdata)):
        for record in rpkmdata[ind]:
            if thegene in record[0]:
                rpkmgene.append(record[2])
                files.append(rpkmfiles[ind][:rpkmfiles[ind].index('.')])
                datasets.append(ind)
                break

    libs = read.read_dat(libs, '\t')
    for ind in range(len(files)):
        for lib in libs:
            if files[ind] in lib[0]:
                files[ind] = lib[1] + '-' + lib[-1]
                break

    # Split the usable samples once so that the reference gene and every
    # candidate gene are measured on exactly the same samples.
    usable = [k for k in range(len(files)) if 'Blood' not in files[k]]
    rt_samples = [k for k in usable if 'RT' in files[k]]
    normal_samples = [k for k in usable if 'RT' not in files[k]]

    rpkmgeneRT = np.array([rpkmgene[k] for k in rt_samples])
    rpkmgenenormal = np.array([rpkmgene[k] for k in normal_samples])
    print('%d %d' % (len(rpkmgeneRT), len(rpkmgenenormal)))

    for igene in range(len(rpkmdata[0])):
        # `datasets` maps a sample position back to its rpkmdata index, so
        # samples in which `thegene` was not found no longer shift the rest.
        rpkmRT = np.array(
            [rpkmdata[datasets[k]][igene][2] for k in rt_samples])
        rpkmnormal = np.array(
            [rpkmdata[datasets[k]][igene][2] for k in normal_samples])

        a = ss.pearsonr(rpkmRT, rpkmgeneRT)
        b = ss.pearsonr(rpkmnormal, rpkmgenenormal)
        mn = np.mean(rpkmRT) / np.mean(rpkmnormal)

        if b[1] < thersh and (a[1] / b[1]) > thersh:
            genelist.append(rpkmdata[0][igene][0])
            # NOTE(review): this only filters out a ratio of exactly 1 (and
            # NaN); it looks like the remains of a fold-change cut-off.
            if mn > 1. or mn < 1.:
                geneimp.append(mn)
                pvals.append(np.log(a[1] / b[1]))

    return genelist, geneimp, pvals
Exemple #36
0
def gene_rpkm_lineage(hugo, rpkmdata, rpkmfiles, allgenes, libs, *plotbool):
    """Plot log10 RPKM profiles of the genes in ``hugo`` across all samples.

    The names in ``hugo`` are translated with ``ens_genes(..., allgenes)``.
    For every translated gene the value in field 2 of the first matching
    record of each sample is collected.  Samples are ordered by decreasing
    value of the first gene, relabelled from the tab-separated key file
    ``libs`` and plotted as one line per gene.

    Parameters:
        hugo:      gene names; also used as plot labels.
        rpkmdata:  one sequence of records per sample (field 0 identifier,
                   field 2 value).
        rpkmfiles: file name of each sample; the part before the first '.'
                   identifies the library.
        allgenes:  passed to ``ens_genes``; its length also scales the
                   printed p-values.
        libs:      path of the key file read with ``read.read_dat``.
        plotbool:  optional; when ``plotbool[0]`` is true, correlation heat
                   maps are drawn and KS / t-test p-values are printed.

    Returns:
        (rpkmls, files, normal values as returned by ``data2arr``,
        array of normal sample labels, genelist).
    """
    genelist = ens_genes([hugo[k] for k in range(len(hugo))], allgenes)

    rpkmls = []
    files = []
    for gene in genelist:
        values = []
        rpkmls.append(values)
        # Sample names are recorded only while the first gene is processed.
        first_gene = len(rpkmls) == 1
        for sample in range(len(rpkmdata)):
            for record in rpkmdata[sample]:
                if gene not in record[0]:
                    continue
                values.append(record[2])
                if first_gene:
                    fname = rpkmfiles[sample]
                    files.append(fname[:fname.index('.')])
                break

    files = np.array(files)
    rpkmls = data2arr(rpkmls)

    # Order the samples by decreasing value of the first gene.
    order = rpkmls[0].argsort()
    files = files[order][::-1]
    n_genes = len(genelist)
    for g in range(n_genes):
        # Kept as two assignments, exactly like the code this replaces.
        rpkmls[g] = rpkmls[g][order]
        rpkmls[g] = rpkmls[g][::-1]

    libs = read.read_dat(libs, '\t')
    for pos in range(len(files)):
        for lib in libs:
            if files[pos] in lib[0]:
                files[pos] = lib[1] + '-' + lib[-1]
                break

    for g in range(n_genes):
        rpkmls[g] = np.log(rpkmls[g] + 0.001) / np.log(10)

    normalfiles = []
    RTfiles = []
    normalrpkms = [[] for _ in range(n_genes)]
    RTrpkms = [[] for _ in range(n_genes)]
    for pos in range(len(files)):
        label = files[pos]
        if not ('Cancer' in label or 'ES' in label):
            print(label)
            normalfiles.append(label)
            for g in range(n_genes):
                normalrpkms[g].append(rpkmls[g][pos])
        elif 'RT' in label:
            RTfiles.append(label)
            for g in range(n_genes):
                RTrpkms[g].append(rpkmls[g][pos])

    plt.figure()
    for g in range(n_genes):
        # The first seven genes get square markers, the rest triangles.
        style = 's-' if g < 7 else '^-'
        plt.plot(range(len(list(files))), rpkmls[g], style, label=hugo[g])
    plt.legend(loc=1)
    plt.xticks(range(len(files)), files)
    plt.xticks(rotation=90)
    plt.xticks(fontsize=8)
    plt.ylabel('$log_{10}$ $RPKM$')
    plt.grid()
    plt.show()

    def heatmap(values, names, txt):
        # Draw the gene-by-gene correlation matrix of `values`.
        values = data2arr(values)
        norm, matrix, dist = analyse.all_corr(values.T)
        plt.matshow((matrix), cmap='RdYlBu', vmax=1, vmin=-1)
        plt.yticks(range(len(names)), names)
        plt.xticks(range(len(names)), names)
        plt.xticks(rotation=90)
        plt.xlabel('Correlation-' + txt)
        plt.colorbar()

    if len(plotbool) > 0 and plotbool[0]:
        heatmap(rpkmls, hugo, 'All Samples')
        heatmap(normalrpkms, hugo, 'Normal Samples')
        heatmap(RTrpkms, hugo, 'RT Samples')
        # The p-values are multiplied by the number of genes in `allgenes`.
        n_tests = float(len(allgenes))
        for k in range(len(hugo)):
            ks_p = ss.ks_2samp(normalrpkms[k], RTrpkms[k])[1] * n_tests
            print('%s %s %s' % (hugo[k], ks_p, 'KS'))
            t_p = ss.ttest_ind(normalrpkms[k], RTrpkms[k])[1] * n_tests
            print('%s %s %s' % (hugo[k], t_p, 'T-test'))

    return rpkmls, files, data2arr(normalrpkms), np.array(
        normalfiles), genelist
Exemple #37
0
import os, sys 
import commands
import read
import write
import numpy as np

tmp=sys.argv
fl=tmp[1]
#vs=str(tmp[3])
mark=map(str,tmp[2].split('-'))
mark=mark[1]
vs='input'
data=read.read_dat(fl)
out='table'+'-'+vs+'.txt'
f=open(out,'a')
tmp=fl[fl.index('/')+1:]
solution=tmp.split('_')[1]
rep=tmp.split('_')[2]

tmp = str(data[-1][-1])
tmp=tmp.split('_')
rank=tmp[2]
coverage=tmp[-1]
vec=tmp[0]
try:
	print >> f, rep,mark,vec,solution,vs,coverage
except:
	print vec
f.close()

Exemple #38
0
def read_wig(file, *flagstat):
    """Read a fixed-step wiggle file into runs of equal coverage.

    Every declaration line (``... chrom=<name> start=<int> step=<int>``)
    opens a block; the integer lines that follow are coverage values at
    ``start``, ``start + step``, ...  Consecutive equal values inside a block
    are merged into one region ``[begin, end, coverage]`` with
    ``end = begin + n_values * step``.

    Parameters:
        file:     path of the wiggle file; opened with ``gzip`` when the path
                  contains '.gz'.
        flagstat: optional; ``flagstat[0]`` is the path of a tab-separated
                  table read with ``read.read_dat``.  The first row whose
                  field 0 contains ``file[0:6]`` supplies the factor (its last
                  field) handed to ``norm_wig``.

    Returns:
        dict mapping chromosome name to a numpy array of
        ``[begin, end, coverage]`` rows (an empty array for a block without
        values).

    Raises:
        ValueError: for a non-integer line that has no ``chrom=`` and
            ``start=`` fields, or a value that precedes any declaration line.

    Fixes over the previous implementation: the file is read once and always
    closed (an extra handle used to be leaked); the first run of the file was
    one step too long, and a later block could be corrupted by the previous
    block's coverage value; the last run of every block is now emitted
    instead of being dropped.  Blank lines and 'track' / 'browser' / '#'
    lines are skipped.
    """
    opener = gzip.open if '.gz' in file else open
    f = opener(file, 'rb')
    try:
        raw = f.read()
    finally:
        f.close()
    if not isinstance(raw, str):
        # Python 3 hands back bytes in binary mode.
        raw = raw.decode('ascii')
    lines = raw.splitlines()
    if lines:
        print(lines[0])
    print('reading done')

    elts = {}
    chrom = None
    position = 0      # coordinate of the value being read
    step = 1
    run_value = None  # coverage of the run being built; None = no open run
    run_begin = run_end = 0
    for line in lines:
        line = line.strip()
        if not line or line.startswith(('track', 'browser', '#')):
            continue
        try:
            cover = int(line)
        except ValueError:
            # Declaration line: close the open run, then start a new block.
            if run_value is not None:
                elts[chrom].append([run_begin, run_end, run_value])
                run_value = None
            fields = dict(tok.split('=', 1) for tok in line.split()
                          if '=' in tok)
            if 'chrom' not in fields or 'start' not in fields:
                raise ValueError('unrecognised wiggle line: %r' % line)
            chrom = fields['chrom']
            position = int(fields['start'])
            step = int(fields.get('step', 1))
            elts.setdefault(chrom, [])
            continue
        if chrom is None:
            raise ValueError('coverage value found before a declaration line')
        if cover == run_value:
            run_end += step
        else:
            if run_value is not None:
                elts[chrom].append([run_begin, run_end, run_value])
            run_begin = position
            run_end = position + step
            run_value = cover
        position += step
    if run_value is not None:
        elts[chrom].append([run_begin, run_end, run_value])

    for key in elts.keys():
        elts[key] = np.array(elts[key])

    if len(flagstat) > 0:
        # Normalisation stays best effort, but a failure is now reported
        # instead of being swallowed silently.
        try:
            normalization = read.read_dat(str(flagstat[0]), '\t')
            fl = file[0:6]
            print(fl)
            for row in normalization:
                if fl in row[0]:
                    elts = norm_wig(elts, row[-1])
                    break
            else:
                print('%s not found' % fl)
        except Exception as err:
            print('normalization skipped: %s' % err)
    return elts
Exemple #39
0
def heat_rna(fl, genelist, field, libfile):
    """Build a gene-by-library matrix from a long-format table and plot it.

    Each row of ``fl`` is read as ``[library, group, gene, value, ...]``; the
    library key is ``row[0] + '-' + row[1]``.  Genes are ordered by the
    projection of their (length-normalised) row onto a per-library group
    index, all-zero rows are dropped, and the matrix is passed through
    ``norm_max``.

    Parameters:
        fl:       path of the table, read with ``read.read_dat``; its last
                  row is discarded and blank rows are ignored.
        genelist: when non-empty it is passed to ``ens_genes`` to translate
                  the gene identifiers and the heat map is drawn; when empty
                  the function returns without plotting.
        field:    column of the library table appended to the library id to
                  form the display label.
        libfile:  path of a tab-separated library table, or '' for none.

    Returns:
        (nm, lib2, gene, genex, lib): normalised matrix, display labels of
        the libraries, gene identifiers (numpy array, sorted and filtered),
        translated gene names (``gene`` itself when ``genelist`` is empty),
        and the raw library keys.

    Fixes over the previous implementation: blank rows are filtered
    (``list.delete`` does not exist); the library table is only read when
    ``libfile`` is non-empty; a display label is assigned when its library
    is first seen, so rows that are not grouped by library no longer relabel
    the wrong entry; the x-axis label is the file stem, which used to be
    computed only after ``plt.xlabel`` had been called; index lookups use
    dicts, not repeated ``list.index``.
    """
    data = read.read_dat(fl)
    data = [row for row in data[:-1] if row != ['']]

    gene = []
    gene_index = {}
    for i in data:
        try:
            if i[2] not in gene_index:
                gene_index[i[2]] = len(gene)
                gene.append(i[2])
        except (IndexError, TypeError):
            print(i)

    libstable = read.read_dat(libfile, '\t') if len(libfile) > 0 else []

    lib = []
    lib2 = []
    lib_index = {}
    for i in data:
        try:
            key = i[0] + '-' + i[1]
            if key in lib_index:
                continue
            lib_index[key] = len(lib)
            lib.append(key)
            lib2.append(key)
            for ID in libstable:
                if i[0] in ID:
                    lib2[-1] = ID[0] + '-' + ID[field]
                    break
        except (IndexError, TypeError):
            print(i)

    # Group index per library: starts at -0.5 and grows by one whenever the
    # second '-'-separated part of the key differs from the previous key's.
    pheno = []
    pnum = -1.5
    hist = ''
    for i in lib:
        j = i.split('-')
        if j[1] != hist:
            pnum += 1
        pheno.append(pnum)
        hist = j[1]

    mat = np.zeros((len(gene), len(lib)), float)
    for i in data:
        mat[gene_index[i[2]], lib_index[i[0] + '-' + i[1]]] = float(i[3])

    # Projection of every gene row on the group index, scaled by row length.
    proj = np.dot(mat, pheno)
    for i in range(len(proj)):
        norm = np.sqrt(np.dot(mat[i, :], mat[i, :]))
        if norm != 0.:
            proj[i] /= norm

    order = proj.argsort()
    mat = mat[order]
    gene = np.array(gene)[order]
    nonzero = np.any(mat != 0, axis=1)
    mat = mat[nonzero]
    gene = gene[nonzero]

    nm = norm_max(mat)
    if len(genelist) == 0:
        return nm, lib2, gene, gene, lib

    genex = ens_genes(gene, genelist)

    plt.matshow(nm, aspect='auto', cmap='RdYlBu')
    if len(libfile) > 0:
        plt.xticks(range(len(lib2)), lib2)
    else:
        plt.xticks(range(len(lib)), lib)
    plt.colorbar()
    plt.xticks(rotation=90)
    plt.yticks(range(len(genex)), genex)
    plt.yticks(fontsize=8)
    mytemplate(nm)
    # File name without directory and without its last five characters.
    plt.xlabel(fl.split('/')[-1][:-5])

    return nm, lib2, gene, genex, lib
Exemple #40
0
def heat_rna(fl, genelist, field, libfile):
    """Build a gene-by-library matrix from a long-format table and plot it.

    Each row of ``fl`` is read as ``[library, group, gene, value, ...]``; the
    library key is ``row[0] + '-' + row[1]``.  Genes are ordered by the
    projection of their (length-normalised) row onto a per-library group
    index, all-zero rows are dropped, and the matrix is passed through
    ``norm_max``.

    Parameters:
        fl:       path of the table, read with ``read.read_dat``; its last
                  row is discarded and blank rows are ignored.
        genelist: when non-empty it is passed to ``ens_genes`` to translate
                  the gene identifiers and the heat map is drawn; when empty
                  the function returns without plotting.
        field:    column of the library table appended to the library id to
                  form the display label.
        libfile:  path of a tab-separated library table, or '' for none.

    Returns:
        (nm, lib2, gene, genex, lib): normalised matrix, display labels of
        the libraries, gene identifiers (numpy array, sorted and filtered),
        translated gene names (``gene`` itself when ``genelist`` is empty),
        and the raw library keys.

    Fixes over the previous implementation: blank rows are filtered
    (``list.delete`` does not exist); the library table is only read when
    ``libfile`` is non-empty; a display label is assigned when its library
    is first seen, so rows that are not grouped by library no longer relabel
    the wrong entry; the x-axis label is the file stem, which used to be
    computed only after ``plt.xlabel`` had been called; index lookups use
    dicts, not repeated ``list.index``.
    """
    data = read.read_dat(fl)
    data = [row for row in data[:-1] if row != ['']]

    gene = []
    gene_index = {}
    for i in data:
        try:
            if i[2] not in gene_index:
                gene_index[i[2]] = len(gene)
                gene.append(i[2])
        except (IndexError, TypeError):
            print(i)

    libstable = read.read_dat(libfile, '\t') if len(libfile) > 0 else []

    lib = []
    lib2 = []
    lib_index = {}
    for i in data:
        try:
            key = i[0] + '-' + i[1]
            if key in lib_index:
                continue
            lib_index[key] = len(lib)
            lib.append(key)
            lib2.append(key)
            for ID in libstable:
                if i[0] in ID:
                    lib2[-1] = ID[0] + '-' + ID[field]
                    break
        except (IndexError, TypeError):
            print(i)

    # Group index per library: starts at -0.5 and grows by one whenever the
    # second '-'-separated part of the key differs from the previous key's.
    pheno = []
    pnum = -1.5
    hist = ''
    for i in lib:
        j = i.split('-')
        if j[1] != hist:
            pnum += 1
        pheno.append(pnum)
        hist = j[1]

    mat = np.zeros((len(gene), len(lib)), float)
    for i in data:
        mat[gene_index[i[2]], lib_index[i[0] + '-' + i[1]]] = float(i[3])

    # Projection of every gene row on the group index, scaled by row length.
    proj = np.dot(mat, pheno)
    for i in range(len(proj)):
        norm = np.sqrt(np.dot(mat[i, :], mat[i, :]))
        if norm != 0.:
            proj[i] /= norm

    order = proj.argsort()
    mat = mat[order]
    gene = np.array(gene)[order]
    nonzero = np.any(mat != 0, axis=1)
    mat = mat[nonzero]
    gene = gene[nonzero]

    nm = norm_max(mat)
    if len(genelist) == 0:
        return nm, lib2, gene, gene, lib

    genex = ens_genes(gene, genelist)

    plt.matshow(nm, aspect='auto', cmap='RdYlBu')
    if len(libfile) > 0:
        plt.xticks(range(len(lib2)), lib2)
    else:
        plt.xticks(range(len(lib)), lib)
    plt.colorbar()
    plt.xticks(rotation=90)
    plt.yticks(range(len(genex)), genex)
    plt.yticks(fontsize=8)
    mytemplate(nm)
    # File name without directory and without its last five characters.
    plt.xlabel(fl.split('/')[-1][:-5])

    return nm, lib2, gene, genex, lib
Exemple #41
0
def gene_rpkm_lineage(hugo, rpkmdata, rpkmfiles, allgenes, libs, *plotbool):
    """Plot log10 RPKM profiles of the genes in ``hugo`` across all samples.

    The names in ``hugo`` are translated with ``ens_genes(..., allgenes)``.
    For every translated gene the value in field 2 of the first matching
    record of each sample is collected.  Samples are ordered by decreasing
    value of the first gene, relabelled from the tab-separated key file
    ``libs`` and plotted as one line per gene.

    Parameters:
        hugo:      gene names; also used as plot labels.
        rpkmdata:  one sequence of records per sample (field 0 identifier,
                   field 2 value).
        rpkmfiles: file name of each sample; the part before the first '.'
                   identifies the library.
        allgenes:  passed to ``ens_genes``; its length also scales the
                   printed p-values.
        libs:      path of the key file read with ``read.read_dat``.
        plotbool:  optional; when ``plotbool[0]`` is true, correlation heat
                   maps are drawn and KS / t-test p-values are printed.

    Returns:
        (rpkmls, files, normal values as returned by ``data2arr``,
        array of normal sample labels, genelist).
    """
    genelist = ens_genes([hugo[k] for k in range(len(hugo))], allgenes)

    rpkmls = []
    files = []
    for gene in genelist:
        values = []
        rpkmls.append(values)
        # Sample names are recorded only while the first gene is processed.
        first_gene = len(rpkmls) == 1
        for sample in range(len(rpkmdata)):
            for record in rpkmdata[sample]:
                if gene not in record[0]:
                    continue
                values.append(record[2])
                if first_gene:
                    fname = rpkmfiles[sample]
                    files.append(fname[:fname.index('.')])
                break

    files = np.array(files)
    rpkmls = data2arr(rpkmls)

    # Order the samples by decreasing value of the first gene.
    order = rpkmls[0].argsort()
    files = files[order][::-1]
    n_genes = len(genelist)
    for g in range(n_genes):
        # Kept as two assignments, exactly like the code this replaces.
        rpkmls[g] = rpkmls[g][order]
        rpkmls[g] = rpkmls[g][::-1]

    libs = read.read_dat(libs, '\t')
    for pos in range(len(files)):
        for lib in libs:
            if files[pos] in lib[0]:
                files[pos] = lib[1] + '-' + lib[-1]
                break

    for g in range(n_genes):
        rpkmls[g] = np.log(rpkmls[g] + 0.001) / np.log(10)

    normalfiles = []
    RTfiles = []
    normalrpkms = [[] for _ in range(n_genes)]
    RTrpkms = [[] for _ in range(n_genes)]
    for pos in range(len(files)):
        label = files[pos]
        if not ('Cancer' in label or 'ES' in label):
            print(label)
            normalfiles.append(label)
            for g in range(n_genes):
                normalrpkms[g].append(rpkmls[g][pos])
        elif 'RT' in label:
            RTfiles.append(label)
            for g in range(n_genes):
                RTrpkms[g].append(rpkmls[g][pos])

    plt.figure()
    for g in range(n_genes):
        # The first seven genes get square markers, the rest triangles.
        style = 's-' if g < 7 else '^-'
        plt.plot(range(len(list(files))), rpkmls[g], style, label=hugo[g])
    plt.legend(loc=1)
    plt.xticks(range(len(files)), files)
    plt.xticks(rotation=90)
    plt.xticks(fontsize=8)
    plt.ylabel('$log_{10}$ $RPKM$')
    plt.grid()
    plt.show()

    def heatmap(values, names, txt):
        # Draw the gene-by-gene correlation matrix of `values`.
        values = data2arr(values)
        norm, matrix, dist = analyse.all_corr(values.T)
        plt.matshow((matrix), cmap='RdYlBu', vmax=1, vmin=-1)
        plt.yticks(range(len(names)), names)
        plt.xticks(range(len(names)), names)
        plt.xticks(rotation=90)
        plt.xlabel('Correlation-' + txt)
        plt.colorbar()

    if len(plotbool) > 0 and plotbool[0]:
        heatmap(rpkmls, hugo, 'All Samples')
        heatmap(normalrpkms, hugo, 'Normal Samples')
        heatmap(RTrpkms, hugo, 'RT Samples')
        # The p-values are multiplied by the number of genes in `allgenes`.
        n_tests = float(len(allgenes))
        for k in range(len(hugo)):
            ks_p = ss.ks_2samp(normalrpkms[k], RTrpkms[k])[1] * n_tests
            print('%s %s %s' % (hugo[k], ks_p, 'KS'))
            t_p = ss.ttest_ind(normalrpkms[k], RTrpkms[k])[1] * n_tests
            print('%s %s %s' % (hugo[k], t_p, 'T-test'))

    return rpkmls, files, data2arr(normalrpkms), np.array(
        normalfiles), genelist
Exemple #42
0
import sys,read, write,bedtools, analyse


# Command-line arguments: mark, the two sample names, num, T, output
# directory, column of the library table holding the sample name, library
# table path and the directory with the TSS coverage files.
mark = sys.argv[1]
sample1 = sys.argv[2]
sample2 = sys.argv[3]
num = int(sys.argv[4])
T = float(sys.argv[5])
dirout = sys.argv[6]
field = int(sys.argv[7])
libsfile = str(sys.argv[8])
TSSdir = str(sys.argv[9])

# The header row of the library table tells which column belongs to `mark`.
libsdata = read.read_dat(libsfile, '\t')
indmark = libsdata[0].index(mark)

# Split the libraries of this mark by sample.  sample1 is tested first, and an
# entry is kept only when it is longer than one character.
list1 = []
list2 = []
for i in range(1, len(libsdata)):
    entry = libsdata[i]
    if sample1 in entry[field]:
        chosen = list1
    elif sample2 in entry[field]:
        chosen = list2
    else:
        continue
    if len(entry[indmark]) > 1:
        chosen.append(entry[indmark])

# Coverage tables of both samples, keyed by sample name.
beds = {}
files1, beds[sample1] = read.readall_bed(TSSdir, mark, list1)
files2, beds[sample2] = read.readall_bed(TSSdir, mark, list2)
genes = read.read_gene_pos(
    'rhabdoid/coverage/TSS_2000_all/hg19v69_genes.TSS_2000.pc.A03480.H3K27me3.GE02.coverage')
Exemple #43
0
#!/usr/local/bin/env python
#coding: utf8
import sys, read, write, bedtools, analyse

mark = sys.argv[1]
sample1 = sys.argv[2]
sample2 = sys.argv[3]
num = int(sys.argv[4])
T = float(sys.argv[5])
dirout = sys.argv[6]
field = int(sys.argv[7])
libsfile = str(sys.argv[8])
TSSdir = str(sys.argv[9])

beds = {}
libsdata = read.read_dat(libsfile, '\t')
indmark = libsdata[0].index(mark)
list1 = []
list2 = []
for i in xrange(1, len(libsdata)):
    if sample1 in libsdata[i][field] and len(libsdata[i][indmark]) > 1:
        list1.append(libsdata[i][indmark])

    elif sample2 in libsdata[i][field] and len(libsdata[i][indmark]) > 1:
        list2.append(libsdata[i][indmark])

beds = {}
files1, beds[sample1] = read.readall_bed(TSSdir, mark, list1)
files2, beds[sample2] = read.readall_bed(TSSdir, mark, list2)
genes = read.read_gene_pos(
    'rhabdoid/coverage/TSS_2000_all/hg19v69_genes.TSS_2000.pc.A03480.H3K27me3.GE02.coverage'