Esempio n. 1
0
def refseq_table(FILE, TSS=True, out_file="/Users/joazofeifa/Lab/genome_files/TSS.bed", pad=500):
	"""Read a RefSeq table, dump its intervals to a BED-like file and index them.

	The first line of FILE is skipped as a header. From every other line the
	tab-separated columns 1-5 (0-based) are read as name, chromosome, strand,
	start and stop.

	FILE     -- path of the tab-separated RefSeq table.
	TSS      -- when true, each stored interval is a window of +/- ``pad``
	            around ``start`` for "+" strand rows and around ``stop`` for
	            every other strand; when false the interval is (start, stop).
	out_file -- where the intervals are written, one
	            ``chrom<TAB>start<TAB>stop<TAB>name`` line each. The default is
	            the path this function has always written to.
	pad      -- half-width of the TSS window.

	Returns a dict mapping chromosome -> node.tree built from the
	(start, stop, name) tuples of that chromosome.
	"""
	G 	= {}
	with open(FILE) as FH:
		next(FH, None)  # first line is the header
		for line in FH:
			N, chrom, strand, start, stop 	= line.strip("\n").split("\t")[1:6]
			if TSS:
				anchor 	= int(start) if strand == "+" else int(stop)
				lo, hi 	= anchor - pad, anchor + pad
			else:
				lo, hi 	= int(start), int(stop)
			G.setdefault(chrom, list()).append((lo, hi, N))
	# The context manager guarantees the dump is flushed and closed even if a write fails.
	with open(out_file, "w") as FHW:
		for chrom in G:
			for st, sp, N in G[chrom]:
				FHW.write(chrom + "\t" + str(st) + "\t" + str(sp) + "\t" + N + "\n")
	for chrom in G:
		G[chrom] 	= node.tree(G[chrom])
	return G
Esempio n. 2
0
def insert_clinVarSNP(G, FILE, data_type, sig_level=4): #vcf
	"""Attach ClinVar variants from a VCF file to the intervals that contain them.

	G         -- dict mapping chromosome -> iterable of interval objects that
	             expose ``start``, ``stop`` and ``insert_data(pos, ID, data_type)``.
	FILE      -- path of the VCF file; lines starting with "#" are skipped and
	             every other line must have exactly eight tab-separated fields.
	data_type -- label forwarded to ``insert_data``.
	sig_level -- threshold used only for the tally of significant records.

	Chromosome names from the file are prefixed with "chr" before lookup.
	A record without a CLNSIG entry in its INFO column raises KeyError.
	"""
	trees 	= {}
	# Running tallies; the visible code updates them but never returns them.
	n_records 		= 0.
	n_significant 	= 0.
	n_hits 			= 0.
	with open(FILE) as FH:
		for chrom in G:
			trees[chrom] = node.tree([(I.start, I.stop, I) for I in G[chrom]])
		for line in FH:
			if line[0] == "#":
				continue
			chrom, pos, ID, REF, ALT, QUAL, FILTER, INFO 	= line.strip("\n").split("\t")
			chrom 	= "chr" + chrom
			# Keep only well-formed key=value entries of the INFO column.
			info 	= {}
			for entry in INFO.split(";"):
				pieces 	= entry.split("=")
				if len(pieces) == 2:
					info[pieces[0]] = pieces[1]
			# The last "|" group, then the last ","-separated code, is the one used.
			sig 	= int(info["CLNSIG"].split("|")[-1].split(",")[-1])
			if sig != 255 and sig >= sig_level:
				n_significant += 1
			if chrom in trees:
				pos 	= int(pos)
				hits 	= trees[chrom].searchPoint(pos)
				if hits:
					for st, sp, I in hits:
						I.insert_data(pos, ID, data_type)
					n_hits += 1
			n_records += 1
Esempio n. 3
0
def load_peak_files(FILE, G, TYPE):
	"""Flag every interval in G with whether it overlaps a peak listed in FILE.

	FILE -- path of a tab-separated peak file whose first three columns are
	        chromosome, start and stop. May be empty/None or missing.
	G    -- dict mapping chromosome -> iterable of interval objects exposing
	        ``start`` and ``stop``.
	TYPE -- prefix of the attribute that is set: ``<TYPE>_peak``.

	When FILE is falsy or does not exist, every interval is flagged True and a
	notice is printed. Otherwise an interval is flagged True only if the peak
	tree of its chromosome reports an overlap, and False in every other case.
	"""
	attr 	= TYPE + "_peak"
	# A falsy FILE must short-circuit before os.path.exists, which rejects None.
	if not FILE or not os.path.exists(FILE):
		for chrom in G:
			for I in G[chrom]:
				setattr(I, attr, True)
		print("%s peak file, doesn't exist" % (FILE,))
		return

	A 	= {}
	with open(FILE) as FH:
		for line in FH:
			chrom, start, stop 	= line.strip("\n").split("\t")[:3]
			start, stop 		= int(start), int(stop)
			A.setdefault(chrom, list()).append((start, stop, ""))

	for chrom in G:
		if chrom not in A:
			# No peaks at all on this chromosome.
			for I in G[chrom]:
				setattr(I, attr, False)
			continue
		T 	= node.tree(A[chrom])
		for I in G[chrom]:
			setattr(I, attr, bool(T.searchInterval((I.start, I.stop))))
Esempio n. 4
0
    def _get_overlaps_pairwaise_tree(self, A, node_B):
        """Intersect each interval of A with the intervals stored in node_B.

        A      -- iterable of objects exposing ``start`` and ``stop``.
        node_B -- tree whose ``searchInterval((start, stop))`` yields
                  (start, stop, members) triples, ``members`` being iterable.

        Every hit contributes (max of the starts, min of the stops,
        members + [a]). Returns a node.tree built from the sorted list of
        those triples.
        """
        overlaps = list()
        for a in A:
            FINDS = node_B.searchInterval((a.start, a.stop))
            if FINDS:
                overlaps.extend(
                    (max(a.start, st), min(a.stop, sp), list(F) + [a]) for st, sp, F in FINDS
                )
        # One stable sort at the end yields the same order as re-sorting after every hit.
        overlaps.sort()

        return node.tree(overlaps)
Esempio n. 5
0
 def _build(self, LSTS, kwargs):
     """Populate this object from raw interval lists.

     LSTS   -- sequence of lists; each list is sorted in place and its items
               are wrapped in ``interval`` objects tagged with the list index
               (LID) and the position inside the sorted list (PID).
     kwargs -- mapping whose entries are copied onto ``self`` as attributes.

     After ``self._check_disjoint()`` has run, one node.tree per entry of
     ``self.intervals`` is appended to ``self.trees`` unless
     ``self.distjoint`` is set.
     """
     for attr_name, attr_value in kwargs.items():
         setattr(self, attr_name, attr_value)
     for list_id, members in enumerate(LSTS):
         members.sort()
         wrapped = []
         for position, member in enumerate(members):
             wrapped.append(interval(member, LID=list_id, PID=position))
         self.intervals.append(wrapped)
     self._check_disjoint()
     if self.distjoint:
         return
     for group in self.intervals:
         self.trees.append(node.tree([(iv.start, iv.stop, [iv]) for iv in group]))
Esempio n. 6
0
    def find_overlaps(self, *args, **kwargs):
        """Find regions shared by several interval lists, or intervals unique to one.

        Positional arguments are indices into ``self.intervals``, given either
        one by one or as a single sequence.

        Keyword arguments:
            isolate -- when true, return the intervals of list ``args[0]`` that
                       overlap nothing in any other list.
            ignore  -- indices of lists to leave out of the comparison.

        In the default mode at least two lists are required (AssertionError
        otherwise, also when a requested index is out of range or ignored).
        The result is whatever ``_get_overlaps_pairwase`` produces when
        ``self.distjoint`` is set, and a sorted list of ``overlap`` objects
        otherwise. In isolate mode the result is a list of intervals.
        """
        isolate = kwargs.get("isolate", False)
        ignore = kwargs.get("ignore", list())
        if not isolate:
            assert len(args) > 1 or len(args[0]) > 1, "interval_package: need at least two lists to compare"
            if len(args) == 1:
                args = args[0]
            searchable = [self.intervals[a] for a in args if a < len(self.intervals) and a not in ignore]
            assert len(searchable) == len(
                args
            ), "interval_package: one or more of your interval comparisons is not in comparison struct"

            if self.distjoint:
                for i in range(len(searchable) - 1):
                    if i == 0:
                        overlaps = self._get_overlaps_pairwase(searchable[i], searchable[i + 1])
                    else:
                        overlaps = self._get_overlaps_pairwase(searchable[i + 1], overlaps)
                O = overlaps
            else:
                trees = [self.trees[a] for a in args if a < len(self.intervals) and a not in ignore]
                for i in range(len(searchable) - 1):
                    if i == 0:
                        overlaps = self._get_overlaps_pairwaise_tree(searchable[i], trees[i + 1])
                    else:
                        # From the second round on, the accumulated overlaps are the tree.
                        overlaps = self._get_overlaps_pairwaise_tree(searchable[i + 1], overlaps)
                O = overlaps.get_all()
                O.sort()
                O = [overlap(o[0], o[1], overlaps=dict([(I, 1) for I in o[2]])) for o in O]
            return O

        a = args[0]
        if not self.trees:
            # Build one tree per interval list so that tree index == list index.
            # Skipping ignored lists here would misalign every later lookup.
            self.trees = [node.tree([(l.start, l.stop, [l]) for l in L]) for L in self.intervals]
        # Apply ``ignore`` at selection time, whether or not the trees were cached.
        trees = [t for i, t in enumerate(self.trees) if i != a and i not in ignore]
        DISTINCT = list()
        for A in self.intervals[a]:
            FOUND = False
            for t in trees:
                if t.searchInterval((A.start, A.stop)):
                    FOUND = True
                    break
            if not FOUND:
                DISTINCT.append(A)
        return DISTINCT
Esempio n. 7
0
def load_intervals(FILE):
	"""Load a four-column interval file into per-chromosome interval trees.

	Every line of FILE must hold exactly four tab-separated fields:
	chromosome, start, stop and name (ValueError otherwise).

	Returns a dict mapping chromosome -> node.tree built from the sorted
	(start, stop, name) tuples of that chromosome.
	"""
	by_chrom 	= {}
	with open(FILE) as FH:
		for record in FH:
			chrom, start, stop, name 	= record.strip("\n").split("\t")
			entry 	= (int(start), int(stop), name)
			by_chrom.setdefault(chrom, list()).append(entry)
	for chrom in by_chrom:
		rows 	= by_chrom[chrom]
		rows.sort()
		by_chrom[chrom] 	= node.tree(rows)
	return by_chrom
Esempio n. 8
0
def make_DNAse_searchable(FILE):
	"""Index the intervals of a DNAse file, each widened by 2000 on both sides.

	The first three tab-separated fields of every line are read as
	chromosome, start and stop.

	Returns a dict mapping chromosome -> node.tree built from the sorted
	(start - 2000, stop + 2000) pairs of that chromosome.
	"""
	padded 	= {}
	with open(FILE) as FH:
		for record in FH:
			chrom, start, stop 	= record.split("\t")[:3]
			window 	= (int(start) - 2000, int(stop) + 2000)
			padded.setdefault(chrom, list()).append(window)
	for chrom in padded:
		windows 	= padded[chrom]
		windows.sort()
		padded[chrom] 	= node.tree(windows)
	return padded
Esempio n. 9
0
def load_gene_counts(FILE):
    """Load a five-column gene file into per-chromosome interval trees.

    Every line of FILE must hold exactly five tab-separated fields: name,
    chromosome, start, stop and a fifth value that is kept as a string.

    Returns a dict mapping chromosome -> node.tree built from the sorted
    (start, stop, (name, fifth_value)) tuples of that chromosome.
    """
    genes = {}
    with open(FILE) as FH:
        for record in FH:
            name, chrom, start, stop, pos_neg = record.strip("\n").split("\t")
            entry = (int(start), int(stop), (name, pos_neg))
            genes.setdefault(chrom, list()).append(entry)
    for chrom in genes:
        rows = genes[chrom]
        rows.sort()
        genes[chrom] = node.tree(rows)
    return genes
def load_gene_counts(FILE ):
	"""Load gene intervals with their total absolute coverage.

	Every line of FILE must hold exactly five tab-separated fields: name,
	chromosome, start, stop and a comma-separated list of numbers. The
	numbers are reduced to the sum of their absolute values.

	Returns a dict mapping chromosome -> node.tree built from the sorted
	(start, stop, (name, coverage)) tuples of that chromosome.
	"""
	genes 	= {}
	with open(FILE) as FH:
		for record in FH:
			name, chrom, start, stop, cov 	= record.strip("\n").split("\t")
			total 	= sum(abs(float(value)) for value in cov.split(","))
			entry 	= (int(start), int(stop), (name, total))
			genes.setdefault(chrom, list()).append(entry)
	for chrom in genes:
		rows 	= genes[chrom]
		rows.sort()
		genes[chrom] 	= node.tree(rows)
	return genes
Esempio n. 11
0
def load_refseq(FILE):
	"""Load a RefSeq table into per-chromosome interval trees.

	The first line of FILE is skipped as a header. From every other line the
	tab-separated columns 1-5 (0-based) are read as name, chromosome, strand,
	start and stop.

	Returns a dict mapping chromosome -> node.tree built from
	(start, stop, name + "_" + strand) tuples, in file order.
	"""
	genes 	= {}
	with open(FILE) as FH:
		for line_no, record in enumerate(FH):
			if line_no == 0:
				continue  # header line
			name, chrom, strand, start, stop 	= record.split("\t")[1:6]
			label 	= name + "_" + strand
			genes.setdefault(chrom, list()).append((int(start), int(stop), label))
	for chrom in genes:
		genes[chrom] 	= node.tree(genes[chrom])
	return genes
Esempio n. 12
0
def read_bidireciotnal(FILE):
	"""Index the intervals of a bidirectional-call file by chromosome.

	Lines whose first character is "#" are skipped. The first three
	tab-separated fields of every other line are read as chromosome, start
	and stop; each kept line receives a running number starting at 0.

	Returns a dict mapping chromosome -> node.tree built from the sorted
	(start, stop, running_number) tuples of that chromosome.
	"""
	calls 	= {}
	counter = 0
	with open(FILE) as FH:
		for record in FH:
			if record[0] == "#":
				continue
			chrom, start, stop 	= record.split("\t")[:3]
			calls.setdefault(chrom, list()).append((int(start), int(stop), counter))
			counter += 1
	for chrom in calls:
		rows 	= calls[chrom]
		rows.sort()
		calls[chrom] 	= node.tree(rows)
	return calls
	def make_searchable(self):
		"""Build per-chromosome interval trees for both "unique to cell type" sets.

		``self.unique_to_cell_type_1`` and ``self.unique_to_cell_type_2`` are
		walked by key; each value must expose ``chrom``, ``start`` and ``stop``.
		The results are stored on ``self.unique_to_cell_type_1_searchable`` and
		``self.unique_to_cell_type_2_searchable`` as dicts mapping chromosome ->
		node.tree built from the sorted (start, stop, value) tuples.
		"""
		sources 	= (self.unique_to_cell_type_1, self.unique_to_cell_type_2)
		indexed 	= list()
		for source in sources:
			grouped 	= {}
			for key in source:
				site 	= source[key]
				grouped.setdefault(site.chrom, list()).append((site.start, site.stop, site))
			for chrom in grouped:
				rows 	= grouped[chrom]
				rows.sort()
				grouped[chrom] 	= node.tree(rows)
			indexed.append(grouped)
		self.unique_to_cell_type_1_searchable 	= indexed[0]
		self.unique_to_cell_type_2_searchable 	= indexed[1]
Esempio n. 14
0
def refseq(FILE):
	"""Index a +/- 1000 window around one end of every RefSeq row.

	The first line of FILE is skipped as a header. From every other line the
	tab-separated columns 1-5 (0-based) are read as name, chromosome, strand,
	start and stop. The window is centred on ``start`` for "+" strand rows
	and on ``stop`` for every other strand.

	Returns a dict mapping chromosome -> node.tree built from
	(centre - 1000, centre + 1000, strand) tuples, in file order.
	"""
	windows 	= {}
	with open(FILE) as FH:
		rows 	= iter(FH)
		next(rows, None)  # drop the header line
		for row in rows:
			name, chrom, strand, start, stop 	= row.split("\t")[1:6]
			centre 	= int(start) if strand == "+" else int(stop)
			windows.setdefault(chrom, list()).append((centre - 1000, centre + 1000, strand))
	for chrom in windows:
		windows[chrom] 	= node.tree(windows[chrom])
	return windows
def run(bidirfile, fimodir):
    """Relate FIMO motif hits to bidirectional sites, one FIMO output directory at a time.

    bidirfile -- path handed to ``Functions.create_site_bidir`` and
                 ``Functions.get_distances_pad_v3``.
    fimodir   -- directory whose entries containing 'fimo_out' are processed;
                 each must hold a ``fimo.cut.txt`` file.

    Returns ``(distances, bidirsites)``:
        distances  -- dict TF -> [KS-test p-value against an evenly spaced
                      sample, 1 - ndtr(z) for z = mean / (std / sqrt(N)),
                      0 if the mode lies strictly inside (-0.25, 0.25) else 1].
                      Only filled when the distance list is non-empty.
        bidirsites -- the mapping from ``create_site_bidir`` with one
                      ``(TF, [(centre offset, pval), ...])`` entry appended per
                      site and directory.
    """
    distances = dict()
    directorylist = [fimodir + '/' + item for item in os.listdir(fimodir) if 'fimo_out' in item]
    bidirsites = Functions.create_site_bidir(bidirfile)
    for item in directorylist:
        print(item)
        # NOTE(review): index 5 ties the TF name to a fixed path depth - confirm against callers.
        TF = item.split('/')[5].split('_')[0]
        fimofile = item + "/fimo.cut.txt"
        fimodict = Functions.create_tup_fimo(fimofile, True)
        # One tree per chromosome is enough; rebuilding it for every site is wasted work.
        fimotrees = dict()
        for key in bidirsites:
            start, stop, chrom = key
            if chrom not in fimotrees:
                fimotrees[chrom] = node.tree(fimodict[chrom])
            i = (start+stop)/2
            intervalsearch = []
            # Distinct loop names: reusing ``item`` here would clobber the directory path.
            for start2, stop2, pval in fimotrees[chrom].searchInterval(key):
                x = (start2+stop2)/2
                intervalsearch.append((i-x, pval))
            bidirsites[key].append((TF, intervalsearch))

        x = Functions.get_distances_pad_v3(bidirfile, fimofile, True, 1500)
        if len(x) != 0:
            lowest = min(x)
            highest = max(x)
            sigma = np.std(x)
            mu = np.mean(x)
            N = len(x)
            # Reference sample: N evenly spaced points over the observed range.
            y = np.linspace(lowest, highest, N)
            z = mu/(sigma/math.sqrt(N))
            p = 1 - scipy.special.ndtr(z)
            k = scipy.stats.ks_2samp(x, y)
            m = scipy.stats.mode(x)[0][0]
            if -0.25 < m < 0.25:
                m = 0
            else:
                m = 1
            distances[TF] = [k[1], p, m]

    return distances, bidirsites
def run(bidirfile, fimodir):
    """Collect, for every bidirectional site, the FIMO motif hits that overlap it.

    bidirfile -- path handed to ``Functions.create_site_bidir``.
    fimodir   -- directory whose entries containing 'fimo_out' are processed;
                 each must hold a ``fimo.cut.txt`` file.

    Returns the mapping from ``create_site_bidir`` with one
    ``(TF, [(centre offset, info), ...])`` entry appended per site and
    directory.
    """
    directorylist = [fimodir + '/' + item for item in os.listdir(fimodir) if 'fimo_out' in item]
    bidirsites = Functions.create_site_bidir(bidirfile)
    for item in directorylist:
        print(item)
        # NOTE(review): index 5 ties the TF name to a fixed path depth - confirm against callers.
        TF = item.split('/')[5].split('_')[0]
        fimodict = Functions.create_tup_fimo(item + "/fimo.cut.txt", True)
        # One tree per chromosome is enough; rebuilding it for every site is wasted work.
        fimotrees = dict()
        for key in bidirsites:
            start, stop, chrom = key
            if chrom not in fimotrees:
                fimotrees[chrom] = node.tree(fimodict[chrom])
            i = (start+stop)/2
            intervalsearch = []
            # Distinct loop names keep the outer ``item`` (the directory path) intact.
            for start2, stop2, info in fimotrees[chrom].searchInterval(key):
                x = (start2+stop2)/2
                intervalsearch.append((i-x, info))
            bidirsites[key].append((TF, intervalsearch))

    return bidirsites
Esempio n. 17
0
def insert_dbSNP(G, dbSNP_directory, data_type,test=None): #bed
	"""Attach dbSNP records from every BED file in a directory to overlapping intervals.

	G               -- dict mapping chromosome -> iterable of interval objects
	                   exposing ``start``, ``stop`` and
	                   ``insert_data(pos, ID, data_type)``.
	dbSNP_directory -- directory to scan; only entries whose name ends in
	                   "bed" are read. A trailing separator is optional.
	data_type       -- label forwarded to ``insert_data``.
	test            -- when not None, reading of a file stops after the first
	                   record that hits an interval.

	The first line of each file is skipped as a header; every other line must
	hold exactly six tab-separated fields (chrom, start, stop, ID, and two
	more that are not used).
	"""
	# Interval tree per chromosome, so each record is a lookup instead of a scan.
	A  	= {}
	for chrom in G:
		A[chrom] = node.tree([(I.start, I.stop, I) for I in G[chrom]])
	for FILE in os.listdir(dbSNP_directory):
		if not FILE.endswith("bed"):
			continue
		# os.path.join inserts the separator when the directory lacks a trailing one.
		with open(os.path.join(dbSNP_directory, FILE)) as FH:
			next(FH, None)  # header line
			for line in FH:
				chrom, start, stop, ID, zero, strand 	= line.strip("\n").split("\t")
				start, stop 	= int(start), int(stop)
				if chrom not in A:
					continue
				finds 	= A[chrom].searchInterval((start, stop))
				if finds:
					for st, sp, I in finds:
						I.insert_data(start, ID, data_type)
					if test is not None:
						break
def run(bidirectionalfile, DNAseFile):
    """Pair DNAse intervals with the bidirectional calls they overlap.

    bidirectionalfile -- path handed to ``Functions.parse_bidirfile``, which
                         must yield (chrom, start, stop, parameters) records.
    DNAseFile         -- path handed to ``Functions.create_tup_dict``; the
                         result is used as chromosome -> list of intervals.

    Returns a list of ``(value taken from the search result, interval size)``
    tuples, one per DNAse interval with a non-empty search result.
    """
    bidirdict = dict()
    for chrom, start, stop, parameters in Functions.parse_bidirfile(bidirectionalfile):
        # setdefault keeps the first record of each chromosome, which a bare
        # "create the list, else append" branch would silently drop.
        bidirdict.setdefault(chrom, []).append((start, stop, parameters))

    datapoints = []
    dnasedict = Functions.create_tup_dict(DNAseFile, False)
    for chrom in dnasedict:
        if chrom not in bidirdict:
            continue
        bidirtree = node.tree(bidirdict[chrom])
        for item in dnasedict[chrom]:
            bidirsite = bidirtree.searchInterval(item)
            if len(bidirsite) != 0:
                size = float(item[1]) - float(item[0])
                # NOTE(review): [2][6] indexes the third element of the search
                # result, then its seventh - confirm this matches what
                # searchInterval returns.
                datapoints.append((bidirsite[2][6], size))

    return datapoints
Esempio n. 19
0
def collect_all_ChIP_motif_hits(FILES, FHW,i):
	"""Write every ChIP motif hit to FHW and index the hits of the last file.

	FILES -- iterable of (path, model) pairs. The first line of each file is
	         skipped as a header; columns 1-3 (0-based) of every other line
	         are read as chromosome, start and stop.
	FHW   -- writable handle; receives one
	         ``chrom<TAB>start<TAB>stop<TAB>CM_<model>,<i>`` line per hit.
	i     -- running hit number, incremented once per hit across all files.

	Returns ``(G, i)`` where G maps chromosome -> node.tree built from the
	sorted (start - 1000, stop + 1000) pairs of the LAST file only (an empty
	dict when FILES is empty), and i is the updated running number.
	"""
	G 	= {}  # keeps the return value defined when FILES is empty
	for CM, MODEL in FILES:
		G 	= {}  # the index is restarted for every file
		with open(CM) as FH:
			next(FH, None)  # header line
			for line in FH:
				line_array 	= line.strip("\n").split("\t")
				chrom, start, stop 	= line_array[1], line_array[2], line_array[3]
				# Convert before writing so a malformed row leaves no partial output line.
				lo, hi 	= int(start) - 1000, int(stop) + 1000
				FHW.write(chrom + "\t" + start + "\t" + stop + "\tCM_" + MODEL + "," + str(i) + "\n")
				G.setdefault(chrom, list()).append((lo, hi))
				i += 1
		for chrom in G:
			G[chrom].sort()
			G[chrom] 	= node.tree(G[chrom])
	return G, i