def refseq_table(FILE, TSS=True, out_path="/Users/joazofeifa/Lab/genome_files/TSS.bed"):
    """Load a RefSeq-style table, dump it as BED and index it per chromosome.

    The first line of FILE is skipped as a header. From every following
    tab-separated line, columns 1-5 (0-based) are read as name, chromosome,
    strand, start and stop.

    Parameters:
        FILE: path of the tab-delimited input table.
        TSS: when true, each record becomes a window of +/-500 around
            ``start`` for "+" strand records and around ``stop`` for any
            other strand value; when false the window is ``(start, stop)``.
        out_path: where the ``chrom, start, stop, name`` lines are written.
            Defaults to the location that used to be hard-coded here.

    Returns:
        dict mapping chromosome to ``node.tree`` built from that
        chromosome's ``(start, stop, name)`` tuples.
    """
    G = {}
    with open(FILE) as FH:
        next(FH, None)  # skip the header line (no-op on an empty file)
        for line in FH:
            N, chrom, strand, start, stop = line.strip("\n").split("\t")[1:6]
            if chrom not in G:
                G[chrom] = list()
            if not TSS:
                G[chrom].append((int(start), int(stop), N))
                continue
            anchor = int(start) if strand == "+" else int(stop)
            G[chrom].append((anchor - 500, anchor + 500, N))

    # The context manager closes the output even if a write or the tree
    # construction raises part-way through.
    with open(out_path, "w") as FHW:
        for chrom in G:
            for st, sp, N in G[chrom]:
                FHW.write(chrom + "\t" + str(st) + "\t" + str(sp) + "\t" + N + "\n")
            G[chrom] = node.tree(G[chrom])
    return G
def insert_clinVarSNP(G, FILE, data_type, sig_level=4):  # vcf
    """Attach ClinVar variants from a VCF file to the intervals they fall in.

    Parameters:
        G: dict mapping chromosome to an iterable of objects exposing
            ``start``, ``stop`` and ``insert_data(pos, ID, data_type)``.
        FILE: path of the VCF file. Lines starting with "#" and blank lines
            are skipped; every other line must have exactly eight
            tab-separated columns.
        data_type: passed through unchanged to ``insert_data``.
        sig_level: minimum value of the last CLNSIG entry for a variant to
            be inserted; the value 255 is always excluded.

    Raises:
        KeyError: a record's INFO column has no ``CLNSIG=...`` entry.
        ValueError: a record does not have eight columns, or its CLNSIG or
            position is not an integer.
    """
    A = {}
    with open(FILE) as FH:
        for chrom in G:
            A[chrom] = node.tree([(I.start, I.stop, I) for I in G[chrom]])
        for line in FH:
            # Blank lines used to fall through to the 8-way unpack and crash.
            if not line.strip() or line[0] == "#":
                continue
            chrom, pos, ID, REF, ALT, QUAL, FILTER, INFO = line.strip("\n").split("\t")
            chrom = "chr" + chrom  # interval keys carry the "chr" prefix
            # Keep only INFO entries of the exact form key=value.
            info_array = dict(d.split("=") for d in INFO.split(";") if d.count("=") == 1)
            # Last "|"-separated group, then last ","-separated code.
            sig = int(info_array["CLNSIG"].split("|")[-1].split(",")[-1])
            if sig < sig_level or sig == 255:
                continue
            if chrom not in A:
                continue
            pos = int(pos)
            finds = A[chrom].searchPoint(pos)
            if finds:
                for st, sp, I in finds:
                    I.insert_data(pos, ID, data_type)
def load_peak_files(FILE, G, TYPE):
    """Flag every interval in G with whether it overlaps a peak from FILE.

    The flag is stored on each interval object as the attribute
    ``TYPE + "_peak"``.

    Parameters:
        FILE: path of a tab-separated peak file whose first three columns
            are chromosome, start and stop. When it is falsy or does not
            exist, every interval is flagged True and a notice is printed.
        G: dict mapping chromosome to an iterable of objects exposing
            ``start`` and ``stop``.
        TYPE: prefix of the attribute name to set.
    """
    attr = TYPE + "_peak"

    if not FILE or not os.path.exists(FILE):
        for chrom in G:
            for I in G[chrom]:
                setattr(I, attr, True)
        # Reported from inside this branch so that FILE=None is never handed
        # to os.path.exists a second time (that call raises TypeError).
        print("%s peak file, doesn't exist" % (FILE,))
        return

    A = {}
    with open(FILE) as FH:
        for line in FH:
            chrom, start, stop = line.strip("\n").split("\t")[:3]
            start, stop = int(start), int(stop)
            if chrom not in A:
                A[chrom] = list()
            A[chrom].append((start, stop, ""))

    for chrom in G:
        # Chromosomes absent from the peak file get no tree; all False.
        T = node.tree(A[chrom]) if chrom in A else None
        for I in G[chrom]:
            hit = T is not None and bool(T.searchInterval((I.start, I.stop)))
            setattr(I, attr, hit)
def _get_overlaps_pairwaise_tree(self, A, node_B):
    """Intersect each interval of A with the entries found in node_B.

    For every ``a`` in A, ``node_B.searchInterval((a.start, a.stop))`` is
    queried. Each hit ``(st, sp, F)`` contributes the clipped span
    ``(max(a.start, st), min(a.stop, sp))`` together with the members of F
    followed by ``a`` itself.

    Returns a ``node.tree`` built from the sorted list of those triples.
    """
    clipped = list()
    for query in A:
        hits = node_B.searchInterval((query.start, query.stop))
        if not hits:
            continue
        for lo, hi, members in hits:
            left = max(query.start, lo)
            right = min(query.stop, hi)
            clipped.append((left, right, list(members) + [query]))
    clipped.sort()
    return node.tree(clipped)
def _build(self, LSTS, kwargs):
    """Populate this object from raw interval lists and option values.

    Every key/value pair of ``kwargs`` is copied onto ``self`` as an
    attribute. Each list in LSTS is sorted in place and wrapped into
    ``interval`` objects tagged with the list index (``LID``) and the
    position inside the sorted list (``PID``); the wrapped lists are
    appended to ``self.intervals``. After ``self._check_disjoint()`` has
    run, one ``node.tree`` per entry of ``self.intervals`` is appended to
    ``self.trees`` unless ``self.distjoint`` is true.
    """
    for option in kwargs:
        setattr(self, option, kwargs[option])

    for list_id, members in enumerate(LSTS):
        members.sort()
        wrapped = []
        for pos_id, member in enumerate(members):
            wrapped.append(interval(member, LID=list_id, PID=pos_id))
        self.intervals.append(wrapped)

    self._check_disjoint()
    if self.distjoint:
        return

    for group in self.intervals:
        entries = [(iv.start, iv.stop, [iv]) for iv in group]
        self.trees.append(node.tree(entries))
def find_overlaps(self, *args, **kwargs):
    """Find overlaps between stored interval lists, or intervals unique to one.

    Positional arguments are indices into ``self.intervals``; they may be
    given separately or as a single sequence.

    Keyword arguments:
        isolate: when false (default), return what the selected lists have
            in common. When true, return the intervals of list ``args[0]``
            that overlap nothing in any other non-ignored list.
        ignore: indices of interval lists to leave out (default: none).

    Returns:
        Overlap mode: the result of folding the pairwise helper over the
        selected lists; in the tree-backed case each ``(start, stop,
        members)`` triple is wrapped into an ``overlap`` object.
        Isolate mode: a list of the interval objects unique to ``args[0]``.

    Raises:
        AssertionError: fewer than two lists were requested in overlap mode,
            or a requested index is out of range or listed in ``ignore``.
    """
    isolate = kwargs.get("isolate", False)
    ignore = kwargs.get("ignore", list())

    if not isolate:
        assert len(args) > 1 or len(args[0]) > 1, "interval_package: need at least two lists to compare"
        if len(args) == 1:
            args = args[0]
        wanted = [a for a in args if a < len(self.intervals) and a not in ignore]
        searchable = [self.intervals[a] for a in wanted]
        assert len(searchable) == len(
            args
        ), "interval_package: one or more of your interval comparisons is not in comparison struct"

        if self.distjoint:
            # First pair, then fold each further list into the running result.
            overlaps = self._get_overlaps_pairwase(searchable[0], searchable[1])
            for following in searchable[2:]:
                overlaps = self._get_overlaps_pairwase(following, overlaps)
            O = overlaps
        else:
            trees = [self.trees[a] for a in wanted]
            overlaps = self._get_overlaps_pairwaise_tree(searchable[0], trees[1])
            for following in searchable[2:]:
                overlaps = self._get_overlaps_pairwaise_tree(following, overlaps)
            O = overlaps.get_all()
            O.sort()
            # NOTE(review): block nesting was reconstructed from a
            # whitespace-collapsed source; the wrapping below is assumed to
            # apply to the tree-backed branch only - confirm.
            O = [overlap(o[0], o[1], overlaps=dict([(I, 1) for I in o[2]])) for o in O]
        return O

    a = args[0]
    if not self.trees:
        # Build one tree per interval list so that positions in self.trees
        # always line up with positions in self.intervals; ``ignore`` is
        # applied at selection time instead of being baked into the cache.
        self.trees = [node.tree([(l.start, l.stop, [l]) for l in L]) for L in self.intervals]
    trees = [self.trees[i] for i in range(len(self.intervals)) if i != a and i not in ignore]

    DISTINCT = list()
    for A in self.intervals[a]:
        span = (A.start, A.stop)
        if not any(t.searchInterval(span) for t in trees):
            DISTINCT.append(A)
    return DISTINCT
def load_intervals(FILE):
    """Read a four-column interval file into per-chromosome trees.

    Every line must hold exactly four tab-separated fields: chromosome,
    start, stop and name.

    Returns a dict mapping chromosome to ``node.tree`` built from the
    sorted ``(start, stop, name)`` tuples of that chromosome.
    """
    by_chrom = {}
    with open(FILE) as handle:
        for record in handle:
            chrom, start, stop, name = record.strip("\n").split("\t")
            entry = (int(start), int(stop), name)
            by_chrom.setdefault(chrom, list()).append(entry)
    for chrom, entries in list(by_chrom.items()):
        entries.sort()
        by_chrom[chrom] = node.tree(entries)
    return by_chrom
def make_DNAse_searchable(FILE):
    """Index the intervals of a tab-separated file, padded by 2000 per side.

    Only the first three fields of each line are used: chromosome, start
    and stop. Each stored entry is the two-tuple
    ``(start - 2000, stop + 2000)``.

    Returns a dict mapping chromosome to ``node.tree`` built from the
    sorted entries of that chromosome.
    """
    padded = {}
    with open(FILE) as handle:
        for record in handle:
            chrom, start, stop = record.split("\t")[:3]
            window = (int(start) - 2000, int(stop) + 2000)
            padded.setdefault(chrom, list()).append(window)
    for chrom, windows in list(padded.items()):
        windows.sort()
        padded[chrom] = node.tree(windows)
    return padded
def load_gene_counts(FILE):
    """Read a five-column gene table into per-chromosome trees.

    Every line must hold exactly five tab-separated fields: name,
    chromosome, start, stop and a fifth field that is stored verbatim next
    to the name.

    Returns a dict mapping chromosome to ``node.tree`` built from the
    sorted ``(start, stop, (name, fifth_field))`` tuples of that chromosome.
    """
    genes = {}
    with open(FILE) as handle:
        for record in handle:
            fields = record.strip("\n").split("\t")
            name, chrom, start, stop, pos_neg = fields
            entry = (int(start), int(stop), (name, pos_neg))
            genes.setdefault(chrom, list()).append(entry)
    for chrom, entries in list(genes.items()):
        entries.sort()
        genes[chrom] = node.tree(entries)
    return genes
def load_gene_counts(FILE):
    """Read a gene coverage table into per-chromosome trees.

    Every line must hold exactly five tab-separated fields: name,
    chromosome, start, stop and a comma-separated list of numbers. The
    list is reduced to the sum of the absolute values of its entries.

    Returns a dict mapping chromosome to ``node.tree`` built from the
    sorted ``(start, stop, (name, summed_coverage))`` tuples.
    """
    genes = {}
    with open(FILE) as handle:
        for record in handle:
            name, chrom, start, stop, raw_cov = record.strip("\n").split("\t")
            total = sum(abs(float(piece)) for piece in raw_cov.split(","))
            entry = (int(start), int(stop), (name, total))
            genes.setdefault(chrom, list()).append(entry)
    for chrom, entries in list(genes.items()):
        entries.sort()
        genes[chrom] = node.tree(entries)
    return genes
def load_refseq(FILE):
    """Index the records of a RefSeq-style table per chromosome.

    The first line is skipped as a header. From every other line,
    tab-separated columns 1-5 (0-based) are read as name, chromosome,
    strand, start and stop.

    Returns a dict mapping chromosome to ``node.tree`` built from
    ``(start, stop, name + "_" + strand)`` tuples in file order.
    """
    records = {}
    with open(FILE) as handle:
        for lineno, row in enumerate(handle):
            if lineno == 0:
                continue  # header line
            name, chrom, strand, start, stop = row.split("\t")[1:6]
            label = name + "_" + strand
            records.setdefault(chrom, list()).append((int(start), int(stop), label))
    for chrom in records:
        records[chrom] = node.tree(records[chrom])
    return records
def read_bidireciotnal(FILE):
    """Index the intervals of a tab-separated file, numbering them in order.

    Lines whose first character is "#" are skipped. The first three fields
    of every other line are read as chromosome, start and stop; the stored
    payload is a running 0-based counter over the accepted lines.

    Returns a dict mapping chromosome to ``node.tree`` built from the
    sorted ``(start, stop, counter)`` tuples of that chromosome.
    """
    sites = {}
    counter = 0
    with open(FILE) as handle:
        for record in handle:
            if record[0] == "#":
                continue
            chrom, start, stop = record.split("\t")[:3]
            sites.setdefault(chrom, list()).append((int(start), int(stop), counter))
            counter += 1
    for chrom, entries in list(sites.items()):
        entries.sort()
        sites[chrom] = node.tree(entries)
    return sites
def make_searchable(self):
    """Build per-chromosome trees for both cell-type-specific site maps.

    ``self.unique_to_cell_type_1`` and ``self.unique_to_cell_type_2`` are
    read as mappings whose values expose ``chrom``, ``start`` and ``stop``.
    For each mapping, a dict from chromosome to ``node.tree`` (built from
    the sorted ``(start, stop, site)`` tuples) is stored on
    ``self.unique_to_cell_type_1_searchable`` and
    ``self.unique_to_cell_type_2_searchable`` respectively.
    """
    def index_sites(sites):
        # Group the sites by chromosome, then turn each group into a tree.
        grouped = {}
        for key in sites:
            site = sites[key]
            grouped.setdefault(site.chrom, list()).append((site.start, site.stop, site))
        for chrom, entries in list(grouped.items()):
            entries.sort()
            grouped[chrom] = node.tree(entries)
        return grouped

    first = index_sites(self.unique_to_cell_type_1)
    second = index_sites(self.unique_to_cell_type_2)
    self.unique_to_cell_type_1_searchable = first
    self.unique_to_cell_type_2_searchable = second
def refseq(FILE):
    """Index strand-aware +/-1000 windows from a RefSeq-style table.

    The first line is skipped as a header. From every other line,
    tab-separated columns 1-5 (0-based) are read as name, chromosome,
    strand, start and stop. The window is centred on ``start`` for "+"
    strand records and on ``stop`` for any other strand value; the strand
    is the stored payload.

    Returns a dict mapping chromosome to ``node.tree`` built from the
    ``(centre - 1000, centre + 1000, strand)`` tuples in file order.
    """
    windows = {}
    with open(FILE) as handle:
        for lineno, row in enumerate(handle):
            if lineno == 0:
                continue  # header line
            name, chrom, strand, start, stop = row.split("\t")[1:6]
            if chrom not in windows:
                windows[chrom] = list()
            centre = int(start) if strand == "+" else int(stop)
            windows[chrom].append((centre - 1000, centre + 1000, strand))
    for chrom in windows:
        windows[chrom] = node.tree(windows[chrom])
    return windows
def run(bidirfile, fimodir):
    """Collect motif offsets per bidirectional site and summary stats per TF.

    Every entry of ``fimodir`` whose name contains 'fimo_out' is processed.
    The text before the first '_' of the entry name is used as the TF
    label, and ``<entry>/fimo.cut.txt`` is the motif file that is read.

    For each site key ``(start, stop, chrom)`` from
    ``Functions.create_site_bidir(bidirfile)``, a ``(TF, hits)`` pair is
    appended to that site's list, where ``hits`` holds
    ``(site_midpoint - motif_midpoint, pval)`` for every motif interval
    returned by the tree search.

    Returns:
        ``(distances, bidirsites)``. ``distances`` maps TF to
        ``[ks_pvalue, 1 - ndtr(z), m]`` and only has entries for TFs whose
        distance list from ``Functions.get_distances_pad_v3`` is non-empty.
    """
    distances = dict()
    directorylist = [fimodir + '/' + item for item in os.listdir(fimodir) if 'fimo_out' in item]
    bidirsites = Functions.create_site_bidir(bidirfile)
    for fimo_dir in directorylist:
        print(fimo_dir)
        # Take the label from the entry name itself rather than from a fixed
        # position in the path, so any depth of fimodir works.
        TF = os.path.basename(fimo_dir).split('_')[0]
        fimo_path = fimo_dir + "/fimo.cut.txt"
        fimodict = Functions.create_tup_fimo(fimo_path, True)

        trees = {}  # one tree per chromosome, built on first use
        for key in bidirsites:
            start, stop, chrom = key
            if chrom not in trees:
                trees[chrom] = node.tree(fimodict[chrom])
            site_mid = (start + stop) / 2
            intervalsearch = []
            # Distinct loop names: the directory path must survive this loop
            # because it is needed again for the distance computation.
            for start2, stop2, pval in trees[chrom].searchInterval(key):
                motif_mid = (start2 + stop2) / 2
                intervalsearch.append((site_mid - motif_mid, pval))
            bidirsites[key].append((TF, intervalsearch))

        x = Functions.get_distances_pad_v3(bidirfile, fimo_path, True, 1500)
        if len(x) != 0:
            lo = min(x)
            hi = max(x)
            sigma = np.std(x)
            mu = np.mean(x)
            N = len(x)
            # Evenly spaced reference sample spanning the observed range.
            y = np.linspace(lo, hi, N)
            z = mu / (sigma / math.sqrt(N))
            p = 1 - scipy.special.ndtr(z)
            k = scipy.stats.ks_2samp(x, y)
            # mode() yields an array on older SciPy and a scalar on newer
            # releases; atleast_1d accepts both.
            m = np.atleast_1d(scipy.stats.mode(x)[0])[0]
            m = 0 if -0.25 < m < 0.25 else 1
            distances[TF] = [k[1], p, m]
    return distances, bidirsites
def run(bidirfile, fimodir):
    """Attach motif hits from every FIMO output directory to each site.

    Every entry of ``fimodir`` whose name contains 'fimo_out' is processed.
    The text before the first '_' of the entry name is used as the TF
    label, and ``<entry>/fimo.cut.txt`` is the motif file that is read.

    For each site key ``(start, stop, chrom)`` from
    ``Functions.create_site_bidir(bidirfile)``, a ``(TF, hits)`` pair is
    appended to that site's list, where ``hits`` holds
    ``(site_midpoint - motif_midpoint, info)`` for every motif interval
    returned by the tree search.

    Returns:
        The ``bidirsites`` mapping with the appended pairs.
    """
    directorylist = [fimodir + '/' + item for item in os.listdir(fimodir) if 'fimo_out' in item]
    bidirsites = Functions.create_site_bidir(bidirfile)
    for fimo_dir in directorylist:
        print(fimo_dir)
        # Take the label from the entry name itself rather than from a fixed
        # position in the path, so any depth of fimodir works.
        TF = os.path.basename(fimo_dir).split('_')[0]
        fimodict = Functions.create_tup_fimo(fimo_dir + "/fimo.cut.txt", True)

        trees = {}  # one tree per chromosome, built on first use
        for key in bidirsites:
            start, stop, chrom = key
            if chrom not in trees:
                trees[chrom] = node.tree(fimodict[chrom])
            site_mid = (start + stop) / 2
            intervalsearch = []
            for start2, stop2, info in trees[chrom].searchInterval(key):
                motif_mid = (start2 + stop2) / 2
                intervalsearch.append((site_mid - motif_mid, info))
            bidirsites[key].append((TF, intervalsearch))
    return bidirsites
def insert_dbSNP(G, dbSNP_directory, data_type, test=None):  # bed
    """Attach dbSNP records from a directory of BED files to the intervals in G.

    Parameters:
        G: dict mapping chromosome to an iterable of objects exposing
            ``start``, ``stop`` and ``insert_data(pos, ID, data_type)``.
        dbSNP_directory: directory to scan; only entries whose name ends in
            "bed" are read. A trailing path separator is optional.
        data_type: passed through unchanged to ``insert_data``.
        test: when not None, reading of each file stops early (see the note
            at the break below).

    Each file's first line is skipped as a header; every other line must
    hold exactly six tab-separated fields (chromosome, start, stop, ID and
    two further fields that are not used).
    """
    # The intervals are wrapped into one tree per chromosome for searching.
    A = {}
    for chrom in G:
        A[chrom] = node.tree([(I.start, I.stop, I) for I in G[chrom]])

    for FILE in os.listdir(dbSNP_directory):
        if not FILE.endswith("bed"):
            continue
        # os.path.join works whether or not the directory ends with a slash.
        with open(os.path.join(dbSNP_directory, FILE)) as FH:
            next(FH, None)  # skip the header line
            for line in FH:
                chrom, start, stop, ID, zero, strand = line.strip("\n").split("\t")
                start, stop = int(start), int(stop)
                if chrom in A:
                    finds = A[chrom].searchInterval((start, stop))
                    if finds:
                        for st, sp, I in finds:
                            I.insert_data(start, ID, data_type)
                # NOTE(review): nesting reconstructed from a
                # whitespace-collapsed source; the break is assumed to end
                # the file after its first data record - confirm.
                if test is not None:
                    break
def run(bidirectionalfile, DNAseFile):
    """Pair a bidirectional-site parameter with the size of each DNAse site.

    Records from ``Functions.parse_bidirfile(bidirectionalfile)`` are
    unpacked as ``(chrom, start, stop, parameters)`` and indexed per
    chromosome. Every interval from
    ``Functions.create_tup_dict(DNAseFile, False)`` on a chromosome that
    also has bidirectional sites is searched against that index.

    Returns:
        list of ``(value, size)`` tuples, one per DNAse interval with a
        non-empty search result, where ``size`` is the interval's
        ``stop - start`` as a float.
    """
    bidirlist = Functions.parse_bidirfile(bidirectionalfile)
    bidirdict = dict()
    for chrom, start, stop, parameters in bidirlist:
        # setdefault keeps the first record of each chromosome too; an
        # if/else around the append silently dropped it.
        bidirdict.setdefault(chrom, []).append((start, stop, parameters))

    dnasedict = Functions.create_tup_dict(DNAseFile, False)
    datapoints = []
    for chrom in dnasedict:
        if chrom not in bidirdict:
            continue
        bidirtree = node.tree(bidirdict[chrom])
        for item in dnasedict[chrom]:
            bidirsite = bidirtree.searchInterval(item)
            if len(bidirsite) != 0:
                size = float(item[1]) - float(item[0])
                # NOTE(review): this indexing presumes searchInterval returns
                # a single (start, stop, parameters) hit here; if it returns
                # a list of hits this should likely be bidirsite[0][2][6] -
                # confirm against node.tree.
                datapoints.append((bidirsite[2][6], size))
    return datapoints
def collect_all_ChIP_motif_hits(FILES, FHW, i):
    """Write every motif hit to FHW and index the hits per chromosome.

    Parameters:
        FILES: iterable of ``(path, model_name)`` pairs. The first line of
            each file is skipped as a header; tab-separated columns 1-3
            (0-based) of every other line are chromosome, start and stop.
        FHW: writable text handle. One line
            ``chrom, start, stop, "CM_" + model_name + "," + str(i)`` is
            written per hit.
        i: running hit counter; incremented once per hit.

    Returns:
        ``(G, i)`` where G maps chromosome to ``node.tree`` built from the
        sorted ``(start - 1000, stop + 1000)`` windows of the hits from all
        files, and ``i`` is the updated counter. With no input files the
        result is ``({}, i)``.
    """
    # Created once, so hits from every file accumulate and an empty FILES
    # still returns a valid (empty) mapping.
    G = {}
    for CM, MODEL in FILES:
        with open(CM) as FH:
            next(FH, None)  # skip the header line
            for line in FH:
                line_array = line.strip("\n").split("\t")
                chrom, start, stop = line_array[1], line_array[2], line_array[3]
                # Convert before writing so a malformed record is not
                # emitted to FHW.
                window = (int(start) - 1000, int(stop) + 1000)
                FHW.write(chrom + "\t" + start + "\t" + stop + "\tCM_" + MODEL + "," + str(i) + "\n")
                G.setdefault(chrom, list()).append(window)
                i += 1
    for chrom in G:
        G[chrom].sort()
        G[chrom] = node.tree(G[chrom])
    return G, i