def _load_param_intervals(path, tag):
    """Parse one four-column interval file for ``run``.

    Every data line is ``chrom start stop param`` (whitespace separated).
    The second ``|``-separated field of ``param`` is split on commas and kept
    as a list of strings.

    Returns a 3-tuple:
      * dict mapping ``"chrom:start-stop"`` to that list of value strings,
      * list of ``(int(start), int(stop), tag, chrom)`` tuples,
      * number of values on the last data line (0 when there were none).
    """
    params_by_region = {}
    tagged_intervals = []
    n_params = 0
    with open(path) as handle:
        for line in handle:
            # Comment lines start with '#'. Blank lines are skipped as well
            # because the four-way unpack below cannot handle them.
            if line.startswith('#') or not line.strip():
                continue
            chrom, start, stop, param = line.strip().split()
            values = param.split('|')[1].split(',')
            params_by_region[chrom + ':' + start + '-' + stop] = values
            tagged_intervals.append((int(start), int(stop), tag, chrom))
            n_params = len(values)
    return params_by_region, tagged_intervals, n_params


def run(file1,file2):
    """Scatter-plot each parameter of file1 intervals against overlapping file2 intervals.

    Both files are read with ``_load_param_intervals``; file1 intervals are
    tagged 'A' and file2 intervals 'B'. For every parameter index one figure
    is written to ``savedir + 'figure' + str(index) + '.png'`` with the 'A'
    value on the x axis and the 'B' value on the y axis, coloured by
    ``gaussian_kde`` evaluated at the points themselves.

    Only overlaps made of exactly one 'A' and one 'B' interval on the same
    chromosome contribute a point.

    Returns:
        (x, y): the value lists of the last parameter index plotted, or two
        empty lists when neither file contained a data line.
    """
    d1, A, n_params_1 = _load_param_intervals(file1, 'A')
    d2, B, n_params_2 = _load_param_intervals(file2, 'B')
    # The parameter count comes from whichever data line was parsed last:
    # file2's when it has any, otherwise file1's.
    n_params = n_params_2 or n_params_1

    ST = intervals.comparison((A,B))
    OVERLAPS_0_1 = ST.find_overlaps(0,1)
    # Single-argument call form prints the same text under Python 2 and 3.
    print("Overlap Instances: " + str(len(OVERLAPS_0_1)))

    x, y = [], []
    for index in range(n_params):
        x, y = [], []
        for O in OVERLAPS_0_1:
            # Pairwise comparison only: anything but two intervals is skipped.
            if len(O.overlaps) != 2:
                continue
            chroms = [member.INFO[1] for member in O.overlaps]
            if chroms[0] != chroms[1]:
                continue
            a_value = None
            b_value = None
            for member in O.overlaps:
                region = (member.INFO[1] + ':' + str(member.start)
                          + '-' + str(member.stop))
                if 'A' in member.INFO:
                    a_value = float(d1[region][index])
                elif 'B' in member.INFO:
                    b_value = float(d2[region][index])
            # Keep x and y aligned: a point needs one value from each file.
            if a_value is not None and b_value is not None:
                x.append(a_value)
                y.append(b_value)

        F = plt.figure()
        xy = np.vstack([x,y])
        z = gaussian_kde(xy)(xy)
        plt.scatter(x,y,c=z,edgecolor="",s=14)
        plt.savefig(savedir + 'figure' + str(index) + '.png')
        # pyplot keeps every figure alive until it is closed explicitly.
        plt.close(F)
    return x,y
def _to_genome_coordinates(sites_by_chrom):
    """Flatten per-chromosome sites into one list of genome-wide (start, stop).

    Each site's first two fields are converted to ``int`` and shifted by the
    summed ``sizes`` of all entries that precede its chromosome in the
    module-level ``chromosomes`` list. Chromosomes not present in
    ``chromosomes`` are ignored.
    """
    flattened = []
    for chrom in sites_by_chrom:
        if chrom not in chromosomes:
            continue
        # Loop-invariant for this chromosome, so compute it once.
        offset = sum(sizes[0:chromosomes.index(chrom)])
        for interval in sites_by_chrom[chrom]:
            flattened.append((int(interval[0]) + offset,
                              int(interval[1]) + offset))
    return flattened


def run(bidirlist,fimofile,chipfile,bidirdict):
    """Compare bidirectional calls with FIMO and ChIP sites in genome-wide coordinates.

    Args:
        bidirlist: interval list used as list 0 of the comparison.
        fimofile: path handed to ``Functions.create_tup_uncut_fimo2``.
        chipfile: path handed to ``Functions.create_tup_dict``.
        bidirdict: returned as-is; this function does not modify it.

    Returns:
        ``bidirdict``, unchanged.
    """
    #====================================================================================================
    #Load FIMO and ChIP sites and convert (chr,start,stop) to one offset list each
    fimodict = Functions.create_tup_uncut_fimo2(fimofile, True)
    fimolist = _to_genome_coordinates(fimodict)

    chipdict = Functions.create_tup_dict(chipfile, False)
    chiplist = _to_genome_coordinates(chipdict)

    #====================================================================================================
    #Compare bidirectionals (0) with fimo sites (1) and chip sites (2)
    # NOTE(review): the lists are passed as ONE tuple, matching the other
    # intervals.comparison call sites in this code base, e.g.
    # comparison((A,B,C,D)). Confirm against the intervals module.
    ST = intervals.comparison((bidirlist,fimolist,chiplist))

    AB_Overlaps = ST.find_overlaps(0,1)
    ABList = [
        (member.start, member.stop, member.INFO)
        for O in AB_Overlaps
        for member in O.overlaps
    ]
    BC_Overlaps = ST.find_overlaps(0,2)

    # NOTE(review): ABList and BC_Overlaps are computed but not yet used.
    return bidirdict
import intervals,load #=================================================================== #TOY TEST DATA #needs to be a list of tuples where the first two elements are the #start and stop of the intervals. The rest of the tuple can be there #or not and can be any size A = [(1,5, 'hello','hi','+'), (4,10, "A2"), (13,15, "A3"), (32, 34, "A4"), (61,68, "A5")] B = [(1,6), (7,15, "B2"), (16,17, "B3" ), (62,69, "B4") ] C = [(2,6, "C1"), (18,20, "C2"), (21,23, "C3"), (25, 29), (31, 35)] D = [(2,7, "D1"), (12, 17, "D2"), (61,65, "D3")] #first thing is the initialize the intervals data structure #this will figure out if an interval tree is the best #algorithm or just a regular list comparison ST = intervals.comparison((A,B,C, D) ) #=================================================================== #there are three important methods that you can use #FIRST ask for overlaps between any combination of the lists OVERLAPS_0_1 = ST.find_overlaps(0,1) OVERLAPS_0_1_2 = ST.find_overlaps(0,1,2) #=================================================================== #now ST.find_overlaps will return a list of overlap classes #the overlap class will have a start and stop location of the #overlap event (acessed by overlap.start and overlap.stop) #the overlap class will also have a attribute called overlaps #with is a dictionary where the key is the original intervals #that overlap it. These intervals have the original start and stop
def _read_coverage_bed(path):
    """Read a ``chrom start stop coverage`` file into (start, stop, chrom, coverage) tuples."""
    records = []
    with open(path) as handle:
        for line in handle:
            # Comment lines start with '#'. Blank lines are skipped as well
            # because the four-way unpack below cannot handle them.
            if line.startswith('#') or not line.strip():
                continue
            chrom, start, stop, coverage = line.strip().split()
            records.append((int(start), int(stop), chrom, float(coverage)))
    return records


def _strand_coverage_by_gene(overlaps):
    """Sum strand-matching coverage per gene over a list of overlap objects.

    In each overlap, the member whose INFO has more than two fields supplies
    ``chrom, gene, strand``. Members whose INFO has fewer than two fields and
    contains ``chrom`` supply the signal ``INFO[0]``: positive values are
    added for '+' strands, negated negative values otherwise.

    A later overlap for the same gene overwrites the earlier value.
    """
    # NOTE(review): the bed tuples built by _read_coverage_bed carry two
    # extra fields (chrom, coverage). If INFO holds everything after
    # start/stop, ``len(INFO) < 2`` never matches them; confirm against the
    # intervals module. The original test is kept unchanged here.
    coverage_by_gene = {}
    for O in overlaps:
        annotated = False
        for member in O.overlaps:
            if len(member.INFO) > 2:
                chrom, gene, strand = member.INFO
                annotated = True
        if not annotated:
            # Without an annotation there is no gene to credit; skipping
            # avoids reusing the previous overlap's chrom/gene/strand.
            continue
        cov = 0
        for member in O.overlaps:
            if len(member.INFO) < 2 and chrom in member.INFO:
                signal = member.INFO[0]
                if '+' in strand:
                    if signal > 0:
                        cov += signal
                elif signal < 0:
                    cov += -signal
        coverage_by_gene[gene] = cov
    return coverage_by_gene


def _region_coverage(regions, bedlist):
    """Overlap ``regions`` (list 0) with ``bedlist`` (list 1) and return per-gene coverage."""
    found = intervals.comparison((regions, bedlist)).find_overlaps(0, 1)
    return _strand_coverage_by_gene(found)


def _coverage_ratios(tss_cov, tss_gene_cov, end_cov, end_gene_cov):
    """Return (TSS/TSSgene ratios, END/ENDgene ratios) for the genes in ``tss_cov``.

    Genes whose denominator coverage is 0 are left out so they cannot raise
    ZeroDivisionError.
    """
    tss_ratios = []
    end_ratios = []
    for gene in tss_cov:
        if gene in tss_gene_cov and tss_gene_cov[gene] != 0:
            tss_ratios.append(tss_cov[gene] / tss_gene_cov[gene])
        if gene in end_cov and gene in end_gene_cov and end_gene_cov[gene] != 0:
            end_ratios.append(end_cov[gene] / end_gene_cov[gene])
    return tss_ratios, end_ratios


def intervalSearch(bed1,bed2,TSS,TSSgene,END,ENDgene):
    """Plot histograms of TSS/TSSgene and END/ENDgene coverage ratios for two samples.

    Args:
        bed1, bed2: paths to ``chrom start stop coverage`` files. Their
            histograms are labelled 'DMSO' and 'CA' respectively.
        TSS, TSSgene, END, ENDgene: interval lists handed to
            ``intervals.comparison``. Their intervals are expected to carry
            three extra fields, unpacked as ``chrom, gene, strand``.

    Writes two figures to ``savedir + 'Traveling Ratio'`` and
    ``savedir + "3' End Ratio"``. Returns None.
    """
    bed1list = _read_coverage_bed(bed1)
    bed2list = _read_coverage_bed(bed2)

    ###############################################################################
    # First sample (bed1). The progress messages are single-argument print
    # calls so the text is identical under Python 2 and Python 3.
    print("Performing First Interval Search...")
    start = time.time()
    OVERLAPS_TSS = intervals.comparison((TSS,bed1list)).find_overlaps(0,1)
    print("Finished First Interval Search in:  " + str(time.time() - start))
    start = time.time()
    TSScov = _strand_coverage_by_gene(OVERLAPS_TSS)
    print("Finished first interval analysis in:  " + str(time.time() - start))

    TSSgenecov = _region_coverage(TSSgene, bed1list)
    ENDcov = _region_coverage(END, bed1list)
    ENDgenecov = _region_coverage(ENDgene, bed1list)
    X1, Y1 = _coverage_ratios(TSScov, TSSgenecov, ENDcov, ENDgenecov)

    ###############################################################################
    # Second sample (bed2)
    OVERLAPS_TSS = intervals.comparison((TSS,bed2list)).find_overlaps(0,1)
    print("Finished First Interval Search")
    TSScov = _strand_coverage_by_gene(OVERLAPS_TSS)

    TSSgenecov = _region_coverage(TSSgene, bed2list)
    ENDcov = _region_coverage(END, bed2list)
    ENDgenecov = _region_coverage(ENDgene, bed2list)
    X2, Y2 = _coverage_ratios(TSScov, TSSgenecov, ENDcov, ENDgenecov)

    ###############################################################################
    bins = 100
    F = plt.figure()
    plt.hist(X1, bins, alpha=0.5, label='DMSO')
    plt.hist(X2, bins, alpha=0.5, label='CA')
    plt.savefig(savedir + 'Traveling Ratio')

    F2 = plt.figure()
    plt.hist(Y1, bins, alpha=0.5, label='DMSO')
    plt.hist(Y2, bins, alpha=0.5, label='CA')
    plt.savefig(savedir + "3' End Ratio")
def _centered_window(start, stop, half_width):
    """Return (mid - half_width, mid + half_width) for the midpoint of start/stop."""
    # '/' is kept as written: with integer coordinates it floors under
    # Python 2 and yields a float under Python 3.
    mid = (start + stop) / 2
    return (mid - half_width, mid + half_width)


def _chip_info_over(sites, chip_sites):
    """Collect the non-empty INFO of every interval taking part in a sites/ChIP overlap.

    ``sites`` is list 0 and ``chip_sites`` list 1 of the comparison. Intervals
    whose INFO equals '' are skipped.
    """
    comparison = intervals.comparison((sites, chip_sites))
    collected = []
    for O in comparison.find_overlaps(0, 1):
        for member in O.overlaps:
            if not member.INFO == '':
                collected.append(member.INFO)
    return collected


def run(BidirFile, ChipFile, FimoFile):
    """Collect ChIP INFO over background sites and over motifs outside/inside bidirectionals.

    Args:
        BidirFile: path handed to ``Functions.create_tup_bidir``.
        ChipFile: path handed to ``Functions.create_bedgraph_dict``.
        FimoFile: path handed to ``Functions.create_tup_uncut_fimo2``.

    Returns:
        (BackgroundDict, FnoBDict, FandBDict), each keyed by chromosome and
        holding a list of the INFO values collected from ChIP overlaps with:
          * BackgroundDict: the randomized sites,
          * FnoBDict: windows around motifs that overlap no padded bidirectional,
          * FandBDict: windows around motifs that overlap a padded bidirectional.
    """
    #Give cutoff to i for motif calling
    motiftoicutoff = 100
    #Give size of window to look at ChIP reads
    windowsize = 100

    BidirDict = Functions.create_tup_bidir(BidirFile)
    ChipDict = Functions.create_bedgraph_dict(ChipFile, False)
    FimoDict = Functions.create_tup_uncut_fimo2(FimoFile, True)
    RandomizedDict = Functions.create_randomized_sites(windowsize)

    #Background: ChIP INFO over the randomized sites, per chromosome
    BackgroundDict = dict()
    for chrom in RandomizedDict:
        if chrom in ChipDict:
            BackgroundDict[chrom] = _chip_info_over(RandomizedDict[chrom],
                                                    ChipDict[chrom])

    #Bulk of code. Populates dictionaries (key = chrom) with ChIP INFO for
    #motif sites away from bidirectionals and for motif sites overlapping
    #bidirectional calls (within motiftoicutoff of the bidirectional midpoint)
    FnoBDict = dict()
    FandBDict = dict()
    for chrom in FimoDict:
        if chrom not in BidirDict or chrom not in ChipDict:
            continue
        ChipSites = ChipDict[chrom]
        FimoSites = FimoDict[chrom]

        #Collapse each bidirectional site to mid+-motiftoicutoff
        BidirPaddedSites = [
            _centered_window(site[0], site[1], motiftoicutoff)
            for site in BidirDict[chrom]
        ]

        #Compare padded bidirectional sites (0) with FIMO motif sites (1)
        ST1 = intervals.comparison((BidirPaddedSites,FimoSites))

        #Motifs that do not overlap a bidirectional, expanded to mid+-windowsize
        FnoBList = [
            _centered_window(I.start, I.stop, windowsize)
            for I in ST1.get_isolated(1)
        ]

        #Motifs that overlap a bidirectional, expanded to mid+-windowsize.
        #Members with an empty INFO are skipped.
        FandBList = [
            _centered_window(member.start, member.stop, windowsize)
            for O in ST1.find_overlaps(0,1)
            for member in O.overlaps
            if not member.INFO == ''
        ]

        # The ChIP comparisons now receive one tuple of lists, like ST1 and
        # the background comparison; previously these two calls passed the
        # lists as separate positional arguments.
        FnoBDict[chrom] = _chip_info_over(FnoBList, ChipSites)
        FandBDict[chrom] = _chip_info_over(FandBList, ChipSites)

    return BackgroundDict, FnoBDict, FandBDict