Esempio n. 1
0
def _load_param_intervals(path, tag):
    """Parse one four-column, whitespace-separated interval file.

    Each non-comment line is expected to be ``chrom start stop param`` where
    ``param`` contains a '|' and the text after the first '|' is a
    comma-separated list of values.

    Returns a 3-tuple:
      * dict mapping 'chrom:start-stop' to that line's list of value strings,
      * list of (int(start), int(stop), tag, chrom) tuples,
      * the value list of the last parsed line (None if no line was parsed).
    """
    params_by_region = dict()
    tagged_intervals = list()
    last_params = None
    with open(path) as handle:
        for line in handle:
            # Lines starting with '#' are comments; blank lines would fail
            # the four-way unpack below, so they are skipped as well.
            if line.startswith('#') or not line.strip():
                continue
            chrom, start, stop, param = line.strip().split()
            last_params = param.split('|')[1].split(',')
            params_by_region[chrom + ':' + start + '-' + stop] = last_params
            tagged_intervals.append((int(start), int(stop), tag, chrom))
    return params_by_region, tagged_intervals, last_params


def run(file1,file2):
    """Scatter-plot each parameter of file1 against file2 over overlapping regions.

    Both files are parsed with ``_load_param_intervals`` (tags 'A' and 'B').
    Overlaps between the two interval lists are found with
    ``intervals.comparison``; only overlaps with at most two original
    intervals whose ``INFO[1]`` values are equal are used
    (presumably INFO holds the trailing tuple fields, i.e. (tag, chrom) --
    confirm against the intervals module).

    For every parameter index one density-coloured scatter plot is written to
    ``savedir + 'figure<index>.png'``.

    Returns:
        (x, y): the value lists of the last parameter index plotted, or two
        empty lists when no parameter was available.
    """
    d1, A, params1 = _load_param_intervals(file1, 'A')
    d2, B, params2 = _load_param_intervals(file2, 'B')

    ST = intervals.comparison((A,B))
    OVERLAPS_0_1 = ST.find_overlaps(0,1)
    print("Overlap Instances: " + str(len(OVERLAPS_0_1)))

    # The number of parameters is taken from the last parsed line (file2 if
    # it had data, otherwise file1), as before -- but now explicitly, so that
    # empty inputs no longer raise NameError.
    template = params2 if params2 is not None else params1
    n_params = len(template) if template is not None else 0

    # The set of usable overlaps does not depend on the parameter index, so
    # it is computed once instead of once per index.
    usable = list()
    for O in OVERLAPS_0_1:
        if len(O.overlaps) > 2:
            continue
        members = list(O.overlaps)
        if members[0].INFO[1] == members[1].INFO[1]:
            usable.append(members)

    x = list()
    y = list()
    for index in range(n_params):
        x = list()
        y = list()
        for members in usable:
            for interval_original in members:
                key = (interval_original.INFO[1] + ':' + str(interval_original.start)
                       + '-' + str(interval_original.stop))
                if 'A' in interval_original.INFO:
                    x.append(float(d1[key][index]))
                elif 'B' in interval_original.INFO:
                    y.append(float(d2[key][index]))

        F = plt.figure()
        xy = np.vstack([x,y])
        z = gaussian_kde(xy)(xy)
        plt.scatter(x,y,c=z,edgecolor="",s=14)
        plt.savefig(savedir + 'figure' + str(index) + '.png')
        # One figure is created per parameter; close it once saved so that
        # figures do not accumulate in memory.
        plt.close(F)

    return x,y
def _to_genome_coordinates(interval_dict):
    """Flatten a per-chromosome interval dict into one list of (start, stop).

    For every chromosome present in the module-level ``chromosomes`` list,
    each interval's start and stop are shifted by the summed ``sizes`` of all
    chromosomes that precede it in that list. Chromosomes not in
    ``chromosomes`` are ignored.
    """
    shifted = list()
    for chrom in interval_dict:
        if chrom in chromosomes:
            # The offset is the same for every interval on this chromosome.
            offset = sum(sizes[0:chromosomes.index(chrom)])
            for interval in interval_dict[chrom]:
                shifted.append((int(interval[0])+offset,int(interval[1])+offset))
    return shifted


def run(bidirlist,fimofile,chipfile,bidirdict):
    """Compare bidirectional sites with FIMO motif sites and ChIP sites.

    The FIMO and ChIP files are loaded through ``Functions`` and converted to
    single-axis coordinates with ``_to_genome_coordinates``; ``bidirlist`` is
    used as given. The three lists are then compared with
    ``intervals.comparison``.

    NOTE(review): the overlap results computed here are not yet used to
    populate ``bidirdict``; it is returned unchanged.

    Returns:
        bidirdict, as passed in.
    """
#====================================================================================================
#Create a list for each file (fimo, chip), converting (chr,start,stop) to one coordinate axis
#using the chromosome offsets

    fimolist = _to_genome_coordinates(Functions.create_tup_uncut_fimo2(fimofile, True))
    chiplist = _to_genome_coordinates(Functions.create_tup_dict(chipfile, False))

#====================================================================================================
#Using intervals, compare bidirectionals with fimo sites with chip sites

    # The lists are passed as a single tuple, matching the other
    # intervals.comparison call sites in this file.
    ST = intervals.comparison((bidirlist,fimolist,chiplist))
    AB_Overlaps = ST.find_overlaps(0,1)
    ABList = list()
    for O in AB_Overlaps:
        for interval_original in O.overlaps:
            ABList.append((interval_original.start,interval_original.stop,interval_original.INFO))
    BC_Overlaps = ST.find_overlaps(0,2)

    return bidirdict
Esempio n. 3
0
import intervals,load
#===================================================================
# TOY TEST DATA
# Every list holds tuples whose first two elements are the start and
# stop of an interval. Any further elements are optional and there
# may be any number of them.

A = [
    (1, 5, 'hello', 'hi', '+'),
    (4, 10, "A2"),
    (13, 15, "A3"),
    (32, 34, "A4"),
    (61, 68, "A5"),
]
B = [
    (1, 6),
    (7, 15, "B2"),
    (16, 17, "B3"),
    (62, 69, "B4"),
]
C = [
    (2, 6, "C1"),
    (18, 20, "C2"),
    (21, 23, "C3"),
    (25, 29),
    (31, 35),
]
D = [
    (2, 7, "D1"),
    (12, 17, "D2"),
    (61, 65, "D3"),
]

# The first step is to initialize the intervals data structure.
# It will figure out whether an interval tree or just a regular
# list comparison is the best algorithm.
ST = intervals.comparison((A, B, C, D))

#===================================================================
# There are three important methods that you can use.
# FIRST: ask for overlaps between any combination of the lists.

OVERLAPS_0_1 = ST.find_overlaps(0, 1)
OVERLAPS_0_1_2 = ST.find_overlaps(0, 1, 2)

#===================================================================
#ST.find_overlaps returns a list of instances of the overlap class.
#Each overlap instance has the start and stop location of the
#overlap event (accessed by overlap.start and overlap.stop).
#Each overlap instance also has an attribute called overlaps,
#which is a dictionary whose keys are the original intervals
#that overlap it. These intervals have the original start and stop
def _read_coverage_bed(path):
    """Read a four-column file into (int(start), int(stop), chrom, float(coverage)) tuples.

    Lines starting with '#' and blank lines are skipped.
    """
    records = list()
    with open(path) as handle:
        for line in handle:
            if line.startswith('#') or not line.strip():
                continue
            chrom, start, stop, coverage = line.strip().split()
            records.append((int(start),int(stop),chrom,float(coverage)))
    return records


def _gene_coverage(overlaps):
    """Return a dict gene -> strand-matched coverage for a list of overlaps.

    For each overlap, the (chrom, gene, strand) annotation is taken from the
    last original interval whose INFO has more than two fields. Coverage is
    then summed over the original intervals whose INFO has fewer than two
    fields and contains ``chrom``: positive INFO[0] values for '+' strands,
    negated negative INFO[0] values otherwise. A later overlap for the same
    gene replaces the earlier value.

    NOTE(review): the bed tuples built by the caller are
    (start, stop, chrom, coverage); if INFO holds the trailing tuple fields
    it would have length 2 and never satisfy ``len(INFO) < 2``, and INFO[0]
    would be the chromosome rather than the coverage. The conditions are kept
    exactly as they were -- confirm against the intervals module.
    """
    coverage = dict()
    for O in overlaps:
        chrom = gene = strand = None
        for interval_original in O.overlaps:
            if len(interval_original.INFO) > 2:
                chrom,gene,strand = interval_original.INFO
        if gene is None:
            # No annotation interval in this overlap. Previously the values
            # left over from an earlier overlap were silently reused.
            continue
        cov = 0
        for interval_original in O.overlaps:
            if len(interval_original.INFO) < 2 and chrom in interval_original.INFO:
                value = interval_original.INFO[0]
                if '+' in strand:
                    if value > 0:
                        cov += value
                elif value < 0:
                    cov += -value
        coverage[gene] = cov
    return coverage


def _region_ratios(TSScov, bedlist, TSSgene, END, ENDgene):
    """Return (TSS/TSSgene ratios, END/ENDgene ratios) for the genes in TSScov.

    Genes whose denominator coverage is zero are skipped instead of raising
    ZeroDivisionError.
    """
    TSSgenecov = _gene_coverage(intervals.comparison((TSSgene,bedlist)).find_overlaps(0,1))
    ENDcov = _gene_coverage(intervals.comparison((END,bedlist)).find_overlaps(0,1))
    ENDgenecov = _gene_coverage(intervals.comparison((ENDgene,bedlist)).find_overlaps(0,1))

    tss_ratios = list()
    end_ratios = list()
    for gene in TSScov:
        if gene in TSSgenecov and TSSgenecov[gene] != 0:
            tss_ratios.append(TSScov[gene]/TSSgenecov[gene])
        if gene in ENDcov and gene in ENDgenecov and ENDgenecov[gene] != 0:
            end_ratios.append(ENDcov[gene]/ENDgenecov[gene])
    return tss_ratios, end_ratios


def intervalSearch(bed1,bed2,TSS,TSSgene,END,ENDgene):
    """Histogram coverage ratios for two coverage files over four region sets.

    ``bed1`` and ``bed2`` are paths to four-column coverage files. ``TSS``,
    ``TSSgene``, ``END`` and ``ENDgene`` are interval lists that are compared
    against each coverage file with ``intervals.comparison``
    (presumably tuples of (start, stop, chrom, gene, strand) -- confirm
    against the caller).

    For each coverage file the per-gene ratios TSS/TSSgene and END/ENDgene
    are computed. Two figures are saved under ``savedir``:
    'Traveling Ratio' (TSS ratios) and "3' End Ratio" (END ratios), each
    overlaying the histogram for bed1 (label 'DMSO') and bed2 (label 'CA').

    Returns None.
    """
    bed1list = _read_coverage_bed(bed1)
    bed2list = _read_coverage_bed(bed2)

    # --- first coverage file (timed) ---------------------------------------
    # The timing messages reproduce the output of the former
    # ``print "text: ", value`` statements (text, one separating space, value).
    print("Performing First Interval Search...")
    start = time.time()
    OVERLAPS_TSS = intervals.comparison((TSS,bed1list)).find_overlaps(0,1)
    print("Finished First Interval Search in:  " + str(time.time() - start))
    start = time.time()
    TSScov = _gene_coverage(OVERLAPS_TSS)
    print("Finished first interval analysis in:  " + str(time.time() - start))
    X1, Y1 = _region_ratios(TSScov, bed1list, TSSgene, END, ENDgene)

    # --- second coverage file ------------------------------------------------
    OVERLAPS_TSS = intervals.comparison((TSS,bed2list)).find_overlaps(0,1)
    print("Finished First Interval Search")
    TSScov = _gene_coverage(OVERLAPS_TSS)
    X2, Y2 = _region_ratios(TSScov, bed2list, TSSgene, END, ENDgene)

    # --- plots -----------------------------------------------------------------
    # NOTE(review): labels are set but no legend is drawn; left as it was.
    bins = 100
    plt.figure()
    plt.hist(X1, bins, alpha=0.5, label='DMSO')
    plt.hist(X2, bins, alpha=0.5, label='CA')
    plt.savefig(savedir + 'Traveling Ratio')
    plt.figure()
    plt.hist(Y1, bins, alpha=0.5, label='DMSO')
    plt.hist(Y2, bins, alpha=0.5, label='CA')
    plt.savefig(savedir + "3' End Ratio")
def _overlap_info(overlaps):
    """Return the INFO of every original interval in ``overlaps`` whose INFO is not ''."""
    collected = list()
    for O in overlaps:
        for interval_original in O.overlaps:
            if not interval_original.INFO == '':
                collected.append(interval_original.INFO)
    return collected


def _centered_window(start, stop, halfwidth):
    """Return (mid-halfwidth, mid+halfwidth) where mid = (start+stop)/2.

    NOTE(review): ``/`` is kept as written; with integer coordinates it
    floors on Python 2 only.
    """
    mid = (start+stop)/2
    return (mid-halfwidth,mid+halfwidth)


def run(BidirFile, ChipFile, FimoFile, motiftoicutoff=100, windowsize=100):
    """Collect ChIP INFO values over background, motif-only and motif+bidir sites.

    Args:
        BidirFile: passed to ``Functions.create_tup_bidir``.
        ChipFile: passed to ``Functions.create_bedgraph_dict``.
        FimoFile: passed to ``Functions.create_tup_uncut_fimo2``.
        motiftoicutoff: half-width of the window each bidirectional site is
            collapsed to around its midpoint before motif comparison.
        windowsize: half-width of the window placed around each motif
            midpoint when looking at ChIP intervals; also passed to
            ``Functions.create_randomized_sites``.

    Returns:
        (BackgroundDict, FnoBDict, FandBDict), each keyed by chromosome with a
        list of INFO values of the non-'' original intervals found in the
        overlaps:
          * BackgroundDict: randomized sites vs. ChIP intervals,
          * FnoBDict: motif windows isolated from padded bidirectionals
            vs. ChIP intervals,
          * FandBDict: motif windows overlapping padded bidirectionals
            vs. ChIP intervals.
    """
    BidirDict = Functions.create_tup_bidir(BidirFile)
    ChipDict = Functions.create_bedgraph_dict(ChipFile, False)
    FimoDict = Functions.create_tup_uncut_fimo2(FimoFile, True)
    RandomizedDict = Functions.create_randomized_sites(windowsize)

    #Background: overlaps between the randomized sites and the ChIP intervals
    BackgroundDict = dict()
    for chrom in RandomizedDict:
        if chrom not in ChipDict:
            continue
        STBackground = intervals.comparison((RandomizedDict[chrom],ChipDict[chrom]))
        BackgroundDict[chrom] = _overlap_info(STBackground.find_overlaps(0,1))

    #Bulk of code. Populates dictionaries (key = chrom) with ChIP INFO values
    #for motif sites away from bidirectionals and for motif sites within
    #motiftoicutoff of a bidirectional midpoint
    FnoBDict = dict()
    FandBDict = dict()
    for chrom in FimoDict:
        if chrom not in BidirDict or chrom not in ChipDict:
            continue

        #Collapse every bidirectional site to mid+-motiftoicutoff
        BidirPaddedSites = [_centered_window(site[0], site[1], motiftoicutoff)
                            for site in BidirDict[chrom]]
        ChipSites = ChipDict[chrom]
        FimoSites = FimoDict[chrom]

        #Compare padded bidirectional sites with FIMO motif sites
        ST1 = intervals.comparison((BidirPaddedSites,FimoSites))

        #Motifs that do not overlap a bidirectional, expanded to
        #mid+-windowsize
        FnoBList = [_centered_window(I.start, I.stop, windowsize)
                    for I in ST1.get_isolated(1)]

        #Motifs that overlap a bidirectional, expanded to mid+-windowsize
        FandBList = list()
        for O in ST1.find_overlaps(0,1):
            for interval_original in O.overlaps:
                if not interval_original.INFO == '':
                    FandBList.append(_centered_window(interval_original.start,
                                                      interval_original.stop,
                                                      windowsize))

        #ChIP intervals over motif sites that do not overlap a bidir.
        #The lists are passed as one tuple, consistent with the other
        #intervals.comparison calls in this function.
        ST2 = intervals.comparison((FnoBList,ChipSites))
        FnoBDict[chrom] = _overlap_info(ST2.find_overlaps(0,1))

        #ChIP intervals over motif sites that overlap a bidir
        ST3 = intervals.comparison((FandBList,ChipSites))
        FandBDict[chrom] = _overlap_info(ST3.find_overlaps(0,1))

    return BackgroundDict, FnoBDict, FandBDict