Beispiel #1
0
def concatenateSimpleBed(fileList):
    """Pool the records of several BED files and merge them.

    Every entry of ``fileList`` is loaded through ``SimpleBed``; the records
    of all files are collected, in order, into a single list which is then
    handed to ``mergeIntervals``.

    Returns whatever ``mergeIntervals`` returns for the pooled list.
    """
    pooled = [record for path in fileList for record in SimpleBed(path)]
    return mergeIntervals(pooled)
Beispiel #2
0
def compareTwo(a, b, effectiveGenomeSize, validChrs):
    """Print base-pair overlap statistics for two BED inputs.

    Parameters:
        a, b: inputs accepted by ``SimpleBed``.
        effectiveGenomeSize: denominator used for the "Expected" percentages.
        validChrs: forwarded to ``SimpleBed`` as ``validChrs``.

    For each input the observed percentage of its base pairs that overlap
    the other input is printed next to the expected percentage, computed as
    the other input's base-pair count relative to ``effectiveGenomeSize``.
    The result of ``pValue`` is printed last.

    Returns None; everything is written to standard output.
    """
    print("Compare 2")
    aBed = SimpleBed(a, validChrs=validChrs)
    bBed = SimpleBed(b, validChrs=validChrs)

    print("Determine overlap")
    overlappingbp = determineOverlap(aBed, bBed)
    bpA = numbBP(aBed)
    bpB = numbBP(bBed)

    observedA = 100.0 * float(overlappingbp) / float(bpA)
    expectedA = 100.0 * float(bpB) / float(effectiveGenomeSize)
    print("A percent overlapped:" + str(observedA) + ", Expected:" +
          str(expectedA))

    observedB = 100.0 * float(overlappingbp) / float(bpB)
    expectedB = 100.0 * float(bpA) / float(effectiveGenomeSize)
    print("B percent overlapped:" + str(observedB) + ", Expected:" +
          str(expectedB))

    # str() is required here: concatenating a numeric p-value directly onto
    # the label raises TypeError.
    print("Pvalue" + str(pValue(overlappingbp, bpA, bpB, effectiveGenomeSize)))
Beispiel #3
0
    def __init__(self, build):
        """Load the gap annotation for ``build`` and total the gaps per chromosome.

        The gaps file is ``~/mount/publicdata/<build>/gaps.<build>`` (with
        ``~`` expanded).  It is passed to the parent initialiser and also
        loaded as a ``SimpleBed`` into ``self.individualgaps``.
        ``self.chrmgaps`` maps each chromosome to the sum of
        ``stop - start`` over its gap records.
        """
        gapsPath = os.path.expanduser("~/mount/publicdata/" + build +
                                      "/gaps." + build)
        super(ChromosomeGaps, self).__init__(gapsPath)

        self.chrmgaps = collections.defaultdict(int)
        self.individualgaps = SimpleBed(gapsPath)

        for (chrom, gapStart, gapStop) in self.individualgaps:
            gapLength = gapStop - gapStart
            self.chrmgaps[chrom] += gapLength
Beispiel #4
0
def compareTwo(a, b, numbRandomRegions, cores):
    """Print how the regions of BED input ``a`` overlap those of ``b``.

    The observed figures come from ``determineOverlap``; the "Random"
    figures are the mean over ``numbRandomRegions`` runs of ``doOverlap``,
    executed in a pool of ``cores`` worker processes.  Region counts and
    base-pair counts are each printed with a fold change (observed / random
    mean) and the result of ``pValue``.

    Parameters:
        a, b: inputs accepted by ``SimpleBed``; ``b`` is additionally loaded
            through ``BedIntervalTree``.
        numbRandomRegions: number of ``doOverlap`` runs.
        cores: number of worker processes.

    Returns None; all results are written to standard output.
    """
    randomOverlap = []
    randomBPOverlap = []
    randomPercentOverlap = []  # only consumed by the disabled 50% report below

    aBed = SimpleBed(a)
    bBed = SimpleBed(b)

    params = (aBed, bBed)
    pool = Pool(processes=cores)
    try:
        result = pool.map(doOverlap, [params] * numbRandomRegions)
    finally:
        # Always release the worker processes, even if a run failed.
        pool.close()
        pool.join()

    # Each run yields a 5-tuple; only the first three entries are used.
    for r, rbp, rpercent, _, _ in result:
        randomOverlap.append(r)
        randomBPOverlap.append(rbp)
        randomPercentOverlap.append(rpercent)

    bTree = BedIntervalTree(b)

    (overlapping, overlappingbp, overlappingPercent, avgOverlappingLength,
     avgNonOverlappingLength) = determineOverlap(aBed, bTree)

    ######### region-count based report

    # Named randomMean rather than "random" so the stdlib module name is not
    # shadowed inside this function.
    randomMean = sum(randomOverlap) / float(len(randomOverlap))
    numbRegions = len(aBed)

    print("Average Overlapping Length:" + str(avgOverlappingLength))
    print("Average NonOverlapping Length:" + str(avgNonOverlappingLength))

    print("")

    print("Overlap 1bp: " + str(overlapping) + " / " + str(numbRegions) +
          " (" + str(100.0 * overlapping / float(numbRegions)) + "%)")
    print("Overlap 1bp Random: " + str(randomMean) + " / " +
          str(numbRegions) + " (" +
          str(100.0 * randomMean / float(numbRegions)) + "%)")
    print("FC 1bp: " + str(overlapping) + " / " + str(randomMean) + " = " +
          str(overlapping / float(randomMean)))

    #print "Max FC 1bp:"+ str(len(aBed)) + " / " + str(random) + " = " + str(len(aBed)/float(random))

    print("Pvalues (corr/anticorr):" + str(pValue(overlapping, randomOverlap)))

    ########## disabled: 50%-overlap report

    #    print
    #
    #    ##########
    #
    #    randomPercent = sum(randomPercentOverlap)/float(len(randomPercentOverlap))
    #
    #    print "Overlap 50%: " + str(overlappingPercent) + " / " + str(len(aBed)) + " ("+str(100.0*overlappingPercent/float(len(aBed)))+"%)"
    #    print "Overlap 50% Random: " + str(randomPercent) + " / " + str(len(aBed)) + " ("+str(100.0*randomPercent/float(len(aBed)))+"%)"
    #    print "FC 50%: "+str(overlappingPercent) + " / " + str(randomPercent) + " = " + str(overlappingPercent/float(randomPercent))
    #
    #    #print "Max FC 1bp:"+ str(len(aBed)) + " / " + str(randomPercent) + " = " + str(len(aBed)/float(randomPercent))
    #
    #    print "Pvalues (corr/anticorr):" + str(pValue(overlappingPercent,randomPercentOverlap))
    #
    #    ##########

    print("")

    ########## base-pair based report

    randombp = sum(randomBPOverlap) / float(len(randomBPOverlap))
    totalBP = numbBP(aBed)  # computed once instead of once per use

    print("BP: " + str(overlappingbp) + " / " + str(totalBP) + " (" +
          str(100.0 * overlappingbp / float(totalBP)) + "%)")
    print("Overlap BP Random: " + str(randombp) + " / " + str(totalBP) +
          " (" + str(100.0 * randombp / float(totalBP)) + "%)")
    print("FC BP: " + str(overlappingbp) + " / " + str(randombp) + " = " +
          str(overlappingbp / float(randombp)))

    #print "Max FC BP:"+ str(numbBP(aBed)) + " / " + str(randombp) + " = " + str(numbBP(aBed)/float(randombp))

    print("Pvalues:" + str(pValue(overlappingbp, randomBPOverlap)))
Beispiel #5
0
        # Pairwise scan: compare interval i with every interval j at or after it.
        for i in range(len(intervals)):
            for j in range(i, len(intervals)):
                if i == j:
                    # Never compare an interval with itself.
                    continue
                else:
                    # Strict inequalities: two intervals that merely touch at
                    # an endpoint do not count as overlapping.
                    if (intervals[j].start < intervals[i].stop and intervals[i].start < intervals[j].stop):
                        # Grow interval i to the union of the pair, then drop j.
                        intervals[i].start = min(intervals[i].start,  intervals[j].start)
                        intervals[i].stop = max(intervals[i].stop,  intervals[j].stop)
                        del intervals[j]
                        # The list was mutated mid-scan, so restart from
                        # scratch via recursion and abandon this pass.
                        mergeIntervals(intervals)
                        return
  
    # Pool the intervals of every input BED file, grouped by chromosome.
    allintervals = {}
    for arg in args:
        for (chr, start, stop) in SimpleBed(arg):
            interval = Interval(start, stop)
            allintervals.setdefault(chr, []).append(interval)

    # Merge each chromosome's list; the return value is ignored, so the
    # merge is relied on to modify the list in place.
    for chr in allintervals:
        mergeIntervals(allintervals[chr])

    # One BedTreatment per input, keyed by the argument it was built from.
    treatments = dict((arg, BedTreatment(arg)) for arg in args)
    
    prettyheader = ""

    # These inputs must have been supplied before any data is loaded.
    assert outputfile is not None
    assert genomeBuild is not None
    assert replicateAnnotation is not None
    assert replicateTypes is not None

    # The annotation count should be a whole multiple of the number of
    # replicate types; warn only when a remainder is left over.  (The check
    # previously fired on a remainder of 0, i.e. for balanced input.)
    if len(replicateAnnotation) % len(replicateTypes) != 0:
        print("Unbalanced length of replicant annotators...")
    # Two annotations are expected per replicate type (integer division).
    if (len(replicateAnnotation) // 2) != len(replicateTypes):
        print("Unbalanced counts of replicant annotators to replicant types...")
    # Unsupported builds fall back to hg19 instead of aborting.
    if genomeBuild not in ("hg19", "hg18", "mm9"):
        genomeBuild = "hg19"
        print("Genome build type unacceptable. Defaulting to genome hg19...")

    print("Loading regions file into memory...")
    regions = SimpleBed(regionsfile)
    print("Loading methylation data into memory...")
    methdata = PooledAggregateTree(methdatafile)
    print("Loading genome into structure...")
    genome = Genome(genomeBuild)
    
    
    def methGetP(rep1, rep2):
        '''
        Return the p-value of Fisher's exact test for two replicate sets.

        rep1 and rep2 form the two rows of the contingency table passed to
        scipy.stats.fisher_exact; the odds ratio it also returns is discarded.
        '''
        contingency = [rep1, rep2]
        return scipy.stats.fisher_exact(contingency)[1]
        
def compareTwoFiles(a, b):
    """Compare BED input ``a`` against BED input ``b``.

    ``a`` is loaded as a ``SimpleBed`` and ``b`` as a ``BedIntervalTree``;
    both are handed to ``determineOverlap``, whose result is returned
    unchanged.
    """
    return determineOverlap(SimpleBed(a), BedIntervalTree(b))
            # Continuation of an option-parsing if/elif chain whose first
            # test lies above this excerpt.
            a = arg
        elif opt == "-b":
            b = arg
        elif opt == "-c":
            c = arg
        elif opt == "-d":
            d = arg
        elif opt == "-e":
            e = arg
        elif opt == "-n":
            # Numeric options are converted to int right away.
            numberTrials = int(arg)
        elif opt == "--cores":
            cores = int(arg)

    # Per-input totals: numbBP applied to each of the five inputs on its own.
    aa = numbBP(SimpleBed(a))
    bb = numbBP(SimpleBed(b))
    cc = numbBP(SimpleBed(c))
    dd = numbBP(SimpleBed(d))
    ee = numbBP(SimpleBed(e))

    # Pairwise comparisons for all 10 unordered pairs of the five inputs.
    # Each call yields two results: a value (ab, ac, ...) and a companion
    # object bound to the matching *Bed name.
    ab, abBed = compareTwoFiles(a, b)
    ac, acBed = compareTwoFiles(a, c)
    ad, adBed = compareTwoFiles(a, d)
    ae, aeBed = compareTwoFiles(a, e)
    bc, bcBed = compareTwoFiles(b, c)
    bd, bdBed = compareTwoFiles(b, d)
    be, beBed = compareTwoFiles(b, e)
    cd, cdBed = compareTwoFiles(c, d)
    ce, ceBed = compareTwoFiles(c, e)
    de, deBed = compareTwoFiles(d, e)
Beispiel #9
0
# UCSC table browser - Mapping and Sequencing - Chromosome Bands
# Download all columns with exception of "gieStain"
gBanding = ExtendedBed(
    os.path.expanduser(
        "/mnt/50tb/publicdata/" + assembly + "/G-Banding/cytogenetic.map.bed"),
    defaultkeys=["chrm", "start", "stop", "band"],
    forcekeys=True)

chromosomeEnds = ChromosomeEnds(assembly)

# Input regions; any columns beyond the first three are carried into the
# output header.
intervals = SimpleBed(infile)

# Header layout: fixed id/position columns, then the extra input columns,
# then the fixed annotation columns.
headerRow = (
    ['id', 'chr', 'start', 'stop']
    + list(intervals.header[3:])
    + [
        'In genes', "In genes + promotor", "intergenic", "iswholelyinintron",
        "isinpromotor", "isindownstreampromotor", "isinupstreampromotor",
        'overlapsTSS', 'In exons', 'in.cpg'
    ]
)
# "Ebox Motif", "Ebox Motif - Canonical A", "Ebox Motif - Canonical G", "AP1 Motif","AP-2 Motif"])

# Two columns per gene list (whole gene and promoter only), plus an empty
# set in genelists_bound keyed by the list's full name.
for genelist in genelists:
    headerRow += [
        genelist.getFullName(),
        genelist.getFullName() + "-promotoronly",
    ]
    genelists_bound[genelist.getFullName()] = set()
Beispiel #10
0
import getopt

if __name__ == "__main__":
    # Command-line entry point: clip the intervals of a BED file to the
    # chromosome ends of an assembly and write them tab-separated.
    #   -a ASSEMBLY   passed to ChromosomeEnds
    #   -b BEDFILE    input, loaded with SimpleBed
    #   -o OUTPUT     path of the tab-separated output file
    # All three options are required.

    try:
        opts, args = getopt.getopt(sys.argv[1:], "b:a:o:", [])
    except getopt.GetoptError as err:
        # print help information and exit:
        print(str(err))  # will print something like "option -a not recognized"
        sys.exit(2)

    assembly = None
    bedfile = None
    outputPath = None
    for o, a in opts:
        if o == "-a":
            assembly = a
        elif o == "-b":
            bedfile = SimpleBed(a)
        elif o == "-o":
            # Only remember the path here; the file is opened after
            # validation so a bad invocation cannot truncate it.
            outputPath = a

    # Validate before any option is used: a missing -a previously surfaced
    # as a NameError, ahead of the checks for the other options.
    required = (("-a", assembly), ("-b", bedfile), ("-o", outputPath))
    missing = [flag for flag, value in required if value is None]
    if missing:
        print("Missing required option(s): " + ", ".join(missing))
        sys.exit(2)

    ends = ChromosomeEnds(assembly)

    # The with-block guarantees the output file is flushed and closed.
    with open(outputPath, "w") as handle:
        outputfile = csv.writer(handle, delimiter='\t')
        for (chrom, start, stop) in bedfile:
            chromEnd = ends[chrom]
            # Intervals that start beyond the chromosome end are dropped.
            if int(start) > chromEnd:
                continue
            # Otherwise the stop coordinate is clipped to the chromosome end.
            outputfile.writerow([chrom, start, str(min(chromEnd, int(stop)))])
Beispiel #11
0

def overlappingBP(start, stop, intervals):
    """Return how many base pairs of ``intervals`` fall inside [start, stop).

    Each interval contributes the length of its intersection with the query
    range, computed from its ``start`` and ``end`` attributes; intervals
    that do not intersect the range contribute 0.  Contributions are summed
    per interval, so intervals that overlap each other are counted more
    than once.
    """
    return sum(
        max(min(region.end, stop) - max(region.start, start), 0)
        for region in intervals)


# Process every entry of args as one input file.
for infile in args:

    # Echo the name of the file being processed.
    print infile

    intervals = SimpleBed(infile)

    print "Starting..."

    for row in intervals:

        # Each row must unpack to exactly three fields.
        chr, start, stop = row

        # 0 length regions
        if start == stop:
            continue

        size = stop - start

        # genes
        # Intervals that genes reports for this range, merged into one list.
        ingenes = mergeIntervals(genes.getIntervalsInRange(chr, start, stop))
Beispiel #12
0
        # Exclusion list paired with the current index i.
        excList = exclude[i]
        
        print intList
        
        # Test for excluded regions: defaults used when nothing is excluded.
        excludeUnionBed = None
        excludeBp = 0
        
        if (len(excList)) >0:
            # Concatenates the excluded bed list and gets the union of its regions
            excludeUnionBed = concatenateSimpleBed(excList)
            # Outputs the union list to a file.  The name is rebound to the
            # helper's return value, which is later passed where a file
            # argument is expected - presumably a path; confirm against
            # intervalListToFile.
            excludeUnionBed = intervalListToFile(sys.path[0],excludeUnionBed)
        
        # Single input: raw weight is the numbBP total of that one file.
        if len(intList) == 1:
            overlap = numbBP(SimpleBed(intList[0]))
            raweights.append(overlap)
            if excludeUnionBed != None:
                excludeBp,_ = compareTwoFiles(intList[0],excludeUnionBed)
            # The final weight is the raw value minus the excluded part.
            weights.append(overlap - excludeBp)
            
        # Two inputs: raw weight is the overlap of the pair; the exclusion is
        # measured against the second result returned by that comparison.
        if len(intList) == 2:
            overlap,bed = compareTwoFiles(intList[0],intList[1])
            raweights.append(overlap)
            if excludeUnionBed != None:
                excludeBp,_ = compareListAndFile(bed,excludeUnionBed)
            weights.append(overlap - excludeBp)

        # More than two inputs: start from the first pair and keep only the
        # second result; the first one is discarded here.
        if len(intList) > 2:
            _,bed = compareTwoFiles(intList[0],intList[1])
            
Beispiel #13
0
    # Default used when --cores is not given.
    cores = 1

    # Copy the recognised options into local variables; numeric options are
    # converted to int right away.
    for opt, arg in opts:
        if opt == "-a":
            a = arg
        elif opt == "-b":
            b = arg
        elif opt == "-c":
            c = arg
        elif opt == "-n":
            numberTrials = int(arg)
        elif opt == "--cores":
            cores = int(arg)

    # Per-input totals: numbBP applied to each of the three inputs on its own.
    aa = numbBP(SimpleBed(a))

    bb = numbBP(SimpleBed(b))

    cc = numbBP(SimpleBed(c))

    # Pairwise comparisons.  Only the a/b comparison keeps its second
    # result, because it feeds the three-way comparison below.
    ab, abBed = compareTwoFiles(a, b)

    ac, _ = compareTwoFiles(a, c)

    bc, _ = compareTwoFiles(b, c)

    # Three-way value: the a/b result compared against c.
    abc, _ = compareListAndFile(abBed, c)

    print "Raw:"
    print "a\t" + str(aa)