def concatenateSimpleBed(fileList):
    """Pool the records of several bed inputs and merge them.

    Every entry of ``fileList`` is passed to SimpleBed; the records it yields
    are gathered into a single list, which is handed to mergeIntervals.

    Returns whatever mergeIntervals returns for the pooled list.
    """
    pooled = []
    for entry in fileList:
        pooled += SimpleBed(entry)
    return mergeIntervals(pooled)
def compareTwo(a, b, effectiveGenomeSize, validChrs):
    """Print the base-pair overlap of two bed inputs next to its expectation.

    Both inputs are loaded with SimpleBed (restricted through ``validChrs``),
    their overlap is obtained from determineOverlap and their sizes from
    numbBP.  For each input the function prints the percentage of its base
    pairs that is overlapped, together with the percentage expected from the
    other input's share of ``effectiveGenomeSize``, followed by the result of
    pValue.  Nothing is returned.

    Raises ZeroDivisionError when either input has zero base pairs or
    ``effectiveGenomeSize`` is zero.
    """
    print("Compare 2")
    aBed = SimpleBed(a, validChrs=validChrs)
    bBed = SimpleBed(b, validChrs=validChrs)

    print("Determine overlap")
    overlappingbp = determineOverlap(aBed, bBed)
    bpA = numbBP(aBed)
    bpB = numbBP(bBed)

    genomeSize = float(effectiveGenomeSize)
    print("A percent overlapped:" +
          str(100.0 * float(overlappingbp) / float(bpA)) +
          ", Expected:" + str(100.0 * float(bpB) / genomeSize))
    print("B percent overlapped:" +
          str(100.0 * float(overlappingbp) / float(bpB)) +
          ", Expected:" + str(100.0 * float(bpA) / genomeSize))

    # str() is required: concatenating a non-string pValue result to the
    # label would raise TypeError.
    print("Pvalue" + str(pValue(overlappingbp, bpA, bpB, effectiveGenomeSize)))
def __init__(self, build):
    """Load the gaps file of ``build`` and total the gap length per chromosome.

    The file ``~/mount/publicdata/<build>/gaps.<build>`` (with ``~`` expanded)
    is passed to the parent initialiser and is also loaded as a SimpleBed into
    ``self.individualgaps``.  ``self.chrmgaps`` maps each chromosome name to
    the sum of ``stop - start`` over its records.
    """
    gapsPath = os.path.expanduser(
        "~/mount/publicdata/" + build + "/gaps." + build)
    super(ChromosomeGaps, self).__init__(gapsPath)

    self.chrmgaps = collections.defaultdict(int)
    self.individualgaps = SimpleBed(gapsPath)
    for record in self.individualgaps:
        chrom, start, stop = record
        self.chrmgaps[chrom] += stop - start
def compareTwo(a, b, numbRandomRegions, cores):
    """Print how the overlap of ``a`` with ``b`` compares with random trials.

    Both inputs are loaded with SimpleBed.  doOverlap is mapped over
    ``numbRandomRegions`` copies of the ``(aBed, bBed)`` pair in a pool of
    ``cores`` worker processes.  The observed overlap comes from
    determineOverlap against a BedIntervalTree built from ``b``.  The function
    prints the average overlapping / non-overlapping lengths, then for the
    region count and for base pairs: the observed value, the random average,
    their fold change and the pValue result.  Nothing is returned.

    Raises ZeroDivisionError when there are no trials, when ``a`` is empty,
    or when a random average is zero.
    """
    aBed = SimpleBed(a)
    bBed = SimpleBed(b)
    params = (aBed, bBed)

    pool = Pool(processes=cores)
    try:
        result = pool.map(doOverlap, [params] * numbRandomRegions)
    finally:
        # Always shut the workers down so repeated calls do not leak processes.
        pool.close()
        pool.join()

    randomOverlap = []
    randomBPOverlap = []
    # Only consumed by the disabled 50%-overlap report below.
    randomPercentOverlap = []
    for r, rbp, rpercent, _, _ in result:
        randomOverlap.append(r)
        randomBPOverlap.append(rbp)
        randomPercentOverlap.append(rpercent)

    bTree = BedIntervalTree(b)
    (overlapping, overlappingbp, overlappingPercent, avgOverlappingLength,
     avgNonOverlappingLength) = determineOverlap(aBed, bTree)

    ##########
    # Region-count overlap
    ##########
    regionCount = len(aBed)
    randomAvg = sum(randomOverlap) / float(len(randomOverlap))

    print("Average Overlapping Length:" + str(avgOverlappingLength))
    print("Average NonOverlapping Length:" + str(avgNonOverlappingLength))
    print("")
    print("Overlap 1bp: " + str(overlapping) + " / " + str(regionCount) +
          " (" + str(100.0 * overlapping / float(regionCount)) + "%)")
    print("Overlap 1bp Random: " + str(randomAvg) + " / " + str(regionCount) +
          " (" + str(100.0 * randomAvg / float(regionCount)) + "%)")
    print("FC 1bp: " + str(overlapping) + " / " + str(randomAvg) + " = " +
          str(overlapping / float(randomAvg)))
    #print "Max FC 1bp:"+ str(len(aBed)) + " / " + str(random) + " = " + str(len(aBed)/float(random))
    print("Pvalues (corr/anticorr):" + str(pValue(overlapping, randomOverlap)))

    ##########
    # Disabled 50%-overlap report, kept for reference:
    #
    # randomPercent = sum(randomPercentOverlap)/float(len(randomPercentOverlap))
    #
    # print "Overlap 50%: " + str(overlappingPercent) + " / " + str(len(aBed)) + " ("+str(100.0*overlappingPercent/float(len(aBed)))+"%)"
    # print "Overlap 50% Random: " + str(randomPercent) + " / " + str(len(aBed)) + " ("+str(100.0*randomPercent/float(len(aBed)))+"%)"
    # print "FC 50%: "+str(overlappingPercent) + " / " + str(randomPercent) + " = " + str(overlappingPercent/float(randomPercent))
    #
    # #print "Max FC 1bp:"+ str(len(aBed)) + " / " + str(randomPercent) + " = " + str(len(aBed)/float(randomPercent))
    #
    # print "Pvalues (corr/anticorr):" + str(pValue(overlappingPercent,randomPercentOverlap))
    ##########

    print("")

    ##########
    # Base-pair overlap
    ##########
    # Computed once and reused for every base-pair line below.
    totalBP = numbBP(aBed)
    randombp = sum(randomBPOverlap) / float(len(randomBPOverlap))

    print("BP: " + str(overlappingbp) + " / " + str(totalBP) +
          " (" + str(100.0 * overlappingbp / float(totalBP)) + "%)")
    print("Overlap BP Random: " + str(randombp) + " / " + str(totalBP) +
          " (" + str(100.0 * randombp / float(totalBP)) + "%)")
    print("FC BP: " + str(overlappingbp) + " / " + str(randombp) + " = " +
          str(overlappingbp / float(randombp)))
    #print "Max FC BP:"+ str(numbBP(aBed)) + " / " + str(randombp) + " = " + str(numbBP(aBed)/float(randombp))
    print("Pvalues:" + str(pValue(overlappingbp, randomBPOverlap)))
for i in range(len(intervals)): for j in range(i, len(intervals)): if i == j: continue else: if (intervals[j].start < intervals[i].stop and intervals[i].start < intervals[j].stop): intervals[i].start = min(intervals[i].start, intervals[j].start) intervals[i].stop = max(intervals[i].stop, intervals[j].stop) del intervals[j] mergeIntervals(intervals) return allintervals = {} for arg in args: treatment = SimpleBed(arg) for (chr, start, stop) in treatment: interval = Interval(start, stop) if chr not in allintervals: allintervals[chr] = [] allintervals[chr].append(interval) for chr in allintervals: mergeIntervals(allintervals[chr]) treatments = {} for arg in args: treatments[arg] = BedTreatment(arg) prettyheader = ""
# Required inputs must have been supplied before any data is loaded.
assert outputfile is not None
assert genomeBuild is not None
assert replicateAnnotation is not None
assert replicateTypes is not None

# Warn when the annotations cannot be divided evenly among the replicate
# types (a remainder of zero is the balanced case).
if len(replicateAnnotation) % len(replicateTypes) != 0:
    print("Unbalanced length of replicant annotators...")
# Warn unless half the annotation count equals the number of types.
if (len(replicateAnnotation) // 2) != len(replicateTypes):
    print("Unbalanced counts of replicant annotators to replicant types...")

# Unsupported builds fall back to hg19 rather than aborting.
if genomeBuild not in ("hg19", "hg18", "mm9"):
    genomeBuild = "hg19"
    print("Genome build type unacceptable. Defaulting to genome hg19...")

print("Loading regions file into memory...")
regions = SimpleBed(regionsfile)

print("Loading methylation data into memory...")
methdata = PooledAggregateTree(methdatafile)

print("Loading genome into structure...")
genome = Genome(genomeBuild)


def methGetP(rep1, rep2):
    '''
    Calculate Fisher's exact test p-value from the methylated and
    unmethylated scores of two sets of replicates.

    ``rep1`` and ``rep2`` form the two rows of the contingency table passed
    to scipy.stats.fisher_exact; the odds ratio is discarded and only the
    p-value is returned.
    '''
    _, p = scipy.stats.fisher_exact([rep1, rep2])
    return p
def compareTwoFiles(a, b):
    """Return determineOverlap's result for ``a`` against ``b``.

    ``a`` is loaded with SimpleBed and ``b`` is turned into a
    BedIntervalTree before both are handed to determineOverlap.
    """
    return determineOverlap(SimpleBed(a), BedIntervalTree(b))
a = arg elif opt == "-b": b = arg elif opt == "-c": c = arg elif opt == "-d": d = arg elif opt == "-e": e = arg elif opt == "-n": numberTrials = int(arg) elif opt == "--cores": cores = int(arg) #Gets the overlap values aa = numbBP(SimpleBed(a)) bb = numbBP(SimpleBed(b)) cc = numbBP(SimpleBed(c)) dd = numbBP(SimpleBed(d)) ee = numbBP(SimpleBed(e)) ab, abBed = compareTwoFiles(a, b) ac, acBed = compareTwoFiles(a, c) ad, adBed = compareTwoFiles(a, d) ae, aeBed = compareTwoFiles(a, e) bc, bcBed = compareTwoFiles(b, c) bd, bdBed = compareTwoFiles(b, d) be, beBed = compareTwoFiles(b, e) cd, cdBed = compareTwoFiles(c, d) ce, ceBed = compareTwoFiles(c, e) de, deBed = compareTwoFiles(d, e)
# UCSC table browser - Mapping and Sequencing - Chromosome Bands
# Download all columns with exception of "gieStain"
gBandingPath = os.path.expanduser(
    "/mnt/50tb/publicdata/" + assembly + "/G-Banding/cytogenetic.map.bed")
gBanding = ExtendedBed(
    gBandingPath,
    defaultkeys=["chrm", "start", "stop", "band"],
    forcekeys=True)

chromosomeEnds = ChromosomeEnds(assembly)

###
###
###

# Output header: four fixed columns first ...
headerRow = ['id', 'chr', 'start', 'stop']

intervals = SimpleBed(infile)

# ... then the extra columns from the input file ...
headerRow += intervals.header[3:]

# ... then the fixed annotation columns.
headerRow += [
    'In genes',
    "In genes + promotor",
    "intergenic",
    "iswholelyinintron",
    "isinpromotor",
    "isindownstreampromotor",
    "isinupstreampromotor",
    'overlapsTSS',
    'In exons',
    'in.cpg',
]
# "Ebox Motif", "Ebox Motif - Canonical A", "Ebox Motif - Canonical G", "AP1 Motif","AP-2 Motif"])

# Each gene list contributes two columns and starts with an empty bound set.
for genelist in genelists:
    fullName = genelist.getFullName()
    headerRow += [fullName, fullName + "-promotoronly"]
    genelists_bound[fullName] = set()
import getopt

if __name__ == "__main__":
    # Usage: -a <assembly> -b <bed file> -o <output file>
    # Writes the input regions tab-separated, dropping regions that start
    # past the chromosome end and clipping stops to the chromosome end.
    try:
        opts, args = getopt.getopt(sys.argv[1:], "b:a:o:", [])
    except getopt.GetoptError as err:
        # print help information and exit:
        print(str(err))  # will print something like "option -a not recognized"
        sys.exit(2)

    assembly = None
    bedfile = None
    outputPath = None
    for o, a in opts:
        if o == "-a":
            assembly = a
        elif o == "-b":
            bedfile = SimpleBed(a)
        elif o == "-o":
            outputPath = a

    # Validate before touching the output file or loading chromosome ends;
    # a missing -a would otherwise surface as a NameError.
    if assembly is None or bedfile is None or outputPath is None:
        print("options -a, -b and -o are all required")
        sys.exit(2)

    ends = ChromosomeEnds(assembly)

    # The with-block guarantees the output is flushed and closed.
    with open(outputPath, "w") as outputHandle:
        outputfile = csv.writer(outputHandle, delimiter='\t')
        for (chrom, start, stop) in bedfile:
            chromEnd = ends[chrom]
            if int(start) > chromEnd:
                # Region lies entirely beyond the chromosome end.
                continue
            outputfile.writerow([chrom, start, str(min(chromEnd, int(stop)))])
def overlappingBP(start, stop, intervals):
    """Total how many base pairs of ``intervals`` fall between start and stop.

    For every interval the shared length is
    ``min(interval.end, stop) - max(interval.start, start)``; negative values
    (no overlap) count as 0.  The sum over all intervals is returned.
    """
    # how much of each interval is overlapped with start, stop
    return sum(
        max(min(interval.end, stop) - max(interval.start, start), 0)
        for interval in intervals)


# Process every input file named on the command line.
for infile in args:
    print(infile)
    intervals = SimpleBed(infile)
    print("Starting...")
    for row in intervals:
        chr, start, stop = row

        # 0 length regions
        if start == stop:
            continue

        size = stop - start

        # genes
        ingenes = mergeIntervals(genes.getIntervalsInRange(chr, start, stop))
excList = exclude[i]
print(intList)

# Test for excluded regions:
excludeUnionBed = None
excludeBp = 0  # stays 0 when nothing is excluded
if excList:
    # Concatenates the excluded bed list, gets the union of its regions and
    # outputs the union list to a file
    excludeUnionBed = intervalListToFile(
        sys.path[0], concatenateSimpleBed(excList))

if len(intList) == 1:
    # Single input: its weight is its own base-pair count.
    overlap = numbBP(SimpleBed(intList[0]))
    raweights.append(overlap)
    if excludeUnionBed is not None:
        excludeBp, _ = compareTwoFiles(intList[0], excludeUnionBed)
    weights.append(overlap - excludeBp)
elif len(intList) == 2:
    # Two inputs: the weight is their pairwise overlap.
    overlap, bed = compareTwoFiles(intList[0], intList[1])
    raweights.append(overlap)
    if excludeUnionBed is not None:
        excludeBp, _ = compareListAndFile(bed, excludeUnionBed)
    weights.append(overlap - excludeBp)
elif len(intList) > 2:
    # More than two inputs: start from the overlap of the first two.
    _, bed = compareTwoFiles(intList[0], intList[1])
# Single worker unless --cores says otherwise.
cores = 1

# Pick up the three input files and the numeric settings from the options.
for opt, arg in opts:
    if opt == "--cores":
        cores = int(arg)
    elif opt == "-n":
        numberTrials = int(arg)
    elif opt == "-c":
        c = arg
    elif opt == "-b":
        b = arg
    elif opt == "-a":
        a = arg

#Gets the overlap values
# Base-pair totals of each individual input
aa = numbBP(SimpleBed(a))
bb = numbBP(SimpleBed(b))
cc = numbBP(SimpleBed(c))

# Pairwise overlaps; only the a/b intervals are kept for the three-way step
ab, abBed = compareTwoFiles(a, b)
ac, _ = compareTwoFiles(a, c)
bc, _ = compareTwoFiles(b, c)

# Three-way overlap: the a/b overlap compared against c
abc, _ = compareListAndFile(abBed, c)

print("Raw:")
print("a\t" + str(aa))