def testOverextendingIsochoresFail(self):
    '''Run the check with one workspace segment spanning all chromosomes.

    Fixture:
        annotations: one entry per "chr<N>" key, each a single segment
            covering [N * ws_size, (N + 1) * ws_size)
        workspace: a single segment covering [0, nworkspaces * ws_size)
        segments: regularly spaced segments of size 1 at density 0.001
            over the full workspace length

    The fold change is expected NOT to be different.
    '''
    ws_size = 1000000
    nworkspaces = 20
    total_size = nworkspaces * ws_size

    # one full-width annotation per chromosome-sized slice
    annotations = {
        "chr%i" % idx: gat.SegmentList(
            iter=[(idx * ws_size, (idx + 1) * ws_size)],
            normalize=True)
        for idx in range(nworkspaces)
    }

    workspacelist = gat.SegmentList(iter=[(0, total_size)], normalize=True)
    segmentlist = getRegularSegments(total_size, 1, 0.001)

    ss, aa, ww = self.addSingleIsochore(segmentlist,
                                        annotations,
                                        workspacelist)

    # NOTE(review): the label below names a different test than this
    # method - confirm whether that is intentional.
    self.check(ss, aa, ww,
               "testChromosomalBiasFail",
               fold_is_different=False)
def createSet(self):
    '''Build a deterministic set of segments plus its workspace.

    Segments are laid out left to right starting at the first workspace
    interval, separated by a single-residue gap so they do not merge.
    They are NOT randomly placed.  A segment may overhang a workspace
    interval; when that happens placement resumes at the next workspace
    interval that ends at or beyond the current position.

    Placement stops early when the workspace is exhausted, so fewer than
    ``self.nsegments`` segments may be returned.

    Returns a tuple ``(segments, workspace)`` of normalized
    ``gat.SegmentList`` objects.
    '''
    workspace = self.createWorkspace()
    assert len(workspace) > 0

    seg_len = self.segment_length
    # a segment may start up to this many residues before an interval
    lead_in = seg_len - 1

    ws_pos = 0
    start = max(0, workspace[ws_pos][0] - lead_in)
    placed = []

    for _ in range(self.nsegments):
        stop = start + seg_len
        placed.append((start, stop))

        # earliest start of the next segment: leave a one-residue gap
        next_free = stop + 1

        if next_free <= workspace[ws_pos][1]:
            # still inside the current workspace interval
            start = next_free
            continue

        # Ran past the current interval: move on, skipping every
        # interval that ends before the position reached so far
        # (handles segments straddling gaps or several intervals).
        ws_pos += 1
        while ws_pos < len(workspace) and workspace[ws_pos][1] < next_free:
            ws_pos += 1

        if ws_pos == len(workspace):
            # no workspace left to place segments in
            break

        start = max(workspace[ws_pos - 1][1] + 1,
                    workspace[ws_pos][0] - lead_in)

    return (gat.SegmentList(iter=placed, normalize=True),
            gat.SegmentList(iter=workspace, normalize=True))
def computeExpectedCoverage(self, samples):
    '''Compute the statistics that sampled segments are compared against.

    Returns a 6-tuple:
        nsegments: number of entries in ``self.segments`` (as float)
        segment_size: total size of ``self.segments`` (untruncated)
        segment_density: overlap of segments with the workspace divided
            by the workspace size
        expected_overlap: nucleotides of ``self.segments`` that fall
            inside ``self.workspace``
        expected_segment_length: ``segment_size / nsegments``
        expected_coverage:
            ``len(samples) * expected_overlap / workspace size``
    '''
    segment_count = float(len(self.segments))

    # nucleotides shared between workspace and segments
    clipped = gat.SegmentList(clone=self.workspace)
    clipped.intersect(self.segments)
    overlap = clipped.sum()

    # the full segment size is used here, not the clipped one
    total_segment_size = self.segments.sum()

    density = float(overlap) / self.workspace.sum()
    mean_segment_length = total_segment_size / segment_count
    coverage = len(samples) * overlap / float(self.workspace.sum())

    return (segment_count,
            total_segment_size,
            density,
            overlap,
            mean_segment_length,
            coverage)
def testLengthBiasFail(self):
    '''Build the fixture for the length-bias scenario.

    The workspace extends 100 bases beyond the regular annotations on
    either side and the segments are regularly spaced across it.
    '''
    # NOTE(review): nothing below calls self.check, so this method
    # currently asserts nothing - confirm whether that is intended.
    annotations, first, last = getRegularAnnotations()

    workspacelist = gat.SegmentList(
        iter=[(first - 100, last + 100)],
        normalize=True)

    segmentlist = getRegularSegments(
        workspacelist.sum(),
        self.segment_size,
        self.segment_density)
def testIntervalsPartialOverlap(self): '''test with intervals with increasing amount of overlap. ''' return workspaces, segments, annotations = \ gat.IntervalCollection("workspace"), \ gat.IntervalCollection("segment"), \ gat.IntervalCollection("annotation") workspace_size = 1000 size = 100 # workspace of size 1000000 workspaces.add( "default", "chr1", gat.SegmentList(iter=[ (0, workspace_size), ], normalize=True)) workspace = workspaces["default"] # segment of size 10 segments.add("default", "chr1", gat.SegmentList(iter=[ (0, size), ], normalize=True)) # annotations: a collection of segments. # overlap increases annotations.add( "full", "chr1", gat.SegmentList(iter=[ (y, size + y), ], normalize=True)) self.check(workspace, annotations, segments)
def testDefault(self):
    '''Regular segments over regular annotations: no fold difference.

    The workspace extends 100 bases beyond the annotations on either
    side; segments are regularly spaced across the whole workspace.
    '''
    annotations, first, last = getRegularAnnotations()

    workspacelist = gat.SegmentList(
        iter=[(first - 100, last + 100)],
        normalize=True)

    segmentlist = getRegularSegments(
        workspacelist.sum(),
        self.segment_size,
        self.segment_density)

    ss, aa, ww = self.addSingleIsochore(segmentlist,
                                        annotations,
                                        workspacelist)

    self.check(ss, aa, ww, "testDefaultPass", fold_is_different=False)
def getRegularSegments(workspace_size, size, density):
    '''Return regularly spaced segments of size *size* in a workspace.

    The number of segments is ``int(workspace_size * density / size)``.
    The part of the workspace not covered by segments is divided into
    ``nsegments + 2`` equal gaps (rounded down); the first segment
    starts one gap after position 0 and consecutive segments are one gap
    apart.

    Arguments:
        workspace_size: length of the workspace the segments are laid
            out in.
        size: length of each segment.
        density: fraction of the workspace to be covered by segments.

    Returns a normalized ``gat.SegmentList``.

    Raises AssertionError if the computed gap is not positive.
    '''
    nsegments = int(workspace_size * density / size)
    uncovered = workspace_size * (1.0 - density)

    # ``float // int`` yields a float.  Convert once, so that every
    # coordinate is an int rather than only the first start position.
    distance = int(uncovered // (nsegments + 2))
    assert distance > 0

    coordinates = []
    start = distance
    for _ in range(nsegments):
        coordinates.append((start, start + size))
        start += distance + size

    return gat.SegmentList(iter=coordinates, normalize=True)
def testCompositionBiasFail(self):
    '''Build isochores and incrementally spaced segments per block.

    The coordinate space is split into ``self.nisochores`` blocks, each
    holding ``self.nisochores`` consecutive isochores of
    ``self.isochore_size`` bases.  For every block a set of segments
    with increasing spacing is generated and shifted to the block start.
    '''
    # NOTE(review): nothing below calls self.check, so this method
    # currently asserts nothing - confirm whether that is intended.
    isochores = gat.SegmentList()
    # Collects the segments of all blocks.  It was never initialised
    # before, so the extend() call below raised NameError.
    segmentlist = gat.SegmentList()

    block_size = self.nisochores * self.isochore_size

    for block in range(self.nisochores):
        start = block * self.isochore_size * self.nisochores

        # consecutive isochores filling this block; a separate loop
        # variable avoids shadowing the block index
        position = start
        for _ in range(self.nisochores):
            isochores.add(position, position + self.isochore_size)
            position += self.isochore_size

        # segments have increasing spacing within isochores
        s = getIncrementallySpacedSegments(block_size,
                                           self.segment_size,
                                           self.segment_density)
        s.shift(start)
        segmentlist.extend(s)
def testAscertainmentBiasFail2(self):
    '''Segments placed only inside annotations: fold should differ.

    The workspace extends 100 bases beyond the annotations on either
    side, while the segments are generated within the annotations.
    '''
    annotations, first, last = getRegularAnnotations()

    workspacelist = gat.SegmentList(
        iter=[(first - 100, last + 100)],
        normalize=True)

    segmentlist = getRegularSegmentsInAnnotations(
        annotations,
        self.segment_size,
        self.segment_density)

    ss, aa, ww = self.addSingleIsochore(segmentlist,
                                        annotations,
                                        workspacelist)

    self.check(ss, aa, ww,
               "testAscertainmentBiasFail2",
               fold_is_different=True)
def testChromosomalBiasPass(self):
    '''Two isochores, segments only in the first: no fold difference.

    Both isochores get their own copy of the regular annotations and
    share the same workspace; the second isochore is given ``None``
    instead of a segment list.
    '''
    annotations1, first, last = getRegularAnnotations()
    # the second call returns the same bounds; they are simply rebound
    annotations2, first, last = getRegularAnnotations()

    workspacelist = gat.SegmentList(
        iter=[(first - 100, last + 100)],
        normalize=True)

    # segments for the first isochore only
    segmentlist = getRegularSegments(
        last - first + 200,
        self.segment_size,
        self.segment_density)

    ss, aa, ww = self.addIsochores(
        (segmentlist, None),
        (annotations1, annotations2),
        (workspacelist, workspacelist))

    self.check(ss, aa, ww,
               "testChromosomalBiasPass",
               fold_is_different=False)
def testAscertainmentBiasFail1(self):
    '''Workspace larger than the annotated region: fold should differ.

    The workspace runs from 100 bases before the annotations to
    ``2 * end + 100`` and regular segments are spread across all of it.
    '''
    # NOTE(review): these two locals are never read - the call below
    # uses self.segment_size / self.segment_density.  Confirm which
    # values were intended.
    segment_density = 0.1
    segment_size = 100

    annotations, first, last = getRegularAnnotations()

    # workspace twice as large as needed
    workspacelist = gat.SegmentList(
        iter=[(first - 100, 2 * last + 100)],
        normalize=True)

    segmentlist = getRegularSegments(
        workspacelist.sum(),
        self.segment_size,
        self.segment_density)

    ss, aa, ww = self.addSingleIsochore(segmentlist,
                                        annotations,
                                        workspacelist)

    self.check(ss, aa, ww,
               "testAscertainmentBiasFail1",
               fold_is_different=True)
def computeExpectedCoverageOld(self, samples):
    '''Older variant of the expected-coverage computation.

    Returns a 5-tuple:
        nsegments: number of entries in ``self.segments`` (as float)
        segment_overlap: nucleotides of ``self.segments`` that fall
            inside ``self.workspace``
        segment_density: ``segment_overlap / workspace size``
        segment_length: ``segment_overlap / nsegments``
        expected: ``len(samples) * segment_overlap / workspace size``
    '''
    nsegments = float(len(self.segments))

    # number of nucleotides in segments overlapping the workspace
    tmp = gat.SegmentList(clone=self.workspace)
    tmp.intersect(self.segments)
    segment_overlap = tmp.sum()

    workspace_size = float(self.workspace.sum())

    # density of segment nucleotides in workspace
    segment_density = segment_overlap / workspace_size

    # This line used to read the undefined name ``segment_size`` and so
    # always raised NameError.
    # NOTE(review): the overlap with the workspace is assumed to be the
    # intended numerator - confirm against the history of this method.
    segment_length = segment_overlap / nsegments

    expected = len(samples) * segment_overlap / workspace_size

    return (nsegments, segment_overlap, segment_density,
            segment_length, expected)
def getIncrementallySpacedSegments(workspace_size, size, density):
    '''Return segments of size *size* whose spacing grows along the range.

    The number of segments is ``int(workspace_size * density / size)``.
    The gap before each successive segment grows by a fixed increment,
    chosen as the uncovered part of the workspace divided by
    ``sum(range(nsegments + 2))`` and rounded down.

    Arguments:
        workspace_size: length of the workspace the segments are laid
            out in.
        size: length of each segment.
        density: fraction of the workspace to be covered by segments.

    Returns a normalized ``gat.SegmentList``.

    Raises AssertionError if the computed increment is not positive.
    '''
    nsegments = int(workspace_size * density / size)
    uncovered = workspace_size * (1.0 - density)
    ngaps = nsegments + 2

    # ``float // int`` yields a float.  Convert once, so that every
    # coordinate is an int rather than only the first start position.
    increment = int(uncovered // sum(range(ngaps)))
    assert increment > 0

    coordinates = []
    distance = increment
    start = distance
    for _ in range(nsegments):
        coordinates.append((start, start + size))
        # each gap is one increment wider than the previous one
        distance += increment
        start += distance + size

    return gat.SegmentList(iter=coordinates, normalize=True)
def testChromosomalBiasFail(self):
    '''One isochore, segments only in the first half: fold should differ.

    Two copies of the regular annotations are concatenated, the second
    copy shifted behind the first.  The workspace covers both copies
    while segments are generated for the first part only.

    This method previously could not run: it read ``workspacelist``
    before assigning it, called ``shift``/``extend`` on the annotation
    dictionaries instead of on their segment lists, and passed the
    undefined name ``annotations`` on.
    '''
    annotations1, start, end = getRegularAnnotations()
    annotations2, start, end = getRegularAnnotations()

    # Length of one annotated region including its 100-base flanks.
    # NOTE(review): used as the shift for the second copy because it is
    # the size the segments below are generated for - confirm.
    single_size = end - start + 200

    # concatenate annotations, one segment list per annotation key
    for key, second_copy in annotations2.items():
        second_copy.shift(single_size)
        annotations1[key].extend(second_copy)
        annotations1[key].normalize()

    workspacelist = gat.SegmentList(
        iter=[(start - 100, 2 * end + 100)],
        normalize=True)

    # segments only in first part of workspace
    segmentlist = getRegularSegments(single_size,
                                     self.segment_size,
                                     self.segment_density)

    ss, aa, ww = self.addSingleIsochore(segmentlist,
                                        annotations1,
                                        workspacelist)

    self.check(ss, aa, ww,
               "testChromosomalBiasFail",
               fold_is_different=True)
def getRegularAnnotations(
        sizes=(100, 200, 400, 800, 1600, 3200, 6400,
               128000, 256000, 512000),
        nsegments=20,
        distance=100):
    '''Build one annotation track per entry in *sizes*.

    Intervals are laid out on a single coordinate axis: in each of
    *nsegments* rounds, one interval per size is placed in turn, each
    followed by a gap of *distance* bases.  The first interval starts at
    *distance*.

    Returns a tuple ``(annotations, first, last)`` where ``annotations``
    maps ``"size%06i" % size`` to a normalized ``gat.SegmentList``,
    ``first`` is *distance* and ``last`` is the end of the last interval
    placed.
    '''
    per_size = [[] for _ in range(len(sizes))]

    position = distance
    for _ in range(nsegments):
        for bucket, size in zip(per_size, sizes):
            bucket.append((position, position + size))
            position += size + distance

    annotations = {
        "size%06i" % size: gat.SegmentList(iter=bucket, normalize=True)
        for size, bucket in zip(sizes, per_size)
    }

    return annotations, distance, position - distance
def validate(self, samples):
    '''Compare sampled segment lists against the expected statistics.

    Four checks are made, each emitting a warning via ``E.warn`` on
    failure:
        nucleotide: every sample overlaps the workspace by exactly the
            expected number of nucleotides
        overlap: the mean overlap deviates from the expectation by less
            than ``self.stringency_level`` (relative)
        average: the mean per-position coverage deviates from the
            expectation by less than ``self.stringency_level`` (relative)
        uniform: the standard deviation of the coverage, relative to the
            expected coverage, is below ``self.stringency_level``

    Each sample is normalized in place.

    Returns a tab-separated string with the four check results (as
    integers) followed by the deviations and the observed/expected
    values.
    '''
    # only density, overlap and coverage are needed here
    (_nsegments, _segment_size, segment_density, expected_overlap,
     _expected_segment_length,
     expected_coverage) = self.computeExpectedCoverage(samples)

    # actual coverage counts; the remaining return values are unused
    (counts_within_workspace, _segment_sizes,
     _starts, _ends) = computeSegmentDensityProfile(self.workspace, samples)

    # retained from the original code; the result is not used
    _sample_sums = [sample.sum() for sample in samples]

    # ---- nucleotides per sample ----
    nucleotide_ok = True
    overlaps = []
    for idx, sample in enumerate(samples):
        clipped = gat.SegmentList(clone=self.workspace, normalize=True)
        sample.normalize()
        clipped.intersect(sample)
        observed = clipped.sum()
        overlaps.append(observed)
        if observed == expected_overlap:
            continue
        nucleotide_ok = False
        E.warn("incorrect number of nucleotides in sample %i, got %i, expected %i, %s" %
               (idx, observed, expected_overlap, samples[idx]))

    threshold = self.stringency_level

    # ---- mean overlap ----
    average_overlap = numpy.mean(overlaps)
    average_overlap_d = abs(
        average_overlap - expected_overlap) / float(expected_overlap)
    overlap_ok = not (average_overlap_d >= threshold)
    if not overlap_ok:
        E.warn("expected_overlap counts (%f) != sampled counts (%f)" %
               (expected_overlap, average_overlap))

    # ---- mean coverage ----
    average_coverage = counts_within_workspace.mean()
    average_d = abs(average_coverage - expected_coverage) / \
        float(expected_coverage)
    average_ok = not (average_d >= threshold)
    if not average_ok:
        E.warn("expected_coverage counts (%f) != sampled counts (%f)" %
               (expected_coverage, counts_within_workspace.mean()))

    # ---- uniformity of coverage ----
    stddev = numpy.std(counts_within_workspace)
    uniform_d = stddev / float(expected_coverage)
    uniform_ok = not (uniform_d >= threshold)
    if not uniform_ok:
        E.warn("coverage variation too large : stddev (%f) / %f = %f > %f" %
               (stddev, expected_coverage, uniform_d, threshold))

    fields = ["%i" % flag
              for flag in (nucleotide_ok, average_ok,
                           uniform_ok, overlap_ok)]
    fields.extend("%6.4f" % value
                  for value in (average_d, uniform_d, segment_density,
                                average_overlap, expected_overlap,
                                average_coverage, expected_coverage))
    return "\t".join(fields)
def getRegularSegmentsInAnnotations(annotations, segment_size, segment_density):
    '''get a collection of segments overlapping with annotations.

    All annotation segment lists are merged into one normalized list.
    Regular segments are generated for a virtual workspace as long as
    the merged annotations and then mapped onto the merged annotation
    intervals in order.  A segment that does not fit into the remainder
    of the current annotation interval is split and continued in the
    next one.

    Arguments:
        annotations: dictionary of segment lists.
        segment_size: passed on to getRegularSegments as segment size.
        segment_density: passed on to getRegularSegments as density.

    Returns a normalized gat.SegmentList.
    '''
    merged = gat.SegmentList()
    for x, i in annotations.items():
        merged.extend(i)
    merged.normalize()
    workspace_size = merged.sum()
    # same gap computation as used when laying out the regular segments
    nsegments = int(workspace_size * segment_density / segment_size)
    rest = workspace_size * (1.0 - segment_density)
    # NOTE(review): float // int gives a float, so annotation_start
    # becomes a float once overhang is added to it below.
    distance = rest // (nsegments + 2)
    # get regularly placed segments in a virtual workspace
    segments = getRegularSegments(workspace_size, segment_size, segment_density)
    # n is never read; m indexes the current interval in merged
    n = 0
    m = 0
    # s collects the output coordinates
    s = []
    # remaining gap to skip before the next segment is placed
    overhang = distance
    annotation_start, annotation_end = merged[m]
    lannotation = annotation_end - annotation_start
    for start, end in segments:
        lsegment = end - start
        # place intergap segment, split if necessary
        # NOTE(review): overhang is only set before the loop and is 0
        # after the first pass, so this loop body runs for the first
        # segment only and later segments are placed without a gap -
        # confirm whether overhang should be reset per segment.
        while overhang > 0:
            if overhang < lannotation:
                annotation_start += overhang
                overhang = 0
                break
            # gap consumes the rest of this interval: go to the next one
            # NOTE(review): no bounds check on m is visible here.
            overhang -= lannotation
            m += 1
            annotation_start, annotation_end = merged[m]
            lannotation = annotation_end - annotation_start
        # annotation_start may have moved: recompute the remaining length
        lannotation = annotation_end - annotation_start
        # place segment - split if necessary
        while lsegment > 0:
            if lsegment < lannotation:
                s.append((annotation_start, annotation_start + lsegment))
                annotation_start += lsegment
                break
            # segment fills the rest of this interval: emit that part and
            # continue in the next interval
            # NOTE(review): no bounds check on m is visible here either.
            s.append((annotation_start, annotation_end))
            lsegment -= lannotation
            m += 1
            annotation_start, annotation_end = merged[m]
            lannotation = annotation_end - annotation_start
        # annotation_start may have moved: recompute the remaining length
        lannotation = annotation_end - annotation_start
    segments = gat.SegmentList(iter=s, normalize=True)
    # result is not used
    noverlap = segments.overlapWithSegments(merged)
    return segments