def update_observed_distribution(self, reads):
    """Add pairwise read distances to ``self.observed_distribution``.

    Reads whose position cannot be extracted are ignored. The remaining
    reads are grouped by flowcell, and within each group the distance in
    lane coordinates is measured between pairs of reads. Only the first
    200 reads of a group act as the first member of a pair; each of them
    is compared against every later read of the same group.

    Args:
        reads: iterable of ``(key, read, idx)`` triples; only ``read`` is
            used.
    """
    # Collect (position, read) pairs; the position is the
    # (flowcell, lane, surface, swath, tile, x, y) record of the read.
    located = []
    for _key, read, _idx in reads:
        position = tk_lane.extract_read_position(read)
        if position is not None:
            located.append((position, read))

    def by_flowcell(entry):
        return "%s" % (entry[0].flowcell)

    # groupby only merges adjacent items, so sort on the same key first.
    located.sort(key=by_flowcell)

    for _flowcell, members in itertools.groupby(located, by_flowcell):
        coords = [
            self.lane_coordinate_system.convert_to_lane_coords(position)
            for (position, _read) in members
        ]
        # Cap the number of "anchor" reads at 200 to bound the pair count.
        for i, anchor in enumerate(coords[:200]):
            for other in coords[i + 1:]:
                dist = math.sqrt((anchor[0]-other[0])**2 + (anchor[1] - other[1])**2)
                self.observed_distribution.increment(dist)
def estimate_flowcell_geometry(bam_in, lane_coordinate_system):
    """Estimate the x/y extent of the flowcell from the start of a BAM.

    At most 100000 reads are sampled from ``bam_in``. Each read position is
    converted to lane coordinates and folded into an overall extent and a
    per ``flowcell_lane`` extent. The smallest pairwise ``overlap()`` between
    the per-lane extents is then used to decide whether the lanes can be
    described by one geometry.

    Args:
        bam_in: read source supporting iteration and ``reset()``. It is
            reset before sampling and again afterwards.
        lane_coordinate_system: object providing
            ``convert_to_lane_coords(read_loc)``.

    Returns:
        ``{"x": (min, max), "y": (min, max)}`` for the overall extent, or
        ``None`` when several flowcells are present and the minimum lane
        overlap is below 0.9.
    """
    bam_in.reset()
    lane_extents = {}
    flowcells = set()
    result = XYrange()

    for read in itertools.islice(bam_in, 100000):
        read_loc = tk_lane.extract_read_position(read)
        if read_loc is None:
            # The sibling methods treat None as "no position available" and
            # skip the read; do the same instead of failing on None.flowcell.
            continue
        loc_coords = lane_coordinate_system.convert_to_lane_coords(read_loc)
        flowcells.add(read_loc.flowcell)
        key = read_loc.flowcell + '_' + read_loc.lane
        if key not in lane_extents:
            lane_extents[key] = XYrange()
        lane_extents[key].update(loc_coords)
        result.update(loc_coords)
    bam_in.reset()

    # Smallest overlap over every unordered pair of lane extents. Using
    # combinations() avoids indexing dict.keys(), which is a non-indexable
    # view on Python 3.
    min_overlap = 1.0
    for extent_a, extent_b in itertools.combinations(
            list(lane_extents.values()), 2):
        min_overlap = min(extent_a.overlap(extent_b), min_overlap)

    # Single-argument print(...) produces the same text on Python 2 and 3;
    # "%s %s" reproduces the space the old `print a, b` statement inserted.
    print("%s %s" % ("Flowcells : ", flowcells))
    for k, v in lane_extents.items():
        print("%s %s" % (k, v))
    print("%s %s" % ("Flowcell overlap = ", min_overlap))

    if len(flowcells) > 1:
        print("Multiple flowcells found")
        # NOTE(review): the source this was recovered from had lost its
        # indentation; the overlap check is assumed to apply only when
        # multiple flowcells are present -- confirm against history.
        if min_overlap < 0.9:
            print("Flowcells do not have identical geometry.")
            return None

    return {"x": (result.xrange.min, result.xrange.max),
            "y": (result.yrange.min, result.yrange.max)}
def count_dups_by_distance(self, namedreads):
    """Count number of nearby duplicates in a set of reads.

    A pair is counted as 1. Reads without an extractable position are
    ignored. The remaining reads are grouped by ``flowcell_lane``; within a
    group, each of the first 200 reads is compared with every later read and
    the later read's index is recorded when the lane-coordinate distance is
    below ``OPTICAL_DUPLICATE_DISTANCE`` (optical duplicate) or, if the
    layout reports diffusion duplicates, below
    ``MAX_DIFFUSION_DUP_DISTANCE`` (diffusion duplicate).

    Args:
        namedreads: iterable of ``(footprint, barcode, read)`` triples; only
            ``read`` is used.

    Returns:
        Tuple ``(opt_dups_found, diff_dups_found)``: the number of distinct
        flagged reads per group, summed over all groups.
    """
    # Get (flowcell, lane, surface, swath, tile, x, y) tuples for each read
    read_locs = []
    for (footprint, barcode, read) in namedreads:
        read_loc = tk_lane.extract_read_position(read)
        if read_loc is not None:
            read_locs.append((read_loc, read))

    # Sort by flowcell_lane (groupby below only merges adjacent items)
    def flowcell_lane(read_loc):
        return "%s_%s" % (read_loc[0].flowcell, read_loc[0].lane)

    read_locs.sort(key=flowcell_lane)
    lane_groups = itertools.groupby(read_locs, flowcell_lane)
    opt_dups_found = 0  # really close dupes
    diff_dups_found = 0  # somewhat close dupes

    # Measure distances between all pairs in a lane
    for (lane, lane_reads) in lane_groups:
        lane_reads = list(lane_reads)
        # The layout is looked up from the first read of the group only.
        layout = self.lane_coordinate_system.get_layout_for_read_loc(
            lane_reads[0][0])
        test_dups = layout.has_diffusion_duplicates(
            MAX_DIFFUSION_DUP_DISTANCE)
        if len(lane_reads) > 100:
            # Large groups are logged, described by their first read.
            martian.log_info("Got dup cluster of size: %d" % len(lane_reads))
            first_read = lane_reads[0][1]
            martian.log_info(
                "tid: %d, pos: %d, mapq: %d, seq: %s" %
                (first_read.reference_id, first_read.reference_start,
                 first_read.mapping_quality, first_read.query_sequence))

        # Indices (into lane_reads) of reads flagged in this group; sets so
        # that a read close to several others is counted once.
        opt_dups = set()
        diff_dups = set()
        dump = []
        # Only the first 200 reads act as the first member of a pair.
        cmp_reads = min(200, len(lane_reads))
        lane_loc_coords = [
            self.lane_coordinate_system.convert_to_lane_coords(loc)
            for (loc, _) in lane_reads
        ]
        for i in range(cmp_reads):
            loc1, read1 = lane_reads[i]
            lane_loc1 = lane_loc_coords[i]
            for j in range(i + 1, len(lane_reads)):
                loc2, read2 = lane_reads[j]
                lane_loc2 = lane_loc_coords[j]
                dist = math.sqrt((lane_loc1[0] - lane_loc2[0])**2 +
                                 (lane_loc1[1] - lane_loc2[1])**2)
                if test_dups and dist < MAX_DIFFUSION_DUP_DISTANCE:
                    diff_dups.add(j)
                    # NOTE(review): nesting reconstructed from a source that
                    # had lost its indentation -- confirm. As written, j was
                    # just added to diff_dups, so this condition is never
                    # true and `dump` stays empty; the membership test
                    # presumably belongs before the add().
                    if self.write_to_stdout and j not in diff_dups:
                        dump.append(
                            ("%d\t" + ("%d\t" * 14)) %
                            (dist, loc1.surface, loc1.swath, loc1.tile,
                             loc1.x, loc1.y, lane_loc1[0], lane_loc1[1],
                             loc2.surface, loc2.swath, loc2.tile, loc2.x,
                             loc2.y, lane_loc2[0], lane_loc2[1]))
                if dist < OPTICAL_DUPLICATE_DISTANCE:
                    opt_dups.add(j)
        # Emit the collected pair records, each prefixed with the number of
        # diffusion duplicates in the group.
        # NOTE(review): placement at group level is assumed -- confirm.
        if self.write_to_stdout and len(diff_dups) >= 2:
            for x in dump:
                print("%d\t%s" % (len(diff_dups), x))
        diff_dups_found += len(diff_dups)
        opt_dups_found += len(opt_dups)
    return opt_dups_found, diff_dups_found