def test_get_all_indices(self):
    # Check get_all_indices() for level 0, level 1 and None, using only the
    # first two targets from the test file (limit=2) loaded with levels=3.

    def int_set(csv_text):
        # Turn "1,2,3" into {1, 2, 3}
        return {int(token) for token in csv_text.split(",")}

    lim_targets = load_targets(TEST_FILE, levels=3, limit=2)

    # Level 0: a single index per loaded target.
    level_0 = set(lim_targets.get_all_indices(0))
    self.assertEqual(level_0, {1998850, 3178500})

    # Level 1: six indices per target.
    level_1 = set(lim_targets.get_all_indices(1))
    self.assertEqual(
        level_1,
        int_set(
            "1997278,1997279,1998849,1998851,2000420,2000421,"
            "3176929,3176930,3178499,3178501,3180071,3180072"
        ),
    )

    # None: the expected set contains the level 0 and level 1 indices above
    # plus further indices (presumably level 2 - i.e. every loaded level).
    everything = set(lim_targets.get_all_indices(None))
    self.assertEqual(
        everything,
        int_set(
            "1998850,1997278,1997279,1998849,1998851,2000420,"
            "2000421,1995707,1995708,1995709,1997277,1997280,"
            "1998848,1998852,2000419,2000422,2001991,2001992,"
            "2001993,3178500,3176929,3176930,3178499,3178501,"
            "3180071,3180072,3175357,3175358,3175359,3176928,"
            "3176931,3178498,3178502,3180070,3180073,3181641,"
            "3181642,3181643"
        ),
    )
def test_load_limit(self):
    # Loading with limit=2 must give exactly two targets, both according to
    # len() and when actually iterating over the collection.
    limited = load_targets(TEST_FILE, limit=2)
    self.assertEqual(len(limited), 2)

    #Test iteration over all targets
    seen = sum(1 for _ in limited)
    self.assertEqual(seen, 2)
def test_bad_add(self):
    # Exercise the failure modes of add_target().
    full_set = self.all_targets

    # A target of this shape is expected to be rejected outright.
    with self.assertRaises(Exception):
        full_set.add_target([(1, 2), (3, 4)])

    #This should complain about the number of levels
    with self.assertRaises(AssertionError):
        full_set.add_target([(111,), (112, 113, 114, 115)])

    #This should work, but only once
    two_level_set = load_targets(TEST_FILE, 2)
    two_level_set.add_target([(111,), (112, 113, 114, 115)])
    with self.assertRaises(Exception):
        two_level_set.add_target([(111,), (112, 113, 114, 115)])
def main():
    """Scan the requested lanes and tiles for duplicated wells and report them.

    Options come from parse_args(). For every lane/tile the sequences for all
    target wells are read from the BCL files over the requested cycle ranges.
    Then, for each target whose centre well passed the quality filter, the
    wells at each surrounding level are compared with the centre sequence and
    those within args.edit_distance are tallied. Results are handed to
    output_writer() once per lane.

    Raises:
        AssertionError: if a --tile_id pattern matches no expected tile, or a
            target has no wells at one of the requested levels.
    """
    global log

    # Setup options
    args = parse_args()
    if args.quiet:
        # Replace the module-level logger with a no-op for the rest of the run.
        log = lambda *_ignored: None

    lanes = args.lane.split(',') if args.lane else range(1, 8 + 1)

    max_tile = 24   # Works for HiSeq X
    max_swath = 22  # Works for X and 4000

    if args.stype == HISEQ_4000:
        max_tile = 28
    else:
        try:
            # A numeric stype encodes the geometry: the last two digits are
            # the tile count and the leading digits the highest swath number
            # (falling back to 22 when that part is zero).
            max_tile = int(args.stype) % 100
            max_swath = int(args.stype) // 100 or 22
        except ValueError:
            pass  # Never mind. Stick with 24/22.

    # Build a list of tiles we expect to see. Swaths for the older machines are [11, 12, 21, 22] but
    # in general and to handle the NovaSeq we can infer the list from the max_swath value.
    swaths = ['{}{}'.format(s, n)
              for s in range(1, max_swath // 10 + 1)
              for n in range(1, max_swath % 10 + 1)]
    tiles = ["%s%02d" % (swath, tile)
             for swath in swaths
             for tile in range(1, max_tile + 1)]

    # If tiles are specified check that all are valid.
    if args.tile_id:
        filtered_tiles = []
        for tpat in args.tile_id.split(','):
            t_match = [t for t in tiles if re.match('^' + tpat + '$', t)]
            if not t_match:
                # Report the offending *pattern*. The comprehension variable
                # is not visible here, so it must not be used in the message.
                # Raised explicitly (rather than via assert) so the check is
                # not stripped under "python -O"; the exception type is the
                # one an assert would have produced.
                raise AssertionError(
                    "%s matches no tile identifiers for a %s" % (tpat, args.stype))
            filtered_tiles.extend(t_match)
        tiles = sorted(set(filtered_tiles))

    # And set cycles based on either --start/--end or --cycles
    cycles = [(args.start, args.end)]
    if args.cycles:
        #Minimal validation - user will get cryptic messages on bad values
        cycles = []
        for cycle_range in args.cycles.split(','):
            s, e = cycle_range.split('-')
            cycles.append((int(s), int(e)))

    # Decide how we are calculating edit distances
    get_edit_distance = Levenshtein.hamming if args.hamming else Levenshtein.distance

    targets = load_targets(filename=args.coord_file,
                           levels=args.level + 1,
                           limit=args.sample_size)

    bcl_reader = bcl_direct_reader.BCLReader(args.run)

    for lane in lanes:
        lane_dupl = {}

        for tile in tiles:
            log("Reading tile %s in lane %s" % (tile, lane))
            tile_bcl = bcl_reader.get_tile(lane, tile)

            #This actually reads the sequence data from the BCL into RAM
            #Now we support ranges, we might have to do this two or more times.
            seq_objs = [tile_bcl.get_seqs(targets.get_all_indices(), *r)
                        for r in cycles]

            log("Got %i sequences from %i contiguous cycle ranges." % (
                sum(len(s) for s in seq_objs), len(seq_objs)))

            #Each entry in lane_dupl dict is a list of valid (ie. centre seq passed QC)
            #targets for this tile.
            lane_dupl[tile] = []

            for target in targets:
                center = target.get_centre()

                # if the center sequence does not pass the pass filter we don't assess edit distance
                # as large number of Ns compared to other reads with large number of Ns results in
                # small edit distance
                if not seq_objs[0][center][QUAL_FLAG]:
                    continue

                # Stitch the per-range reads back into one sequence.
                center_seq = ''.join(s[center][SEQUENCE] for s in seq_objs)

                #Add a placeholder for the new stats
                target_stats = [None] * args.level
                lane_dupl[tile].append(target_stats)

                for level in range(args.level):
                    #The level variable now runs from 0, but the target levels run from
                    #1 because 0 is the centre, so be careful!
                    dups = 0
                    well_indices = list(target.get_indices(level + 1))
                    assert len(well_indices) > 0

                    for well_index in well_indices:
                        well_seq = ''.join(s[well_index][SEQUENCE] for s in seq_objs)
                        dist = get_edit_distance(center_seq, well_seq)

                        #Log all the duplicates. This might get fairly large!
                        #Note that to locate the matching sequence header in a FASTQ file you need to
                        #convert the well number into co-ords. Eg for location 123456:
                        # $ dump_slocs.py datadir/Data/Intensities/s.locs | grep ^0123456
                        if dist <= args.edit_distance:
                            dups += 1
                            log("center seq at {:>07}: {}".format(center, center_seq))
                            log("well seq at {:>07}: {}".format(well_index, well_seq))
                            log("edit distance: {}".format(dist))

                    #Save a tuple of (TALLY, LENGTH)
                    target_stats[level] = (dups, len(well_indices))

        #Write output per lane
        output_writer(lane, len(targets), lane_dupl, verbose=not args.summary_only)
def main():
    """Scan the requested lanes and tiles for duplicated wells and report them.

    Options come from parse_args(). For every lane/tile the sequences for all
    target wells are read from the BCL files over the requested cycle ranges.
    Then, for each target whose centre well passed the quality filter, the
    wells at each surrounding level are compared with the centre sequence and
    those within args.edit_distance are tallied. Results are handed to
    output_writer() once per lane.

    Raises:
        AssertionError: if a --tile_id pattern matches no expected tile, or a
            target has no wells at one of the requested levels.
    """
    global log

    # Setup options
    args = parse_args()
    if args.quiet:
        # Replace the module-level logger with a no-op for the rest of the run.
        log = lambda *_ignored: None

    lanes = args.lane.split(',') if args.lane else range(1, 8 + 1)

    max_tile = 24   # Works for HiSeq X
    max_swath = 22  # Works for X and 4000

    if args.stype == HISEQ_4000:
        max_tile = 28
    else:
        try:
            # A numeric stype encodes the geometry: the last two digits are
            # the tile count and the leading digits the highest swath number
            # (falling back to 22 when that part is zero).
            max_tile = int(args.stype) % 100
            max_swath = int(args.stype) // 100 or 22
        except ValueError:
            pass  # Never mind. Stick with 24/22.

    # Build a list of tiles we expect to see. Swaths for the older machines are [11, 12, 21, 22] but
    # in general and to handle the NovaSeq we can infer the list from the max_swath value.
    swaths = ['{}{}'.format(s, n)
              for s in range(1, max_swath // 10 + 1)
              for n in range(1, max_swath % 10 + 1)]
    tiles = ["%s%02d" % (swath, tile)
             for swath in swaths
             for tile in range(1, max_tile + 1)]

    # If tiles are specified check that all are valid.
    if args.tile_id:
        filtered_tiles = []
        for tpat in args.tile_id.split(','):
            t_match = [t for t in tiles if re.match('^' + tpat + '$', t)]
            if not t_match:
                # Report the offending *pattern*. The comprehension variable
                # is not visible here, so it must not be used in the message.
                # Raised explicitly (rather than via assert) so the check is
                # not stripped under "python -O"; the exception type is the
                # one an assert would have produced.
                raise AssertionError(
                    "%s matches no tile identifiers for a %s" % (tpat, args.stype))
            filtered_tiles.extend(t_match)
        tiles = sorted(set(filtered_tiles))

    # And set cycles based on either --start/--end or --cycles
    cycles = [(args.start, args.end)]
    if args.cycles:
        #Minimal validation - user will get cryptic messages on bad values
        cycles = []
        for cycle_range in args.cycles.split(','):
            s, e = cycle_range.split('-')
            cycles.append((int(s), int(e)))

    # Decide how we are calculating edit distances
    get_edit_distance = Levenshtein.hamming if args.hamming else Levenshtein.distance

    targets = load_targets(filename=args.coord_file,
                           levels=args.level + 1,
                           limit=args.sample_size)

    bcl_reader = bcl_direct_reader.BCLReader(args.run)

    for lane in lanes:
        lane_dupl = {}

        for tile in tiles:
            log("Reading tile %s in lane %s" % (tile, lane))
            tile_bcl = bcl_reader.get_tile(lane, tile)

            #This actually reads the sequence data from the BCL into RAM
            #Now we support ranges, we might have to do this two or more times.
            seq_objs = [tile_bcl.get_seqs(targets.get_all_indices(), *r)
                        for r in cycles]

            log("Got %i sequences from %i contiguous cycle ranges." % (
                sum(len(s) for s in seq_objs), len(seq_objs)))

            #Each entry in lane_dupl dict is a list of valid (ie. centre seq passed QC)
            #targets for this tile.
            lane_dupl[tile] = []

            for target in targets:
                center = target.get_centre()

                # if the center sequence does not pass the pass filter we don't assess edit distance
                # as large number of Ns compared to other reads with large number of Ns results in
                # small edit distance
                if not seq_objs[0][center][QUAL_FLAG]:
                    continue

                # Stitch the per-range reads back into one sequence.
                center_seq = ''.join(s[center][SEQUENCE] for s in seq_objs)

                #Add a placeholder for the new stats
                target_stats = [None] * args.level
                lane_dupl[tile].append(target_stats)

                for level in range(args.level):
                    #The level variable now runs from 0, but the target levels run from
                    #1 because 0 is the centre, so be careful!
                    dups = 0
                    well_indices = list(target.get_indices(level + 1))
                    assert len(well_indices) > 0

                    for well_index in well_indices:
                        well_seq = ''.join(s[well_index][SEQUENCE] for s in seq_objs)
                        dist = get_edit_distance(center_seq, well_seq)

                        #Log all the duplicates. This might get fairly large!
                        #Note that to locate the matching sequence header in a FASTQ file you need to
                        #convert the well number into co-ords. Eg for location 123456:
                        # $ dump_slocs.py datadir/Data/Intensities/s.locs | grep ^0123456
                        if dist <= args.edit_distance:
                            dups += 1
                            log("center seq at {:>07}: {}".format(center, center_seq))
                            log("well seq at {:>07}: {}".format(well_index, well_seq))
                            log("edit distance: {}".format(dist))

                    #Save a tuple of (TALLY, LENGTH)
                    target_stats[level] = (dups, len(well_indices))

        #Write output per lane
        output_writer(lane, len(targets), lane_dupl, verbose=not args.summary_only)
def test_load_subset(self):
    # Loading with levels=2 must be reflected in the .levels attribute
    # of the returned target collection.
    two_level_targets = load_targets(TEST_FILE, levels=2)
    reported_levels = two_level_targets.levels
    self.assertEqual(reported_levels, 2)
def setUp(self):
    # Fixture: load the test file with load_targets()'s default arguments
    # and expose the result to the tests as self.all_targets.
    loaded = load_targets(TEST_FILE)
    self.all_targets = loaded