Esempio n. 1
0
    def test_get_all_indices(self):
        """Check get_all_indices() for level 0, level 1 and all levels (None).

        Only the first two targets of TEST_FILE are loaded, with three levels.
        Ordering is not checked: results are compared as sets.
        """
        lim_targets = load_targets(TEST_FILE, levels=3, limit=2)

        # Level 0 of the two targets.
        expected_level_0 = {1998850, 3178500}

        # Level 1 of the two targets.
        expected_level_1 = {
            int(idx) for idx in (
                "1997278,1997279,1998849,1998851,2000420,2000421," +
                "3176929,3176930,3178499,3178501,3180071,3180072"
            ).split(",")
        }

        # Every index across all three loaded levels.
        expected_all = {
            int(idx) for idx in (
                "1998850,1997278,1997279,1998849,1998851,2000420," +
                "2000421,1995707,1995708,1995709,1997277,1997280," +
                "1998848,1998852,2000419,2000422,2001991,2001992," +
                "2001993,3178500,3176929,3176930,3178499,3178501," +
                "3180071,3180072,3175357,3175358,3175359,3176928," +
                "3176931,3178498,3178502,3180070,3180073,3181641," +
                "3181642,3181643"
            ).split(",")
        }

        self.assertEqual(set(lim_targets.get_all_indices(0)), expected_level_0)
        self.assertEqual(set(lim_targets.get_all_indices(1)), expected_level_1)
        self.assertEqual(set(lim_targets.get_all_indices(None)), expected_all)
Esempio n. 2
0
    def test_load_limit(self):
        """Loading with limit=2 must yield exactly two targets.

        Both the reported length and the number of items produced by
        iterating over the collection are checked.
        """
        lim_targets = load_targets(TEST_FILE, limit=2)

        self.assertEqual(len(lim_targets), 2)

        # Walk the whole collection and tally what the iterator hands back.
        iterated = sum(1 for _ in lim_targets)

        self.assertEqual(iterated, 2)
Esempio n. 3
0
    def test_bad_add(self):
        """add_target() must reject malformed and repeated targets."""
        targets = self.all_targets

        with self.assertRaises(Exception):
            targets.add_target([(1, 2), (3, 4)])

        # This should complain about the number of levels
        with self.assertRaises(AssertionError):
            targets.add_target([(111,), (112, 113, 114, 115)])

        # The same target should be accepted by a collection loaded with
        # two levels, but only once: adding it a second time must fail.
        two_level_targets = load_targets(TEST_FILE, 2)
        two_level_targets.add_target([(111,), (112, 113, 114, 115)])
        with self.assertRaises(Exception):
            two_level_targets.add_target([(111,), (112, 113, 114, 115)])
def main():
    """Count near-duplicate wells around each target, lane by lane.

    Parses the command line, works out which tiles and cycle ranges to read,
    loads the target wells, and then for every lane/tile reads the sequences
    and counts, per target and per level, how many surrounding wells are
    within ``args.edit_distance`` of the centre well's sequence. The tallies
    for each lane are handed to ``output_writer()``.

    Raises:
        AssertionError: if a ``--tile_id`` pattern matches none of the
            expected tile identifiers, or a target level has no wells.
    """
    # Setup options
    args = parse_args()

    if args.quiet:
        global log
        log = lambda *args: None

    lanes = args.lane.split(',') if args.lane else range(1, 8+1)

    max_tile = 24 #Works for Highseq X
    max_swath = 22 #Works for X and 4000
    if args.stype == HISEQ_4000:
        max_tile = 28
    else:
        try:
            # A numeric stype encodes the swath in the hundreds and the tile
            # count in the last two digits; no swath part means keep 22.
            max_tile = int(args.stype) % 100
            max_swath = int(args.stype) // 100 or 22
        except ValueError:
            pass # Never mind. Stick with 24/22.

    # Build a list of tiles we expect to see. Swaths for the older machines are [11, 12, 21, 22] but
    # in general and to handle the Novoseq we can infer the list from the max_swath value.
    tiles = []
    for swath in [ '{}{}'.format(s, n) for s in range(1,max_swath//10+1) for n in range(1,max_swath%10+1) ]:
        for tile in range(1,max_tile+1):
            tiles.append("%s%02d" % (swath, tile))

    # If tiles are specified check that all are valid.
    if args.tile_id:
        filtered_tiles = []
        for tpat in args.tile_id.split(','):
            t_match = [t for t in tiles if re.match('^'+tpat+'$', t)]
            # Report the offending pattern. The comprehension variable is not
            # in scope here, so naming it would raise NameError instead.
            assert t_match, "%s matches no tile identifiers for a %s" % (tpat, args.stype)
            filtered_tiles.extend(t_match)
        tiles = sorted(set(filtered_tiles))

    # And set cycles based on either --start/--end or --cycles
    cycles = [(args.start, args.end)]
    if args.cycles:
        #Minimal validation - user will get cryptic messages on bad values
        cycles = [ (int(s), int(e)) for r in args.cycles.split(',') for s, e in (r.split('-'),) ]

    # Decide how we are calculating edit distances
    get_edit_distance = Levenshtein.hamming if args.hamming else Levenshtein.distance

    targets = load_targets( filename = args.coord_file,
                            levels = args.level+1,
                            limit = args.sample_size)
    bcl_reader = bcl_direct_reader.BCLReader(args.run)

    for lane in lanes:

        lane_dupl = {}
        for tile in tiles:
            log("Reading tile %s in lane %s" % (tile, lane))
            tile_bcl = bcl_reader.get_tile(lane, tile)

            #This actually reads the sequence data from the BCL into RAM
            #Now we support ranges, we might have to do this two or more times.
            seq_objs = []
            for r in cycles:
                seq_objs.append( tile_bcl.get_seqs(targets.get_all_indices(), *r) )

            log("Got %i sequences from %i contiguous cycle ranges." % (
                     sum(len(s) for s in seq_objs),
                                       len(seq_objs) ))

            #Each entry in lane_dupl dict is a list of valid (ie. centre seq passed QC)
            #targets for this tile.
            lane_dupl[tile] = []

            for target in targets:

                center = target.get_centre()
                #log("Center: %s"%center)

                # if the center sequence does not pass the pass filter we don't assess edit distance
                # as large number of Ns compared to other reads with large number of Ns results in
                # small edit distance
                if not seq_objs[0][center][QUAL_FLAG]:
                    continue
                # Concatenate the per-range reads into one sequence.
                center_seq = ''.join(s[center][SEQUENCE] for s in seq_objs)

                #Add a placeholder for the new stats
                target_stats = [None] * args.level
                lane_dupl[tile].append(target_stats)

                for level in range(args.level):
                    #The level variable now runs from 0, but the target levels run from
                    #1 because 0 is the centre, so be careful!
                    dups = 0
                    well_indices = list(target.get_indices(level+1))
                    assert len(well_indices) > 0
                    for well_index in well_indices:
                        well_seq = ''.join(s[well_index][SEQUENCE] for s in seq_objs)
                        dist = get_edit_distance(center_seq, well_seq)

                        #Log all the duplicates. This might get fairly large!
                        #Note that to locate the matching sequence header in a FASTQ file you need to
                        #convert the well number into co-ords. Eg for location 123456:
                        # $ dump_slocs.py datadir/Data/Intensities/s.locs | grep ^0123456
                        if dist <= args.edit_distance:
                            dups += 1
                            log("center seq at {:>07}: {}".format(center, center_seq))
                            log("well seq at   {:>07}: {}".format(well_index, well_seq))
                            log("edit distance: {}".format(dist))

                    #Save a tuple of (TALLY, LENGTH)
                    target_stats[level] = (dups, len(well_indices))

            #log(lane_dupl)
        #Write output per lane
        output_writer(lane, len(targets), lane_dupl, verbose = not args.summary_only)
def main():
    """Count near-duplicate wells around each target, lane by lane.

    Parses the command line, works out which tiles and cycle ranges to read,
    loads the target wells, and then for every lane/tile reads the sequences
    and counts, per target and per level, how many surrounding wells are
    within ``args.edit_distance`` of the centre well's sequence. The tallies
    for each lane are handed to ``output_writer()``.

    Raises:
        AssertionError: if a ``--tile_id`` pattern matches none of the
            expected tile identifiers, or a target level has no wells.
    """
    # Setup options
    args = parse_args()

    if args.quiet:
        global log
        log = lambda *args: None

    lanes = args.lane.split(',') if args.lane else range(1, 8 + 1)

    max_tile = 24  #Works for Highseq X
    max_swath = 22  #Works for X and 4000
    if args.stype == HISEQ_4000:
        max_tile = 28
    else:
        try:
            # A numeric stype encodes the swath in the hundreds and the tile
            # count in the last two digits; no swath part means keep 22.
            max_tile = int(args.stype) % 100
            max_swath = int(args.stype) // 100 or 22
        except ValueError:
            pass  # Never mind. Stick with 24/22.

    # Build a list of tiles we expect to see. Swaths for the older machines are [11, 12, 21, 22] but
    # in general and to handle the Novoseq we can infer the list from the max_swath value.
    tiles = []
    for swath in [
            '{}{}'.format(s, n) for s in range(1, max_swath // 10 + 1)
            for n in range(1, max_swath % 10 + 1)
    ]:
        for tile in range(1, max_tile + 1):
            tiles.append("%s%02d" % (swath, tile))

    # If tiles are specified check that all are valid.
    if args.tile_id:
        filtered_tiles = []
        for tpat in args.tile_id.split(','):
            t_match = [t for t in tiles if re.match('^' + tpat + '$', t)]
            # Report the offending pattern. The comprehension variable is not
            # in scope here, so naming it would raise NameError instead.
            assert t_match, "%s matches no tile identifiers for a %s" % (
                tpat, args.stype)
            filtered_tiles.extend(t_match)
        tiles = sorted(set(filtered_tiles))

    # And set cycles based on either --start/--end or --cycles
    cycles = [(args.start, args.end)]
    if args.cycles:
        #Minimal validation - user will get cryptic messages on bad values
        cycles = [(int(s), int(e)) for r in args.cycles.split(',')
                  for s, e in (r.split('-'), )]

    # Decide how we are calculating edit distances
    get_edit_distance = Levenshtein.hamming if args.hamming else Levenshtein.distance

    targets = load_targets(filename=args.coord_file,
                           levels=args.level + 1,
                           limit=args.sample_size)
    bcl_reader = bcl_direct_reader.BCLReader(args.run)

    for lane in lanes:

        lane_dupl = {}
        for tile in tiles:
            log("Reading tile %s in lane %s" % (tile, lane))
            tile_bcl = bcl_reader.get_tile(lane, tile)

            #This actually reads the sequence data from the BCL into RAM
            #Now we support ranges, we might have to do this two or more times.
            seq_objs = []
            for r in cycles:
                seq_objs.append(
                    tile_bcl.get_seqs(targets.get_all_indices(), *r))

            log("Got %i sequences from %i contiguous cycle ranges." %
                (sum(len(s) for s in seq_objs), len(seq_objs)))

            #Each entry in lane_dupl dict is a list of valid (ie. centre seq passed QC)
            #targets for this tile.
            lane_dupl[tile] = []

            for target in targets:

                center = target.get_centre()
                #log("Center: %s"%center)

                # if the center sequence does not pass the pass filter we don't assess edit distance
                # as large number of Ns compared to other reads with large number of Ns results in
                # small edit distance
                if not seq_objs[0][center][QUAL_FLAG]:
                    continue
                # Concatenate the per-range reads into one sequence.
                center_seq = ''.join(s[center][SEQUENCE] for s in seq_objs)

                #Add a placeholder for the new stats
                target_stats = [None] * args.level
                lane_dupl[tile].append(target_stats)

                for level in range(args.level):
                    #The level variable now runs from 0, but the target levels run from
                    #1 because 0 is the centre, so be careful!
                    dups = 0
                    well_indices = list(target.get_indices(level + 1))
                    assert len(well_indices) > 0
                    for well_index in well_indices:
                        well_seq = ''.join(s[well_index][SEQUENCE]
                                           for s in seq_objs)
                        dist = get_edit_distance(center_seq, well_seq)

                        #Log all the duplicates. This might get fairly large!
                        #Note that to locate the matching sequence header in a FASTQ file you need to
                        #convert the well number into co-ords. Eg for location 123456:
                        # $ dump_slocs.py datadir/Data/Intensities/s.locs | grep ^0123456
                        if dist <= args.edit_distance:
                            dups += 1
                            log("center seq at {:>07}: {}".format(
                                center, center_seq))
                            log("well seq at   {:>07}: {}".format(
                                well_index, well_seq))
                            log("edit distance: {}".format(dist))

                    #Save a tuple of (TALLY, LENGTH)
                    target_stats[level] = (dups, len(well_indices))

            #log(lane_dupl)
        #Write output per lane
        output_writer(lane,
                      len(targets),
                      lane_dupl,
                      verbose=not args.summary_only)
Esempio n. 6
0
    def test_load_subset(self):
        """Loading with levels=2 must be reflected in the .levels attribute."""
        two_level_targets = load_targets(TEST_FILE, levels=2)
        loaded_levels = two_level_targets.levels

        self.assertEqual(loaded_levels, 2)
Esempio n. 7
0
 def setUp(self):
     """Load the targets from TEST_FILE into self.all_targets for each test."""
     loaded = load_targets(TEST_FILE)
     self.all_targets = loaded