def test_pair_fastqs_empty_files(self):
    """pair_fastqs: 'empty' R1/R2 FASTQs are all reported as unpaired
    """
    # No pairs should be formed; both inputs come back in the
    # unpaired list, R1 before R2
    expected = ([], [self.empty_r1, self.empty_r2])
    result = pair_fastqs([self.empty_r1, self.empty_r2])
    self.assertEqual(result, expected)
def test_pair_fastqs_no_pairs(self):
    """pair_fastqs: FASTQs with no matching mates are all unpaired
    """
    # An R2 from one sample and an R1 from another cannot be paired;
    # the unpaired list is expected as fastq1 then fastq2 even though
    # the input supplies them the other way round
    expected = ([], [self.fastq1_r1, self.fastq2_r2])
    result = pair_fastqs([self.fastq2_r2, self.fastq1_r1])
    self.assertEqual(result, expected)
def test_pair_fastqs_unpaired(self):
    """pair_fastqs: mixture of paired and unpaired FASTQs is separated
    """
    # fastq2 has both mates supplied, fastq1 only has its R1
    expected_pairs = [(self.fastq2_r1, self.fastq2_r2)]
    expected_unpaired = [self.fastq1_r1]
    result = pair_fastqs([self.fastq2_r2,
                          self.fastq1_r1,
                          self.fastq2_r1])
    self.assertEqual(result, (expected_pairs, expected_unpaired))
def test_pair_fastqs(self):
    """pair_fastqs: every FASTQ is matched up with its mate
    """
    # Inputs are deliberately supplied out of order; the pairs are
    # expected as (R1, R2) tuples, fastq1 before fastq2, with nothing
    # left unpaired
    expected_pairs = [(self.fastq1_r1, self.fastq1_r2),
                      (self.fastq2_r1, self.fastq2_r2)]
    result = pair_fastqs([self.fastq2_r2,
                          self.fastq1_r1,
                          self.fastq2_r1,
                          self.fastq1_r2])
    self.assertEqual(result, (expected_pairs, []))
# Input Fastqs fastqs = args.fastqs # Well list file if args.well_list_file is not None: well_list = ICell8WellList(args.well_list_file) else: well_list = None # Number of cores nprocs = args.nprocessors print "%d processor%s will be used" % (nprocs, ('s' if nprocs != 1 else '')) # Pair up Fastq files fastqs, unpaired = pair_fastqs(fastqs) if unpaired: print "Unpaired Fastqs specified:" for fq in unpaired: print "- %s" % fq logging.fatal("Unpaired Fastqs specified") sys.exit(1) # Only need R1 Fastqs fastqs = [pair[0] for pair in fastqs] # Set up a working directory if args.temporary_directory is not None: tmpdir = os.path.abspath(args.temporary_directory) else: try:
def main():
    """Split iCell8 FASTQ read pairs into output FASTQ files.

    Parses the command line, pairs up the supplied FASTQs with
    ``pair_fastqs`` and iterates over every read pair using
    ``ICell8FastqIterator``. Each read pair is optionally filtered on
    barcode/UMI quality (``-q``) and on membership of the well list
    barcodes (``-d``), then written to an R1/R2 pair of output files
    chosen according to the splitting mode (``-m``):

    - ``barcodes``: one file pair per inline barcode
    - ``batch``: file pairs ``B000``, ``B001``, ... each holding
      ``BATCH_SIZE`` reads (the last may hold fewer)
    - ``none``: a single ``filtered`` file pair

    Reads which fail a filter are written to ``failed_barcode``,
    ``failed_umi`` or ``unassigned`` file pairs regardless of the
    splitting mode. A summary of the counts is printed at the end.

    Exits with status 1 if ``-d`` is given without a well list file,
    or if batch mode is requested with a batch size below 1.

    All ``print`` calls use the single-argument parenthesised form so
    that they behave identically under Python 2 and Python 3.
    """
    # Handle the command line
    p = argparse.ArgumentParser()
    p.add_argument("fastqs", nargs='*', metavar="FASTQ_R1 FASTQ_R2",
                   help="FASTQ file pairs")
    p.add_argument("-w", "--well-list",
                   dest="well_list_file", default=None,
                   help="iCell8 'well list' file")
    p.add_argument("-m", "--mode",
                   dest="splitting_mode", default="barcodes",
                   choices=["barcodes", "batch", "none"],
                   help="how to split the input FASTQs: 'barcodes' "
                   "(one FASTQ pair per barcode), 'batch' (one or "
                   "more FASTQ pairs with fixed number of reads not "
                   "exceeding BATCH_SIZE), or 'none' (output all "
                   "reads to a single FASTQ pair) (default: "
                   "'barcodes')")
    p.add_argument("-s", "--size", type=int,
                   dest="batch_size", default=DEFAULT_BATCH_SIZE,
                   help="number of reads per batch in 'batch' mode "
                   "(default: %d)" % DEFAULT_BATCH_SIZE)
    p.add_argument("-b", "--basename",
                   default="icell8",
                   help="basename for output FASTQ files (default: "
                   "'icell8')")
    p.add_argument("-o", "--outdir",
                   dest="out_dir", default=None,
                   help="directory to write output FASTQ files to "
                   "(default: current directory)")
    p.add_argument("-d", "--discard-unknown-barcodes",
                   dest='discard_unknown_barcodes', action='store_true',
                   help="discard reads with barcodes which don't "
                   "match any of those in the WELL_LIST_FILE "
                   "(default: keep all reads)")
    p.add_argument("-q", "--quality-filter",
                   dest='quality_filter', action='store_true',
                   help="filter reads by barcode and UMI quality "
                   "(default: don't filter reads on quality)")
    p.add_argument("-c", "--compress",
                   action='store_true',
                   help="output compressed .gz FASTQ files")
    args = p.parse_args()

    # Convert quality cutoffs to character encoding (numeric cutoff
    # offset by 33 -- presumably Phred+33, confirm against the
    # definition of pass_quality_filter)
    barcode_quality_cutoff = chr(INLINE_BARCODE_QUALITY_CUTOFF + 33)
    umi_quality_cutoff = chr(UMI_QUALITY_CUTOFF + 33)

    # Get well list and expected barcodes
    expected_barcodes = set()
    well_list_file = args.well_list_file
    if well_list_file is not None:
        well_list_file = os.path.abspath(well_list_file)
        expected_barcodes = set(ICell8WellList(well_list_file).barcodes())
        print("%d expected barcodes" % len(expected_barcodes))

    # Filtering on barcode
    do_check_barcodes = args.discard_unknown_barcodes
    if do_check_barcodes and well_list_file is None:
        logging.fatal("-d/--discard-unknown-barcodes: need to supply a "
                      "well list file")
        sys.exit(1)

    # Filter on barcode and UMI quality
    do_quality_filter = args.quality_filter

    # Splitting mode
    splitting_mode = args.splitting_mode
    batch_size = args.batch_size
    if splitting_mode == "batch" and batch_size < 1:
        # A zero batch size would otherwise surface as a
        # ZeroDivisionError part way through processing
        logging.fatal("-s/--size: batch size must be at least 1")
        sys.exit(1)

    # Count barcodes and rejections
    total_reads = 0   # every read pair examined
    assigned = 0      # barcode found in well list (only with -d)
    unassigned = 0    # barcode not in well list (only with -d)
    filtered = 0      # passed the quality filter (only with -q)
    passed = 0        # survived every enabled filter
    barcode_list = set()
    filtered_counts = {}

    # Input Fastqs
    fastqs, unpaired = pair_fastqs(list(args.fastqs))
    for fq in unpaired:
        # Unpaired files cannot be processed; report them rather than
        # dropping them silently
        logging.warning("Ignoring unpaired Fastq: %s" % fq)

    # Output Fastqs: make sure the output directory exists before
    # any output file is opened
    if args.out_dir is not None:
        mkdir(os.path.abspath(args.out_dir))
    output_fqs = BufferedOutputFiles(base_dir=args.out_dir)
    basename = args.basename

    # Compress outputs?
    fastq_ext = "fastq.gz" if args.compress else "fastq"

    try:
        # Iterate over pairs of Fastqs
        for fastq_pair in fastqs:
            # Iterate over read pairs from the Fastqs
            print("-- %s\n %s" % fastq_pair)
            print(" Starting at %s" % time.ctime())
            start_time = time.time()
            for i, read_pair in enumerate(
                    ICell8FastqIterator(*fastq_pair), start=1):
                # Deal with read pair
                total_reads += 1
                if (i % 100000) == 0:
                    print(" Examining read pair #%d (%s)" %
                          (i, time.ctime()))
                inline_barcode = read_pair.barcode
                barcode_list.add(inline_barcode)
                # Initial assignment
                assign_to = inline_barcode
                # Apply quality filtering
                if do_quality_filter:
                    if not pass_quality_filter(read_pair.barcode_quality,
                                               barcode_quality_cutoff):
                        assign_to = "failed_barcode"
                    elif not pass_quality_filter(read_pair.umi_quality,
                                                 umi_quality_cutoff):
                        assign_to = "failed_umi"
                    else:
                        filtered += 1
                # Check barcode is valid (takes precedence over a
                # quality failure when both apply)
                if do_check_barcodes:
                    if inline_barcode not in expected_barcodes:
                        assign_to = "unassigned"
                        unassigned += 1
                    else:
                        assigned += 1
                logging.debug("%s" % '\t'.join([
                    assign_to,
                    inline_barcode,
                    read_pair.umi,
                    read_pair.min_barcode_quality,
                    read_pair.min_umi_quality
                ]))
                # Post filtering counts
                # NOTE(review): re-assignment to batch/'filtered'
                # outputs is applied only to reads which passed every
                # filter -- confirm this matches the intended nesting
                if assign_to == inline_barcode:
                    passed += 1
                    filtered_counts[inline_barcode] = \
                        filtered_counts.get(inline_barcode, 0) + 1
                    # Reassign read pair to appropriate output files
                    if splitting_mode == "batch":
                        # Output to a batch-specific file pair; number
                        # batches from the count of reads actually
                        # written so each holds exactly batch_size
                        # reads whether or not -q is in effect
                        batch_number = (passed - 1) // batch_size
                        assign_to = "B%03d" % batch_number
                    elif splitting_mode == "none":
                        # Output to a single file pair
                        assign_to = "filtered"
                # Write read pair (R1 first, then R2)
                for read_id, read in (("R1", read_pair.r1),
                                      ("R2", read_pair.r2)):
                    fq_name = "%s_%s" % (assign_to, read_id)
                    if fq_name not in output_fqs:
                        try:
                            # Try to reopen file and append
                            output_fqs.open(fq_name, append=True)
                        except KeyError:
                            # Open new file
                            output_fqs.open(
                                fq_name,
                                "%s.%s.%s.%s" % (basename,
                                                 assign_to,
                                                 read_id.lower(),
                                                 fastq_ext))
                    output_fqs.write(fq_name, "%s" % read)
            print(" Finished at %s" % time.ctime())
            print(" (Took %.0fs)" % (time.time() - start_time))
    finally:
        # Close output files even if processing failed part way, so
        # that anything already buffered is not lost
        output_fqs.close()

    # Summary output to screen
    print("Summary:")
    print("--------")
    print("Number of barcodes : %d" % len(barcode_list))
    if do_check_barcodes:
        print("Number of expected barcodes: %d/%d" %
              (len(filtered_counts), len(expected_barcodes)))
    print("Total reads : %d" % total_reads)
    if do_quality_filter:
        print("Total reads (filtered) : %d" % filtered)
    if do_check_barcodes:
        print("Total reads (assigned) : %d" % assigned)
        print("Unassigned reads : %d" % unassigned)