예제 #1
0
 def test_pair_fastqs_empty_files(self):
     """pair_fastqs: 'empty' FASTQs are all returned as unpaired
     """
     # Feed in the two 'empty' fixtures in R1, R2 order
     result = pair_fastqs([self.empty_r1, self.empty_r2])
     # Expect no pairs; both files come back in the unpaired list
     no_pairs = []
     unpaired = [self.empty_r1, self.empty_r2]
     self.assertEqual(result, (no_pairs, unpaired))
예제 #2
0
 def test_pair_fastqs_no_pairs(self):
     """pair_fastqs: set of FASTQs with no pairs is returned as unpaired
     """
     # Inputs are deliberately supplied out of order
     result = pair_fastqs([self.fastq2_r2, self.fastq1_r1])
     # Expect no pairs, and the unpaired files listed fastq1 before fastq2
     no_pairs = []
     unpaired = [self.fastq1_r1, self.fastq2_r2]
     self.assertEqual(result, (no_pairs, unpaired))
예제 #3
0
 def test_pair_fastqs_unpaired(self):
     """pair_fastqs: paired and unpaired FASTQs are separated
     """
     # fastq2 has both reads present; fastq1 only has its R1
     result = pair_fastqs([self.fastq2_r2, self.fastq1_r1, self.fastq2_r1])
     # Expect a single (R1, R2) pair plus the leftover fastq1 R1
     paired = [(self.fastq2_r1, self.fastq2_r2)]
     unpaired = [self.fastq1_r1]
     self.assertEqual(result, (paired, unpaired))
예제 #4
0
 def test_pair_fastqs(self):
     """pair_fastqs: a complete set of FASTQs is paired up
     """
     # Inputs are deliberately supplied out of order
     shuffled = [self.fastq2_r2, self.fastq1_r1]
     shuffled.extend([self.fastq2_r1, self.fastq1_r2])
     result = pair_fastqs(shuffled)
     # Expect (R1, R2) tuples, fastq1 before fastq2, and nothing unpaired
     paired = [(self.fastq1_r1, self.fastq1_r2)]
     paired.append((self.fastq2_r1, self.fastq2_r2))
     self.assertEqual(result, (paired, []))
예제 #5
0
    # Input Fastqs
    fastqs = args.fastqs

    # Well list file
    if args.well_list_file is not None:
        well_list = ICell8WellList(args.well_list_file)
    else:
        well_list = None

    # Number of cores
    nprocs = args.nprocessors
    print "%d processor%s will be used" % (nprocs,
                                           ('s' if nprocs != 1 else ''))

    # Pair up Fastq files
    fastqs, unpaired = pair_fastqs(fastqs)
    if unpaired:
        print "Unpaired Fastqs specified:"
        for fq in unpaired:
            print "- %s" % fq
        logging.fatal("Unpaired Fastqs specified")
        sys.exit(1)

    # Only need R1 Fastqs
    fastqs = [pair[0] for pair in fastqs]

    # Set up a working directory
    if args.temporary_directory is not None:
        tmpdir = os.path.abspath(args.temporary_directory)
    else:
        try:
예제 #6
0
def main():
    """Split iCell8 FASTQ read pairs into output FASTQ files.

    Parses the command line, pairs up the input FASTQs, then examines
    every read pair and writes it to an R1/R2 pair of output files
    chosen by the splitting mode:

    - 'barcodes': one output pair per inline barcode
    - 'batch': output pairs 'B000', 'B001', ... each holding at most
      BATCH_SIZE read pairs
    - 'none': a single output pair called 'filtered'

    Read pairs that fail the optional quality filter are written to
    'failed_barcode' or 'failed_umi' output pairs; with
    -d/--discard-unknown-barcodes, read pairs whose barcode is not in
    the well list are written to an 'unassigned' output pair.

    A summary of the counts is printed at the end.

    Exits with status 1 if -d/--discard-unknown-barcodes is given
    without a well list file, or if 'batch' mode is requested with a
    batch size smaller than one.
    """
    # NOTE: every print below is written as 'print(<single expression>)'
    # which produces identical output under Python 2 and Python 3
    # Handle the command line
    p = argparse.ArgumentParser()
    p.add_argument("fastqs",
                   nargs='*',
                   metavar="FASTQ_R1 FASTQ_R2",
                   help="FASTQ file pairs")
    p.add_argument("-w",
                   "--well-list",
                   dest="well_list_file",
                   default=None,
                   help="iCell8 'well list' file")
    p.add_argument("-m",
                   "--mode",
                   dest="splitting_mode",
                   default="barcodes",
                   choices=["barcodes", "batch", "none"],
                   help="how to split the input FASTQs: 'barcodes' "
                   "(one FASTQ pair per barcode), 'batch' (one or "
                   "more FASTQ pairs with fixed number of reads not "
                   "exceeding BATCH_SIZE), or 'none' (output all "
                   "reads to a single FASTQ pair) (default: "
                   "'barcodes')")
    p.add_argument("-s",
                   "--size",
                   type=int,
                   dest="batch_size",
                   default=DEFAULT_BATCH_SIZE,
                   help="number of reads per batch in 'batch' mode "
                   "(default: %d)" % DEFAULT_BATCH_SIZE)
    p.add_argument("-b",
                   "--basename",
                   default="icell8",
                   help="basename for output FASTQ files (default: "
                   "'icell8')")
    p.add_argument("-o",
                   "--outdir",
                   dest="out_dir",
                   default=None,
                   help="directory to write output FASTQ files to "
                   "(default: current directory)")
    p.add_argument("-d",
                   "--discard-unknown-barcodes",
                   dest='discard_unknown_barcodes',
                   action='store_true',
                   help="discard reads with barcodes which don't "
                   "match any of those in the WELL_LIST_FILE "
                   "(default: keep all reads)")
    p.add_argument("-q",
                   "--quality-filter",
                   dest='quality_filter',
                   action='store_true',
                   help="filter reads by barcode and UMI quality "
                   "(default: don't filter reads on quality)")
    p.add_argument("-c",
                   "--compress",
                   action='store_true',
                   help="output compressed .gz FASTQ files")
    args = p.parse_args()

    # Convert the numeric quality cutoffs to single characters using
    # an ASCII offset of 33 (presumably Phred+33 encoded qualities)
    barcode_quality_cutoff = chr(INLINE_BARCODE_QUALITY_CUTOFF + 33)
    umi_quality_cutoff = chr(UMI_QUALITY_CUTOFF + 33)

    # Get well list and expected barcodes
    well_list_file = args.well_list_file
    if well_list_file is not None:
        well_list_file = os.path.abspath(well_list_file)
    well_list = ICell8WellList(well_list_file)
    expected_barcodes = set(well_list.barcodes())
    print("%d expected barcodes" % len(expected_barcodes))

    # Filtering on barcode
    do_check_barcodes = args.discard_unknown_barcodes
    if do_check_barcodes and well_list_file is None:
        logging.fatal("-d/--discard-unknown-barcodes: need to supply a "
                      "well list file")
        sys.exit(1)

    # Filter on barcode and UMI quality
    do_quality_filter = args.quality_filter

    # Splitting mode
    splitting_mode = args.splitting_mode
    batch_size = args.batch_size
    if splitting_mode == "batch" and batch_size < 1:
        # A zero or negative size would otherwise surface later as a
        # ZeroDivisionError or as negative batch numbers
        logging.fatal("-s/--size: batch size must be at least 1")
        sys.exit(1)

    # Count barcodes and rejections
    total_reads = 0     # every read pair examined
    assigned = 0        # barcode found in the well list (-d only)
    unassigned = 0      # barcode not in the well list (-d only)
    filtered = 0        # passed the quality filter (-q only)
    batched = 0         # written to a batch file ('batch' mode only)
    barcode_list = set()
    filtered_counts = {}

    # Input Fastqs: only complete R1/R2 pairs can be processed, so
    # report anything that couldn't be paired instead of dropping it
    # silently
    fastqs, unpaired = pair_fastqs(list(args.fastqs))
    for fq in unpaired:
        logging.warning("Ignoring unpaired Fastq: %s", fq)

    # Output Fastqs
    output_fqs = BufferedOutputFiles(base_dir=args.out_dir)
    if args.out_dir is not None:
        mkdir(os.path.abspath(args.out_dir))
    basename = args.basename

    # Compress outputs?
    fastq_ext = "fastq.gz" if args.compress else "fastq"

    # Iterate over pairs of Fastqs
    for fastq_pair in fastqs:
        # Iterate over read pairs from the Fastqs
        print("-- %s\n   %s" % fastq_pair)
        print("   Starting at %s" % time.ctime())
        start_time = time.time()
        for i, read_pair in enumerate(ICell8FastqIterator(*fastq_pair),
                                      start=1):
            # Deal with read pair
            total_reads += 1
            if (i % 100000) == 0:
                print("   Examining read pair #%d (%s)" %
                      (i, time.ctime()))
            inline_barcode = read_pair.barcode
            barcode_list.add(inline_barcode)
            # Initial assignment
            assign_to = inline_barcode
            # Apply quality filtering
            if do_quality_filter:
                if not pass_quality_filter(read_pair.barcode_quality,
                                           barcode_quality_cutoff):
                    assign_to = "failed_barcode"
                elif not pass_quality_filter(read_pair.umi_quality,
                                             umi_quality_cutoff):
                    assign_to = "failed_umi"
                else:
                    filtered += 1
            # Check barcode is valid (this takes precedence over any
            # quality failure recorded above)
            if do_check_barcodes:
                if inline_barcode not in expected_barcodes:
                    assign_to = "unassigned"
                    unassigned += 1
                else:
                    assigned += 1
            # Pass the fields as arguments so the message is only
            # built when debug logging is actually enabled
            logging.debug("%s\t%s\t%s\t%s\t%s",
                          assign_to, inline_barcode, read_pair.umi,
                          read_pair.min_barcode_quality,
                          read_pair.min_umi_quality)
            # Post filtering counts (only for read pairs that were not
            # rejected by any of the checks above)
            if assign_to == inline_barcode:
                filtered_counts[inline_barcode] = \
                    filtered_counts.get(inline_barcode, 0) + 1
                # Reassign read pair to appropriate output files
                if splitting_mode == "batch":
                    # Output to a batch-specific file pair; numbering
                    # is driven by the reads actually written to the
                    # batches so each one holds exactly batch_size
                    # read pairs (the last one possibly fewer)
                    assign_to = "B%03d" % (batched // batch_size)
                    batched += 1
                elif splitting_mode == "none":
                    # Output to a single file pair
                    assign_to = "filtered"
            # Write read pair (R1 first, then R2)
            for read_id, read in (("r1", read_pair.r1),
                                  ("r2", read_pair.r2)):
                fq = "%s_%s" % (assign_to, read_id.upper())
                if fq not in output_fqs:
                    try:
                        # Try to reopen file and append
                        output_fqs.open(fq, append=True)
                    except KeyError:
                        # Open new file
                        output_fqs.open(
                            fq,
                            "%s.%s.%s.%s" % (basename, assign_to,
                                             read_id, fastq_ext))
                output_fqs.write(fq, "%s" % read)
        print("   Finished at %s" % time.ctime())
        print("   (Took %.0fs)" % (time.time() - start_time))
    # Close output files
    output_fqs.close()

    # Summary output to screen
    print("Summary:")
    print("--------")
    print("Number of barcodes         : %d" % len(barcode_list))
    if do_check_barcodes:
        print("Number of expected barcodes: %d/%d" %
              (len(filtered_counts), len(expected_barcodes)))
    print("Total reads                : %d" % total_reads)
    if do_quality_filter:
        print("Total reads (filtered)     : %d" % filtered)
    if do_check_barcodes:
        print("Total reads (assigned)     : %d" % assigned)
        print("Unassigned reads           : %d" % unassigned)