def main(argv=None):
    """Script main: extract UMI / cell barcodes from FASTQ reads.

    Parses command line options in sys.argv, unless *argv* is given,
    validates the option combination, configures the read extractor and
    streams the input read(s) through it. Processed reads are written to
    ``options.stdout`` (and ``--read2-out`` for paired input); reads the
    extractor rejects are optionally written to the "filtered out" files.

    Parameters
    ----------
    argv : list of str, optional
        Command line arguments. Defaults to ``sys.argv``.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=usage,
                            description=globals()["__doc__"])

    group = U.OptionGroup(parser, "extract-specific options")

    # (Experimental option) Retain the UMI in the sequence read
    group.add_option("--retain-umi", dest="retain_umi", action="store_true",
                     help=optparse.SUPPRESS_HELP)
    group.add_option("--read2-out", dest="read2_out", type="string",
                     help="file to output processed paired read to")
    group.add_option("--read2-stdout", dest="read2_stdout",
                     action="store_true",
                     help="Paired reads, send read2 to stdout, "
                     "discarding read1")
    group.add_option("--quality-filter-threshold",
                     dest="quality_filter_threshold", type="int",
                     help=("Remove reads where any UMI base quality score "
                           "falls below this threshold"))
    group.add_option("--quality-filter-mask",
                     dest="quality_filter_mask", type="int",
                     help=("If a UMI base has a quality below this "
                           "threshold, replace the base with 'N'"))
    group.add_option("--quality-encoding",
                     dest="quality_encoding", type="choice",
                     choices=["phred33", "phred64", "solexa"],
                     help=("Quality score encoding. Choose from 'phred33' "
                           "[33-77] 'phred64' [64-106] or 'solexa' "
                           "[59-106]"))
    # Deprecated flag, hidden from --help (see the notice emitted below)
    group.add_option("--filter-cell-barcode",
                     dest="filter_cell_barcode", action="store_true",
                     help=optparse.SUPPRESS_HELP)
    group.add_option("--error-correct-cell",
                     dest="error_correct_cell", action="store_true",
                     help=("Correct errors in the cell barcode"))
    group.add_option("--whitelist", dest="whitelist", type="string",
                     help=("A whitelist of accepted cell barcodes"))
    group.add_option("--blacklist", dest="blacklist", type="string",
                     help=("A blacklist of rejected cell barcodes"))

    # The UMI filtering/correction options below are hidden from --help.
    # Hidden: filter the UMIs
    group.add_option("--filter-umi", dest="filter_umi", action="store_true",
                     help=optparse.SUPPRESS_HELP)
    # Hidden: a whitelist of accepted UMIs
    group.add_option("--umi-whitelist", dest="umi_whitelist",
                     type="string", default=None,
                     help=optparse.SUPPRESS_HELP)
    # Hidden: a whitelist of accepted UMIs for read2
    group.add_option("--umi-whitelist-paired", dest="umi_whitelist_paired",
                     type="string", default=None,
                     help=optparse.SUPPRESS_HELP)
    # Hidden: correct errors in UMIs to the whitelist(s) provided if
    # within this threshold
    group.add_option("--correct-umi-threshold",
                     dest="correct_umi_threshold", type="int", default=0,
                     help=optparse.SUPPRESS_HELP)
    # Hidden: file logging UMI error correction
    group.add_option("--umi-correct-log", dest="umi_correct_log",
                     type="string", default=None,
                     help=optparse.SUPPRESS_HELP)

    group.add_option("--subset-reads", "--reads-subset",
                     dest="reads_subset", type="int",
                     help=("Only extract from the first N reads. If N is "
                           "greater than the number of reads, all reads "
                           "will be used"))
    group.add_option("--reconcile-pairs",
                     dest="reconcile", action="store_true",
                     help=("Allow the presences of reads in read2 input "
                           "that are not present in read1 input. This "
                           "allows cell barcode filtering of read1s "
                           "without considering read2s"))
    parser.add_option_group(group)

    group = U.OptionGroup(parser,
                          "[EXPERIMENTAl] barcode extraction options")
    group.add_option("--either-read", dest="either_read",
                     action="store_true",
                     help="UMI may be on either read (see "
                     "--either-read-resolve) for options to resolve cases "
                     "where UMI is on both reads")
    group.add_option("--either-read-resolve",
                     dest="either_read_resolve", type="choice",
                     choices=["discard", "quality"],
                     help=("How to resolve instances where both reads "
                           "contain a UMI but using --either-read. "
                           "Choose from 'discard' or 'quality' "
                           "(use highest quality). default=discard"))
    parser.add_option_group(group)

    # The default key now matches the --filter-cell-barcode dest
    # (it was previously spelled "filter_cell_barcodes").
    parser.set_defaults(extract_method="string",
                        filter_cell_barcode=False,
                        whitelist=None,
                        blacklist=None,
                        error_correct_cell=False,
                        pattern=None,
                        pattern2=None,
                        read2_in=None,
                        read2_out=False,
                        read2_stdout=False,
                        quality_filter_threshold=None,
                        quality_encoding=None,
                        reconcile=False,
                        either_read=False,
                        either_read_resolve="discard",
                        ignore_suffix=False)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = U.Start(parser, argv=argv,
                              add_extract_options=True,
                              add_group_dedup_options=False,
                              add_umi_grouping_options=False,
                              add_sam_options=False)

    # ---- option validation -------------------------------------------

    if options.filter_cell_barcode:
        U.info('Use of --whitelist ensures cell barcodes are filtered. '
               '--filter-cell-barcode is no longer required and may be '
               'removed in future versions.')

    # Supplying a whitelist implies cell barcode filtering
    if options.whitelist is not None:
        options.filter_cell_barcode = True

    if options.retain_umi and not options.extract_method == "regex":
        U.error("option --retain-umi only works with --extract-method=regex")

    # The original tested a bare, undefined name `whitelist` here.
    if (options.filtered_out and
            not options.extract_method == "regex" and
            options.whitelist is None):
        U.error("Reads will not be filtered unless extract method is "
                "set to regex (--extract-method=regex) or cell "
                "barcodes are filtered (--whitelist)")

    if options.quality_filter_threshold or options.quality_filter_mask:
        if not options.quality_encoding:
            U.error("must provide a quality encoding (--quality-"
                    "encoding) to filter UMIs by quality (--quality"
                    "-filter-threshold) or mask low quality bases "
                    "with (--quality-filter-mask)")

    extract_cell, extract_umi = U.validateExtractOptions(options)

    if options.either_read:
        if extract_cell:
            U.error("Option to extract from either read (--either-read) "
                    "is not currently compatible with cell barcode "
                    "extraction")
        if not options.extract_method == "regex":
            U.error("Option to extract from either read (--either-read) "
                    "requires --extract-method=regex")
        if not options.pattern or not options.pattern2:
            U.error("Option to extract from either read (--either-read) "
                    "requires --bc-pattern=[PATTERN1] and "
                    "--bc-pattern2=[PATTERN2]")

    if options.filter_umi:
        if not options.umi_whitelist:
            U.error("must provide a UMI whitelist (--umi-whitelist) if "
                    "using --filter-umi option")
        if options.pattern2 and not options.umi_whitelist_paired:
            U.error("must provide a UMI whitelist for paired end "
                    "(--umi-whitelist-paired) if using --filter-umi option "
                    "with paired end data")

    if not extract_umi:
        if options.extract_method == "string":
            U.error("barcode pattern(s) do not include any umi bases "
                    "(marked with 'Ns') %s, %s" % (
                        options.pattern, options.pattern2))
        elif options.extract_method == "regex":
            # `%` was missing here, which made this a call on a str
            U.error("barcode regex(es) do not include any umi groups "
                    "(starting with 'umi_') %s, %s" % (
                        options.pattern, options.pattern2))

    if options.whitelist:
        if not extract_cell:
            if options.extract_method == "string":
                U.error("barcode pattern(s) do not include any cell bases "
                        "(marked with 'Cs') %s, %s" % (
                            options.pattern, options.pattern2))
            elif options.extract_method == "regex":
                # `%` was missing here as well
                U.error("barcode regex(es) do not include any cell groups "
                        "(starting with 'cell_') %s, %s" % (
                            options.pattern, options.pattern2))

    # ---- extractor set-up --------------------------------------------

    read1s = umi_methods.fastqIterate(options.stdin)

    # set up read extractor
    ReadExtractor = extract_methods.ExtractFilterAndUpdate(
        options.extract_method,
        options.pattern,
        options.pattern2,
        options.prime3,
        extract_cell,
        options.quality_encoding,
        options.quality_filter_threshold,
        options.quality_filter_mask,
        options.filter_umi,
        options.filter_cell_barcode,
        options.retain_umi,
        options.either_read,
        options.either_read_resolve)

    if options.filter_umi:
        umi_whitelist, false_to_true_map = (
            whitelist_methods.getUserDefinedBarcodes(
                options.umi_whitelist,
                options.umi_whitelist_paired,
                deriveErrorCorrection=True,
                threshold=options.correct_umi_threshold))

        U.info("Length of whitelist: %i" % len(umi_whitelist))
        U.info("Length of 'correctable' whitelist: %i" %
               len(false_to_true_map))

        ReadExtractor.umi_whitelist = umi_whitelist
        ReadExtractor.umi_false_to_true_map = false_to_true_map
        ReadExtractor.umi_whitelist_counts = collections.defaultdict(
            collections.Counter)

    if options.whitelist:
        cell_whitelist, false_to_true_map = (
            whitelist_methods.getUserDefinedBarcodes(
                options.whitelist,
                getErrorCorrection=options.error_correct_cell))

        ReadExtractor.cell_whitelist = cell_whitelist
        ReadExtractor.false_to_true_map = false_to_true_map

    if options.blacklist:
        # Only the first tab-separated column of each line is used
        blacklist = set()
        with U.openFile(options.blacklist, "r") as inf:
            for line in inf:
                blacklist.add(line.strip().split("\t")[0])
        ReadExtractor.cell_blacklist = blacklist

    # ---- extraction ---------------------------------------------------

    # variables for progress monitor
    progCount = 0
    displayMax = 100000

    U.info("Starting barcode extraction")

    # Output handles start as None so the clean-up below only closes
    # files that were actually opened (read2_out / filtered_out2 are
    # only opened for paired-end input).
    filtered_out = None
    filtered_out2 = None
    read2_out = None

    if options.filtered_out:
        filtered_out = U.openFile(options.filtered_out, "w")

    if options.read2_in is None:
        for read in read1s:

            # incrementing count for monitoring progress
            progCount += 1

            # Update display in every 100kth iteration
            if progCount % displayMax == 0:
                U.info("Parsed {} reads".format(progCount))

            new_read = ReadExtractor(read)

            # The subset limit is checked against the extractor's own
            # input counter, after the read has been processed.
            if options.reads_subset:
                if (ReadExtractor.read_counts['Input Reads'] >
                        options.reads_subset):
                    break

            if not new_read:
                if filtered_out is not None:
                    filtered_out.write(str(read) + "\n")
                continue

            options.stdout.write(str(new_read) + "\n")

    else:
        if options.filtered_out2:
            filtered_out2 = U.openFile(options.filtered_out2, "w")

        read2s = umi_methods.fastqIterate(U.openFile(options.read2_in))

        if options.read2_out:
            read2_out = U.openFile(options.read2_out, "w")

        # --reconcile-pairs relaxes the read1/read2 pairing
        strict = not options.reconcile

        for read1, read2 in umi_methods.joinedFastqIterate(
                read1s, read2s, strict, options.ignore_suffix):

            # incrementing count for monitoring progress
            progCount += 1

            # Update display in every 100kth iteration
            if progCount % displayMax == 0:
                U.info("Parsed {} reads".format(progCount))
                sys.stdout.flush()

            reads = ReadExtractor(read1, read2)

            if options.reads_subset:
                if (ReadExtractor.read_counts['Input Reads'] >
                        options.reads_subset):
                    break

            if not reads:
                if filtered_out is not None:
                    filtered_out.write(str(read1) + "\n")
                if filtered_out2 is not None:
                    filtered_out2.write(str(read2) + "\n")
                continue

            new_read1, new_read2 = reads

            if options.read2_stdout:
                options.stdout.write(str(new_read2) + "\n")
            else:
                options.stdout.write(str(new_read1) + "\n")

                if read2_out is not None:
                    read2_out.write(str(new_read2) + "\n")

    for handle in (read2_out, filtered_out, filtered_out2):
        if handle is not None:
            handle.close()

    # ---- reporting ----------------------------------------------------

    for k, v in ReadExtractor.getReadCounts().most_common():
        U.info("%s: %s" % (k, v))

    if options.umi_correct_log:
        # The `with` block closes the file; no explicit close() needed
        with U.openFile(options.umi_correct_log, "w") as outf:
            outf.write("umi\tcount_no_errors\tcount_errors\n")
            for umi, counts in ReadExtractor.umi_whitelist_counts.items():
                outf.write("%s\t%i\t%i\n" % (
                    umi, counts["no_error"], counts["error"]))

    U.Stop()
def main(argv=None):
    """Script main: extract UMI / cell barcodes from FASTQ reads.

    Parses command line options in sys.argv, unless *argv* is given,
    validates the barcode pattern(s), configures the read extractor and
    streams the input read(s) through it. Processed reads are written to
    ``options.stdout`` (and ``--read2-out`` for paired input).

    Parameters
    ----------
    argv : list of str, optional
        Command line arguments. Defaults to ``sys.argv``.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-p", "--bc-pattern", dest="pattern", type="string",
                      help="Barcode pattern")
    parser.add_option("--bc-pattern2", dest="pattern2", type="string",
                      help="Barcode pattern for paired reads")
    parser.add_option("--3prime", dest="prime3", action="store_true",
                      help="barcode is on 3' end of read.")
    parser.add_option("--read2-in", dest="read2_in", type="string",
                      help="file name for read pairs")
    parser.add_option("--read2-out", dest="read2_out", type="string",
                      help="file to output processed paired read to")
    parser.add_option("--read2-stdout", dest="read2_stdout",
                      action="store_true",
                      help="Paired reads, send read2 to stdout, "
                      "discarding read1")
    parser.add_option("--quality-filter-threshold",
                      dest="quality_filter_threshold", type="int",
                      help=("Remove reads where any UMI base quality score "
                            "falls below this threshold"))
    parser.add_option("--quality-filter-mask",
                      dest="quality_filter_mask", type="int",
                      help=("If a UMI base has a quality below this "
                            "threshold, replace the base with 'N'"))
    parser.add_option("--quality-encoding",
                      dest="quality_encoding", type="choice",
                      choices=["phred33", "phred64", "solexa"],
                      help=("Quality score encoding. Choose from 'phred33' "
                            "[33-77] 'phred64' [64-106] or 'solexa' "
                            "[59-106]"))
    parser.add_option("--extract-method",
                      dest="extract_method", type="choice",
                      choices=["string", "regex"],
                      help=("How to extract the umi +/- cell barcodes, "
                            "Choose from 'string' or 'regex'"))
    parser.add_option("--filter-cell-barcode",
                      dest="filter_cell_barcode", action="store_true",
                      help="Filter the cell barcodes")
    parser.add_option("--error-correct-cell",
                      dest="error_correct_cell", action="store_true",
                      help=("Correct errors in the cell barcode"))
    parser.add_option("--whitelist", dest="whitelist", type="string",
                      help=("A whitelist of accepted cell barcodes"))
    parser.add_option("--blacklist", dest="blacklist", type="string",
                      help=("A blacklist of rejected cell barcodes"))
    parser.add_option("--reads-subset", dest="reads_subset", type="int",
                      help=("Only extract from the first N reads. If N is "
                            "greater than the number of reads, all reads "
                            "will be used"))
    parser.add_option("--reconcile-pairs",
                      dest="reconcile", action="store_true",
                      help=("Allow the presences of reads in read2 input "
                            "that are not present in read1 input. This "
                            "allows cell barcode filtering of read1s "
                            "without considering read2s"))

    # The default key now matches the --filter-cell-barcode dest
    # (it was previously spelled "filter_cell_barcodes").
    parser.set_defaults(extract_method="string",
                        filter_cell_barcode=False,
                        whitelist=None,
                        blacklist=None,
                        error_correct_cell=False,
                        pattern=None,
                        pattern2=None,
                        read2_in=None,
                        read2_out=False,
                        read2_stdout=False,
                        quality_filter_threshold=None,
                        quality_encoding=None,
                        reconcile=False)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = U.Start(parser, argv=argv,
                              add_group_dedup_options=False,
                              add_sam_options=False)

    # ---- option validation -------------------------------------------

    if options.quality_filter_threshold or options.quality_filter_mask:
        if not options.quality_encoding:
            U.error("must provide a quality encoding (--quality-"
                    "encoding) to filter UMIs by quality (--quality"
                    "-filter-threshold) or mask low quality bases "
                    "with (--quality-filter-mask)")

    if not options.pattern and not options.pattern2:
        if not options.read2_in:
            U.error("Must supply --bc-pattern for single-end")
        else:
            U.error("Must supply --bc-pattern and/or --bc-pattern2 "
                    "if paired-end ")

    if options.pattern2:
        if not options.read2_in:
            U.error("must specify a paired fastq ``--read2-in``")

        # NOTE(review): nesting reconstructed from a source whose
        # indentation was lost. Nested like this the fallback can never
        # fire; confirm whether it was meant to sit one level out.
        if not options.pattern2:
            options.pattern2 = options.pattern

    extract_cell = False
    extract_umi = False

    # If the pattern is a regex we can compile the regex(es) prior to
    # ExtractFilterAndUpdate instantiation
    if options.extract_method == "regex":
        if options.pattern:
            try:
                options.pattern = regex.compile(options.pattern)
            except regex.error:
                U.error("barcode_regex '%s' is not a "
                        "valid regex" % options.pattern)

        if options.pattern2:
            try:
                options.pattern2 = regex.compile(options.pattern2)
            # was `regex.Error`; now spelled like the handler above
            except regex.error:
                U.error("barcode_regex2 '%s' is not a "
                        "valid regex" % options.pattern2)

    # check whether the regex contains a umi group(s) and cell groups(s)
    if options.extract_method == "regex":
        for compiled in (options.pattern, options.pattern2):
            if not compiled:
                continue
            for group in compiled.groupindex:
                if group.startswith("cell_"):
                    extract_cell = True
                elif group.startswith("umi_"):
                    extract_umi = True

    # check whether the pattern string contains umi/cell bases
    elif options.extract_method == "string":
        for pattern in (options.pattern, options.pattern2):
            if not pattern:
                continue
            if "C" in pattern:
                extract_cell = True
            if "N" in pattern:
                extract_umi = True

    if not extract_umi:
        if options.extract_method == "string":
            U.error("barcode pattern(s) do not include any umi bases "
                    "(marked with 'Ns') %s, %s" % (
                        options.pattern, options.pattern2))
        elif options.extract_method == "regex":
            # `%` was missing here, which made this a call on a str
            U.error("barcode regex(es) do not include any umi groups "
                    "(starting with 'umi_') %s, %s" % (
                        options.pattern, options.pattern2))

    # The original tested `options.filter_cell_barcodes`, the mistyped
    # default that is always False, so this validation never ran.
    if options.filter_cell_barcode:

        if not options.whitelist:
            U.error("must provide a whitelist (--whitelist) if using "
                    "--filter-cell-barcode option")

        if not extract_cell:
            if options.extract_method == "string":
                U.error("barcode pattern(s) do not include any cell bases "
                        "(marked with 'Cs') %s, %s" % (
                            options.pattern, options.pattern2))
            elif options.extract_method == "regex":
                # `%` was missing here as well
                U.error("barcode regex(es) do not include any cell groups "
                        "(starting with 'cell_') %s, %s" % (
                            options.pattern, options.pattern2))

    # ---- extractor set-up --------------------------------------------

    read1s = umi_methods.fastqIterate(options.stdin)

    # set up read extractor
    ReadExtractor = umi_methods.ExtractFilterAndUpdate(
        options.extract_method,
        options.pattern,
        options.pattern2,
        options.prime3,
        extract_cell,
        options.quality_encoding,
        options.quality_filter_threshold,
        options.quality_filter_mask,
        options.filter_cell_barcode)

    if options.filter_cell_barcode:
        cell_whitelist, false_to_true_map = (
            umi_methods.getUserDefinedBarcodes(
                options.whitelist, options.error_correct_cell))

        ReadExtractor.cell_whitelist = cell_whitelist
        ReadExtractor.false_to_true_map = false_to_true_map

    if options.blacklist:
        # Only the first tab-separated column of each line is used
        blacklist = set()
        with U.openFile(options.blacklist, "r") as inf:
            for line in inf:
                blacklist.add(line.strip().split("\t")[0])
        ReadExtractor.cell_blacklist = blacklist

    # ---- extraction ---------------------------------------------------

    # variables for progress monitor
    progCount = 0
    displayMax = 100000

    U.info("Starting barcode extraction")

    # Stays None for single-end input so the clean-up below cannot hit
    # an unbound name when --read2-out is given without --read2-in.
    read2_out = None

    if options.read2_in is None:
        for read in read1s:

            # incrementing count for monitoring progress
            progCount += 1

            # Update display in every 100kth iteration
            if progCount % displayMax == 0:
                U.info("Parsed {} reads".format(progCount))

            new_read = ReadExtractor(read)

            # The subset limit is checked against the extractor's own
            # input counter, after the read has been processed.
            if options.reads_subset:
                if (ReadExtractor.read_counts['Input Reads'] >
                        options.reads_subset):
                    break

            if not new_read:
                continue

            options.stdout.write(str(new_read) + "\n")

    else:
        read2s = umi_methods.fastqIterate(U.openFile(options.read2_in))

        if options.read2_out:
            read2_out = U.openFile(options.read2_out, "w")

        # --reconcile-pairs relaxes the read1/read2 pairing
        strict = not options.reconcile

        for read1, read2 in umi_methods.joinedFastqIterate(
                read1s, read2s, strict):

            # incrementing count for monitoring progress
            progCount += 1

            # Update display in every 100kth iteration
            if progCount % displayMax == 0:
                U.info("Parsed {} reads".format(progCount))
                sys.stdout.flush()

            reads = ReadExtractor(read1, read2)

            if options.reads_subset:
                if (ReadExtractor.read_counts['Input Reads'] >
                        options.reads_subset):
                    break

            if not reads:
                continue

            new_read1, new_read2 = reads

            if options.read2_stdout:
                options.stdout.write(str(new_read2) + "\n")
            else:
                options.stdout.write(str(new_read1) + "\n")

                if read2_out is not None:
                    read2_out.write(str(new_read2) + "\n")

    if read2_out is not None:
        read2_out.close()

    # ---- reporting ----------------------------------------------------

    for k, v in ReadExtractor.getReadCounts().most_common():
        U.info("%s: %s" % (k, v))

    U.Stop()
def main(argv=None):
    """Script main: extract UMI / cell barcodes from FASTQ reads.

    Parses command line options in sys.argv, unless *argv* is given,
    validates the barcode pattern(s) and configures the read extractor.
    When cell barcodes are filtered and either no whitelist is supplied
    or error correction is requested, the input is first read once to
    count cell barcodes, then re-opened for the extraction pass.
    Processed reads are written to ``options.stdout`` (and
    ``--read2-out`` for paired input).

    Parameters
    ----------
    argv : list of str, optional
        Command line arguments. Defaults to ``sys.argv``.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-p", "--bc-pattern", dest="pattern", type="string",
                      help="Barcode pattern")
    parser.add_option("--bc-pattern2", dest="pattern2", type="string",
                      help="Barcode pattern for paired reads")
    parser.add_option("--3prime", dest="prime3", action="store_true",
                      help="barcode is on 3' end of read.")
    parser.add_option("--read2-in", dest="read2_in", type="string",
                      help="file name for read pairs")
    parser.add_option("--read2-out", dest="read2_out", type="string",
                      help="file to output processed paired read to")
    parser.add_option("--read2-out-only", dest="read2_out_only",
                      action="store_true",
                      help="Paired reads, only output the second read "
                      "in the pair")
    parser.add_option("--quality-filter-threshold",
                      dest="quality_filter_threshold", type="int",
                      help=("Remove reads where any UMI base quality score "
                            "falls below this threshold"))
    parser.add_option("--quality-filter-mask",
                      dest="quality_filter_mask", type="int",
                      help=("If a UMI base has a quality below this "
                            "threshold, replace the base with 'N'"))
    parser.add_option("--quality-encoding",
                      dest="quality_encoding", type="choice",
                      choices=["phred33", "phred64", "solexa"],
                      help=("Quality score encoding. Choose from 'phred33' "
                            "[33-77] 'phred64' [64-106] or 'solexa' "
                            "[59-106]"))
    parser.add_option("--extract-method",
                      dest="extract_method", type="choice",
                      choices=["string", "regex"],
                      help=("How to extract the umi +/- cell barcodes, "
                            "Choose from 'string' or 'regex'"))
    parser.add_option("--filter-cell-barcode",
                      dest="filter_cell_barcode", action="store_true",
                      help="Filter the cell barcodes")
    parser.add_option("--error-correct-cell",
                      dest="error_correct_cell", action="store_true",
                      help=("Correct errors in the cell barcode"))
    parser.add_option("--error-correct-threshold",
                      dest="error_correct_threshold", type="int",
                      help=("Hamming distance allowed for correction"))
    parser.add_option("--plot-prefix", dest="plot_prefix", type="string",
                      help=("Prefix for plots to visualise the automated "
                            "detection of the number of 'true' cell "
                            "barcodes"))
    parser.add_option("--output-whitelist",
                      dest="output_whitelist", type="string",
                      help=("Write out the automatically generated "
                            "whitelist"))
    parser.add_option("--whitelist-tsv",
                      dest="whitelist_tsv", type="string",
                      help=("A whitelist of accepted cell barcodes"))
    parser.add_option("--blacklist-tsv",
                      dest="blacklist_tsv", type="string",
                      help=("A blacklist of rejected cell barcodes"))
    parser.add_option("--cell-barcode-subset",
                      dest="cell_barcode_subset", type="int",
                      help=("Use only the first N reads to automatically "
                            "identify the true cell barcodes. If N is "
                            "greater than the number of reads, all reads "
                            "will be used"))
    parser.add_option("--reads-subset", dest="reads_subset", type="int",
                      help=("Only extract from the first N reads. If N is "
                            "greater than the number of reads, all reads "
                            "will be used"))
    parser.add_option("--reconcile-pairs",
                      dest="reconcile", action="store_true",
                      help=("Allow the presences of reads in read2 input "
                            "that are not present in read1 input. This "
                            "allows cell barcode filtering of read1s "
                            "without considering read2s"))

    # The default key now matches the --filter-cell-barcode dest
    # (it was previously spelled "filter_cell_barcodes").
    parser.set_defaults(extract_method="string",
                        filter_cell_barcode=False,
                        whitelist_tsv=None,
                        blacklist_tsv=None,
                        error_correct_cell=False,
                        error_correct_threshold=1,
                        pattern=None,
                        pattern2=None,
                        read2_in=None,
                        read2_out=False,
                        read2_out_only=False,
                        quality_filter_threshold=None,
                        quality_encoding=None,
                        plot_prefix=None,
                        output_whitelist=None,
                        cell_barcode_subset=50000000,
                        reconcile=False)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = U.Start(parser, argv=argv)

    # ---- option validation -------------------------------------------

    if options.quality_filter_threshold or options.quality_filter_mask:
        if not options.quality_encoding:
            U.error("must provide a quality encoding (--quality-"
                    "encoding) to filter UMIs by quality (--quality"
                    "-filter-threshold) or mask low quality bases "
                    "with (--quality-filter-mask)")

    if not options.pattern and not options.pattern2:
        if not options.read2_in:
            U.error("Must supply --bc-pattern for single-end")
        else:
            U.error("Must supply --bc-pattern and/or --bc-pattern2 "
                    "if paired-end ")

    if options.pattern2:
        if not options.read2_in:
            U.error("must specify a paired fastq ``--read2-in``")

        # NOTE(review): nesting reconstructed from a source whose
        # indentation was lost. Nested like this the fallback can never
        # fire; confirm whether it was meant to sit one level out.
        if not options.pattern2:
            options.pattern2 = options.pattern

    extract_cell = False
    extract_umi = False

    # If the pattern is a regex we can compile the regex(es) prior to
    # ExtractFilterAndUpdate instantiation
    if options.extract_method == "regex":
        if options.pattern:
            try:
                options.pattern = regex.compile(options.pattern)
            except regex.error:
                U.error("barcode_regex '%s' is not a "
                        "valid regex" % options.pattern)

        if options.pattern2:
            # The original read `options.barcode_regex2`, which no option
            # defines, and caught `regex.Error` rather than `regex.error`.
            try:
                options.pattern2 = regex.compile(options.pattern2)
            except regex.error:
                U.error("barcode_regex2 '%s' is not a "
                        "valid regex" % options.pattern2)

    # check whether the regex contains a umi group(s) and cell groups(s)
    if options.extract_method == "regex":
        for compiled in (options.pattern, options.pattern2):
            if not compiled:
                continue
            for group in compiled.groupindex:
                if group.startswith("cell_"):
                    extract_cell = True
                elif group.startswith("umi_"):
                    extract_umi = True

    # check whether the pattern string contains umi/cell bases
    elif options.extract_method == "string":
        for pattern in (options.pattern, options.pattern2):
            if not pattern:
                continue
            if "C" in pattern:
                extract_cell = True
            if "N" in pattern:
                extract_umi = True

    if options.whitelist_tsv:
        if options.blacklist_tsv:
            U.error("Do not supply a blacklist and a whitelist. Just "
                    "remove the blacklist barcodes from the whitelist!")

    if not extract_umi:
        if options.extract_method == "string":
            U.error("barcode pattern(s) do not include any umi bases "
                    "(marked with 'Ns') %s, %s" % (
                        options.pattern, options.pattern2))
        elif options.extract_method == "regex":
            # `%` was missing here, which made this a call on a str
            U.error("barcode regex(es) do not include any umi groups "
                    "(starting with 'umi_') %s, %s" % (
                        options.pattern, options.pattern2))

    # A counting pass over the reads is needed when barcodes are filtered
    # and the whitelist must be derived or error correction is requested.
    # That pass consumes the input, which is then re-opened by name, so
    # it cannot work on stdin. The original guard missed the
    # whitelist + --error-correct-cell combination.
    needs_counting_pass = bool(
        options.filter_cell_barcode and
        (not options.whitelist_tsv or options.error_correct_cell))

    if options.stdin == sys.stdin:
        if needs_counting_pass:
            U.error(
                "cannot support reading from stdin if correcting cell "
                "barcode")
        read1s = umi_methods.fastqIterate(U.openFile(options.stdin))
    else:
        read1s = umi_methods.fastqIterate(U.openFile(options.stdin.name))

    # ---- extractor set-up --------------------------------------------

    # set up read extractor
    ReadExtractor = umi_methods.ExtractFilterAndUpdate(
        options.extract_method,
        options.pattern,
        options.pattern2,
        options.prime3,
        extract_cell,
        options.quality_encoding,
        options.quality_filter_threshold,
        options.quality_filter_mask,
        options.filter_cell_barcode)

    if options.filter_cell_barcode:

        # Defined up front so --output-whitelist can report counts (as 0)
        # even when no counting pass runs; it was unbound in that case.
        cell_barcode_counts = collections.Counter()

        if needs_counting_pass:
            n_reads = 0

            if not options.read2_in:
                for read1 in read1s:
                    n_reads += 1
                    cell_barcode = ReadExtractor.getCellBarcode(read1)
                    if cell_barcode:
                        cell_barcode_counts[cell_barcode] += 1

                    if options.cell_barcode_subset:
                        if n_reads > options.cell_barcode_subset:
                            break
            else:
                read2s = umi_methods.fastqIterate(
                    U.openFile(options.read2_in))
                for read1, read2 in izip(read1s, read2s):
                    n_reads += 1
                    cell_barcode = ReadExtractor.getCellBarcode(
                        read1, read2)
                    if cell_barcode:
                        cell_barcode_counts[cell_barcode] += 1

                    if options.cell_barcode_subset:
                        if n_reads > options.cell_barcode_subset:
                            break

            if options.blacklist_tsv:
                cell_blacklist = umi_methods.getUserDefinedBarcodes(
                    options.blacklist_tsv)
                for cell in cell_blacklist:
                    # pop() tolerates blacklisted barcodes that were
                    # never observed; `del` raised KeyError for them
                    cell_barcode_counts.pop(cell, None)

            if options.whitelist_tsv:
                cell_whitelist = umi_methods.getUserDefinedBarcodes(
                    options.whitelist_tsv)
                error_correct_mappings = (
                    umi_methods.getErrorCorrectMappings(
                        cell_barcode_counts.keys(), cell_whitelist,
                        options.error_correct_threshold))
            else:
                # getCellWhitelist has not been properly defined yet!
                cell_whitelist, error_correct_mappings = (
                    umi_methods.getCellWhitelist(
                        cell_barcode_counts,
                        options.error_correct_threshold,
                        options.plot_prefix))

            # re-make the reads1s iterator
            read1s = umi_methods.fastqIterate(
                U.openFile(options.stdin.name))

        else:
            cell_whitelist = umi_methods.getUserDefinedBarcodes(
                options.whitelist_tsv)
            error_correct_mappings = None, None

        false_to_true_map, true_to_false_map = error_correct_mappings

        if options.output_whitelist:
            with U.openFile(options.output_whitelist, "w") as outf:

                columns = ["barcode", "count", "corrected_barcodes",
                           "corrected_barcode_counts"]
                outf.write("\t".join(columns) + "\n")

                for barcode in sorted(cell_whitelist):

                    if true_to_false_map:
                        corrected = sorted(true_to_false_map[barcode])
                        corrected_barcodes = ",".join(corrected)
                        corrected_barcode_counts = ",".join(
                            str(cell_barcode_counts[x]) for x in corrected)
                    else:
                        corrected_barcodes = ""
                        corrected_barcode_counts = ""

                    outf.write("%s\t%s\t%s\t%s\n" % (
                        barcode, cell_barcode_counts[barcode],
                        corrected_barcodes, corrected_barcode_counts))

        ReadExtractor.cell_whitelist = cell_whitelist
        ReadExtractor.false_to_true_map = false_to_true_map

    # ---- extraction ---------------------------------------------------

    # Stays None for single-end input so the clean-up below cannot hit
    # an unbound name when --read2-out is given without --read2-in.
    read2_out = None

    if options.read2_in is None:
        for read in read1s:
            new_read = ReadExtractor(read)

            # The subset limit is checked against the extractor's own
            # input counter, after the read has been processed.
            if options.reads_subset:
                if (ReadExtractor.read_counts['Input Reads'] >
                        options.reads_subset):
                    break

            if not new_read:
                continue

            options.stdout.write(str(new_read) + "\n")

    else:
        read2s = umi_methods.fastqIterate(U.openFile(options.read2_in))

        if options.read2_out:
            read2_out = U.openFile(options.read2_out, "w")

        # --reconcile-pairs relaxes the read1/read2 pairing
        strict = not options.reconcile

        for read1, read2 in umi_methods.joinedFastqIterate(
                read1s, read2s, strict):
            reads = ReadExtractor(read1, read2)

            if options.reads_subset:
                if (ReadExtractor.read_counts['Input Reads'] >
                        options.reads_subset):
                    break

            if not reads:
                continue

            new_read1, new_read2 = reads

            if not options.read2_out_only:
                options.stdout.write(str(new_read1) + "\n")

            if read2_out is not None:
                read2_out.write(str(new_read2) + "\n")

    if read2_out is not None:
        read2_out.close()

    # ---- reporting ----------------------------------------------------

    for k, v in ReadExtractor.getReadCounts().most_common():
        U.info("%s: %s" % (k, v))

    U.Stop()