コード例 #1
0
ファイル: whitelist.py プロジェクト: yiyewuqingfeng/UMI-tools
def main(argv=None):
    """Script main for the whitelist command.

    Parses command line options in sys.argv, unless *argv* is given.

    Reads are taken from ``options.stdin`` (and ``--read2-in`` when
    supplied), the cell barcode of each read is extracted and counted
    (either reads or unique UMIs per barcode, see ``--method``), and a
    whitelist is derived from those counts.  One tab-separated line is
    written to ``options.stdout`` per accepted barcode: the barcode, the
    comma-separated barcodes that map to it, the barcode count and the
    comma-separated counts of the mapped barcodes.

    Args:
        argv: Optional list of command line arguments; defaults to
            ``sys.argv``.

    Raises:
        ValueError: if ``--set-cell-number`` asks for more cell barcodes
            than were observed.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=usage,
                            description=globals()["__doc__"])

    group = U.OptionGroup(parser, "whitelist-specific options")

    group.add_option("--plot-prefix",
                     dest="plot_prefix", type="string",
                     help=("Prefix for plots to visualise the automated "
                           "detection of the number of 'true' cell barcodes"))
    group.add_option("--subset-reads",
                     dest="subset_reads", type="int",
                     help=("Use the first N reads to automatically identify "
                           "the true cell barcodes. If N is greater than the "
                           "number of reads, all reads will be used. "
                           "Default is 100,000,000"))
    group.add_option("--error-correct-threshold",
                     dest="error_correct_threshold",
                     type="int",
                     help=("Hamming distance for correction of barcodes to "
                           "whitelist barcodes. This value will also be used "
                           "for error detection above the knee if required "
                           "(--ed-above-threshold)"))
    group.add_option("--method",
                     dest="method",
                     choices=["reads", "umis"],
                     help=("Use reads or unique umi counts per cell"))
    group.add_option("--knee-method",
                     dest="knee_method",
                     choices=["distance", "density"],
                     help=("Use distance or density methods for detection of knee"))
    group.add_option("--expect-cells",
                     dest="expect_cells",
                     type="int",
                     help=("Prior expectation on the upper limit on the "
                           "number of cells sequenced"))
    group.add_option("--allow-threshold-error",
                     dest="allow_threshold_error", action="store_true",
                     help=("Don't select a threshold. Will still "
                           "output the plots if requested (--plot-prefix)"))
    group.add_option("--set-cell-number",
                     dest="cell_number",
                     type="int",
                     help=("Specify the number of cell barcodes to accept"))

    parser.add_option("--ed-above-threshold",
                      dest="ed_above_threshold", type="choice",
                      choices=["discard", "correct"],
                      help=("Detect CBs above the threshold which may be "
                            "sequence errors from another CB and either "
                            "'discard' or 'correct'. Default=discard"))
    parser.add_option_group(group)

    parser.set_defaults(method="reads",
                        knee_method="distance",
                        extract_method="string",
                        whitelist_tsv=None,
                        blacklist_tsv=None,
                        error_correct_threshold=1,
                        pattern=None,
                        pattern2=None,
                        read2_in=None,
                        plot_prefix=None,
                        subset_reads=100000000,
                        expect_cells=False,
                        allow_threshold_error=False,
                        cell_number=False,
                        ed_above_threshold=None,
                        ignore_suffix=False)

    # add common options (-h/--help, ...) and parse command line

    (options, args) = U.Start(parser, argv=argv,
                              add_extract_options=True,
                              add_group_dedup_options=False,
                              add_umi_grouping_options=False,
                              add_sam_options=False)

    if options.filtered_out and options.extract_method != "regex":
        U.error("Reads will not be filtered unless extract method is "
                "set to regex (--extract-method=regex)")

    if options.expect_cells:
        if options.knee_method == "distance":
            U.error("Cannot use --expect-cells with 'distance' knee "
                    "method. Switch to --knee-method=density if you want to "
                    "provide an expectation for the number of "
                    "cells. Alternatively, if you know the number of cell "
                    "barcodes, use --set-cell-number")
        if options.cell_number:
            U.error("Cannot supply both --expect-cells and "
                    "--set-cell-number options")

    extract_cell, extract_umi = U.validateExtractOptions(options)

    if not extract_cell:
        if options.extract_method == "string":
            U.error("barcode pattern(s) do not include any cell bases "
                    "(marked with 'Cs') %s, %s" % (
                        options.pattern, options.pattern2))
        elif options.extract_method == "regex":
            # the '%' operator is required here; without it the string
            # literal is called like a function and raises TypeError
            U.error("barcode regex(es) do not include any cell groups "
                    "(starting with 'cell_') %s, %s" % (
                        options.pattern, options.pattern2))

    read1s = umi_methods.fastqIterate(options.stdin)

    # set up read extractor
    ReadExtractor = extract_methods.ExtractFilterAndUpdate(
        method=options.extract_method,
        pattern=options.pattern,
        pattern2=options.pattern2,
        prime3=options.prime3,
        extract_cell=extract_cell)

    cell_barcode_counts = collections.Counter()

    n_reads = 0
    n_cell_barcodes = 0

    # if using the umis method, need to keep a set of umis observed
    if options.method == "umis":
        cell_barcode_umis = collections.defaultdict(set)

    # variables for progress monitor
    displayMax = 100000
    U.info("Starting barcode extraction")

    # Output handles for reads that fail extraction. Both are bound up
    # front so the clean-up at the end never touches an undefined name
    # (e.g. --filtered-out2 supplied without --read2-in).
    filtered_out = None
    filtered_out2 = None

    if options.filtered_out:
        filtered_out = U.openFile(options.filtered_out, "w")

    if not options.read2_in:
        for read1 in read1s:

            # Update display in every 100kth iteration
            if n_reads % displayMax == 0:
                U.info("Parsed {} reads".format(n_reads))

            n_reads += 1
            barcode_values = ReadExtractor.getBarcodes(read1)
            if barcode_values is None:
                if filtered_out is not None:
                    filtered_out.write(str(read1) + "\n")
                continue

            cell, umi, _, _, _, _, _ = barcode_values
            if options.method == "umis":
                cell_barcode_umis[cell].add(umi)
            else:
                cell_barcode_counts[cell] += 1
            n_cell_barcodes += 1

            # NOTE(review): the single-end path stops on the number of
            # reads with a barcode, the paired-end path below stops on
            # the number of reads parsed - confirm this is intended.
            if options.subset_reads:
                if n_cell_barcodes > options.subset_reads:
                    break
    else:

        if options.filtered_out2:
            filtered_out2 = U.openFile(options.filtered_out2, "w")

        read2s = umi_methods.fastqIterate(U.openFile(options.read2_in))
        for read1, read2 in izip(read1s, read2s):

            # Update display in every 100kth iteration
            if n_reads % displayMax == 0:
                U.info("Parsed {} reads".format(n_reads))

            n_reads += 1

            barcode_values = ReadExtractor.getBarcodes(read1, read2)
            if barcode_values is None:
                if filtered_out is not None:
                    filtered_out.write(str(read1) + "\n")
                if filtered_out2 is not None:
                    filtered_out2.write(str(read2) + "\n")
                continue

            cell, umi, _, _, _, _, _ = barcode_values
            if options.method == "umis":
                cell_barcode_umis[cell].add(umi)
            else:
                cell_barcode_counts[cell] += 1
            n_cell_barcodes += 1

            if options.subset_reads:
                if n_reads > options.subset_reads:
                    break

    U.info("Starting - whitelist determination")

    # with the umis method the count per cell is the number of distinct
    # UMIs observed for that cell
    if options.method == "umis":
        for cell in cell_barcode_umis:
            cell_barcode_counts[cell] = len(cell_barcode_umis[cell])

    if options.cell_number and options.cell_number > len(cell_barcode_counts):
        # NOTE(review): the second value reported is the --subset-reads
        # limit, not n_reads - confirm which one is meant.
        raise ValueError(
            "--set-cell-number option specifies more cell barcodes than the "
            "number of observed cell barcodes. This may be because "
            "--subset-reads was set to a value too low to capture reads from "
            "all cells. %s cell barcodes observed from %s parsed reads. "
            "Expected>= %s cell barcodes" % (
                len(cell_barcode_counts),
                options.subset_reads,
                options.cell_number))

    cell_whitelist, true_to_false_map = whitelist_methods.getCellWhitelist(
        cell_barcode_counts,
        options.knee_method,
        options.expect_cells,
        options.cell_number,
        options.error_correct_threshold,
        options.plot_prefix)

    if cell_whitelist:
        U.info("Top %s cell barcodes passed the selected threshold" %
               len(cell_whitelist))

    if options.ed_above_threshold:
        cell_whitelist, true_to_false_map = whitelist_methods.errorDetectAboveThreshold(
            cell_barcode_counts,
            cell_whitelist,
            true_to_false_map,
            errors=options.error_correct_threshold,
            resolution_method=options.ed_above_threshold)

    if cell_whitelist:
        U.info("Writing out whitelist")
        total_correct_barcodes = 0
        total_corrected_barcodes = 0
        for barcode in sorted(cell_whitelist):

            total_correct_barcodes += cell_barcode_counts[barcode]

            if true_to_false_map:
                # barcodes mapped to this whitelisted barcode, in a
                # stable order shared by the names and their counts
                false_barcodes = sorted(true_to_false_map[barcode])
                corrected_barcodes = ",".join(false_barcodes)

                correct_barcode_counts = [cell_barcode_counts[x]
                                          for x in false_barcodes]
                total_corrected_barcodes += sum(correct_barcode_counts)

                corrected_barcode_counts = ",".join(
                    map(str, correct_barcode_counts))
            else:
                corrected_barcodes, corrected_barcode_counts = "", ""

            options.stdout.write("%s\t%s\t%s\t%s\n" % (
                barcode, corrected_barcodes, cell_barcode_counts[barcode],
                corrected_barcode_counts))
    else:
        msg = ("No local minima was accepted. Recommend checking the plot "
               "output and counts per local minima (requires `--plot-prefix` "
               "option) and then re-running with manually selected threshold "
               "(`--set-cell-number` option)")

        if options.allow_threshold_error:
            U.info(msg)
        else:
            U.error(msg)

    U.info("Parsed %i reads" % n_reads)
    U.info("%i reads matched the barcode pattern" % n_cell_barcodes)
    U.info("Found %i unique cell barcodes" % len(cell_barcode_counts))

    if cell_whitelist:
        U.info("Found %i total reads matching the selected cell barcodes" %
               total_correct_barcodes)
        U.info("Found %i total reads which can be error corrected to the "
               "selected cell barcodes" % total_corrected_barcodes)

    # only close the handles that were actually opened
    if filtered_out is not None:
        filtered_out.close()
    if filtered_out2 is not None:
        filtered_out2.close()

    U.Stop()
コード例 #2
0
ファイル: extract.py プロジェクト: xcit10/UMI-tools
def main(argv=None):
    """Script main for the extract command.

    Parses command line options in sys.argv, unless *argv* is given.

    Reads are taken from ``options.stdin`` (and ``--read2-in`` when
    supplied) and passed through the read extractor.  Reads for which
    the extractor returns a result are written to ``options.stdout``
    (and ``--read2-out``); reads for which it returns nothing are
    written to the ``--filtered-out`` files when those were requested.
    The extractor's read counts are logged at the end.

    Args:
        argv: Optional list of command line arguments; defaults to
            ``sys.argv``.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=usage,
                            description=globals()["__doc__"])

    group = U.OptionGroup(parser, "extract-specific options")

    # (Experimental option) Retain the UMI in the sequence read"
    group.add_option("--retain-umi", dest="retain_umi", action="store_true",
                     help=optparse.SUPPRESS_HELP)
    group.add_option("--read2-out", dest="read2_out", type="string",
                     help="file to output processed paired read to")
    group.add_option("--read2-stdout", dest="read2_stdout",
                     action="store_true",
                     help="Paired reads, send read2 to stdout, discarding read1")
    group.add_option("--quality-filter-threshold",
                     dest="quality_filter_threshold", type="int",
                     help=("Remove reads where any UMI base quality score "
                           "falls below this threshold"))
    group.add_option("--quality-filter-mask",
                     dest="quality_filter_mask", type="int",
                     help=("If a UMI base has a quality below this threshold, "
                           "replace the base with 'N'"))
    group.add_option("--quality-encoding",
                     dest="quality_encoding", type="choice",
                     choices=["phred33", "phred64", "solexa"],
                     help=("Quality score encoding. Choose from 'phred33' "
                           "[33-77] 'phred64' [64-106] or 'solexa' [59-106]"))
    group.add_option("--filter-cell-barcode",
                     dest="filter_cell_barcode",
                     action="store_true",
                     help=optparse.SUPPRESS_HELP)
    group.add_option("--error-correct-cell",
                     dest="error_correct_cell",
                     action="store_true",
                     help=("Correct errors in the cell barcode"))
    group.add_option("--whitelist",
                     dest="whitelist", type="string",
                     help=("A whitelist of accepted cell barcodes"))
    group.add_option("--blacklist",
                     dest="blacklist", type="string",
                     help=("A blacklist of rejected cell barcodes"))
    # The help texts of the following options are suppressed; the
    # intended wording is kept in the comment next to each option.
    group.add_option("--filter-umi",
                     dest="filter_umi",
                     action="store_true",
                     # Filter the UMIs
                     help=optparse.SUPPRESS_HELP)
    group.add_option("--umi-whitelist", dest="umi_whitelist",
                     type="string", default=None,
                     # A whitelist of accepted UMIs
                     help=optparse.SUPPRESS_HELP)
    group.add_option("--umi-whitelist-paired", dest="umi_whitelist_paired",
                     type="string", default=None,
                     # A whitelist of accepted UMIs for read2
                     help=optparse.SUPPRESS_HELP)
    group.add_option("--correct-umi-threshold", dest="correct_umi_threshold",
                     type="int", default=0,
                     # Correct errors in UMIs to the whitelist(s) provided
                     # if within threshold
                     help=optparse.SUPPRESS_HELP)
    group.add_option("--umi-correct-log", dest="umi_correct_log",
                     type="string", default=None,
                     # File logging UMI error correction
                     help=optparse.SUPPRESS_HELP)
    group.add_option("--subset-reads", "--reads-subset",
                     dest="reads_subset", type="int",
                     help=("Only extract from the first N reads. If N is "
                           "greater than the number of reads, all reads will "
                           "be used"))
    group.add_option("--reconcile-pairs",
                     dest="reconcile", action="store_true",
                     help=("Allow the presences of reads in read2 input that "
                           "are not present in read1 input. This allows cell "
                           "barcode filtering of read1s without "
                           "considering read2s"))
    parser.add_option_group(group)

    group = U.OptionGroup(parser, "[EXPERIMENTAl] barcode extraction options")

    group.add_option("--either-read", dest="either_read", action="store_true",
                     help="UMI may be on either read (see "
                     "--either-read-resolve) for options to resolve cases where "
                     "UMI is on both reads")
    group.add_option("--either-read-resolve",
                     dest="either_read_resolve", type="choice",
                     choices=["discard", "quality"],
                     help=("How to resolve instances where both reads "
                           "contain a UMI but using --either-read. "
                           "Choose from 'discard' or 'quality' "
                           "(use highest quality). default=discard"))

    parser.add_option_group(group)

    # the default key must match the option dest (filter_cell_barcode)
    parser.set_defaults(extract_method="string",
                        filter_cell_barcode=False,
                        whitelist=None,
                        blacklist=None,
                        error_correct_cell=False,
                        pattern=None,
                        pattern2=None,
                        read2_in=None,
                        read2_out=False,
                        read2_stdout=False,
                        quality_filter_threshold=None,
                        quality_encoding=None,
                        reconcile=False,
                        either_read=False,
                        either_read_resolve="discard",
                        ignore_suffix=False)

    # add common options (-h/--help, ...) and parse command line

    (options, args) = U.Start(parser, argv=argv,
                              add_extract_options=True,
                              add_group_dedup_options=False,
                              add_umi_grouping_options=False,
                              add_sam_options=False)

    if options.filter_cell_barcode:
        U.info('Use of --whitelist ensures cell barcodes are filtered. '
               '--filter-cell-barcode is no longer required and may be '
               'removed in future versions.')

    # supplying a whitelist switches cell barcode filtering on
    if options.whitelist is not None:
        options.filter_cell_barcode = True

    if options.retain_umi and options.extract_method != "regex":
        U.error("option --retain-umi only works with --extract-method=regex")

    # the whitelist lives on the options object; a bare `whitelist`
    # name is not defined in this scope
    if (options.filtered_out and options.extract_method != "regex" and
            options.whitelist is None):
        U.error("Reads will not be filtered unless extract method is "
                "set to regex (--extract-method=regex) or cell "
                "barcodes are filtered (--whitelist)")

    if options.quality_filter_threshold or options.quality_filter_mask:
        if not options.quality_encoding:
            U.error("must provide a quality encoding (--quality-"
                    "encoding) to filter UMIs by quality (--quality"
                    "-filter-threshold) or mask low quality bases "
                    "with (--quality-filter-mask)")

    extract_cell, extract_umi = U.validateExtractOptions(options)

    if options.either_read:
        if extract_cell:
            U.error("Option to extract from either read (--either-read) "
                    "is not currently compatible with cell barcode extraction")
        if options.extract_method != "regex":
            U.error("Option to extract from either read (--either-read) "
                    "requires --extract-method=regex")
        if not options.pattern or not options.pattern2:
            U.error("Option to extract from either read (--either-read) "
                    "requires --bc-pattern=[PATTERN1] and "
                    "--bc-pattern2=[PATTERN2]")

    if options.filter_umi:

        if not options.umi_whitelist:
            U.error("must provide a UMI whitelist (--umi-whitelist) if using "
                    "--filter-umi option")
        if options.pattern2 and not options.umi_whitelist_paired:
            U.error("must provide a UMI whitelist for paired end "
                    "(--umi-whitelist-paired) if using --filter-umi option "
                    "with paired end data")
        if not extract_umi:
            if options.extract_method == "string":
                U.error("barcode pattern(s) do not include any umi bases "
                        "(marked with 'Ns') %s, %s" % (
                            options.pattern, options.pattern2))
            elif options.extract_method == "regex":
                # '%' is required; without it the string is called
                U.error("barcode regex(es) do not include any umi groups "
                        "(starting with 'umi_') %s, %s" % (
                            options.pattern, options.pattern2))

    if options.whitelist:

        if not extract_cell:
            if options.extract_method == "string":
                U.error("barcode pattern(s) do not include any cell bases "
                        "(marked with 'Cs') %s, %s" % (
                            options.pattern, options.pattern2))
            elif options.extract_method == "regex":
                # '%' is required; without it the string is called
                U.error("barcode regex(es) do not include any cell groups "
                        "(starting with 'cell_') %s, %s" % (
                            options.pattern, options.pattern2))

    read1s = umi_methods.fastqIterate(options.stdin)

    # set up read extractor
    ReadExtractor = extract_methods.ExtractFilterAndUpdate(
        options.extract_method,
        options.pattern,
        options.pattern2,
        options.prime3,
        extract_cell,
        options.quality_encoding,
        options.quality_filter_threshold,
        options.quality_filter_mask,
        options.filter_umi,
        options.filter_cell_barcode,
        options.retain_umi,
        options.either_read,
        options.either_read_resolve)

    if options.filter_umi:
        umi_whitelist, false_to_true_map = whitelist_methods.getUserDefinedBarcodes(
            options.umi_whitelist,
            options.umi_whitelist_paired,
            deriveErrorCorrection=True,
            threshold=options.correct_umi_threshold)

        U.info("Length of whitelist: %i" % len(umi_whitelist))
        U.info("Length of 'correctable' whitelist: %i" % len(false_to_true_map))

        ReadExtractor.umi_whitelist = umi_whitelist
        ReadExtractor.umi_false_to_true_map = false_to_true_map
        ReadExtractor.umi_whitelist_counts = collections.defaultdict(
            lambda: collections.Counter())

    if options.whitelist:
        cell_whitelist, false_to_true_map = whitelist_methods.getUserDefinedBarcodes(
            options.whitelist,
            getErrorCorrection=options.error_correct_cell)

        ReadExtractor.cell_whitelist = cell_whitelist
        ReadExtractor.false_to_true_map = false_to_true_map

    if options.blacklist:
        # the first tab-separated column of each line is the barcode
        blacklist = set()
        with U.openFile(options.blacklist, "r") as inf:
            for line in inf:
                blacklist.add(line.strip().split("\t")[0])
        ReadExtractor.cell_blacklist = blacklist

    # variables for progress monitor
    progCount = 0
    displayMax = 100000
    U.info("Starting barcode extraction")

    # Optional output handles. All are bound up front so the clean-up
    # at the end never touches an undefined name (e.g. --read2-out or
    # --filtered-out2 supplied without --read2-in).
    filtered_out = None
    filtered_out2 = None
    read2_out = None

    if options.filtered_out:
        filtered_out = U.openFile(options.filtered_out, "w")

    if options.read2_in is None:
        for read in read1s:

            # incrementing count for monitoring progress
            progCount += 1

            # Update display in every 100kth iteration
            if progCount % displayMax == 0:
                U.info("Parsed {} reads".format(progCount))

            new_read = ReadExtractor(read)

            if options.reads_subset:
                if (ReadExtractor.read_counts['Input Reads'] >
                        options.reads_subset):
                    break

            if not new_read:
                if filtered_out is not None:
                    filtered_out.write(str(read) + "\n")
                continue

            options.stdout.write(str(new_read) + "\n")

    else:

        if options.filtered_out2:
            filtered_out2 = U.openFile(options.filtered_out2, "w")

        read2s = umi_methods.fastqIterate(U.openFile(options.read2_in))

        if options.read2_out:
            read2_out = U.openFile(options.read2_out, "w")

        # --reconcile-pairs turns off strict pairing of the two inputs
        strict = not options.reconcile

        for read1, read2 in umi_methods.joinedFastqIterate(
                read1s, read2s, strict, options.ignore_suffix):

            # incrementing count for monitoring progress
            progCount += 1

            # Update display in every 100kth iteration
            if progCount % displayMax == 0:
                U.info("Parsed {} reads".format(progCount))
                sys.stdout.flush()

            reads = ReadExtractor(read1, read2)

            if options.reads_subset:
                if (ReadExtractor.read_counts['Input Reads'] >
                        options.reads_subset):
                    break

            if not reads:
                if filtered_out is not None:
                    filtered_out.write(str(read1) + "\n")
                if filtered_out2 is not None:
                    filtered_out2.write(str(read2) + "\n")
                continue

            new_read1, new_read2 = reads

            if options.read2_stdout:
                options.stdout.write(str(new_read2) + "\n")
            else:
                options.stdout.write(str(new_read1) + "\n")

                if read2_out is not None:
                    read2_out.write(str(new_read2) + "\n")

    # only close the handles that were actually opened
    if read2_out is not None:
        read2_out.close()
    if filtered_out is not None:
        filtered_out.close()
    if filtered_out2 is not None:
        filtered_out2.close()

    for k, v in ReadExtractor.getReadCounts().most_common():
        U.info("%s: %s" % (k, v))

    if options.umi_correct_log:
        # NOTE(review): umi_whitelist_counts is only assigned above when
        # --filter-umi is set - confirm the extractor defines it otherwise.
        # The with-block closes the file; no explicit close is needed.
        with U.openFile(options.umi_correct_log, "w") as outf:
            outf.write("umi\tcount_no_errors\tcount_errors\n")
            for umi, counts in ReadExtractor.umi_whitelist_counts.items():
                outf.write("%s\t%i\t%i\n" % (
                    umi, counts["no_error"], counts["error"]))

    U.Stop()