Python Utilities Examples

Programming Language: Python

Namespace/Package Name: umi_tools

Class/Type: Utilities

Examples at hotexamples.com: 38

Python Utilities - 38 examples found. These are the top-rated real-world Python examples of umi_tools.Utilities, extracted from open-source projects. You can rate the examples to help us improve their quality.

Frequently Used Methods

Show Hide

info(26)

openFile(21)

OptionParser(17)

Stop(17)

Start(16)

OptionGroup(7)

error(7)

debug(6)

warn(5)

getTempFilename(3)

validateSamOptions(3)

validateExtractOptions(2)

Example #1

Show file

    def fill(self):
        '''Count UMIs over the input BAM and turn the counts into
        per-UMI frequencies.

        Unmapped alignments and read2 alignments are skipped, as is any
        alignment for which updating the UMI tally raises KeyError.
        '''
        self.frequency2umis = collections.defaultdict(list)

        for aln in self.inbam:
            if aln.is_unmapped or aln.is_read2:
                continue

            try:
                self.umis[self.barcode_getter(aln)[0]] += 1
            except KeyError:
                continue

        self.umis_counter = collections.Counter(self.umis)
        tallies = self.umis_counter.values()
        total_umis = sum(tallies)
        U.info("total_umis %i" % total_umis)
        U.info("#umis %i" % len(self.umis_counter))

        # normalise every count by the grand total
        self.prob = [float(tally) / total_umis for tally in tallies]
        self.refill_random()

Example #2

Show file

    def write_mates(self):
        '''Look through the current chromosome for the mates of the reads
        held in the read1s buffer and write each one found to the output.

        When read2tags is set, the stored (unique_id, umi) pair for the
        mate is removed from it and appended to the mate as 'UG' and 'FU'
        tags before writing.
        '''
        if self.chrom is not None:
            U.debug("Dumping %i mates for contig %s" %
                    (len(self.read1s), self.chrom))

        alignments = self.infile.fetch(reference=self.chrom,
                                       multiple_iterators=True)
        for aln in alignments:
            if aln.is_unmapped or aln.mate_is_unmapped or aln.is_read1:
                continue

            mate_key = (aln.query_name, aln.reference_name,
                        aln.reference_start)
            if mate_key not in self.read1s:
                continue

            if self.read2tags is not None:
                unique_id, umi = self.read2tags[mate_key]
                self.read2tags.pop(mate_key)

                aln.tags += [('UG', unique_id)]
                aln.tags += [('FU', umi)]

            self.outfile.write(aln)
            self.read1s.remove(mate_key)

        U.debug("%i mates remaining" % len(self.read1s))

Example #3

Show file

File: network.py Project: bdemaree/UMI-tools

def breadth_first_search_recursive(node, adj_list):
    '''Run the recursive search from *node*, falling back to
    breadth_first_search when the recursion limit is hit.'''
    try:
        # the recursive helper accumulates into an attribute on itself
        recursive_search.component = {node}
        component = recursive_search(node, adj_list)
    except RecursionError as exc:
        U.info('Recursion Error: %s' % exc)
        component = breadth_first_search(node, adj_list)

    return component

Example #4

Show file

File: network.py Project: popucui/UMI-tools

def breadth_first_search_recursive(node, adj_list):
    '''Search from *node* with recursive_search; if that exhausts the
    recursion limit, log it and use breadth_first_search instead.'''
    try:
        # seed the accumulator kept as an attribute of the helper
        recursive_search.component = {node}
        found = recursive_search(node, adj_list)
    except RecursionError as err:
        U.info('Recursion Error: %s' % err)
        found = breadth_first_search(node, adj_list)

    return found

Example #5

Show file

File: extract_methods.py Project: xcit10/UMI-tools

def ExtractBarcodes(read,
                    match,
                    extract_umi=False,
                    extract_cell=False,
                    discard=False,
                    retain_umi=False):
    '''Extract the cell and umi barcodes using a regex.match object

    inputs:

    - read = Record object; its ``seq`` and ``quals`` attributes are read
    - match = regex.match object
    - extract_umi and extract_cell = switches to determine whether these
                                     barcodes should be extracted
    - discard = is there a region(s) of the sequence which should be
      discarded entirely?
    - retain_umi = Should UMI sequence be retained on the read sequence

    Named groups are processed in sorted name order. Groups whose name
    starts with "cell_", "umi_" or "discard_" contribute to the cell
    barcode, the UMI and the discarded region respectively.

    returns:

        - cell_barcode = Cell barcode string
        - cell_quals = Cell barcode quality scores
        - umi = UMI barcode string.
        - umi_quals = UMI barcode quality scores
        - new_seq = Read sequence after extraction
        - new_quals = Read qualities after extraction

    The barcode strings are empty where extract_cell or extract_umi
    are false. The quality strings and the new sequence/qualities are
    whatever extractSeqAndQuals returns for the collected positions.

    '''
    cell_barcode, umi = "", ""

    if not extract_cell and not extract_umi:
        U.error("must set either extract_cell and/or extract_umi to true")

    groupdict = match.groupdict()
    cell_bases = set()
    umi_bases = set()
    discard_bases = set()
    for k in sorted(groupdict):
        span = match.span(k)
        if extract_cell and k.startswith("cell_"):
            cell_barcode += groupdict[k]
            cell_bases.update(range(span[0], span[1]))
        elif extract_umi and k.startswith("umi_"):
            umi += groupdict[k]
            umi_bases.update(range(span[0], span[1]))
        elif discard and k.startswith("discard_"):
            discard_bases.update(range(span[0], span[1]))

    new_seq, new_quals, umi_quals, cell_quals = extractSeqAndQuals(
        read.seq, read.quals, umi_bases, cell_bases, discard_bases, retain_umi)

    # Return the cell qualities that were actually extracted; previously a
    # placeholder that was never updated (always "") was returned instead.
    return (cell_barcode, cell_quals, umi, umi_quals, new_seq,
            new_quals)

Example #6

Show file

def getErrorCorrectMapping(cell_barcodes, whitelist, threshold=1):
    '''Map each whitelist barcode to the non-whitelist barcodes that lie
    within *threshold* of it and of no other whitelist barcode.

    Barcodes already on the whitelist, barcodes with no whitelist
    neighbour and barcodes with more than one whitelist neighbour are
    all left out of the mapping.
    '''

    # The distance is defined here in plain python (the original author
    # reported unexpected results with the cythonised version).
    def mismatches(first, second):
        '''Hamming distance; infinite for barcodes of unequal length.'''
        if len(first) != len(second):
            return np.inf
        return sum([a != b for a, b in zip(first, second)])

    whitelist = {str(entry) for entry in whitelist}
    true_to_false = collections.defaultdict(set)

    U.info('building bktree')
    tree = pybktree.BKTree(mismatches, whitelist)
    U.info('done building bktree')

    for barcode in cell_barcodes:
        # whitelisted barcodes need no correction
        if barcode in whitelist:
            continue

        neighbours = [
            member for dist, member in tree.find(barcode, threshold)
            if dist > 0
        ]

        # zero neighbours: nothing to assign to; several: ambiguous
        if len(neighbours) == 1:
            true_to_false[neighbours[0]].add(barcode)

    return true_to_false

Example #7

Show file

File: count_tab.py Project: bdemaree/UMI-tools

def main(argv=None):
    """script main.

    Parses command line options in sys.argv, unless *argv* is given,
    then writes a two-column "gene"/"count" table to options.stdout
    where each count is the number of UMI groups found for that gene.
    """

    argv = sys.argv if argv is None else argv

    # command line parser; common options are added by U.Start
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])
    (options, args) = U.Start(parser, argv=argv, add_group_dedup_options=False)

    n_input = 0
    n_output = 0

    # how the umi is pulled out of each read identifier
    umi_getter = partial(
        umi_methods.get_umi_read_string, sep=options.umi_sep)

    options.stdout.write("%s\t%s\n" % ("gene", "count"))

    # clustering functor configured for options.method
    processor = network.UMIClusterer(options.method)

    per_gene = umi_methods.get_gene_count_tab(
        options.stdin,
        umi_getter=umi_getter)

    for gene, counts in per_gene:
        n_input += sum(counts.values())

        groups = processor(
            counts.keys(),
            counts,
            threshold=options.threshold)

        n_groups = len(groups)
        options.stdout.write("%s\t%i\n" % (gene, n_groups))
        n_output += n_groups

    U.info("Number of reads counted: %i" % n_output)

    U.Stop()

Example #8

Show file

    def __init__(self, options):
        '''Set up the UMI clusterer and, when options.filter_umi is set,
        load the UMI whitelist and an empty counter for it.'''

        self.UMIClusterer = UMIClusterer(cluster_method=options.method)

        if not options.filter_umi:
            self.umi_whitelist = None
            return

        # only the first element of the returned tuple is kept
        loaded = whitelist_methods.getUserDefinedBarcodes(
            options.umi_whitelist,
            options.umi_whitelist_paired,
            deriveErrorCorrection=False)
        self.umi_whitelist = loaded[0]
        self.umi_whitelist_counts = collections.Counter()

        U.info("Length of UMI whitelist: %i" % len(self.umi_whitelist))

Example #9

Show file

File: network.py Project: zorrodong/UMI-tools

    def __call__(self, umis, counts):
        '''Group *umis* and return the groups as a list of lists.

        *counts* is a dictionary that maps UMIs to their counts. A
        warning is issued when the UMIs are not all the same length.
        '''

        lengths = [len(umi) for umi in umis]
        longest, shortest = max(lengths), min(lengths)
        if longest != shortest:
            U.warn("not all umis are the same length(!):  %d - %d" % (
                shortest, longest))

        adj_list = self.get_adj_list(umis, counts)
        clusters = self.get_connected_components(umis, adj_list, counts)
        groups = self.get_groups(clusters, adj_list, counts)

        return [list(group) for group in groups]

Example #10

Show file

File: network.py Project: bdemaree/UMI-tools

    def __call__(self, umis, counts):
        '''Cluster *umis* into groups, returned as a list of lists.

        *counts* is a dictionary mapping each UMI to its count. UMIs of
        differing lengths trigger a warning but are still processed.
        '''

        umi_lengths = [len(u) for u in umis]
        max_len, min_len = max(umi_lengths), min(umi_lengths)
        if max_len != min_len:
            U.warn("not all umis are the same length(!):  %d - %d" % (
                min_len, max_len))

        adj_list = self.get_adj_list(umis, counts)
        components = self.get_connected_components(umis, adj_list, counts)

        return [list(members) for members in
                self.get_groups(components, adj_list, counts)]

Example #11

Show file

File: whitelist_methods.py Project: rstatistics/UMI-tools

 def singleBarcodeGenerator(whitelist_tsv):
     '''Yield the first tab-separated field of every line of
     *whitelist_tsv* that does not start with '#'.'''
     with U.openFile(whitelist_tsv, "r") as handle:
         for record in handle:
             if not record.startswith('#'):
                 yield record.strip().split("\t")[0]

Example #12

Show file

File: whitelist_methods.py Project: AbateLab/DAb-seq

def errorDetectAboveThreshold(cell_barcode_counts,
                              cell_whitelist,
                              true_to_false_map,
                              errors=1,
                              resolution_method="discard"):
    '''Find whitelisted cell barcodes (CBs) that look like errors of
    another, more abundant whitelisted CB and remove them.

    The whitelist is sorted by ascending count and each CB is compared,
    via checkError, with the CBs that sort after it. Any CB with at
    least one near miss is removed from the returned whitelist. With
    resolution_method="correct", a CB with exactly one near miss that
    matches it allowing only substitutions is additionally recorded in
    the returned true-to-false map under that near miss.

    inputs:

    - cell_barcode_counts = mapping from CB to its count (sort key)
    - cell_whitelist = iterable of whitelisted CBs
    - true_to_false_map = mapping from true CB to a set of error CBs;
      it is deep-copied, not modified
    - errors = error allowance passed to checkError and used as the
      substitution limit in the fuzzy regex
    - resolution_method = "discard" or "correct"

    returns:

        - cell_whitelist = set of CBs remaining on the whitelist
        - new_true_to_false_map = the (possibly extended) copy of the map
    '''

    assert resolution_method in [
        "discard", "correct"
    ], ("resolution method must be discard or correct")

    error_counter = collections.Counter()

    new_true_to_false_map = copy.deepcopy(true_to_false_map)

    discard_cbs = set()

    cell_whitelist = list(cell_whitelist)
    cell_whitelist.sort(key=lambda x: cell_barcode_counts[x])

    for ix, cb in enumerate(cell_whitelist):

        near_misses = checkError(cb, cell_whitelist[ix + 1:], errors=errors)

        if len(near_misses) == 0:
            # nothing similar among the later CBs: keep it
            continue

        discard_cbs.add(cb)  # Will always discard CB from cell_whitelist

        if len(near_misses) > 1:
            # Ambiguous: could be an error of several other CBs. This
            # counter was previously referenced but never incremented,
            # so the corresponding log line always reported 0.
            error_counter["error_discarded_mt_1"] += 1

        elif resolution_method == "correct":

            # Only correct substitutions as INDELs will also mess
            # up UMI so simple correction of CB is insufficient
            if regex.match("(%s){s<=%i}" % (cb, errors), near_misses[0]):
                # add corrected barcode to T:F map
                new_true_to_false_map[near_misses[0]].add(cb)
                error_counter["substitution_corrected"] += 1
            else:
                error_counter["indel_discarded"] += 1

    if resolution_method == "correct":
        U.info(
            "CBs above the knee corrected due to possible substitutions: %i" %
            error_counter["substitution_corrected"])
        U.info("CBs above the knee discarded due to possible INDELs: %i" %
               error_counter["indel_discarded"])
        U.info("CBs above the knee discarded due to possible errors from "
               "multiple other CBs: %i" %
               error_counter["error_discarded_mt_1"])
    else:
        U.info("CBs above the knee discarded due to possible errors: %i" %
               len(discard_cbs))

    cell_whitelist = set(cell_whitelist).difference(discard_cbs)

    return (cell_whitelist, new_true_to_false_map)

Example #13

Show file

File: dedup.py Project: messersc/UMI-tools

    def write_mates(self):
        '''Scan the current chromosome for mates of the reads held in
        the read1s buffer and write those found to the output file.'''

        if self.chrom is not None:
            U.debug("Dumping %i mates for contig %s" % (
                len(self.read1s), self.infile.get_reference_name(self.chrom)))

        for aln in self.infile.fetch(tid=self.chrom, multiple_iterators=True):
            if aln.is_unmapped or aln.mate_is_unmapped or aln.is_read1:
                continue

            mate_key = (aln.query_name, aln.reference_id, aln.reference_start)
            if mate_key not in self.read1s:
                continue

            # found a buffered mate: emit it and forget about it
            self.outfile.write(aln)
            self.read1s.remove(mate_key)

        U.debug("%i mates remaining" % len(self.read1s))

Example #14

Show file

File: dedup.py Project: IMB-Computational-Genomics-Lab/UMI-tools

    def close(self):
        '''Write mates for the remaining chromosome, then search for
        matches to any reads that are still unmatched.

        Each outstanding (name, chrom, pos) entry in the read1s buffer is
        looked up by position in the input file; the first alignment with
        the same name and start position is written to the output file.
        The number of entries for which nothing was found is logged and
        the output file is closed.
        '''

        self.write_mates()
        U.info("Searching for mates for %i unmatched alignments" %
               len(self.read1s))

        found = 0
        for name, chrom, pos in self.read1s:
            # Fetch from the input file, as write_mates does. The output
            # file is the handle being written to, so it is not a source
            # of alignments to search.
            for read in self.infile.fetch(start=pos, end=pos + 1, tid=chrom):
                if (read.query_name, read.pos) == (name, pos):
                    self.outfile.write(read)
                    found += 1
                    break

        U.info("%i mates never found" % (len(self.read1s) - found))
        self.outfile.close()

Example #15

Show file

File: dedup.py Project: messersc/UMI-tools

    def close(self):
        '''Write mates for the remaining chromosome, then search for
        matches to any reads that are still unmatched.

        Every (name, chrom, pos) entry left in the read1s buffer is
        looked up by position in the input file, and the first alignment
        with the same name and start position is written to the output
        file. The count of entries with no match is logged before the
        output file is closed.
        '''

        self.write_mates()
        U.info("Searching for mates for %i unmatched alignments" %
               len(self.read1s))

        found = 0
        for name, chrom, pos in self.read1s:
            # Search the input file (the same source write_mates reads
            # from); the output file is only ever written to here.
            for read in self.infile.fetch(start=pos, end=pos+1, tid=chrom):
                if (read.query_name, read.pos) == (name, pos):
                    self.outfile.write(read)
                    found += 1
                    break

        U.info("%i mates never found" % (len(self.read1s) - found))
        self.outfile.close()

Example #16

Show file

File: dedup.py Project: IMB-Computational-Genomics-Lab/UMI-tools

    def write_mates(self):
        '''Go over the current chromosome and write out every alignment
        that is the mate of a read stored in the read1s buffer.'''

        if self.chrom is not None:
            U.debug(
                "Dumping %i mates for contig %s" %
                (len(self.read1s), self.infile.get_reference_name(self.chrom)))

        candidates = self.infile.fetch(tid=self.chrom,
                                       multiple_iterators=True)
        for candidate in candidates:
            if (candidate.is_unmapped or candidate.mate_is_unmapped or
                    candidate.is_read1):
                continue

            lookup = (candidate.query_name, candidate.reference_id,
                      candidate.reference_start)
            if lookup in self.read1s:
                self.outfile.write(candidate)
                # each buffered entry is satisfied at most once
                self.read1s.remove(lookup)

        U.debug("%i mates remaining" % len(self.read1s))

Example #17

Show file

File: count_tab.py Project: zorrodong/UMI-tools

def main(argv=None):
    """script main.

    Parses command line options in sys.argv, unless *argv* is given.
    For every gene read from options.stdin the UMIs are grouped and the
    number of groups is written to options.stdout as "gene<TAB>count".
    """

    if argv is None:
        argv = sys.argv

    # build the parser and let U.Start add the common options
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])
    (options, args) = U.Start(parser, argv=argv, add_group_dedup_options=False)

    total_in = total_out = 0

    # umi extraction from the read string, split on options.umi_sep
    umi_getter = partial(umi_methods.get_umi_read_string, sep=options.umi_sep)

    header = "%s\t%s\n" % ("gene", "count")
    options.stdout.write(header)

    # UMIClusterer functor specific to options.method
    cluster = network.UMIClusterer(options.method)

    gene_tables = umi_methods.get_gene_count_tab(options.stdin,
                                                 umi_getter=umi_getter)
    for gene, umi_counts in gene_tables:
        total_in += sum(umi_counts.values())

        grouped = cluster(umi_counts.keys(), umi_counts,
                          threshold=options.threshold)

        n_groups = len(grouped)
        options.stdout.write("%s\t%i\n" % (gene, n_groups))
        total_out += n_groups

    U.info("Number of reads counted: %i" % total_out)

    U.Stop()

Example #18

Show file

File: whitelist_methods.py Project: AbateLab/DAb-seq

def getCellWhitelist(cell_barcode_counts,
                     knee_method="distance",
                     expect_cells=False,
                     cell_number=False,
                     error_correct_threshold=0,
                     plotfile_prefix=None):
    '''Determine the cell barcode whitelist with the chosen knee method
    and, optionally, a map of putative error barcodes.

    Returns (cell_whitelist, true_to_false_map). The map is None unless
    the whitelist is non-empty and error_correct_threshold is above 0.
    Raises ValueError for an unknown knee_method.
    '''

    if knee_method == "density":
        cell_whitelist = getKneeEstimateDensity(
            cell_barcode_counts, expect_cells, cell_number, plotfile_prefix)
    elif knee_method == "distance":
        cell_whitelist = getKneeEstimateDistance(
            cell_barcode_counts, cell_number, plotfile_prefix)
    else:
        raise ValueError("knee_method must be 'distance' or 'density'")

    U.info("Finished - whitelist determination")

    # no whitelist, or error correction not requested: nothing to map
    if not (cell_whitelist and error_correct_threshold > 0):
        return cell_whitelist, None

    U.info("Starting - finding putative error cell barcodes")
    true_to_false_map = getErrorCorrectMapping(
        cell_barcode_counts.keys(), cell_whitelist, error_correct_threshold)
    U.info("Finished - finding putative error cell barcodes")

    return cell_whitelist, true_to_false_map

Example #19

Show file

File: umi_methods.py Project: AbateLab/DAb-seq

def fastqIterate(infile):
    '''Iterate over the contents of a fastq file.

    Lines are read four at a time from *infile* (anything with a
    ``readline`` method returning str or utf-8 encoded bytes) and a
    Record is yielded per entry, built from the identifier line without
    its leading '@', the sequence line and the quality line.

    U.error is called when the first line of an entry does not start
    with '@', the third does not start with '+', or the quality line is
    missing.
    '''

    def convert2string(b):
        # isinstance rather than an exact type comparison, so that str
        # subclasses are passed through instead of being "decoded"
        if isinstance(b, str):
            return b
        return b.decode("utf-8")

    def chomp(line):
        # Drop the line terminator only when there is one. Slicing off the
        # last character unconditionally would eat a real base/quality
        # when the final line of the file has no trailing newline.
        if line.endswith("\n"):
            return line[:-1]
        return line

    while True:
        line1 = convert2string(infile.readline())
        if not line1:
            break
        if not line1.startswith('@'):
            U.error("parsing error: expected '@' in line %s" % line1)
        line2 = convert2string(infile.readline())
        line3 = convert2string(infile.readline())
        if not line3.startswith('+'):
            U.error("parsing error: expected '+' in line %s" % line3)
        line4 = convert2string(infile.readline())
        # incomplete entry
        if not line4:
            U.error("incomplete entry for %s" % line1)

        yield Record(chomp(line1)[1:], chomp(line2), chomp(line4))

Example #20

Show file

File: whitelist_methods.py Project: rstatistics/UMI-tools

    def pairedBarcodeGenerator(whitelist_tsv, whitelist_tsv2):
        '''Yield every concatenation of a barcode from the first file
        with a barcode from the second file.

        A barcode is the first tab-separated field of a line; lines
        starting with '#' are ignored. Both files are read in full
        before the first value is yielded.
        '''

        def first_fields(path):
            with U.openFile(path, "r") as handle:
                return [row.strip().split("\t")[0]
                        for row in handle
                        if not row.startswith('#')]

        left_barcodes = first_fields(whitelist_tsv)
        right_barcodes = first_fields(whitelist_tsv2)

        for left, right in itertools.product(left_barcodes, right_barcodes):
            yield left + right

Example #21

Show file

File: sam_methods.py Project: yh154/UMI-tools

    def close(self):
        '''Write mates for the remaining chromosome, then make one pass
        over the whole input file looking for the mates of any reads
        still in the read1s buffer, and close the output file.'''

        self.write_mates()
        U.info("Searching for mates for %i unmatched alignments" %
               len(self.read1s))

        everything = self.infile.fetch(until_eof=True,
                                       multiple_iterators=True)
        for aln in everything:
            if aln.is_unmapped or aln.mate_is_unmapped or aln.is_read1:
                continue

            mate_key = (aln.query_name, aln.reference_name,
                        aln.reference_start)
            if mate_key in self.read1s:
                self.outfile.write(aln)
                self.read1s.remove(mate_key)

        # whatever is left in the buffer was not matched by the pass above
        U.info("%i mates never found" % len(self.read1s))
        self.outfile.close()

Example #22

Show file

def getMetaContig2contig(gene_transcript_map):
    '''Read a two-column, tab-separated gene-to-transcript file and
    return a mapping from each gene to the set of its transcripts.

    Lines starting with '#' are skipped. Reading stops at the first
    line that is empty after stripping whitespace. Any other line must
    split into exactly two tab-separated fields, otherwise the
    unpacking raises ValueError.
    '''
    metacontig2contig = collections.defaultdict(set)

    # use a context manager so the handle is closed once reading is done
    # (including on the early exit and on errors)
    with U.openFile(gene_transcript_map, "r") as inf:
        for line in inf:

            if line.startswith("#"):
                continue

            if len(line.strip()) == 0:
                break

            gene, transcript = line.strip().split("\t")
            metacontig2contig[gene].add(transcript)

    return metacontig2contig

Example #23

Show file

File: whitelist_methods.py Project: AbateLab/DAb-seq

def getUserDefinedBarcodes(whitelist_tsv, getErrorCorrection=False):
    '''Load whitelisted barcodes from a tab-separated file.

    The first field of each non-'#' line is a whitelisted barcode. When
    getErrorCorrection is true, the second field is split on ',' and
    every entry is mapped to that line's whitelisted barcode.

    Returns (set of whitelisted barcodes, false-to-true map); the map
    is None when getErrorCorrection is false.
    '''
    whitelisted = set()
    false_to_true_map = {} if getErrorCorrection else None

    with U.openFile(whitelist_tsv, "r") as handle:
        for row in handle:
            if row.startswith('#'):
                continue

            fields = row.strip().split("\t")
            true_barcode = fields[0]
            whitelisted.add(true_barcode)

            if not getErrorCorrection:
                continue

            for error_barcode in fields[1].split(","):
                false_to_true_map[error_barcode] = true_barcode

    return whitelisted, false_to_true_map

Example #24

Show file

def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads fastq records from options.stdin (and, for paired input, from
    --read2-in), passes them through the barcode extractor and writes
    the processed reads to options.stdout (read2 optionally to
    --read2-out). Option combinations are validated up front and
    problems are reported through U.error.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-p", "--bc-pattern", dest="pattern", type="string",
                      help="Barcode pattern")
    parser.add_option("--bc-pattern2", dest="pattern2", type="string",
                      help="Barcode pattern for paired reads")
    parser.add_option("--3prime", dest="prime3", action="store_true",
                      help="barcode is on 3' end of read.")
    parser.add_option("--read2-in", dest="read2_in", type="string",
                      help="file name for read pairs")
    parser.add_option("--read2-out", dest="read2_out", type="string",
                      help="file to output processed paired read to")
    parser.add_option("--read2-stdout", dest="read2_stdout",
                      action="store_true",
                      help="Paired reads, send read2 to stdout, discarding read1")
    parser.add_option("--quality-filter-threshold",
                      dest="quality_filter_threshold", type="int",
                      help=("Remove reads where any UMI base quality score "
                            "falls below this threshold"))
    parser.add_option("--quality-filter-mask",
                      dest="quality_filter_mask", type="int",
                      help=("If a UMI base has a quality below this threshold, "
                            "replace the base with 'N'"))
    parser.add_option("--quality-encoding",
                      dest="quality_encoding", type="choice",
                      choices=["phred33", "phred64", "solexa"],
                      help=("Quality score encoding. Choose from 'phred33' "
                            "[33-77] 'phred64' [64-106] or 'solexa' [59-106]"))
    parser.add_option("--extract-method",
                      dest="extract_method", type="choice",
                      choices=["string", "regex"],
                      help=("How to extract the umi +/- cell barcodes, Choose "
                            "from 'string' or 'regex'"))
    parser.add_option("--filter-cell-barcode",
                      dest="filter_cell_barcode",
                      action="store_true",
                      help="Filter the cell barcodes")
    parser.add_option("--error-correct-cell",
                      dest="error_correct_cell",
                      action="store_true",
                      help=("Correct errors in the cell barcode"))
    parser.add_option("--whitelist",
                      dest="whitelist", type="string",
                      help=("A whitelist of accepted cell barcodes"))
    parser.add_option("--blacklist",
                      dest="blacklist", type="string",
                      help=("A blacklist of rejected cell barcodes"))
    parser.add_option("--reads-subset",
                      dest="reads_subset", type="int",
                      help=("Only extract from the first N reads. If N is "
                            "greater than the number of reads, all reads will "
                            "be used"))
    parser.add_option("--reconcile-pairs",
                      dest="reconcile", action="store_true",
                      help=("Allow the presences of reads in read2 input that "
                            "are not present in read1 input. This allows cell "
                            "barcode filtering of read1s without considering "
                            "read2s"))
    # NOTE: the default key must match the option's dest
    # ("filter_cell_barcode"); a misspelt key silently creates an
    # unrelated attribute that is always False.
    parser.set_defaults(extract_method="string",
                        filter_cell_barcode=False,
                        whitelist=None,
                        blacklist=None,
                        error_correct_cell=False,
                        pattern=None,
                        pattern2=None,
                        read2_in=None,
                        read2_out=False,
                        read2_stdout=False,
                        quality_filter_threshold=None,
                        quality_filter_mask=None,
                        quality_encoding=None,
                        reconcile=False)

    # add common options (-h/--help, ...) and parse command line

    (options, args) = U.Start(parser, argv=argv,
                              add_group_dedup_options=False,
                              add_sam_options=False)

    if options.quality_filter_threshold or options.quality_filter_mask:
        if not options.quality_encoding:
            U.error("must provide a quality encoding (--quality-"
                    "encoding) to filter UMIs by quality (--quality"
                    "-filter-threshold) or mask low quality bases "
                    "with (--quality-filter-mask)")

    if not options.pattern and not options.pattern2:
        if not options.read2_in:
            U.error("Must supply --bc-pattern for single-end")
        else:
            U.error("Must supply --bc-pattern and/or --bc-pattern2 "
                    "if paired-end ")

    if options.pattern2 and not options.read2_in:
        U.error("must specify a paired fastq ``--read2-in``")

    extract_cell = False
    extract_umi = False

    # If the pattern is a regex we can compile the regex(es) prior to
    # ExtractFilterAndUpdate instantiation
    if options.extract_method == "regex":
        if options.pattern:
            try:
                options.pattern = regex.compile(options.pattern)
            except regex.error:
                U.error("barcode_regex '%s' is not a "
                        "valid regex" % options.pattern)

        if options.pattern2:
            try:
                options.pattern2 = regex.compile(options.pattern2)
            except regex.error:
                U.error("barcode_regex2 '%s' is not a "
                        "valid regex" % options.pattern2)

    # work out whether the pattern(s) contain umi and/or cell positions:
    # named groups for a regex, 'N'/'C' characters for a string pattern
    for pattern in (options.pattern, options.pattern2):
        if not pattern:
            continue

        if options.extract_method == "regex":
            for group in pattern.groupindex:
                if group.startswith("cell_"):
                    extract_cell = True
                elif group.startswith("umi_"):
                    extract_umi = True

        elif options.extract_method == "string":
            if "C" in pattern:
                extract_cell = True
            if "N" in pattern:
                extract_umi = True

    if not extract_umi:
        if options.extract_method == "string":
            U.error("barcode pattern(s) do not include any umi bases "
                    "(marked with 'Ns') %s, %s" % (
                        options.pattern, options.pattern2))
        elif options.extract_method == "regex":
            U.error("barcode regex(es) do not include any umi groups "
                    "(starting with 'umi_') %s, %s" % (
                        options.pattern, options.pattern2))

    if options.filter_cell_barcode:

        if not options.whitelist:
            U.error("must provide a whitelist (--whitelist) if using "
                    "--filter-cell-barcode option")

        if not extract_cell:
            if options.extract_method == "string":
                U.error("barcode pattern(s) do not include any cell bases "
                        "(marked with 'Cs') %s, %s" % (
                            options.pattern, options.pattern2))
            elif options.extract_method == "regex":
                U.error("barcode regex(es) do not include any cell groups "
                        "(starting with 'cell_') %s, %s" % (
                            options.pattern, options.pattern2))

    read1s = umi_methods.fastqIterate(options.stdin)

    # set up read extractor
    ReadExtractor = umi_methods.ExtractFilterAndUpdate(
        options.extract_method,
        options.pattern,
        options.pattern2,
        options.prime3,
        extract_cell,
        options.quality_encoding,
        options.quality_filter_threshold,
        options.quality_filter_mask,
        options.filter_cell_barcode)

    if options.filter_cell_barcode:
        cell_whitelist, false_to_true_map = umi_methods.getUserDefinedBarcodes(
            options.whitelist, options.error_correct_cell)

        ReadExtractor.cell_whitelist = cell_whitelist
        ReadExtractor.false_to_true_map = false_to_true_map

    if options.blacklist:
        blacklist = set()
        with U.openFile(options.blacklist, "r") as inf:
            for line in inf:
                blacklist.add(line.strip().split("\t")[0])
        ReadExtractor.cell_blacklist = blacklist

    # variables for progress monitor
    progCount = 0
    displayMax = 100000
    U.info("Starting barcode extraction")

    if options.read2_in is None:
        for read in read1s:

            # incrementing count for monitoring progress
            progCount += 1

            # Update display in every 100kth iteration
            if progCount % displayMax == 0:
                U.info("Parsed {} reads".format(progCount))

            new_read = ReadExtractor(read)

            if options.reads_subset:
                if (ReadExtractor.read_counts['Input Reads'] >
                        options.reads_subset):
                    break

            if not new_read:
                continue

            options.stdout.write(str(new_read) + "\n")

    else:
        read2_in = U.openFile(options.read2_in)
        read2s = umi_methods.fastqIterate(read2_in)

        # only opened (and later closed) in the paired-end branch, so a
        # stray --read2-out without --read2-in cannot hit an unbound name
        read2_out = None
        if options.read2_out:
            read2_out = U.openFile(options.read2_out, "w")

        strict = not options.reconcile

        try:
            for read1, read2 in umi_methods.joinedFastqIterate(
                    read1s, read2s, strict):

                # incrementing count for monitoring progress
                progCount += 1

                # Update display in every 100kth iteration
                if progCount % displayMax == 0:
                    U.info("Parsed {} reads".format(progCount))
                    sys.stdout.flush()

                reads = ReadExtractor(read1, read2)

                if options.reads_subset:
                    if (ReadExtractor.read_counts['Input Reads'] >
                            options.reads_subset):
                        break

                if not reads:
                    continue

                new_read1, new_read2 = reads

                if options.read2_stdout:
                    options.stdout.write(str(new_read2) + "\n")
                else:
                    options.stdout.write(str(new_read1) + "\n")

                    if read2_out is not None:
                        read2_out.write(str(new_read2) + "\n")
        finally:
            # release both read2 handles even if extraction fails part way
            read2_in.close()
            if read2_out is not None:
                read2_out.close()

    for k, v in ReadExtractor.getReadCounts().most_common():
        U.info("%s: %s" % (k, v))

    U.Stop()

Example #25

Show file

File: whitelist.py Project: bdemaree/UMI-tools

def main(argv=None):
    """Build a cell barcode whitelist from fastq input.

    Fastq records are read from ``options.stdin`` (plus the file given by
    ``--read2-in`` for paired input).  For every read the cell barcode and
    UMI are obtained through ``ExtractFilterAndUpdate.getBarcodes`` and
    tallied per cell barcode, either as read counts (``--method=reads``)
    or as the number of distinct UMIs (``--method=umis``).  The tallies are
    handed to ``umi_methods.getCellWhitelist`` and one tab-separated line
    per whitelisted barcode is written to ``options.stdout``:
    barcode, corrected barcodes, barcode count, corrected barcode counts.

    Parameters
    ----------
    argv : list of str, optional
        Command line arguments; ``sys.argv`` is used when omitted.

    Raises
    ------
    ValueError
        If ``--set-cell-number`` is larger than the number of distinct
        cell barcodes that were observed.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-p", "--bc-pattern", dest="pattern", type="string",
                      help="Barcode pattern")
    parser.add_option("--bc-pattern2", dest="pattern2", type="string",
                      help="Barcode pattern for paired reads")
    parser.add_option("--3prime", dest="prime3", action="store_true",
                      help="barcode is on 3' end of read.")
    parser.add_option("--read2-in", dest="read2_in", type="string",
                      help="file name for read pairs")
    parser.add_option("--extract-method",
                      dest="extract_method", type="choice",
                      choices=["string", "regex"],
                      help=("How to extract the umi +/- cell barcodes, Choose "
                            "from 'string' or 'regex'"))
    parser.add_option("--plot-prefix",
                      dest="plot_prefix", type="string",
                      help=("Prefix for plots to visualise the automated "
                            "detection of the number of 'true' cell barcodes"))
    parser.add_option("--subset-reads",
                      dest="subset_reads", type="int",
                      help=("Use the first N reads to automatically identify "
                            "the true cell barcodes. If N is greater than the "
                            "number of reads, all reads will be used"))
    parser.add_option("--error-correct-threshold",
                      dest="error_correct_threshold",
                      type="int",
                      help=("Hamming distance for correction of "
                            "barcodes to whitelist barcodes"))
    parser.add_option("--method",
                      dest="method",
                      choices=["reads", "umis"],
                      help=("Use reads or unique umi counts per cell"))
    parser.add_option("--expect-cells",
                      dest="expect_cells",
                      type="int",
                      help=("Prior expectation on the upper limit on the "
                            "number of cells sequenced"))
    parser.add_option("--set-cell-number",
                      dest="cell_number",
                      type="int",
                      help=("Specify the number of cell barcodes to accept"))
    parser.set_defaults(method="reads",
                        extract_method="string",
                        filter_cell_barcodes=False,
                        whitelist_tsv=None,
                        blacklist_tsv=None,
                        error_correct_threshold=1,
                        pattern=None,
                        pattern2=None,
                        read2_in=None,
                        plot_prefix=None,
                        subset_reads=100000000,
                        expect_cells=False,
                        cell_number=False)

    # add common options (-h/--help, ...) and parse command line

    (options, args) = U.Start(parser, argv=argv,
                              add_group_dedup_options=False,
                              add_sam_options=False)

    if options.expect_cells and options.cell_number:
        U.error("Cannot supply both --expect-cells and "
                "--set-cell-number options")

    if not options.pattern and not options.pattern2:
        if not options.read2_in:
            U.error("Must supply --bc-pattern for single-end")
        else:
            U.error("Must supply --bc-pattern and/or --bc-pattern2 "
                    "if paired-end ")

    # a pattern for read 2 only makes sense when read 2 is supplied
    if options.pattern2 and not options.read2_in:
        U.error("must specify a paired fastq ``--read2-in``")

    extract_cell = False
    extract_umi = False

    if options.extract_method == "regex":
        # Compile the regex(es) up front, before ExtractFilterAndUpdate
        # is instantiated, so that an invalid pattern is reported early.
        if options.pattern:
            try:
                options.pattern = regex.compile(options.pattern)
            except regex.error:
                U.error("barcode_regex '%s' is not a "
                        "valid regex" % options.pattern)

        if options.pattern2:
            try:
                options.pattern2 = regex.compile(options.pattern2)
            except regex.error:
                U.error("barcode_regex2 '%s' is not a "
                        "valid regex" % options.pattern2)

        # check whether the regex contains a umi group(s) and cell groups(s)
        for compiled in (options.pattern, options.pattern2):
            if not compiled:
                continue
            for group in compiled.groupindex:
                if group.startswith("cell_"):
                    extract_cell = True
                elif group.startswith("umi_"):
                    extract_umi = True

    # check whether the pattern string contains umi/cell bases
    elif options.extract_method == "string":
        for pattern in (options.pattern, options.pattern2):
            if not pattern:
                continue
            if "C" in pattern:
                extract_cell = True
            if "N" in pattern:
                extract_umi = True

    if not extract_umi:
        if options.extract_method == "string":
            U.error("barcode pattern(s) do not include any umi bases "
                    "(marked with 'Ns') %s, %s" % (
                        options.pattern, options.pattern2))
        elif options.extract_method == "regex":
            U.error("barcode regex(es) do not include any umi groups "
                    "(starting with 'umi_') %s, %s" % (
                        options.pattern, options.pattern2))
    if not extract_cell:
        if options.extract_method == "string":
            U.error("barcode pattern(s) do not include any cell bases "
                    "(marked with 'Cs') %s, %s" % (
                        options.pattern, options.pattern2))
        elif options.extract_method == "regex":
            U.error("barcode regex(es) do not include any cell groups "
                    "(starting with 'cell_') %s, %s" % (
                        options.pattern, options.pattern2))

    read1s = umi_methods.fastqIterate(options.stdin)

    # set up read extractor
    ReadExtractor = umi_methods.ExtractFilterAndUpdate(
        method=options.extract_method,
        pattern=options.pattern,
        pattern2=options.pattern2,
        prime3=options.prime3,
        extract_cell=extract_cell)

    cell_barcode_counts = collections.Counter()

    n_reads = 0
    n_cell_barcodes = 0

    # if using the umis method, need to keep a set of umis observed
    if options.method == "umis":
        cell_barcode_umis = collections.defaultdict(set)

    # variables for progress monitor
    displayMax = 100000
    U.info("Starting barcode extraction")

    if not options.read2_in:
        for read1 in read1s:

            # Update display in every 100kth iteration
            if n_reads % displayMax == 0:
                U.info("Parsed {} reads".format(n_reads))

            n_reads += 1
            barcode_values = ReadExtractor.getBarcodes(read1)
            if barcode_values is None:
                continue

            cell, umi, _, _, _, _, _ = barcode_values
            if options.method == "umis":
                cell_barcode_umis[cell].add(umi)
            else:
                cell_barcode_counts[cell] += 1
            n_cell_barcodes += 1

            # NOTE(review): single-end stops on the number of reads that
            # matched the pattern, whereas the paired-end loop below stops
            # on the number of reads parsed -- confirm which is intended.
            if options.subset_reads:
                if n_cell_barcodes > options.subset_reads:
                    break
    else:
        read2s = umi_methods.fastqIterate(U.openFile(options.read2_in))
        for read1, read2 in izip(read1s, read2s):

            # Update display in every 100kth iteration
            if n_reads % displayMax == 0:
                U.info("Parsed {} reads".format(n_reads))

            n_reads += 1

            barcode_values = ReadExtractor.getBarcodes(read1, read2)
            if barcode_values is None:
                continue

            cell, umi, _, _, _, _, _ = barcode_values
            if options.method == "umis":
                cell_barcode_umis[cell].add(umi)
            else:
                cell_barcode_counts[cell] += 1
            n_cell_barcodes += 1

            if options.subset_reads:
                if n_reads > options.subset_reads:
                    break

    U.info("Starting - whitelist determination")

    # with the umis method the per-cell count is the number of distinct umis
    if options.method == "umis":
        for cell in cell_barcode_umis:
            cell_barcode_counts[cell] = len(cell_barcode_umis[cell])

    if options.cell_number and options.cell_number > len(cell_barcode_counts):
        raise ValueError(
            "--set-cell-number option specifies more cell barcodes than the "
            "number of observed cell barcodes. This may be because "
            "--subset-reads was set to a value too low to capture reads from "
            "all cells. %s cell barcodes observed from %s parsed reads. "
            "Expected>= %s cell barcodes" % (
                len(cell_barcode_counts),
                options.subset_reads,
                options.cell_number))

    cell_whitelist, true_to_false_map = umi_methods.getCellWhitelist(
        cell_barcode_counts,
        options.expect_cells,
        options.cell_number,
        options.error_correct_threshold,
        options.plot_prefix)

    U.info("Writing out whitelist")
    for barcode in sorted(cell_whitelist):

        if true_to_false_map:
            # barcodes mapped to this whitelisted barcode, with their counts
            false_barcodes = sorted(true_to_false_map[barcode])
            corrected_barcodes = ",".join(false_barcodes)
            corrected_barcode_counts = ",".join(
                str(cell_barcode_counts[x]) for x in false_barcodes)
        else:
            corrected_barcodes, corrected_barcode_counts = "", ""

        options.stdout.write("%s\t%s\t%s\t%s\n" % (
            barcode, corrected_barcodes, cell_barcode_counts[barcode],
            corrected_barcode_counts))

    U.info("Parsed %i reads" % n_reads)
    U.info("%i reads matched the barcode pattern" % n_cell_barcodes)
    U.info("Found %i unique cell barcodes" % len(cell_barcode_counts))

    U.Stop()

Example #26

Show file

def main(argv=None):
    """Entry point of the grouping script.

    Options are parsed from *argv* (``sys.argv`` when *argv* is None).
    Reads from the input SAM/BAM are bundled by ``umi_methods.get_bundles``,
    the UMIs of every bundle are clustered with ``network.UMIClusterer``
    and each read is optionally written to a tagged BAM/SAM file
    (``--output-bam``) and/or to a tab-separated mapping file
    (``--group-out``).
    """

    argv = sys.argv if argv is None else argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    group = U.OptionGroup(parser, "group-specific options")

    group.add_option("--group-out", dest="tsv", type="string",
                     default=None,
                     help="Outfile name for file mapping read id to read group")

    group.add_option("--output-bam", dest="output_bam",
                     action="store_true", default=False,
                     help=("output a bam file with read groups tagged using "
                           "the UG tag[default=%default]"))

    group.add_option("--output-unmapped", dest="output_unmapped",
                     action="store_true", default=False,
                     help="Retain all unmapped reads in output"
                          "[default=%default]")

    parser.add_option("--umi-group-tag", dest="umi_group_tag",
                      type="string", default='BX',
                      help="tag for the outputted umi group")

    parser.add_option_group(group)

    # add common options (-h/--help, ...) and parse command line
    options, args = U.Start(parser, argv=argv)

    U.validateSamOptions(options)

    # the input has to be a named file; only its name is kept
    if options.stdin == sys.stdin:
        raise ValueError("Input on standard in not currently supported")
    in_name = options.stdin.name
    options.stdin.close()

    # Work out where alignments are written.  When the output is to be
    # sorted, records first go to a temporary file and the final
    # destination is only used by the sort step at the end.
    writes_to_file = options.stdout != sys.stdout
    destination = options.stdout.name if writes_to_file else "-"

    if options.no_sort_output:
        out_name = destination
    else:
        out_name = U.getTempFilename()
        sorted_out_name = destination

    if writes_to_file:
        options.stdout.close()
        assert options.output_bam, (
            "To output a bam you must include --output-bam option")

    # the sort step needs to know the output format
    if not options.no_sort_output:
        sort_format = "sam" if options.out_sam else "bam"

    in_mode = "r" if options.in_sam else "rb"
    out_mode = "wh" if options.out_sam else "wb"

    infile = pysam.Samfile(in_name, in_mode)

    outfile = None
    if options.output_bam:
        outfile = pysam.Samfile(out_name, out_mode, template=infile)

    if options.tsv:
        mapping_outfile = U.openFile(options.tsv, "w")
        header = ("read_id", "contig", "position", "gene", "umi",
                  "umi_count", "final_umi", "final_umi_count", "unique_id")
        mapping_outfile.write("\t".join(header) + "\n")

    # running totals, plus the totals last reported to the log
    reads_in = 0
    reads_out = 0
    group_id = 0
    reported_in = 0
    reported_out = 0

    gene_tag = options.gene_tag
    metacontig2contig = None

    # choose the iterator over input reads
    if options.chrom:
        inreads = infile.fetch(reference=options.chrom)
    elif options.per_gene and options.gene_transcript_map:
        metacontig2contig = umi_methods.getMetaContig2contig(
            infile, options.gene_transcript_map)
        metatag = "MC"
        inreads = umi_methods.metafetcher(infile, metacontig2contig,
                                          metatag)
        gene_tag = metatag
    else:
        inreads = infile.fetch(until_eof=options.output_unmapped)

    bundle_iterator = umi_methods.get_bundles(
        options,
        all_reads=True,
        return_read2=True,
        return_unmapped=options.output_unmapped,
        metacontig_contig=metacontig2contig)

    for bundle, key, status in bundle_iterator(inreads):

        # read2s and unmapped reads (when requested) arrive one at a
        # time; here ``bundle`` is the read itself and is passed through
        if status == 'single_read':
            reads_in += 1
            if outfile:
                outfile.write(bundle)
            reads_out += 1
            continue

        umis = bundle.keys()
        counts = dict((umi, bundle[umi]["count"]) for umi in umis)

        reads_in += sum(counts.values())

        # progress reporting in fixed increments
        while reads_out >= reported_out + 10000:
            reported_out += 10000
            U.info("Written out %i reads" % reported_out)

        while reads_in >= reported_in + 1000000:
            reported_in += 1000000
            U.info("Parsed %i input reads" % reported_in)

        # UMIClusterer functor configured for the requested method
        processor = network.UMIClusterer(options.method)

        # cluster the umis of this bundle into groups
        groups = processor(umis, counts, threshold=options.threshold)

        for umi_group in groups:
            # the first umi of a group is reported as the group's umi
            top_umi = umi_group[0]
            group_count = sum(counts[member] for member in umi_group)

            for umi in umi_group:
                for read in bundle[umi]['read']:
                    if outfile:
                        # tag the read with its group id and group umi
                        read.tags += [('UG', group_id)]
                        read.tags += [(options.umi_group_tag, top_umi)]
                        outfile.write(read)

                    if options.tsv:
                        gene = (read.get_tag(gene_tag)
                                if options.per_gene else "NA")
                        row = (read.query_name, read.reference_name,
                               umi_methods.get_read_position(
                                   read, options.soft_clip_threshold)[1],
                               gene, umi.decode(), counts[umi],
                               top_umi.decode(), group_count, group_id)
                        mapping_outfile.write(
                            "%s\n" % "\t".join(str(field) for field in row))

                    reads_out += 1

            group_id += 1

    if outfile:
        outfile.close()
        if not options.no_sort_output:
            # sort the temporary file into the final destination, then
            # remove the temporary file
            pysam.sort("-o", sorted_out_name, "-O", sort_format, out_name)
            os.unlink(out_name)

    if options.tsv:
        mapping_outfile.close()

    # write footer and output benchmark information.
    event_summary = ", ".join(
        "%s: %s" % (event, count)
        for event, count in bundle_iterator.read_events.most_common())
    U.info("Reads: %s" % event_summary)
    U.info("Number of reads out: %i, Number of groups: %i" %
           (reads_out, group_id))
    U.Stop()

Example #27

Show file

def main(argv=None):
    """Deduplicate reads in a SAM/BAM file using UMIs and mapping position.

    Options are parsed from *argv* (``sys.argv`` when omitted).  Reads are
    bundled by ``umi_methods.get_bundles``; each bundle is either written
    out with one read per UMI (``--ignore-umi``) or passed through
    ``network.ReadDeduplicator`` and the retained reads are written to the
    output file.  With ``--output-stats`` three tab-separated files
    (``_per_umi_per_position.tsv``, ``_per_umi.tsv``,
    ``_edit_distance.tsv``) are written using the given prefix.

    Parameters
    ----------
    argv : list of str, optional
        Command line arguments; ``sys.argv`` is used when omitted.

    Raises
    ------
    ValueError
        If the input is standard in, if incompatible or incomplete options
        are given, if ``--skip-tags-regex`` is not a valid regex, or if the
        requested multimapping detection tag is unavailable in the input.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-i",
                      "--in-sam",
                      dest="in_sam",
                      action="store_true",
                      help="Input file is in sam format [default=%default]",
                      default=False)
    parser.add_option(
        "-o",
        "--out-sam",
        dest="out_sam",
        action="store_true",
        help="Output alignments in sam format [default=%default]",
        default=False)
    parser.add_option("--ignore-umi",
                      dest="ignore_umi",
                      action="store_true",
                      help="Ignore UMI and dedup"
                      " only on position",
                      default=False)
    parser.add_option("--umi-separator",
                      dest="umi_sep",
                      type="string",
                      help="separator between read id and UMI",
                      default="_")
    parser.add_option("--umi-tag",
                      dest="umi_tag",
                      type="string",
                      help="tag containing umi",
                      default='RX')
    parser.add_option("--extract-umi-method",
                      dest="get_umi_method",
                      type="choice",
                      choices=("read_id", "tag"),
                      default="read_id",
                      help="where is the read UMI encoded? [default=%default]")
    parser.add_option("--subset",
                      dest="subset",
                      type="float",
                      help="Use only a fraction of reads, specified by subset",
                      default=None)
    parser.add_option("--spliced-is-unique",
                      dest="spliced",
                      action="store_true",
                      help="Treat a spliced read as different to an unspliced"
                      " one [default=%default]",
                      default=False)
    parser.add_option("--soft-clip-threshold",
                      dest="soft",
                      type="float",
                      help="number of bases clipped from 5' end before"
                      "read is counted as spliced [default=%default]",
                      default=4)
    parser.add_option("--edit-distance-threshold",
                      dest="threshold",
                      type="int",
                      default=1,
                      help="Edit distance theshold at which to join two UMIs"
                      "when clustering. [default=%default]")
    parser.add_option("--chrom",
                      dest="chrom",
                      type="string",
                      help="Restrict to one chromosome",
                      default=None)
    parser.add_option("--paired",
                      dest="paired",
                      action="store_true",
                      default=False,
                      help="paired BAM. [default=%default]")
    parser.add_option("--method",
                      dest="method",
                      type="choice",
                      choices=("adjacency", "directional", "percentile",
                               "unique", "cluster"),
                      default="directional",
                      help="method to use for umi deduping [default=%default]")
    parser.add_option("--output-stats",
                      dest="stats",
                      type="string",
                      default=False,
                      help="Specify location to output stats")
    parser.add_option(
        "--whole-contig",
        dest="whole_contig",
        action="store_true",
        default=False,
        help=
        "Read whole contig before outputting bundles: guarantees that no reads"
        "are missed, but increases memory usage")
    parser.add_option("--multimapping-detection-method",
                      dest="detection_method",
                      type="choice",
                      choices=("NH", "X0", "XT"),
                      default=None,
                      help=("Some aligners identify multimapping using bam "
                            "tags. Setting this option to NH, X0 or XT will "
                            "use these tags when selecting the best read "
                            "amongst reads with the same position and umi "
                            "[default=%default]"))
    parser.add_option("--mapping-quality",
                      dest="mapping_quality",
                      type="int",
                      help="Minimum mapping quality for a read to be retained"
                      " [default=%default]",
                      default=0)
    parser.add_option(
        "--read-length",
        dest="read_length",
        action="store_true",
        default=False,
        help=("use read length in addition to position and UMI"
              "to identify possible duplicates [default=%default]"))
    parser.add_option("--per-contig",
                      dest="per_contig",
                      action="store_true",
                      default=False,
                      help=("dedup per contig (field 3 in BAM; RNAME),"
                            " e.g for transcriptome where contig = gene"))
    parser.add_option("--per-gene",
                      dest="per_gene",
                      action="store_true",
                      default=False,
                      help=("Deduplicate per gene,"
                            "e.g for transcriptome where contig = transcript"
                            "must also provide a transript to gene map with"
                            "--gene-transcript-map [default=%default]"))
    parser.add_option("--gene-transcript-map",
                      dest="gene_transcript_map",
                      type="string",
                      help="file mapping transcripts to genes (tab separated)",
                      default=None)
    parser.add_option("--gene-tag",
                      dest="gene_tag",
                      type="string",
                      help=("Deduplicate per gene where gene is"
                            "defined by this bam tag [default=%default]"),
                      default=None)
    # The default is an alternation group: tags starting with "__" or
    # "Unassigned".  Square brackets here would form a character class
    # matching any single one of those characters instead.
    parser.add_option(
        "--skip-tags-regex",
        dest="skip_regex",
        type="string",
        help=("Used with --gene-tag. "
              "Ignore reads where the gene-tag matches this regex"),
        default="^(__|Unassigned)")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = U.Start(parser, argv=argv)

    if options.random_seed:
        np.random.seed(options.random_seed)

    if options.stdin != sys.stdin:
        in_name = options.stdin.name
        options.stdin.close()
    else:
        raise ValueError("Input on standard in not currently supported")

    if options.stdout != sys.stdout:
        out_name = options.stdout.name
        options.stdout.close()
    else:
        out_name = "-"

    if options.in_sam:
        in_mode = "r"
    else:
        in_mode = "rb"

    if options.out_sam:
        out_mode = "wh"
    else:
        out_mode = "wb"

    if options.stats and options.ignore_umi:
        raise ValueError("'--output-stats' and '--ignore-umi' options"
                         " cannot be used together")

    # per-gene dedup needs a way to assign reads to genes
    if options.per_gene:
        if not options.gene_transcript_map and not options.gene_tag:
            raise ValueError(
                "--per-gene option requires --gene-transcript-map "
                "or --gene-tag")
    try:
        re.compile(options.skip_regex)
    except re.error:
        raise ValueError("skip-regex '%s' is not a "
                         "valid regex" % options.skip_regex)

    infile = pysam.Samfile(in_name, in_mode)
    outfile = pysam.Samfile(out_name, out_mode, template=infile)

    if options.paired:
        outfile = umi_methods.TwoPassPairWriter(infile, outfile)

    nInput, nOutput = 0, 0

    if options.detection_method:
        bam_features = detect_bam_features(infile.filename)

        if not bam_features[options.detection_method]:
            if sum(bam_features.values()) == 0:
                raise ValueError(
                    "There are no bam tags available to detect multimapping. "
                    "Do not set --multimapping-detection-method")
            else:
                raise ValueError(
                    "The chosen method of detection for multimapping (%s) "
                    "will not work with this bam. Multimapping can be detected"
                    " for this bam using any of the following: %s" %
                    (options.detection_method, ",".join(
                        [x for x in bam_features if bam_features[x]])))

    # set the method with which to extract umis from reads
    if options.get_umi_method == "read_id":
        umi_getter = partial(umi_methods.get_umi_read_id, sep=options.umi_sep)
    elif options.get_umi_method == "tag":
        umi_getter = partial(umi_methods.get_umi_tag, tag=options.umi_tag)
    else:
        raise ValueError("Unknown umi extraction method")

    if options.stats:
        # set up arrays to hold stats data
        stats_pre_df_dict = {"UMI": [], "counts": []}
        stats_post_df_dict = {"UMI": [], "counts": []}
        pre_cluster_stats = []
        post_cluster_stats = []
        pre_cluster_stats_null = []
        post_cluster_stats_null = []
        topology_counts = collections.Counter()
        node_counts = collections.Counter()
        read_gn = umi_methods.random_read_generator(infile.filename,
                                                    chrom=options.chrom,
                                                    umi_getter=umi_getter)

    # Resolve the gene tag once so that it is defined on every path below
    # (including --chrom) and so that the meta-contig tag is the one handed
    # to get_bundles when a transcript-to-gene map is in use.
    gene_tag = options.gene_tag

    if options.chrom:
        inreads = infile.fetch(reference=options.chrom)
    elif options.per_gene and options.gene_transcript_map:
        metacontig2contig = umi_methods.getMetaContig2contig(
            infile, options.gene_transcript_map)
        metatag = "MC"
        inreads = umi_methods.metafetcher(infile, metacontig2contig,
                                          metatag)
        gene_tag = metatag
    else:
        inreads = infile.fetch()

    # Defined up front so that the footer below still works when the
    # input yields no bundle at all.
    read_events = collections.Counter()

    for bundle, read_events, status in umi_methods.get_bundles(
            inreads,
            ignore_umi=options.ignore_umi,
            subset=options.subset,
            quality_threshold=options.mapping_quality,
            paired=options.paired,
            spliced=options.spliced,
            soft_clip_threshold=options.soft,
            per_contig=options.per_contig,
            gene_tag=gene_tag,
            skip_regex=options.skip_regex,
            whole_contig=options.whole_contig,
            read_length=options.read_length,
            detection_method=options.detection_method,
            umi_getter=umi_getter,
            all_reads=False,
            return_read2=False,
            return_unmapped=False):

        nInput += sum(bundle[umi]["count"] for umi in bundle)

        if nOutput % 10000 == 0:
            U.debug("Outputted %i" % nOutput)

        if nInput % 1000000 == 0:
            U.debug("Read %i input reads" % nInput)

        if options.stats:
            # generate pre-dedup stats: average distance between the umis
            # of this bundle and between an equal number of random umis
            average_distance = umi_methods.get_average_umi_distance(
                bundle.keys())
            pre_cluster_stats.append(average_distance)
            cluster_size = len(bundle)
            random_umis = read_gn.getUmis(cluster_size)
            average_distance_null = umi_methods.get_average_umi_distance(
                random_umis)
            pre_cluster_stats_null.append(average_distance_null)

        if options.ignore_umi:
            for umi in bundle:
                nOutput += 1
                outfile.write(bundle[umi]["read"])

        else:

            # set up ReadCluster functor with methods specific to
            # specified options.method
            processor = network.ReadDeduplicator(options.method)

            # dedup using umis and write out deduped bam
            reads, umis, umi_counts = processor(bundle=bundle,
                                                threshold=options.threshold)

            for read in reads:
                outfile.write(read)
                nOutput += 1

            if options.stats:

                # collect pre-dedup stats
                stats_pre_df_dict['UMI'].extend(bundle)
                stats_pre_df_dict['counts'].extend(
                    [bundle[UMI]['count'] for UMI in bundle])

                # collect post-dedup stats
                post_cluster_umis = [umi_getter(x) for x in reads]
                stats_post_df_dict['UMI'].extend(umis)
                stats_post_df_dict['counts'].extend(umi_counts)

                average_distance = umi_methods.get_average_umi_distance(
                    post_cluster_umis)
                post_cluster_stats.append(average_distance)

                cluster_size = len(post_cluster_umis)
                random_umis = read_gn.getUmis(cluster_size)
                average_distance_null = umi_methods.get_average_umi_distance(
                    random_umis)
                post_cluster_stats_null.append(average_distance_null)

    outfile.close()

    if options.stats:

        # generate the stats dataframe
        stats_pre_df = pd.DataFrame(stats_pre_df_dict)
        stats_post_df = pd.DataFrame(stats_post_df_dict)

        # tally the counts per umi per position
        pre_counts = collections.Counter(stats_pre_df["counts"])
        post_counts = collections.Counter(stats_post_df["counts"])
        counts_index = sorted(set(pre_counts).union(post_counts))
        with U.openFile(options.stats + "_per_umi_per_position.tsv",
                        "w") as outf:
            outf.write("counts\tinstances_pre\tinstances_post\n")
            for count in counts_index:
                values = (count, pre_counts[count], post_counts[count])
                outf.write("\t".join(map(str, values)) + "\n")

        # aggregate stats pre/post per UMI
        agg_pre_df = aggregateStatsDF(stats_pre_df)
        agg_post_df = aggregateStatsDF(stats_post_df)

        agg_df = pd.merge(agg_pre_df,
                          agg_post_df,
                          how='left',
                          left_index=True,
                          right_index=True,
                          sort=True,
                          suffixes=["_pre", "_post"])

        # TS - if count value not observed either pre/post-dedup,
        # merge will leave an empty cell and the column will be cast as a float
        # see http://pandas.pydata.org/pandas-docs/dev/missing_data.html
        # --> Missing data casting rules and indexing
        # so, back fill with zeros and convert back to int
        agg_df = agg_df.fillna(0).astype(int)

        agg_df.index = [x.decode() for x in agg_df.index]
        agg_df.index.name = 'UMI'
        agg_df.to_csv(options.stats + "_per_umi.tsv", sep="\t")

        # bin distances into integer bins
        max_ed = int(
            max(
                map(max, [
                    pre_cluster_stats, post_cluster_stats,
                    pre_cluster_stats_null, post_cluster_stats_null
                ])))

        cluster_bins = range(-1, max_ed + 2)

        def bin_clusters(cluster_list, bins=cluster_bins):
            ''' take list of floats and return bins'''
            return np.digitize(cluster_list, bins, right=True)

        def tallyCounts(binned_cluster, max_edit_distance):
            ''' tally counts per bin '''
            return np.bincount(binned_cluster, minlength=max_edit_distance + 3)

        pre_cluster_binned = bin_clusters(pre_cluster_stats)
        post_cluster_binned = bin_clusters(post_cluster_stats)
        pre_cluster_null_binned = bin_clusters(pre_cluster_stats_null)
        post_cluster_null_binned = bin_clusters(post_cluster_stats_null)

        # TS - the lowest bin (-1) is labelled "Single_UMI".  The labels
        # are built before the frame is created, rather than patched in
        # afterwards through chained indexing, which pandas does not
        # guarantee to write through to the frame.
        edit_distance_labels = ["Single_UMI"] + list(cluster_bins)[1:]

        edit_distance_df = pd.DataFrame({
            "unique":
            tallyCounts(pre_cluster_binned, max_ed),
            "unique_null":
            tallyCounts(pre_cluster_null_binned, max_ed),
            options.method:
            tallyCounts(post_cluster_binned, max_ed),
            "%s_null" % options.method:
            tallyCounts(post_cluster_null_binned, max_ed),
            "edit_distance":
            edit_distance_labels
        })

        edit_distance_df.to_csv(options.stats + "_edit_distance.tsv",
                                index=False,
                                sep="\t")

    # write footer and output benchmark information.

    U.info(
        "%s" %
        ", ".join(["%s: %s" % (x[0], x[1])
                   for x in read_events.most_common()]))
    U.info("Number of reads out: %i" % nOutput)

    U.Stop()

Example #28

Show file

File: whitelist.py Project: zhaoning2016/UMI-tools

def main(argv=None):
    """Script main: build a whitelist of cell barcodes from fastq input.

    Parses command line options in sys.argv, unless *argv* is given.

    Fastq records are read from ``options.stdin`` (paired with the file
    given by ``--read2-in`` when supplied).  For every read whose
    barcode could be extracted, either the read (``--method=reads``) or
    the distinct UMI (``--method=umis``) is tallied against its cell
    barcode.  The tallies are handed to ``umi_methods.getCellWhitelist``
    and one tab-separated line per accepted barcode is written to
    ``options.stdout``: barcode, barcodes corrected to it, its count and
    the counts of the corrected barcodes.

    Raises:
        ValueError: if ``--set-cell-number`` is larger than the number
            of cell barcodes observed.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-p",
                      "--bc-pattern",
                      dest="pattern",
                      type="string",
                      help="Barcode pattern")
    parser.add_option("--bc-pattern2",
                      dest="pattern2",
                      type="string",
                      help="Barcode pattern for paired reads")
    parser.add_option("--3prime",
                      dest="prime3",
                      action="store_true",
                      help="barcode is on 3' end of read.")
    parser.add_option("--read2-in",
                      dest="read2_in",
                      type="string",
                      help="file name for read pairs")
    parser.add_option("--extract-method",
                      dest="extract_method",
                      type="choice",
                      choices=["string", "regex"],
                      help=("How to extract the umi +/- cell barcodes, Choose "
                            "from 'string' or 'regex'"))
    parser.add_option("--plot-prefix",
                      dest="plot_prefix",
                      type="string",
                      help=("Prefix for plots to visualise the automated "
                            "detection of the number of 'true' cell barcodes"))
    parser.add_option("--subset-reads",
                      dest="subset_reads",
                      type="int",
                      help=("Use the first N reads to automatically identify "
                            "the true cell barcodes. If N is greater than the "
                            "number of reads, all reads will be used"))
    parser.add_option("--error-correct-threshold",
                      dest="error_correct_threshold",
                      type="int",
                      help=("Hamming distance for correction of "
                            "barcodes to whitelist barcodes"))
    parser.add_option("--method",
                      dest="method",
                      choices=["reads", "umis"],
                      help=("Use reads or unique umi counts per cell"))
    parser.add_option("--expect-cells",
                      dest="expect_cells",
                      type="int",
                      help=("Prior expectation on the upper limit on the "
                            "number of cells sequenced"))
    parser.add_option("--set-cell-number",
                      dest="cell_number",
                      type="int",
                      help=("Specify the number of cell barcodes to accept"))
    parser.set_defaults(method="reads",
                        extract_method="string",
                        filter_cell_barcodes=False,
                        whitelist_tsv=None,
                        blacklist_tsv=None,
                        error_correct_threshold=1,
                        pattern=None,
                        pattern2=None,
                        read2_in=None,
                        plot_prefix=None,
                        subset_reads=100000000,
                        expect_cells=False,
                        cell_number=False)

    # add common options (-h/--help, ...) and parse command line

    (options, args) = U.Start(parser,
                              argv=argv,
                              add_group_dedup_options=False,
                              add_sam_options=False)

    if options.expect_cells and options.cell_number:
        U.error("Cannot supply both --expect-cells and "
                "--set-cell-number options")

    if not options.pattern and not options.pattern2:
        if not options.read2_in:
            U.error("Must supply --bc-pattern for single-end")
        else:
            U.error("Must supply --bc-pattern and/or --bc-pattern2 "
                    "if paired-end ")

    # a read2 pattern only makes sense when a read2 file was supplied
    if options.pattern2 and not options.read2_in:
        U.error("must specify a paired fastq ``--read2-in``")

    # If the pattern is a regex we can compile the regex(es) prior to
    # ExtractFilterAndUpdate instantiation
    if options.extract_method == "regex":
        if options.pattern:
            try:
                options.pattern = regex.compile(options.pattern)
            except regex.error:
                U.error("barcode_regex '%s' is not a "
                        "valid regex" % options.pattern)

        if options.pattern2:
            try:
                options.pattern2 = regex.compile(options.pattern2)
            except regex.error:
                U.error("barcode_regex2 '%s' is not a "
                        "valid regex" % options.pattern2)

    extract_cell = False
    extract_umi = False

    supplied_patterns = [
        p for p in (options.pattern, options.pattern2) if p]

    # check whether the regex contains a umi group(s) and cell groups(s)
    if options.extract_method == "regex":
        group_names = [
            name for p in supplied_patterns for name in p.groupindex]
        extract_cell = any(
            name.startswith("cell_") for name in group_names)
        extract_umi = any(
            name.startswith("umi_") for name in group_names)

    # check whether the pattern string contains umi/cell bases
    elif options.extract_method == "string":
        extract_cell = any("C" in p for p in supplied_patterns)
        extract_umi = any("N" in p for p in supplied_patterns)

    if not extract_umi:
        if options.extract_method == "string":
            U.error("barcode pattern(s) do not include any umi bases "
                    "(marked with 'Ns') %s, %s" %
                    (options.pattern, options.pattern2))
        elif options.extract_method == "regex":
            U.error("barcode regex(es) do not include any umi groups "
                    "(starting with 'umi_') %s, %s" %
                    (options.pattern, options.pattern2))
    if not extract_cell:
        if options.extract_method == "string":
            U.error("barcode pattern(s) do not include any cell bases "
                    "(marked with 'Cs') %s, %s" %
                    (options.pattern, options.pattern2))
        elif options.extract_method == "regex":
            U.error("barcode regex(es) do not include any cell groups "
                    "(starting with 'cell_') %s, %s" %
                    (options.pattern, options.pattern2))

    read1s = umi_methods.fastqIterate(options.stdin)

    # set up read extractor
    ReadExtractor = umi_methods.ExtractFilterAndUpdate(
        method=options.extract_method,
        pattern=options.pattern,
        pattern2=options.pattern2,
        prime3=options.prime3,
        extract_cell=extract_cell)

    cell_barcode_counts = collections.Counter()

    n_reads = 0
    n_cell_barcodes = 0

    # if using the umis method, need to keep a set of umis observed
    if options.method == "umis":
        cell_barcode_umis = collections.defaultdict(set)

    # variables for progress monitor
    displayMax = 100000
    U.info("Starting barcode extraction")

    # NOTE(review): the single-end loop stops once the number of reads
    # *matching the pattern* exceeds --subset-reads, whereas the
    # paired-end loop stops on the number of reads *parsed*.  Left as
    # is because changing either alters which reads are sampled.
    if not options.read2_in:
        for read1 in read1s:

            # Update display in every 100kth iteration
            if n_reads % displayMax == 0:
                U.info("Parsed {} reads".format(n_reads))

            n_reads += 1
            barcode_values = ReadExtractor.getBarcodes(read1)
            if barcode_values is not None:
                cell, umi, _, _, _, _, _ = barcode_values
                if options.method == "umis":
                    cell_barcode_umis[cell].add(umi)
                else:
                    cell_barcode_counts[cell] += 1
                n_cell_barcodes += 1

                if (options.subset_reads and
                        n_cell_barcodes > options.subset_reads):
                    break
    else:
        read2s = umi_methods.fastqIterate(U.openFile(options.read2_in))
        for read1, read2 in izip(read1s, read2s):

            # Update display in every 100kth iteration
            if n_reads % displayMax == 0:
                U.info("Parsed {} reads".format(n_reads))

            n_reads += 1

            barcode_values = ReadExtractor.getBarcodes(read1, read2)
            if barcode_values is not None:
                cell, umi, _, _, _, _, _ = barcode_values
                if options.method == "umis":
                    cell_barcode_umis[cell].add(umi)
                else:
                    cell_barcode_counts[cell] += 1
                n_cell_barcodes += 1

                if options.subset_reads and n_reads > options.subset_reads:
                    break

    U.info("Starting - whitelist determination")

    # with the umis method the count per cell is the number of
    # distinct umis seen for that cell
    if options.method == "umis":
        for cell in cell_barcode_umis:
            cell_barcode_counts[cell] = len(cell_barcode_umis[cell])

    if options.cell_number and options.cell_number > len(cell_barcode_counts):
        raise ValueError(
            "--set-cell-number option specifies more cell barcodes than the "
            "number of observed cell barcodes. This may be because "
            "--subset-reads was set to a value too low to capture reads from "
            "all cells. %s cell barcodes observed from %s parsed reads. "
            "Expected>= %s cell barcodes" %
            (len(cell_barcode_counts), n_reads, options.cell_number))

    cell_whitelist, true_to_false_map = umi_methods.getCellWhitelist(
        cell_barcode_counts, options.expect_cells, options.cell_number,
        options.error_correct_threshold, options.plot_prefix)

    U.info("Writing out whitelist")
    for barcode in sorted(cell_whitelist):

        if true_to_false_map:
            corrected = sorted(true_to_false_map[barcode])
            corrected_barcodes = ",".join(corrected)
            corrected_barcode_counts = ",".join(
                str(cell_barcode_counts[x]) for x in corrected)
        else:
            corrected_barcodes, corrected_barcode_counts = "", ""

        options.stdout.write(
            "%s\t%s\t%s\t%s\n" %
            (barcode, corrected_barcodes, cell_barcode_counts[barcode],
             corrected_barcode_counts))

    U.info("Parsed %i reads" % n_reads)
    U.info("%i reads matched the barcode pattern" % n_cell_barcodes)
    U.info("Found %i unique cell barcodes" % len(cell_barcode_counts))

    U.Stop()

Example #29

Show file

File: group.py Project: bdemaree/UMI-tools

def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads alignments from the file given as input, bundles them with
    ``umi_methods.get_bundles``, clusters the UMIs of each bundle with
    ``network.UMIClusterer`` and then, depending on the options, writes
    the reads tagged with their group to a SAM/BAM file
    (``--output-bam``) and/or a per-read mapping table (``--group-out``).

    Raises a ValueError when the input is standard in.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    group = U.OptionGroup(parser, "group-specific options")

    group.add_option("--group-out", dest="tsv", type="string",
                     help="Outfile name for file mapping read id to read group",
                     default=None)

    group.add_option("--output-bam", dest="output_bam", action="store_true",
                     default=False,
                     help=("output a bam file with read groups tagged using the UG tag"
                           "[default=%default]"))

    group.add_option("--output-unmapped", dest="output_unmapped", action="store_true",
                     default=False,
                     help=("Retain all unmapped reads in output[default=%default]"))

    parser.add_option("--umi-group-tag", dest="umi_group_tag",
                      type="string", help="tag for the outputted umi group",
                      default='BX')

    parser.add_option_group(group)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = U.Start(parser, argv=argv)

    U.validateSamOptions(options)

    # the input is re-opened by name with pysam below, so only its file
    # name is kept and the handle itself is closed
    if options.stdin != sys.stdin:
        in_name = options.stdin.name
        options.stdin.close()
    else:
        raise ValueError("Input on standard in not currently supported")

    # Work out where the alignments are written. When the output is to
    # be sorted, reads first go to a temporary file (out_name) which is
    # sorted into sorted_out_name at the end; "-" is used as the name
    # when writing to standard out.
    if options.stdout != sys.stdout:
        if options.no_sort_output:
            out_name = options.stdout.name
        else:
            out_name = U.getTempFilename()
            sorted_out_name = options.stdout.name
        options.stdout.close()
        # NOTE(review): this is input validation done with assert, so it
        # is skipped when python runs with -O
        assert options.output_bam, (
            "To output a bam you must include --output-bam option")
    else:
        if options.no_sort_output:
            out_name = "-"
        else:
            out_name = U.getTempFilename()
            sorted_out_name = "-"

    if not options.no_sort_output:  # need to determine the output format for sort
        if options.out_sam:
            sort_format = "sam"
        else:
            sort_format = "bam"

    # file modes handed to pysam.Samfile for the input and output files
    if options.in_sam:
        in_mode = "r"
    else:
        in_mode = "rb"

    if options.out_sam:
        out_mode = "wh"
    else:
        out_mode = "wb"

    infile = pysam.Samfile(in_name, in_mode)

    # alignments are only written when --output-bam was given; outfile
    # is None otherwise and is checked before every write below
    if options.output_bam:
        outfile = pysam.Samfile(out_name, out_mode, template=infile)
    else:
        outfile = None

    # optional table with one line per read; the header is written here
    if options.tsv:
        mapping_outfile = U.openFile(options.tsv, "w")
        mapping_outfile.write("%s\n" % "\t".join(
            ["read_id", "contig", "position", "gene", "umi", "umi_count",
             "final_umi", "final_umi_count", "unique_id"]))

    # nInput/nOutput count reads in and out, unique_id numbers the
    # groups, input_reads/output_reads hold the last totals reported in
    # the progress messages
    nInput, nOutput, unique_id, input_reads, output_reads = 0, 0, 0, 0, 0

    gene_tag = options.gene_tag
    metacontig2contig = None

    # choose the read iterator: a single contig, the meta-contig fetcher
    # (per-gene with a gene-transcript map) or the whole file
    if options.chrom:
        inreads = infile.fetch(reference=options.chrom)
    else:
        if options.per_gene and options.gene_transcript_map:
            metacontig2contig = umi_methods.getMetaContig2contig(
                infile, options.gene_transcript_map)
            metatag = "MC"
            inreads = umi_methods.metafetcher(infile, metacontig2contig, metatag)
            # the gene is read from the "MC" tag in this mode
            gene_tag = metatag

        else:
            inreads = infile.fetch(until_eof=options.output_unmapped)

    bundle_iterator = umi_methods.get_bundles(
        options,
        all_reads=True,
        return_read2=True,
        return_unmapped=options.output_unmapped,
        metacontig_contig=metacontig2contig)

    for bundle, key, status in bundle_iterator(inreads):

        # write out read2s and unmapped (if these options are set)
        if status == 'single_read':
            # bundle is just a single read here
            nInput += 1

            if outfile:
                outfile.write(bundle)

            nOutput += 1
            continue

        # bundle maps each umi to a record holding its "count" and its
        # "read" entries (both keys are used below)
        umis = bundle.keys()
        counts = {umi: bundle[umi]["count"] for umi in umis}

        nInput += sum(counts.values())

        # progress messages: one per 10000 reads written and one per
        # 1000000 reads parsed
        while nOutput >= output_reads + 10000:
            output_reads += 10000
            U.info("Written out %i reads" % output_reads)

        while nInput >= input_reads + 1000000:
            input_reads += 1000000
            U.info("Parsed %i input reads" % input_reads)

        # set up UMIClusterer functor with methods specific to
        # specified options.method
        processor = network.UMIClusterer(options.method)

        # group the umis
        groups = processor(
            umis,
            counts,
            threshold=options.threshold)

        for umi_group in groups:
            # the first umi of a group is reported as its final umi
            top_umi = umi_group[0]

            group_count = sum(counts[umi] for umi in umi_group)

            for umi in umi_group:
                reads = bundle[umi]['read']
                for read in reads:
                    if outfile:
                        # Tag the read with the group number ('UG') and
                        # with the group's top umi (options.umi_group_tag)
                        read.tags += [('UG', unique_id)]
                        read.tags += [(options.umi_group_tag, top_umi)]
                        outfile.write(read)

                    if options.tsv:
                        if options.per_gene:
                            gene = read.get_tag(gene_tag)
                        else:
                            gene = "NA"
                        # one line per read, columns as in the header
                        # written above
                        mapping_outfile.write("%s\n" % "\t".join(map(str, (
                            read.query_name, read.reference_name,
                            umi_methods.get_read_position(
                                read, options.soft_clip_threshold)[1],
                            gene,
                            umi.decode(),
                            counts[umi],
                            top_umi.decode(),
                            group_count,
                            unique_id))))

                    nOutput += 1

            # each group gets the next consecutive id
            unique_id += 1

    if outfile:
        outfile.close()
        if not options.no_sort_output:
            # sort the output
            pysam.sort("-o", sorted_out_name, "-O", sort_format, out_name)
            os.unlink(out_name)  # delete the tempfile

    if options.tsv:
        mapping_outfile.close()

    # write footer and output benchmark information.
    U.info(
        "Reads: %s" % ", ".join(["%s: %s" % (x[0], x[1]) for x in
                                 bundle_iterator.read_events.most_common()]))
    U.info("Number of reads out: %i, Number of groups: %i" %
           (nOutput, unique_id))
    U.Stop()

Example #30

Show file

File: whitelist_methods.py Project: AbateLab/DAb-seq

def getKneeEstimateDensity(cell_barcode_counts,
                           expect_cells=False,
                           cell_number=False,
                           plotfile_prefix=None):
    ''' estimate the number of "true" cell barcodes using a gaussian
    density-based method

    A gaussian kernel density is fitted to the log10 counts per barcode
    and one of its local minima is taken as the count threshold; every
    barcode with a count above the threshold is accepted.

    input:
         cell_barcode_counts = collections.Counter (most_common() is
                               used) of barcode -> count
         expect_cells (optional) = define the expected number of cells.
                               A local minimum is only accepted if it
                               yields more than 10% of, and no more
                               than, this many barcodes
         cell_number (optional) = define number of cell barcodes to
                               accept. The threshold is then taken
                               directly from the ranked counts. If it
                               is not smaller than the number of
                               observed barcodes, all barcodes are
                               accepted
         plotfile_prefix = (optional) prefix for plots. When given,
                               density, knee and count plots are saved
                               and, unless cell_number is set, a table
                               of the candidate thresholds is written

    returns:
         Set of true barcodes, or None if no threshold could be
         selected
    '''

    # very low abundance cell barcodes are filtered out (< 0.001 *
    # the most abundant)
    threshold = 0.001 * cell_barcode_counts.most_common(1)[0][1]

    counts = sorted(cell_barcode_counts.values(), reverse=True)
    counts_thresh = [x for x in counts if x > threshold]
    log_counts = np.log10(counts_thresh)

    # gaussian density with hardcoded bw
    density = gaussian_kde(log_counts, bw_method=0.1)

    xx_values = 10000  # how many x values for density plot
    xx = np.linspace(log_counts.min(), log_counts.max(), xx_values)

    local_min = None

    if cell_number:  # we have a prior hard expectation on the number of cells
        if cell_number < len(counts):
            # barcodes strictly above the count at this rank are kept
            threshold = counts[cell_number]
        else:
            # All observed barcodes were asked for: there is no count
            # at that rank to index, so use a positive cut-off below
            # the smallest observed count instead.
            threshold = counts[-1] / 2.0

    else:
        local_mins = argrelextrema(density(xx), np.less)[0]
        local_mins_counts = []

        # candidate minima are visited from the highest count downwards
        for poss_local_min in local_mins[::-1]:

            cutoff = np.power(10, xx[poss_local_min])
            passing_threshold = sum(
                y > cutoff for y in cell_barcode_counts.values())
            local_mins_counts.append(passing_threshold)

            # Only test the candidate if no local min has been selected
            # yet. Compared against None explicitly so that index 0
            # would also count as a selection.
            if local_min is None:
                if expect_cells:  # we have a "soft" expectation
                    if (passing_threshold > expect_cells * 0.1
                            and passing_threshold <= expect_cells):
                        local_min = poss_local_min

                else:  # we have no prior expectation
                    # TS: In absence of any expectation (either hard or soft),
                    # this set of heuristic thresholds are used to decide
                    # which local minimum to select.
                    # This is very unlikely to be the best way to achieve this!
                    if (poss_local_min >= 0.2 * xx_values and
                        (log_counts.max() - xx[poss_local_min] > 0.5
                         or xx[poss_local_min] < log_counts.max() / 2)):
                        local_min = poss_local_min

        if local_min is not None:
            threshold = np.power(10, xx[local_min])

    if cell_number or local_min is not None:
        final_barcodes = set(
            x for x, y in cell_barcode_counts.items() if y > threshold)
    else:
        final_barcodes = None

    if plotfile_prefix:

        # colour-blind friendly colours - https://gist.github.com/thriveth/8560036
        CB_color_cycle = [
            '#377eb8', '#ff7f00', '#4daf4a', '#f781bf', '#a65628', '#984ea3',
            '#999999', '#e41a1c', '#dede00'
        ]
        selected_colour = CB_color_cycle[0]
        rejected_colour = CB_color_cycle[3]

        # proxy lines, only used as legend handles
        user_line = mlines.Line2D([], [],
                                  color=selected_colour,
                                  ls="dashed",
                                  markersize=15,
                                  label='User-defined')
        selected_line = mlines.Line2D([], [],
                                      color=selected_colour,
                                      ls="dashed",
                                      markersize=15,
                                      label='Selected')
        rejected_line = mlines.Line2D([], [],
                                      color=rejected_colour,
                                      ls="dashed",
                                      markersize=15,
                                      label='Rejected')

        def add_legend(axis, handles, title):
            ''' place the legend to the right of the axes '''
            return axis.legend(bbox_to_anchor=(1.05, 1),
                               loc=2,
                               borderaxespad=0.,
                               handles=handles,
                               title=title)

        def mark_rank_thresholds(axis):
            ''' draw the threshold(s) on a plot whose x-axis is the
            barcode rank and return the legend '''
            if cell_number:
                axis.axvline(x=cell_number, ls="dashed",
                             color=selected_colour)
                return add_legend(axis, [user_line], "Cell threshold")

            for local_mins_count in local_mins_counts:
                # the selected minimum is the one which yields exactly
                # the accepted barcodes
                is_selected = (local_min is not None and
                               local_mins_count == len(final_barcodes))
                axis.axvline(
                    x=local_mins_count,
                    ls="dashed",
                    color=selected_colour if is_selected else rejected_colour)
            return add_legend(axis, [selected_line, rejected_line],
                              "Possible thresholds")

        # make density plot
        fig = plt.figure()
        fig1 = fig.add_subplot(111)
        fig1.plot(xx, density(xx), 'k')
        fig1.set_xlabel("Count per cell (log10)")
        fig1.set_ylabel("Density")

        if cell_number:
            fig1.axvline(np.log10(threshold),
                         ls="dashed",
                         color=selected_colour)
            lgd = add_legend(fig1, [user_line], "Cell threshold")
        else:
            for pos in xx[local_mins]:
                is_selected = (local_min is not None and
                               pos == xx[local_min])
                fig1.axvline(
                    x=pos,
                    ls="dashed",
                    color=selected_colour if is_selected else rejected_colour)
            lgd = add_legend(fig1, [selected_line, rejected_line],
                             "Possible thresholds")

        fig.savefig("%s_cell_barcode_count_density.png" % plotfile_prefix,
                    bbox_extra_artists=(lgd, ),
                    bbox_inches='tight')

        # make knee plot
        fig = plt.figure()
        fig2 = fig.add_subplot(111)
        fig2.plot(range(0, len(counts)), np.cumsum(counts), c="black")

        xmax = len(counts)
        if local_min is not None:
            # reasonable maximum x-axis value
            xmax = min(len(final_barcodes) * 5, xmax)

        fig2.set_xlim((0 - (0.01 * xmax), xmax))
        fig2.set_xlabel("Rank")
        fig2.set_ylabel("Cumulative count")

        lgd = mark_rank_thresholds(fig2)

        fig.savefig("%s_cell_barcode_knee.png" % plotfile_prefix,
                    bbox_extra_artists=(lgd, ),
                    bbox_inches='tight')

        # make count per barcode plot; accepted barcodes are only
        # coloured when a local minimum was selected
        if local_min is not None:
            n_selected = len(final_barcodes)
            colours = ([selected_colour] * n_selected +
                       ["black"] * (len(counts) - n_selected))
        else:
            colours = ["black"] * len(counts)

        fig = plt.figure()
        fig3 = fig.add_subplot(111)
        fig3.scatter(x=range(1,
                             len(counts) + 1),
                     y=counts,
                     c=colours,
                     s=10,
                     linewidths=0)
        fig3.loglog()
        fig3.set_xlim(0, len(counts) * 1.25)
        fig3.set_xlabel('Barcode index')
        fig3.set_ylabel('Count')

        lgd = mark_rank_thresholds(fig3)

        fig.savefig("%s_cell_barcode_counts.png" % plotfile_prefix,
                    bbox_extra_artists=(lgd, ),
                    bbox_inches='tight')

        # table of all candidate thresholds and what happened to them
        if not cell_number:
            with U.openFile("%s_cell_thresholds.tsv" % plotfile_prefix,
                            "w") as outf:
                outf.write("count\taction\n")
                for local_mins_count in local_mins_counts:
                    if (local_min is not None and
                            local_mins_count == len(final_barcodes)):
                        threshold_type = "Selected"
                    else:
                        threshold_type = "Rejected"

                    outf.write("%s\t%s\n" % (local_mins_count, threshold_type))

    return final_barcodes

Example #31

Show file

File: dedup.py Project: messersc/UMI-tools

def main(argv=None):
    """Script entry point: deduplicate reads in a SAM/BAM file.

    Parses the command line (``sys.argv`` unless *argv* is given), opens the
    input and output alignment files with pysam, iterates over the read
    bundles yielded by ``get_bundles`` and writes the retained reads.  With
    ``--ignore-umi`` one read per bundle entry is written; otherwise the
    bundle is passed to a ``ClusterAndReducer`` built for ``--method``.

    When ``--output-stats`` is given, the value is used as a filename prefix
    for the ``_per_umi_per_position.tsv``, ``_per_umi.tsv`` and
    ``_edit_distance.tsv`` tables (plus ``_topologies.tsv`` and
    ``_nodes.tsv`` with ``--further-stats``).

    Raises:
        ValueError: if input is on standard in, if incompatible options are
            combined, or if the requested multimapping detection tag is not
            available in the input file.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-i", "--in-sam", dest="in_sam", action="store_true",
                      help="Input file is in sam format", default=False)
    parser.add_option("-o", "--out-sam", dest="out_sam", action="store_true",
                      help="Output alignments in sam format", default=False)
    parser.add_option("--ignore-umi", dest="ignore_umi", action="store_true",
                      help="Ignore UMI and dedup only on position",
                      default=False)
    # NOTE: parsed as a string and converted with float() when passed to
    # get_bundles below.
    parser.add_option("--subset", dest="subset", type="string",
                      help="Use only a fraction of reads, specified by subset",
                      default=1.1)
    parser.add_option("--spliced-is-unique", dest="spliced",
                      action="store_true",
                      help="Treat a spliced read as different to an unspliced"
                           " one",
                      default=False)
    parser.add_option("--soft-clip-threshold", dest="soft",
                      type="float",
                      help="number of bases clipped from 5' end before"
                           " read is counted as spliced",
                      default=4)
    parser.add_option("--edit-distance-threshold", dest="threshold",
                      type="int",
                      help="Edit distance threshold at which to join two UMIs"
                           " when clustering", default=1)
    parser.add_option("--chrom", dest="chrom", type="string",
                      help="Restrict to one chromosome",
                      default=None)
    parser.add_option("--paired", dest="paired", action="store_true",
                      default=False,
                      help="Use second-in-pair position when deduping")
    parser.add_option("--method", dest="method", type="choice",
                      choices=("adjacency", "directional-adjacency",
                               "percentile", "unique", "cluster"),
                      default="directional-adjacency",
                      help="method to use for umi deduping")
    parser.add_option("--output-stats", dest="stats", type="string",
                      default=False,
                      help="Specify location to output stats")
    parser.add_option("--further-stats", dest="further_stats",
                      action="store_true", default=False,
                      help="Output further stats")
    parser.add_option("--per-contig", dest="per_contig", action="store_true",
                      default=False,
                      help=("dedup per contig,"
                            " e.g for transcriptome where contig = gene"))
    parser.add_option("--whole-contig", dest="whole_contig", action="store_true",
                      default=False,
                      help="Read whole contig before outputting bundles: guarantees that no reads"
                           " are missed, but increases memory usage")
    parser.add_option("--multimapping-detection-method",
                      dest="detection_method", type="choice",
                      choices=("NH", "X0", "XT"),
                      default=None,
                      help=("Some aligners identify multimapping using bam "
                            "tags. Setting this option to NH, X0 or XT will "
                            "use these tags when selecting the best read "
                            "amongst reads with the same position and umi"))
    parser.add_option("--mapping-quality", dest="mapping_quality",
                      type="int",
                      help="Minimum mapping quality for a read to be retained",
                      default=0)
    parser.add_option("--read-length", dest="read_length", action="store_true",
                      default=False,
                      help=("use read length in addition to position and UMI"
                            " to identify possible duplicates"))

    # add common options (-h/--help, ...) and parse command line
    (options, args) = U.Start(parser, argv=argv)

    # pysam opens the files by name, so the handles created by U.Start are
    # closed and only their names are kept.
    if options.stdin != sys.stdin:
        in_name = options.stdin.name
        options.stdin.close()
    else:
        raise ValueError("Input on standard in not currently supported")

    if options.stdout != sys.stdout:
        out_name = options.stdout.name
        options.stdout.close()
    else:
        out_name = "-"

    if options.in_sam:
        in_mode = "r"
    else:
        in_mode = "rb"

    if options.out_sam:
        out_mode = "w"
    else:
        out_mode = "wb"

    if options.stats:
        if options.ignore_umi:
            raise ValueError("'--output-stats' and '--ignore-umi' options"
                             " cannot be used together")

    if options.further_stats:
        if not options.stats:
            raise ValueError("'--further-stats' options requires "
                             "'--output-stats' option")
        if options.method not in ["cluster", "adjacency"]:
            raise ValueError("'--further-stats' only enabled with 'cluster' "
                             "and 'adjacency' methods")

    infile = pysam.Samfile(in_name, in_mode)
    outfile = pysam.Samfile(out_name, out_mode,
                            template=infile)

    if options.paired:
        outfile = TwoPassPairWriter(infile, outfile)

    nInput, nOutput = 0, 0

    if options.detection_method:
        bam_features = detect_bam_features(infile.filename)

        if not bam_features[options.detection_method]:
            if sum(bam_features.values()) == 0:
                raise ValueError(
                    "There are no bam tags available to detect multimapping. "
                    "Do not set --multimapping-detection-method")
            else:
                raise ValueError(
                    "The chosen method of detection for multimapping (%s) "
                    "will not work with this bam. Multimapping can be detected"
                    " for this bam using any of the following: %s" % (
                        options.detection_method, ",".join(
                            [x for x in bam_features if bam_features[x]])))

    if options.stats:
        # set up arrays to hold stats data
        stats_pre_df_dict = {"UMI": [], "counts": []}
        stats_post_df_dict = {"UMI": [], "counts": []}
        pre_cluster_stats = []
        post_cluster_stats = []
        pre_cluster_stats_null = []
        post_cluster_stats_null = []
        topology_counts = collections.Counter()
        node_counts = collections.Counter()
        read_gn = random_read_generator(infile.filename, chrom=options.chrom)

    for bundle in get_bundles(infile,
                              ignore_umi=options.ignore_umi,
                              subset=float(options.subset),
                              quality_threshold=options.mapping_quality,
                              paired=options.paired,
                              chrom=options.chrom,
                              spliced=options.spliced,
                              soft_clip_threshold=options.soft,
                              per_contig=options.per_contig,
                              whole_contig=options.whole_contig,
                              read_length=options.read_length,
                              detection_method=options.detection_method):

        nInput += sum([bundle[umi]["count"] for umi in bundle])

        if nOutput % 10000 == 0:
            U.debug("Outputted %i" % nOutput)

        if nInput % 1000000 == 0:
            U.debug("Read %i input reads" % nInput)

        if options.stats:
            # generate pre-dedup stats; the "null" value is the same
            # statistic computed on UMIs drawn from read_gn
            average_distance = get_average_umi_distance(bundle.keys())
            pre_cluster_stats.append(average_distance)
            cluster_size = len(bundle)
            random_umis = read_gn.getUmis(cluster_size)
            average_distance_null = get_average_umi_distance(random_umis)
            pre_cluster_stats_null.append(average_distance_null)

        if options.ignore_umi:
            for umi in bundle:
                nOutput += 1
                outfile.write(bundle[umi]["read"])

        else:

            # set up ClusterAndReducer functor with methods specific to
            # specified options.method
            processor = ClusterAndReducer(options.method)

            # dedup using umis and write out deduped bam
            reads, umis, umi_counts, topologies, nodes = processor(
                bundle, options.threshold,
                options.stats, options.further_stats)

            for read in reads:
                outfile.write(read)
                nOutput += 1

            if options.stats:

                # collect pre-dedup stats
                stats_pre_df_dict['UMI'].extend(bundle)
                stats_pre_df_dict['counts'].extend(
                    [bundle[UMI]['count'] for UMI in bundle])

                # collect post-dedup stats; the UMI is taken as the text
                # after the last "_" in the read name
                post_cluster_umis = [x.qname.split("_")[-1] for x in reads]
                stats_post_df_dict['UMI'].extend(umis)
                stats_post_df_dict['counts'].extend(umi_counts)

                average_distance = get_average_umi_distance(post_cluster_umis)
                post_cluster_stats.append(average_distance)

                cluster_size = len(post_cluster_umis)
                random_umis = read_gn.getUmis(cluster_size)
                average_distance_null = get_average_umi_distance(random_umis)
                post_cluster_stats_null.append(average_distance_null)

                if options.further_stats:
                    for c_type, count in topologies.most_common():
                        topology_counts[c_type] += count
                    for c_type, count in nodes.most_common():
                        node_counts[c_type] += count

    # Close the output first: with --paired it wraps infile and may still
    # read from it while closing. Then release the input handle.
    outfile.close()
    infile.close()

    if options.stats:

        stats_pre_df = pd.DataFrame(stats_pre_df_dict)
        stats_post_df = pd.DataFrame(stats_post_df_dict)

        # Tally how many UMI/position combinations were observed with each
        # read count, before and after dedup.  Plain Counters are used
        # instead of pivot_table (whose return type differs between pandas
        # versions) and every count seen pre- OR post-dedup gets a row, with
        # 0 for the side on which it was not observed.
        pre_counts = collections.Counter(stats_pre_df_dict["counts"])
        post_counts = collections.Counter(stats_post_df_dict["counts"])
        counts_index = sorted(set(pre_counts).union(post_counts))

        with U.openFile(options.stats + "_per_umi_per_position.tsv",
                        "w") as outf:
            outf.write("counts\tinstances_pre\tinstances_post\n")
            for count in counts_index:
                values = (count, pre_counts[count], post_counts[count])
                outf.write("\t".join(map(str, values)) + "\n")

        # aggregate stats pre/post per UMI
        agg_pre_df = aggregateStatsDF(stats_pre_df)
        agg_post_df = aggregateStatsDF(stats_post_df)

        agg_df = pd.merge(agg_pre_df, agg_post_df, how='left',
                          left_index=True, right_index=True,
                          sort=True, suffixes=["_pre", "_post"])

        # TS - if a UMI is not observed post-dedup, merge will leave an empty
        # cell and the column will be cast as a float
        # see http://pandas.pydata.org/pandas-docs/dev/missing_data.html
        # --> Missing data casting rules and indexing
        # so, back fill with zeros and convert back to int
        agg_df = agg_df.fillna(0).astype(int)
        agg_df.to_csv(options.stats + "_per_umi.tsv", sep="\t")

        # bin distances into integer bins
        max_ed = int(max(map(max, [pre_cluster_stats,
                                   post_cluster_stats,
                                   pre_cluster_stats_null,
                                   post_cluster_stats_null])))

        cluster_bins = range(-1, int(max_ed) + 2)

        def bin_clusters(cluster_list, bins=cluster_bins):
            ''' take list of floats and return bins'''
            return np.digitize(cluster_list, bins, right=True)

        def tallyCounts(binned_cluster, max_edit_distance):
            ''' tally counts per bin '''
            return np.bincount(binned_cluster,
                               minlength=max_edit_distance + 3)

        pre_cluster_binned = bin_clusters(pre_cluster_stats)
        post_cluster_binned = bin_clusters(post_cluster_stats)
        pre_cluster_null_binned = bin_clusters(pre_cluster_stats_null)
        post_cluster_null_binned = bin_clusters(post_cluster_stats_null)

        # TS - the lowest bin (-1) is labelled "Single_UMI".  The labels are
        # built before the DataFrame is created so that no chained
        # assignment into the frame is needed afterwards.
        edit_distance_labels = ["Single_UMI"] + list(cluster_bins)[1:]

        edit_distance_df = pd.DataFrame({
            "unique": tallyCounts(pre_cluster_binned, max_ed),
            "unique_null": tallyCounts(pre_cluster_null_binned, max_ed),
            options.method: tallyCounts(post_cluster_binned, max_ed),
            "%s_null" % options.method: tallyCounts(post_cluster_null_binned, max_ed),
            "edit_distance": edit_distance_labels})

        edit_distance_df.to_csv(options.stats + "_edit_distance.tsv",
                                index=False, sep="\t")

        if options.further_stats:
            with U.openFile(options.stats + "_topologies.tsv", "w") as outf:
                outf.write(
                    "\n".join(["\t".join((x, str(y)))
                               for x, y in topology_counts.most_common()]) + "\n")

            with U.openFile(options.stats + "_nodes.tsv", "w") as outf:
                outf.write(
                    "\n".join(["\t".join(map(str, (x, y))) for
                               x, y in node_counts.most_common()]) + "\n")

    # write footer and output benchmark information.
    U.info("Number of reads in: %i, Number of reads out: %i" %
           (nInput, nOutput))
    U.Stop()

Example #32

Show file

def main(argv=None):
    """Script entry point: deduplicate reads in a SAM/BAM file.

    Parses the command line (``sys.argv`` unless *argv* is given), opens the
    input and output alignment files with pysam, iterates over the bundles
    produced by ``sam_methods.get_bundles`` and writes the reads retained by
    ``network.ReadDeduplicator`` (or one read per bundle entry with
    ``--ignore-umi``).  Unless ``options.no_sort_output`` is set, reads are
    first written to a temporary file which is then sorted into the final
    output with ``pysam.sort`` and deleted.

    When ``--output-stats`` is given, the value is used as a filename prefix
    for the ``_per_umi_per_position.tsv``, ``_per_umi.tsv`` and
    ``_edit_distance.tsv`` tables.

    Raises:
        ValueError: if input is on standard in, if ``--output-stats`` is
            combined with ``--ignore-umi``, or if the requested multimapping
            detection tag is not available in the input file.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=usage,
                            description=globals()["__doc__"])
    group = U.OptionGroup(parser, "dedup-specific options")

    group.add_option("--output-stats",
                     dest="stats",
                     type="string",
                     default=False,
                     help="Specify location to output stats")

    parser.add_option_group(group)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = U.Start(parser, argv=argv)

    U.validateSamOptions(options, group=False)

    # NOTE(review): a seed of 0 is falsy and is therefore not applied here;
    # confirm whether that is intended.
    if options.random_seed:
        np.random.seed(options.random_seed)

    # pysam opens the files by name, so the handles created by U.Start are
    # closed and only their names are kept.
    if options.stdin != sys.stdin:
        in_name = options.stdin.name
        options.stdin.close()
    else:
        raise ValueError("Input on standard in not currently supported")

    # When the output is to be sorted, reads are written to a temporary file
    # (out_name) and sorted into sorted_out_name afterwards.
    if options.stdout != sys.stdout:
        if options.no_sort_output:
            out_name = options.stdout.name
        else:
            out_name = U.getTempFilename(dir=options.tmpdir)
            sorted_out_name = options.stdout.name
        options.stdout.close()
    else:
        if options.no_sort_output:
            out_name = "-"
        else:
            out_name = U.getTempFilename(dir=options.tmpdir)
            sorted_out_name = "-"

    if not options.no_sort_output:  # need to determine the output format for sort
        if options.out_sam:
            sort_format = "sam"
        else:
            sort_format = "bam"

    if options.in_sam:
        in_mode = "r"
    else:
        in_mode = "rb"

    if options.out_sam:
        out_mode = "wh"
    else:
        out_mode = "wb"

    if options.stats and options.ignore_umi:
        raise ValueError("'--output-stats' and '--ignore-umi' options"
                         " cannot be used together")

    infile = pysam.Samfile(in_name, in_mode)
    outfile = pysam.Samfile(out_name, out_mode, template=infile)

    if options.paired:
        outfile = sam_methods.TwoPassPairWriter(infile, outfile)

    nInput, nOutput, input_reads, output_reads = 0, 0, 0, 0

    if options.detection_method:
        bam_features = detect_bam_features(infile.filename)

        if not bam_features[options.detection_method]:
            if sum(bam_features.values()) == 0:
                raise ValueError(
                    "There are no bam tags available to detect multimapping. "
                    "Do not set --multimapping-detection-method")
            else:
                raise ValueError(
                    "The chosen method of detection for multimapping (%s) "
                    "will not work with this bam. Multimapping can be detected"
                    " for this bam using any of the following: %s" %
                    (options.detection_method, ",".join(
                        [x for x in bam_features if bam_features[x]])))

    # NOTE(review): gene_tag is assigned here and below but is never read in
    # this function; confirm whether it should be passed on to get_bundles.
    gene_tag = options.gene_tag
    metacontig2contig = None

    if options.chrom:
        inreads = infile.fetch(reference=options.chrom)

    else:
        if options.per_contig and options.gene_transcript_map:
            metacontig2contig = sam_methods.getMetaContig2contig(
                infile, options.gene_transcript_map)
            metatag = "MC"
            inreads = sam_methods.metafetcher(infile, metacontig2contig,
                                              metatag)
            gene_tag = metatag

        else:
            inreads = infile.fetch()

    # set up ReadCluster functor with methods specific to
    # specified options.method
    processor = network.ReadDeduplicator(options.method)

    bundle_iterator = sam_methods.get_bundles(
        options, metacontig_contig=metacontig2contig)

    if options.stats:
        # set up arrays to hold stats data
        stats_pre_df_dict = {"UMI": [], "counts": []}
        stats_post_df_dict = {"UMI": [], "counts": []}
        pre_cluster_stats = []
        post_cluster_stats = []
        pre_cluster_stats_null = []
        post_cluster_stats_null = []
        read_gn = umi_methods.random_read_generator(
            infile.filename,
            chrom=options.chrom,
            barcode_getter=bundle_iterator.barcode_getter)

    for bundle, key, status in bundle_iterator(inreads):

        nInput += sum([bundle[umi]["count"] for umi in bundle])

        # report progress once per 100,000 reads written / 1,000,000 parsed
        while nOutput >= output_reads + 100000:
            output_reads += 100000
            U.info("Written out %i reads" % output_reads)

        while nInput >= input_reads + 1000000:
            input_reads += 1000000
            U.info("Parsed %i input reads" % input_reads)

        if options.stats:
            # generate pre-dedup stats; the "null" value is the same
            # statistic computed on UMIs drawn from read_gn
            average_distance = umi_methods.get_average_umi_distance(
                bundle.keys())
            pre_cluster_stats.append(average_distance)
            cluster_size = len(bundle)
            random_umis = read_gn.getUmis(cluster_size)
            average_distance_null = umi_methods.get_average_umi_distance(
                random_umis)
            pre_cluster_stats_null.append(average_distance_null)

        if options.ignore_umi:
            for umi in bundle:
                nOutput += 1
                outfile.write(bundle[umi]["read"])

        else:

            # dedup using umis and write out deduped bam
            reads, umis, umi_counts = processor(bundle=bundle,
                                                threshold=options.threshold)

            for read in reads:
                outfile.write(read)
                nOutput += 1

            if options.stats:

                # collect pre-dedup stats
                stats_pre_df_dict['UMI'].extend(bundle)
                stats_pre_df_dict['counts'].extend(
                    [bundle[UMI]['count'] for UMI in bundle])

                # collect post-dedup stats
                post_cluster_umis = [
                    bundle_iterator.barcode_getter(x)[0] for x in reads
                ]
                stats_post_df_dict['UMI'].extend(umis)
                stats_post_df_dict['counts'].extend(umi_counts)

                average_distance = umi_methods.get_average_umi_distance(
                    post_cluster_umis)
                post_cluster_stats.append(average_distance)

                cluster_size = len(post_cluster_umis)
                random_umis = read_gn.getUmis(cluster_size)
                average_distance_null = umi_methods.get_average_umi_distance(
                    random_umis)
                post_cluster_stats_null.append(average_distance_null)

    # Close the output first: with --paired it wraps infile and may still
    # read from it while closing. Then release the input handle.
    outfile.close()
    infile.close()

    if not options.no_sort_output:
        # sort the output
        pysam.sort("-o", sorted_out_name, "-O", sort_format, out_name)
        os.unlink(out_name)  # delete the tempfile

    if options.stats:

        # generate the stats dataframe
        stats_pre_df = pd.DataFrame(stats_pre_df_dict)
        stats_post_df = pd.DataFrame(stats_post_df_dict)

        # tally the counts per umi per position
        pre_counts = collections.Counter(stats_pre_df["counts"])
        post_counts = collections.Counter(stats_post_df["counts"])
        counts_index = list(
            set(pre_counts.keys()).union(set(post_counts.keys())))
        counts_index.sort()
        with U.openFile(options.stats + "_per_umi_per_position.tsv",
                        "w") as outf:
            outf.write("counts\tinstances_pre\tinstances_post\n")
            for count in counts_index:
                values = (count, pre_counts[count], post_counts[count])
                outf.write("\t".join(map(str, values)) + "\n")

        # aggregate stats pre/post per UMI
        agg_pre_df = aggregateStatsDF(stats_pre_df)
        agg_post_df = aggregateStatsDF(stats_post_df)

        agg_df = pd.merge(agg_pre_df,
                          agg_post_df,
                          how='left',
                          left_index=True,
                          right_index=True,
                          sort=True,
                          suffixes=["_pre", "_post"])

        # TS - if count value not observed either pre/post-dedup,
        # merge will leave an empty cell and the column will be cast as a float
        # see http://pandas.pydata.org/pandas-docs/dev/missing_data.html
        # --> Missing data casting rules and indexing
        # so, back fill with zeros and convert back to int
        agg_df = agg_df.fillna(0).astype(int)

        # presumably the UMIs are bytes objects at this point - the index is
        # decoded to text before writing
        agg_df.index = [x.decode() for x in agg_df.index]
        agg_df.index.name = 'UMI'
        agg_df.to_csv(options.stats + "_per_umi.tsv", sep="\t")

        # bin distances into integer bins
        max_ed = int(
            max(
                map(max, [
                    pre_cluster_stats, post_cluster_stats,
                    pre_cluster_stats_null, post_cluster_stats_null
                ])))

        cluster_bins = range(-1, int(max_ed) + 2)

        def bin_clusters(cluster_list, bins=cluster_bins):
            ''' take list of floats and return bins'''
            return np.digitize(cluster_list, bins, right=True)

        def tallyCounts(binned_cluster, max_edit_distance):
            ''' tally counts per bin '''
            return np.bincount(binned_cluster, minlength=max_edit_distance + 3)

        pre_cluster_binned = bin_clusters(pre_cluster_stats)
        post_cluster_binned = bin_clusters(post_cluster_stats)
        pre_cluster_null_binned = bin_clusters(pre_cluster_stats_null)
        post_cluster_null_binned = bin_clusters(post_cluster_stats_null)

        # TS - the lowest bin (-1) is labelled "Single_UMI".  The labels are
        # built before the DataFrame is created so that no chained
        # assignment into the frame is needed afterwards.
        edit_distance_labels = ["Single_UMI"] + list(cluster_bins)[1:]

        edit_distance_df = pd.DataFrame(
            {
                "unique":
                tallyCounts(pre_cluster_binned, max_ed),
                "unique_null":
                tallyCounts(pre_cluster_null_binned, max_ed),
                options.method:
                tallyCounts(post_cluster_binned, max_ed),
                "%s_null" % options.method:
                tallyCounts(post_cluster_null_binned, max_ed),
                "edit_distance":
                edit_distance_labels
            },
            columns=[
                "unique", "unique_null", options.method,
                "%s_null" % options.method, "edit_distance"
            ])

        edit_distance_df.to_csv(options.stats + "_edit_distance.tsv",
                                index=False,
                                sep="\t")

    # write footer and output benchmark information.
    U.info("Reads: %s" % ", ".join([
        "%s: %s" % (x[0], x[1])
        for x in bundle_iterator.read_events.most_common()
    ]))

    U.info("Number of reads out: %i" % nOutput)

    if not options.ignore_umi:  # otherwise processor has not been used
        U.info("Total number of positions deduplicated: %i" %
               processor.UMIClusterer.positions)
        if processor.UMIClusterer.positions > 0:
            U.info("Mean number of unique UMIs per position: %.2f" %
                   (float(processor.UMIClusterer.total_umis_per_position) /
                    processor.UMIClusterer.positions))
            U.info("Max. number of unique UMIs per position: %i" %
                   processor.UMIClusterer.max_umis_per_position)
        else:
            U.warn("The BAM did not contain any valid "
                   "reads/read pairs for deduplication")

    U.Stop()

Example #33

Show file

File: whitelist.py Project: yiyewuqingfeng/UMI-tools

def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Fastq records are read from ``options.stdin`` (paired with
    ``options.read2_in`` when supplied) and passed through the read
    extractor.  Each cell barcode is tallied either by number of reads or,
    with ``--method=umis``, by number of distinct UMIs.  The tallies are
    handed to ``whitelist_methods.getCellWhitelist`` and the accepted
    barcodes are written to ``options.stdout`` as four tab-separated
    columns: barcode, barcodes mapped to it in the true-to-false map, its
    count, and the counts of those mapped barcodes.

    Raises ValueError when ``--set-cell-number`` exceeds the number of
    observed cell barcodes.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=usage,
                            description=globals()["__doc__"])

    group = U.OptionGroup(parser, "whitelist-specific options")

    group.add_option("--plot-prefix",
                     dest="plot_prefix", type="string",
                     help=("Prefix for plots to visualise the automated "
                           "detection of the number of 'true' cell barcodes"))
    group.add_option("--subset-reads",
                     dest="subset_reads", type="int",
                     help=("Use the first N reads to automatically identify "
                           "the true cell barcodes. If N is greater than the "
                           "number of reads, all reads will be used. "
                           "Default is 100,000,000"))
    group.add_option("--error-correct-threshold",
                     dest="error_correct_threshold",
                     type="int",
                     help=("Hamming distance for correction of barcodes to "
                           "whitelist barcodes. This value will also be used "
                           "for error detection above the knee if required "
                           "(--ed-above-threshold)"))
    group.add_option("--method",
                     dest="method",
                     choices=["reads", "umis"],
                     help=("Use reads or unique umi counts per cell"))
    group.add_option("--knee-method",
                     dest="knee_method",
                     choices=["distance", "density"],
                     help=("Use distance or density methods for detection of knee"))
    group.add_option("--expect-cells",
                     dest="expect_cells",
                     type="int",
                     help=("Prior expectation on the upper limit on the "
                           "number of cells sequenced"))
    group.add_option("--allow-threshold-error",
                     dest="allow_threshold_error", action="store_true",
                     help=("Don't select a threshold. Will still "
                           "output the plots if requested (--plot-prefix)"))
    group.add_option("--set-cell-number",
                     dest="cell_number",
                     type="int",
                     help=("Specify the number of cell barcodes to accept"))

    parser.add_option("--ed-above-threshold",
                      dest="ed_above_threshold", type="choice",
                      choices=["discard", "correct"],
                      help=("Detect CBs above the threshold which may be "
                            "sequence errors from another CB and either "
                            "'discard' or 'correct'. Default=discard"))
    parser.add_option_group(group)

    parser.set_defaults(method="reads",
                        knee_method="distance",
                        extract_method="string",
                        whitelist_tsv=None,
                        blacklist_tsv=None,
                        error_correct_threshold=1,
                        pattern=None,
                        pattern2=None,
                        read2_in=None,
                        plot_prefix=None,
                        subset_reads=100000000,
                        expect_cells=False,
                        allow_threshold_error=False,
                        cell_number=False,
                        ed_above_threshold=None,
                        ignore_suffix=False)

    # add common options (-h/--help, ...) and parse command line

    (options, args) = U.Start(parser, argv=argv,
                              add_extract_options=True,
                              add_group_dedup_options=False,
                              add_umi_grouping_options=False,
                              add_sam_options=False)

    if options.filtered_out and not options.extract_method == "regex":
        U.error("Reads will not be filtered unless extract method is "
                "set to regex (--extract-method=regex)")

    if options.expect_cells:
        if options.knee_method == "distance":
            U.error("Cannot use --expect-cells with 'distance' knee "
                    "method. Switch to --knee-method=density if you want to "
                    "provide an expectation for the number of "
                    "cells. Alternatively, if you know the number of cell "
                    "barcodes, use --set-cell-number")
        if options.cell_number:
            U.error("Cannot supply both --expect-cells and "
                    "--set-cell-number options")

    extract_cell, extract_umi = U.validateExtractOptions(options)

    if not extract_cell:
        if options.extract_method == "string":
            U.error("barcode pattern(s) do not include any cell bases "
                    "(marked with 'Cs') %s, %s" % (
                        options.pattern, options.pattern2))
        elif options.extract_method == "regex":
            # the '%' operator is required here: without it the string
            # literal is called like a function and raises TypeError
            U.error("barcode regex(es) do not include any cell groups "
                    "(starting with 'cell_') %s, %s" % (
                        options.pattern, options.pattern2))

    read1s = umi_methods.fastqIterate(options.stdin)

    # set up read extractor
    ReadExtractor = extract_methods.ExtractFilterAndUpdate(
        method=options.extract_method,
        pattern=options.pattern,
        pattern2=options.pattern2,
        prime3=options.prime3,
        extract_cell=extract_cell)

    cell_barcode_counts = collections.Counter()

    n_reads = 0
    n_cell_barcodes = 0

    # if using the umis method, need to keep a set of umis observed
    if options.method == "umis":
        cell_barcode_umis = collections.defaultdict(set)

    # variables for progress monitor
    displayMax = 100000
    U.info("Starting barcode extraction")

    # Both handles start as None so that the clean-up at the end of the
    # function never touches a name that was not bound (filtered_out2 is
    # only opened for paired-end input).
    filtered_out = None
    filtered_out2 = None

    if options.filtered_out:
        filtered_out = U.openFile(options.filtered_out, "w")

    if not options.read2_in:
        for read1 in read1s:

            # Update display in every 100kth iteration
            if n_reads % displayMax == 0:
                U.info("Parsed {} reads".format(n_reads))

            n_reads += 1
            barcode_values = ReadExtractor.getBarcodes(read1)
            if barcode_values is None:
                # read did not yield barcodes; optionally keep a copy
                if filtered_out is not None:
                    filtered_out.write(str(read1) + "\n")
                continue
            else:
                cell, umi, _, _, _, _, _ = barcode_values
                if options.method == "umis":
                    cell_barcode_umis[cell].add(umi)
                else:
                    cell_barcode_counts[cell] += 1
                n_cell_barcodes += 1

            # NOTE(review): this branch stops on the number of reads with a
            # barcode, the paired-end branch below stops on the number of
            # reads parsed - confirm which is intended.
            if options.subset_reads:
                if n_cell_barcodes > options.subset_reads:
                    break
    else:

        if options.filtered_out2:
            filtered_out2 = U.openFile(options.filtered_out2, "w")

        read2s = umi_methods.fastqIterate(U.openFile(options.read2_in))
        for read1, read2 in izip(read1s, read2s):

            # Update display in every 100kth iteration
            if n_reads % displayMax == 0:
                U.info("Parsed {} reads".format(n_reads))

            n_reads += 1

            barcode_values = ReadExtractor.getBarcodes(read1, read2)
            if barcode_values is None:
                if filtered_out is not None:
                    filtered_out.write(str(read1) + "\n")
                if filtered_out2 is not None:
                    filtered_out2.write(str(read2) + "\n")
                continue
            else:
                cell, umi, _, _, _, _, _ = barcode_values
                if options.method == "umis":
                    cell_barcode_umis[cell].add(umi)
                else:
                    cell_barcode_counts[cell] += 1
                n_cell_barcodes += 1

            if options.subset_reads:
                if n_reads > options.subset_reads:
                    break

    U.info("Starting - whitelist determination")

    # with the umis method the per-cell count is the number of distinct UMIs
    if options.method == "umis":
        for cell in cell_barcode_umis:
            cell_barcode_counts[cell] = len(cell_barcode_umis[cell])

    if options.cell_number and options.cell_number > len(cell_barcode_counts):
        raise ValueError(
            "--set-cell-number option specifies more cell barcodes than the "
            "number of observed cell barcodes. This may be because "
            "--subset-reads was set to a value too low to capture reads from "
            "all cells. %s cell barcodes observed from %s parsed reads. "
            "Expected>= %s cell barcodes" % (
                len(cell_barcode_counts),
                options.subset_reads,
                options.cell_number))

    cell_whitelist, true_to_false_map = whitelist_methods.getCellWhitelist(
        cell_barcode_counts,
        options.knee_method,
        options.expect_cells,
        options.cell_number,
        options.error_correct_threshold,
        options.plot_prefix)

    if cell_whitelist:
        U.info("Top %s cell barcodes passed the selected threshold" %
               len(cell_whitelist))

    if options.ed_above_threshold:
        cell_whitelist, true_to_false_map = whitelist_methods.errorDetectAboveThreshold(
            cell_barcode_counts,
            cell_whitelist,
            true_to_false_map,
            errors=options.error_correct_threshold,
            resolution_method=options.ed_above_threshold)

    if cell_whitelist:
        U.info("Writing out whitelist")
        total_correct_barcodes = 0
        total_corrected_barcodes = 0
        for barcode in sorted(cell_whitelist):

            total_correct_barcodes += cell_barcode_counts[barcode]

            if true_to_false_map:
                # sort once and reuse for both the names and their counts
                # so the two output columns stay aligned
                error_barcodes = sorted(true_to_false_map[barcode])
                error_barcode_counts = [cell_barcode_counts[x]
                                        for x in error_barcodes]
                total_corrected_barcodes += sum(error_barcode_counts)

                corrected_barcodes = ",".join(error_barcodes)
                corrected_barcode_counts = ",".join(
                    map(str, error_barcode_counts))
            else:
                corrected_barcodes, corrected_barcode_counts = "", ""

            options.stdout.write("%s\t%s\t%s\t%s\n" % (
                barcode, corrected_barcodes, cell_barcode_counts[barcode],
                corrected_barcode_counts))
    else:
        msg = ("No local minima was accepted. Recommend checking the plot "
               "output and counts per local minima (requires `--plot-prefix` "
               "option) and then re-running with manually selected threshold "
               "(`--set-cell-number` option)")

        if options.allow_threshold_error:
            U.info(msg)
        else:
            U.error(msg)

    U.info("Parsed %i reads" % n_reads)
    U.info("%i reads matched the barcode pattern" % n_cell_barcodes)
    U.info("Found %i unique cell barcodes" % len(cell_barcode_counts))

    if cell_whitelist:
        U.info("Found %i total reads matching the selected cell barcodes" %
               total_correct_barcodes)
        U.info("Found %i total reads which can be error corrected to the "
               "selected cell barcodes" % total_corrected_barcodes)

    if filtered_out is not None:
        filtered_out.close()
    if filtered_out2 is not None:
        filtered_out2.close()

    U.Stop()

Example #34

Show file

def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads are taken from the input SAM/BAM in bundles produced by
    ``umi_methods.get_bundles``.  Each bundle is clustered with
    ``network.ReadClusterer`` and every read in a UMI group is written out:
    to the output BAM/SAM (tagged with the group id and the group's first
    UMI) when ``--output-bam`` is set, and/or as a row in the ``--group-out``
    tsv file.

    Raises ValueError when the input is standard in.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-i",
                      "--in-sam",
                      dest="in_sam",
                      action="store_true",
                      help="Input file is in sam format [default=%default]",
                      default=False)
    parser.add_option(
        "-o",
        "--out-sam",
        dest="out_sam",
        action="store_true",
        help="Output alignments in sam format [default=%default]",
        default=False)
    parser.add_option("--umi-separator",
                      dest="umi_sep",
                      type="string",
                      help="separator between read id and UMI",
                      default="_")
    parser.add_option("--umi-tag",
                      dest="umi_tag",
                      type="string",
                      help="tag containing umi",
                      default='RX')
    parser.add_option("--umi-group-tag",
                      dest="umi_group_tag",
                      type="string",
                      help="tag for the outputted umi group",
                      default='BX')
    parser.add_option("--extract-umi-method",
                      dest="get_umi_method",
                      type="choice",
                      choices=("read_id", "tag"),
                      default="read_id",
                      help="where is the read UMI encoded? [default=%default]")
    parser.add_option("--subset",
                      dest="subset",
                      type="float",
                      help="Use only a fraction of reads, specified by subset",
                      default=None)
    parser.add_option("--spliced-is-unique",
                      dest="spliced",
                      action="store_true",
                      help="Treat a spliced read as different to an unspliced"
                      " one [default=%default]",
                      default=False)
    parser.add_option("--soft-clip-threshold",
                      dest="soft",
                      type="float",
                      help="number of bases clipped from 5' end before"
                      " read is counted as spliced [default=%default]",
                      default=4)
    parser.add_option("--edit-distance-threshold",
                      dest="threshold",
                      type="int",
                      default=1,
                      help="Edit distance threshold at which to join two UMIs"
                      " when clustering. [default=%default]")
    parser.add_option("--chrom",
                      dest="chrom",
                      type="string",
                      help="Restrict to one chromosome",
                      default=None)
    parser.add_option("--paired",
                      dest="paired",
                      action="store_true",
                      default=False,
                      help="paired BAM. [default=%default]")
    parser.add_option("--method",
                      dest="method",
                      type="choice",
                      choices=("adjacency", "directional", "unique",
                               "cluster"),
                      default="directional",
                      help="method to use for umi deduping [default=%default]")
    parser.add_option("--per-contig",
                      dest="per_contig",
                      action="store_true",
                      default=False,
                      help=("dedup per contig,"
                            " e.g for transcriptome where contig = gene"))
    parser.add_option(
        "--whole-contig",
        dest="whole_contig",
        action="store_true",
        default=False,
        help=
        "Read whole contig before outputting bundles: guarantees that no reads"
        " are missed, but increases memory usage")
    parser.add_option(
        "--read-length",
        dest="read_length",
        action="store_true",
        default=False,
        help=("use read length in addition to position and UMI"
              " to identify possible duplicates [default=%default]"))
    parser.add_option("--mapping-quality",
                      dest="mapping_quality",
                      type="int",
                      help="Minimum mapping quality for a read to be retained"
                      " [default=%default]",
                      default=0)
    parser.add_option(
        "--group-out",
        dest="tsv",
        type="string",
        help="Outfile name for file mapping read id to read group",
        default=None)
    parser.add_option(
        "--output-bam",
        dest="output_bam",
        action="store_true",
        default=False,
        help=("output a bam file with read groups tagged using the UG tag"
              " [default=%default]"))

    # add common options (-h/--help, ...) and parse command line
    (options, args) = U.Start(parser, argv=argv)

    # pysam opens the files by name, so the handles created by the option
    # parser are closed and only their names are kept
    if options.stdin != sys.stdin:
        in_name = options.stdin.name
        options.stdin.close()
    else:
        raise ValueError("Input on standard in not currently supported")

    if options.stdout != sys.stdout:
        out_name = options.stdout.name
        options.stdout.close()
        assert options.output_bam, (
            "To output a bam you must include --output-bam option")
    else:
        out_name = "-"

    if options.in_sam:
        in_mode = "r"
    else:
        in_mode = "rb"

    if options.out_sam:
        out_mode = "w"
    else:
        out_mode = "wb"

    infile = pysam.Samfile(in_name, in_mode)

    if options.output_bam:
        outfile = pysam.Samfile(out_name, out_mode, template=infile)
        if options.paired:
            outfile = umi_methods.TwoPassPairWriter(infile, outfile, tags=True)
    else:
        outfile = None

    if options.tsv:
        mapping_outfile = U.openFile(options.tsv, "w")
        mapping_outfile.write(
            "read_id\tcontig\tposition\tumi\tumi_count\tfinal_umi\tfinal_umi_count\tunique_id\n"
        )

    # set the method with which to extract umis from reads
    if options.get_umi_method == "read_id":
        umi_getter = partial(umi_methods.get_umi_read_id, sep=options.umi_sep)
    elif options.get_umi_method == "tag":
        umi_getter = partial(umi_methods.get_umi_tag, tag=options.umi_tag)
    else:
        raise ValueError("Unknown umi extraction method")

    # unique_id doubles as the running group identifier and, at the end,
    # as the number of groups reported
    nInput, nOutput, unique_id = 0, 0, 0

    read_events = collections.Counter()

    for bundle, read_events in umi_methods.get_bundles(
            infile,
            read_events,
            ignore_umi=False,
            subset=options.subset,
            quality_threshold=options.mapping_quality,
            paired=options.paired,
            chrom=options.chrom,
            spliced=options.spliced,
            soft_clip_threshold=options.soft,
            per_contig=options.per_contig,
            whole_contig=options.whole_contig,
            read_length=options.read_length,
            umi_getter=umi_getter,
            all_reads=True):

        nInput += sum([bundle[umi]["count"] for umi in bundle])

        if nOutput % 10000 == 0:
            U.debug("Outputted %i" % nOutput)

        if nInput % 1000000 == 0:
            U.debug("Read %i input reads" % nInput)

        # set up ReadCluster functor with methods specific to
        # specified options.method
        processor = network.ReadClusterer(options.method)

        bundle, groups, counts = processor(bundle=bundle,
                                           threshold=options.threshold,
                                           stats=True,
                                           deduplicate=False)

        for umi_group in groups:
            # the first UMI of the group is reported as the group's final UMI
            top_umi = umi_group[0]

            group_count = sum(counts[umi] for umi in umi_group)

            for umi in umi_group:
                reads = bundle[umi]['read']
                for read in reads:
                    if outfile:
                        if options.paired:
                            # if paired, we need to supply the tags to
                            # add to the paired read
                            outfile.write(read, unique_id, top_umi)

                        else:
                            # Add the 'UG' tag to the read
                            read.tags += [('UG', unique_id)]
                            read.tags += [(options.umi_group_tag, top_umi)]
                            outfile.write(read)

                    if options.tsv:
                        mapping_outfile.write("%s\n" % "\t".join(
                            map(str, (read.query_name, read.reference_name,
                                      umi_methods.get_read_position(
                                          read, options.soft)[1], umi.decode(),
                                      counts[umi], top_umi.decode(),
                                      group_count, unique_id))))

                    nOutput += 1

            unique_id += 1

    if outfile:
        outfile.close()

    if options.tsv:
        mapping_outfile.close()

    # write footer and output benchmark information.
    U.info(
        "Reads: %s" %
        ", ".join(["%s: %s" % (x[0], x[1])
                   for x in read_events.most_common()]))
    U.info("Number of reads out: %i, Number of groups: %i" %
           (nOutput, unique_id))
    U.Stop()

Example #35

Show file

def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads from the input SAM/BAM are bundled by ``umi_methods.get_bundles``
    and keyed by (gene, cell).  The UMIs of each bundle are grouped with
    ``network.UMIClusterer`` and the number of groups is recorded as the
    count for that gene (and cell, with ``options.per_cell``).  Counts are
    first written to a temporary file and then copied to
    ``options.stdout`` in long format, or in wide format
    (rows=genes, columns=cells) with ``--wide-format-cell-counts``.

    Raises ValueError when the input is standard in.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    group = U.OptionGroup(parser, "count-specific options")

    # registered on the group (not the bare parser) so that the
    # "count-specific options" section is not left empty
    group.add_option("--wide-format-cell-counts",
                     dest="wide_format_cell_counts",
                     action="store_true",
                     default=False,
                     help=("output the cell counts in a wide format "
                           "(rows=genes, columns=cells)"))

    parser.add_option_group(group)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = U.Start(parser, argv=argv, add_group_dedup_options=False)

    options.per_gene = True  # hardcodes counting to per-gene only

    U.validateSamOptions(options, group=False)

    if options.random_seed:
        np.random.seed(options.random_seed)

    # pysam opens the file by name, so only the name of the handle is kept
    if options.stdin != sys.stdin:
        in_name = options.stdin.name
        options.stdin.close()
    else:
        raise ValueError("Input on standard in not currently supported")

    if options.in_sam:
        in_mode = "r"
    else:
        in_mode = "rb"

    infile = pysam.Samfile(in_name, in_mode)

    # write out to tempfile and then sort to stdout
    tmpfilename = U.getTempFilename(dir=options.tmpdir)
    tmpfile = U.openFile(tmpfilename, mode="w")

    nInput, nOutput, input_reads = 0, 0, 0

    gene_tag = options.gene_tag
    metacontig2contig = None

    if options.chrom:
        inreads = infile.fetch(reference=options.chrom)
    else:
        if options.gene_transcript_map:
            metacontig2contig = umi_methods.getMetaContig2contig(
                infile, options.gene_transcript_map)
            metatag = "MC"
            inreads = umi_methods.metafetcher(infile, metacontig2contig,
                                              metatag)
            gene_tag = metatag
        else:
            inreads = infile.fetch()

    bundle_iterator = umi_methods.get_bundles(
        options, only_count_reads=True, metacontig_contig=metacontig2contig)

    for bundle, key, status in bundle_iterator(inreads):
        if status == "single_read":
            continue

        gene, cell = key

        umis = bundle.keys()
        counts = {umi: bundle[umi]["count"] for umi in umis}

        nInput += sum(counts.values())

        # report progress once for every million input reads passed
        while nInput >= input_reads + 1000000:
            input_reads += 1000000
            U.info("Parsed %i input reads" % input_reads)

        # set up UMIClusterer functor with methods specific to
        # specified options.method

        processor = network.UMIClusterer(options.method)

        # group the umis
        groups = processor(umis, counts, threshold=options.threshold)

        gene_count = len(groups)

        if options.per_cell:
            tmpfile.write("%s\n" % "\t".join(
                (gene, cell.decode(), str(gene_count))))
        else:
            tmpfile.write("%s\n" % "\t".join((gene, str(gene_count))))

        nOutput += gene_count

    tmpfile.close()

    if options.per_cell:

        # gene -> {cell -> count}; counts stay as the strings read back
        # from the temporary file
        gene_counts_dict = {}

        with U.openFile(tmpfilename, mode="r") as inf:
            genes = set()
            cells = set()
            for line in inf:
                gene, cell, gene_count = line.strip().split("\t")
                genes.add(gene)
                cells.add(cell)

                if gene not in gene_counts_dict:
                    gene_counts_dict[gene] = {}

                gene_counts_dict[gene][cell] = gene_count

        if options.wide_format_cell_counts:  # write out in wide format

            # the column order is the same for every row: sort once
            sorted_cells = sorted(cells)

            options.stdout.write("%s\t%s\n" %
                                 ("gene", "\t".join(sorted_cells)))

            for gene in sorted(genes):
                cell_counts = gene_counts_dict[gene]
                # cells without a count for this gene are written as 0
                row = [cell_counts[cell] if cell in cell_counts else 0
                       for cell in sorted_cells]
                options.stdout.write("%s\t%s\n" %
                                     (gene, "\t".join(map(str, row))))

        else:  # write out in long format
            options.stdout.write("%s\t%s\t%s\n" % ("gene", "cell", "count"))
            for gene in sorted(genes):
                for cell in sorted(gene_counts_dict[gene]):
                    options.stdout.write(
                        "%s\t%s\t%s\n" %
                        (gene, cell, gene_counts_dict[gene][cell]))
    else:
        options.stdout.write("%s\t%s\n" % ("gene", "count"))

        with U.openFile(tmpfilename, mode="r") as inf:
            for line in inf:
                options.stdout.write(line)

    os.unlink(tmpfilename)

    # output reads events and benchmark information.
    for event in bundle_iterator.read_events.most_common():
        U.info("%s: %s" % (event[0], event[1]))

    U.info("Number of (post deduplication) reads counted: %i" % nOutput)

    U.Stop()

Example #36

Show file

File: extract.py Project: xcit10/UMI-tools

def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Builds the extract-specific option groups, validates the option
    combinations, configures an ``ExtractFilterAndUpdate`` read extractor
    (optionally with UMI / cell barcode whitelists and a cell blacklist)
    and then streams fastq records from ``options.stdin`` -- together with
    ``options.read2_in`` for paired input -- through the extractor.
    Extracted reads go to ``options.stdout`` / ``--read2-out``; rejected
    reads go to the ``filtered_out`` files when those were requested.

    Args:
        argv: list of command line arguments; defaults to ``sys.argv``.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=usage,
                            description=globals()["__doc__"])

    group = U.OptionGroup(parser, "extract-specific options")

    # (Experimental option) Retain the UMI in the sequence read
    group.add_option("--retain-umi", dest="retain_umi", action="store_true",
                     help=optparse.SUPPRESS_HELP)
    group.add_option("--read2-out", dest="read2_out", type="string",
                     help="file to output processed paired read to")
    group.add_option("--read2-stdout", dest="read2_stdout",
                     action="store_true",
                     help="Paired reads, send read2 to stdout, discarding read1")
    group.add_option("--quality-filter-threshold",
                     dest="quality_filter_threshold", type="int",
                     help=("Remove reads where any UMI base quality score "
                           "falls below this threshold"))
    group.add_option("--quality-filter-mask",
                     dest="quality_filter_mask", type="int",
                     help=("If a UMI base has a quality below this threshold, "
                           "replace the base with 'N'"))
    group.add_option("--quality-encoding",
                     dest="quality_encoding", type="choice",
                     choices=["phred33", "phred64", "solexa"],
                     help=("Quality score encoding. Choose from 'phred33'"
                           "[33-77] 'phred64' [64-106] or 'solexa' [59-106]"))
    group.add_option("--filter-cell-barcode",
                     dest="filter_cell_barcode",
                     action="store_true",
                     help=optparse.SUPPRESS_HELP)
    group.add_option("--error-correct-cell",
                     dest="error_correct_cell",
                     action="store_true",
                     help=("Correct errors in the cell barcode"))
    group.add_option("--whitelist",
                     dest="whitelist", type="string",
                     help=("A whitelist of accepted cell barcodes"))
    group.add_option("--blacklist",
                     dest="blacklist", type="string",
                     help=("A blacklist of rejected cell barcodes"))
    group.add_option("--filter-umi",
                     dest="filter_umi",
                     action="store_true",
                     #help="Filter the UMIs"
                     help=optparse.SUPPRESS_HELP)
    group.add_option("--umi-whitelist", dest="umi_whitelist",
                     type="string", default=None,
                     #help="A whitelist of accepted UMIs [default=%default]"
                     help=optparse.SUPPRESS_HELP)
    group.add_option("--umi-whitelist-paired", dest="umi_whitelist_paired",
                     type="string", default=None,
                     #help="A whitelist of accepted UMIs for read2[default=%default]"
                     help=optparse.SUPPRESS_HELP)
    group.add_option("--correct-umi-threshold", dest="correct_umi_threshold",
                     type="int", default=0,
                     #help="Correct errors in UMIs to the whitelist(s) provided"
                     #"if within threshold [default=%default]"
                     help=optparse.SUPPRESS_HELP)
    group.add_option("--umi-correct-log", dest="umi_correct_log",
                     type="string", default=None,
                     #help="File logging UMI error correction",
                     help=optparse.SUPPRESS_HELP)
    group.add_option("--subset-reads", "--reads-subset",
                     dest="reads_subset", type="int",
                     help=("Only extract from the first N reads. If N is "
                           "greater than the number of reads, all reads will "
                           "be used"))
    group.add_option("--reconcile-pairs",
                     dest="reconcile", action="store_true",
                     help=("Allow the presences of reads in read2 input that "
                           "are not present in read1 input. This allows cell "
                           "barcode filtering of read1s without "
                           "considering read2s"))
    parser.add_option_group(group)

    group = U.OptionGroup(parser, "[EXPERIMENTAl] barcode extraction options")

    group.add_option("--either-read", dest="either_read", action="store_true",
                     help="UMI may be on either read (see "
                     "--either-read-resolve) for options to resolve cases where"
                     "UMI is on both reads")
    group.add_option("--either-read-resolve",
                     dest="either_read_resolve", type="choice",
                     choices=["discard", "quality"],
                     help=("How to resolve instances where both reads "
                           "contain a UMI but using --either-read."
                           "Choose from 'discard' or 'quality'"
                           "(use highest quality). default=dicard"))

    parser.add_option_group(group)

    # NOTE(review): 'filter_cell_barcodes' does not match the dest of
    # --filter-cell-barcode ('filter_cell_barcode'); kept as-is so the
    # parsed options object is unchanged -- confirm whether it is needed.
    parser.set_defaults(extract_method="string",
                        filter_cell_barcodes=False,
                        whitelist=None,
                        blacklist=None,
                        error_correct_cell=False,
                        pattern=None,
                        pattern2=None,
                        read2_in=None,
                        read2_out=False,
                        read2_stdout=False,
                        quality_filter_threshold=None,
                        quality_encoding=None,
                        reconcile=False,
                        either_read=False,
                        either_read_resolve="discard",
                        ignore_suffix=False)

    # add common options (-h/--help, ...) and parse command line

    (options, args) = U.Start(parser, argv=argv,
                              add_extract_options=True,
                              add_group_dedup_options=False,
                              add_umi_grouping_options=False,
                              add_sam_options=False)

    if options.filter_cell_barcode:
        U.info('Use of --whitelist ensures cell barcodes are filtered. '
               '--filter-cell-barcode is no longer required and may be '
               'removed in future versions.')

    if options.whitelist is not None:
        options.filter_cell_barcode = True

    if options.retain_umi and not options.extract_method == "regex":
        U.error("option --retain-umi only works with --extract-method=regex")

    # the whitelist lives on the options object; a bare `whitelist` name
    # is not defined at this point
    if (options.filtered_out and not options.extract_method == "regex" and
            options.whitelist is None):
        U.error("Reads will not be filtered unless extract method is "
                "set to regex (--extract-method=regex) or cell "
                "barcodes are filtered (--whitelist)")

    if options.quality_filter_threshold or options.quality_filter_mask:
        if not options.quality_encoding:
            U.error("must provide a quality encoding (--quality-"
                    "encoding) to filter UMIs by quality (--quality"
                    "-filter-threshold) or mask low quality bases "
                    "with (--quality-filter-mask)")

    extract_cell, extract_umi = U.validateExtractOptions(options)

    if options.either_read:
        if extract_cell:
            U.error("Option to extract from either read (--either-read) "
                    "is not currently compatible with cell barcode extraction")
        if not options.extract_method == "regex":
            U.error("Option to extract from either read (--either-read) "
                    "requires --extract-method=regex")
        if not options.pattern or not options.pattern2:
            U.error("Option to extract from either read (--either-read) "
                    "requires --bc-pattern=[PATTERN1] and "
                    "--bc-pattern2=[PATTERN2]")

    if options.filter_umi:

        if not options.umi_whitelist:
            U.error("must provide a UMI whitelist (--umi-whitelist) if using "
                    "--filter-umi option")
        if options.pattern2 and not options.umi_whitelist_paired:
            U.error("must provide a UMI whitelist for paired end "
                    "(--umi-whitelist-paired) if using --filter-umi option "
                    "with paired end data")
        if not extract_umi:
            if options.extract_method == "string":
                U.error("barcode pattern(s) do not include any umi bases "
                        "(marked with 'Ns') %s, %s" % (
                            options.pattern, options.pattern2))
            elif options.extract_method == "regex":
                U.error("barcode regex(es) do not include any umi groups "
                        "(starting with 'umi_') %s, %s" % (
                            options.pattern, options.pattern2))

    if options.whitelist:

        if not extract_cell:
            if options.extract_method == "string":
                U.error("barcode pattern(s) do not include any cell bases "
                        "(marked with 'Cs') %s, %s" % (
                            options.pattern, options.pattern2))
            elif options.extract_method == "regex":
                U.error("barcode regex(es) do not include any cell groups "
                        "(starting with 'cell_') %s, %s" % (
                            options.pattern, options.pattern2))

    read1s = umi_methods.fastqIterate(options.stdin)

    # set up read extractor
    ReadExtractor = extract_methods.ExtractFilterAndUpdate(
        options.extract_method,
        options.pattern,
        options.pattern2,
        options.prime3,
        extract_cell,
        options.quality_encoding,
        options.quality_filter_threshold,
        options.quality_filter_mask,
        options.filter_umi,
        options.filter_cell_barcode,
        options.retain_umi,
        options.either_read,
        options.either_read_resolve)

    if options.filter_umi:
        umi_whitelist, false_to_true_map = whitelist_methods.getUserDefinedBarcodes(
            options.umi_whitelist,
            options.umi_whitelist_paired,
            deriveErrorCorrection=True,
            threshold=options.correct_umi_threshold)

        U.info("Length of whitelist: %i" % len(umi_whitelist))
        U.info("Length of 'correctable' whitelist: %i" % len(false_to_true_map))

        ReadExtractor.umi_whitelist = umi_whitelist
        ReadExtractor.umi_false_to_true_map = false_to_true_map
        ReadExtractor.umi_whitelist_counts = collections.defaultdict(
            lambda: collections.Counter())

    if options.whitelist:
        cell_whitelist, false_to_true_map = whitelist_methods.getUserDefinedBarcodes(
            options.whitelist,
            getErrorCorrection=options.error_correct_cell)

        ReadExtractor.cell_whitelist = cell_whitelist
        ReadExtractor.false_to_true_map = false_to_true_map

    if options.blacklist:
        blacklist = set()
        with U.openFile(options.blacklist, "r") as inf:
            for line in inf:
                # only the first tab-separated column is used
                blacklist.add(line.strip().split("\t")[0])
        ReadExtractor.cell_blacklist = blacklist

    # variables for progress monitor
    progCount = 0
    displayMax = 100000
    U.info("Starting barcode extraction")

    # File handles are bound up front (None == not opened) so the clean-up
    # at the end never touches an undefined name, e.g. when --read2-out is
    # given without a read2 input.
    filtered_out = None
    filtered_out2 = None
    read2_out = None
    read2_infile = None

    if options.filtered_out:
        filtered_out = U.openFile(options.filtered_out, "w")

    if options.read2_in is None:
        for read in read1s:

            # incrementing count for monitoring progress
            progCount += 1

            # Update display in every 100kth iteration
            if progCount % displayMax == 0:
                U.info("Parsed {} reads".format(progCount))

            new_read = ReadExtractor(read)

            if options.reads_subset:
                if (ReadExtractor.read_counts['Input Reads'] >
                        options.reads_subset):
                    break

            # a falsy result means the extractor rejected the read
            if not new_read:
                if filtered_out is not None:
                    filtered_out.write(str(read) + "\n")
                continue

            options.stdout.write(str(new_read) + "\n")

    else:

        if options.filtered_out2:
            filtered_out2 = U.openFile(options.filtered_out2, "w")

        # keep the handle so it can be closed once iteration is finished
        read2_infile = U.openFile(options.read2_in)
        read2s = umi_methods.fastqIterate(read2_infile)

        if options.read2_out:
            read2_out = U.openFile(options.read2_out, "w")

        # strict pairing unless --reconcile-pairs was given
        strict = not options.reconcile

        for read1, read2 in umi_methods.joinedFastqIterate(
                read1s, read2s, strict, options.ignore_suffix):

            # incrementing count for monitoring progress
            progCount += 1

            # Update display in every 100kth iteration
            if progCount % displayMax == 0:
                U.info("Parsed {} reads".format(progCount))
                sys.stdout.flush()

            reads = ReadExtractor(read1, read2)

            if options.reads_subset:
                if (ReadExtractor.read_counts['Input Reads'] >
                        options.reads_subset):
                    break

            # a falsy result means the extractor rejected the pair
            if not reads:
                if filtered_out is not None:
                    filtered_out.write(str(read1) + "\n")
                if filtered_out2 is not None:
                    filtered_out2.write(str(read2) + "\n")
                continue
            else:
                new_read1, new_read2 = reads

            if options.read2_stdout:
                options.stdout.write(str(new_read2) + "\n")
            else:
                options.stdout.write(str(new_read1) + "\n")

                if read2_out is not None:
                    read2_out.write(str(new_read2) + "\n")

    # close whichever handles were actually opened
    for handle in (read2_infile, read2_out, filtered_out, filtered_out2):
        if handle is not None:
            handle.close()

    for k, v in ReadExtractor.getReadCounts().most_common():
        U.info("%s: %s" % (k, v))

    if options.umi_correct_log:
        # the with-statement closes the log; no explicit close() needed
        with U.openFile(options.umi_correct_log, "w") as outf:
            outf.write("umi\tcount_no_errors\tcount_errors\n")
            for umi, counts in ReadExtractor.umi_whitelist_counts.items():
                outf.write("%s\t%i\t%i\n" % (
                    umi, counts["no_error"], counts["error"]))

    U.Stop()

Example #37

Show file

def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads alignments from the input file with pysam, bundles them with
    ``umi_methods.get_bundles``, groups the UMIs of every bundle with
    ``network.UMIClusterer`` and then, depending on the options, writes
    the reads tagged with their group to a SAM/BAM file (``--output-bam``)
    and/or a per-read table describing the groups (``--group-out``).

    Args:
        argv: list of command line arguments; defaults to ``sys.argv``.

    Raises:
        ValueError: if input is on standard in, if ``--per-gene`` is given
            without ``--gene-transcript-map``, or if the UMI extraction
            method is unknown.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-i",
                      "--in-sam",
                      dest="in_sam",
                      action="store_true",
                      help="Input file is in sam format [default=%default]",
                      default=False)
    parser.add_option(
        "-o",
        "--out-sam",
        dest="out_sam",
        action="store_true",
        help="Output alignments in sam format [default=%default]",
        default=False)
    parser.add_option("--umi-separator",
                      dest="umi_sep",
                      type="string",
                      help="separator between read id and UMI",
                      default="_")
    parser.add_option("--umi-tag",
                      dest="umi_tag",
                      type="string",
                      help="tag containing umi",
                      default='RX')
    parser.add_option("--umi-group-tag",
                      dest="umi_group_tag",
                      type="string",
                      help="tag for the outputted umi group",
                      default='BX')
    parser.add_option("--extract-umi-method",
                      dest="get_umi_method",
                      type="choice",
                      choices=("read_id", "tag"),
                      default="read_id",
                      help="where is the read UMI encoded? [default=%default]")
    parser.add_option("--subset",
                      dest="subset",
                      type="float",
                      help="Use only a fraction of reads, specified by subset",
                      default=None)
    parser.add_option("--spliced-is-unique",
                      dest="spliced",
                      action="store_true",
                      help="Treat a spliced read as different to an unspliced"
                      " one [default=%default]",
                      default=False)
    parser.add_option("--soft-clip-threshold",
                      dest="soft",
                      type="float",
                      help="number of bases clipped from 5' end before"
                      "read is counted as spliced [default=%default]",
                      default=4)
    parser.add_option("--edit-distance-threshold",
                      dest="threshold",
                      type="int",
                      default=1,
                      help="Edit distance theshold at which to join two UMIs"
                      "when clustering. [default=%default]")
    parser.add_option("--chrom",
                      dest="chrom",
                      type="string",
                      help="Restrict to one chromosome",
                      default=None)
    parser.add_option("--paired",
                      dest="paired",
                      action="store_true",
                      default=False,
                      help="paired BAM. [default=%default]")
    parser.add_option("--method",
                      dest="method",
                      type="choice",
                      choices=("adjacency", "directional", "unique",
                               "cluster"),
                      default="directional",
                      help="method to use for umi deduping [default=%default]")
    parser.add_option("--per-contig",
                      dest="per_contig",
                      action="store_true",
                      default=False,
                      help=("dedup per contig (field 3 in BAM; RNAME),"
                            " e.g for transcriptome where contig = gene"))
    parser.add_option("--per-gene",
                      dest="per_gene",
                      action="store_true",
                      default=False,
                      help=("Deduplicate per gene,"
                            "e.g for transcriptome where contig = transcript"
                            "must also provide a transript to gene map with"
                            "--gene-transcript-map [default=%default]"))
    parser.add_option("--gene-transcript-map",
                      dest="gene_transcript_map",
                      type="string",
                      help="file mapping transcripts to genes (tab separated)",
                      default=None)
    parser.add_option("--gene-tag",
                      dest="gene_tag",
                      type="string",
                      help=("Deduplicate per gene where gene is"
                            "defined by this bam tag [default=%default]"),
                      default=None)
    parser.add_option(
        "--read-length",
        dest="read_length",
        action="store_true",
        default=False,
        help=("use read length in addition to position and UMI"
              "to identify possible duplicates [default=%default]"))
    parser.add_option("--mapping-quality",
                      dest="mapping_quality",
                      type="int",
                      help="Minimum mapping quality for a read to be retained"
                      " [default=%default]",
                      default=0)
    parser.add_option(
        "--output-unmapped",
        dest="output_unmapped",
        action="store_true",
        default=False,
        help=("Retain all unmapped reads in output[default=%default]"))
    parser.add_option(
        "--group-out",
        dest="tsv",
        type="string",
        help="Outfile name for file mapping read id to read group",
        default=None)
    parser.add_option(
        "--output-bam",
        dest="output_bam",
        action="store_true",
        default=False,
        help=("output a bam file with read groups tagged using the UG tag"
              "[default=%default]"))
    # The default must be an alternation group: the former
    # "^[__|Unassigned]" was a character class and therefore matched any
    # tag that merely starts with one of the characters "_|Unasigned".
    parser.add_option(
        "--skip-tags-regex",
        dest="skip_regex",
        type="string",
        help=("Used with --gene-tag. "
              "Ignore reads where the gene-tag matches this regex"),
        default="^(__|Unassigned)")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = U.Start(parser, argv=argv)

    # pysam opens the files by name, so the handles opened by U.Start are
    # closed again once their names have been recorded
    if options.stdin != sys.stdin:
        in_name = options.stdin.name
        options.stdin.close()
    else:
        raise ValueError("Input on standard in not currently supported")

    if options.stdout != sys.stdout:
        out_name = options.stdout.name
        options.stdout.close()
        assert options.output_bam, (
            "To output a bam you must include --output-bam option")
    else:
        out_name = "-"

    if options.in_sam:
        in_mode = "r"
    else:
        in_mode = "rb"

    if options.out_sam:
        out_mode = "wh"
    else:
        out_mode = "wb"

    if options.per_gene:
        if not options.gene_transcript_map:
            raise ValueError(
                "--per-gene option requires --gene-transcript-map")

    infile = pysam.Samfile(in_name, in_mode)

    if options.output_bam:
        outfile = pysam.Samfile(out_name, out_mode, template=infile)
    else:
        outfile = None

    if options.tsv:
        mapping_outfile = U.openFile(options.tsv, "w")
        mapping_outfile.write("%s\n" % "\t".join([
            "read_id", "contig", "position", "gene", "umi", "umi_count",
            "final_umi", "final_umi_count", "unique_id"
        ]))

    # set the method with which to extract umis from reads
    if options.get_umi_method == "read_id":
        umi_getter = partial(umi_methods.get_umi_read_id, sep=options.umi_sep)
    elif options.get_umi_method == "tag":
        umi_getter = partial(umi_methods.get_umi_tag, tag=options.umi_tag)
    else:
        raise ValueError("Unknown umi extraction method")

    nInput, nOutput, unique_id = 0, 0, 0

    if options.chrom:
        inreads = infile.fetch(reference=options.chrom)
        gene_tag = options.gene_tag
    else:
        if options.per_gene and options.gene_transcript_map:
            metacontig2contig = umi_methods.getMetaContig2contig(
                infile, options.gene_transcript_map)
            metatag = "MC"
            inreads = umi_methods.metafetcher(infile, metacontig2contig,
                                              metatag)
            gene_tag = metatag

        else:
            inreads = infile.fetch(until_eof=options.output_unmapped)
            gene_tag = options.gene_tag

    # read_events is (re)bound on every iteration below; keep a placeholder
    # so the summary at the end still works when no bundle is yielded
    read_events = None

    for bundle, read_events, status in umi_methods.get_bundles(
            inreads,
            ignore_umi=False,
            subset=options.subset,
            quality_threshold=options.mapping_quality,
            paired=options.paired,
            spliced=options.spliced,
            soft_clip_threshold=options.soft,
            per_contig=options.per_contig,
            gene_tag=gene_tag,
            skip_regex=options.skip_regex,
            read_length=options.read_length,
            umi_getter=umi_getter,
            all_reads=True,
            return_read2=True,
            return_unmapped=options.output_unmapped):

        # write out read2s and unmapped if option set
        if status == 'single_read':
            # bundle is just a single read here; outfile is None unless
            # --output-bam was given, so only write when it exists
            if outfile:
                outfile.write(bundle)
            nInput += 1
            nOutput += 1
            continue

        umis = bundle.keys()
        counts = {umi: bundle[umi]["count"] for umi in umis}

        nInput += sum(counts.values())

        if nOutput % 10000 == 0:
            U.debug("Outputted %i" % nOutput)

        if nInput % 1000000 == 0:
            U.debug("Read %i input reads" % nInput)

        # set up UMIClusterer functor with methods specific to
        # specified options.method
        processor = network.UMIClusterer(options.method)

        # group the umis
        groups = processor(umis, counts, threshold=options.threshold)

        for umi_group in groups:
            # the first UMI of a group is reported as the group's final UMI
            top_umi = umi_group[0]

            group_count = sum(counts[umi] for umi in umi_group)

            for umi in umi_group:
                reads = bundle[umi]['read']
                for read in reads:
                    if outfile:
                        # Add the 'UG' tag to the read
                        read.tags += [('UG', unique_id)]
                        read.tags += [(options.umi_group_tag, top_umi)]
                        outfile.write(read)

                    if options.tsv:
                        if options.per_gene:
                            gene = read.get_tag(gene_tag)
                        else:
                            gene = "NA"
                        mapping_outfile.write("%s\n" % "\t".join(
                            map(str,
                                (read.query_name, read.reference_name,
                                 umi_methods.get_read_position(
                                     read, options.soft)[1],
                                 gene, umi.decode(), counts[umi],
                                 top_umi.decode(), group_count, unique_id))))

                    nOutput += 1

            unique_id += 1

    if outfile:
        outfile.close()

    if options.tsv:
        mapping_outfile.close()

    infile.close()

    # write footer and output benchmark information.
    event_counts = read_events.most_common() if read_events is not None else []
    U.info(
        "Reads: %s" %
        ", ".join(["%s: %s" % (x[0], x[1])
                   for x in event_counts]))
    U.info("Number of reads out: %i, Number of groups: %i" %
           (nOutput, unique_id))
    U.Stop()

Example #38

Show file

File: whitelist_methods.py Project: AbateLab/DAb-seq

def getKneeEstimateDistance(cell_barcode_counts,
                            cell_number=False,
                            plotfile_prefix=None):
    ''' estimate the number of "true" cell barcodes via a knee method
    which finds the point with maximum distance

    Barcodes are ranked by decreasing count and the cumulative count
    curve is built. The knee is the point on that curve furthest from
    the straight line joining its first and last points. The search is
    repeated on the first (3 * knee index) points until the knee index
    stops changing (or 100 iterations have been performed).

    input:
         cell_barcode_counts = collections.Counter-like mapping
                               (key = barcode, value = count); must
                               provide most_common() and items()
         cell_number (optional) = define number of cell barcodes to accept
         plotfile_prefix = (optional) prefix for plots

    returns:
         If cell_number is not set: list of the barcodes ranked up to
         and including the knee.
         If cell_number is set: set of the barcodes whose count is
         strictly greater than the count of the barcode at (0-based)
         rank cell_number; if cell_number is greater than or equal to
         the number of observed barcodes, the set of all barcodes.

    raises:
         ValueError if the knee is found at the very first barcode

    side effects (only if plotfile_prefix is set):
         writes <prefix>_cell_barcode_knee.png and
         <prefix>_cell_barcode_counts.png and, when cell_number is not
         set, <prefix>_cell_thresholds.tsv holding the knee index
    '''
    def getKneeDistance(values):
        '''
        This function is based on
        https://stackoverflow.com/questions/2018178/finding-the-best-trade-off-point-on-a-curve

        and https://dataplatform.cloud.ibm.com/analytics/notebooks/54d79c2a-f155-40ec-93ec-ed05b58afa39/view?access_token=6d8ec910cf2a1b3901c721fcb94638563cd646fe14400fecbb76cea6aaae2fb1

        The idea is to draw a line from the first to last point on the
        cumulative counts curve and then find the point on the curve
        which is the maximum distance away from this line

        Returns a tuple (distances of every point to the line, index of
        the point with the largest distance).
        '''

        # get coordinates of all the points: column 0 is the rank,
        # column 1 is the cumulative count
        nPoints = len(values)
        allCoord = np.vstack((range(nPoints), values)).T

        # get the first point
        firstPoint = allCoord[0]
        # get vector between first and last point - this is the line
        lineVec = allCoord[-1] - firstPoint
        lineVecNorm = lineVec / np.sqrt(np.sum(lineVec**2))

        # find the distance from each point to the line:
        # vector between all points and first point
        vecFromFirst = allCoord - firstPoint

        # To calculate the distance to the line, we split vecFromFirst into
        # two components, one parallel to the line and one perpendicular.
        # The parallel part is the projection of vecFromFirst onto the unit
        # vector of the line; the perpendicular part is what remains, and
        # its norm is the distance to the line.
        # Broadcasting multiplies every row by lineVecNorm, which is what
        # the deprecated numpy.matlib.repmat call used to spell out.
        scalarProduct = np.sum(vecFromFirst * lineVecNorm, axis=1)
        vecFromFirstParallel = np.outer(scalarProduct, lineVecNorm)
        vecToLine = vecFromFirst - vecFromFirstParallel

        # distance to line is the norm of vecToLine
        distToLine = np.sqrt(np.sum(vecToLine**2, axis=1))

        # knee/elbow is the point with max distance value
        idxOfBestPoint = np.argmax(distToLine)

        return (distToLine, idxOfBestPoint)

    # rank the barcodes once; both the counts and the final barcode
    # selection are derived from this single ordering
    ranked_barcodes = cell_barcode_counts.most_common()
    counts = [count for _, count in ranked_barcodes]
    values = list(np.cumsum(counts))

    # We need to perform the distance knee iteratively with reduced
    # number of CBs since it's sensitive to the number of CBs input
    # and overestimates if too many CBs are used
    previous_idxOfBestPoint = 0
    distToLine, idxOfBestPoint = getKneeDistance(values)
    if idxOfBestPoint == 0:
        raise ValueError("Something's gone wrong here!!")

    max_iterations = 100
    iterations = 0
    while idxOfBestPoint - previous_idxOfBestPoint != 0:
        previous_idxOfBestPoint = idxOfBestPoint
        iterations += 1
        if iterations > max_iterations:
            break
        distToLine, idxOfBestPoint = getKneeDistance(
            values[:idxOfBestPoint * 3])

    knee_final_barcodes = [
        barcode for barcode, _ in ranked_barcodes[:idxOfBestPoint + 1]
    ]

    if cell_number:
        if cell_number < len(counts):
            threshold = counts[cell_number]
            final_barcodes = set(
                x for x, y in cell_barcode_counts.items() if y > threshold)
        else:
            # more cells requested than barcodes observed: there is no
            # barcode at that rank to take a threshold from, so accept
            # every barcode instead of failing with an IndexError
            final_barcodes = set(cell_barcode_counts)
    else:
        final_barcodes = knee_final_barcodes

    if plotfile_prefix:

        # colour-blind friendly colours - https://gist.github.com/thriveth/8560036
        CB_color_cycle = [
            '#377eb8', '#ff7f00', '#4daf4a', '#f781bf', '#a65628', '#984ea3',
            '#999999', '#e41a1c', '#dede00'
        ]

        # proxy artists used only to build the legend of the second plot
        user_line = mlines.Line2D([], [],
                                  color=CB_color_cycle[2],
                                  ls="dashed",
                                  markersize=15,
                                  label='User-defined')
        selected_line = mlines.Line2D([], [],
                                      color=CB_color_cycle[0],
                                      ls="dashed",
                                      markersize=15,
                                      label='Knee')

        # plot of the original curve and its corresponding distances
        # (distToLine comes from the last knee search, so it may cover
        # fewer points than the full cumulative curve)
        knee_fig = plt.figure(figsize=(12, 6))
        plt.plot(distToLine, label='Distance', color='r')
        plt.plot(values, label='Cumulative', color='b')
        plt.plot([idxOfBestPoint],
                 values[idxOfBestPoint],
                 marker='o',
                 markersize=8,
                 color="red",
                 label='Knee')

        if cell_number:
            plt.axvline(x=cell_number,
                        ls="dashed",
                        color=CB_color_cycle[2],
                        label="User-defined")

        plt.legend()
        plt.savefig("%s_cell_barcode_knee.png" % plotfile_prefix)
        # release the figure so repeated calls do not accumulate open
        # figures in pyplot's global registry
        plt.close(knee_fig)

        # accepted barcodes are coloured, the remainder are black
        colours_selected = [CB_color_cycle[0]] * len(final_barcodes)
        colours_rejected = ["black"] * (len(counts) - len(final_barcodes))
        colours = colours_selected + colours_rejected

        fig = plt.figure()
        fig3 = fig.add_subplot(111)
        fig3.scatter(x=range(1,
                             len(counts) + 1),
                     y=counts,
                     c=colours,
                     s=10,
                     linewidths=0)
        fig3.loglog()
        fig3.set_xlim(0, len(counts) * 1.25)
        fig3.set_xlabel('Barcode index')
        fig3.set_ylabel('Count')
        fig3.axvline(x=len(knee_final_barcodes),
                     ls="dashed",
                     color=CB_color_cycle[0])

        if cell_number:
            fig3.axvline(x=cell_number, ls="dashed", color=CB_color_cycle[2])

            lgd = fig3.legend(bbox_to_anchor=(1.05, 1),
                              loc=2,
                              borderaxespad=0.,
                              handles=[selected_line, user_line],
                              title="User threshold")
        else:
            lgd = fig3.legend(bbox_to_anchor=(1.05, 1),
                              loc=2,
                              borderaxespad=0.,
                              handles=[selected_line],
                              title="Knee threshold")

        fig.savefig("%s_cell_barcode_counts.png" % plotfile_prefix,
                    bbox_extra_artists=(lgd, ),
                    bbox_inches='tight')
        plt.close(fig)

        if not cell_number:
            with U.openFile("%s_cell_thresholds.tsv" % plotfile_prefix,
                            "w") as outf:
                outf.write("count\n")
                outf.write("%s\n" % idxOfBestPoint)

    return (final_barcodes)