def fill(self):
    '''Parse the BAM to obtain the frequency of each UMI.

    Iterates ``self.inbam``, tallying into ``self.umis`` the first element
    returned by ``self.barcode_getter`` for every read that is neither
    unmapped nor read2.  The tallies are then stored as a Counter in
    ``self.umis_counter`` and as relative frequencies in ``self.prob``
    before ``self.refill_random()`` is called.
    '''
    self.frequency2umis = collections.defaultdict(list)

    for alignment in self.inbam:
        # only reads that are mapped and are not read2 are counted
        if alignment.is_unmapped or alignment.is_read2:
            continue
        try:
            self.umis[self.barcode_getter(alignment)[0]] += 1
        except KeyError:
            # a KeyError while looking up / counting the UMI skips the read
            continue

    self.umis_counter = collections.Counter(self.umis)
    tallies = list(self.umis_counter.values())
    grand_total = sum(tallies)

    U.info("total_umis %i" % grand_total)
    U.info("#umis %i" % len(self.umis_counter))

    # relative frequency of each UMI, in the Counter's iteration order
    self.prob = [float(tally) / grand_total for tally in tallies]
    self.refill_random()
def write_mates(self):
    '''Scan the current chromosome for mates of the buffered read1s.

    Every fetched read that is mapped, has a mapped mate and is not read1
    is looked up in ``self.read1s`` by (query name, reference name,
    reference start).  Matches are written to ``self.outfile`` and removed
    from the buffer; when ``self.read2tags`` is set, the stored unique id
    and UMI are added to the read as 'UG' and 'FU' tags first.
    '''
    contig = self.chrom
    if contig is not None:
        U.debug("Dumping %i mates for contig %s" % (len(self.read1s), contig))

    for mate in self.infile.fetch(reference=contig, multiple_iterators=True):
        if mate.is_unmapped or mate.mate_is_unmapped or mate.is_read1:
            continue

        key = (mate.query_name, mate.reference_name, mate.reference_start)
        if key not in self.read1s:
            continue

        if self.read2tags is not None:
            # the stored tags are consumed once the mate has been found
            unique_id, umi = self.read2tags.pop(key)
            mate.tags += [('UG', unique_id)]
            mate.tags += [('FU', umi)]

        self.outfile.write(mate)
        self.read1s.remove(key)

    U.debug("%i mates remaining" % len(self.read1s))
def breadth_first_search_recursive(node, adj_list):
    '''Run recursive_search from *node*, falling back to the iterative
    breadth_first_search if the recursion limit is hit.

    ``recursive_search.component`` is seeded with a set holding only
    *node* before the recursive search starts.
    '''
    # seed the function attribute with the start node
    recursive_search.component = {node}
    try:
        return recursive_search(node, adj_list)
    except RecursionError as error:
        U.info('Recursion Error: %s' % error)

    return breadth_first_search(node, adj_list)
def breadth_first_search_recursive(node, adj_list):
    '''Return the result of recursive_search for *node*; if that raises
    RecursionError, log it and return breadth_first_search's result
    instead.'''
    try:
        # recursive_search.component starts as a set containing only *node*
        recursive_search.component = set([node])
        found = recursive_search(node, adj_list)
    except RecursionError as error:
        U.info('Recursion Error: %s' % error)
        found = breadth_first_search(node, adj_list)

    return found
def ExtractBarcodes(read, match,
                    extract_umi=False,
                    extract_cell=False,
                    discard=False,
                    retain_umi=False):
    '''Extract the cell and umi barcodes using a regex.match object

    inputs:

    - read 1 and read2 = Record objects
    - match = regex.match object
    - extract_umi and extract_cell = switches to determine whether these
                                     barcodes should be extracted
    - discard = is there a region(s) of the sequence which should be
      discarded entirely?
    - retain_umi = Should UMI sequence be retained on the read sequence

    returns:

        - cell_barcode = Cell barcode string
        - cell_barcode_quals = Cell barcode quality scores
        - umi = UMI barcode string.
        - umi_quals = UMI barcode quality scores
        - new_seq = Read1 sequence after extraction
        - new_quals = Read1 qualities after extraction

    The barcode strings are empty where extract_cell or extract_umi are
    false.  The quality strings, new_seq and new_quals are whatever
    extractSeqAndQuals returns for the collected base positions.

    Groups are visited in sorted name order, so barcodes split over
    several groups (e.g. umi_1, umi_2) are concatenated in that order.
    '''
    if not extract_cell and not extract_umi:
        U.error("must set either extract_cell and/or extract_umi to true")

    cell_barcode, umi = "", ""
    cell_bases = set()
    umi_bases = set()
    discard_bases = set()

    groupdict = match.groupdict()
    for k in sorted(groupdict):
        matched_text = groupdict[k]
        if matched_text is None:
            # an optional group that took no part in the match contributes
            # neither sequence nor positions
            continue

        start, end = match.span(k)
        if extract_cell and k.startswith("cell_"):
            cell_barcode += matched_text
            cell_bases.update(range(start, end))
        elif extract_umi and k.startswith("umi_"):
            umi += matched_text
            umi_bases.update(range(start, end))
        elif discard and k.startswith("discard_"):
            discard_bases.update(range(start, end))

    # the fourth value holds the cell barcode qualities; it was previously
    # bound to an unused name so the function always returned "" for them
    new_seq, new_quals, umi_quals, cell_barcode_quals = extractSeqAndQuals(
        read.seq, read.quals, umi_bases, cell_bases, discard_bases, retain_umi)

    return (cell_barcode, cell_barcode_quals,
            umi, umi_quals, new_seq, new_quals)
def getErrorCorrectMapping(cell_barcodes, whitelist, threshold=1):
    '''Map each whitelisted ("true") barcode to the set of observed
    ("false") barcodes that lie within *threshold* of it and of no other
    whitelisted barcode.

    Barcodes already on the whitelist, barcodes with no whitelisted
    neighbour, and barcodes with more than one whitelisted neighbour are
    all left out of the mapping.
    '''

    # Unexpected results with cythonise hamming distance so redefine in
    # python here
    def hamming_distance(first, second):
        '''Number of mismatching positions between the two arguments;
        infinite when they differ in length.'''
        if len(first) != len(second):
            return np.inf
        return sum(a != b for a, b in zip(first, second))

    whitelist = {str(entry) for entry in whitelist}

    U.info('building bktree')
    tree2 = pybktree.BKTree(hamming_distance, whitelist)
    U.info('done building bktree')

    true_to_false = collections.defaultdict(set)
    for cell_barcode in cell_barcodes:
        if cell_barcode in whitelist:
            # already a true barcode, nothing to correct
            continue

        # whitelisted barcodes within the threshold, excluding exact hits
        neighbours = [hit for dist, hit
                      in tree2.find(cell_barcode, threshold) if dist > 0]

        # only a unique neighbour gives an unambiguous assignment
        if len(neighbours) == 1:
            true_to_false[neighbours[0]].add(cell_barcode)

    return true_to_false
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads per-gene UMI counts from options.stdin and writes a
    tab-separated "gene<TAB>count" table to options.stdout, where count
    is the number of UMI groups returned by the clusterer for that gene.
    """
    argv = sys.argv if argv is None else argv

    # command line parser plus the common options (-h/--help, ...)
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])
    (options, args) = U.Start(parser, argv=argv,
                              add_group_dedup_options=False)

    nInput = 0
    nOutput = 0

    # how the umi is extracted from a read identifier
    umi_getter = partial(umi_methods.get_umi_read_string,
                         sep=options.umi_sep)

    options.stdout.write("%s\t%s\n" % ("gene", "count"))

    # UMIClusterer functor specific to options.method
    processor = network.UMIClusterer(options.method)

    gene_counts = umi_methods.get_gene_count_tab(options.stdin,
                                                 umi_getter=umi_getter)
    for gene, counts in gene_counts:
        nInput += sum(counts.values())

        # one output count per group of umis
        grouped = processor(counts.keys(), counts,
                            threshold=options.threshold)
        n_groups = len(grouped)

        options.stdout.write("%s\t%i\n" % (gene, n_groups))
        nOutput += n_groups

    U.info("Number of reads counted: %i" % nOutput)
    U.Stop()
def __init__(self, options):
    '''Create the UMI clusterer for options.method and, when
    options.filter_umi is set, load the UMI whitelist and start an empty
    counter for it; otherwise the whitelist is None.'''
    self.UMIClusterer = UMIClusterer(cluster_method=options.method)

    if not options.filter_umi:
        self.umi_whitelist = None
        return

    # only the first element returned by getUserDefinedBarcodes is kept
    loaded = whitelist_methods.getUserDefinedBarcodes(
        options.umi_whitelist,
        options.umi_whitelist_paired,
        deriveErrorCorrection=False)
    self.umi_whitelist = loaded[0]
    self.umi_whitelist_counts = collections.Counter()
    U.info("Length of UMI whitelist: %i" % len(self.umi_whitelist))
def __call__(self, umis, counts):
    '''Group *umis* and return the groups as a list of lists.

    umis: the UMIs to group.
    counts: a dictionary that maps UMIs to their counts.

    The adjacency list, connected components and final groups are
    computed by self.get_adj_list, self.get_connected_components and
    self.get_groups.  A warning is logged when the UMIs differ in length.
    An empty *umis* yields an empty list.
    '''
    len_umis = [len(x) for x in umis]

    if not len_umis:
        # nothing to group; max()/min() below would raise ValueError
        return []

    shortest, longest = min(len_umis), max(len_umis)
    if longest != shortest:
        U.warn("not all umis are the same length(!):  %d - %d" % (
            shortest, longest))

    adj_list = self.get_adj_list(umis, counts)
    clusters = self.get_connected_components(umis, adj_list, counts)

    return [list(x) for x in self.get_groups(clusters, adj_list, counts)]
def singleBarcodeGenerator(whitelist_tsv):
    '''Yield the first tab-separated field of every line in
    *whitelist_tsv* that does not start with "#".'''
    with U.openFile(whitelist_tsv, "r") as handle:
        for row in handle:
            if not row.startswith('#'):
                yield row.strip().split("\t")[0]
def errorDetectAboveThreshold(cell_barcode_counts,
                              cell_whitelist,
                              true_to_false_map,
                              errors=1,
                              resolution_method="discard"):
    '''Detect whitelisted cell barcodes (CBs) that look like errors of
    another whitelisted CB and discard or correct them.

    The whitelist is sorted by ascending count and each CB is checked,
    via checkError, against the CBs that follow it in that order.  A CB
    with at least one near miss is always removed from the whitelist.
    With resolution_method="correct", a CB with exactly one near miss
    that matches it with substitutions only is also added to the
    true-to-false map under that near miss.

    inputs:
        - cell_barcode_counts = mapping from CB to its count
        - cell_whitelist = iterable of whitelisted CBs
        - true_to_false_map = mapping from true CB to a set of error CBs;
          it is deep-copied, not modified
        - errors = number of errors passed to checkError and allowed as
          substitutions
        - resolution_method = "discard" or "correct"

    returns:
        - cell_whitelist = set of whitelisted CBs minus the discarded ones
        - new_true_to_false_map = the updated copy of true_to_false_map

    raises:
        AssertionError if resolution_method is not "discard" or "correct"
    '''
    assert resolution_method in ["discard", "correct"], (
        "resolution method must be discard or correct")

    error_counter = collections.Counter()
    new_true_to_false_map = copy.deepcopy(true_to_false_map)
    discard_cbs = set()

    cell_whitelist = list(cell_whitelist)
    cell_whitelist.sort(key=lambda x: cell_barcode_counts[x])

    for ix, cb in enumerate(cell_whitelist):
        near_misses = checkError(cb, cell_whitelist[ix + 1:], errors=errors)

        if len(near_misses) == 0:
            continue

        discard_cbs.add(cb)  # Will always discard CB from cell_whitelist

        if resolution_method != "correct":
            continue

        if len(near_misses) > 1:
            # ambiguous: several other CBs could be the source.  This
            # counter was previously never incremented, so the log line
            # below always reported 0.
            error_counter["error_discarded_mt_1"] += 1
        elif regex.match("(%s){s<=%i}" % (cb, errors), near_misses[0]):
            # Only correct substitutions as INDELs will also mess
            # up UMI so simple correction of CB is insufficient
            new_true_to_false_map[near_misses[0]].add(cb)
            error_counter["substitution_corrected"] += 1
        else:
            error_counter["indel_discarded"] += 1

    if resolution_method == "correct":
        U.info("CBs above the knee corrected due to possible substitutions: %i" %
               error_counter["substitution_corrected"])
        U.info("CBs above the knee discarded due to possible INDELs: %i" %
               error_counter["indel_discarded"])
        U.info("CBs above the knee discarded due to possible errors from "
               "multiple other CBs: %i" % error_counter["error_discarded_mt_1"])
    else:
        U.info("CBs above the knee discarded due to possible errors: %i" %
               len(discard_cbs))

    cell_whitelist = set(cell_whitelist).difference(discard_cbs)

    return (cell_whitelist, new_true_to_false_map)
def write_mates(self):
    '''Scan the current chromosome for matches to any of the reads stored
    in the read1s buffer.

    A fetched read is a candidate when it is mapped, its mate is mapped
    and it is not read1.  Candidates whose (query name, reference id,
    reference start) is in ``self.read1s`` are written to ``self.outfile``
    and removed from the buffer.
    '''
    if self.chrom is not None:
        contig_name = self.infile.get_reference_name(self.chrom)
        U.debug("Dumping %i mates for contig %s" % (
            len(self.read1s), contig_name))

    for mate in self.infile.fetch(tid=self.chrom, multiple_iterators=True):
        if mate.is_unmapped or mate.mate_is_unmapped or mate.is_read1:
            continue

        key = (mate.query_name, mate.reference_id, mate.reference_start)
        if key not in self.read1s:
            continue

        self.outfile.write(mate)
        self.read1s.remove(key)

    U.debug("%i mates remaining" % len(self.read1s))
def close(self):
    '''Write mates for the remaining chromosome, search for matches to any
    still-unmatched reads, then close the output file.

    Each entry left in ``self.read1s`` is a (name, chrom, pos) triple; the
    reads starting at that position are fetched and the first one with the
    same query name and position is written out.
    '''
    self.write_mates()
    U.info("Searching for mates for %i unmatched alignments" %
           len(self.read1s))

    found = 0
    for name, chrom, pos in self.read1s:
        # Mates have to be looked up in the input file, as write_mates
        # does: self.outfile is the handle being written to.
        for read in self.infile.fetch(start=pos, end=pos + 1, tid=chrom,
                                      multiple_iterators=True):
            if (read.query_name, read.pos) == (name, pos):
                self.outfile.write(read)
                found += 1
                break

    U.info("%i mates never found" % (len(self.read1s) - found))
    self.outfile.close()
def close(self):
    '''Write mates for the remaining chromosome, search for matches to any
    still-unmatched reads, then close the output file.

    Each entry left in ``self.read1s`` is a (name, chrom, pos) triple; the
    reads starting at that position are fetched and the first one with the
    same query name and position is written out.
    '''
    self.write_mates()
    U.info("Searching for mates for %i unmatched alignments" %
           len(self.read1s))

    found = 0
    for name, chrom, pos in self.read1s:
        # Mates have to be looked up in the input file, as write_mates
        # does: self.outfile is the handle being written to.
        for read in self.infile.fetch(start=pos, end=pos + 1, tid=chrom,
                                      multiple_iterators=True):
            if (read.query_name, read.pos) == (name, pos):
                self.outfile.write(read)
                found += 1
                break

    U.info("%i mates never found" % (len(self.read1s) - found))
    self.outfile.close()
def write_mates(self):
    '''Write out, and drop from the read1s buffer, every read on the
    current chromosome whose (query name, reference id, reference start)
    is stored in that buffer.

    Reads that are unmapped, have an unmapped mate, or are read1 are
    skipped.
    '''
    tid = self.chrom
    if tid is not None:
        U.debug("Dumping %i mates for contig %s" % (
            len(self.read1s), self.infile.get_reference_name(tid)))

    for candidate in self.infile.fetch(tid=tid, multiple_iterators=True):
        skip = (candidate.is_unmapped
                or candidate.mate_is_unmapped
                or candidate.is_read1)
        if skip:
            continue

        key = candidate.query_name, candidate.reference_id, \
            candidate.reference_start
        if key in self.read1s:
            self.outfile.write(candidate)
            self.read1s.remove(key)

    U.debug("%i mates remaining" % len(self.read1s))
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    For every gene yielded by umi_methods.get_gene_count_tab the UMIs are
    grouped with a network.UMIClusterer and the number of groups is
    written to options.stdout next to the gene name.
    """
    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(
        version="%prog version: $Id$",
        usage=globals()["__doc__"])

    # add common options (-h/--help, ...) and parse command line
    options, args = U.Start(
        parser, argv=argv, add_group_dedup_options=False)

    nInput, nOutput = 0, 0

    # set the method with which to extract umis from reads
    umi_getter = partial(
        umi_methods.get_umi_read_string, sep=options.umi_sep)

    write = options.stdout.write
    write("%s\t%s\n" % ("gene", "count"))

    # set up UMIClusterer functor with methods specific to
    # specified options.method
    processor = network.UMIClusterer(options.method)

    for gene, counts in umi_methods.get_gene_count_tab(
            options.stdin, umi_getter=umi_getter):
        umis = counts.keys()
        nInput += sum(counts.values())

        # the per-gene count is the number of umi groups
        gene_count = len(
            processor(umis, counts, threshold=options.threshold))
        write("%s\t%i\n" % (gene, gene_count))
        nOutput += gene_count

    U.info("Number of reads counted: %i" % nOutput)

    U.Stop()
def getCellWhitelist(cell_barcode_counts,
                     knee_method="distance",
                     expect_cells=False,
                     cell_number=False,
                     error_correct_threshold=0,
                     plotfile_prefix=None):
    '''Determine the cell barcode whitelist and, optionally, the mapping
    from whitelisted barcodes to their putative error barcodes.

    knee_method selects getKneeEstimateDistance ("distance") or
    getKneeEstimateDensity ("density"); any other value raises ValueError.
    The error mapping is only computed when the whitelist is non-empty and
    error_correct_threshold is greater than zero; otherwise it is None.

    returns: (cell_whitelist, true_to_false_map)
    '''
    if knee_method not in ("distance", "density"):
        raise ValueError("knee_method must be 'distance' or 'density'")

    if knee_method == "distance":
        cell_whitelist = getKneeEstimateDistance(
            cell_barcode_counts, cell_number, plotfile_prefix)
    else:
        cell_whitelist = getKneeEstimateDensity(
            cell_barcode_counts, expect_cells, cell_number, plotfile_prefix)

    U.info("Finished - whitelist determination")

    wants_correction = cell_whitelist and error_correct_threshold > 0
    if not wants_correction:
        return cell_whitelist, None

    U.info("Starting - finding putative error cell barcodes")
    true_to_false_map = getErrorCorrectMapping(
        cell_barcode_counts.keys(), cell_whitelist,
        error_correct_threshold)
    U.info("Finished - finding putative error cell barcodes")

    return cell_whitelist, true_to_false_map
def fastqIterate(infile):
    '''Iterate over the contents of a fastq file, yielding one Record
    (identifier, sequence, qualities) per four-line entry.

    *infile* may return either str or bytes from readline(); bytes are
    decoded as UTF-8.  Iteration stops at the first empty read of an
    identifier line.  U.error is called when an identifier line does not
    start with '@', a separator line does not start with '+', or the
    quality line is missing.
    '''

    def convert2string(b):
        if isinstance(b, str):
            return b
        return b.decode("utf-8")

    def strip_eol(line):
        # Remove only the line terminator.  Slicing off the last character
        # would drop a real base/quality when the file does not end with a
        # newline, and would leave the "\r" of a CRLF terminator behind.
        return line.rstrip("\r\n")

    while True:
        line1 = convert2string(infile.readline())
        if not line1:
            break
        if not line1.startswith('@'):
            U.error("parsing error: expected '@' in line %s" % line1)
        line2 = convert2string(infile.readline())
        line3 = convert2string(infile.readline())
        if not line3.startswith('+'):
            U.error("parsing error: expected '+' in line %s" % line3)
        line4 = convert2string(infile.readline())
        # incomplete entry
        if not line4:
            U.error("incomplete entry for %s" % line1)

        yield Record(strip_eol(line1[1:]), strip_eol(line2), strip_eol(line4))
def pairedBarcodeGenerator(whitelist_tsv, whitelist_tsv2):
    '''Yield the concatenation of every barcode in the first whitelist
    with every barcode in the second.

    A barcode is the first tab-separated field of a line; lines starting
    with "#" are skipped.  Both files are read completely (first file
    first) before the first combination is produced.
    '''

    def first_column(path):
        # barcodes of one whitelist file, in file order
        with U.openFile(path, "r") as handle:
            return [row.strip().split("\t")[0]
                    for row in handle if not row.startswith('#')]

    whitelist1 = first_column(whitelist_tsv)
    whitelist2 = first_column(whitelist_tsv2)

    for left, right in itertools.product(whitelist1, whitelist2):
        yield left + right
def close(self):
    '''Write mates for the remaining chromosome, then make one pass over
    the whole input file for mates of any reads still in the read1s
    buffer, and close the output file.

    Reads that are unmapped, have an unmapped mate, or are read1 are
    skipped; the others are matched to the buffer by (query name,
    reference name, reference start).
    '''
    self.write_mates()
    U.info("Searching for mates for %i unmatched alignments" %
           len(self.read1s))

    for aln in self.infile.fetch(until_eof=True, multiple_iterators=True):
        if aln.is_unmapped or aln.mate_is_unmapped or aln.is_read1:
            continue

        key = (aln.query_name, aln.reference_name, aln.reference_start)
        if key not in self.read1s:
            continue

        self.outfile.write(aln)
        self.read1s.remove(key)

    # whatever is still buffered has no mate in the input
    U.info("%i mates never found" % len(self.read1s))
    self.outfile.close()
def getMetaContig2contig(gene_transcript_map):
    '''Read a two-column, tab-separated gene/transcript table and return
    a defaultdict mapping each gene to the set of its transcripts.

    Lines starting with "#" are skipped and reading stops at the first
    blank line.  A line that does not split into exactly two fields
    raises ValueError.
    '''
    metacontig2contig = collections.defaultdict(set)

    # the context manager closes the handle even when a line is malformed
    with U.openFile(gene_transcript_map, "r") as inf:
        for line in inf:
            if line.startswith("#"):
                continue

            fields = line.strip()
            if len(fields) == 0:
                # the first blank line ends the table
                break

            gene, transcript = fields.split("\t")
            metacontig2contig[gene].add(transcript)

    return metacontig2contig
def getUserDefinedBarcodes(whitelist_tsv, getErrorCorrection=False):
    '''Load a user-supplied whitelist.

    The first tab-separated field of each line not starting with "#" is a
    whitelisted barcode.  With getErrorCorrection, the second field is
    read as a comma-separated list of error barcodes that map to it.

    returns:
        - set of whitelisted barcodes
        - dict mapping error barcode -> whitelisted barcode, or None when
          getErrorCorrection is false
    '''
    cell_whitelist = set()
    false_to_true_map = {} if getErrorCorrection else None

    with U.openFile(whitelist_tsv, "r") as inf:
        for line in inf:
            if line.startswith('#'):
                continue

            fields = line.strip().split("\t")
            whitelist_barcode = fields[0]
            cell_whitelist.add(whitelist_barcode)

            # A barcode without error barcodes has an empty or missing
            # second column; that must not raise IndexError nor register
            # the empty string as an error barcode.
            if getErrorCorrection and len(fields) > 1:
                for error_barcode in fields[1].split(","):
                    if error_barcode:
                        false_to_true_map[error_barcode] = whitelist_barcode

    return cell_whitelist, false_to_true_map
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads fastq records from options.stdin (and from --read2-in for
    paired input), extracts the barcodes described by --bc-pattern /
    --bc-pattern2 with umi_methods.ExtractFilterAndUpdate and writes the
    processed reads to options.stdout (and --read2-out).  Option
    validation failures are reported through U.error.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-p", "--bc-pattern", dest="pattern", type="string",
                      help="Barcode pattern")
    parser.add_option("--bc-pattern2", dest="pattern2", type="string",
                      help="Barcode pattern for paired reads")
    parser.add_option("--3prime", dest="prime3", action="store_true",
                      help="barcode is on 3' end of read.")
    parser.add_option("--read2-in", dest="read2_in", type="string",
                      help="file name for read pairs")
    parser.add_option("--read2-out", dest="read2_out", type="string",
                      help="file to output processed paired read to")
    parser.add_option("--read2-stdout", dest="read2_stdout",
                      action="store_true",
                      help="Paired reads, send read2 to stdout, discarding read1")
    parser.add_option("--quality-filter-threshold",
                      dest="quality_filter_threshold", type="int",
                      help=("Remove reads where any UMI base quality score "
                            "falls below this threshold"))
    parser.add_option("--quality-filter-mask",
                      dest="quality_filter_mask", type="int",
                      help=("If a UMI base has a quality below this threshold, "
                            "replace the base with 'N'"))
    parser.add_option("--quality-encoding",
                      dest="quality_encoding", type="choice",
                      choices=["phred33", "phred64", "solexa"],
                      help=("Quality score encoding. Choose from 'phred33'"
                            "[33-77] 'phred64' [64-106] or 'solexa' [59-106]"))
    parser.add_option("--extract-method",
                      dest="extract_method", type="choice",
                      choices=["string", "regex"],
                      help=("How to extract the umi +/- cell barcodes, Choose "
                            "from 'string' or 'regex'"))
    parser.add_option("--filter-cell-barcode",
                      dest="filter_cell_barcode",
                      action="store_true",
                      help="Filter the cell barcodes")
    parser.add_option("--error-correct-cell",
                      dest="error_correct_cell",
                      action="store_true",
                      help=("Correct errors in the cell barcode"))
    parser.add_option("--whitelist",
                      dest="whitelist", type="string",
                      help=("A whitelist of accepted cell barcodes"))
    parser.add_option("--blacklist",
                      dest="blacklist", type="string",
                      help=("A blacklist of accepted cell barcodes"))
    parser.add_option("--reads-subset",
                      dest="reads_subset", type="int",
                      help=("Only extract from the first N reads. If N is "
                            "greater than the number of reads, all reads will "
                            "be used"))
    parser.add_option("--reconcile-pairs",
                      dest="reconcile", action="store_true",
                      help=("Allow the presences of reads in read2 input that are"
                            "not present in read1 input. This allows cell barcode"
                            "filtering of read1s without considering read2s"))

    # The default has to use the same dest as --filter-cell-barcode; the
    # former "filter_cell_barcodes" key never matched the option.
    parser.set_defaults(extract_method="string",
                        filter_cell_barcode=False,
                        whitelist=None,
                        blacklist=None,
                        error_correct_cell=False,
                        pattern=None,
                        pattern2=None,
                        read2_in=None,
                        read2_out=False,
                        read2_stdout=False,
                        quality_filter_threshold=None,
                        quality_filter_mask=None,
                        quality_encoding=None,
                        reconcile=False)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = U.Start(parser, argv=argv,
                              add_group_dedup_options=False,
                              add_sam_options=False)

    if options.quality_filter_threshold or options.quality_filter_mask:
        if not options.quality_encoding:
            U.error("must provide a quality encoding (--quality-"
                    "encoding) to filter UMIs by quality (--quality"
                    "-filter-threshold) or mask low quality bases "
                    "with (--quality-filter-mask)")

    if not options.pattern and not options.pattern2:
        if not options.read2_in:
            U.error("Must supply --bc-pattern for single-end")
        else:
            U.error("Must supply --bc-pattern and/or --bc-pattern2 "
                    "if paired-end ")

    if options.pattern2:
        if not options.read2_in:
            U.error("must specify a paired fastq ``--read2-in``")

        # NOTE(review): reconstructed as nested under the pattern2 check,
        # where it can never fire - confirm the intended nesting.
        if not options.pattern2:
            options.pattern2 = options.pattern

    extract_cell = False
    extract_umi = False

    # If the pattern is a regex we can compile the regex(es) prior to
    # ExtractFilterAndUpdate instantiation
    if options.extract_method == "regex":
        if options.pattern:
            try:
                options.pattern = regex.compile(options.pattern)
            except regex.error:
                U.error("barcode_regex '%s' is not a "
                        "valid regex" % options.pattern)

        if options.pattern2:
            try:
                options.pattern2 = regex.compile(options.pattern2)
            except regex.error:
                U.error("barcode_regex2 '%s' is not a "
                        "valid regex" % options.pattern2)

    # check whether the regex contains a umi group(s) and cell groups(s)
    if options.extract_method == "regex":
        for compiled in (options.pattern, options.pattern2):
            if not compiled:
                continue
            for group in compiled.groupindex:
                if group.startswith("cell_"):
                    extract_cell = True
                elif group.startswith("umi_"):
                    extract_umi = True

    # check whether the pattern string contains umi/cell bases
    elif options.extract_method == "string":
        for pattern_string in (options.pattern, options.pattern2):
            if not pattern_string:
                continue
            if "C" in pattern_string:
                extract_cell = True
            if "N" in pattern_string:
                extract_umi = True

    if not extract_umi:
        if options.extract_method == "string":
            U.error("barcode pattern(s) do not include any umi bases "
                    "(marked with 'Ns') %s, %s" % (
                        options.pattern, options.pattern2))
        elif options.extract_method == "regex":
            # the "%" operator was missing here, which made this line
            # raise TypeError instead of reporting the problem
            U.error("barcode regex(es) do not include any umi groups "
                    "(starting with 'umi_') %s, %s" % (
                        options.pattern, options.pattern2))

    if options.filter_cell_barcode:

        if not options.whitelist:
            U.error("must provide a whitelist (--whitelist) if using "
                    "--filter-cell-barcode option")

        if not extract_cell:
            if options.extract_method == "string":
                U.error("barcode pattern(s) do not include any cell bases "
                        "(marked with 'Cs') %s, %s" % (
                            options.pattern, options.pattern2))
            elif options.extract_method == "regex":
                U.error("barcode regex(es) do not include any cell groups "
                        "(starting with 'cell_') %s, %s" % (
                            options.pattern, options.pattern2))

    read1s = umi_methods.fastqIterate(options.stdin)

    # set up read extractor
    ReadExtractor = umi_methods.ExtractFilterAndUpdate(
        options.extract_method,
        options.pattern,
        options.pattern2,
        options.prime3,
        extract_cell,
        options.quality_encoding,
        options.quality_filter_threshold,
        options.quality_filter_mask,
        options.filter_cell_barcode)

    if options.filter_cell_barcode:
        cell_whitelist, false_to_true_map = umi_methods.getUserDefinedBarcodes(
            options.whitelist, options.error_correct_cell)

        ReadExtractor.cell_whitelist = cell_whitelist
        ReadExtractor.false_to_true_map = false_to_true_map

    if options.blacklist:
        blacklist = set()
        with U.openFile(options.blacklist, "r") as inf:
            for line in inf:
                blacklist.add(line.strip().split("\t")[0])
        ReadExtractor.cell_blacklist = blacklist

    # variables for progress monitor
    progCount = 0
    displayMax = 100000
    U.info("Starting barcode extraction")

    if options.read2_in is None:
        for read in read1s:

            # incrementing count for monitoring progress
            progCount += 1

            # Update display in every 100kth iteration
            if progCount % displayMax == 0:
                U.info("Parsed {} reads".format(progCount))

            new_read = ReadExtractor(read)

            if options.reads_subset:
                if (ReadExtractor.read_counts['Input Reads'] >
                        options.reads_subset):
                    break

            if not new_read:
                continue

            options.stdout.write(str(new_read) + "\n")

    else:
        read2s = umi_methods.fastqIterate(U.openFile(options.read2_in))

        if options.read2_out:
            read2_out = U.openFile(options.read2_out, "w")

        # --reconcile-pairs relaxes the pairing of the two inputs
        strict = not options.reconcile

        for read1, read2 in umi_methods.joinedFastqIterate(
                read1s, read2s, strict):

            # incrementing count for monitoring progress
            progCount += 1

            # Update display in every 100kth iteration
            if progCount % displayMax == 0:
                U.info("Parsed {} reads".format(progCount))
                sys.stdout.flush()

            reads = ReadExtractor(read1, read2)

            if options.reads_subset:
                if (ReadExtractor.read_counts['Input Reads'] >
                        options.reads_subset):
                    break

            if not reads:
                continue

            new_read1, new_read2 = reads

            if options.read2_stdout:
                options.stdout.write(str(new_read2) + "\n")
            else:
                options.stdout.write(str(new_read1) + "\n")

                if options.read2_out:
                    read2_out.write(str(new_read2) + "\n")

        if options.read2_out:
            read2_out.close()

    for k, v in ReadExtractor.getReadCounts().most_common():
        U.info("%s: %s" % (k, v))

    U.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads fastq records from options.stdin (and --read2-in), counts reads
    or unique UMIs per cell barcode, derives the cell barcode whitelist
    with umi_methods.getCellWhitelist and writes one tab-separated line
    per whitelisted barcode to options.stdout: barcode, error barcodes,
    barcode count, error barcode counts.

    Raises ValueError when --set-cell-number exceeds the number of
    observed cell barcodes; other option problems go through U.error.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-p", "--bc-pattern", dest="pattern", type="string",
                      help="Barcode pattern")
    parser.add_option("--bc-pattern2", dest="pattern2", type="string",
                      help="Barcode pattern for paired reads")
    parser.add_option("--3prime", dest="prime3", action="store_true",
                      help="barcode is on 3' end of read.")
    parser.add_option("--read2-in", dest="read2_in", type="string",
                      help="file name for read pairs")
    parser.add_option("--extract-method",
                      dest="extract_method", type="choice",
                      choices=["string", "regex"],
                      help=("How to extract the umi +/- cell barcodes, Choose "
                            "from 'string' or 'regex'"))
    parser.add_option("--plot-prefix",
                      dest="plot_prefix", type="string",
                      help=("Prefix for plots to visualise the automated "
                            "detection of the number of 'true' cell barcodes"))
    parser.add_option("--subset-reads",
                      dest="subset_reads", type="int",
                      help=("Use the first N reads to automatically identify "
                            "the true cell barcodes. If N is greater than the "
                            "number of reads, all reads will be used"))
    parser.add_option("--error-correct-threshold",
                      dest="error_correct_threshold",
                      type="int",
                      help=("Hamming distance for correction of "
                            "barcodes to whitelist barcodes"))
    parser.add_option("--method",
                      dest="method",
                      choices=["reads", "umis"],
                      help=("Use reads or unique umi counts per cell"))
    parser.add_option("--expect-cells",
                      dest="expect_cells",
                      type="int",
                      help=("Prior expectation on the upper limit on the "
                            "number of cells sequenced"))
    parser.add_option("--set-cell-number",
                      dest="cell_number",
                      type="int",
                      help=("Specify the number of cell barcodes to accept"))

    parser.set_defaults(method="reads",
                        extract_method="string",
                        filter_cell_barcodes=False,
                        whitelist_tsv=None,
                        blacklist_tsv=None,
                        error_correct_threshold=1,
                        pattern=None,
                        pattern2=None,
                        read2_in=None,
                        plot_prefix=None,
                        subset_reads=100000000,
                        expect_cells=False,
                        cell_number=False)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = U.Start(parser, argv=argv,
                              add_group_dedup_options=False,
                              add_sam_options=False)

    if options.expect_cells and options.cell_number:
        U.error("Cannot supply both --expect-cells and "
                "--cell-number options")

    if not options.pattern and not options.pattern2:
        if not options.read2_in:
            U.error("Must supply --bc-pattern for single-end")
        else:
            U.error("Must supply --bc-pattern and/or --bc-pattern2 "
                    "if paired-end ")

    if options.pattern2:
        if not options.read2_in:
            U.error("must specify a paired fastq ``--read2-in``")

        # NOTE(review): reconstructed as nested under the pattern2 check,
        # where it can never fire - confirm the intended nesting.
        if not options.pattern2:
            options.pattern2 = options.pattern

    extract_cell = False
    extract_umi = False

    # If the pattern is a regex we can compile the regex(es) prior to
    # ExtractFilterAndUpdate instantiation
    if options.extract_method == "regex":
        if options.pattern:
            try:
                options.pattern = regex.compile(options.pattern)
            except regex.error:
                U.error("barcode_regex '%s' is not a "
                        "valid regex" % options.pattern)

        if options.pattern2:
            # the second pattern is stored under options.pattern2; the
            # parser defines no "barcode_regex2" option
            try:
                options.pattern2 = regex.compile(options.pattern2)
            except regex.error:
                U.error("barcode_regex2 '%s' is not a "
                        "valid regex" % options.pattern2)

    # check whether the regex contains a umi group(s) and cell groups(s)
    if options.extract_method == "regex":
        for compiled in (options.pattern, options.pattern2):
            if not compiled:
                continue
            for group in compiled.groupindex:
                if group.startswith("cell_"):
                    extract_cell = True
                elif group.startswith("umi_"):
                    extract_umi = True

    # check whether the pattern string contains umi/cell bases
    elif options.extract_method == "string":
        for pattern_string in (options.pattern, options.pattern2):
            if not pattern_string:
                continue
            if "C" in pattern_string:
                extract_cell = True
            if "N" in pattern_string:
                extract_umi = True

    if not extract_umi:
        if options.extract_method == "string":
            U.error("barcode pattern(s) do not include any umi bases "
                    "(marked with 'Ns') %s, %s" % (
                        options.pattern, options.pattern2))
        elif options.extract_method == "regex":
            # the "%" operator was missing here, which made this line
            # raise TypeError instead of reporting the problem
            U.error("barcode regex(es) do not include any umi groups "
                    "(starting with 'umi_') %s, %s" % (
                        options.pattern, options.pattern2))

    if not extract_cell:
        if options.extract_method == "string":
            U.error("barcode pattern(s) do not include any cell bases "
                    "(marked with 'Cs') %s, %s" % (
                        options.pattern, options.pattern2))
        elif options.extract_method == "regex":
            U.error("barcode regex(es) do not include any cell groups "
                    "(starting with 'cell_') %s, %s" % (
                        options.pattern, options.pattern2))

    read1s = umi_methods.fastqIterate(options.stdin)

    # set up read extractor
    ReadExtractor = umi_methods.ExtractFilterAndUpdate(
        method=options.extract_method,
        pattern=options.pattern,
        pattern2=options.pattern2,
        prime3=options.prime3,
        extract_cell=extract_cell)

    cell_barcode_counts = collections.Counter()

    n_reads = 0
    n_cell_barcodes = 0

    # if using the umis method, need to keep a set of umis observed
    if options.method == "umis":
        cell_barcode_umis = collections.defaultdict(set)

    # variables for progress monitor
    displayMax = 100000
    U.info("Starting barcode extraction")

    if not options.read2_in:
        for read1 in read1s:

            # Update display in every 100kth iteration
            if n_reads % displayMax == 0:
                U.info("Parsed {} reads".format(n_reads))

            n_reads += 1
            barcode_values = ReadExtractor.getBarcodes(read1)
            if barcode_values is None:
                continue
            else:
                cell, umi, _, _, _, _, _ = barcode_values
                if options.method == "umis":
                    cell_barcode_umis[cell].add(umi)
                else:
                    cell_barcode_counts[cell] += 1
                n_cell_barcodes += 1

            # NOTE(review): this branch limits on matched reads while the
            # paired branch below limits on parsed reads - confirm which
            # is intended.
            if options.subset_reads:
                if n_cell_barcodes > options.subset_reads:
                    break
    else:
        read2s = umi_methods.fastqIterate(U.openFile(options.read2_in))
        for read1, read2 in izip(read1s, read2s):

            # Update display in every 100kth iteration
            if n_reads % displayMax == 0:
                U.info("Parsed {} reads".format(n_reads))

            n_reads += 1

            barcode_values = ReadExtractor.getBarcodes(read1, read2)
            if barcode_values is None:
                continue
            else:
                cell, umi, _, _, _, _, _ = barcode_values
                if options.method == "umis":
                    cell_barcode_umis[cell].add(umi)
                else:
                    cell_barcode_counts[cell] += 1
                n_cell_barcodes += 1

            if options.subset_reads:
                if n_reads > options.subset_reads:
                    break

    U.info("Starting - whitelist determination")

    if options.method == "umis":
        for cell in cell_barcode_umis:
            cell_barcode_counts[cell] = len(cell_barcode_umis[cell])

    if options.cell_number and options.cell_number > len(cell_barcode_counts):
        raise ValueError(
            "--set-cell-number option specifies more cell barcodes than the "
            "number of observed cell barcodes. This may be because "
            "--subset-reads was set to a value too low to capture reads from "
            "all cells. %s cell barcodes observed from %s parsed reads. "
            "Expected>= %s cell barcodes" % (
                len(cell_barcode_counts),
                options.subset_reads,
                options.cell_number))

    # Keyword arguments keep this call valid whether or not
    # getCellWhitelist takes a knee_method parameter ahead of expect_cells.
    cell_whitelist, true_to_false_map = umi_methods.getCellWhitelist(
        cell_barcode_counts,
        expect_cells=options.expect_cells,
        cell_number=options.cell_number,
        error_correct_threshold=options.error_correct_threshold,
        plotfile_prefix=options.plot_prefix)

    U.info("Writing out whitelist")
    for barcode in sorted(cell_whitelist):

        if true_to_false_map:
            error_barcodes = sorted(true_to_false_map[barcode])
            corrected_barcodes = ",".join(error_barcodes)
            corrected_barcode_counts = ",".join(
                str(cell_barcode_counts[x]) for x in error_barcodes)
        else:
            corrected_barcodes, corrected_barcode_counts = "", ""

        options.stdout.write("%s\t%s\t%s\t%s\n" % (
            barcode, corrected_barcodes, cell_barcode_counts[barcode],
            corrected_barcode_counts))

    U.info("Parsed %i reads" % n_reads)
    U.info("%i reads matched the barcode pattern" % n_cell_barcodes)
    U.info("Found %i unique cell barcodes" % len(cell_barcode_counts))

    U.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Iterates over the bundles produced by ``umi_methods.get_bundles``,
    clusters the UMIs of each bundle with ``network.UMIClusterer`` and,
    depending on the options, writes the tagged reads to a SAM/BAM file
    (``--output-bam``) and/or a tab-separated read-to-group table
    (``--group-out``).
    """
    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    group_options = U.OptionGroup(parser, "group-specific options")

    group_options.add_option(
        "--group-out", dest="tsv", type="string",
        help="Outfile name for file mapping read id to read group",
        default=None)

    group_options.add_option(
        "--output-bam", dest="output_bam", action="store_true",
        default=False,
        help=("output a bam file with read groups tagged using the UG tag"
              "[default=%default]"))

    group_options.add_option(
        "--output-unmapped", dest="output_unmapped", action="store_true",
        default=False,
        help=("Retain all unmapped reads in output[default=%default]"))

    parser.add_option(
        "--umi-group-tag", dest="umi_group_tag", type="string",
        help="tag for the outputted umi group", default='BX')

    parser.add_option_group(group_options)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = U.Start(parser, argv=argv)

    U.validateSamOptions(options)

    # the input has to be a named file
    if options.stdin == sys.stdin:
        raise ValueError("Input on standard in not currently supported")
    in_name = options.stdin.name
    options.stdin.close()

    # Work out where reads are written.  When the output is to be sorted,
    # reads first go to a temporary file which is sorted into the final
    # destination at the end.
    writes_to_file = options.stdout != sys.stdout
    final_name = options.stdout.name if writes_to_file else "-"

    if options.no_sort_output:
        out_name = final_name
    else:
        out_name = U.getTempFilename()
        sorted_out_name = final_name
        # need to determine the output format for sort
        sort_format = "sam" if options.out_sam else "bam"

    if writes_to_file:
        options.stdout.close()
        assert options.output_bam, (
            "To output a bam you must include --output-bam option")

    in_mode = "r" if options.in_sam else "rb"
    out_mode = "wh" if options.out_sam else "wb"

    infile = pysam.Samfile(in_name, in_mode)

    outfile = None
    if options.output_bam:
        outfile = pysam.Samfile(out_name, out_mode, template=infile)

    if options.tsv:
        mapping_outfile = U.openFile(options.tsv, "w")
        header = ["read_id", "contig", "position", "gene", "umi",
                  "umi_count", "final_umi", "final_umi_count", "unique_id"]
        mapping_outfile.write("%s\n" % "\t".join(header))

    nInput = nOutput = unique_id = input_reads = output_reads = 0

    gene_tag = options.gene_tag
    metacontig2contig = None

    # select the stream of input reads
    if options.chrom:
        inreads = infile.fetch(reference=options.chrom)
    elif options.per_gene and options.gene_transcript_map:
        metacontig2contig = umi_methods.getMetaContig2contig(
            infile, options.gene_transcript_map)
        metatag = "MC"
        inreads = umi_methods.metafetcher(infile, metacontig2contig, metatag)
        gene_tag = metatag
    else:
        inreads = infile.fetch(until_eof=options.output_unmapped)

    bundle_iterator = umi_methods.get_bundles(
        options,
        all_reads=True,
        return_read2=True,
        return_unmapped=options.output_unmapped,
        metacontig_contig=metacontig2contig)

    for bundle, _key, status in bundle_iterator(inreads):

        # write out read2s and unmapped (if these options are set)
        if status == 'single_read':
            # bundle is just a single read here
            nInput += 1

            if outfile:
                outfile.write(bundle)

            nOutput += 1
            continue

        umis = bundle.keys()
        counts = {umi: bundle[umi]["count"] for umi in umis}

        nInput += sum(counts.values())

        # progress reports, one message per threshold crossed
        while nOutput >= output_reads + 10000:
            output_reads += 10000
            U.info("Written out %i reads" % output_reads)

        while nInput >= input_reads + 1000000:
            input_reads += 1000000
            U.info("Parsed %i input reads" % input_reads)

        # set up UMIClusterer functor with methods specific to
        # specified options.method
        processor = network.UMIClusterer(options.method)

        # group the umis
        clusters = processor(umis, counts, threshold=options.threshold)

        for cluster in clusters:
            # the first UMI of a cluster labels the whole group
            top_umi = cluster[0]
            group_count = sum(counts[member] for member in cluster)

            for umi in cluster:
                for read in bundle[umi]['read']:
                    if outfile:
                        # Add the 'UG' tag to the read
                        read.tags += [('UG', unique_id)]
                        read.tags += [(options.umi_group_tag, top_umi)]
                        outfile.write(read)

                    if options.tsv:
                        gene = (read.get_tag(gene_tag)
                                if options.per_gene else "NA")
                        position = umi_methods.get_read_position(
                            read, options.soft_clip_threshold)[1]
                        row = (read.query_name, read.reference_name,
                               position, gene, umi.decode(), counts[umi],
                               top_umi.decode(), group_count, unique_id)
                        mapping_outfile.write(
                            "%s\n" % "\t".join(map(str, row)))

                    nOutput += 1

            unique_id += 1

    if outfile:
        outfile.close()
        if not options.no_sort_output:
            # sort the output
            pysam.sort("-o", sorted_out_name, "-O", sort_format, out_name)
            os.unlink(out_name)  # delete the tempfile

    if options.tsv:
        mapping_outfile.close()

    # write footer and output benchmark information.
    event_summary = ", ".join(
        "%s: %s" % (event, n_events)
        for event, n_events in bundle_iterator.read_events.most_common())
    U.info("Reads: %s" % event_summary)

    U.info("Number of reads out: %i, Number of groups: %i" %
           (nOutput, unique_id))

    U.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Deduplicates the reads of a SAM/BAM file: reads are bundled by
    ``umi_methods.get_bundles`` and each bundle is reduced either to one
    read per UMI (``--ignore-umi``) or with ``network.ReadDeduplicator``.
    With ``--output-stats`` three tab-separated tables are written next to
    the given prefix (``_per_umi_per_position.tsv``, ``_per_umi.tsv`` and
    ``_edit_distance.tsv``).

    Raises:
        ValueError: if the input is standard in, if incompatible or
            incomplete options are given, if ``--skip-tags-regex`` is not
            a valid regex, or if the requested multimapping tag is not
            available in the input.
    """
    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-i", "--in-sam", dest="in_sam", action="store_true",
                      help="Input file is in sam format [default=%default]",
                      default=False)
    parser.add_option("-o", "--out-sam", dest="out_sam", action="store_true",
                      help="Output alignments in sam format "
                      "[default=%default]",
                      default=False)
    parser.add_option("--ignore-umi", dest="ignore_umi",
                      action="store_true", help="Ignore UMI and dedup"
                      " only on position", default=False)
    parser.add_option("--umi-separator", dest="umi_sep",
                      type="string", help="separator between read id and UMI",
                      default="_")
    parser.add_option("--umi-tag", dest="umi_tag",
                      type="string", help="tag containing umi",
                      default='RX')
    parser.add_option("--extract-umi-method", dest="get_umi_method",
                      type="choice", choices=("read_id", "tag"),
                      default="read_id",
                      help="where is the read UMI encoded? "
                      "[default=%default]")
    parser.add_option("--subset", dest="subset", type="float",
                      help="Use only a fraction of reads, specified by subset",
                      default=None)
    parser.add_option("--spliced-is-unique", dest="spliced",
                      action="store_true",
                      help="Treat a spliced read as different to an unspliced"
                      " one [default=%default]",
                      default=False)
    parser.add_option("--soft-clip-threshold", dest="soft",
                      type="float",
                      help="number of bases clipped from 5' end before"
                      "read is counted as spliced [default=%default]",
                      default=4)
    parser.add_option("--edit-distance-threshold", dest="threshold",
                      type="int",
                      default=1,
                      help="Edit distance theshold at which to join two UMIs"
                      "when clustering. [default=%default]")
    parser.add_option("--chrom", dest="chrom", type="string",
                      help="Restrict to one chromosome",
                      default=None)
    parser.add_option("--paired", dest="paired", action="store_true",
                      default=False,
                      help="paired BAM. [default=%default]")
    parser.add_option("--method", dest="method", type="choice",
                      choices=("adjacency", "directional",
                               "percentile", "unique", "cluster"),
                      default="directional",
                      help="method to use for umi deduping [default=%default]")
    parser.add_option("--output-stats", dest="stats", type="string",
                      default=False,
                      help="Specify location to output stats")
    parser.add_option(
        "--whole-contig", dest="whole_contig", action="store_true",
        default=False,
        help="Read whole contig before outputting bundles: guarantees that "
        "no reads"
        "are missed, but increases memory usage")
    parser.add_option("--multimapping-detection-method",
                      dest="detection_method", type="choice",
                      choices=("NH", "X0", "XT"),
                      default=None,
                      help=("Some aligners identify multimapping using bam "
                            "tags. Setting this option to NH, X0 or XT will "
                            "use these tags when selecting the best read "
                            "amongst reads with the same position and umi "
                            "[default=%default]"))
    parser.add_option("--mapping-quality", dest="mapping_quality",
                      type="int",
                      help="Minimum mapping quality for a read to be retained"
                      " [default=%default]",
                      default=0)
    parser.add_option(
        "--read-length", dest="read_length", action="store_true",
        default=False,
        help=("use read length in addition to position and UMI"
              "to identify possible duplicates [default=%default]"))
    parser.add_option("--per-contig", dest="per_contig", action="store_true",
                      default=False,
                      help=("dedup per contig (field 3 in BAM; RNAME),"
                            " e.g for transcriptome where contig = gene"))
    parser.add_option("--per-gene", dest="per_gene", action="store_true",
                      default=False,
                      help=("Deduplicate per gene,"
                            "e.g for transcriptome where contig = transcript"
                            "must also provide a transript to gene map with"
                            "--gene-transcript-map [default=%default]"))
    parser.add_option("--gene-transcript-map", dest="gene_transcript_map",
                      type="string",
                      help="file mapping transcripts to genes (tab separated)",
                      default=None)
    parser.add_option("--gene-tag", dest="gene_tag",
                      type="string",
                      help=("Deduplicate per gene where gene is"
                            "defined by this bam tag [default=%default]"),
                      default=None)
    parser.add_option(
        "--skip-tags-regex", dest="skip_regex", type="string",
        help=("Used with --gene-tag. "
              "Ignore reads where the gene-tag matches this regex"),
        default="^[__|Unassigned]")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = U.Start(parser, argv=argv)

    if options.random_seed:
        np.random.seed(options.random_seed)

    if options.stdin != sys.stdin:
        in_name = options.stdin.name
        options.stdin.close()
    else:
        raise ValueError("Input on standard in not currently supported")

    if options.stdout != sys.stdout:
        out_name = options.stdout.name
        options.stdout.close()
    else:
        out_name = "-"

    in_mode = "r" if options.in_sam else "rb"
    out_mode = "wh" if options.out_sam else "wb"

    if options.stats and options.ignore_umi:
        raise ValueError("'--output-stats' and '--ignore-umi' options"
                         " cannot be used together")

    if options.per_gene:
        # --per-gene needs either a transcript-to-gene map or a gene tag.
        # (This previously read the non-existent ``options.gene_map``.)
        if not options.gene_transcript_map and not options.gene_tag:
            raise ValueError(
                "--per-gene option requires --gene-transcript-map "
                "or --gene-tag")

    try:
        re.compile(options.skip_regex)
    except re.error:
        raise ValueError("skip-regex '%s' is not a "
                         "valid regex" % options.skip_regex)

    infile = pysam.Samfile(in_name, in_mode)
    outfile = pysam.Samfile(out_name, out_mode, template=infile)

    if options.paired:
        outfile = umi_methods.TwoPassPairWriter(infile, outfile)

    nInput, nOutput = 0, 0

    if options.detection_method:
        bam_features = detect_bam_features(infile.filename)

        if not bam_features[options.detection_method]:
            if sum(bam_features.values()) == 0:
                raise ValueError(
                    "There are no bam tags available to detect multimapping. "
                    "Do not set --multimapping-detection-method")
            else:
                raise ValueError(
                    "The chosen method of detection for multimapping (%s) "
                    "will not work with this bam. Multimapping can be detected"
                    " for this bam using any of the following: %s" % (
                        options.detection_method,
                        ",".join([x for x in bam_features
                                  if bam_features[x]])))

    # set the method with which to extract umis from reads
    if options.get_umi_method == "read_id":
        umi_getter = partial(
            umi_methods.get_umi_read_id, sep=options.umi_sep)
    elif options.get_umi_method == "tag":
        umi_getter = partial(
            umi_methods.get_umi_tag, tag=options.umi_tag)
    else:
        raise ValueError("Unknown umi extraction method")

    if options.stats:
        # set up arrays to hold stats data
        stats_pre_df_dict = {"UMI": [], "counts": []}
        stats_post_df_dict = {"UMI": [], "counts": []}
        pre_cluster_stats = []
        post_cluster_stats = []
        pre_cluster_stats_null = []
        post_cluster_stats_null = []
        # source of randomly drawn UMIs for the null distributions
        read_gn = umi_methods.random_read_generator(
            infile.filename, chrom=options.chrom, umi_getter=umi_getter)

    # Tag used to assign reads to genes.  It is replaced by the
    # meta-contig tag when reads are fetched per gene through a
    # transcript-to-gene map.
    gene_tag = options.gene_tag

    if options.chrom:
        inreads = infile.fetch(reference=options.chrom)
    elif options.per_gene and options.gene_transcript_map:
        metacontig2contig = umi_methods.getMetaContig2contig(
            infile, options.gene_transcript_map)
        metatag = "MC"
        inreads = umi_methods.metafetcher(infile, metacontig2contig, metatag)
        gene_tag = metatag
    else:
        inreads = infile.fetch()

    # Defined up front so that the footer below still works when the
    # iterator yields no bundle at all.
    read_events = collections.Counter()

    # NOTE(review): ``gene_tag`` (not ``options.gene_tag``) is passed on so
    # that the meta-contig tag selected above is actually used; the local
    # was previously computed but never read.
    for bundle, read_events, status in umi_methods.get_bundles(
            inreads,
            ignore_umi=options.ignore_umi,
            subset=options.subset,
            quality_threshold=options.mapping_quality,
            paired=options.paired,
            spliced=options.spliced,
            soft_clip_threshold=options.soft,
            per_contig=options.per_contig,
            gene_tag=gene_tag,
            skip_regex=options.skip_regex,
            whole_contig=options.whole_contig,
            read_length=options.read_length,
            detection_method=options.detection_method,
            umi_getter=umi_getter,
            all_reads=False,
            return_read2=False,
            return_unmapped=False):

        nInput += sum(bundle[umi]["count"] for umi in bundle)

        if nOutput % 10000 == 0:
            U.debug("Outputted %i" % nOutput)

        if nInput % 1000000 == 0:
            U.debug("Read %i input reads" % nInput)

        if options.stats:
            # generate pre-dedup stats: observed average UMI distance and
            # the same measure for an equal number of random UMIs
            average_distance = umi_methods.get_average_umi_distance(
                bundle.keys())
            pre_cluster_stats.append(average_distance)
            cluster_size = len(bundle)
            random_umis = read_gn.getUmis(cluster_size)
            average_distance_null = umi_methods.get_average_umi_distance(
                random_umis)
            pre_cluster_stats_null.append(average_distance_null)

        if options.ignore_umi:
            for umi in bundle:
                nOutput += 1
                outfile.write(bundle[umi]["read"])

        else:
            # set up ReadCluster functor with methods specific to
            # specified options.method
            processor = network.ReadDeduplicator(options.method)

            # dedup using umis and write out deduped bam
            reads, umis, umi_counts = processor(
                bundle=bundle,
                threshold=options.threshold)

            for read in reads:
                outfile.write(read)
                nOutput += 1

            if options.stats:
                # collect pre-dedup stats
                stats_pre_df_dict['UMI'].extend(bundle)
                stats_pre_df_dict['counts'].extend(
                    [bundle[UMI]['count'] for UMI in bundle])

                # collect post-dedup stats
                post_cluster_umis = [umi_getter(x) for x in reads]
                stats_post_df_dict['UMI'].extend(umis)
                stats_post_df_dict['counts'].extend(umi_counts)

                average_distance = umi_methods.get_average_umi_distance(
                    post_cluster_umis)
                post_cluster_stats.append(average_distance)

                cluster_size = len(post_cluster_umis)
                random_umis = read_gn.getUmis(cluster_size)
                average_distance_null = umi_methods.get_average_umi_distance(
                    random_umis)
                post_cluster_stats_null.append(average_distance_null)

    outfile.close()

    if options.stats:

        # generate the stats dataframe
        stats_pre_df = pd.DataFrame(stats_pre_df_dict)
        stats_post_df = pd.DataFrame(stats_post_df_dict)

        # tally the counts per umi per position
        pre_counts = collections.Counter(stats_pre_df["counts"])
        post_counts = collections.Counter(stats_post_df["counts"])
        counts_index = sorted(set(pre_counts).union(post_counts))

        with U.openFile(options.stats + "_per_umi_per_position.tsv",
                        "w") as outf:
            outf.write("counts\tinstances_pre\tinstances_post\n")
            for count in counts_index:
                values = (count, pre_counts[count], post_counts[count])
                outf.write("\t".join(map(str, values)) + "\n")

        # aggregate stats pre/post per UMI
        agg_pre_df = aggregateStatsDF(stats_pre_df)
        agg_post_df = aggregateStatsDF(stats_post_df)

        agg_df = pd.merge(agg_pre_df, agg_post_df, how='left',
                          left_index=True, right_index=True,
                          sort=True, suffixes=["_pre", "_post"])

        # TS - if count value not observed either pre/post-dedup,
        # merge will leave an empty cell and the column will be cast as a float
        # see http://pandas.pydata.org/pandas-docs/dev/missing_data.html
        # --> Missing data casting rules and indexing
        # so, back fill with zeros and convert back to int
        agg_df = agg_df.fillna(0).astype(int)

        agg_df.index = [x.decode() for x in agg_df.index]
        agg_df.index.name = 'UMI'
        agg_df.to_csv(options.stats + "_per_umi.tsv", sep="\t")

        # bin distances into integer bins
        max_ed = int(max(map(max, [pre_cluster_stats,
                                   post_cluster_stats,
                                   pre_cluster_stats_null,
                                   post_cluster_stats_null])))

        cluster_bins = range(-1, int(max_ed) + 2)

        def bin_clusters(cluster_list, bins=cluster_bins):
            ''' take list of floats and return bins'''
            return np.digitize(cluster_list, bins, right=True)

        def tallyCounts(binned_cluster, max_edit_distance):
            ''' tally counts per bin '''
            return np.bincount(binned_cluster,
                               minlength=max_edit_distance + 3)

        pre_cluster_binned = bin_clusters(pre_cluster_stats)
        post_cluster_binned = bin_clusters(post_cluster_stats)
        pre_cluster_null_binned = bin_clusters(pre_cluster_stats_null)
        post_cluster_null_binned = bin_clusters(post_cluster_stats_null)

        # TS - set lowest bin (-1) to "Single_UMI".  The labels are built
        # up front instead of being patched in afterwards with chained
        # indexing (``df[col][0] = ...``), which pandas does not guarantee
        # to write through to the frame.
        edit_distance_labels = ["Single_UMI"] + list(cluster_bins)[1:]

        edit_distance_df = pd.DataFrame({
            "unique": tallyCounts(pre_cluster_binned, max_ed),
            "unique_null": tallyCounts(pre_cluster_null_binned, max_ed),
            options.method: tallyCounts(post_cluster_binned, max_ed),
            "%s_null" % options.method: tallyCounts(
                post_cluster_null_binned, max_ed),
            "edit_distance": edit_distance_labels})

        edit_distance_df.to_csv(options.stats + "_edit_distance.tsv",
                                index=False, sep="\t")

    # write footer and output benchmark information.
    U.info("%s" % ", ".join(
        ["%s: %s" % (x[0], x[1]) for x in read_events.most_common()]))
    U.info("Number of reads out: %i" % nOutput)
    U.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Extracts cell barcodes (and UMIs) from the fastq read(s), counts
    either reads or distinct UMIs per cell barcode (``--method``), passes
    the counts to ``umi_methods.getCellWhitelist`` and writes one line per
    whitelisted barcode to the output: barcode, the barcodes corrected to
    it, its count and the counts of the corrected barcodes.

    Raises:
        ValueError: if ``--set-cell-number`` asks for more cell barcodes
            than were observed.
    """
    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-p", "--bc-pattern", dest="pattern", type="string",
                      help="Barcode pattern")
    parser.add_option("--bc-pattern2", dest="pattern2", type="string",
                      help="Barcode pattern for paired reads")
    parser.add_option("--3prime", dest="prime3", action="store_true",
                      help="barcode is on 3' end of read.")
    parser.add_option("--read2-in", dest="read2_in", type="string",
                      help="file name for read pairs")
    parser.add_option("--extract-method",
                      dest="extract_method", type="choice",
                      choices=["string", "regex"],
                      help=("How to extract the umi +/- cell barcodes, Choose "
                            "from 'string' or 'regex'"))
    parser.add_option("--plot-prefix",
                      dest="plot_prefix", type="string",
                      help=("Prefix for plots to visualise the automated "
                            "detection of the number of 'true' cell barcodes"))
    parser.add_option("--subset-reads",
                      dest="subset_reads", type="int",
                      help=("Use the first N reads to automatically identify "
                            "the true cell barcodes. If N is greater than the "
                            "number of reads, all reads will be used"))
    parser.add_option("--error-correct-threshold",
                      dest="error_correct_threshold",
                      type="int",
                      help=("Hamming distance for correction of "
                            "barcodes to whitelist barcodes"))
    parser.add_option("--method",
                      dest="method",
                      choices=["reads", "umis"],
                      help=("Use reads or unique umi counts per cell"))
    parser.add_option("--expect-cells",
                      dest="expect_cells",
                      type="int",
                      help=("Prior expectation on the upper limit on the "
                            "number of cells sequenced"))
    parser.add_option("--set-cell-number",
                      dest="cell_number",
                      type="int",
                      help=("Specify the number of cell barcodes to accept"))
    parser.set_defaults(method="reads",
                        extract_method="string",
                        filter_cell_barcodes=False,
                        whitelist_tsv=None,
                        blacklist_tsv=None,
                        error_correct_threshold=1,
                        pattern=None,
                        pattern2=None,
                        read2_in=None,
                        plot_prefix=None,
                        subset_reads=100000000,
                        expect_cells=False,
                        cell_number=False)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = U.Start(parser, argv=argv,
                              add_group_dedup_options=False,
                              add_sam_options=False)

    if options.expect_cells and options.cell_number:
        U.error("Cannot supply both --expect-cells and "
                "--set-cell-number options")

    if not options.pattern and not options.pattern2:
        if not options.read2_in:
            U.error("Must supply --bc-pattern for single-end")
        else:
            U.error("Must supply --bc-pattern and/or --bc-pattern2 "
                    "if paired-end ")

    if options.pattern2:
        if not options.read2_in:
            U.error("must specify a paired fastq ``--read2-in``")

        # NOTE(review): this inner test cannot be true inside
        # ``if options.pattern2`` -- kept as found; confirm the intent.
        if not options.pattern2:
            options.pattern2 = options.pattern

    extract_cell = False
    extract_umi = False

    # If the pattern is a regex we can compile the regex(es) prior to
    # ExtractFilterAndUpdate instantiation
    if options.extract_method == "regex":
        if options.pattern:
            try:
                options.pattern = regex.compile(options.pattern)
            except regex.error:
                U.error("barcode_regex '%s' is not a "
                        "valid regex" % options.pattern)

        if options.pattern2:
            # the second pattern lives in ``options.pattern2`` (dest of
            # --bc-pattern2) and compile failures raise ``regex.error``,
            # exactly as for the first pattern
            try:
                options.pattern2 = regex.compile(options.pattern2)
            except regex.error:
                U.error("barcode_regex2 '%s' is not a "
                        "valid regex" % options.pattern2)

    # check whether the regex contains a umi group(s) and cell groups(s)
    if options.extract_method == "regex":
        for compiled_pattern in (options.pattern, options.pattern2):
            if not compiled_pattern:
                continue
            for group in compiled_pattern.groupindex:
                if group.startswith("cell_"):
                    extract_cell = True
                elif group.startswith("umi_"):
                    extract_umi = True

    # check whether the pattern string contains umi/cell bases
    elif options.extract_method == "string":
        for pattern_string in (options.pattern, options.pattern2):
            if not pattern_string:
                continue
            if "C" in pattern_string:
                extract_cell = True
            if "N" in pattern_string:
                extract_umi = True

    if not extract_umi:
        if options.extract_method == "string":
            U.error("barcode pattern(s) do not include any umi bases "
                    "(marked with 'Ns') %s, %s" % (
                        options.pattern, options.pattern2))
        elif options.extract_method == "regex":
            U.error("barcode regex(es) do not include any umi groups "
                    "(starting with 'umi_') %s, %s" % (
                        options.pattern, options.pattern2))

    if not extract_cell:
        if options.extract_method == "string":
            U.error("barcode pattern(s) do not include any cell bases "
                    "(marked with 'Cs') %s, %s" % (
                        options.pattern, options.pattern2))
        elif options.extract_method == "regex":
            U.error("barcode regex(es) do not include any cell groups "
                    "(starting with 'cell_') %s, %s" % (
                        options.pattern, options.pattern2))

    read1s = umi_methods.fastqIterate(options.stdin)

    # set up read extractor
    ReadExtractor = umi_methods.ExtractFilterAndUpdate(
        method=options.extract_method,
        pattern=options.pattern,
        pattern2=options.pattern2,
        prime3=options.prime3,
        extract_cell=extract_cell)

    cell_barcode_counts = collections.Counter()

    n_reads = 0
    n_cell_barcodes = 0

    # if using the umis method, need to keep a set of umis observed
    if options.method == "umis":
        cell_barcode_umis = collections.defaultdict(set)

    # variables for progress monitor
    displayMax = 100000

    U.info("Starting barcode extraction")

    # NOTE(review): the single-end loop stops on the number of reads that
    # matched the pattern, the paired-end loop on the number of reads
    # parsed -- kept as found; confirm which one --subset-reads means.
    if not options.read2_in:
        for read1 in read1s:

            # Update display in every 100kth iteration
            if n_reads % displayMax == 0:
                U.info("Parsed {} reads".format(n_reads))

            n_reads += 1
            barcode_values = ReadExtractor.getBarcodes(read1)
            if barcode_values is None:
                continue
            else:
                cell, umi, _, _, _, _, _ = barcode_values
                if options.method == "umis":
                    cell_barcode_umis[cell].add(umi)
                else:
                    cell_barcode_counts[cell] += 1
                n_cell_barcodes += 1

            if options.subset_reads:
                if n_cell_barcodes > options.subset_reads:
                    break
    else:
        read2s = umi_methods.fastqIterate(U.openFile(options.read2_in))
        for read1, read2 in izip(read1s, read2s):

            # Update display in every 100kth iteration
            if n_reads % displayMax == 0:
                U.info("Parsed {} reads".format(n_reads))

            n_reads += 1

            barcode_values = ReadExtractor.getBarcodes(read1, read2)
            if barcode_values is None:
                continue
            else:
                cell, umi, _, _, _, _, _ = barcode_values
                if options.method == "umis":
                    cell_barcode_umis[cell].add(umi)
                else:
                    cell_barcode_counts[cell] += 1
                n_cell_barcodes += 1

            if options.subset_reads:
                if n_reads > options.subset_reads:
                    break

    U.info("Starting - whitelist determination")

    if options.method == "umis":
        # the count of a cell barcode is its number of distinct UMIs
        for cell in cell_barcode_umis:
            cell_barcode_counts[cell] = len(cell_barcode_umis[cell])

    if options.cell_number and options.cell_number > len(cell_barcode_counts):
        raise ValueError(
            "--set-cell-number option specifies more cell barcodes than the "
            "number of observed cell barcodes. This may be because "
            "--subset-reads was set to a value too low to capture reads from "
            "all cells. %s cell barcodes observed from %s parsed reads. "
            "Expected>= %s cell barcodes" % (
                len(cell_barcode_counts),
                options.subset_reads,
                options.cell_number))

    cell_whitelist, true_to_false_map = umi_methods.getCellWhitelist(
        cell_barcode_counts,
        options.expect_cells,
        options.cell_number,
        options.error_correct_threshold,
        options.plot_prefix)

    U.info("Writing out whitelist")
    for barcode in sorted(cell_whitelist):

        if true_to_false_map:
            # barcodes corrected to this whitelisted barcode, and their
            # counts in the same order
            false_barcodes = sorted(true_to_false_map[barcode])
            corrected_barcodes = ",".join(false_barcodes)
            corrected_barcode_counts = ",".join(
                str(cell_barcode_counts[x]) for x in false_barcodes)
        else:
            corrected_barcodes, corrected_barcode_counts = "", ""

        options.stdout.write("%s\t%s\t%s\t%s\n" % (
            barcode, corrected_barcodes, cell_barcode_counts[barcode],
            corrected_barcode_counts))

    U.info("Parsed %i reads" % n_reads)
    U.info("%i reads matched the barcode pattern" % n_cell_barcodes)
    U.info("Found %i unique cell barcodes" % len(cell_barcode_counts))

    U.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    For every bundle yielded by ``umi_methods.get_bundles`` the UMIs are
    grouped with ``network.UMIClusterer``.  Each read is then written,
    tagged with its group id and the group's first UMI, to the SAM/BAM
    output (``--output-bam``) and/or listed in the table named by
    ``--group-out``.
    """
    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(
        version="%prog version: $Id$",
        usage=globals()["__doc__"])

    option_group = U.OptionGroup(parser, "group-specific options")

    option_group.add_option(
        "--group-out",
        dest="tsv",
        type="string",
        help="Outfile name for file mapping read id to read group",
        default=None)

    option_group.add_option(
        "--output-bam",
        dest="output_bam",
        action="store_true",
        default=False,
        help=("output a bam file with read groups tagged using the UG tag"
              "[default=%default]"))

    option_group.add_option(
        "--output-unmapped",
        dest="output_unmapped",
        action="store_true",
        default=False,
        help=("Retain all unmapped reads in output[default=%default]"))

    parser.add_option(
        "--umi-group-tag",
        dest="umi_group_tag",
        type="string",
        help="tag for the outputted umi group",
        default='BX')

    parser.add_option_group(option_group)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = U.Start(parser, argv=argv)

    U.validateSamOptions(options)

    # reading from standard in is rejected; a named input file is needed
    if options.stdin == sys.stdin:
        raise ValueError("Input on standard in not currently supported")
    in_name = options.stdin.name
    options.stdin.close()

    # Unsorted output goes straight to its destination; sorted output is
    # first written to a temporary file and sorted into place at the end.
    sort_output = not options.no_sort_output
    if options.stdout != sys.stdout:
        destination = options.stdout.name
        if sort_output:
            out_name = U.getTempFilename()
            sorted_out_name = destination
        else:
            out_name = destination
        options.stdout.close()
        assert options.output_bam, (
            "To output a bam you must include --output-bam option")
    elif sort_output:
        out_name = U.getTempFilename()
        sorted_out_name = "-"
    else:
        out_name = "-"

    if sort_output:
        # need to determine the output format for sort
        sort_format = "sam" if options.out_sam else "bam"

    in_mode = "r" if options.in_sam else "rb"
    out_mode = "wh" if options.out_sam else "wb"

    infile = pysam.Samfile(in_name, in_mode)

    if options.output_bam:
        outfile = pysam.Samfile(out_name, out_mode, template=infile)
    else:
        outfile = None

    if options.tsv:
        mapping_outfile = U.openFile(options.tsv, "w")
        column_names = "\t".join([
            "read_id", "contig", "position", "gene", "umi", "umi_count",
            "final_umi", "final_umi_count", "unique_id"])
        mapping_outfile.write("%s\n" % column_names)

    nInput = 0
    nOutput = 0
    unique_id = 0
    input_reads = 0
    output_reads = 0

    gene_tag = options.gene_tag
    metacontig2contig = None

    use_metacontigs = (not options.chrom and options.per_gene
                       and options.gene_transcript_map)

    if options.chrom:
        inreads = infile.fetch(reference=options.chrom)
    elif use_metacontigs:
        metacontig2contig = umi_methods.getMetaContig2contig(
            infile, options.gene_transcript_map)
        metatag = "MC"
        inreads = umi_methods.metafetcher(infile, metacontig2contig, metatag)
        gene_tag = metatag
    else:
        inreads = infile.fetch(until_eof=options.output_unmapped)

    bundle_iterator = umi_methods.get_bundles(
        options,
        all_reads=True,
        return_read2=True,
        return_unmapped=options.output_unmapped,
        metacontig_contig=metacontig2contig)

    for bundle, _bundle_key, status in bundle_iterator(inreads):

        # write out read2s and unmapped (if these options are set)
        if status == 'single_read':
            # bundle is just a single read here
            nInput += 1

            if outfile:
                outfile.write(bundle)

            nOutput += 1
            continue

        umis = bundle.keys()
        counts = {umi: bundle[umi]["count"] for umi in umis}

        nInput += sum(counts.values())

        # report progress once per 10000 reads written ...
        while nOutput >= output_reads + 10000:
            output_reads += 10000
            U.info("Written out %i reads" % output_reads)

        # ... and once per 1000000 reads parsed
        while nInput >= input_reads + 1000000:
            input_reads += 1000000
            U.info("Parsed %i input reads" % input_reads)

        # set up UMIClusterer functor with methods specific to
        # specified options.method
        processor = network.UMIClusterer(options.method)

        # group the umis
        for umi_group in processor(umis, counts,
                                   threshold=options.threshold):
            top_umi = umi_group[0]

            group_count = 0
            for member in umi_group:
                group_count += counts[member]

            for umi in umi_group:
                for read in bundle[umi]['read']:
                    if outfile:
                        # Add the 'UG' tag to the read
                        read.tags += [('UG', unique_id)]
                        read.tags += [(options.umi_group_tag, top_umi)]
                        outfile.write(read)

                    if options.tsv:
                        if options.per_gene:
                            gene = read.get_tag(gene_tag)
                        else:
                            gene = "NA"
                        columns = (
                            read.query_name,
                            read.reference_name,
                            umi_methods.get_read_position(
                                read, options.soft_clip_threshold)[1],
                            gene,
                            umi.decode(),
                            counts[umi],
                            top_umi.decode(),
                            group_count,
                            unique_id)
                        mapping_outfile.write(
                            "%s\n" % "\t".join(map(str, columns)))

                    nOutput += 1

            # every group gets its own id
            unique_id += 1

    if outfile:
        outfile.close()
        if sort_output:
            # sort the output
            pysam.sort("-o", sorted_out_name, "-O", sort_format, out_name)
            os.unlink(out_name)  # delete the tempfile

    if options.tsv:
        mapping_outfile.close()

    # write footer and output benchmark information.
    read_event_report = [
        "%s: %s" % (name, count)
        for name, count in bundle_iterator.read_events.most_common()]
    U.info("Reads: %s" % ", ".join(read_event_report))

    U.info("Number of reads out: %i, Number of groups: %i" %
           (nOutput, unique_id))

    U.Stop()
def _mark_thresholds(axes, user_value, candidates, chosen, palette, handles):
    ''' draw the dashed threshold lines and the legend shared by the
    density, knee and counts plots

    input:
         axes = matplotlib axes to draw on
         user_value = x position of the user-defined threshold, or None
                      when the threshold was estimated from the density
         candidates = x positions of every candidate threshold (only
                      used when user_value is None)
         chosen = x position of the accepted candidate, or None when no
                  candidate was accepted
         palette = colour list; entry 0 marks user-defined/selected
                   thresholds, entry 3 marks rejected ones
         handles = (user, selected, rejected) legend handles

    returns:
         the legend artist, so the caller can hand it to savefig
    '''

    user_line, selected_line, rejected_line = handles

    if user_value is not None:
        axes.axvline(x=user_value, ls="dashed", color=palette[0])
        return axes.legend(bbox_to_anchor=(1.05, 1), loc=2,
                           borderaxespad=0., handles=[user_line],
                           title="Cell threshold")

    for candidate in candidates:
        if chosen is not None and candidate == chosen:
            colour = palette[0]  # the accepted local minimum
        else:
            colour = palette[3]
        axes.axvline(x=candidate, ls="dashed", color=colour)

    return axes.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.,
                       handles=[selected_line, rejected_line],
                       title="Possible thresholds")


def getKneeEstimateDensity(cell_barcode_counts,
                           expect_cells=False,
                           cell_number=False,
                           plotfile_prefix=None):
    ''' estimate the number of "true" cell barcodes using a gaussian
    density-based method

    input:
         cell_barcode_counts = collections.Counter(key = barcode,
                               value = count); most_common() is used
         expect_cells (optional) = define the expected number of cells
         cell_number (optional) = define number of cell barcodes to accept
         plotfile_prefix = (optional) prefix for plots. When given, three
                           png plots are written and, unless cell_number
                           is set, a "<prefix>_cell_thresholds.tsv" table
                           listing every candidate threshold

    returns:
         Set of true barcodes, or None if cell_number was not given and
         no local minimum of the density was accepted
    '''

    # very low abundance cell barcodes are filtered out (< 0.001 *
    # the most abundant)
    threshold = 0.001 * cell_barcode_counts.most_common(1)[0][1]

    counts = sorted(cell_barcode_counts.values(), reverse=True)
    counts_thresh = [x for x in counts if x > threshold]
    log_counts = np.log10(counts_thresh)

    # gaussian density with hardcoded bw
    density = gaussian_kde(log_counts, bw_method=0.1)

    xx_values = 10000  # how many x values for density plot
    xx = np.linspace(log_counts.min(), log_counts.max(), xx_values)

    local_min = None

    if cell_number:  # we have a prior hard expectation on the number of cells
        if cell_number < len(counts):
            threshold = counts[cell_number]
        else:
            # every observed barcode was requested: counts[cell_number]
            # would be out of range, so put the cut-off strictly below
            # the least abundant barcode instead
            threshold = counts[-1] / 2.0

    else:
        local_mins = argrelextrema(density(xx), np.less)[0]
        local_mins_counts = []
        all_counts = np.array(counts)

        # walk the candidates from the highest count downwards
        for poss_local_min in local_mins[::-1]:

            cutoff = np.power(10, xx[poss_local_min])
            passing_threshold = int(np.sum(all_counts > cutoff))
            local_mins_counts.append(passing_threshold)

            if local_min is not None:
                # a local min has already been selected; the remaining
                # candidates are only tallied for the plots/table
                continue

            if expect_cells:  # we have a "soft" expectation
                if (passing_threshold > expect_cells * 0.1 and
                        passing_threshold <= expect_cells):
                    local_min = poss_local_min

            else:  # we have no prior expectation
                # TS: In absence of any expectation (either hard or soft),
                # this set of heuristic thresholds are used to decide
                # which local minimum to select.
                # This is very unlikely to be the best way to achieve this!
                if (poss_local_min >= 0.2 * xx_values and
                    (log_counts.max() - xx[poss_local_min] > 0.5 or
                     xx[poss_local_min] < log_counts.max() / 2)):
                    local_min = poss_local_min

        if local_min is not None:
            threshold = np.power(10, xx[local_min])

    if cell_number or local_min is not None:
        final_barcodes = {
            x for x, y in cell_barcode_counts.items() if y > threshold}
    else:
        final_barcodes = None

    if plotfile_prefix:

        # colour-blind friendly colours - https://gist.github.com/thriveth/8560036
        CB_color_cycle = ['#377eb8', '#ff7f00', '#4daf4a',
                          '#f781bf', '#a65628', '#984ea3',
                          '#999999', '#e41a1c', '#dede00']

        user_line = mlines.Line2D(
            [], [], color=CB_color_cycle[0], ls="dashed",
            markersize=15, label='User-defined')
        selected_line = mlines.Line2D(
            [], [], color=CB_color_cycle[0], ls="dashed",
            markersize=15, label='Selected')
        rejected_line = mlines.Line2D(
            [], [], color=CB_color_cycle[3], ls="dashed",
            markersize=15, label='Rejected')
        handles = (user_line, selected_line, rejected_line)

        # x positions of the threshold lines: log10(count) for the
        # density plot, barcode rank for the knee and counts plots
        if cell_number:
            density_user, rank_user = np.log10(threshold), cell_number
            density_candidates, rank_candidates = [], []
        else:
            density_user, rank_user = None, None
            density_candidates = xx[local_mins]
            rank_candidates = local_mins_counts

        if local_min is not None:
            density_chosen, rank_chosen = xx[local_min], len(final_barcodes)
        else:
            density_chosen, rank_chosen = None, None

        # make density plot
        fig = plt.figure()
        fig1 = fig.add_subplot(111)
        fig1.plot(xx, density(xx), 'k')
        fig1.set_xlabel("Count per cell (log10)")
        fig1.set_ylabel("Density")

        lgd = _mark_thresholds(fig1, density_user, density_candidates,
                               density_chosen, CB_color_cycle, handles)

        fig.savefig("%s_cell_barcode_count_density.png" % plotfile_prefix,
                    bbox_extra_artists=(lgd,), bbox_inches='tight')
        plt.close(fig)  # pyplot keeps figures alive until closed

        # make knee plot
        fig = plt.figure()
        fig2 = fig.add_subplot(111)
        fig2.plot(range(0, len(counts)), np.cumsum(counts), c="black")

        xmax = len(counts)
        if local_min is not None:
            # reasonable maximum x-axis value
            xmax = min(len(final_barcodes) * 5, xmax)

        fig2.set_xlim((0 - (0.01 * xmax), xmax))
        fig2.set_xlabel("Rank")
        fig2.set_ylabel("Cumulative count")

        lgd = _mark_thresholds(fig2, rank_user, rank_candidates,
                               rank_chosen, CB_color_cycle, handles)

        fig.savefig("%s_cell_barcode_knee.png" % plotfile_prefix,
                    bbox_extra_artists=(lgd,), bbox_inches='tight')
        plt.close(fig)

        # make counts plot: selected barcodes coloured, the rest black
        if local_min is not None:
            n_selected = len(final_barcodes)
            colours = ([CB_color_cycle[0]] * n_selected +
                       ["black"] * (len(counts) - n_selected))
        else:
            colours = ["black"] * len(counts)

        fig = plt.figure()
        fig3 = fig.add_subplot(111)
        fig3.scatter(x=range(1, len(counts) + 1), y=counts,
                     c=colours, s=10, linewidths=0)
        fig3.loglog()
        fig3.set_xlim(0, len(counts) * 1.25)
        fig3.set_xlabel('Barcode index')
        fig3.set_ylabel('Count')

        lgd = _mark_thresholds(fig3, rank_user, rank_candidates,
                               rank_chosen, CB_color_cycle, handles)

        fig.savefig("%s_cell_barcode_counts.png" % plotfile_prefix,
                    bbox_extra_artists=(lgd,), bbox_inches='tight')
        plt.close(fig)

        if not cell_number:
            with U.openFile("%s_cell_thresholds.tsv" % plotfile_prefix,
                            "w") as outf:
                outf.write("count\taction\n")
                for local_mins_count in local_mins_counts:
                    if (local_min is not None and
                            local_mins_count == len(final_barcodes)):
                        threshold_type = "Selected"
                    else:
                        threshold_type = "Rejected"

                    outf.write("%s\t%s\n" % (local_mins_count,
                                             threshold_type))

    return final_barcodes
def main(argv=None):
    """script main.

    Deduplicate the reads of a SAM/BAM file on mapping position and UMI.

    Parses command line options in sys.argv, unless *argv* is given,
    writes the retained reads to the output file and, with
    --output-stats, writes per-UMI and edit-distance summary tables
    using the given value as filename prefix.

    Raises ValueError if input is supplied on stdin, if incompatible
    options are combined, or if the requested multimapping detection tag
    is not available in the input.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-i", "--in-sam", dest="in_sam", action="store_true",
                      help="Input file is in sam format", default=False)
    parser.add_option("-o", "--out-sam", dest="out_sam", action="store_true",
                      help="Output alignments in sam format", default=False)
    parser.add_option("--ignore-umi", dest="ignore_umi",
                      action="store_true",
                      help="Ignore UMI and dedup only on position",
                      default=False)
    # the value is a fraction, so let the parser validate it as a float
    parser.add_option("--subset", dest="subset", type="float",
                      help="Use only a fraction of reads, specified by subset",
                      default=1.1)
    parser.add_option("--spliced-is-unique", dest="spliced",
                      action="store_true",
                      help="Treat a spliced read as different to an unspliced"
                           " one",
                      default=False)
    parser.add_option("--soft-clip-threshold", dest="soft",
                      type="float",
                      help="number of bases clipped from 5' end before "
                           "read is counted as spliced",
                      default=4)
    parser.add_option("--edit-distance-threshold", dest="threshold",
                      type="int",
                      help="Edit distance threshold at which to join two "
                           "UMIs when clustering",
                      default=1)
    parser.add_option("--chrom", dest="chrom", type="string",
                      help="Restrict to one chromosome",
                      default=None)
    parser.add_option("--paired", dest="paired", action="store_true",
                      default=False,
                      help="Use second-in-pair position when deduping")
    parser.add_option("--method", dest="method", type="choice",
                      choices=("adjacency", "directional-adjacency",
                               "percentile", "unique", "cluster"),
                      default="directional-adjacency",
                      help="method to use for umi deduping")
    parser.add_option("--output-stats", dest="stats", type="string",
                      default=False,
                      help="Specify location to output stats")
    parser.add_option("--further-stats", dest="further_stats",
                      action="store_true", default=False,
                      help="Output further stats")
    parser.add_option("--per-contig", dest="per_contig", action="store_true",
                      default=False,
                      help=("dedup per contig,"
                            " e.g for transcriptome where contig = gene"))
    parser.add_option("--whole-contig", dest="whole_contig",
                      action="store_true", default=False,
                      help="Read whole contig before outputting bundles: "
                           "guarantees that no reads "
                           "are missed, but increases memory usage")
    parser.add_option("--multimapping-detection-method",
                      dest="detection_method", type="choice",
                      choices=("NH", "X0", "XT"),
                      default=None,
                      help=("Some aligners identify multimapping using bam "
                            "tags. Setting this option to NH, X0 or XT will "
                            "use these tags when selecting the best read "
                            "amongst reads with the same position and umi"))
    parser.add_option("--mapping-quality", dest="mapping_quality",
                      type="int",
                      help="Minimum mapping quality for a read to be retained",
                      default=0)
    parser.add_option("--read-length", dest="read_length",
                      action="store_true", default=False,
                      help=("use read length in addition to position and UMI "
                            "to identify possible duplicates"))

    # add common options (-h/--help, ...) and parse command line
    (options, args) = U.Start(parser, argv=argv)

    # pysam reopens the files by name, so only the names are kept
    if options.stdin != sys.stdin:
        in_name = options.stdin.name
        options.stdin.close()
    else:
        raise ValueError("Input on standard in not currently supported")

    if options.stdout != sys.stdout:
        out_name = options.stdout.name
        options.stdout.close()
    else:
        out_name = "-"

    if options.in_sam:
        in_mode = "r"
    else:
        in_mode = "rb"

    if options.out_sam:
        out_mode = "w"
    else:
        out_mode = "wb"

    if options.stats:
        if options.ignore_umi:
            raise ValueError("'--output-stats' and '--ignore-umi' options"
                             " cannot be used together")

    if options.further_stats:
        if not options.stats:
            raise ValueError("'--further-stats' options requires "
                             "'--output-stats' option")
        if options.method not in ["cluster", "adjacency"]:
            raise ValueError("'--further-stats' only enabled with 'cluster' "
                             "and 'adjacency' methods")

    infile = pysam.Samfile(in_name, in_mode)
    outfile = pysam.Samfile(out_name, out_mode, template=infile)

    if options.paired:
        outfile = TwoPassPairWriter(infile, outfile)

    # input_reads/output_reads hold the last count that was reported
    nInput, nOutput, input_reads, output_reads = 0, 0, 0, 0

    if options.detection_method:
        bam_features = detect_bam_features(infile.filename)

        if not bam_features[options.detection_method]:
            if sum(bam_features.values()) == 0:
                raise ValueError(
                    "There are no bam tags available to detect multimapping. "
                    "Do not set --multimapping-detection-method")
            else:
                raise ValueError(
                    "The chosen method of detection for multimapping (%s) "
                    "will not work with this bam. Multimapping can be detected"
                    " for this bam using any of the following: %s" % (
                        options.detection_method,
                        ",".join([x for x in bam_features
                                  if bam_features[x]])))

    if options.stats:
        # set up arrays to hold stats data
        stats_pre_df_dict = {"UMI": [], "counts": []}
        stats_post_df_dict = {"UMI": [], "counts": []}
        pre_cluster_stats = []
        post_cluster_stats = []
        pre_cluster_stats_null = []
        post_cluster_stats_null = []
        topology_counts = collections.Counter()
        node_counts = collections.Counter()
        read_gn = random_read_generator(infile.filename, chrom=options.chrom)

    for bundle in get_bundles(infile,
                              ignore_umi=options.ignore_umi,
                              subset=float(options.subset),
                              quality_threshold=options.mapping_quality,
                              paired=options.paired,
                              chrom=options.chrom,
                              spliced=options.spliced,
                              soft_clip_threshold=options.soft,
                              per_contig=options.per_contig,
                              whole_contig=options.whole_contig,
                              read_length=options.read_length,
                              detection_method=options.detection_method):

        nInput += sum([bundle[umi]["count"] for umi in bundle])

        # report each time a reporting interval has been crossed; the
        # counters grow in bundle-sized steps so an exact-multiple test
        # would almost never fire
        while nOutput >= output_reads + 10000:
            output_reads += 10000
            U.debug("Outputted %i" % output_reads)

        while nInput >= input_reads + 1000000:
            input_reads += 1000000
            U.debug("Read %i input reads" % input_reads)

        if options.stats:
            # generate pre-dedup stats
            average_distance = get_average_umi_distance(bundle.keys())
            pre_cluster_stats.append(average_distance)
            cluster_size = len(bundle)
            random_umis = read_gn.getUmis(cluster_size)
            average_distance_null = get_average_umi_distance(random_umis)
            pre_cluster_stats_null.append(average_distance_null)

        if options.ignore_umi:
            for umi in bundle:
                nOutput += 1
                outfile.write(bundle[umi]["read"])

        else:
            # set up ClusterAndReducer functor with methods specific to
            # specified options.method
            processor = ClusterAndReducer(options.method)

            # dedup using umis and write out deduped bam
            reads, umis, umi_counts, topologies, nodes = processor(
                bundle, options.threshold,
                options.stats, options.further_stats)

            for read in reads:
                outfile.write(read)
                nOutput += 1

            if options.stats:

                # collect pre-dedup stats
                stats_pre_df_dict['UMI'].extend(bundle)
                stats_pre_df_dict['counts'].extend(
                    [bundle[UMI]['count'] for UMI in bundle])

                # collect post-dedup stats
                post_cluster_umis = [x.qname.split("_")[-1] for x in reads]
                stats_post_df_dict['UMI'].extend(umis)
                stats_post_df_dict['counts'].extend(umi_counts)

                average_distance = get_average_umi_distance(post_cluster_umis)
                post_cluster_stats.append(average_distance)

                cluster_size = len(post_cluster_umis)
                random_umis = read_gn.getUmis(cluster_size)
                average_distance_null = get_average_umi_distance(random_umis)
                post_cluster_stats_null.append(average_distance_null)

                if options.further_stats:
                    for c_type, count in topologies.most_common():
                        topology_counts[c_type] += count
                    for c_type, count in nodes.most_common():
                        node_counts[c_type] += count

    outfile.close()

    if options.stats:

        stats_pre_df = pd.DataFrame(stats_pre_df_dict)
        stats_post_df = pd.DataFrame(stats_post_df_dict)

        # generate histograms of counts per UMI at each position
        UMI_counts_df_pre = pd.DataFrame(stats_pre_df.pivot_table(
            columns=stats_pre_df["counts"], values="counts", aggfunc=len))
        UMI_counts_df_post = pd.DataFrame(stats_post_df.pivot_table(
            columns=stats_post_df["counts"], values="counts", aggfunc=len))

        UMI_counts_df_pre.columns = ["instances"]
        UMI_counts_df_post.columns = ["instances"]

        UMI_counts_df = pd.merge(UMI_counts_df_pre, UMI_counts_df_post,
                                 how='left', left_index=True,
                                 right_index=True, sort=True,
                                 suffixes=["_pre", "_post"])

        # TS - if count value not observed either pre/post-dedup,
        # merge will leave an empty cell and the column will be cast as a float
        # see http://pandas.pydata.org/pandas-docs/dev/missing_data.html
        # --> Missing data casting rules and indexing
        # so, back fill with zeros and convert back to int
        UMI_counts_df = UMI_counts_df.fillna(0).astype(int)

        UMI_counts_df.to_csv(
            options.stats + "_per_umi_per_position.tsv", sep="\t")

        # aggregate stats pre/post per UMI
        agg_pre_df = aggregateStatsDF(stats_pre_df)
        agg_post_df = aggregateStatsDF(stats_post_df)

        agg_df = pd.merge(agg_pre_df, agg_post_df, how='left',
                          left_index=True, right_index=True,
                          sort=True, suffixes=["_pre", "_post"])

        # TS - see comment above regarding missing values
        agg_df = agg_df.fillna(0).astype(int)
        agg_df.to_csv(options.stats + "_per_umi.tsv", sep="\t")

        # bin distances into integer bins
        max_ed = int(max(map(max, [pre_cluster_stats,
                                   post_cluster_stats,
                                   pre_cluster_stats_null,
                                   post_cluster_stats_null])))

        cluster_bins = range(-1, int(max_ed) + 2)

        def bin_clusters(cluster_list, bins=cluster_bins):
            ''' take list of floats and return bins'''
            return np.digitize(cluster_list, bins, right=True)

        def tallyCounts(binned_cluster, max_edit_distance):
            ''' tally counts per bin '''
            return np.bincount(binned_cluster,
                               minlength=max_edit_distance + 3)

        pre_cluster_binned = bin_clusters(pre_cluster_stats)
        post_cluster_binned = bin_clusters(post_cluster_stats)
        pre_cluster_null_binned = bin_clusters(pre_cluster_stats_null)
        post_cluster_null_binned = bin_clusters(post_cluster_stats_null)

        # TS - the lowest bin (-1) is labelled "Single_UMI". The label is
        # put into the column up front rather than assigned afterwards
        # through chained indexing, which pandas does not guarantee to
        # write back into the frame
        edit_distance_labels = ["Single_UMI"] + list(cluster_bins)[1:]

        edit_distance_df = pd.DataFrame({
            "unique": tallyCounts(pre_cluster_binned, max_ed),
            "unique_null": tallyCounts(pre_cluster_null_binned, max_ed),
            options.method: tallyCounts(post_cluster_binned, max_ed),
            "%s_null" % options.method: tallyCounts(
                post_cluster_null_binned, max_ed),
            "edit_distance": edit_distance_labels})

        edit_distance_df.to_csv(options.stats + "_edit_distance.tsv",
                                index=False, sep="\t")

        if options.further_stats:
            with U.openFile(options.stats + "_topologies.tsv", "w") as outf:
                outf.write(
                    "\n".join(["\t".join((x, str(y)))
                               for x, y in topology_counts.most_common()]) +
                    "\n")

            with U.openFile(options.stats + "_nodes.tsv", "w") as outf:
                outf.write(
                    "\n".join(["\t".join(map(str, (x, y)))
                               for x, y in node_counts.most_common()]) + "\n")

    # write footer and output benchmark information.
    U.info("Number of reads in: %i, Number of reads out: %i" %
           (nInput, nOutput))
    U.Stop()
def main(argv=None):
    """script main.

    Deduplicate the reads of a SAM/BAM file on mapping position and UMI.

    Parses command line options in sys.argv, unless *argv* is given.
    The deduplicated reads are written to the output file, which is
    sorted with pysam unless options.no_sort_output is set. With
    --output-stats, per-UMI and edit-distance summary tables are written
    using the given value as filename prefix.

    Raises ValueError if input is supplied on stdin, if --output-stats
    is combined with --ignore-umi, or if the requested multimapping
    detection tag is not available in the input.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=usage,
                            description=globals()["__doc__"])

    group = U.OptionGroup(parser, "dedup-specific options")

    group.add_option("--output-stats", dest="stats", type="string",
                     default=False,
                     help="Specify location to output stats")

    parser.add_option_group(group)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = U.Start(parser, argv=argv)

    U.validateSamOptions(options, group=False)

    if options.random_seed:
        np.random.seed(options.random_seed)

    # pysam reopens the files by name, so only the names are kept
    if options.stdin != sys.stdin:
        in_name = options.stdin.name
        options.stdin.close()
    else:
        raise ValueError("Input on standard in not currently supported")

    # when the output is to be sorted, reads are first written to a
    # temporary file which is then sorted into the final destination
    if options.stdout != sys.stdout:
        if options.no_sort_output:
            out_name = options.stdout.name
        else:
            out_name = U.getTempFilename(dir=options.tmpdir)
            sorted_out_name = options.stdout.name
        options.stdout.close()
    else:
        if options.no_sort_output:
            out_name = "-"
        else:
            out_name = U.getTempFilename(dir=options.tmpdir)
            sorted_out_name = "-"

    if not options.no_sort_output:  # need to determine the output format for sort
        if options.out_sam:
            sort_format = "sam"
        else:
            sort_format = "bam"

    if options.in_sam:
        in_mode = "r"
    else:
        in_mode = "rb"

    if options.out_sam:
        out_mode = "wh"
    else:
        out_mode = "wb"

    if options.stats and options.ignore_umi:
        raise ValueError("'--output-stats' and '--ignore-umi' options"
                         " cannot be used together")

    infile = pysam.Samfile(in_name, in_mode)
    outfile = pysam.Samfile(out_name, out_mode, template=infile)

    if options.paired:
        outfile = sam_methods.TwoPassPairWriter(infile, outfile)

    # input_reads/output_reads hold the last count that was reported
    nInput, nOutput, input_reads, output_reads = 0, 0, 0, 0

    if options.detection_method:
        bam_features = detect_bam_features(infile.filename)

        if not bam_features[options.detection_method]:
            if sum(bam_features.values()) == 0:
                raise ValueError(
                    "There are no bam tags available to detect multimapping. "
                    "Do not set --multimapping-detection-method")
            else:
                raise ValueError(
                    "The chosen method of detection for multimapping (%s) "
                    "will not work with this bam. Multimapping can be detected"
                    " for this bam using any of the following: %s" % (
                        options.detection_method,
                        ",".join([x for x in bam_features
                                  if bam_features[x]])))

    metacontig2contig = None

    if options.chrom:
        inreads = infile.fetch(reference=options.chrom)

    else:
        if options.per_contig and options.gene_transcript_map:
            metacontig2contig = sam_methods.getMetaContig2contig(
                infile, options.gene_transcript_map)
            metatag = "MC"
            inreads = sam_methods.metafetcher(infile, metacontig2contig,
                                              metatag)
        else:
            inreads = infile.fetch()

    # set up ReadCluster functor with methods specific to
    # specified options.method
    processor = network.ReadDeduplicator(options.method)

    bundle_iterator = sam_methods.get_bundles(
        options,
        metacontig_contig=metacontig2contig)

    if options.stats:
        # set up arrays to hold stats data
        stats_pre_df_dict = {"UMI": [], "counts": []}
        stats_post_df_dict = {"UMI": [], "counts": []}
        pre_cluster_stats = []
        post_cluster_stats = []
        pre_cluster_stats_null = []
        post_cluster_stats_null = []
        topology_counts = collections.Counter()
        node_counts = collections.Counter()
        read_gn = umi_methods.random_read_generator(
            infile.filename, chrom=options.chrom,
            barcode_getter=bundle_iterator.barcode_getter)

    for bundle, key, status in bundle_iterator(inreads):

        nInput += sum([bundle[umi]["count"] for umi in bundle])

        # report each time a reporting interval has been crossed
        while nOutput >= output_reads + 100000:
            output_reads += 100000
            U.info("Written out %i reads" % output_reads)

        while nInput >= input_reads + 1000000:
            input_reads += 1000000
            U.info("Parsed %i input reads" % input_reads)

        if options.stats:
            # generate pre-dedup stats
            average_distance = umi_methods.get_average_umi_distance(
                bundle.keys())
            pre_cluster_stats.append(average_distance)
            cluster_size = len(bundle)
            random_umis = read_gn.getUmis(cluster_size)
            average_distance_null = umi_methods.get_average_umi_distance(
                random_umis)
            pre_cluster_stats_null.append(average_distance_null)

        if options.ignore_umi:
            for umi in bundle:
                nOutput += 1
                outfile.write(bundle[umi]["read"])

        else:
            # dedup using umis and write out deduped bam
            reads, umis, umi_counts = processor(
                bundle=bundle,
                threshold=options.threshold)

            for read in reads:
                outfile.write(read)
                nOutput += 1

            if options.stats:

                # collect pre-dedup stats
                stats_pre_df_dict['UMI'].extend(bundle)
                stats_pre_df_dict['counts'].extend(
                    [bundle[UMI]['count'] for UMI in bundle])

                # collect post-dedup stats
                post_cluster_umis = [bundle_iterator.barcode_getter(x)[0]
                                     for x in reads]
                stats_post_df_dict['UMI'].extend(umis)
                stats_post_df_dict['counts'].extend(umi_counts)

                average_distance = umi_methods.get_average_umi_distance(
                    post_cluster_umis)
                post_cluster_stats.append(average_distance)

                cluster_size = len(post_cluster_umis)
                random_umis = read_gn.getUmis(cluster_size)
                average_distance_null = umi_methods.get_average_umi_distance(
                    random_umis)
                post_cluster_stats_null.append(average_distance_null)

    outfile.close()

    if not options.no_sort_output:
        # sort the output
        pysam.sort("-o", sorted_out_name, "-O", sort_format, out_name)
        os.unlink(out_name)  # delete the tempfile

    if options.stats:

        # generate the stats dataframe
        stats_pre_df = pd.DataFrame(stats_pre_df_dict)
        stats_post_df = pd.DataFrame(stats_post_df_dict)

        # tally the counts per umi per position
        pre_counts = collections.Counter(stats_pre_df["counts"])
        post_counts = collections.Counter(stats_post_df["counts"])
        counts_index = list(
            set(pre_counts.keys()).union(set(post_counts.keys())))
        counts_index.sort()

        with U.openFile(options.stats + "_per_umi_per_position.tsv",
                        "w") as outf:
            outf.write("counts\tinstances_pre\tinstances_post\n")
            for count in counts_index:
                values = (count, pre_counts[count], post_counts[count])
                outf.write("\t".join(map(str, values)) + "\n")

        # aggregate stats pre/post per UMI
        agg_pre_df = aggregateStatsDF(stats_pre_df)
        agg_post_df = aggregateStatsDF(stats_post_df)

        agg_df = pd.merge(agg_pre_df, agg_post_df, how='left',
                          left_index=True, right_index=True,
                          sort=True, suffixes=["_pre", "_post"])

        # TS - if count value not observed either pre/post-dedup,
        # merge will leave an empty cell and the column will be cast as a float
        # see http://pandas.pydata.org/pandas-docs/dev/missing_data.html
        # --> Missing data casting rules and indexing
        # so, back fill with zeros and convert back to int
        agg_df = agg_df.fillna(0).astype(int)

        agg_df.index = [x.decode() for x in agg_df.index]
        agg_df.index.name = 'UMI'

        agg_df.to_csv(options.stats + "_per_umi.tsv", sep="\t")

        # bin distances into integer bins
        max_ed = int(max(map(max, [pre_cluster_stats,
                                   post_cluster_stats,
                                   pre_cluster_stats_null,
                                   post_cluster_stats_null])))

        cluster_bins = range(-1, int(max_ed) + 2)

        def bin_clusters(cluster_list, bins=cluster_bins):
            ''' take list of floats and return bins'''
            return np.digitize(cluster_list, bins, right=True)

        def tallyCounts(binned_cluster, max_edit_distance):
            ''' tally counts per bin '''
            return np.bincount(binned_cluster,
                               minlength=max_edit_distance + 3)

        pre_cluster_binned = bin_clusters(pre_cluster_stats)
        post_cluster_binned = bin_clusters(post_cluster_stats)
        pre_cluster_null_binned = bin_clusters(pre_cluster_stats_null)
        post_cluster_null_binned = bin_clusters(post_cluster_stats_null)

        # TS - the lowest bin (-1) is labelled "Single_UMI". The label is
        # put into the column up front rather than assigned afterwards
        # through chained indexing, which pandas does not guarantee to
        # write back into the frame
        edit_distance_labels = ["Single_UMI"] + list(cluster_bins)[1:]

        edit_distance_df = pd.DataFrame(
            {"unique": tallyCounts(pre_cluster_binned, max_ed),
             "unique_null": tallyCounts(pre_cluster_null_binned, max_ed),
             options.method: tallyCounts(post_cluster_binned, max_ed),
             "%s_null" % options.method: tallyCounts(
                 post_cluster_null_binned, max_ed),
             "edit_distance": edit_distance_labels},
            columns=["unique", "unique_null", options.method,
                     "%s_null" % options.method, "edit_distance"])

        edit_distance_df.to_csv(options.stats + "_edit_distance.tsv",
                                index=False, sep="\t")

    # write footer and output benchmark information.
    U.info("Reads: %s" % ", ".join(
        ["%s: %s" % (x[0], x[1])
         for x in bundle_iterator.read_events.most_common()]))

    U.info("Number of reads out: %i" % nOutput)

    if not options.ignore_umi:  # otherwise processor has not been used
        U.info("Total number of positions deduplicated: %i" %
               processor.UMIClusterer.positions)
        if processor.UMIClusterer.positions > 0:
            U.info("Mean number of unique UMIs per position: %.2f" %
                   (float(processor.UMIClusterer.total_umis_per_position) /
                    processor.UMIClusterer.positions))
            U.info("Max. number of unique UMIs per position: %i" %
                   processor.UMIClusterer.max_umis_per_position)
        else:
            U.warn("The BAM did not contain any valid "
                   "reads/read pairs for deduplication")

    U.Stop()
def main(argv=None):
    """script main.

    Build a whitelist of "true" cell barcodes from fastq input.

    Parses command line options in sys.argv, unless *argv* is given,
    counts the cell barcodes (reads, or unique UMIs with --method=umis)
    extracted from the reads on stdin (and --read2-in, if given), asks
    whitelist_methods.getCellWhitelist for the accepted barcodes and
    writes one tab-separated line per accepted barcode to stdout:
    barcode, barcodes correctable to it, its count, their counts.

    Raises ValueError if --set-cell-number exceeds the number of
    observed cell barcodes. Other invalid option combinations are
    reported through U.error.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=usage,
                            description=globals()["__doc__"])

    group = U.OptionGroup(parser, "whitelist-specific options")

    group.add_option("--plot-prefix",
                     dest="plot_prefix", type="string",
                     help=("Prefix for plots to visualise the automated "
                           "detection of the number of 'true' cell barcodes"))
    group.add_option("--subset-reads",
                     dest="subset_reads", type="int",
                     help=("Use the first N reads to automatically identify "
                           "the true cell barcodes. If N is greater than the "
                           "number of reads, all reads will be used. "
                           "Default is 100,000,000"))
    group.add_option("--error-correct-threshold",
                     dest="error_correct_threshold",
                     type="int",
                     help=("Hamming distance for correction of barcodes to "
                           "whitelist barcodes. This value will also be used "
                           "for error detection above the knee if required "
                           "(--ed-above-threshold)"))
    group.add_option("--method",
                     dest="method",
                     choices=["reads", "umis"],
                     help=("Use reads or unique umi counts per cell"))
    group.add_option("--knee-method",
                     dest="knee_method",
                     choices=["distance", "density"],
                     help=("Use distance or density methods for detection of knee"))
    group.add_option("--expect-cells",
                     dest="expect_cells",
                     type="int",
                     help=("Prior expectation on the upper limit on the "
                           "number of cells sequenced"))
    group.add_option("--allow-threshold-error",
                     dest="allow_threshold_error", action="store_true",
                     help=("Don't select a threshold. Will still "
                           "output the plots if requested (--plot-prefix)"))
    group.add_option("--set-cell-number",
                     dest="cell_number",
                     type="int",
                     help=("Specify the number of cell barcodes to accept"))
    # registered on the group, like every other whitelist-specific option
    group.add_option("--ed-above-threshold",
                     dest="ed_above_threshold", type="choice",
                     choices=["discard", "correct"],
                     help=("Detect CBs above the threshold which may be "
                           "sequence errors from another CB and either "
                           "'discard' or 'correct'. Default=discard"))
    parser.add_option_group(group)

    parser.set_defaults(method="reads",
                        knee_method="distance",
                        extract_method="string",
                        whitelist_tsv=None,
                        blacklist_tsv=None,
                        error_correct_threshold=1,
                        pattern=None,
                        pattern2=None,
                        read2_in=None,
                        plot_prefix=None,
                        subset_reads=100000000,
                        expect_cells=False,
                        allow_threshold_error=False,
                        cell_number=False,
                        ed_above_threshold=None,
                        ignore_suffix=False)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = U.Start(parser, argv=argv,
                              add_extract_options=True,
                              add_group_dedup_options=False,
                              add_umi_grouping_options=False,
                              add_sam_options=False)

    if options.filtered_out and not options.extract_method == "regex":
        U.error("Reads will not be filtered unless extract method is "
                "set to regex (--extract-method=regex)")

    if options.expect_cells:
        if options.knee_method == "distance":
            U.error("Cannot use --expect-cells with 'distance' knee "
                    "method. Switch to --knee-method=density if you want to "
                    "provide an expectation for the number of "
                    "cells. Alternatively, if you know the number of cell "
                    "barcodes, use --set-cell-number")
        if options.cell_number:
            U.error("Cannot supply both --expect-cells and "
                    "--set-cell-number options")

    extract_cell, extract_umi = U.validateExtractOptions(options)

    if not extract_cell:
        if options.extract_method == "string":
            U.error("barcode pattern(s) do not include any cell bases "
                    "(marked with 'Cs') %s, %s" % (
                        options.pattern, options.pattern2))
        elif options.extract_method == "regex":
            U.error("barcode regex(es) do not include any cell groups "
                    "(starting with 'cell_') %s, %s" % (
                        options.pattern, options.pattern2))

    read1s = umi_methods.fastqIterate(options.stdin)

    # set up read extractor
    ReadExtractor = extract_methods.ExtractFilterAndUpdate(
        method=options.extract_method,
        pattern=options.pattern,
        pattern2=options.pattern2,
        prime3=options.prime3,
        extract_cell=extract_cell)

    cell_barcode_counts = collections.Counter()

    n_reads = 0
    n_cell_barcodes = 0

    # if using the umis method, need to keep a set of umis observed
    if options.method == "umis":
        cell_barcode_umis = collections.defaultdict(set)

    # variables for progress monitor
    displayMax = 100000
    U.info("Starting barcode extraction")

    # handles stay None unless the file is really opened, so that the
    # clean-up at the end never touches an undefined name
    filtered_out = None
    filtered_out2 = None

    if options.filtered_out:
        filtered_out = U.openFile(options.filtered_out, "w")

    if not options.read2_in:
        for read1 in read1s:

            # Update display in every 100kth iteration
            if n_reads % displayMax == 0:
                U.info("Parsed {} reads".format(n_reads))

            n_reads += 1
            barcode_values = ReadExtractor.getBarcodes(read1)

            if barcode_values is None:
                if filtered_out is not None:
                    filtered_out.write(str(read1) + "\n")
                continue
            else:
                cell, umi, _, _, _, _, _ = barcode_values
                if options.method == "umis":
                    cell_barcode_umis[cell].add(umi)
                else:
                    cell_barcode_counts[cell] += 1
                n_cell_barcodes += 1

            if options.subset_reads:
                if n_cell_barcodes > options.subset_reads:
                    break
    else:

        if options.filtered_out2:
            filtered_out2 = U.openFile(options.filtered_out2, "w")

        read2s = umi_methods.fastqIterate(U.openFile(options.read2_in))
        for read1, read2 in izip(read1s, read2s):

            # Update display in every 100kth iteration
            if n_reads % displayMax == 0:
                U.info("Parsed {} reads".format(n_reads))

            n_reads += 1

            barcode_values = ReadExtractor.getBarcodes(read1, read2)

            if barcode_values is None:
                if filtered_out is not None:
                    filtered_out.write(str(read1) + "\n")
                if filtered_out2 is not None:
                    filtered_out2.write(str(read2) + "\n")
                continue
            else:
                cell, umi, _, _, _, _, _ = barcode_values
                if options.method == "umis":
                    cell_barcode_umis[cell].add(umi)
                else:
                    cell_barcode_counts[cell] += 1
                n_cell_barcodes += 1

            if options.subset_reads:
                if n_reads > options.subset_reads:
                    break

    U.info("Starting - whitelist determination")

    if options.method == "umis":
        for cell in cell_barcode_umis:
            cell_barcode_counts[cell] = len(cell_barcode_umis[cell])

    if options.cell_number and options.cell_number > len(cell_barcode_counts):
        raise ValueError(
            "--set-cell-number option specifies more cell barcodes than the "
            "number of observed cell barcodes. This may be because "
            "--subset-reads was set to a value too low to capture reads from "
            "all cells. %s cell barcodes observed from %s parsed reads. "
            "Expected>= %s cell barcodes" % (
                len(cell_barcode_counts),
                options.subset_reads,
                options.cell_number))

    cell_whitelist, true_to_false_map = whitelist_methods.getCellWhitelist(
        cell_barcode_counts,
        options.knee_method,
        options.expect_cells,
        options.cell_number,
        options.error_correct_threshold,
        options.plot_prefix)

    if cell_whitelist:
        U.info("Top %s cell barcodes passed the selected threshold" %
               len(cell_whitelist))

    if options.ed_above_threshold:
        cell_whitelist, true_to_false_map = \
            whitelist_methods.errorDetectAboveThreshold(
                cell_barcode_counts,
                cell_whitelist,
                true_to_false_map,
                errors=options.error_correct_threshold,
                resolution_method=options.ed_above_threshold)

    if cell_whitelist:
        U.info("Writing out whitelist")

        total_correct_barcodes = 0
        total_corrected_barcodes = 0
        for barcode in sorted(cell_whitelist):

            total_correct_barcodes += cell_barcode_counts[barcode]

            if true_to_false_map:
                # barcodes which can be error-corrected to this barcode
                false_barcodes = sorted(true_to_false_map[barcode])
                corrected_barcodes = ",".join(false_barcodes)

                correct_barcode_counts = [cell_barcode_counts[x]
                                          for x in false_barcodes]
                total_corrected_barcodes += sum(correct_barcode_counts)

                corrected_barcode_counts = ",".join(
                    map(str, correct_barcode_counts))
            else:
                corrected_barcodes, corrected_barcode_counts = "", ""

            options.stdout.write("%s\t%s\t%s\t%s\n" % (
                barcode, corrected_barcodes, cell_barcode_counts[barcode],
                corrected_barcode_counts))
    else:
        msg = ("No local minima was accepted. Recommend checking the plot "
               "output and counts per local minima (requires `--plot-prefix` "
               "option) and then re-running with manually selected threshold "
               "(`--set-cell-number` option)")

        if options.allow_threshold_error:
            U.info(msg)
        else:
            U.error(msg)

    U.info("Parsed %i reads" % n_reads)
    U.info("%i reads matched the barcode pattern" % n_cell_barcodes)
    U.info("Found %i unique cell barcodes" % len(cell_barcode_counts))

    if cell_whitelist:
        U.info("Found %i total reads matching the selected cell barcodes" %
               total_correct_barcodes)
        U.info("Found %i total reads which can be error corrected to the "
               "selected cell barcodes" % total_corrected_barcodes)

    if filtered_out is not None:
        filtered_out.close()
    if filtered_out2 is not None:
        filtered_out2.close()

    U.Stop()
def main(argv=None):
    """Script main: assign reads in a SAM/BAM file to UMI groups.

    Parses command line options in sys.argv, unless *argv* is given.

    Reads are bundled by ``umi_methods.get_bundles`` and the UMIs of each
    bundle are clustered with ``network.ReadClusterer``.  Depending on the
    options, every read is written to a tagged BAM/SAM (``--output-bam``)
    and/or to a tab-separated read-to-group mapping (``--group-out``).

    Raises:
        ValueError: if the input is standard in, if an output file is given
            without ``--output-bam``, or if the UMI extraction method is
            unknown.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-i", "--in-sam", dest="in_sam", action="store_true",
                      help="Input file is in sam format [default=%default]",
                      default=False)
    parser.add_option("-o", "--out-sam", dest="out_sam", action="store_true",
                      help="Output alignments in sam format "
                      "[default=%default]",
                      default=False)
    parser.add_option("--umi-separator", dest="umi_sep", type="string",
                      help="separator between read id and UMI",
                      default="_")
    parser.add_option("--umi-tag", dest="umi_tag", type="string",
                      help="tag containing umi", default='RX')
    parser.add_option("--umi-group-tag", dest="umi_group_tag", type="string",
                      help="tag for the outputted umi group", default='BX')
    parser.add_option("--extract-umi-method", dest="get_umi_method",
                      type="choice", choices=("read_id", "tag"),
                      default="read_id",
                      help="where is the read UMI encoded? "
                      "[default=%default]")
    parser.add_option("--subset", dest="subset", type="float",
                      help="Use only a fraction of reads, specified by subset",
                      default=None)
    parser.add_option("--spliced-is-unique", dest="spliced",
                      action="store_true",
                      help="Treat a spliced read as different to an unspliced"
                      " one [default=%default]",
                      default=False)
    # NOTE: adjacent string literals are concatenated verbatim, so every
    # fragment below carries its own separating space.
    parser.add_option("--soft-clip-threshold", dest="soft", type="float",
                      help="number of bases clipped from 5' end before "
                      "read is counted as spliced [default=%default]",
                      default=4)
    parser.add_option("--edit-distance-threshold", dest="threshold",
                      type="int", default=1,
                      help="Edit distance threshold at which to join two "
                      "UMIs when clustering. [default=%default]")
    parser.add_option("--chrom", dest="chrom", type="string",
                      help="Restrict to one chromosome", default=None)
    parser.add_option("--paired", dest="paired", action="store_true",
                      default=False,
                      help="paired BAM. [default=%default]")
    parser.add_option("--method", dest="method", type="choice",
                      choices=("adjacency", "directional",
                               "unique", "cluster"),
                      default="directional",
                      help="method to use for umi deduping "
                      "[default=%default]")
    parser.add_option("--per-contig", dest="per_contig",
                      action="store_true", default=False,
                      help=("dedup per contig,"
                            " e.g for transcriptome where contig = gene"))
    parser.add_option("--whole-contig", dest="whole_contig",
                      action="store_true", default=False,
                      help="Read whole contig before outputting bundles: "
                      "guarantees that no reads "
                      "are missed, but increases memory usage")
    parser.add_option("--read-length", dest="read_length",
                      action="store_true", default=False,
                      help=("use read length in addition to position and UMI "
                            "to identify possible duplicates "
                            "[default=%default]"))
    parser.add_option("--mapping-quality", dest="mapping_quality", type="int",
                      help="Minimum mapping quality for a read to be retained"
                      " [default=%default]",
                      default=0)
    parser.add_option("--group-out", dest="tsv", type="string",
                      help="Outfile name for file mapping read id to "
                      "read group",
                      default=None)
    parser.add_option("--output-bam", dest="output_bam",
                      action="store_true", default=False,
                      help=("output a bam file with read groups tagged using "
                            "the UG tag [default=%default]"))

    # add common options (-h/--help, ...) and parse command line
    (options, args) = U.Start(parser, argv=argv)

    # pysam opens the files by name, so the handles opened by U.Start are
    # closed and only their names are kept
    if options.stdin != sys.stdin:
        in_name = options.stdin.name
        options.stdin.close()
    else:
        raise ValueError("Input on standard in not currently supported")

    if options.stdout != sys.stdout:
        out_name = options.stdout.name
        options.stdout.close()
        # explicit raise rather than assert: asserts vanish under ``-O``
        if not options.output_bam:
            raise ValueError(
                "To output a bam you must include --output-bam option")
    else:
        out_name = "-"

    in_mode = "r" if options.in_sam else "rb"
    out_mode = "w" if options.out_sam else "wb"

    infile = pysam.Samfile(in_name, in_mode)

    if options.output_bam:
        outfile = pysam.Samfile(out_name, out_mode, template=infile)
        if options.paired:
            outfile = umi_methods.TwoPassPairWriter(infile, outfile,
                                                    tags=True)
    else:
        outfile = None

    if options.tsv:
        mapping_outfile = U.openFile(options.tsv, "w")
        mapping_outfile.write(
            "read_id\tcontig\tposition\tumi\tumi_count"
            "\tfinal_umi\tfinal_umi_count\tunique_id\n")

    # set the method with which to extract umis from reads
    if options.get_umi_method == "read_id":
        umi_getter = partial(umi_methods.get_umi_read_id,
                             sep=options.umi_sep)
    elif options.get_umi_method == "tag":
        umi_getter = partial(umi_methods.get_umi_tag, tag=options.umi_tag)
    else:
        raise ValueError("Unknown umi extraction method")

    nInput, nOutput, unique_id = 0, 0, 0

    read_events = collections.Counter()

    for bundle, read_events in umi_methods.get_bundles(
            infile,
            read_events,
            ignore_umi=False,
            subset=options.subset,
            quality_threshold=options.mapping_quality,
            paired=options.paired,
            chrom=options.chrom,
            spliced=options.spliced,
            soft_clip_threshold=options.soft,
            per_contig=options.per_contig,
            whole_contig=options.whole_contig,
            read_length=options.read_length,
            umi_getter=umi_getter,
            all_reads=True):

        nInput += sum(bundle[umi]["count"] for umi in bundle)

        if nOutput % 10000 == 0:
            U.debug("Outputted %i" % nOutput)

        if nInput % 1000000 == 0:
            U.debug("Read %i input reads" % nInput)

        # set up ReadCluster functor with methods specific to
        # specified options.method
        processor = network.ReadClusterer(options.method)

        bundle, groups, counts = processor(
            bundle=bundle,
            threshold=options.threshold,
            stats=True,
            deduplicate=False)

        for umi_group in groups:
            # the first UMI of the group is reported as the group's
            # final UMI
            top_umi = umi_group[0]
            group_count = sum(counts[umi] for umi in umi_group)

            for umi in umi_group:
                for read in bundle[umi]['read']:
                    if outfile:
                        if options.paired:
                            # if paired, we need to supply the tags to
                            # add to the paired read
                            outfile.write(read, unique_id, top_umi)
                        else:
                            # Add the 'UG' tag to the read
                            read.tags += [('UG', unique_id)]
                            read.tags += [(options.umi_group_tag, top_umi)]
                            outfile.write(read)

                    if options.tsv:
                        mapping_outfile.write("%s\n" % "\t".join(
                            map(str, (
                                read.query_name,
                                read.reference_name,
                                umi_methods.get_read_position(
                                    read, options.soft)[1],
                                umi.decode(),
                                counts[umi],
                                top_umi.decode(),
                                group_count,
                                unique_id))))

                    nOutput += 1

            unique_id += 1

    if outfile:
        outfile.close()

    if options.tsv:
        mapping_outfile.close()

    # write footer and output benchmark information.
    U.info("Reads: %s" % ", ".join(
        "%s: %s" % (x[0], x[1]) for x in read_events.most_common()))
    U.info("Number of reads out: %i, Number of groups: %i" %
           (nOutput, unique_id))
    U.Stop()
def main(argv=None):
    """Script main: count the UMI groups per gene (and per cell).

    Parses command line options in sys.argv, unless *argv* is given.

    Per-bundle counts are first written to a temporary file and then
    copied to ``options.stdout``; with ``options.per_cell`` they are
    re-read and emitted in long format or, with
    ``--wide-format-cell-counts``, as a gene-by-cell table.

    Raises:
        ValueError: if the input is standard in.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    group = U.OptionGroup(parser, "count-specific options")

    # registered on the group (not the parser) so that the
    # "count-specific options" section of --help is not empty
    group.add_option("--wide-format-cell-counts",
                     dest="wide_format_cell_counts",
                     action="store_true", default=False,
                     help=("output the cell counts in a wide format "
                           "(rows=genes, columns=cells)"))

    parser.add_option_group(group)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = U.Start(parser, argv=argv,
                              add_group_dedup_options=False)

    options.per_gene = True  # hardcodes counting to per-gene only

    U.validateSamOptions(options, group=False)

    if options.random_seed:
        np.random.seed(options.random_seed)

    if options.stdin != sys.stdin:
        in_name = options.stdin.name
        options.stdin.close()
    else:
        raise ValueError("Input on standard in not currently supported")

    in_mode = "r" if options.in_sam else "rb"

    infile = pysam.Samfile(in_name, in_mode)

    nInput, nOutput, input_reads = 0, 0, 0

    metacontig2contig = None

    if options.chrom:
        inreads = infile.fetch(reference=options.chrom)
    elif options.gene_transcript_map:
        metacontig2contig = umi_methods.getMetaContig2contig(
            infile, options.gene_transcript_map)
        metatag = "MC"
        inreads = umi_methods.metafetcher(infile, metacontig2contig, metatag)
    else:
        inreads = infile.fetch()

    bundle_iterator = umi_methods.get_bundles(
        options,
        only_count_reads=True,
        metacontig_contig=metacontig2contig)

    # write out to tempfile and then sort to stdout
    tmpfilename = U.getTempFilename(dir=options.tmpdir)

    try:
        with U.openFile(tmpfilename, mode="w") as tmpfile:
            for bundle, key, status in bundle_iterator(inreads):
                if status == "single_read":
                    continue

                gene, cell = key

                umis = bundle.keys()
                counts = {umi: bundle[umi]["count"] for umi in umis}

                nInput += sum(counts.values())

                # report progress once per million parsed reads
                while nInput >= input_reads + 1000000:
                    input_reads += 1000000
                    U.info("Parsed %i input reads" % input_reads)

                # set up UMIClusterer functor with methods specific to
                # specified options.method
                processor = network.UMIClusterer(options.method)

                # group the umis
                groups = processor(umis, counts, threshold=options.threshold)

                gene_count = len(groups)
                if options.per_cell:
                    tmpfile.write("%s\n" % "\t".join(
                        (gene, cell.decode(), str(gene_count))))
                else:
                    tmpfile.write("%s\n" % "\t".join(
                        (gene, str(gene_count))))

                nOutput += gene_count

        if options.per_cell:
            gene_counts_dict = {}
            genes = set()
            cells = set()

            with U.openFile(tmpfilename, mode="r") as inf:
                for line in inf:
                    gene, cell, gene_count = line.strip().split("\t")
                    genes.add(gene)
                    cells.add(cell)
                    gene_counts_dict.setdefault(gene, {})[cell] = gene_count

            if options.wide_format_cell_counts:
                # write out in wide format; the column order is the same
                # for every row, so it is computed once
                cell_columns = sorted(cells)
                options.stdout.write(
                    "%s\t%s\n" % ("gene", "\t".join(cell_columns)))
                for gene in sorted(genes):
                    gene_cells = gene_counts_dict[gene]
                    # cells without an entry for this gene are reported as 0
                    row = [gene_cells.get(cell, 0) for cell in cell_columns]
                    options.stdout.write(
                        "%s\t%s\n" % (gene, "\t".join(map(str, row))))
            else:
                # write out in long format
                options.stdout.write(
                    "%s\t%s\t%s\n" % ("gene", "cell", "count"))
                for gene in sorted(genes):
                    gene_cells = gene_counts_dict[gene]
                    for cell in sorted(gene_cells):
                        options.stdout.write(
                            "%s\t%s\t%s\n" % (gene, cell, gene_cells[cell]))
        else:
            options.stdout.write("%s\t%s\n" % ("gene", "count"))
            with U.openFile(tmpfilename, mode="r") as inf:
                for line in inf:
                    options.stdout.write(line)
    finally:
        # never leave the temporary file behind, even on failure
        if os.path.exists(tmpfilename):
            os.unlink(tmpfilename)

    # output reads events and benchmark information.
    for event in bundle_iterator.read_events.most_common():
        U.info("%s: %s" % (event[0], event[1]))

    U.info("Number of (post deduplication) reads counted: %i" % nOutput)

    U.Stop()
def main(argv=None):
    """Script main: extract UMI/cell barcodes from fastq reads.

    Parses command line options in sys.argv, unless *argv* is given.

    Reads are taken from ``options.stdin`` (and ``options.read2_in`` for
    paired input), passed through an
    ``extract_methods.ExtractFilterAndUpdate`` instance and the processed
    reads are written to ``options.stdout`` / ``--read2-out``.  Reads
    rejected by the extractor are optionally written to the
    ``filtered_out`` / ``filtered_out2`` files.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=usage,
                            description=globals()["__doc__"])

    group = U.OptionGroup(parser, "extract-specific options")

    # (Experimental option) Retain the UMI in the sequence read
    group.add_option("--retain-umi", dest="retain_umi", action="store_true",
                     help=optparse.SUPPRESS_HELP)
    group.add_option("--read2-out", dest="read2_out", type="string",
                     help="file to output processed paired read to")
    group.add_option("--read2-stdout", dest="read2_stdout",
                     action="store_true",
                     help="Paired reads, send read2 to stdout, discarding "
                     "read1")
    group.add_option("--quality-filter-threshold",
                     dest="quality_filter_threshold", type="int",
                     help=("Remove reads where any UMI base quality score "
                           "falls below this threshold"))
    group.add_option("--quality-filter-mask",
                     dest="quality_filter_mask", type="int",
                     help=("If a UMI base has a quality below this threshold, "
                           "replace the base with 'N'"))
    # NOTE: adjacent string literals are concatenated verbatim, so every
    # fragment in the help/error texts carries its own separating space.
    group.add_option("--quality-encoding",
                     dest="quality_encoding", type="choice",
                     choices=["phred33", "phred64", "solexa"],
                     help=("Quality score encoding. Choose from 'phred33' "
                           "[33-77] 'phred64' [64-106] or 'solexa' [59-106]"))
    group.add_option("--filter-cell-barcode", dest="filter_cell_barcode",
                     action="store_true",
                     help=optparse.SUPPRESS_HELP)
    group.add_option("--error-correct-cell", dest="error_correct_cell",
                     action="store_true",
                     help=("Correct errors in the cell barcode"))
    group.add_option("--whitelist", dest="whitelist", type="string",
                     help=("A whitelist of accepted cell barcodes"))
    group.add_option("--blacklist", dest="blacklist", type="string",
                     help=("A blacklist of rejected cell barcodes"))
    # The UMI filtering/correction options below are hidden from --help.
    # hidden: filter the UMIs
    group.add_option("--filter-umi", dest="filter_umi", action="store_true",
                     help=optparse.SUPPRESS_HELP)
    # hidden: a whitelist of accepted UMIs
    group.add_option("--umi-whitelist", dest="umi_whitelist", type="string",
                     default=None,
                     help=optparse.SUPPRESS_HELP)
    # hidden: a whitelist of accepted UMIs for read2
    group.add_option("--umi-whitelist-paired", dest="umi_whitelist_paired",
                     type="string", default=None,
                     help=optparse.SUPPRESS_HELP)
    # hidden: correct errors in UMIs to the whitelist(s) provided if
    # within threshold
    group.add_option("--correct-umi-threshold", dest="correct_umi_threshold",
                     type="int", default=0,
                     help=optparse.SUPPRESS_HELP)
    # hidden: file logging UMI error correction
    group.add_option("--umi-correct-log", dest="umi_correct_log",
                     type="string", default=None,
                     help=optparse.SUPPRESS_HELP)
    group.add_option("--subset-reads", "--reads-subset",
                     dest="reads_subset", type="int",
                     help=("Only extract from the first N reads. If N is "
                           "greater than the number of reads, all reads will "
                           "be used"))
    group.add_option("--reconcile-pairs", dest="reconcile",
                     action="store_true",
                     help=("Allow the presences of reads in read2 input that "
                           "are not present in read1 input. This allows cell "
                           "barcode filtering of read1s without "
                           "considering read2s"))
    parser.add_option_group(group)

    group = U.OptionGroup(parser, "[EXPERIMENTAl] barcode extraction options")

    group.add_option("--either-read", dest="either_read",
                     action="store_true",
                     help="UMI may be on either read (see "
                     "--either-read-resolve) for options to resolve cases "
                     "where UMI is on both reads")
    group.add_option("--either-read-resolve", dest="either_read_resolve",
                     type="choice", choices=["discard", "quality"],
                     help=("How to resolve instances where both reads "
                           "contain a UMI but using --either-read. "
                           "Choose from 'discard' or 'quality' "
                           "(use highest quality). default=discard"))
    parser.add_option_group(group)

    # ``filter_cell_barcode`` must match the dest of --filter-cell-barcode,
    # otherwise the default is silently attached to an unused attribute
    parser.set_defaults(extract_method="string",
                        filter_cell_barcode=False,
                        whitelist=None,
                        blacklist=None,
                        error_correct_cell=False,
                        pattern=None,
                        pattern2=None,
                        read2_in=None,
                        read2_out=False,
                        read2_stdout=False,
                        quality_filter_threshold=None,
                        quality_encoding=None,
                        reconcile=False,
                        either_read=False,
                        either_read_resolve="discard",
                        ignore_suffix=False)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = U.Start(parser, argv=argv,
                              add_extract_options=True,
                              add_group_dedup_options=False,
                              add_umi_grouping_options=False,
                              add_sam_options=False)

    if options.filter_cell_barcode:
        U.info('Use of --whitelist ensures cell barcodes are filtered. '
               '--filter-cell-barcode is no longer required and may be '
               'removed in future versions.')

    if options.whitelist is not None:
        options.filter_cell_barcode = True

    if options.retain_umi and not options.extract_method == "regex":
        U.error("option --retain-umi only works with --extract-method=regex")

    if (options.filtered_out and
            not options.extract_method == "regex" and
            options.whitelist is None):
        U.error("Reads will not be filtered unless extract method is "
                "set to regex (--extract-method=regex) or cell "
                "barcodes are filtered (--whitelist)")

    if options.quality_filter_threshold or options.quality_filter_mask:
        if not options.quality_encoding:
            U.error("must provide a quality encoding (--quality-"
                    "encoding) to filter UMIs by quality (--quality"
                    "-filter-threshold) or mask low quality bases "
                    "with (--quality-filter-mask)")

    extract_cell, extract_umi = U.validateExtractOptions(options)

    if options.either_read:
        if extract_cell:
            U.error("Option to extract from either read (--either-read) "
                    "is not currently compatible with cell barcode "
                    "extraction")
        if not options.extract_method == "regex":
            U.error("Option to extract from either read (--either-read) "
                    "requires --extract-method=regex")
        if not options.pattern or not options.pattern2:
            U.error("Option to extract from either read (--either-read) "
                    "requires --bc-pattern=[PATTERN1] and "
                    "--bc-pattern2=[PATTERN2]")

    if options.filter_umi:
        if not options.umi_whitelist:
            U.error("must provide a UMI whitelist (--umi-whitelist) if using "
                    "--filter-umi option")
        if options.pattern2 and not options.umi_whitelist_paired:
            U.error("must provide a UMI whitelist for paired end "
                    "(--umi-whitelist-paired) if using --filter-umi option "
                    "with paired end data")

    if not extract_umi:
        if options.extract_method == "string":
            U.error("barcode pattern(s) do not include any umi bases "
                    "(marked with 'Ns') %s, %s" % (
                        options.pattern, options.pattern2))
        elif options.extract_method == "regex":
            U.error("barcode regex(es) do not include any umi groups "
                    "(starting with 'umi_') %s, %s" % (
                        options.pattern, options.pattern2))

    if options.whitelist:
        if not extract_cell:
            if options.extract_method == "string":
                U.error("barcode pattern(s) do not include any cell bases "
                        "(marked with 'Cs') %s, %s" % (
                            options.pattern, options.pattern2))
            elif options.extract_method == "regex":
                U.error("barcode regex(es) do not include any cell groups "
                        "(starting with 'cell_') %s, %s" % (
                            options.pattern, options.pattern2))

    read1s = umi_methods.fastqIterate(options.stdin)

    # set up read extractor
    ReadExtractor = extract_methods.ExtractFilterAndUpdate(
        options.extract_method,
        options.pattern,
        options.pattern2,
        options.prime3,
        extract_cell,
        options.quality_encoding,
        options.quality_filter_threshold,
        options.quality_filter_mask,
        options.filter_umi,
        options.filter_cell_barcode,
        options.retain_umi,
        options.either_read,
        options.either_read_resolve)

    if options.filter_umi:
        umi_whitelist, false_to_true_map = \
            whitelist_methods.getUserDefinedBarcodes(
                options.umi_whitelist,
                options.umi_whitelist_paired,
                deriveErrorCorrection=True,
                threshold=options.correct_umi_threshold)

        U.info("Length of whitelist: %i" % len(umi_whitelist))
        U.info("Length of 'correctable' whitelist: %i" %
               len(false_to_true_map))

        ReadExtractor.umi_whitelist = umi_whitelist
        ReadExtractor.umi_false_to_true_map = false_to_true_map
        ReadExtractor.umi_whitelist_counts = collections.defaultdict(
            lambda: collections.Counter())

    if options.whitelist:
        cell_whitelist, false_to_true_map = \
            whitelist_methods.getUserDefinedBarcodes(
                options.whitelist,
                getErrorCorrection=options.error_correct_cell)

        ReadExtractor.cell_whitelist = cell_whitelist
        ReadExtractor.false_to_true_map = false_to_true_map

    if options.blacklist:
        blacklist = set()
        with U.openFile(options.blacklist, "r") as inf:
            for line in inf:
                # only the first tab-separated column is the barcode
                blacklist.add(line.strip().split("\t")[0])
        ReadExtractor.cell_blacklist = blacklist

    # variables for progress monitor
    progCount = 0
    displayMax = 100000

    U.info("Starting barcode extraction")

    # Output handles start as None so the clean-up below never touches a
    # handle that was not opened (read2_out / filtered_out2 are only opened
    # for paired input).
    filtered_out = None
    filtered_out2 = None
    read2_out = None

    if options.filtered_out:
        filtered_out = U.openFile(options.filtered_out, "w")

    if options.read2_in is None:
        for read in read1s:
            # incrementing count for monitoring progress
            progCount += 1

            # Update display in every 100kth iteration
            if progCount % displayMax == 0:
                U.info("Parsed {} reads".format(progCount))

            new_read = ReadExtractor(read)

            if options.reads_subset:
                if (ReadExtractor.read_counts['Input Reads'] >
                        options.reads_subset):
                    break

            if not new_read:
                if options.filtered_out:
                    filtered_out.write(str(read) + "\n")
                continue

            options.stdout.write(str(new_read) + "\n")

    else:
        if options.filtered_out2:
            filtered_out2 = U.openFile(options.filtered_out2, "w")

        read2s = umi_methods.fastqIterate(U.openFile(options.read2_in))

        if options.read2_out:
            read2_out = U.openFile(options.read2_out, "w")

        # with --reconcile-pairs, read2s without a matching read1 are allowed
        strict = not options.reconcile

        for read1, read2 in umi_methods.joinedFastqIterate(
                read1s, read2s, strict, options.ignore_suffix):

            # incrementing count for monitoring progress
            progCount += 1

            # Update display in every 100kth iteration
            if progCount % displayMax == 0:
                U.info("Parsed {} reads".format(progCount))
                sys.stdout.flush()

            reads = ReadExtractor(read1, read2)

            if options.reads_subset:
                if (ReadExtractor.read_counts['Input Reads'] >
                        options.reads_subset):
                    break

            if not reads:
                if options.filtered_out:
                    filtered_out.write(str(read1) + "\n")
                if options.filtered_out2:
                    filtered_out2.write(str(read2) + "\n")
                continue

            new_read1, new_read2 = reads

            if options.read2_stdout:
                options.stdout.write(str(new_read2) + "\n")
            else:
                options.stdout.write(str(new_read1) + "\n")

                if options.read2_out:
                    read2_out.write(str(new_read2) + "\n")

    for handle in (read2_out, filtered_out, filtered_out2):
        if handle is not None:
            handle.close()

    for k, v in ReadExtractor.getReadCounts().most_common():
        U.info("%s: %s" % (k, v))

    if options.umi_correct_log:
        # the ``with`` block closes the log file
        with U.openFile(options.umi_correct_log, "w") as outf:
            outf.write("umi\tcount_no_errors\tcount_errors\n")
            for umi, counts in ReadExtractor.umi_whitelist_counts.items():
                outf.write("%s\t%i\t%i\n" % (
                    umi, counts["no_error"], counts["error"]))

    U.Stop()
def main(argv=None):
    """Script main: assign reads in a SAM/BAM file to UMI groups.

    Parses command line options in sys.argv, unless *argv* is given.

    Reads are bundled by ``umi_methods.get_bundles`` and the UMIs of each
    bundle are clustered with ``network.UMIClusterer``.  Depending on the
    options, every read is written to a tagged BAM/SAM (``--output-bam``)
    and/or to a tab-separated read-to-group mapping (``--group-out``).

    Raises:
        ValueError: if the input is standard in, if an output file is given
            without ``--output-bam``, if ``--per-gene`` is used without
            ``--gene-transcript-map``, or if the UMI extraction method is
            unknown.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-i", "--in-sam", dest="in_sam", action="store_true",
                      help="Input file is in sam format [default=%default]",
                      default=False)
    parser.add_option("-o", "--out-sam", dest="out_sam", action="store_true",
                      help="Output alignments in sam format "
                      "[default=%default]",
                      default=False)
    parser.add_option("--umi-separator", dest="umi_sep", type="string",
                      help="separator between read id and UMI",
                      default="_")
    parser.add_option("--umi-tag", dest="umi_tag", type="string",
                      help="tag containing umi", default='RX')
    parser.add_option("--umi-group-tag", dest="umi_group_tag", type="string",
                      help="tag for the outputted umi group", default='BX')
    parser.add_option("--extract-umi-method", dest="get_umi_method",
                      type="choice", choices=("read_id", "tag"),
                      default="read_id",
                      help="where is the read UMI encoded? "
                      "[default=%default]")
    parser.add_option("--subset", dest="subset", type="float",
                      help="Use only a fraction of reads, specified by subset",
                      default=None)
    parser.add_option("--spliced-is-unique", dest="spliced",
                      action="store_true",
                      help="Treat a spliced read as different to an unspliced"
                      " one [default=%default]",
                      default=False)
    # NOTE: adjacent string literals are concatenated verbatim, so every
    # fragment below carries its own separating space.
    parser.add_option("--soft-clip-threshold", dest="soft", type="float",
                      help="number of bases clipped from 5' end before "
                      "read is counted as spliced [default=%default]",
                      default=4)
    parser.add_option("--edit-distance-threshold", dest="threshold",
                      type="int", default=1,
                      help="Edit distance threshold at which to join two "
                      "UMIs when clustering. [default=%default]")
    parser.add_option("--chrom", dest="chrom", type="string",
                      help="Restrict to one chromosome", default=None)
    parser.add_option("--paired", dest="paired", action="store_true",
                      default=False,
                      help="paired BAM. [default=%default]")
    parser.add_option("--method", dest="method", type="choice",
                      choices=("adjacency", "directional",
                               "unique", "cluster"),
                      default="directional",
                      help="method to use for umi deduping "
                      "[default=%default]")
    parser.add_option("--per-contig", dest="per_contig",
                      action="store_true", default=False,
                      help=("dedup per contig (field 3 in BAM; RNAME),"
                            " e.g for transcriptome where contig = gene"))
    parser.add_option("--per-gene", dest="per_gene",
                      action="store_true", default=False,
                      help=("Deduplicate per gene, "
                            "e.g for transcriptome where contig = "
                            "transcript; must also provide a transcript to "
                            "gene map with "
                            "--gene-transcript-map [default=%default]"))
    parser.add_option("--gene-transcript-map", dest="gene_transcript_map",
                      type="string",
                      help="file mapping transcripts to genes "
                      "(tab separated)",
                      default=None)
    parser.add_option("--gene-tag", dest="gene_tag", type="string",
                      help=("Deduplicate per gene where gene is "
                            "defined by this bam tag [default=%default]"),
                      default=None)
    parser.add_option("--read-length", dest="read_length",
                      action="store_true", default=False,
                      help=("use read length in addition to position and UMI "
                            "to identify possible duplicates "
                            "[default=%default]"))
    parser.add_option("--mapping-quality", dest="mapping_quality", type="int",
                      help="Minimum mapping quality for a read to be retained"
                      " [default=%default]",
                      default=0)
    parser.add_option("--output-unmapped", dest="output_unmapped",
                      action="store_true", default=False,
                      help=("Retain all unmapped reads in output "
                            "[default=%default]"))
    parser.add_option("--group-out", dest="tsv", type="string",
                      help="Outfile name for file mapping read id to "
                      "read group",
                      default=None)
    parser.add_option("--output-bam", dest="output_bam",
                      action="store_true", default=False,
                      help=("output a bam file with read groups tagged using "
                            "the UG tag [default=%default]"))
    # The default is an alternation: skip tags starting with "__" or
    # "Unassigned".  (A character class ``[...]`` here would instead match
    # any tag beginning with one of the listed characters.)
    parser.add_option("--skip-tags-regex", dest="skip_regex", type="string",
                      help=("Used with --gene-tag. "
                            "Ignore reads where the gene-tag matches this "
                            "regex"),
                      default="^(__|Unassigned)")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = U.Start(parser, argv=argv)

    # pysam opens the files by name, so the handles opened by U.Start are
    # closed and only their names are kept
    if options.stdin != sys.stdin:
        in_name = options.stdin.name
        options.stdin.close()
    else:
        raise ValueError("Input on standard in not currently supported")

    if options.stdout != sys.stdout:
        out_name = options.stdout.name
        options.stdout.close()
        # explicit raise rather than assert: asserts vanish under ``-O``
        if not options.output_bam:
            raise ValueError(
                "To output a bam you must include --output-bam option")
    else:
        out_name = "-"

    in_mode = "r" if options.in_sam else "rb"
    out_mode = "wh" if options.out_sam else "wb"

    if options.per_gene:
        if not options.gene_transcript_map:
            raise ValueError(
                "--per-gene option requires --gene-transcript-map")

    infile = pysam.Samfile(in_name, in_mode)

    if options.output_bam:
        outfile = pysam.Samfile(out_name, out_mode, template=infile)
    else:
        outfile = None

    if options.tsv:
        mapping_outfile = U.openFile(options.tsv, "w")
        mapping_outfile.write("%s\n" % "\t".join([
            "read_id", "contig", "position", "gene", "umi", "umi_count",
            "final_umi", "final_umi_count", "unique_id"]))

    # set the method with which to extract umis from reads
    if options.get_umi_method == "read_id":
        umi_getter = partial(umi_methods.get_umi_read_id,
                             sep=options.umi_sep)
    elif options.get_umi_method == "tag":
        umi_getter = partial(umi_methods.get_umi_tag, tag=options.umi_tag)
    else:
        raise ValueError("Unknown umi extraction method")

    nInput, nOutput, unique_id = 0, 0, 0

    if options.chrom:
        inreads = infile.fetch(reference=options.chrom)
        gene_tag = options.gene_tag
    elif options.per_gene and options.gene_transcript_map:
        metacontig2contig = umi_methods.getMetaContig2contig(
            infile, options.gene_transcript_map)
        metatag = "MC"
        inreads = umi_methods.metafetcher(infile, metacontig2contig, metatag)
        gene_tag = metatag
    else:
        inreads = infile.fetch(until_eof=options.output_unmapped)
        gene_tag = options.gene_tag

    # pre-bound so the summary below still works when no bundle is yielded
    read_events = collections.Counter()

    for bundle, read_events, status in umi_methods.get_bundles(
            inreads,
            ignore_umi=False,
            subset=options.subset,
            quality_threshold=options.mapping_quality,
            paired=options.paired,
            spliced=options.spliced,
            soft_clip_threshold=options.soft,
            per_contig=options.per_contig,
            gene_tag=gene_tag,
            skip_regex=options.skip_regex,
            read_length=options.read_length,
            umi_getter=umi_getter,
            all_reads=True,
            return_read2=True,
            return_unmapped=options.output_unmapped):

        # write out read2s and unmapped if option set
        if status == 'single_read':
            # bundle is just a single read here; there is nothing to write
            # to when no BAM output was requested
            if outfile:
                outfile.write(bundle)
            nInput += 1
            nOutput += 1
            continue

        umis = bundle.keys()
        counts = {umi: bundle[umi]["count"] for umi in umis}

        nInput += sum(counts.values())

        if nOutput % 10000 == 0:
            U.debug("Outputted %i" % nOutput)

        if nInput % 1000000 == 0:
            U.debug("Read %i input reads" % nInput)

        # set up UMIClusterer functor with methods specific to
        # specified options.method
        processor = network.UMIClusterer(options.method)

        # group the umis
        groups = processor(umis, counts, threshold=options.threshold)

        for umi_group in groups:
            # the first UMI of the group is reported as the group's
            # final UMI
            top_umi = umi_group[0]
            group_count = sum(counts[umi] for umi in umi_group)

            for umi in umi_group:
                for read in bundle[umi]['read']:
                    if outfile:
                        # Add the 'UG' tag to the read
                        read.tags += [('UG', unique_id)]
                        read.tags += [(options.umi_group_tag, top_umi)]
                        outfile.write(read)

                    if options.tsv:
                        if options.per_gene:
                            gene = read.get_tag(gene_tag)
                        else:
                            gene = "NA"
                        mapping_outfile.write("%s\n" % "\t".join(
                            map(str, (
                                read.query_name,
                                read.reference_name,
                                umi_methods.get_read_position(
                                    read, options.soft)[1],
                                gene,
                                umi.decode(),
                                counts[umi],
                                top_umi.decode(),
                                group_count,
                                unique_id))))

                    nOutput += 1

            unique_id += 1

    if outfile:
        outfile.close()

    if options.tsv:
        mapping_outfile.close()

    # write footer and output benchmark information.
    U.info("Reads: %s" % ", ".join(
        "%s: %s" % (x[0], x[1]) for x in read_events.most_common()))
    U.info("Number of reads out: %i, Number of groups: %i" %
           (nOutput, unique_id))
    U.Stop()
def getKneeEstimateDistance(cell_barcode_counts,
                            cell_number=False,
                            plotfile_prefix=None):
    ''' estimate the number of "true" cell barcodes via a knee method
    which finds the point with maximum distance

    input:
         cell_barcode_counts = collections.Counter-like mapping
                               (key = barcode, value = count); must
                               provide .most_common() and .items()
         cell_number (optional) = define number of cell barcodes to accept.
                                  If this is not smaller than the number
                                  of observed barcodes, all barcodes are
                                  accepted
         plotfile_prefix = (optional) prefix for plots

    returns:
         The accepted barcodes: a list (ordered by decreasing count)
         when the knee is used, a set when cell_number is given

    raises:
         ValueError if the first knee estimate falls on the very first
         barcode (e.g. a single barcode or a degenerate curve)
    '''

    def getKneeDistance(values):
        '''
        This function is based on
        https://stackoverflow.com/questions/2018178/finding-the-best-trade-off-point-on-a-curve

        and https://dataplatform.cloud.ibm.com/analytics/notebooks/54d79c2a-f155-40ec-93ec-ed05b58afa39/view?access_token=6d8ec910cf2a1b3901c721fcb94638563cd646fe14400fecbb76cea6aaae2fb1

        The idea is to draw a line from the first to last point on the
        cumulative counts curve and then find the point on the curve
        which is the maximum distance away from this line

        returns: (distance of every point to the line,
                  index of the point with the largest distance)
        '''

        # get coordinates of all the points. Use floats so that squaring
        # large cumulative counts below cannot overflow a fixed-width
        # integer dtype
        nPoints = len(values)
        allCoord = np.vstack((range(nPoints), values)).T.astype(np.float64)

        # get the first point
        firstPoint = allCoord[0]

        # get vector between first and last point - this is the line
        lineVec = allCoord[-1] - allCoord[0]
        lineVecNorm = lineVec / np.sqrt(np.sum(lineVec**2))

        # find the distance from each point to the line:
        # vector between all points and first point
        vecFromFirst = allCoord - firstPoint

        # To calculate the distance to the line, we split vecFromFirst
        # into two components, one that is parallel to the line and one
        # that is perpendicular. Then, we take the norm of the part that
        # is perpendicular to the line and get the distance.
        # We find the vector parallel to the line by projecting
        # vecFromFirst onto the line. The perpendicular vector is
        # vecFromFirst - vecFromFirstParallel.
        # We project vecFromFirst by taking the scalar product of the
        # vector with the unit vector that points in the direction of
        # the line (this gives us the length of the projection of
        # vecFromFirst onto the line). If we multiply the scalar product
        # by the unit vector, we have vecFromFirstParallel.
        # lineVecNorm broadcasts across the rows of vecFromFirst, so no
        # explicit tiling of the unit vector is needed
        scalarProduct = np.sum(vecFromFirst * lineVecNorm, axis=1)
        vecFromFirstParallel = np.outer(scalarProduct, lineVecNorm)
        vecToLine = vecFromFirst - vecFromFirstParallel

        # distance to line is the norm of vecToLine
        distToLine = np.sqrt(np.sum(vecToLine**2, axis=1))

        # knee/elbow is the point with max distance value
        idxOfBestPoint = np.argmax(distToLine)

        return (distToLine, idxOfBestPoint)

    # barcodes ranked by decreasing count; computed once and reused
    ranked_barcodes = cell_barcode_counts.most_common()
    counts = [x[1] for x in ranked_barcodes]
    values = list(np.cumsum(counts))

    # We need to perform the distance knee iteratively with reduced
    # number of CBs since it's sensitive to the number of CBs input
    # and overestimates if too many CBs are used
    previous_idxOfBestPoint = 0
    distToLine, idxOfBestPoint = getKneeDistance(values)
    if idxOfBestPoint == 0:
        raise ValueError("Something's gone wrong here!!")

    max_iterations = 100
    iterations = 0
    while idxOfBestPoint != previous_idxOfBestPoint:
        previous_idxOfBestPoint = idxOfBestPoint
        iterations += 1
        if iterations > max_iterations:
            break
        new_distToLine, new_idxOfBestPoint = getKneeDistance(
            values[:idxOfBestPoint * 3])
        if new_idxOfBestPoint == 0:
            # A re-estimate of 0 would make the next window empty
            # (values[:0]) and crash; keep the last usable estimate
            break
        distToLine, idxOfBestPoint = new_distToLine, new_idxOfBestPoint

    knee_final_barcodes = [x[0] for x in ranked_barcodes[:idxOfBestPoint + 1]]

    if cell_number:
        if cell_number < len(counts):
            # accept every barcode strictly more abundant than the
            # barcode ranked at position cell_number (0-based)
            threshold = counts[cell_number]
            final_barcodes = {
                x for x, y in cell_barcode_counts.items() if y > threshold}
        else:
            # more cells requested than barcodes observed: there is no
            # barcode to take a threshold from, so accept them all
            final_barcodes = set(cell_barcode_counts)
    else:
        final_barcodes = knee_final_barcodes

    if plotfile_prefix:

        # colour-blind friendly colours - https://gist.github.com/thriveth/8560036
        CB_color_cycle = ['#377eb8', '#ff7f00', '#4daf4a',
                          '#f781bf', '#a65628', '#984ea3',
                          '#999999', '#e41a1c', '#dede00']

        user_line = mlines.Line2D(
            [], [], color=CB_color_cycle[2], ls="dashed",
            markersize=15, label='User-defined')
        selected_line = mlines.Line2D(
            [], [], color=CB_color_cycle[0], ls="dashed",
            markersize=15, label='Knee')

        # plot of the original curve and its corresponding distances
        plt.figure(figsize=(12, 6))
        plt.plot(distToLine, label='Distance', color='r')
        plt.plot(values, label='Cumulative', color='b')
        plt.plot([idxOfBestPoint], values[idxOfBestPoint], marker='o',
                 markersize=8, color="red", label='Knee')

        if cell_number:
            plt.axvline(x=cell_number, ls="dashed",
                        color=CB_color_cycle[2], label="User-defined")

        plt.legend()
        plt.savefig("%s_cell_barcode_knee.png" % plotfile_prefix)

        # rank/count scatter: accepted barcodes coloured, the rest black
        colours_selected = [
            CB_color_cycle[0] for x in range(0, len(final_barcodes))]
        colours_rejected = [
            "black" for x in range(0, len(counts) - len(final_barcodes))]
        colours = colours_selected + colours_rejected

        fig = plt.figure()
        fig3 = fig.add_subplot(111)
        fig3.scatter(x=range(1, len(counts) + 1), y=counts,
                     c=colours, s=10, linewidths=0)
        fig3.loglog()
        fig3.set_xlim(0, len(counts) * 1.25)
        fig3.set_xlabel('Barcode index')
        fig3.set_ylabel('Count')
        fig3.axvline(x=len(knee_final_barcodes), ls="dashed",
                     color=CB_color_cycle[0])

        if cell_number:
            fig3.axvline(x=cell_number, ls="dashed",
                         color=CB_color_cycle[2])

            lgd = fig3.legend(bbox_to_anchor=(1.05, 1), loc=2,
                              borderaxespad=0.,
                              handles=[selected_line, user_line],
                              title="User threshold")
        else:
            lgd = fig3.legend(bbox_to_anchor=(1.05, 1), loc=2,
                              borderaxespad=0.,
                              handles=[selected_line],
                              title="Knee threshold")

        fig.savefig("%s_cell_barcode_counts.png" % plotfile_prefix,
                    bbox_extra_artists=(lgd,), bbox_inches='tight')

        if not cell_number:
            with U.openFile("%s_cell_thresholds.tsv" % plotfile_prefix,
                            "w") as outf:
                outf.write("count\n")
                outf.write("%s\n" % idxOfBestPoint)

    return final_barcodes