def singleBarcodeGenerator(whitelist_tsv):
    with U.openFile(whitelist_tsv, "r") as inf:
        for line in inf:
            if line.startswith('#'):
                continue
            line = line.strip().split("\t")
            yield line[0]
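# Note: these functions assume the surrounding module's imports (U as the
# umi_tools utilities module, plus collections and itertools). Illustrative
# usage sketch, with a hypothetical file name: "whitelist.tsv" is expected
# to hold one barcode per line in its first tab-separated field, with '#'
# lines treated as comments.
#
#     for barcode in singleBarcodeGenerator("whitelist.tsv"):
#         print(barcode)  # e.g. "ACGTACGT"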
def pairedBarcodeGenerator(whitelist_tsv, whitelist_tsv2):
    whitelist1 = []
    whitelist2 = []

    with U.openFile(whitelist_tsv, "r") as inf:
        for line in inf:
            if line.startswith('#'):
                continue
            line = line.strip().split("\t")
            whitelist1.append(line[0])

    with U.openFile(whitelist_tsv2, "r") as inf2:
        for line in inf2:
            if line.startswith('#'):
                continue
            line = line.strip().split("\t")
            whitelist2.append(line[0])

    for w1, w2 in itertools.product(whitelist1, whitelist2):
        yield w1 + w2
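# Illustrative sketch (hypothetical inputs): if "wl1.tsv" holds AA and CC,
# and "wl2.tsv" holds GG and TT, the generator yields every concatenated
# combination: AAGG, AATT, CCGG, CCTT.
#
#     for barcode in pairedBarcodeGenerator("wl1.tsv", "wl2.tsv"):
#         print(barcode)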
def getMetaContig2contig(gene_transcript_map):
    '''Parse a tab-separated gene<tab>transcript file and return a map of
    gene (metacontig) to its set of transcripts (contigs)'''
    metacontig2contig = collections.defaultdict(set)
    for line in U.openFile(gene_transcript_map, "r"):
        if line.startswith("#"):
            continue
        if len(line.strip()) == 0:
            break
        gene, transcript = line.strip().split("\t")
        metacontig2contig[gene].add(transcript)

    return metacontig2contig
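# Illustrative sketch (hypothetical file): given a tab-separated
# "gene2tx.tsv" containing
#
#     GENE1    TX1
#     GENE1    TX2
#     GENE2    TX3
#
# getMetaContig2contig("gene2tx.tsv") returns
# {"GENE1": {"TX1", "TX2"}, "GENE2": {"TX3"}}.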
def getUserDefinedBarcodes(whitelist_tsv, getErrorCorrection=False):
    cell_whitelist = []

    if getErrorCorrection:
        false_to_true_map = {}
    else:
        false_to_true_map = None

    with U.openFile(whitelist_tsv, "r") as inf:
        for line in inf:
            if line.startswith('#'):
                continue
            line = line.strip().split("\t")
            whitelist_barcode = line[0]
            cell_whitelist.append(whitelist_barcode)
            if getErrorCorrection:
                for error_barcode in line[1].split(","):
                    false_to_true_map[error_barcode] = whitelist_barcode

    return set(cell_whitelist), false_to_true_map
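# Illustrative sketch (hypothetical file contents): with
# getErrorCorrection=True, the second tab-separated field lists barcodes
# to be corrected to the first-field barcode. A line such as
#
#     AAAA    AAAT,AATA
#
# yields ({"AAAA"}, {"AAAT": "AAAA", "AATA": "AAAA"}).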
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    group = U.OptionGroup(parser, "count-specific options")

    group.add_option("--wide-format-cell-counts",
                     dest="wide_format_cell_counts",
                     action="store_true",
                     default=False,
                     help=("output the cell counts in a wide format "
                           "(rows=genes, columns=cells)"))

    parser.add_option_group(group)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = U.Start(parser, argv=argv,
                              add_group_dedup_options=False)

    options.per_gene = True  # hardcodes counting to per-gene only

    U.validateSamOptions(options, group=False)

    if options.random_seed:
        np.random.seed(options.random_seed)

    if options.stdin != sys.stdin:
        in_name = options.stdin.name
        options.stdin.close()
    else:
        raise ValueError("Input on standard in not currently supported")

    if options.in_sam:
        in_mode = "r"
    else:
        in_mode = "rb"

    infile = pysam.Samfile(in_name, in_mode)

    # write out to tempfile and then sort to stdout
    tmpfilename = U.getTempFilename(dir=options.tmpdir)
    tmpfile = U.openFile(tmpfilename, mode="w")

    nInput, nOutput, input_reads = 0, 0, 0

    gene_tag = options.gene_tag
    metacontig2contig = None

    if options.chrom:
        inreads = infile.fetch(reference=options.chrom)
    else:
        if options.gene_transcript_map:
            metacontig2contig = umi_methods.getMetaContig2contig(
                infile, options.gene_transcript_map)
            metatag = "MC"
            inreads = umi_methods.metafetcher(infile, metacontig2contig,
                                              metatag)
            gene_tag = metatag
        else:
            inreads = infile.fetch()

    bundle_iterator = umi_methods.get_bundles(
        options,
        only_count_reads=True,
        metacontig_contig=metacontig2contig)

    for bundle, key, status in bundle_iterator(inreads):
        if status == "single_read":
            continue

        gene, cell = key

        umis = bundle.keys()
        counts = {umi: bundle[umi]["count"] for umi in umis}

        nInput += sum(counts.values())

        while nInput >= input_reads + 1000000:
            input_reads += 1000000
            U.info("Parsed %i input reads" % input_reads)

        # set up UMIClusterer functor with methods specific to
        # specified options.method
        processor = network.UMIClusterer(options.method)

        # group the umis
        groups = processor(umis, counts, threshold=options.threshold)

        gene_count = len(groups)

        if options.per_cell:
            tmpfile.write("%s\n" % "\t".join(
                (gene, cell.decode(), str(gene_count))))
        else:
            tmpfile.write("%s\n" % "\t".join((gene, str(gene_count))))

        nOutput += gene_count

    tmpfile.close()

    if options.per_cell:

        gene_counts_dict = {}

        with U.openFile(tmpfilename, mode="r") as inf:
            genes = set()
            cells = set()
            for line in inf:
                gene, cell, gene_count = line.strip().split("\t")
                genes.add(gene)
                cells.add(cell)

                if gene not in gene_counts_dict:
                    gene_counts_dict[gene] = {}

                gene_counts_dict[gene][cell] = gene_count

        if options.wide_format_cell_counts:  # write out in wide format

            options.stdout.write("%s\t%s\n" % (
                "gene", "\t".join(sorted(cells))))

            for gene in sorted(genes):
                counts = []
                for cell in sorted(cells):
                    if cell in gene_counts_dict[gene]:
                        counts.append(gene_counts_dict[gene][cell])
                    else:
                        counts.append(0)
                options.stdout.write("%s\t%s\n" % (
                    gene, "\t".join(map(str, counts))))

        else:  # write out in long format
            options.stdout.write("%s\t%s\t%s\n" % ("gene", "cell", "count"))
            for gene in sorted(genes):
                for cell in sorted(list(gene_counts_dict[gene].keys())):
                    options.stdout.write("%s\t%s\t%s\n" % (
                        gene, cell, gene_counts_dict[gene][cell]))
    else:
        options.stdout.write("%s\t%s\n" % ("gene", "count"))

        with U.openFile(tmpfilename, mode="r") as inf:
            for line in inf:
                options.stdout.write(line)

    os.unlink(tmpfilename)

    # output read events and benchmark information.
    for event in bundle_iterator.read_events.most_common():
        U.info("%s: %s" % (event[0], event[1]))

    U.info("Number of (post deduplication) reads counted: %i" % nOutput)

    U.Stop()
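# Illustrative invocation sketch. It assumes this main() is exposed as the
# "count" command of the umi_tools CLI and that the common -I/-S and
# --per-cell options are contributed by U.Start; file names are
# hypothetical:
#
#     umi_tools count -I deduped.bam --per-cell \
#         --wide-format-cell-counts -S counts.tsv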
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-i", "--in-sam", dest="in_sam", action="store_true",
                      help="Input file is in sam format [default=%default]",
                      default=False)
    parser.add_option("-o", "--out-sam", dest="out_sam", action="store_true",
                      help="Output alignments in sam format "
                           "[default=%default]",
                      default=False)
    parser.add_option("--umi-separator", dest="umi_sep", type="string",
                      help="separator between read id and UMI",
                      default="_")
    parser.add_option("--umi-tag", dest="umi_tag", type="string",
                      help="tag containing umi",
                      default='RX')
    parser.add_option("--umi-group-tag", dest="umi_group_tag", type="string",
                      help="tag for the outputted umi group",
                      default='BX')
    parser.add_option("--extract-umi-method", dest="get_umi_method",
                      type="choice", choices=("read_id", "tag"),
                      default="read_id",
                      help="where is the read UMI encoded? "
                           "[default=%default]")
    parser.add_option("--subset", dest="subset", type="float",
                      help="Use only a fraction of reads, specified by "
                           "subset",
                      default=None)
    parser.add_option("--spliced-is-unique", dest="spliced",
                      action="store_true",
                      help="Treat a spliced read as different to an "
                           "unspliced one [default=%default]",
                      default=False)
    parser.add_option("--soft-clip-threshold", dest="soft", type="float",
                      help="number of bases clipped from 5' end before "
                           "read is counted as spliced [default=%default]",
                      default=4)
    parser.add_option("--edit-distance-threshold", dest="threshold",
                      type="int", default=1,
                      help="Edit distance threshold at which to join two "
                           "UMIs when clustering. [default=%default]")
    parser.add_option("--chrom", dest="chrom", type="string",
                      help="Restrict to one chromosome",
                      default=None)
    parser.add_option("--paired", dest="paired", action="store_true",
                      default=False,
                      help="paired BAM. [default=%default]")
    parser.add_option("--method", dest="method", type="choice",
                      choices=("adjacency", "directional",
                               "unique", "cluster"),
                      default="directional",
                      help="method to use for umi deduping "
                           "[default=%default]")
    parser.add_option("--per-contig", dest="per_contig", action="store_true",
                      default=False,
                      help=("dedup per contig,"
                            " e.g for transcriptome where contig = gene"))
    parser.add_option("--whole-contig", dest="whole_contig",
                      action="store_true", default=False,
                      help="Read whole contig before outputting bundles: "
                           "guarantees that no reads are missed, but "
                           "increases memory usage")
    parser.add_option("--read-length", dest="read_length",
                      action="store_true", default=False,
                      help=("use read length in addition to position and "
                            "UMI to identify possible duplicates "
                            "[default=%default]"))
    parser.add_option("--mapping-quality", dest="mapping_quality",
                      type="int",
                      help="Minimum mapping quality for a read to be "
                           "retained [default=%default]",
                      default=0)
    parser.add_option("--group-out", dest="tsv", type="string",
                      help="Outfile name for file mapping read id to read "
                           "group",
                      default=None)
    parser.add_option("--output-bam", dest="output_bam", action="store_true",
                      default=False,
                      help=("output a bam file with read groups tagged "
                            "using the UG tag [default=%default]"))

    # add common options (-h/--help, ...) and parse command line
    (options, args) = U.Start(parser, argv=argv)

    if options.stdin != sys.stdin:
        in_name = options.stdin.name
        options.stdin.close()
    else:
        raise ValueError("Input on standard in not currently supported")

    if options.stdout != sys.stdout:
        out_name = options.stdout.name
        options.stdout.close()
        assert options.output_bam, (
            "To output a bam you must include --output-bam option")
    else:
        out_name = "-"

    if options.in_sam:
        in_mode = "r"
    else:
        in_mode = "rb"

    if options.out_sam:
        out_mode = "w"
    else:
        out_mode = "wb"

    infile = pysam.Samfile(in_name, in_mode)

    if options.output_bam:
        outfile = pysam.Samfile(out_name, out_mode, template=infile)
        if options.paired:
            outfile = umi_methods.TwoPassPairWriter(infile, outfile,
                                                    tags=True)
    else:
        outfile = None

    if options.tsv:
        mapping_outfile = U.openFile(options.tsv, "w")
        mapping_outfile.write(
            "read_id\tcontig\tposition\tumi\tumi_count\t"
            "final_umi\tfinal_umi_count\tunique_id\n")

    # set the method with which to extract umis from reads
    if options.get_umi_method == "read_id":
        umi_getter = partial(umi_methods.get_umi_read_id, sep=options.umi_sep)
    elif options.get_umi_method == "tag":
        umi_getter = partial(umi_methods.get_umi_tag, tag=options.umi_tag)
    else:
        raise ValueError("Unknown umi extraction method")

    nInput, nOutput, unique_id = 0, 0, 0

    read_events = collections.Counter()

    for bundle, read_events in umi_methods.get_bundles(
            infile,
            read_events,
            ignore_umi=False,
            subset=options.subset,
            quality_threshold=options.mapping_quality,
            paired=options.paired,
            chrom=options.chrom,
            spliced=options.spliced,
            soft_clip_threshold=options.soft,
            per_contig=options.per_contig,
            whole_contig=options.whole_contig,
            read_length=options.read_length,
            umi_getter=umi_getter,
            all_reads=True):

        nInput += sum([bundle[umi]["count"] for umi in bundle])

        if nOutput % 10000 == 0:
            U.debug("Outputted %i" % nOutput)

        if nInput % 1000000 == 0:
            U.debug("Read %i input reads" % nInput)

        # set up ReadCluster functor with methods specific to
        # specified options.method
        processor = network.ReadClusterer(options.method)

        bundle, groups, counts = processor(bundle=bundle,
                                           threshold=options.threshold,
                                           stats=True,
                                           deduplicate=False)

        for umi_group in groups:
            top_umi = umi_group[0]

            group_count = sum(counts[umi] for umi in umi_group)

            for umi in umi_group:
                reads = bundle[umi]['read']
                for read in reads:
                    if outfile:
                        if options.paired:
                            # if paired, we need to supply the tags to
                            # add to the paired read
                            outfile.write(read, unique_id, top_umi)
                        else:
                            # Add the 'UG' tag to the read
                            read.tags += [('UG', unique_id)]
                            read.tags += [(options.umi_group_tag, top_umi)]
                            outfile.write(read)

                    if options.tsv:
                        mapping_outfile.write("%s\n" % "\t".join(map(str, (
                            read.query_name, read.reference_name,
                            umi_methods.get_read_position(
                                read, options.soft)[1],
                            umi.decode(),
                            counts[umi],
                            top_umi.decode(),
                            group_count,
                            unique_id))))

                    nOutput += 1

            unique_id += 1

    if outfile:
        outfile.close()

    if options.tsv:
        mapping_outfile.close()

    # write footer and output benchmark information.
    U.info("Reads: %s" % ", ".join(
        ["%s: %s" % (x[0], x[1]) for x in read_events.most_common()]))
    U.info("Number of reads out: %i, Number of groups: %i" %
           (nOutput, unique_id))
    U.Stop()
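# Illustrative invocation sketch, assuming this main() is exposed as the
# "group" command of the umi_tools CLI and -I/-S are common options added
# by U.Start (file names are hypothetical):
#
#     umi_tools group -I mapped.bam --output-bam --paired \
#         --group-out=groups.tsv -S grouped.bam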
def getUserDefinedBarcodes(whitelist_tsv):
    cell_whitelist = []
    with U.openFile(whitelist_tsv, "r") as inf:
        for line in inf:
            cell_whitelist.append(line.strip())
    return set(cell_whitelist)
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-i", "--in-sam", dest="in_sam", action="store_true",
                      help="Input file is in sam format [default=%default]",
                      default=False)
    parser.add_option("-o", "--out-sam", dest="out_sam", action="store_true",
                      help="Output alignments in sam format "
                           "[default=%default]",
                      default=False)
    parser.add_option("--ignore-umi", dest="ignore_umi", action="store_true",
                      help="Ignore UMI and dedup only on position",
                      default=False)
    parser.add_option("--umi-separator", dest="umi_sep", type="string",
                      help="separator between read id and UMI",
                      default="_")
    parser.add_option("--umi-tag", dest="umi_tag", type="string",
                      help="tag containing umi",
                      default='RX')
    parser.add_option("--extract-umi-method", dest="get_umi_method",
                      type="choice", choices=("read_id", "tag"),
                      default="read_id",
                      help="where is the read UMI encoded? "
                           "[default=%default]")
    parser.add_option("--subset", dest="subset", type="float",
                      help="Use only a fraction of reads, specified by "
                           "subset",
                      default=None)
    parser.add_option("--spliced-is-unique", dest="spliced",
                      action="store_true",
                      help="Treat a spliced read as different to an "
                           "unspliced one [default=%default]",
                      default=False)
    parser.add_option("--soft-clip-threshold", dest="soft", type="float",
                      help="number of bases clipped from 5' end before "
                           "read is counted as spliced [default=%default]",
                      default=4)
    parser.add_option("--edit-distance-threshold", dest="threshold",
                      type="int", default=1,
                      help="Edit distance threshold at which to join two "
                           "UMIs when clustering. [default=%default]")
    parser.add_option("--chrom", dest="chrom", type="string",
                      help="Restrict to one chromosome",
                      default=None)
    parser.add_option("--paired", dest="paired", action="store_true",
                      default=False,
                      help="paired BAM. [default=%default]")
    parser.add_option("--method", dest="method", type="choice",
                      choices=("adjacency", "directional", "percentile",
                               "unique", "cluster"),
                      default="directional",
                      help="method to use for umi deduping "
                           "[default=%default]")
    parser.add_option("--output-stats", dest="stats", type="string",
                      default=False,
                      help="Specify location to output stats")
    parser.add_option("--whole-contig", dest="whole_contig",
                      action="store_true", default=False,
                      help="Read whole contig before outputting bundles: "
                           "guarantees that no reads are missed, but "
                           "increases memory usage")
    parser.add_option("--multimapping-detection-method",
                      dest="detection_method", type="choice",
                      choices=("NH", "X0", "XT"),
                      default=None,
                      help=("Some aligners identify multimapping using bam "
                            "tags. Setting this option to NH, X0 or XT will "
                            "use these tags when selecting the best read "
                            "amongst reads with the same position and umi "
                            "[default=%default]"))
    parser.add_option("--mapping-quality", dest="mapping_quality",
                      type="int",
                      help="Minimum mapping quality for a read to be "
                           "retained [default=%default]",
                      default=0)
    parser.add_option("--read-length", dest="read_length",
                      action="store_true", default=False,
                      help=("use read length in addition to position and "
                            "UMI to identify possible duplicates "
                            "[default=%default]"))
    parser.add_option("--per-contig", dest="per_contig", action="store_true",
                      default=False,
                      help=("dedup per contig (field 3 in BAM; RNAME),"
                            " e.g for transcriptome where contig = gene"))
    parser.add_option("--per-gene", dest="per_gene", action="store_true",
                      default=False,
                      help=("Deduplicate per gene, e.g for transcriptome "
                            "where contig = transcript; must also provide "
                            "a transcript to gene map with "
                            "--gene-transcript-map [default=%default]"))
    parser.add_option("--gene-transcript-map", dest="gene_transcript_map",
                      type="string",
                      help="file mapping transcripts to genes (tab "
                           "separated)",
                      default=None)
    parser.add_option("--gene-tag", dest="gene_tag", type="string",
                      help=("Deduplicate per gene where gene is defined by "
                            "this bam tag [default=%default]"),
                      default=None)
    parser.add_option("--skip-tags-regex", dest="skip_regex", type="string",
                      help=("Used with --gene-tag. Ignore reads where the "
                            "gene-tag matches this regex"),
                      default="^[__|Unassigned]")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = U.Start(parser, argv=argv)

    if options.random_seed:
        np.random.seed(options.random_seed)

    if options.stdin != sys.stdin:
        in_name = options.stdin.name
        options.stdin.close()
    else:
        raise ValueError("Input on standard in not currently supported")

    if options.stdout != sys.stdout:
        out_name = options.stdout.name
        options.stdout.close()
    else:
        out_name = "-"

    if options.in_sam:
        in_mode = "r"
    else:
        in_mode = "rb"

    if options.out_sam:
        out_mode = "wh"
    else:
        out_mode = "wb"

    if options.stats:
        if options.ignore_umi:
            raise ValueError("'--output-stats' and '--ignore-umi' options"
                             " cannot be used together")

    if options.per_gene:
        if not options.gene_transcript_map and not options.gene_tag:
            raise ValueError(
                "--per-gene option requires --gene-transcript-map "
                "or --gene-tag")
    try:
        re.compile(options.skip_regex)
    except re.error:
        raise ValueError("skip-regex '%s' is not a "
                         "valid regex" % options.skip_regex)

    infile = pysam.Samfile(in_name, in_mode)
    outfile = pysam.Samfile(out_name, out_mode, template=infile)

    if options.paired:
        outfile = umi_methods.TwoPassPairWriter(infile, outfile)

    nInput, nOutput = 0, 0

    if options.detection_method:
        bam_features = detect_bam_features(infile.filename)

        if not bam_features[options.detection_method]:
            if sum(bam_features.values()) == 0:
                raise ValueError(
                    "There are no bam tags available to detect multimapping. "
                    "Do not set --multimapping-detection-method")
            else:
                raise ValueError(
                    "The chosen method of detection for multimapping (%s) "
                    "will not work with this bam. Multimapping can be "
                    "detected for this bam using any of the following: %s" % (
                        options.detection_method, ",".join(
                            [x for x in bam_features if bam_features[x]])))

    # set the method with which to extract umis from reads
    if options.get_umi_method == "read_id":
        umi_getter = partial(umi_methods.get_umi_read_id, sep=options.umi_sep)
    elif options.get_umi_method == "tag":
        umi_getter = partial(umi_methods.get_umi_tag, tag=options.umi_tag)
    else:
        raise ValueError("Unknown umi extraction method")

    if options.stats:
        # set up arrays to hold stats data
        stats_pre_df_dict = {"UMI": [], "counts": []}
        stats_post_df_dict = {"UMI": [], "counts": []}
        pre_cluster_stats = []
        post_cluster_stats = []
        pre_cluster_stats_null = []
        post_cluster_stats_null = []
        topology_counts = collections.Counter()
        node_counts = collections.Counter()
        read_gn = umi_methods.random_read_generator(
            infile.filename, chrom=options.chrom, umi_getter=umi_getter)

    if options.chrom:
        inreads = infile.fetch(reference=options.chrom)
    else:
        if options.per_gene and options.gene_transcript_map:
            metacontig2contig = umi_methods.getMetaContig2contig(
                infile, options.gene_transcript_map)
            metatag = "MC"
            inreads = umi_methods.metafetcher(infile, metacontig2contig,
                                              metatag)
            gene_tag = metatag
        else:
            inreads = infile.fetch()
            gene_tag = options.gene_tag

    for bundle, read_events, status in umi_methods.get_bundles(
            inreads,
            ignore_umi=options.ignore_umi,
            subset=options.subset,
            quality_threshold=options.mapping_quality,
            paired=options.paired,
            spliced=options.spliced,
            soft_clip_threshold=options.soft,
            per_contig=options.per_contig,
            gene_tag=options.gene_tag,
            skip_regex=options.skip_regex,
            whole_contig=options.whole_contig,
            read_length=options.read_length,
            detection_method=options.detection_method,
            umi_getter=umi_getter,
            all_reads=False,
            return_read2=False,
            return_unmapped=False):

        nInput += sum([bundle[umi]["count"] for umi in bundle])

        if nOutput % 10000 == 0:
            U.debug("Outputted %i" % nOutput)

        if nInput % 1000000 == 0:
            U.debug("Read %i input reads" % nInput)

        if options.stats:
            # generate pre-dedup stats
            average_distance = umi_methods.get_average_umi_distance(
                bundle.keys())
            pre_cluster_stats.append(average_distance)
            cluster_size = len(bundle)
            random_umis = read_gn.getUmis(cluster_size)
            average_distance_null = umi_methods.get_average_umi_distance(
                random_umis)
            pre_cluster_stats_null.append(average_distance_null)

        if options.ignore_umi:
            for umi in bundle:
                nOutput += 1
                outfile.write(bundle[umi]["read"])
        else:
            # set up ReadCluster functor with methods specific to
            # specified options.method
            processor = network.ReadDeduplicator(options.method)

            # dedup using umis and write out deduped bam
            reads, umis, umi_counts = processor(bundle=bundle,
                                                threshold=options.threshold)

            for read in reads:
                outfile.write(read)
                nOutput += 1

            if options.stats:

                # collect pre-dedupe stats
                stats_pre_df_dict['UMI'].extend(bundle)
                stats_pre_df_dict['counts'].extend(
                    [bundle[UMI]['count'] for UMI in bundle])

                # collect post-dedupe stats
                post_cluster_umis = [umi_getter(x) for x in reads]
                stats_post_df_dict['UMI'].extend(umis)
                stats_post_df_dict['counts'].extend(umi_counts)

                average_distance = umi_methods.get_average_umi_distance(
                    post_cluster_umis)
                post_cluster_stats.append(average_distance)

                cluster_size = len(post_cluster_umis)
                random_umis = read_gn.getUmis(cluster_size)
                average_distance_null = umi_methods.get_average_umi_distance(
                    random_umis)
                post_cluster_stats_null.append(average_distance_null)

    outfile.close()

    if options.stats:

        # generate the stats dataframe
        stats_pre_df = pd.DataFrame(stats_pre_df_dict)
        stats_post_df = pd.DataFrame(stats_post_df_dict)

        # tally the counts per umi per position
        pre_counts = collections.Counter(stats_pre_df["counts"])
        post_counts = collections.Counter(stats_post_df["counts"])
        counts_index = list(
            set(pre_counts.keys()).union(set(post_counts.keys())))
        counts_index.sort()

        with U.openFile(options.stats + "_per_umi_per_position.tsv",
                        "w") as outf:
            outf.write("counts\tinstances_pre\tinstances_post\n")
            for count in counts_index:
                values = (count, pre_counts[count], post_counts[count])
                outf.write("\t".join(map(str, values)) + "\n")

        # aggregate stats pre/post per UMI
        agg_pre_df = aggregateStatsDF(stats_pre_df)
        agg_post_df = aggregateStatsDF(stats_post_df)

        agg_df = pd.merge(agg_pre_df, agg_post_df, how='left',
                          left_index=True, right_index=True,
                          sort=True, suffixes=["_pre", "_post"])

        # TS - if count value not observed either pre/post-dedup,
        # merge will leave an empty cell and the column will be cast as a
        # float; see http://pandas.pydata.org/pandas-docs/dev/missing_data.html
        # --> Missing data casting rules and indexing
        # so, back fill with zeros and convert back to int
        agg_df = agg_df.fillna(0).astype(int)

        agg_df.index = [x.decode() for x in agg_df.index]
        agg_df.index.name = 'UMI'
        agg_df.to_csv(options.stats + "_per_umi.tsv", sep="\t")

        # bin distances into integer bins
        max_ed = int(max(map(max, [pre_cluster_stats,
                                   post_cluster_stats,
                                   pre_cluster_stats_null,
                                   post_cluster_stats_null])))

        cluster_bins = range(-1, int(max_ed) + 2)

        def bin_clusters(cluster_list, bins=cluster_bins):
            ''' take list of floats and return bins'''
            return np.digitize(cluster_list, bins, right=True)

        def tallyCounts(binned_cluster, max_edit_distance):
            ''' tally counts per bin '''
            return np.bincount(binned_cluster,
                               minlength=max_edit_distance + 3)

        pre_cluster_binned = bin_clusters(pre_cluster_stats)
        post_cluster_binned = bin_clusters(post_cluster_stats)
        pre_cluster_null_binned = bin_clusters(pre_cluster_stats_null)
        post_cluster_null_binned = bin_clusters(post_cluster_stats_null)

        edit_distance_df = pd.DataFrame({
            "unique": tallyCounts(pre_cluster_binned, max_ed),
            "unique_null": tallyCounts(pre_cluster_null_binned, max_ed),
            options.method: tallyCounts(post_cluster_binned, max_ed),
            "%s_null" % options.method: tallyCounts(
                post_cluster_null_binned, max_ed),
            "edit_distance": cluster_bins})

        # TS - set lowest bin (-1) to "Single_UMI"
        edit_distance_df['edit_distance'][0] = "Single_UMI"

        edit_distance_df.to_csv(options.stats + "_edit_distance.tsv",
                                index=False, sep="\t")

    # write footer and output benchmark information.
    U.info("%s" % ", ".join(
        ["%s: %s" % (x[0], x[1]) for x in read_events.most_common()]))
    U.info("Number of reads out: %i" % nOutput)
    U.Stop()
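# Illustrative invocation sketch, assuming this main() is exposed as the
# "dedup" command of the umi_tools CLI and -I/-S are common options added
# by U.Start (file names are hypothetical). --output-stats=stats writes
# stats_per_umi_per_position.tsv, stats_per_umi.tsv and
# stats_edit_distance.tsv alongside the deduped BAM:
#
#     umi_tools dedup -I mapped.bam --method=directional \
#         --output-stats=stats -S deduped.bam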
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-p", "--bc-pattern", dest="pattern", type="string",
                      help="Barcode pattern")
    parser.add_option("--bc-pattern2", dest="pattern2", type="string",
                      help="Barcode pattern for paired reads")
    parser.add_option("--3prime", dest="prime3", action="store_true",
                      help="barcode is on 3' end of read.")
    parser.add_option("--read2-in", dest="read2_in", type="string",
                      help="file name for read pairs")
    parser.add_option("--extract-method", dest="extract_method",
                      type="choice", choices=["string", "regex"],
                      help=("How to extract the umi +/- cell barcodes, "
                            "choose from 'string' or 'regex'"))
    parser.add_option("--plot-prefix", dest="plot_prefix", type="string",
                      help=("Prefix for plots to visualise the automated "
                            "detection of the number of 'true' cell "
                            "barcodes"))
    parser.add_option("--subset-reads", dest="subset_reads", type="int",
                      help=("Use the first N reads to automatically "
                            "identify the true cell barcodes. If N is "
                            "greater than the number of reads, all reads "
                            "will be used"))
    parser.add_option("--error-correct-threshold",
                      dest="error_correct_threshold", type="int",
                      help=("Hamming distance for correction of barcodes "
                            "to whitelist barcodes"))
    parser.add_option("--method", dest="method",
                      choices=["reads", "umis"],
                      help=("Use reads or unique umi counts per cell"))
    parser.add_option("--expect-cells", dest="expect_cells", type="int",
                      help=("Prior expectation on the upper limit on the "
                            "number of cells sequenced"))
    parser.add_option("--set-cell-number", dest="cell_number", type="int",
                      help=("Specify the number of cell barcodes to accept"))

    parser.set_defaults(method="reads",
                        extract_method="string",
                        filter_cell_barcodes=False,
                        whitelist_tsv=None,
                        blacklist_tsv=None,
                        error_correct_threshold=1,
                        pattern=None,
                        pattern2=None,
                        read2_in=None,
                        plot_prefix=None,
                        subset_reads=100000000,
                        expect_cells=False,
                        cell_number=False)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = U.Start(parser, argv=argv,
                              add_group_dedup_options=False,
                              add_sam_options=False)

    if options.expect_cells and options.cell_number:
        U.error("Cannot supply both --expect-cells and "
                "--cell-number options")

    if not options.pattern and not options.pattern2:
        if not options.read2_in:
            U.error("Must supply --bc-pattern for single-end")
        else:
            U.error("Must supply --bc-pattern and/or --bc-pattern2 "
                    "if paired-end ")

    if options.pattern2:
        if not options.read2_in:
            U.error("must specify a paired fastq ``--read2-in``")

    if not options.pattern2:
        options.pattern2 = options.pattern

    extract_cell = False
    extract_umi = False

    # If the pattern is a regex we can compile the regex(es) prior to
    # ExtractFilterAndUpdate instantiation
    if options.extract_method == "regex":
        if options.pattern:
            try:
                options.pattern = regex.compile(options.pattern)
            except regex.error:
                U.error("barcode_regex '%s' is not a "
                        "valid regex" % options.pattern)
        if options.pattern2:
            try:
                options.pattern2 = regex.compile(options.pattern2)
            except regex.error:
                U.error("barcode_regex2 '%s' is not a "
                        "valid regex" % options.pattern2)

    # check whether the regex contains a umi group(s) and cell group(s)
    if options.extract_method == "regex":
        if options.pattern:
            for group in options.pattern.groupindex:
                if group.startswith("cell_"):
                    extract_cell = True
                elif group.startswith("umi_"):
                    extract_umi = True
        if options.pattern2:
            for group in options.pattern2.groupindex:
                if group.startswith("cell_"):
                    extract_cell = True
                elif group.startswith("umi_"):
                    extract_umi = True

    # check whether the pattern string contains umi/cell bases
    elif options.extract_method == "string":
        if options.pattern:
            if "C" in options.pattern:
                extract_cell = True
            if "N" in options.pattern:
                extract_umi = True
        if options.pattern2:
            if "C" in options.pattern2:
                extract_cell = True
            if "N" in options.pattern2:
                extract_umi = True

    if not extract_umi:
        if options.extract_method == "string":
            U.error("barcode pattern(s) do not include any umi bases "
                    "(marked with 'Ns') %s, %s" % (
                        options.pattern, options.pattern2))
        elif options.extract_method == "regex":
            U.error("barcode regex(es) do not include any umi groups "
                    "(starting with 'umi_') %s, %s" % (
                        options.pattern, options.pattern2))
    if not extract_cell:
        if options.extract_method == "string":
            U.error("barcode pattern(s) do not include any cell bases "
                    "(marked with 'Cs') %s, %s" % (
                        options.pattern, options.pattern2))
        elif options.extract_method == "regex":
            U.error("barcode regex(es) do not include any cell groups "
                    "(starting with 'cell_') %s, %s" % (
                        options.pattern, options.pattern2))

    read1s = umi_methods.fastqIterate(options.stdin)

    # set up read extractor
    ReadExtractor = umi_methods.ExtractFilterAndUpdate(
        method=options.extract_method,
        pattern=options.pattern,
        pattern2=options.pattern2,
        prime3=options.prime3,
        extract_cell=extract_cell)

    cell_barcode_counts = collections.Counter()

    n_reads = 0
    n_cell_barcodes = 0

    # if using the umis method, need to keep a set of umis observed
    if options.method == "umis":
        cell_barcode_umis = collections.defaultdict(set)

    # variables for progress monitor
    displayMax = 100000
    U.info("Starting barcode extraction")

    if not options.read2_in:
        for read1 in read1s:

            # Update display in every 100kth iteration
            if n_reads % displayMax == 0:
                U.info("Parsed {} reads".format(n_reads))

            n_reads += 1
            barcode_values = ReadExtractor.getBarcodes(read1)
            if barcode_values is None:
                continue
            else:
                cell, umi, _, _, _, _, _ = barcode_values
                if options.method == "umis":
                    cell_barcode_umis[cell].add(umi)
                else:
                    cell_barcode_counts[cell] += 1
                n_cell_barcodes += 1

            if options.subset_reads:
                if n_cell_barcodes > options.subset_reads:
                    break
    else:
        read2s = umi_methods.fastqIterate(U.openFile(options.read2_in))
        for read1, read2 in izip(read1s, read2s):

            # Update display in every 100kth iteration
            if n_reads % displayMax == 0:
                U.info("Parsed {} reads".format(n_reads))

            n_reads += 1
            barcode_values = ReadExtractor.getBarcodes(read1, read2)
            if barcode_values is None:
                continue
            else:
                cell, umi, _, _, _, _, _ = barcode_values
                if options.method == "umis":
                    cell_barcode_umis[cell].add(umi)
                else:
                    cell_barcode_counts[cell] += 1
                n_cell_barcodes += 1

            if options.subset_reads:
                if n_reads > options.subset_reads:
                    break

    U.info("Starting whitelist determination")

    if options.method == "umis":
        for cell in cell_barcode_umis:
            cell_barcode_counts[cell] = len(cell_barcode_umis[cell])

    if options.cell_number and options.cell_number > len(cell_barcode_counts):
        raise ValueError(
            "--set-cell-number option specifies more cell barcodes than the "
            "number of observed cell barcodes. This may be because "
            "--subset-reads was set to a value too low to capture reads from "
            "all cells. %s cell barcodes observed from %s parsed reads. "
            "Expected >= %s cell barcodes" % (
                len(cell_barcode_counts),
                options.subset_reads,
                options.cell_number))

    cell_whitelist, true_to_false_map = umi_methods.getCellWhitelist(
        cell_barcode_counts,
        options.expect_cells,
        options.cell_number,
        options.error_correct_threshold,
        options.plot_prefix)

    U.info("Writing out whitelist")
    for barcode in sorted(list(cell_whitelist)):

        if true_to_false_map:
            corrected_barcodes = ",".join(
                sorted(true_to_false_map[barcode]))
            corrected_barcode_counts = ",".join(
                map(str, [cell_barcode_counts[x] for x
                          in sorted(true_to_false_map[barcode])]))
        else:
            corrected_barcodes, corrected_barcode_counts = "", ""

        options.stdout.write("%s\t%s\t%s\t%s\n" % (
            barcode, corrected_barcodes, cell_barcode_counts[barcode],
            corrected_barcode_counts))

    U.info("Parsed %i reads" % n_reads)
    U.info("%i reads matched the barcode pattern" % n_cell_barcodes)
    U.info("Found %i unique cell barcodes" % len(cell_barcode_counts))
    U.Stop()
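# Illustrative invocation sketch, assuming this main() is exposed as the
# "whitelist" command of the umi_tools CLI and -I/-S are common options
# added by U.Start (file names and the pattern are hypothetical). The
# whitelist is written to stdout, one accepted barcode per line plus its
# error barcodes and counts:
#
#     umi_tools whitelist -I reads_1.fastq.gz \
#         --bc-pattern=CCCCCCCCNNNNNNNN --method=umis -S whitelist.tsv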
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-i", "--in-sam", dest="in_sam", action="store_true",
                      help="Input file is in sam format",
                      default=False)
    parser.add_option("-o", "--out-sam", dest="out_sam", action="store_true",
                      help="Output alignments in sam format",
                      default=False)
    parser.add_option("--ignore-umi", dest="ignore_umi", action="store_true",
                      help="Ignore UMI and dedup only on position",
                      default=False)
    parser.add_option("--subset", dest="subset", type="string",
                      help="Use only a fraction of reads, specified by "
                           "subset",
                      default=1.1)
    parser.add_option("--spliced-is-unique", dest="spliced",
                      action="store_true",
                      help="Treat a spliced read as different to an "
                           "unspliced one",
                      default=False)
    parser.add_option("--soft-clip-threshold", dest="soft", type="float",
                      help="number of bases clipped from 5' end before "
                           "read is counted as spliced",
                      default=4)
    parser.add_option("--edit-distance-threshold", dest="threshold",
                      type="int",
                      help="Edit distance threshold at which to join two "
                           "UMIs when clustering",
                      default=1)
    parser.add_option("--chrom", dest="chrom", type="string",
                      help="Restrict to one chromosome",
                      default=None)
    parser.add_option("--paired", dest="paired", action="store_true",
                      default=False,
                      help="Use second-in-pair position when deduping")
    parser.add_option("--method", dest="method", type="choice",
                      choices=("adjacency", "directional-adjacency",
                               "percentile", "unique", "cluster"),
                      default="directional-adjacency",
                      help="method to use for umi deduping")
    parser.add_option("--output-stats", dest="stats", type="string",
                      default=False,
                      help="Specify location to output stats")
    parser.add_option("--further-stats", dest="further_stats",
                      action="store_true", default=False,
                      help="Output further stats")
    parser.add_option("--per-contig", dest="per_contig", action="store_true",
                      default=False,
                      help=("dedup per contig,"
                            " e.g for transcriptome where contig = gene"))
    parser.add_option("--whole-contig", dest="whole_contig",
                      action="store_true", default=False,
                      help="Read whole contig before outputting bundles: "
                           "guarantees that no reads are missed, but "
                           "increases memory usage")
    parser.add_option("--multimapping-detection-method",
                      dest="detection_method", type="choice",
                      choices=("NH", "X0", "XT"),
                      default=None,
                      help=("Some aligners identify multimapping using bam "
                            "tags. Setting this option to NH, X0 or XT will "
                            "use these tags when selecting the best read "
                            "amongst reads with the same position and umi"))
    parser.add_option("--mapping-quality", dest="mapping_quality",
                      type="int",
                      help="Minimum mapping quality for a read to be "
                           "retained",
                      default=0)
    parser.add_option("--read-length", dest="read_length",
                      action="store_true", default=False,
                      help=("use read length in addition to position and "
                            "UMI to identify possible duplicates"))

    # add common options (-h/--help, ...) and parse command line
    (options, args) = U.Start(parser, argv=argv)

    if options.stdin != sys.stdin:
        in_name = options.stdin.name
        options.stdin.close()
    else:
        raise ValueError("Input on standard in not currently supported")

    if options.stdout != sys.stdout:
        out_name = options.stdout.name
        options.stdout.close()
    else:
        out_name = "-"

    if options.in_sam:
        in_mode = "r"
    else:
        in_mode = "rb"

    if options.out_sam:
        out_mode = "w"
    else:
        out_mode = "wb"

    if options.stats:
        if options.ignore_umi:
            raise ValueError("'--output-stats' and '--ignore-umi' options"
                             " cannot be used together")

    if options.further_stats:
        if not options.stats:
            raise ValueError("'--further-stats' options requires "
                             "'--output-stats' option")
        if options.method not in ["cluster", "adjacency"]:
            raise ValueError("'--further-stats' only enabled with 'cluster' "
                             "and 'adjacency' methods")

    infile = pysam.Samfile(in_name, in_mode)
    outfile = pysam.Samfile(out_name, out_mode, template=infile)

    if options.paired:
        outfile = TwoPassPairWriter(infile, outfile)

    nInput, nOutput = 0, 0

    if options.detection_method:
        bam_features = detect_bam_features(infile.filename)

        if not bam_features[options.detection_method]:
            if sum(bam_features.values()) == 0:
                raise ValueError(
                    "There are no bam tags available to detect multimapping. "
                    "Do not set --multimapping-detection-method")
            else:
                raise ValueError(
                    "The chosen method of detection for multimapping (%s) "
                    "will not work with this bam. Multimapping can be "
                    "detected for this bam using any of the following: %s" % (
                        options.detection_method, ",".join(
                            [x for x in bam_features if bam_features[x]])))

    if options.stats:
        # set up arrays to hold stats data
        stats_pre_df_dict = {"UMI": [], "counts": []}
        stats_post_df_dict = {"UMI": [], "counts": []}
        pre_cluster_stats = []
        post_cluster_stats = []
        pre_cluster_stats_null = []
        post_cluster_stats_null = []
        topology_counts = collections.Counter()
        node_counts = collections.Counter()
        read_gn = random_read_generator(infile.filename, chrom=options.chrom)

    for bundle in get_bundles(infile,
                              ignore_umi=options.ignore_umi,
                              subset=float(options.subset),
                              quality_threshold=options.mapping_quality,
                              paired=options.paired,
                              chrom=options.chrom,
                              spliced=options.spliced,
                              soft_clip_threshold=options.soft,
                              per_contig=options.per_contig,
                              whole_contig=options.whole_contig,
                              read_length=options.read_length,
                              detection_method=options.detection_method):

        nInput += sum([bundle[umi]["count"] for umi in bundle])

        if nOutput % 10000 == 0:
            U.debug("Outputted %i" % nOutput)

        if nInput % 1000000 == 0:
            U.debug("Read %i input reads" % nInput)

        if options.stats:
            # generate pre-dedup stats
            average_distance = get_average_umi_distance(bundle.keys())
            pre_cluster_stats.append(average_distance)
            cluster_size = len(bundle)
            random_umis = read_gn.getUmis(cluster_size)
            average_distance_null = get_average_umi_distance(random_umis)
            pre_cluster_stats_null.append(average_distance_null)

        if options.ignore_umi:
            for umi in bundle:
                nOutput += 1
                outfile.write(bundle[umi]["read"])
        else:
            # set up ClusterAndReducer functor with methods specific to
            # specified options.method
            processor = ClusterAndReducer(options.method)

            # dedup using umis and write out deduped bam
            reads, umis, umi_counts, topologies, nodes = processor(
                bundle, options.threshold,
                options.stats, options.further_stats)

            for read in reads:
                outfile.write(read)
                nOutput += 1

            if options.stats:

                # collect pre-dedupe stats
                stats_pre_df_dict['UMI'].extend(bundle)
                stats_pre_df_dict['counts'].extend(
                    [bundle[UMI]['count'] for UMI in bundle])

                # collect post-dedupe stats
                post_cluster_umis = [x.qname.split("_")[-1] for x in reads]
                stats_post_df_dict['UMI'].extend(umis)
                stats_post_df_dict['counts'].extend(umi_counts)

                average_distance = get_average_umi_distance(
                    post_cluster_umis)
                post_cluster_stats.append(average_distance)

                cluster_size = len(post_cluster_umis)
                random_umis = read_gn.getUmis(cluster_size)
                average_distance_null = get_average_umi_distance(random_umis)
                post_cluster_stats_null.append(average_distance_null)

                if options.further_stats:
                    for c_type, count in topologies.most_common():
                        topology_counts[c_type] += count
                    for c_type, count in nodes.most_common():
                        node_counts[c_type] += count

    outfile.close()

    if options.stats:

        stats_pre_df = pd.DataFrame(stats_pre_df_dict)
        stats_post_df = pd.DataFrame(stats_post_df_dict)

        # generate histograms of counts per UMI at each position
        UMI_counts_df_pre = pd.DataFrame(stats_pre_df.pivot_table(
            columns=stats_pre_df["counts"], values="counts", aggfunc=len))
        UMI_counts_df_post = pd.DataFrame(stats_post_df.pivot_table(
            columns=stats_post_df["counts"], values="counts", aggfunc=len))

        UMI_counts_df_pre.columns = ["instances"]
        UMI_counts_df_post.columns = ["instances"]

        UMI_counts_df = pd.merge(UMI_counts_df_pre, UMI_counts_df_post,
                                 how='left', left_index=True,
                                 right_index=True, sort=True,
                                 suffixes=["_pre", "_post"])

        # TS - if count value not observed either pre/post-dedup,
        # merge will leave an empty cell and the column will be cast as a
        # float; see http://pandas.pydata.org/pandas-docs/dev/missing_data.html
        # --> Missing data casting rules and indexing
        # so, back fill with zeros and convert back to int
        UMI_counts_df = UMI_counts_df.fillna(0).astype(int)

        UMI_counts_df.to_csv(
            options.stats + "_per_umi_per_position.tsv", sep="\t")

        # aggregate stats pre/post per UMI
        agg_pre_df = aggregateStatsDF(stats_pre_df)
        agg_post_df = aggregateStatsDF(stats_post_df)

        agg_df = pd.merge(agg_pre_df, agg_post_df, how='left',
                          left_index=True, right_index=True,
                          sort=True, suffixes=["_pre", "_post"])

        # TS - see comment above regarding missing values
        agg_df = agg_df.fillna(0).astype(int)
        agg_df.to_csv(options.stats + "_per_umi.tsv", sep="\t")

        # bin distances into integer bins
        max_ed = int(max(map(max, [pre_cluster_stats,
                                   post_cluster_stats,
                                   pre_cluster_stats_null,
                                   post_cluster_stats_null])))

        cluster_bins = range(-1, int(max_ed) + 2)

        def bin_clusters(cluster_list, bins=cluster_bins):
            ''' take list of floats and return bins'''
            return np.digitize(cluster_list, bins, right=True)

        def tallyCounts(binned_cluster, max_edit_distance):
            ''' tally counts per bin '''
            return np.bincount(binned_cluster,
                               minlength=max_edit_distance + 3)

        pre_cluster_binned = bin_clusters(pre_cluster_stats)
        post_cluster_binned = bin_clusters(post_cluster_stats)
        pre_cluster_null_binned = bin_clusters(pre_cluster_stats_null)
        post_cluster_null_binned = bin_clusters(post_cluster_stats_null)

        edit_distance_df = pd.DataFrame({
            "unique": tallyCounts(pre_cluster_binned, max_ed),
            "unique_null": tallyCounts(pre_cluster_null_binned, max_ed),
            options.method: tallyCounts(post_cluster_binned, max_ed),
            "%s_null" % options.method: tallyCounts(
                post_cluster_null_binned, max_ed),
            "edit_distance": cluster_bins})

        # TS - set lowest bin (-1) to "Single_UMI"
        edit_distance_df['edit_distance'][0] = "Single_UMI"

        edit_distance_df.to_csv(options.stats + "_edit_distance.tsv",
                                index=False, sep="\t")

        if options.further_stats:
            with U.openFile(options.stats + "_topologies.tsv", "w") as outf:
                outf.write(
                    "\n".join(["\t".join((x, str(y)))
                               for x, y in topology_counts.most_common()])
                    + "\n")

            with U.openFile(options.stats + "_nodes.tsv", "w") as outf:
                outf.write(
                    "\n".join(["\t".join(map(str, (x, y)))
                               for x, y in node_counts.most_common()])
                    + "\n")

    # write footer and output benchmark information.
    U.info("Number of reads in: %i, Number of reads out: %i" %
           (nInput, nOutput))
    U.Stop()
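# Worked sketch of the edit-distance binning used in the stats blocks above
# (values are hypothetical). With max_ed = 2, cluster_bins = [-1, 0, 1, 2, 3];
# the lowest bin is relabelled "Single_UMI", consistent with single-UMI
# bundles reporting an average distance of -1:
#
#     >>> np.digitize([-1, 0.5, 2.0], [-1, 0, 1, 2, 3], right=True)
#     array([0, 2, 3])
#     >>> np.bincount([0, 2, 3], minlength=2 + 3)
#     array([1, 0, 1, 1, 0])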
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    group = U.OptionGroup(parser, "group-specific options")

    group.add_option("--group-out", dest="tsv", type="string",
                     help="Outfile name for file mapping read id to read "
                          "group",
                     default=None)
    group.add_option("--output-bam", dest="output_bam", action="store_true",
                     default=False,
                     help=("output a bam file with read groups tagged using "
                           "the UG tag [default=%default]"))
    group.add_option("--output-unmapped", dest="output_unmapped",
                     action="store_true", default=False,
                     help=("Retain all unmapped reads in output "
                           "[default=%default]"))

    parser.add_option("--umi-group-tag", dest="umi_group_tag",
                      type="string", help="tag for the outputted umi group",
                      default='BX')

    parser.add_option_group(group)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = U.Start(parser, argv=argv)

    U.validateSamOptions(options)

    if options.stdin != sys.stdin:
        in_name = options.stdin.name
        options.stdin.close()
    else:
        raise ValueError("Input on standard in not currently supported")

    if options.stdout != sys.stdout:
        if options.no_sort_output:
            out_name = options.stdout.name
        else:
            out_name = U.getTempFilename()
            sorted_out_name = options.stdout.name
        options.stdout.close()
        assert options.output_bam, (
            "To output a bam you must include --output-bam option")
    else:
        if options.no_sort_output:
            out_name = "-"
        else:
            out_name = U.getTempFilename()
            sorted_out_name = "-"

    if not options.no_sort_output:
        # need to determine the output format for sort
        if options.out_sam:
            sort_format = "sam"
        else:
            sort_format = "bam"

    if options.in_sam:
        in_mode = "r"
    else:
        in_mode = "rb"

    if options.out_sam:
        out_mode = "wh"
    else:
        out_mode = "wb"

    infile = pysam.Samfile(in_name, in_mode)

    if options.output_bam:
        outfile = pysam.Samfile(out_name, out_mode, template=infile)
    else:
        outfile = None

    if options.tsv:
        mapping_outfile = U.openFile(options.tsv, "w")
        mapping_outfile.write("%s\n" % "\t".join(
            ["read_id", "contig", "position", "gene", "umi", "umi_count",
             "final_umi", "final_umi_count", "unique_id"]))

    nInput, nOutput, unique_id, input_reads, output_reads = 0, 0, 0, 0, 0

    gene_tag = options.gene_tag
    metacontig2contig = None

    if options.chrom:
        inreads = infile.fetch(reference=options.chrom)
    else:
        if options.per_gene and options.gene_transcript_map:
            metacontig2contig = umi_methods.getMetaContig2contig(
                infile, options.gene_transcript_map)
            metatag = "MC"
            inreads = umi_methods.metafetcher(infile, metacontig2contig,
                                              metatag)
            gene_tag = metatag
        else:
            inreads = infile.fetch(until_eof=options.output_unmapped)

    bundle_iterator = umi_methods.get_bundles(
        options,
        all_reads=True,
        return_read2=True,
        return_unmapped=options.output_unmapped,
        metacontig_contig=metacontig2contig)

    for bundle, key, status in bundle_iterator(inreads):

        # write out read2s and unmapped (if these options are set)
        if status == 'single_read':
            # bundle is just a single read here
            nInput += 1

            if outfile:
                outfile.write(bundle)

            nOutput += 1
            continue

        umis = bundle.keys()
        counts = {umi: bundle[umi]["count"] for umi in umis}

        nInput += sum(counts.values())

        while nOutput >= output_reads + 10000:
            output_reads += 10000
            U.info("Written out %i reads" % output_reads)

        while nInput >= input_reads + 1000000:
            input_reads += 1000000
            U.info("Parsed %i input reads" % input_reads)

        # set up UMIClusterer functor with methods specific to
        # specified options.method
        processor = network.UMIClusterer(options.method)

        # group the umis
        groups = processor(umis, counts, threshold=options.threshold)

        for umi_group in groups:
            top_umi = umi_group[0]

            group_count = sum(counts[umi] for umi in umi_group)

            for umi in umi_group:
                reads = bundle[umi]['read']
                for read in reads:
                    if outfile:
                        # Add the 'UG' tag to the read
                        read.tags += [('UG', unique_id)]
                        read.tags += [(options.umi_group_tag, top_umi)]
                        outfile.write(read)

                    if options.tsv:
                        if options.per_gene:
                            gene = read.get_tag(gene_tag)
                        else:
                            gene = "NA"
                        mapping_outfile.write("%s\n" % "\t".join(map(str, (
                            read.query_name, read.reference_name,
                            umi_methods.get_read_position(
                                read, options.soft_clip_threshold)[1],
                            gene,
                            umi.decode(),
                            counts[umi],
                            top_umi.decode(),
                            group_count,
                            unique_id))))

                    nOutput += 1

            unique_id += 1

    if outfile:
        outfile.close()

        if not options.no_sort_output:
            # sort the output
            pysam.sort("-o", sorted_out_name, "-O", sort_format, out_name)
            os.unlink(out_name)  # delete the tempfile

    if options.tsv:
        mapping_outfile.close()

    # write footer and output benchmark information.
    U.info("Reads: %s" % ", ".join(
        ["%s: %s" % (x[0], x[1])
         for x in bundle_iterator.read_events.most_common()]))
    U.info("Number of reads out: %i, Number of groups: %i" %
           (nOutput, unique_id))
    U.Stop()
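# Illustrative sketch of the --group-out TSV written above (rows are
# hypothetical): one row per read, mapping it to its UMI group. Here the
# error UMI ATAA is grouped under the representative UMI ATAT and shares
# its unique_id:
#
#     read_id  contig  position  gene  umi   umi_count  final_umi  final_umi_count  unique_id
#     read_a   chr1    1000      NA    ATAT  2          ATAT       3                0
#     read_b   chr1    1000      NA    ATAA  1          ATAT       3                0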
def getUserDefinedBarcodes(whitelist_tsv, whitelist_tsv2=None,
                           getErrorCorrection=False,
                           deriveErrorCorrection=False,
                           threshold=1):
    '''
    whitelist_tsv: tab-separated file with whitelisted barcodes. First
                   field should be whitelist barcodes. Second field
                   [optional] should be comma-separated barcodes which
                   are to be corrected to the barcode in the first field.

    whitelist_tsv2: as above but for read2s

    getErrorCorrection: extract the second field in whitelist_tsv and
                        return a map of non-whitelist:whitelist

    deriveErrorCorrection: return a map of non-whitelist:whitelist using
                           a simple edit distance threshold
    '''

    base2errors = {"A": ["T", "C", "G", "N"],
                   "T": ["A", "C", "G", "N"],
                   "C": ["T", "A", "G", "N"],
                   "G": ["T", "C", "A", "N"]}

    whitelist = []

    if getErrorCorrection or deriveErrorCorrection:
        false_to_true_map = {}
    else:
        false_to_true_map = None

    def singleBarcodeGenerator(whitelist_tsv):
        with U.openFile(whitelist_tsv, "r") as inf:
            for line in inf:
                if line.startswith('#'):
                    continue
                line = line.strip().split("\t")
                yield line[0]

    def pairedBarcodeGenerator(whitelist_tsv, whitelist_tsv2):
        whitelist1 = []
        whitelist2 = []

        with U.openFile(whitelist_tsv, "r") as inf:
            for line in inf:
                if line.startswith('#'):
                    continue
                line = line.strip().split("\t")
                whitelist1.append(line[0])

        with U.openFile(whitelist_tsv2, "r") as inf2:
            for line in inf2:
                if line.startswith('#'):
                    continue
                line = line.strip().split("\t")
                whitelist2.append(line[0])

        for w1, w2 in itertools.product(whitelist1, whitelist2):
            yield w1 + w2

    if deriveErrorCorrection:
        if whitelist_tsv2:
            whitelist_barcodes = pairedBarcodeGenerator(
                whitelist_tsv, whitelist_tsv2)
        else:
            whitelist_barcodes = singleBarcodeGenerator(whitelist_tsv)

        for whitelist_barcode in whitelist_barcodes:
            whitelist.append(whitelist_barcode)

            # for every possible combination of positions for error(s)
            for positions in itertools.product(
                    range(0, len(whitelist_barcode)), repeat=threshold):

                m_bases = [base2errors[whitelist_barcode[x]]
                           for x in positions]

                # for every possible combination of errors
                for m in itertools.product(*m_bases):
                    error_barcode = list(whitelist_barcode)

                    # add errors
                    for pos, error_base in zip(positions, m):
                        error_barcode[pos] = error_base

                    error_barcode = "".join(error_barcode)

                    # if error barcode has already been seen, must be within
                    # threshold edit distance of >1 whitelisted barcodes
                    if error_barcode in false_to_true_map:
                        # don't report multiple times for the same barcode
                        if false_to_true_map[error_barcode]:
                            U.info("Error barcode %s can be assigned to more "
                                   "than one possible true barcode: %s or %s"
                                   % (error_barcode,
                                      false_to_true_map[error_barcode],
                                      whitelist_barcode))
                        false_to_true_map[error_barcode] = None
                    else:
                        false_to_true_map[error_barcode] = whitelist_barcode

    elif getErrorCorrection:
        assert not whitelist_tsv2, (
            "Can only extract errors from the whitelist "
            "if a single whitelist is given")
        with U.openFile(whitelist_tsv, "r") as inf:
            for line in inf:
                if line.startswith('#'):
                    continue
                line = line.strip().split("\t")
                whitelist_barcode = line[0]
                whitelist.append(whitelist_barcode)
                if getErrorCorrection:
                    for error_barcode in line[1].split(","):
                        false_to_true_map[error_barcode] = whitelist_barcode

    else:  # no error correction
        if whitelist_tsv2:
            whitelist_barcodes = pairedBarcodeGenerator(
                whitelist_tsv, whitelist_tsv2)
        else:
            whitelist_barcodes = singleBarcodeGenerator(whitelist_tsv)

        whitelist = [x for x in whitelist_barcodes]

    return set(whitelist), false_to_true_map
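# Illustrative sketch of deriveErrorCorrection (hypothetical input file):
# with threshold=1, every single-base substitution (to one of the three
# other bases or N) of each whitelisted barcode maps back to it, i.e.
# 4 * len(barcode) error barcodes per entry. For a whitelist containing
# only "AC", the returned false_to_true_map sends TC, CC, GC, NC, AT, AA,
# AG and AN to "AC"; error barcodes reachable from more than one whitelist
# entry are set to None and reported once.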
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv # setup command line parser parser = U.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option("-p", "--bc-pattern", dest="pattern", type="string", help="Barcode pattern") parser.add_option("--bc-pattern2", dest="pattern2", type="string", help="Barcode pattern for paired reads") parser.add_option("--3prime", dest="prime3", action="store_true", help="barcode is on 3' end of read.") parser.add_option("--read2-in", dest="read2_in", type="string", help="file name for read pairs") parser.add_option("--read2-out", dest="read2_out", type="string", help="file to output processed paired read to") parser.add_option( "--read2-out-only", dest="read2_out_only", action="store_true", help="Paired reads, only output the second read in the pair") parser.add_option("--quality-filter-threshold", dest="quality_filter_threshold", type="int", help=("Remove reads where any UMI base quality score " "falls below this threshold")) parser.add_option( "--quality-filter-mask", dest="quality_filter_mask", type="int", help=("If a UMI base has a quality below this threshold, " "replace the base with 'N'")) parser.add_option("--quality-encoding", dest="quality_encoding", type="choice", choices=["phred33", "phred64", "solexa"], help=("Quality score encoding. Choose from 'phred33'" "[33-77] 'phred64' [64-106] or 'solexa' [59-106]")) parser.add_option("--extract-method", dest="extract_method", type="choice", choices=["string", "regex"], help=("How to extract the umi +/- cell barcodes, Choose " "from 'string' or 'regex'")) parser.add_option("--filter-cell-barcode", dest="filter_cell_barcode", action="store_true", help="Filter the cell barcodes") parser.add_option("--error-correct-cell", dest="error_correct_cell", action="store_true", help=("Correct errors in the cell barcode")) parser.add_option("--error-correct-threshold", dest="error_correct_threshold", type="int", help=("Hamming distance allowed for correction")) parser.add_option("--plot-prefix", dest="plot_prefix", type="string", help=("Prefix for plots to visualise the automated " "detection of the number of 'true' cell barcodes")) parser.add_option("--output-whitelist", dest="output_whitelist", type="string", help=("Write out the automatically generated whitelist")) parser.add_option("--whitelist-tsv", dest="whitelist_tsv", type="string", help=("A whitelist of accepted cell barcodes")) parser.add_option("--blacklist-tsv", dest="blacklist_tsv", type="string", help=("A blacklist of accepted cell barcodes")) parser.add_option( "--cell-barcode-subset", dest="cell_barcode_subset", type="int", help=("Use only the first N reads to automatically " "identify the true cell barcodes. If N is greater " "than the number of reads, all reads will be used")) parser.add_option("--reads-subset", dest="reads_subset", type="int", help=("Only extract from the first N reads. If N is " "greater than the number of reads, all reads will " "be used")) parser.add_option( "--reconcile-pairs", dest="reconcile", action="store_true", help=("Allow the presences of reads in read2 input that are" "not present in read1 input. 
This allows cell barcode" "filtering of read1s without considering read2s")) parser.set_defaults(extract_method="string", filter_cell_barcodes=False, whitelist_tsv=None, blacklist_tsv=None, error_correct_cell=False, error_correct_threshold=1, pattern=None, pattern2=None, read2_in=None, read2_out=False, read2_out_only=False, quality_filter_threshold=None, quality_encoding=None, plot_prefix=None, output_whitelist=None, cell_barcode_subset=50000000, reconcile=False) # add common options (-h/--help, ...) and parse command line (options, args) = U.Start(parser, argv=argv) if options.quality_filter_threshold or options.quality_filter_mask: if not options.quality_encoding: U.error("must provide a quality encoding (--quality-" "encoding) to filter UMIs by quality (--quality" "-filter-threshold) or mask low quality bases " "with (--quality-filter-mask)") if not options.pattern and not options.pattern2: if not options.read2_in: U.error("Must supply --bc-pattern for single-end") else: U.error("Must supply --bc-pattern and/or --bc-pattern " "if paired-end ") if options.pattern2: if not options.read2_in: U.error("must specify a paired fastq ``--read2-in``") if not options.pattern2: options.pattern2 = options.pattern extract_cell = False extract_umi = False # If the pattern is a regex we can compile the regex(es) prior to # ExtractFilterAndUpdate instantiation if options.extract_method == "regex": if options.pattern: try: options.pattern = regex.compile(options.pattern) except regex.error: U.error("barcode_regex '%s' is not a " "valid regex" % options.pattern) if options.pattern2: try: options.pattern2 = regex.compile(options.barcode_regex2) except regex.Error: U.error("barcode_regex2 '%s' is not a " "valid regex" % options.barcode_regex2) # check whether the regex contains a umi group(s) and cell groups(s) if options.extract_method == "regex": if options.pattern: for group in options.pattern.groupindex: if group.startswith("cell_"): extract_cell = True elif group.startswith("umi_"): extract_umi = True if options.pattern2: for group in options.pattern2.groupindex: if group.startswith("cell_"): extract_cell = True elif group.startswith("umi_"): extract_umi = True # check whether the pattern string contains umi/cell bases elif options.extract_method == "string": if options.pattern: if "C" in options.pattern: extract_cell = True if "N" in options.pattern: extract_umi = True if options.pattern2: if "C" in options.pattern2: extract_cell = True if "N" in options.pattern2: extract_umi = True if options.whitelist_tsv: if options.blacklist_tsv: U.error("Do not supply a blacklist and a whitelist. 
Just " "remove the blacklist barcodes from the whitelist!") if not extract_umi: if options.extract_method == "string": U.error("barcode pattern(s) do not include any umi bases " "(marked with 'Ns') %s, %s" % (options.pattern, options.pattern2)) elif options.extract_method == "regex": U.error("barcode regex(es) do not include any umi groups " "(starting with 'umi_') %s, %s" (options.pattern, options.pattern2)) if options.stdin == sys.stdin: if not options.whitelist_tsv and options.filter_cell_barcode: U.error( "cannot support reading from stdin if correcting cell barcode") read1s = umi_methods.fastqIterate(U.openFile(options.stdin)) else: read1s = umi_methods.fastqIterate(U.openFile(options.stdin.name)) # set up read extractor ReadExtractor = umi_methods.ExtractFilterAndUpdate( options.extract_method, options.pattern, options.pattern2, options.prime3, extract_cell, options.quality_encoding, options.quality_filter_threshold, options.quality_filter_mask, options.filter_cell_barcode) if options.filter_cell_barcode: if (not options.whitelist_tsv) or options.error_correct_cell: cell_barcode_counts = collections.Counter() n_reads = 0 if not options.read2_in: for read1 in read1s: n_reads += 1 cell_barcode = ReadExtractor.getCellBarcode(read1) if cell_barcode: cell_barcode_counts[cell_barcode] += 1 if options.cell_barcode_subset: if (n_reads > options.cell_barcode_subset): break else: read2s = umi_methods.fastqIterate(U.openFile(options.read2_in)) for read1, read2 in izip(read1s, read2s): n_reads += 1 cell_barcode = ReadExtractor.getCellBarcode(read1, read2) if cell_barcode: cell_barcode_counts[cell_barcode] += 1 if options.cell_barcode_subset: if (n_reads > options.cell_barcode_subset): break if options.blacklist_tsv: cell_blacklist = umi_methods.getUserDefinedBarcodes( options.blacklist_tsv) for cell in cell_blacklist: del cell_barcode_counts[cell] if options.whitelist_tsv: cell_whitelist = umi_methods.getUserDefinedBarcodes( options.whitelist_tsv) error_correct_mappings = umi_methods.getErrorCorrectMappings( cell_barcode_counts.keys(), cell_whitelist, options.error_correct_threshold) else: # getCellWhitelist has not been properly defined yet! 
cell_whitelist, error_correct_mappings = umi_methods.getCellWhitelist( cell_barcode_counts, options.error_correct_threshold, options.plot_prefix) # re-make the read1s iterator read1s = umi_methods.fastqIterate(U.openFile(options.stdin.name)) else: cell_whitelist, _ = umi_methods.getUserDefinedBarcodes( options.whitelist_tsv) error_correct_mappings = None, None false_to_true_map, true_to_false_map = error_correct_mappings if options.output_whitelist: with U.openFile(options.output_whitelist, "w") as outf: columns = [ "barcode", "count", "corrected_barcodes", "corrected_barcode_counts" ] outf.write("\t".join(columns) + "\n") for barcode in sorted(list(cell_whitelist)): if true_to_false_map: corrected_barcodes = ",".join( sorted(true_to_false_map[barcode])) corrected_barcode_counts = ",".join( map(str, [ cell_barcode_counts[x] for x in sorted(true_to_false_map[barcode]) ])) else: corrected_barcodes, corrected_barcode_counts = "", "" outf.write("%s\t%s\t%s\t%s\n" % (barcode, cell_barcode_counts[barcode], corrected_barcodes, corrected_barcode_counts)) ReadExtractor.cell_whitelist = cell_whitelist ReadExtractor.false_to_true_map = false_to_true_map if options.read2_in is None: for read in read1s: new_read = ReadExtractor(read) if options.reads_subset: if (ReadExtractor.read_counts['Input Reads'] > options.reads_subset): break if not new_read: continue options.stdout.write(str(new_read) + "\n") else: read2s = umi_methods.fastqIterate(U.openFile(options.read2_in)) if options.read2_out: read2_out = U.openFile(options.read2_out, "w") if options.reconcile: strict = False else: strict = True for read1, read2 in umi_methods.joinedFastqIterate( read1s, read2s, strict): reads = ReadExtractor(read1, read2) if options.reads_subset: if (ReadExtractor.read_counts['Input Reads'] > options.reads_subset): break if not reads: continue else: new_read1, new_read2 = reads if not options.read2_out_only: options.stdout.write(str(new_read1) + "\n") if options.read2_out: read2_out.write(str(new_read2) + "\n") if options.read2_out: read2_out.close() for k, v in ReadExtractor.getReadCounts().most_common(): U.info("%s: %s" % (k, v)) U.Stop()
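# Illustrative note (not part of the original script): the --output-whitelist
# TSV written above contains one row per accepted cell barcode, with the
# columns barcode, count, corrected_barcodes and corrected_barcode_counts.
# A hypothetical row for an accepted barcode AGCTAGCT seen in 100 reads with
# two correctable neighbours might look like:
#
#   AGCTAGCT    100    AGCTAGCA,AGCTAGCG    7,2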
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv # setup command line parser parser = U.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option("-i", "--in-sam", dest="in_sam", action="store_true", help="Input file is in sam format", default=False) parser.add_option("-o", "--out-sam", dest="out_sam", action="store_true", help="Output alignments in sam format", default=False) parser.add_option("--ignore-umi", dest="ignore_umi", action="store_true", help="Ignore UMI and dedup only on position", default=False) parser.add_option("--subset", dest="subset", type="string", help="Use only a fraction of reads, specified by subset", default=1.1) parser.add_option("--spliced-is-unique", dest="spliced", action="store_true", help="Treat a spliced read as different to an unspliced" " one", default=False) parser.add_option("--soft-clip-threshold", dest="soft", type="float", help="number of bases clipped from 5' end before " "read is counted as spliced", default=4) parser.add_option("--edit-distance-threshold", dest="threshold", type="int", help="Edit distance threshold at which to join two UMIs " "when clustering", default=1) parser.add_option("--chrom", dest="chrom", type="string", help="Restrict to one chromosome", default=None) parser.add_option("--paired", dest="paired", action="store_true", default=False, help="Use second-in-pair position when deduping") parser.add_option("--method", dest="method", type="choice", choices=("adjacency", "directional-adjacency", "percentile", "unique", "cluster"), default="directional-adjacency", help="method to use for umi deduping") parser.add_option("--output-stats", dest="stats", type="string", default=False, help="Specify location to output stats") parser.add_option("--further-stats", dest="further_stats", action="store_true", default=False, help="Output further stats") parser.add_option("--per-contig", dest="per_contig", action="store_true", default=False, help=("dedup per contig," " e.g. for transcriptome where contig = gene")) parser.add_option( "--whole-contig", dest="whole_contig", action="store_true", default=False, help= "Read whole contig before outputting bundles: guarantees that no reads " "are missed, but increases memory usage") parser.add_option("--multimapping-detection-method", dest="detection_method", type="choice", choices=("NH", "X0", "XT"), default=None, help=("Some aligners identify multimapping using bam " "tags. Setting this option to NH, X0 or XT will " "use these tags when selecting the best read " "amongst reads with the same position and umi")) parser.add_option("--mapping-quality", dest="mapping_quality", type="int", help="Minimum mapping quality for a read to be retained", default=0) parser.add_option("--read-length", dest="read_length", action="store_true", default=False, help=("use read length in addition to position and UMI " "to identify possible duplicates")) # add common options (-h/--help, ...) 
and parse command line (options, args) = U.Start(parser, argv=argv) if options.stdin != sys.stdin: in_name = options.stdin.name options.stdin.close() else: raise ValueError("Input on standard in not currently supported") if options.stdout != sys.stdout: out_name = options.stdout.name options.stdout.close() else: out_name = "-" if options.in_sam: in_mode = "r" else: in_mode = "rb" if options.out_sam: out_mode = "w" else: out_mode = "wb" if options.stats: if options.ignore_umi: raise ValueError("'--output-stats' and '--ignore-umi' options" " cannot be used together") if options.further_stats: if not options.stats: raise ValueError("'--further-stats' option requires " "'--output-stats' option") if options.method not in ["cluster", "adjacency"]: raise ValueError("'--further-stats' only enabled with 'cluster' " "and 'adjacency' methods") infile = pysam.Samfile(in_name, in_mode) outfile = pysam.Samfile(out_name, out_mode, template=infile) if options.paired: outfile = TwoPassPairWriter(infile, outfile) nInput, nOutput = 0, 0 if options.detection_method: bam_features = detect_bam_features(infile.filename) if not bam_features[options.detection_method]: if sum(bam_features.values()) == 0: raise ValueError( "There are no bam tags available to detect multimapping. " "Do not set --multimapping-detection-method") else: raise ValueError( "The chosen method of detection for multimapping (%s) " "will not work with this bam. Multimapping can be detected" " for this bam using any of the following: %s" % (options.detection_method, ",".join( [x for x in bam_features if bam_features[x]]))) if options.stats: # set up arrays to hold stats data stats_pre_df_dict = {"UMI": [], "counts": []} stats_post_df_dict = {"UMI": [], "counts": []} pre_cluster_stats = [] post_cluster_stats = [] pre_cluster_stats_null = [] post_cluster_stats_null = [] topology_counts = collections.Counter() node_counts = collections.Counter() read_gn = random_read_generator(infile.filename, chrom=options.chrom) for bundle in get_bundles(infile, ignore_umi=options.ignore_umi, subset=float(options.subset), quality_threshold=options.mapping_quality, paired=options.paired, chrom=options.chrom, spliced=options.spliced, soft_clip_threshold=options.soft, per_contig=options.per_contig, whole_contig=options.whole_contig, read_length=options.read_length, detection_method=options.detection_method): nInput += sum([bundle[umi]["count"] for umi in bundle]) if nOutput % 10000 == 0: U.debug("Outputted %i" % nOutput) if nInput % 1000000 == 0: U.debug("Read %i input reads" % nInput) if options.stats: # generate pre-dedup stats average_distance = get_average_umi_distance(bundle.keys()) pre_cluster_stats.append(average_distance) cluster_size = len(bundle) random_umis = read_gn.getUmis(cluster_size) average_distance_null = get_average_umi_distance(random_umis) pre_cluster_stats_null.append(average_distance_null) if options.ignore_umi: for umi in bundle: nOutput += 1 outfile.write(bundle[umi]["read"]) else: # set up ClusterAndReducer functor with methods specific to # specified options.method processor = ClusterAndReducer(options.method) # dedup using umis and write out deduped bam reads, umis, umi_counts, topologies, nodes = processor( bundle, options.threshold, options.stats, options.further_stats) for read in reads: outfile.write(read) nOutput += 1 if options.stats: # collect pre-dedup stats stats_pre_df_dict['UMI'].extend(bundle) stats_pre_df_dict['counts'].extend( [bundle[UMI]['count'] for UMI in bundle]) # collect post-dedup stats post_cluster_umis = 
[x.qname.split("_")[-1] for x in reads] stats_post_df_dict['UMI'].extend(umis) stats_post_df_dict['counts'].extend(umi_counts) average_distance = get_average_umi_distance(post_cluster_umis) post_cluster_stats.append(average_distance) cluster_size = len(post_cluster_umis) random_umis = read_gn.getUmis(cluster_size) average_distance_null = get_average_umi_distance(random_umis) post_cluster_stats_null.append(average_distance_null) if options.further_stats: for c_type, count in topologies.most_common(): topology_counts[c_type] += count for c_type, count in nodes.most_common(): node_counts[c_type] += count outfile.close() if options.stats: stats_pre_df = pd.DataFrame(stats_pre_df_dict) stats_post_df = pd.DataFrame(stats_post_df_dict) # generate histograms of counts per UMI at each position UMI_counts_df_pre = pd.DataFrame( stats_pre_df.pivot_table(columns=stats_pre_df["counts"], values="counts", aggfunc=len)) UMI_counts_df_post = pd.DataFrame( stats_post_df.pivot_table(columns=stats_post_df["counts"], values="counts", aggfunc=len)) UMI_counts_df_pre.columns = ["instances"] UMI_counts_df_post.columns = ["instances"] UMI_counts_df = pd.merge(UMI_counts_df_pre, UMI_counts_df_post, how='left', left_index=True, right_index=True, sort=True, suffixes=["_pre", "_post"]) # TS - if count value not observed either pre/post-dedup, # merge will leave an empty cell and the column will be cast as a float # see http://pandas.pydata.org/pandas-docs/dev/missing_data.html # --> Missing data casting rules and indexing # so, back fill with zeros and convert back to int UMI_counts_df = UMI_counts_df.fillna(0).astype(int) UMI_counts_df.to_csv(options.stats + "_per_umi_per_position.tsv", sep="\t") # aggregate stats pre/post per UMI agg_pre_df = aggregateStatsDF(stats_pre_df) agg_post_df = aggregateStatsDF(stats_post_df) agg_df = pd.merge(agg_pre_df, agg_post_df, how='left', left_index=True, right_index=True, sort=True, suffixes=["_pre", "_post"]) # TS - see comment above regarding missing values agg_df = agg_df.fillna(0).astype(int) agg_df.to_csv(options.stats + "_per_umi.tsv", sep="\t") # bin distances into integer bins max_ed = int( max( map(max, [ pre_cluster_stats, post_cluster_stats, pre_cluster_stats_null, post_cluster_stats_null ]))) cluster_bins = range(-1, int(max_ed) + 2) def bin_clusters(cluster_list, bins=cluster_bins): ''' take list of floats and return bins''' return np.digitize(cluster_list, bins, right=True) def tallyCounts(binned_cluster, max_edit_distance): ''' tally counts per bin ''' return np.bincount(binned_cluster, minlength=max_edit_distance + 3) pre_cluster_binned = bin_clusters(pre_cluster_stats) post_cluster_binned = bin_clusters(post_cluster_stats) pre_cluster_null_binned = bin_clusters(pre_cluster_stats_null) post_cluster_null_binned = bin_clusters(post_cluster_stats_null) edit_distance_df = pd.DataFrame({ "unique": tallyCounts(pre_cluster_binned, max_ed), "unique_null": tallyCounts(pre_cluster_null_binned, max_ed), options.method: tallyCounts(post_cluster_binned, max_ed), "%s_null" % options.method: tallyCounts(post_cluster_null_binned, max_ed), "edit_distance": cluster_bins }) # TS - set lowest bin (-1) to "Single_UMI" edit_distance_df['edit_distance'][0] = "Single_UMI" edit_distance_df.to_csv(options.stats + "_edit_distance.tsv", index=False, sep="\t") if options.further_stats: with U.openFile(options.stats + "_topologies.tsv", "w") as outf: outf.write("\n".join([ "\t".join((x, str(y))) for x, y in topology_counts.most_common() ]) + "\n") with U.openFile(options.stats + "_nodes.tsv", 
"w") as outf: outf.write("\n".join([ "\t".join(map(str, (x, y))) for x, y in node_counts.most_common() ]) + "\n") # write footer and output benchmark information. U.info("Number of reads in: %i, Number of reads out: %i" % (nInput, nOutput)) U.Stop()
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv # setup command line parser parser = U.OptionParser(version="%prog version: $Id$", usage=usage, description=globals()["__doc__"]) group = U.OptionGroup(parser, "whitelist-specific options") group.add_option("--plot-prefix", dest="plot_prefix", type="string", help=("Prefix for plots to visualise the automated " "detection of the number of 'true' cell barcodes")) group.add_option("--subset-reads", dest="subset_reads", type="int", help=("Use the first N reads to automatically identify " "the true cell barcodes. If N is greater than the " "number of reads, all reads will be used. " "Default is 100,000,000")) group.add_option("--error-correct-threshold", dest="error_correct_threshold", type="int", help=("Hamming distance for correction of barcodes to " "whitelist barcodes. This value will also be used " "for error detection above the knee if required " "(--ed-above-threshold)")) group.add_option("--method", dest="method", choices=["reads", "umis"], help=("Use reads or unique umi counts per cell")) group.add_option("--knee-method", dest="knee_method", choices=["distance", "density"], help=("Use distance or density methods for detection of knee")) group.add_option("--expect-cells", dest="expect_cells", type="int", help=("Prior expectation on the upper limit on the " "number of cells sequenced")) group.add_option("--allow-threshold-error", dest="allow_threshold_error", action="store_true", help=("Don't select a threshold. Will still " "output the plots if requested (--plot-prefix)")) group.add_option("--set-cell-number", dest="cell_number", type="int", help=("Specify the number of cell barcodes to accept")) group.add_option("--ed-above-threshold", dest="ed_above_threshold", type="choice", choices=["discard", "correct"], help=("Detect CBs above the threshold which may be " "sequence errors from another CB and either " "'discard' or 'correct'. Default=discard")) parser.add_option_group(group) parser.set_defaults(method="reads", knee_method="distance", extract_method="string", whitelist_tsv=None, blacklist_tsv=None, error_correct_threshold=1, pattern=None, pattern2=None, read2_in=None, plot_prefix=None, subset_reads=100000000, expect_cells=False, allow_threshold_error=False, cell_number=False, ed_above_threshold=None, ignore_suffix=False) # add common options (-h/--help, ...) and parse command line (options, args) = U.Start(parser, argv=argv, add_extract_options=True, add_group_dedup_options=False, add_umi_grouping_options=False, add_sam_options=False) if options.filtered_out and not options.extract_method == "regex": U.error("Reads will not be filtered unless extract method is " "set to regex (--extract-method=regex)") if options.expect_cells: if options.knee_method == "distance": U.error("Cannot use --expect-cells with 'distance' knee " "method. Switch to --knee-method=density if you want to " "provide an expectation for the number of " "cells. 
Alternatively, if you know the number of cell " "barcodes, use --cell-number") if options.cell_number: U.error("Cannot supply both --expect-cells and " "--cell-number options") extract_cell, extract_umi = U.validateExtractOptions(options) if not extract_cell: if options.extract_method == "string": U.error("barcode pattern(s) do not include any cell bases " "(marked with 'Cs') %s, %s" % ( options.pattern, options.pattern2)) elif options.extract_method == "regex": U.error("barcode regex(es) do not include any cell groups " "(starting with 'cell_') %s, %s" % ( options.pattern, options.pattern2)) read1s = umi_methods.fastqIterate(options.stdin) # set up read extractor ReadExtractor = extract_methods.ExtractFilterAndUpdate( method=options.extract_method, pattern=options.pattern, pattern2=options.pattern2, prime3=options.prime3, extract_cell=extract_cell) cell_barcode_counts = collections.Counter() n_reads = 0 n_cell_barcodes = 0 # if using the umis method, need to keep a set of umis observed if options.method == "umis": cell_barcode_umis = collections.defaultdict(set) # variables for progress monitor displayMax = 100000 U.info("Starting barcode extraction") if options.filtered_out: filtered_out = U.openFile(options.filtered_out, "w") if not options.read2_in: for read1 in read1s: # Update display in every 100kth iteration if n_reads % displayMax == 0: U.info("Parsed {} reads".format(n_reads)) n_reads += 1 barcode_values = ReadExtractor.getBarcodes(read1) if barcode_values is None: if options.filtered_out: filtered_out.write(str(read1) + "\n") continue else: cell, umi, _, _, _, _, _ = barcode_values if options.method == "umis": cell_barcode_umis[cell].add(umi) else: cell_barcode_counts[cell] += 1 n_cell_barcodes += 1 if options.subset_reads: if n_cell_barcodes > options.subset_reads: break else: if options.filtered_out2: filtered_out2 = U.openFile(options.filtered_out2, "w") read2s = umi_methods.fastqIterate(U.openFile(options.read2_in)) for read1, read2 in izip(read1s, read2s): # Update display in every 100kth iteration if n_reads % displayMax == 0: U.info("Parsed {} reads".format(n_reads)) n_reads += 1 barcode_values = ReadExtractor.getBarcodes(read1, read2) if barcode_values is None: if options.filtered_out: filtered_out.write(str(read1) + "\n") if options.filtered_out2: filtered_out2.write(str(read2) + "\n") continue else: cell, umi, _, _, _, _, _ = barcode_values if options.method == "umis": cell_barcode_umis[cell].add(umi) else: cell_barcode_counts[cell] += 1 n_cell_barcodes += 1 if options.subset_reads: if n_reads > options.subset_reads: break U.info("Starting whitelist determination") if options.method == "umis": for cell in cell_barcode_umis: cell_barcode_counts[cell] = len(cell_barcode_umis[cell]) if options.cell_number and options.cell_number > len(cell_barcode_counts): raise ValueError( "--set-cell-number option specifies more cell barcodes than the " "number of observed cell barcodes. This may be because " "--subset-reads was set to a value too low to capture reads from " "all cells. %s cell barcodes observed from %s parsed reads. 
" "Expected>= %s cell barcodes" % ( len(cell_barcode_counts), options.subset_reads, options.cell_number)) cell_whitelist, true_to_false_map = whitelist_methods.getCellWhitelist( cell_barcode_counts, options.knee_method, options.expect_cells, options.cell_number, options.error_correct_threshold, options.plot_prefix) if cell_whitelist: U.info("Top %s cell barcodes passed the selected threshold" % len(cell_whitelist)) if options.ed_above_threshold: cell_whitelist, true_to_false_map = whitelist_methods.errorDetectAboveThreshold( cell_barcode_counts, cell_whitelist, true_to_false_map, errors=options.error_correct_threshold, resolution_method=options.ed_above_threshold) if cell_whitelist: U.info("Writing out whitelist") total_correct_barcodes = 0 total_corrected_barcodes = 0 for barcode in sorted(list(cell_whitelist)): total_correct_barcodes += cell_barcode_counts[barcode] if true_to_false_map: corrected_barcodes = ",".join( sorted(true_to_false_map[barcode])) correct_barcode_counts = [cell_barcode_counts[x] for x in sorted(true_to_false_map[barcode])] total_corrected_barcodes += sum(correct_barcode_counts) corrected_barcode_counts = ",".join( map(str, correct_barcode_counts)) else: corrected_barcodes, corrected_barcode_counts = "", "" options.stdout.write("%s\t%s\t%s\t%s\n" % ( barcode, corrected_barcodes, cell_barcode_counts[barcode], corrected_barcode_counts)) else: msg = ("No local minima was accepted. Recommend checking the plot " "output and counts per local minima (requires `--plot-prefix`" "option) and then re-running with manually selected threshold " "(`--set-cell-number` option)") if options.allow_threshold_error: U.info(msg) else: U.error(msg) U.info("Parsed %i reads" % n_reads) U.info("%i reads matched the barcode pattern" % n_cell_barcodes) U.info("Found %i unique cell barcodes" % len(cell_barcode_counts)) if cell_whitelist: U.info("Found %i total reads matching the selected cell barcodes" % total_correct_barcodes) U.info("Found %i total reads which can be error corrected to the " "selected cell barcodes" % total_corrected_barcodes) if options.filtered_out: filtered_out.close() if options.filtered_out2: filtered_out2.close() U.Stop()
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv # setup command line parser parser = U.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option("--split-barcode", dest="split", action="store_true", help="barcode is split across read pair") parser.add_option("-p", "--bc-pattern", dest="pattern", type="string", help="Barcode pattern. Ns are random bases X's fixed") parser.add_option("--bc-pattern2", dest="pattern2", type="string", help="Barcode pattern. Ns are random bases X's fixed") parser.add_option("--read2-in", dest="read2_in", type="string", help="file name for read pairs") parser.add_option("--3prime", dest="prime3", action="store_true", help="barcode is on 3' end of read") parser.add_option("--read2-out", dest="read2_out", type="string", help="file to output processed paired read to") parser.add_option("--quality-filter-threshold", dest="quality_filter_threshold", type="int", help=("Remove reads where any UMI base quality score " "falls below this threshold")) parser.add_option("--quality-encoding", dest="quality_encoding", type="choice", choices=["phred33", "phred64", "solexa"], help=("Quality score encoding. Choose from phred33" "[33-77] phred64 [64-106] or solexa [59-106]")) parser.add_option("--supress-stats", dest="stats", action="store_false", help="Suppress the writing of stats to the log") parser.set_defaults(split=False, pattern=None, pattern2=None, read2_in=None, read2_out=None, prime3=False, stats=True, quality_filter_threshold=None, quality_encoding=None) # add common options (-h/--help, ...) and parse command line (options, args) = U.Start(parser, argv=argv) # check options if not options.pattern: raise ValueError("must specify a pattern using ``--bc-pattern``") if options.split: if not options.read2_in: raise ValueError("must specify a paired fastq ``--read2-in``") if not options.pattern2: options.pattern2 = options.pattern if options.read2_in: if not options.read2_out: raise ValueError("must specify an output for the paired end " "``--read2-out``") if options.quality_filter_threshold: if not options.quality_encoding: raise ValueError("must provide a quality encoding to filter UMIs " "by quality ``--quality-encoding``") # Initialise the processor processor = Extractor(options.pattern, options.pattern2, options.quality_filter_threshold, options.quality_encoding, options.prime3) read1s = fastqIterate(options.stdin) if options.read2_in is None: for read in read1s: new_1 = processor(read) if new_1: options.stdout.write(str(new_1) + "\n") else: read2s = fastqIterate(U.openFile(options.read2_in)) read2_out = U.openFile(options.read2_out, "w") for read1, read2 in izip(read1s, read2s): new_1, new_2 = processor(read1, read2) if new_1: options.stdout.write(str(new_1) + "\n") read2_out.write(str(new_2) + "\n") # write footer and output benchmark information. if options.stats: options.stdlog.write("\t".join(["Barcode", "UMI", "Sample", "Count"]) + "\n") for id in processor.bc_count: options.stdlog.write("\t".join(id + (str(processor.bc_count[id]), )) + "\n") U.Stop()
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv # setup command line parser parser = U.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option("-i", "--in-sam", dest="in_sam", action="store_true", help="Input file is in sam format [default=%default]", default=False) parser.add_option( "-o", "--out-sam", dest="out_sam", action="store_true", help="Output alignments in sam format [default=%default]", default=False) parser.add_option("--umi-separator", dest="umi_sep", type="string", help="separator between read id and UMI", default="_") parser.add_option("--umi-tag", dest="umi_tag", type="string", help="tag containing umi", default='RX') parser.add_option("--umi-group-tag", dest="umi_group_tag", type="string", help="tag for the outputted umi group", default='BX') parser.add_option("--extract-umi-method", dest="get_umi_method", type="choice", choices=("read_id", "tag"), default="read_id", help="where is the read UMI encoded? [default=%default]") parser.add_option("--subset", dest="subset", type="float", help="Use only a fraction of reads, specified by subset", default=None) parser.add_option("--spliced-is-unique", dest="spliced", action="store_true", help="Treat a spliced read as different to an unspliced" " one [default=%default]", default=False) parser.add_option("--soft-clip-threshold", dest="soft", type="float", help="number of bases clipped from 5' end before " "read is counted as spliced [default=%default]", default=4) parser.add_option("--edit-distance-threshold", dest="threshold", type="int", default=1, help="Edit distance threshold at which to join two UMIs " "when clustering. [default=%default]") parser.add_option("--chrom", dest="chrom", type="string", help="Restrict to one chromosome", default=None) parser.add_option("--paired", dest="paired", action="store_true", default=False, help="paired BAM. 
[default=%default]") parser.add_option("--method", dest="method", type="choice", choices=("adjacency", "directional", "unique", "cluster"), default="directional", help="method to use for umi deduping [default=%default]") parser.add_option("--per-contig", dest="per_contig", action="store_true", default=False, help=("dedup per contig (field 3 in BAM; RNAME)," " e.g for transcriptome where contig = gene")) parser.add_option("--per-gene", dest="per_gene", action="store_true", default=False, help=("Deduplicate per gene," "e.g for transcriptome where contig = transcript" "must also provide a transript to gene map with" "--gene-transcript-map [default=%default]")) parser.add_option("--gene-transcript-map", dest="gene_transcript_map", type="string", help="file mapping transcripts to genes (tab separated)", default=None) parser.add_option("--gene-tag", dest="gene_tag", type="string", help=("Deduplicate per gene where gene is" "defined by this bam tag [default=%default]"), default=None) parser.add_option( "--read-length", dest="read_length", action="store_true", default=False, help=("use read length in addition to position and UMI" "to identify possible duplicates [default=%default]")) parser.add_option("--mapping-quality", dest="mapping_quality", type="int", help="Minimum mapping quality for a read to be retained" " [default=%default]", default=0) parser.add_option( "--output-unmapped", dest="output_unmapped", action="store_true", default=False, help=("Retain all unmapped reads in output[default=%default]")) parser.add_option( "--group-out", dest="tsv", type="string", help="Outfile name for file mapping read id to read group", default=None) parser.add_option( "--output-bam", dest="output_bam", action="store_true", default=False, help=("output a bam file with read groups tagged using the UG tag" "[default=%default]")) parser.add_option( "--skip-tags-regex", dest="skip_regex", type="string", help=("Used with --gene-tag. " "Ignore reads where the gene-tag matches this regex"), default="^[__|Unassigned]") # add common options (-h/--help, ...) 
and parse command line (options, args) = U.Start(parser, argv=argv) if options.stdin != sys.stdin: in_name = options.stdin.name options.stdin.close() else: raise ValueError("Input on standard in not currently supported") if options.stdout != sys.stdout: out_name = options.stdout.name options.stdout.close() assert options.output_bam, ( "To output a bam you must include --output-bam option") else: out_name = "-" if options.in_sam: in_mode = "r" else: in_mode = "rb" if options.out_sam: out_mode = "wh" else: out_mode = "wb" if options.per_gene: if not options.gene_transcript_map: raise ValueError( "--per-gene option requires --gene-transcript-map") infile = pysam.Samfile(in_name, in_mode) if options.output_bam: outfile = pysam.Samfile(out_name, out_mode, template=infile) else: outfile = None if options.tsv: mapping_outfile = U.openFile(options.tsv, "w") mapping_outfile.write("%s\n" % "\t".join([ "read_id", "contig", "position", "gene", "umi", "umi_count", "final_umi", "final_umi_count", "unique_id" ])) # set the method with which to extract umis from reads if options.get_umi_method == "read_id": umi_getter = partial(umi_methods.get_umi_read_id, sep=options.umi_sep) elif options.get_umi_method == "tag": umi_getter = partial(umi_methods.get_umi_tag, tag=options.umi_tag) else: raise ValueError("Unknown umi extraction method") nInput, nOutput, unique_id = 0, 0, 0 if options.chrom: inreads = infile.fetch(reference=options.chrom) gene_tag = options.gene_tag else: if options.per_gene and options.gene_transcript_map: metacontig2contig = umi_methods.getMetaContig2contig( infile, options.gene_transcript_map) metatag = "MC" inreads = umi_methods.metafetcher(infile, metacontig2contig, metatag) gene_tag = metatag else: inreads = infile.fetch(until_eof=options.output_unmapped) gene_tag = options.gene_tag for bundle, read_events, status in umi_methods.get_bundles( inreads, ignore_umi=False, subset=options.subset, quality_threshold=options.mapping_quality, paired=options.paired, spliced=options.spliced, soft_clip_threshold=options.soft, per_contig=options.per_contig, gene_tag=gene_tag, skip_regex=options.skip_regex, read_length=options.read_length, umi_getter=umi_getter, all_reads=True, return_read2=True, return_unmapped=options.output_unmapped): # write out read2s and unmapped if option set if status == 'single_read': # bundle is just a single read here outfile.write(bundle) nInput += 1 nOutput += 1 continue umis = bundle.keys() counts = {umi: bundle[umi]["count"] for umi in umis} nInput += sum(counts.values()) if nOutput % 10000 == 0: U.debug("Outputted %i" % nOutput) if nInput % 1000000 == 0: U.debug("Read %i input reads" % nInput) # set up UMIClusterer functor with methods specific to # specified options.method processor = network.UMIClusterer(options.method) # group the umis groups = processor(umis, counts, threshold=options.threshold) for umi_group in groups: top_umi = umi_group[0] group_count = sum(counts[umi] for umi in umi_group) for umi in umi_group: reads = bundle[umi]['read'] for read in reads: if outfile: # Add the 'UG' tag to the read read.tags += [('UG', unique_id)] read.tags += [(options.umi_group_tag, top_umi)] outfile.write(read) if options.tsv: if options.per_gene: gene = read.get_tag(gene_tag) else: gene = "NA" mapping_outfile.write("%s\n" % "\t".join( map(str, (read.query_name, read.reference_name, umi_methods.get_read_position( read, options.soft)[1], gene, umi.decode(), counts[umi], top_umi.decode(), group_count, unique_id)))) nOutput += 1 unique_id += 1 if outfile: outfile.close() if 
options.tsv: mapping_outfile.close() # write footer and output benchmark information. U.info( "Reads: %s" % ", ".join(["%s: %s" % (x[0], x[1]) for x in read_events.most_common()])) U.info("Number of reads out: %i, Number of groups: %i" % (nOutput, unique_id)) U.Stop()
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv # setup command line parser parser = U.OptionParser(version="%prog version: $Id$", usage=usage, description=globals()["__doc__"]) group = U.OptionGroup(parser, "extract-specific options") # (Experimental option) Retain the UMI in the sequence read group.add_option("--retain-umi", dest="retain_umi", action="store_true", help=optparse.SUPPRESS_HELP) group.add_option("--read2-out", dest="read2_out", type="string", help="file to output processed paired read to") group.add_option("--read2-stdout", dest="read2_stdout", action="store_true", help="Paired reads, send read2 to stdout, discarding read1") group.add_option("--quality-filter-threshold", dest="quality_filter_threshold", type="int", help=("Remove reads where any UMI base quality score " "falls below this threshold")) group.add_option("--quality-filter-mask", dest="quality_filter_mask", type="int", help=("If a UMI base has a quality below this threshold, " "replace the base with 'N'")) group.add_option("--quality-encoding", dest="quality_encoding", type="choice", choices=["phred33", "phred64", "solexa"], help=("Quality score encoding. Choose from 'phred33' " "[33-77] 'phred64' [64-106] or 'solexa' [59-106]")) group.add_option("--filter-cell-barcode", dest="filter_cell_barcode", action="store_true", help=optparse.SUPPRESS_HELP) group.add_option("--error-correct-cell", dest="error_correct_cell", action="store_true", help=("Correct errors in the cell barcode")) group.add_option("--whitelist", dest="whitelist", type="string", help=("A whitelist of accepted cell barcodes")) group.add_option("--blacklist", dest="blacklist", type="string", help=("A blacklist of rejected cell barcodes")) group.add_option("--filter-umi", dest="filter_umi", action="store_true", #help="Filter the UMIs" help=optparse.SUPPRESS_HELP) group.add_option("--umi-whitelist", dest="umi_whitelist", type="string", default=None, #help="A whitelist of accepted UMIs [default=%default]" help=optparse.SUPPRESS_HELP) group.add_option("--umi-whitelist-paired", dest="umi_whitelist_paired", type="string", default=None, #help="A whitelist of accepted UMIs for read2[default=%default]" help=optparse.SUPPRESS_HELP) group.add_option("--correct-umi-threshold", dest="correct_umi_threshold", type="int", default=0, #help="Correct errors in UMIs to the whitelist(s) provided" #"if within threshold [default=%default]" help=optparse.SUPPRESS_HELP) group.add_option("--umi-correct-log", dest="umi_correct_log", type="string", default=None, #help="File logging UMI error correction", help=optparse.SUPPRESS_HELP) group.add_option("--subset-reads", "--reads-subset", dest="reads_subset", type="int", help=("Only extract from the first N reads. If N is " "greater than the number of reads, all reads will " "be used")) group.add_option("--reconcile-pairs", dest="reconcile", action="store_true", help=("Allow the presence of reads in read2 input that " "are not present in read1 input. 
This allows cell " "barcode filtering of read1s without " "considering read2s")) parser.add_option_group(group) group = U.OptionGroup(parser, "[EXPERIMENTAl] barcode extraction options") group.add_option("--either-read", dest="either_read", action="store_true", help="UMI may be on either read (see " "--either-read-resolve) for options to resolve cases where" "UMI is on both reads") group.add_option("--either-read-resolve", dest="either_read_resolve", type="choice", choices=["discard", "quality"], help=("How to resolve instances where both reads " "contain a UMI but using --either-read." "Choose from 'discard' or 'quality'" "(use highest quality). default=dicard")) parser.add_option_group(group) parser.set_defaults(extract_method="string", filter_cell_barcodes=False, whitelist=None, blacklist=None, error_correct_cell=False, pattern=None, pattern2=None, read2_in=None, read2_out=False, read2_stdout=False, quality_filter_threshold=None, quality_encoding=None, reconcile=False, either_read=False, either_read_resolve="discard", ignore_suffix=False) # add common options (-h/--help, ...) and parse command line (options, args) = U.Start(parser, argv=argv, add_extract_options=True, add_group_dedup_options=False, add_umi_grouping_options=False, add_sam_options=False) if options.filter_cell_barcode: U.info('Use of --whitelist ensures cell barcodes are filtered. ' '--filter-cell-barcode is no longer required and may be ' 'removed in future versions.') if options.whitelist is not None: options.filter_cell_barcode = True if options.retain_umi and not options.extract_method == "regex": U.error("option --retain-umi only works with --extract-method=regex") if (options.filtered_out and not options.extract_method == "regex" and whitelist is None): U.error("Reads will not be filtered unless extract method is" "set to regex (--extract-method=regex) or cell" "barcodes are filtered (--whitelist)") if options.quality_filter_threshold or options.quality_filter_mask: if not options.quality_encoding: U.error("must provide a quality encoding (--quality-" "encoding) to filter UMIs by quality (--quality" "-filter-threshold) or mask low quality bases " "with (--quality-filter-mask)") extract_cell, extract_umi = U.validateExtractOptions(options) if options.either_read: if extract_cell: U.error("Option to extract from either read (--either-read) " "is not currently compatible with cell barcode extraction") if not options.extract_method == "regex": U.error("Option to extract from either read (--either-read)" "requires --extract-method=regex") if not options.pattern or not options.pattern2: U.error("Option to extract from either read (--either-read)" "requires --bc-pattern=[PATTERN1] and" "--bc-pattern2=[PATTERN2]") if options.filter_umi: if not options.umi_whitelist: U.error("must provide a UMI whitelist (--umi-whitelist) if using " "--filter-umi option") if options.pattern2 and not options.umi_whitelist_paired: U.error("must provide a UMI whitelist for paired end " "(--umi-whitelist-paired) if using --filter-umi option" "with paired end data") if not extract_umi: if options.extract_method == "string": U.error("barcode pattern(s) do not include any umi bases " "(marked with 'Ns') %s, %s" % ( options.pattern, options.pattern2)) elif options.extract_method == "regex": U.error("barcode regex(es) do not include any umi groups " "(starting with 'umi_') %s, %s" ( options.pattern, options.pattern2)) if options.whitelist: if not extract_cell: if options.extract_method == "string": U.error("barcode pattern(s) do not include any cell 
bases " "(marked with 'Cs') %s, %s" % ( options.pattern, options.pattern2)) elif options.extract_method == "regex": U.error("barcode regex(es) do not include any cell groups " "(starting with 'cell_') %s, %s" ( options.pattern, options.pattern2)) read1s = umi_methods.fastqIterate(options.stdin) # set up read extractor ReadExtractor = extract_methods.ExtractFilterAndUpdate( options.extract_method, options.pattern, options.pattern2, options.prime3, extract_cell, options.quality_encoding, options.quality_filter_threshold, options.quality_filter_mask, options.filter_umi, options.filter_cell_barcode, options.retain_umi, options.either_read, options.either_read_resolve) if options.filter_umi: umi_whitelist, false_to_true_map = whitelist_methods.getUserDefinedBarcodes( options.umi_whitelist, options.umi_whitelist_paired, deriveErrorCorrection=True, threshold=options.correct_umi_threshold) U.info("Length of whitelist: %i" % len(umi_whitelist)) U.info("Length of 'correctable' whitelist: %i" % len(false_to_true_map)) ReadExtractor.umi_whitelist = umi_whitelist ReadExtractor.umi_false_to_true_map = false_to_true_map ReadExtractor.umi_whitelist_counts = collections.defaultdict( lambda: collections.Counter()) if options.whitelist: cell_whitelist, false_to_true_map = whitelist_methods.getUserDefinedBarcodes( options.whitelist, getErrorCorrection=options.error_correct_cell) ReadExtractor.cell_whitelist = cell_whitelist ReadExtractor.false_to_true_map = false_to_true_map if options.blacklist: blacklist = set() with U.openFile(options.blacklist, "r") as inf: for line in inf: blacklist.add(line.strip().split("\t")[0]) ReadExtractor.cell_blacklist = blacklist # variables for progress monitor progCount = 0 displayMax = 100000 U.info("Starting barcode extraction") if options.filtered_out: filtered_out = U.openFile(options.filtered_out, "w") if options.read2_in is None: for read in read1s: # incrementing count for monitoring progress progCount += 1 # Update display in every 100kth iteration if progCount % displayMax == 0: U.info("Parsed {} reads".format(progCount)) new_read = ReadExtractor(read) if options.reads_subset: if (ReadExtractor.read_counts['Input Reads'] > options.reads_subset): break if not new_read: if options.filtered_out: filtered_out.write(str(read) + "\n") continue options.stdout.write(str(new_read) + "\n") else: if options.filtered_out2: filtered_out2 = U.openFile(options.filtered_out2, "w") read2s = umi_methods.fastqIterate(U.openFile(options.read2_in)) if options.read2_out: read2_out = U.openFile(options.read2_out, "w") if options.reconcile: strict = False else: strict = True for read1, read2 in umi_methods.joinedFastqIterate( read1s, read2s, strict, options.ignore_suffix): # incrementing count for monitoring progress progCount += 1 # Update display in every 100kth iteration if progCount % displayMax == 0: U.info("Parsed {} reads".format(progCount)) sys.stdout.flush() reads = ReadExtractor(read1, read2) if options.reads_subset: if (ReadExtractor.read_counts['Input Reads'] > options.reads_subset): break if not reads: if options.filtered_out: filtered_out.write(str(read1) + "\n") if options.filtered_out2: filtered_out2.write(str(read2) + "\n") continue else: new_read1, new_read2 = reads if options.read2_stdout: options.stdout.write(str(new_read2) + "\n") else: options.stdout.write(str(new_read1) + "\n") if options.read2_out: read2_out.write(str(new_read2) + "\n") if options.read2_out: read2_out.close() if options.filtered_out: filtered_out.close() if options.filtered_out2: filtered_out2.close() 
for k, v in ReadExtractor.getReadCounts().most_common(): U.info("%s: %s" % (k, v)) if options.umi_correct_log: with U.openFile(options.umi_correct_log, "w") as outf: outf.write("umi\tcount_no_errors\tcount_errors\n") for umi, counts in ReadExtractor.umi_whitelist_counts.items(): outf.write("%s\t%i\t%i\n" % ( umi, counts["no_error"], counts["error"])) U.Stop()
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv # setup command line parser parser = U.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) group = U.OptionGroup(parser, "group-specific options") group.add_option( "--group-out", dest="tsv", type="string", help="Outfile name for file mapping read id to read group", default=None) group.add_option( "--output-bam", dest="output_bam", action="store_true", default=False, help=("output a bam file with read groups tagged using the UG tag " "[default=%default]")) group.add_option( "--output-unmapped", dest="output_unmapped", action="store_true", default=False, help=("Retain all unmapped reads in output [default=%default]")) parser.add_option("--umi-group-tag", dest="umi_group_tag", type="string", help="tag for the outputted umi group", default='BX') parser.add_option_group(group) # add common options (-h/--help, ...) and parse command line (options, args) = U.Start(parser, argv=argv) U.validateSamOptions(options) if options.stdin != sys.stdin: in_name = options.stdin.name options.stdin.close() else: raise ValueError("Input on standard in not currently supported") if options.stdout != sys.stdout: if options.no_sort_output: out_name = options.stdout.name else: out_name = U.getTempFilename() sorted_out_name = options.stdout.name options.stdout.close() assert options.output_bam, ( "To output a bam you must include --output-bam option") else: if options.no_sort_output: out_name = "-" else: out_name = U.getTempFilename() sorted_out_name = "-" if not options.no_sort_output: # need to determine the output format for sort if options.out_sam: sort_format = "sam" else: sort_format = "bam" if options.in_sam: in_mode = "r" else: in_mode = "rb" if options.out_sam: out_mode = "wh" else: out_mode = "wb" infile = pysam.Samfile(in_name, in_mode) if options.output_bam: outfile = pysam.Samfile(out_name, out_mode, template=infile) else: outfile = None if options.tsv: mapping_outfile = U.openFile(options.tsv, "w") mapping_outfile.write("%s\n" % "\t".join([ "read_id", "contig", "position", "gene", "umi", "umi_count", "final_umi", "final_umi_count", "unique_id" ])) nInput, nOutput, unique_id, input_reads, output_reads = 0, 0, 0, 0, 0 gene_tag = options.gene_tag metacontig2contig = None if options.chrom: inreads = infile.fetch(reference=options.chrom) else: if options.per_gene and options.gene_transcript_map: metacontig2contig = umi_methods.getMetaContig2contig( infile, options.gene_transcript_map) metatag = "MC" inreads = umi_methods.metafetcher(infile, metacontig2contig, metatag) gene_tag = metatag else: inreads = infile.fetch(until_eof=options.output_unmapped) bundle_iterator = umi_methods.get_bundles( options, all_reads=True, return_read2=True, return_unmapped=options.output_unmapped, metacontig_contig=metacontig2contig) for bundle, key, status in bundle_iterator(inreads): # write out read2s and unmapped (if these options are set) if status == 'single_read': # bundle is just a single read here nInput += 1 if outfile: outfile.write(bundle) nOutput += 1 continue umis = bundle.keys() counts = {umi: bundle[umi]["count"] for umi in umis} nInput += sum(counts.values()) while nOutput >= output_reads + 10000: output_reads += 10000 U.info("Written out %i reads" % output_reads) while nInput >= input_reads + 1000000: input_reads += 1000000 U.info("Parsed %i input reads" % input_reads) # set up UMIClusterer functor with methods specific to # specified options.method 
processor = network.UMIClusterer(options.method) # group the umis groups = processor(umis, counts, threshold=options.threshold) for umi_group in groups: top_umi = umi_group[0] group_count = sum(counts[umi] for umi in umi_group) for umi in umi_group: reads = bundle[umi]['read'] for read in reads: if outfile: # Add the 'UG' tag to the read read.tags += [('UG', unique_id)] read.tags += [(options.umi_group_tag, top_umi)] outfile.write(read) if options.tsv: if options.per_gene: gene = read.get_tag(gene_tag) else: gene = "NA" mapping_outfile.write("%s\n" % "\t".join( map(str, (read.query_name, read.reference_name, umi_methods.get_read_position( read, options.soft_clip_threshold)[1], gene, umi.decode(), counts[umi], top_umi.decode(), group_count, unique_id)))) nOutput += 1 unique_id += 1 if outfile: outfile.close() if not options.no_sort_output: # sort the output pysam.sort("-o", sorted_out_name, "-O", sort_format, out_name) os.unlink(out_name) # delete the tempfile if options.tsv: mapping_outfile.close() # write footer and output benchmark information. U.info("Reads: %s" % ", ".join([ "%s: %s" % (x[0], x[1]) for x in bundle_iterator.read_events.most_common() ])) U.info("Number of reads out: %i, Number of groups: %i" % (nOutput, unique_id)) U.Stop()
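# Hedged usage sketch (not part of the original script): the UMIClusterer
# functor used above is constructed with the chosen method name and called
# with the bundle's UMIs and their counts; the toy UMIs, counts and the
# commented return value below are invented and only indicative.
def _umi_clusterer_sketch():
    counts = {b"ATAT": 10, b"ATAA": 1}  # hypothetical UMI counts for one bundle
    processor = network.UMIClusterer("directional")
    # groups is a list of UMI lists, each headed by the selected "true" UMI,
    # e.g. [[b"ATAT", b"ATAA"]] when the error UMI is merged into the top UMI
    groups = processor(counts.keys(), counts, threshold=1)
    return groups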
def getKneeEstimateDensity(cell_barcode_counts, expect_cells=False, cell_number=False, plotfile_prefix=None): ''' estimate the number of "true" cell barcodes using a Gaussian density-based method input: cell_barcode_counts = dict(key = barcode, value = count) expect_cells (optional) = define the expected number of cells cell_number (optional) = define number of cell barcodes to accept plotfile_prefix = (optional) prefix for plots returns: List of true barcodes ''' # very low abundance cell barcodes are filtered out (< 0.001 * # the most abundant) threshold = 0.001 * cell_barcode_counts.most_common(1)[0][1] counts = sorted(cell_barcode_counts.values(), reverse=True) counts_thresh = [x for x in counts if x > threshold] log_counts = np.log10(counts_thresh) # Gaussian density with hardcoded bandwidth density = gaussian_kde(log_counts, bw_method=0.1) xx_values = 10000 # how many x values for density plot xx = np.linspace(log_counts.min(), log_counts.max(), xx_values) local_min = None if cell_number: # we have a prior hard expectation on the number of cells threshold = counts[cell_number] else: local_mins = argrelextrema(density(xx), np.less)[0] local_mins_counts = [] for poss_local_min in local_mins[::-1]: passing_threshold = sum([ y > np.power(10, xx[poss_local_min]) for x, y in cell_barcode_counts.items() ]) local_mins_counts.append(passing_threshold) if not local_min: # if we haven't yet selected a local min if expect_cells: # we have a "soft" expectation if (passing_threshold > expect_cells * 0.1 and passing_threshold <= expect_cells): local_min = poss_local_min else: # we have no prior expectation # TS: In the absence of any expectation (either hard or soft), # this set of heuristic thresholds is used to decide # which local minimum to select. # This is very unlikely to be the best way to achieve this! 
if (poss_local_min >= 0.2 * xx_values and (log_counts.max() - xx[poss_local_min] > 0.5 or xx[poss_local_min] < log_counts.max() / 2)): local_min = poss_local_min if local_min is not None: threshold = np.power(10, xx[local_min]) if cell_number or local_min is not None: final_barcodes = set( [x for x, y in cell_barcode_counts.items() if y > threshold]) else: final_barcodes = None if plotfile_prefix: # colour-blind friendly colours - https://gist.github.com/thriveth/8560036 CB_color_cycle = [ '#377eb8', '#ff7f00', '#4daf4a', '#f781bf', '#a65628', '#984ea3', '#999999', '#e41a1c', '#dede00' ] user_line = mlines.Line2D([], [], color=CB_color_cycle[0], ls="dashed", markersize=15, label='User-defined') selected_line = mlines.Line2D([], [], color=CB_color_cycle[0], ls="dashed", markersize=15, label='Selected') rejected_line = mlines.Line2D([], [], color=CB_color_cycle[3], ls="dashed", markersize=15, label='Rejected') # make density plot fig = plt.figure() fig1 = fig.add_subplot(111) fig1.plot(xx, density(xx), 'k') fig1.set_xlabel("Count per cell (log10)") fig1.set_ylabel("Density") if cell_number: fig1.axvline(np.log10(threshold), ls="dashed", color=CB_color_cycle[0]) lgd = fig1.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., handles=[user_line], title="Cell threshold") elif local_min is None: # no local_min was accepted for pos in xx[local_mins]: fig1.axvline(x=pos, ls="dashed", color=CB_color_cycle[3]) lgd = fig1.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., handles=[selected_line, rejected_line], title="Possible thresholds") else: for pos in xx[local_mins]: if pos == xx[local_min]: # selected local minima fig1.axvline(x=xx[local_min], ls="dashed", color=CB_color_cycle[0]) else: fig1.axvline(x=pos, ls="dashed", color=CB_color_cycle[3]) lgd = fig1.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., handles=[selected_line, rejected_line], title="Possible thresholds") fig.savefig("%s_cell_barcode_count_density.png" % plotfile_prefix, bbox_extra_artists=(lgd, ), bbox_inches='tight') # make knee plot fig = plt.figure() fig2 = fig.add_subplot(111) fig2.plot(range(0, len(counts)), np.cumsum(counts), c="black") xmax = len(counts) if local_min is not None: # reasonable maximum x-axis value xmax = min(len(final_barcodes) * 5, xmax) fig2.set_xlim((0 - (0.01 * xmax), xmax)) fig2.set_xlabel("Rank") fig2.set_ylabel("Cumulative count") if cell_number: fig2.axvline(x=cell_number, ls="dashed", color=CB_color_cycle[0]) lgd = fig2.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., handles=[user_line], title="Cell threshold") elif local_min is None: # no local_min was accepted for local_mins_count in local_mins_counts: fig2.axvline(x=local_mins_count, ls="dashed", color=CB_color_cycle[3]) lgd = fig2.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., handles=[selected_line, rejected_line], title="Possible thresholds") else: for local_mins_count in local_mins_counts: if local_mins_count == len( final_barcodes): # selected local minima fig2.axvline(x=local_mins_count, ls="dashed", color=CB_color_cycle[0]) else: fig2.axvline(x=local_mins_count, ls="dashed", color=CB_color_cycle[3]) lgd = fig2.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., handles=[selected_line, rejected_line], title="Possible thresholds") fig.savefig("%s_cell_barcode_knee.png" % plotfile_prefix, bbox_extra_artists=(lgd, ), bbox_inches='tight') if local_min is not None: colours_selected = [ CB_color_cycle[0] for x in range(0, len(final_barcodes)) ] colours_rejected = [ "black" for x in range(0, 
len(counts) - len(final_barcodes)) ] colours = colours_selected + colours_rejected else: colours = ["black" for x in range(0, len(counts))] fig = plt.figure() fig3 = fig.add_subplot(111) fig3.scatter(x=range(1, len(counts) + 1), y=counts, c=colours, s=10, linewidths=0) fig3.loglog() fig3.set_xlim(0, len(counts) * 1.25) fig3.set_xlabel('Barcode index') fig3.set_ylabel('Count') if cell_number: fig3.axvline(x=cell_number, ls="dashed", color=CB_color_cycle[0]) lgd = fig3.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., handles=[user_line], title="Cell threshold") elif local_min is None: # no local_min was accepted for local_mins_count in local_mins_counts: fig3.axvline(x=local_mins_count, ls="dashed", color=CB_color_cycle[3]) lgd = fig3.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., handles=[selected_line, rejected_line], title="Possible thresholds") else: for local_mins_count in local_mins_counts: if local_mins_count == len( final_barcodes): # selected local minimum fig3.axvline(x=local_mins_count, ls="dashed", color=CB_color_cycle[0]) else: fig3.axvline(x=local_mins_count, ls="dashed", color=CB_color_cycle[3]) lgd = fig3.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., handles=[selected_line, rejected_line], title="Possible thresholds") fig.savefig("%s_cell_barcode_counts.png" % plotfile_prefix, bbox_extra_artists=(lgd, ), bbox_inches='tight') if not cell_number: with U.openFile("%s_cell_thresholds.tsv" % plotfile_prefix, "w") as outf: outf.write("count\taction\n") for local_mins_count in local_mins_counts: if local_min is not None and local_mins_count == len(final_barcodes): threshold_type = "Selected" else: threshold_type = "Rejected" outf.write("%s\t%s\n" % (local_mins_count, threshold_type)) return final_barcodes
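# Hedged usage sketch (not part of the original module): getKneeEstimateDensity
# expects a Counter of per-barcode counts. The synthetic counts below are
# invented; with no hard (cell_number) or soft (expect_cells) expectation the
# heuristics above choose a local minimum, and the function may return None if
# no minimum is accepted.
def _knee_estimate_sketch():
    import collections
    import numpy as np
    rng = np.random.RandomState(0)
    counts = collections.Counter()
    for i in range(500):  # 500 "real" cells with high counts
        counts["CELL%04d" % i] = int(rng.lognormal(8, 0.3))
    for i in range(5000):  # 5000 background barcodes with low counts
        counts["BG%05d" % i] = int(rng.lognormal(2, 0.5)) + 1
    return getKneeEstimateDensity(counts, plotfile_prefix=None)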
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv # setup command line parser parser = U.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option("-p", "--bc-pattern", dest="pattern", type="string", help="Barcode pattern") parser.add_option("--bc-pattern2", dest="pattern2", type="string", help="Barcode pattern for paired reads") parser.add_option("--3prime", dest="prime3", action="store_true", help="barcode is on 3' end of read.") parser.add_option("--read2-in", dest="read2_in", type="string", help="file name for read pairs") parser.add_option("--read2-out", dest="read2_out", type="string", help="file to output processed paired read to") parser.add_option("--read2-stdout", dest="read2_stdout", action="store_true", help="Paired reads, send read2 to stdout, discarding read1") parser.add_option("--quality-filter-threshold", dest="quality_filter_threshold", type="int", help=("Remove reads where any UMI base quality score " "falls below this threshold")) parser.add_option("--quality-filter-mask", dest="quality_filter_mask", type="int", help=("If a UMI base has a quality below this threshold, " "replace the base with 'N'")) parser.add_option("--quality-encoding", dest="quality_encoding", type="choice", choices=["phred33", "phred64", "solexa"], help=("Quality score encoding. Choose from 'phred33' " "[33-77] 'phred64' [64-106] or 'solexa' [59-106]")) parser.add_option("--extract-method", dest="extract_method", type="choice", choices=["string", "regex"], help=("How to extract the umi +/- cell barcodes. Choose " "from 'string' or 'regex'")) parser.add_option("--filter-cell-barcode", dest="filter_cell_barcode", action="store_true", help="Filter the cell barcodes") parser.add_option("--error-correct-cell", dest="error_correct_cell", action="store_true", help=("Correct errors in the cell barcode")) parser.add_option("--whitelist", dest="whitelist", type="string", help=("A whitelist of accepted cell barcodes")) parser.add_option("--blacklist", dest="blacklist", type="string", help=("A blacklist of rejected cell barcodes")) parser.add_option("--reads-subset", dest="reads_subset", type="int", help=("Only extract from the first N reads. If N is " "greater than the number of reads, all reads will " "be used")) parser.add_option("--reconcile-pairs", dest="reconcile", action="store_true", help=("Allow the presence of reads in read2 input that are " "not present in read1 input. This allows cell barcode " "filtering of read1s without considering read2s")) parser.set_defaults(extract_method="string", filter_cell_barcodes=False, whitelist=None, blacklist=None, error_correct_cell=False, pattern=None, pattern2=None, read2_in=None, read2_out=False, read2_stdout=False, quality_filter_threshold=None, quality_encoding=None, reconcile=False) # add common options (-h/--help, ...) 
and parse command line (options, args) = U.Start(parser, argv=argv, add_group_dedup_options=False, add_sam_options=False) if options.quality_filter_threshold or options.quality_filter_mask: if not options.quality_encoding: U.error("must provide a quality encoding (--quality-" "encoding) to filter UMIs by quality (--quality" "-filter-threshold) or mask low quality bases " "with (--quality-filter-mask)") if not options.pattern and not options.pattern2: if not options.read2_in: U.error("Must supply --bc-pattern for single-end") else: U.error("Must supply --bc-pattern and/or --bc-pattern " "if paired-end ") if options.pattern2: if not options.read2_in: U.error("must specify a paired fastq ``--read2-in``") if not options.pattern2: options.pattern2 = options.pattern extract_cell = False extract_umi = False # If the pattern is a regex we can compile the regex(es) prior to # ExtractFilterAndUpdate instantiation if options.extract_method == "regex": if options.pattern: try: options.pattern = regex.compile(options.pattern) except regex.error: U.error("barcode_regex '%s' is not a " "valid regex" % options.pattern) if options.pattern2: try: options.pattern2 = regex.compile(options.pattern2) except regex.Error: U.error("barcode_regex2 '%s' is not a " "valid regex" % options.pattern2) # check whether the regex contains a umi group(s) and cell groups(s) if options.extract_method == "regex": if options.pattern: for group in options.pattern.groupindex: if group.startswith("cell_"): extract_cell = True elif group.startswith("umi_"): extract_umi = True if options.pattern2: for group in options.pattern2.groupindex: if group.startswith("cell_"): extract_cell = True elif group.startswith("umi_"): extract_umi = True # check whether the pattern string contains umi/cell bases elif options.extract_method == "string": if options.pattern: if "C" in options.pattern: extract_cell = True if "N" in options.pattern: extract_umi = True if options.pattern2: if "C" in options.pattern2: extract_cell = True if "N" in options.pattern2: extract_umi = True if not extract_umi: if options.extract_method == "string": U.error("barcode pattern(s) do not include any umi bases " "(marked with 'Ns') %s, %s" % ( options.pattern, options.pattern2)) elif options.extract_method == "regex": U.error("barcode regex(es) do not include any umi groups " "(starting with 'umi_') %s, %s" ( options.pattern, options.pattern2)) if options.filter_cell_barcodes: if not options.whitelist: U.error("must provide a whitelist (--whitelist) if using " "--filter-cell-barcode option") if not extract_cell: if options.extract_method == "string": U.error("barcode pattern(s) do not include any cell bases " "(marked with 'Cs') %s, %s" % ( options.pattern, options.pattern2)) elif options.extract_method == "regex": U.error("barcode regex(es) do not include any cell groups " "(starting with 'cell_') %s, %s" ( options.pattern, options.pattern2)) read1s = umi_methods.fastqIterate(options.stdin) # set up read extractor ReadExtractor = umi_methods.ExtractFilterAndUpdate( options.extract_method, options.pattern, options.pattern2, options.prime3, extract_cell, options.quality_encoding, options.quality_filter_threshold, options.quality_filter_mask, options.filter_cell_barcode) if options.filter_cell_barcode: cell_whitelist, false_to_true_map = umi_methods.getUserDefinedBarcodes( options.whitelist, options.error_correct_cell) ReadExtractor.cell_whitelist = cell_whitelist ReadExtractor.false_to_true_map = false_to_true_map if options.blacklist: blacklist = set() with 
U.openFile(options.blacklist, "r") as inf: for line in inf: blacklist.add(line.strip().split("\t")[0]) ReadExtractor.cell_blacklist = blacklist # variables for progress monitor progCount = 0 displayMax = 100000 U.info("Starting barcode extraction") if options.read2_in is None: for read in read1s: # incrementing count for monitoring progress progCount += 1 # Update display in every 100kth iteration if progCount % displayMax == 0: U.info("Parsed {} reads".format(progCount)) new_read = ReadExtractor(read) if options.reads_subset: if (ReadExtractor.read_counts['Input Reads'] > options.reads_subset): break if not new_read: continue options.stdout.write(str(new_read) + "\n") else: read2s = umi_methods.fastqIterate(U.openFile(options.read2_in)) if options.read2_out: read2_out = U.openFile(options.read2_out, "w") if options.reconcile: strict = False else: strict = True for read1, read2 in umi_methods.joinedFastqIterate( read1s, read2s, strict): # incrementing count for monitoring progress progCount += 1 # Update display in every 100kth iteration if progCount % displayMax == 0: U.info("Parsed {} reads".format(progCount)) sys.stdout.flush() reads = ReadExtractor(read1, read2) if options.reads_subset: if (ReadExtractor.read_counts['Input Reads'] > options.reads_subset): break if not reads: continue else: new_read1, new_read2 = reads if options.read2_stdout: options.stdout.write(str(new_read2) + "\n") else: options.stdout.write(str(new_read1) + "\n") if options.read2_out: read2_out.write(str(new_read2) + "\n") if options.read2_out: read2_out.close() for k, v in ReadExtractor.getReadCounts().most_common(): U.info("%s: %s" % (k, v)) U.Stop()
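# Example invocation (a sketch; the flags are those defined in main() above,
# but the file names and the 16bp-cell/10bp-UMI pattern are made up for
# illustration): extract cell barcodes and UMIs from read1, filtering against
# a known whitelist:
#
#   umi_tools extract --extract-method=string \
#       --bc-pattern=CCCCCCCCCCCCCCCCNNNNNNNNNN \
#       --stdin=read1.fastq.gz --read2-in=read2.fastq.gz \
#       --stdout=read1.extracted.fastq.gz \
#       --read2-out=read2.extracted.fastq.gz \
#       --filter-cell-barcode --whitelist=whitelist.tsv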
def getKneeEstimateDistance(cell_barcode_counts,
                            cell_number=False,
                            plotfile_prefix=None):
    ''' estimate the number of "true" cell barcodes via a knee method
    which finds the point with maximum distance

    input:
         cell_barcode_counts = dict(key = barcode, value = count)
         cell_number (optional) = define number of cell barcodes to accept
         plotfile_prefix = (optional) prefix for plots

    returns:
         List of true barcodes
    '''

    def getKneeDistance(values):
        '''
        This function is based on
        https://stackoverflow.com/questions/2018178/finding-the-best-trade-off-point-on-a-curve
        and
        https://dataplatform.cloud.ibm.com/analytics/notebooks/54d79c2a-f155-40ec-93ec-ed05b58afa39/view?access_token=6d8ec910cf2a1b3901c721fcb94638563cd646fe14400fecbb76cea6aaae2fb1

        The idea is to draw a line from the first to last point on the
        cumulative counts curve and then find the point on the curve
        which is the maximum distance away from this line
        '''

        # get coordinates of all the points
        nPoints = len(values)
        allCoord = np.vstack((range(nPoints), values)).T

        # get the first point
        firstPoint = allCoord[0]
        # get vector between first and last point - this is the line
        lineVec = allCoord[-1] - allCoord[0]
        lineVecNorm = lineVec / np.sqrt(np.sum(lineVec**2))

        # find the distance from each point to the line:
        # vector between all points and first point
        vecFromFirst = allCoord - firstPoint

        # To calculate the distance to the line, we split vecFromFirst into
        # two components, one that is parallel to the line and one that is
        # perpendicular. Then, we take the norm of the part that is
        # perpendicular to the line and get the distance.
        # We find the vector parallel to the line by projecting vecFromFirst
        # onto the line. The perpendicular vector is
        # vecFromFirst - vecFromFirstParallel.
        # We project vecFromFirst by taking the scalar product of the vector
        # with the unit vector that points in the direction of the line
        # (this gives us the length of the projection of vecFromFirst onto
        # the line). If we multiply the scalar product by the unit vector,
        # we have vecFromFirstParallel
        scalarProduct = np.sum(
            vecFromFirst * npm.repmat(lineVecNorm, nPoints, 1), axis=1)
        vecFromFirstParallel = np.outer(scalarProduct, lineVecNorm)
        vecToLine = vecFromFirst - vecFromFirstParallel

        # distance to line is the norm of vecToLine
        distToLine = np.sqrt(np.sum(vecToLine**2, axis=1))

        # knee/elbow is the point with max distance value
        idxOfBestPoint = np.argmax(distToLine)

        return (distToLine, idxOfBestPoint)

    counts = [x[1] for x in cell_barcode_counts.most_common()]
    values = list(np.cumsum(counts))

    # We need to perform the distance knee iteratively with a reduced
    # number of CBs since it's sensitive to the number of CBs input
    # and overestimates if too many CBs are used
    previous_idxOfBestPoint = 0
    distToLine, idxOfBestPoint = getKneeDistance(values)
    if idxOfBestPoint == 0:
        raise ValueError("Something's gone wrong here!!")

    max_iterations = 100
    iterations = 0
    while idxOfBestPoint - previous_idxOfBestPoint != 0:
        previous_idxOfBestPoint = idxOfBestPoint
        iterations += 1
        if iterations > max_iterations:
            break
        distToLine, idxOfBestPoint = getKneeDistance(
            values[:idxOfBestPoint * 3])

    knee_final_barcodes = [
        x[0] for x in cell_barcode_counts.most_common()[:idxOfBestPoint + 1]]

    if cell_number:
        threshold = counts[cell_number]
        final_barcodes = set(
            [x for x, y in cell_barcode_counts.items() if y > threshold])
    else:
        final_barcodes = knee_final_barcodes

    if plotfile_prefix:
        # colour-blind friendly colours - https://gist.github.com/thriveth/8560036
        CB_color_cycle = ['#377eb8', '#ff7f00', '#4daf4a', '#f781bf',
                          '#a65628', '#984ea3', '#999999', '#e41a1c',
                          '#dede00']

        user_line = mlines.Line2D([], [], color=CB_color_cycle[2],
                                  ls="dashed", markersize=15,
                                  label='User-defined')
        selected_line = mlines.Line2D([], [], color=CB_color_cycle[0],
                                      ls="dashed", markersize=15,
                                      label='Knee')

        # plot of the original curve and its corresponding distances
        plt.figure(figsize=(12, 6))
        plt.plot(distToLine, label='Distance', color='r')
        plt.plot(values, label='Cumulative', color='b')
        plt.plot([idxOfBestPoint], values[idxOfBestPoint],
                 marker='o', markersize=8, color="red", label='Knee')

        if cell_number:
            plt.axvline(x=cell_number, ls="dashed",
                        color=CB_color_cycle[2], label="User-defined")

        plt.legend()
        plt.savefig("%s_cell_barcode_knee.png" % plotfile_prefix)

        colours_selected = [CB_color_cycle[0]
                            for x in range(0, len(final_barcodes))]
        colours_rejected = ["black"
                            for x in range(0,
                                           len(counts) - len(final_barcodes))]
        colours = colours_selected + colours_rejected

        fig = plt.figure()
        fig3 = fig.add_subplot(111)
        fig3.scatter(x=range(1, len(counts) + 1), y=counts,
                     c=colours, s=10, linewidths=0)
        fig3.loglog()
        fig3.set_xlim(0, len(counts) * 1.25)
        fig3.set_xlabel('Barcode index')
        fig3.set_ylabel('Count')
        fig3.axvline(x=len(knee_final_barcodes), ls="dashed",
                     color=CB_color_cycle[0])

        if cell_number:
            fig3.axvline(x=cell_number, ls="dashed", color=CB_color_cycle[2])
            lgd = fig3.legend(bbox_to_anchor=(1.05, 1), loc=2,
                              borderaxespad=0.,
                              handles=[selected_line, user_line],
                              title="User threshold")
        else:
            lgd = fig3.legend(bbox_to_anchor=(1.05, 1), loc=2,
                              borderaxespad=0.,
                              handles=[selected_line],
                              title="Knee threshold")

        fig.savefig("%s_cell_barcode_counts.png" % plotfile_prefix,
                    bbox_extra_artists=(lgd,), bbox_inches='tight')

        if not cell_number:
            with U.openFile("%s_cell_thresholds.tsv" % plotfile_prefix,
                            "w") as outf:
                outf.write("count\n")
                outf.write("%s\n" % idxOfBestPoint)

    return final_barcodes
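# A self-contained worked example of the distance-to-line knee above. It
# re-derives the same projection as getKneeDistance (using a dot product in
# place of npm.repmat) on made-up counts, so it can be run without the rest
# of this module; the toy data and helper name are illustrative only:

import numpy as np

def _knee_demo():
    # steep drop followed by a shallow tail: the knee should land at index 2
    counts = [1000, 900, 800, 50, 40, 30, 20, 10]
    values = np.cumsum(counts)
    n = len(values)
    coords = np.vstack((range(n), values)).T
    # unit vector along the line from the first to the last point
    line_vec = coords[-1] - coords[0]
    line_unit = line_vec / np.sqrt(np.sum(line_vec**2))
    from_first = coords - coords[0]
    # project each point onto the line, keep the perpendicular component
    parallel = np.outer(from_first.dot(line_unit), line_unit)
    dist = np.sqrt(np.sum((from_first - parallel)**2, axis=1))
    return int(np.argmax(dist))  # 2, i.e. the first three barcodes pass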
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """
    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=usage,
                            description=globals()["__doc__"])
    group = U.OptionGroup(parser, "dedup-specific options")
    group.add_option("--output-stats", dest="stats", type="string",
                     default=False,
                     help="Specify location to output stats")
    parser.add_option_group(group)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = U.Start(parser, argv=argv)

    U.validateSamOptions(options, group=False)

    if options.random_seed:
        np.random.seed(options.random_seed)

    if options.stdin != sys.stdin:
        in_name = options.stdin.name
        options.stdin.close()
    else:
        raise ValueError("Input on standard in not currently supported")

    if options.stdout != sys.stdout:
        if options.no_sort_output:
            out_name = options.stdout.name
        else:
            out_name = U.getTempFilename(dir=options.tmpdir)
            sorted_out_name = options.stdout.name
        options.stdout.close()
    else:
        if options.no_sort_output:
            out_name = "-"
        else:
            out_name = U.getTempFilename(dir=options.tmpdir)
            sorted_out_name = "-"

    if not options.no_sort_output:
        # need to determine the output format for sort
        if options.out_sam:
            sort_format = "sam"
        else:
            sort_format = "bam"

    if options.in_sam:
        in_mode = "r"
    else:
        in_mode = "rb"

    if options.out_sam:
        out_mode = "wh"
    else:
        out_mode = "wb"

    if options.stats and options.ignore_umi:
        raise ValueError("'--output-stats' and '--ignore-umi' options"
                         " cannot be used together")

    infile = pysam.Samfile(in_name, in_mode)
    outfile = pysam.Samfile(out_name, out_mode, template=infile)

    if options.paired:
        outfile = sam_methods.TwoPassPairWriter(infile, outfile)

    nInput, nOutput, input_reads, output_reads = 0, 0, 0, 0

    if options.detection_method:
        bam_features = detect_bam_features(infile.filename)
        if not bam_features[options.detection_method]:
            if sum(bam_features.values()) == 0:
                raise ValueError(
                    "There are no bam tags available to detect multimapping. "
                    "Do not set --multimapping-detection-method")
            else:
                raise ValueError(
                    "The chosen method of detection for multimapping (%s) "
                    "will not work with this bam. Multimapping can be "
                    "detected for this bam using any of the following: %s" % (
                        options.detection_method, ",".join(
                            [x for x in bam_features if bam_features[x]])))

    gene_tag = options.gene_tag
    metacontig2contig = None

    if options.chrom:
        inreads = infile.fetch(reference=options.chrom)
    else:
        if options.per_contig and options.gene_transcript_map:
            metacontig2contig = sam_methods.getMetaContig2contig(
                infile, options.gene_transcript_map)
            metatag = "MC"
            inreads = sam_methods.metafetcher(infile, metacontig2contig,
                                              metatag)
            gene_tag = metatag
        else:
            inreads = infile.fetch()

    # set up ReadCluster functor with methods specific to
    # specified options.method
    processor = network.ReadDeduplicator(options.method)

    bundle_iterator = sam_methods.get_bundles(
        options, metacontig_contig=metacontig2contig)

    if options.stats:
        # set up arrays to hold stats data
        stats_pre_df_dict = {"UMI": [], "counts": []}
        stats_post_df_dict = {"UMI": [], "counts": []}
        pre_cluster_stats = []
        post_cluster_stats = []
        pre_cluster_stats_null = []
        post_cluster_stats_null = []
        topology_counts = collections.Counter()
        node_counts = collections.Counter()
        read_gn = umi_methods.random_read_generator(
            infile.filename, chrom=options.chrom,
            barcode_getter=bundle_iterator.barcode_getter)

    for bundle, key, status in bundle_iterator(inreads):

        nInput += sum([bundle[umi]["count"] for umi in bundle])

        while nOutput >= output_reads + 100000:
            output_reads += 100000
            U.info("Written out %i reads" % output_reads)

        while nInput >= input_reads + 1000000:
            input_reads += 1000000
            U.info("Parsed %i input reads" % input_reads)

        if options.stats:
            # generate pre-dedup stats
            average_distance = umi_methods.get_average_umi_distance(
                bundle.keys())
            pre_cluster_stats.append(average_distance)
            cluster_size = len(bundle)
            random_umis = read_gn.getUmis(cluster_size)
            average_distance_null = umi_methods.get_average_umi_distance(
                random_umis)
            pre_cluster_stats_null.append(average_distance_null)

        if options.ignore_umi:
            for umi in bundle:
                nOutput += 1
                outfile.write(bundle[umi]["read"])
        else:
            # dedup using umis and write out deduped bam
            reads, umis, umi_counts = processor(bundle=bundle,
                                                threshold=options.threshold)
            for read in reads:
                outfile.write(read)
                nOutput += 1

            if options.stats:
                # collect pre-dedup stats
                stats_pre_df_dict['UMI'].extend(bundle)
                stats_pre_df_dict['counts'].extend(
                    [bundle[UMI]['count'] for UMI in bundle])

                # collect post-dedup stats
                post_cluster_umis = [
                    bundle_iterator.barcode_getter(x)[0] for x in reads]
                stats_post_df_dict['UMI'].extend(umis)
                stats_post_df_dict['counts'].extend(umi_counts)

                average_distance = umi_methods.get_average_umi_distance(
                    post_cluster_umis)
                post_cluster_stats.append(average_distance)

                cluster_size = len(post_cluster_umis)
                random_umis = read_gn.getUmis(cluster_size)
                average_distance_null = umi_methods.get_average_umi_distance(
                    random_umis)
                post_cluster_stats_null.append(average_distance_null)

    outfile.close()

    if not options.no_sort_output:
        # sort the output
        pysam.sort("-o", sorted_out_name, "-O", sort_format, out_name)
        os.unlink(out_name)  # delete the tempfile

    if options.stats:
        # generate the stats dataframe
        stats_pre_df = pd.DataFrame(stats_pre_df_dict)
        stats_post_df = pd.DataFrame(stats_post_df_dict)

        # tally the counts per umi per position
        pre_counts = collections.Counter(stats_pre_df["counts"])
        post_counts = collections.Counter(stats_post_df["counts"])
        counts_index = list(
            set(pre_counts.keys()).union(set(post_counts.keys())))
        counts_index.sort()

        with U.openFile(options.stats + "_per_umi_per_position.tsv",
                        "w") as outf:
            outf.write("counts\tinstances_pre\tinstances_post\n")
            for count in counts_index:
                values = (count, pre_counts[count], post_counts[count])
                outf.write("\t".join(map(str, values)) + "\n")

        # aggregate stats pre/post per UMI
        agg_pre_df = aggregateStatsDF(stats_pre_df)
        agg_post_df = aggregateStatsDF(stats_post_df)

        agg_df = pd.merge(agg_pre_df, agg_post_df, how='left',
                          left_index=True, right_index=True,
                          sort=True, suffixes=["_pre", "_post"])

        # TS - if count value not observed either pre/post-dedup,
        # merge will leave an empty cell and the column will be cast as a
        # float see http://pandas.pydata.org/pandas-docs/dev/missing_data.html
        # --> Missing data casting rules and indexing
        # so, back fill with zeros and convert back to int
        agg_df = agg_df.fillna(0).astype(int)

        agg_df.index = [x.decode() for x in agg_df.index]
        agg_df.index.name = 'UMI'
        agg_df.to_csv(options.stats + "_per_umi.tsv", sep="\t")

        # bin distances into integer bins
        max_ed = int(max(map(max, [pre_cluster_stats,
                                   post_cluster_stats,
                                   pre_cluster_stats_null,
                                   post_cluster_stats_null])))

        cluster_bins = range(-1, int(max_ed) + 2)

        def bin_clusters(cluster_list, bins=cluster_bins):
            ''' take list of floats and return bins'''
            return np.digitize(cluster_list, bins, right=True)

        def tallyCounts(binned_cluster, max_edit_distance):
            ''' tally counts per bin '''
            return np.bincount(binned_cluster,
                               minlength=max_edit_distance + 3)

        pre_cluster_binned = bin_clusters(pre_cluster_stats)
        post_cluster_binned = bin_clusters(post_cluster_stats)
        pre_cluster_null_binned = bin_clusters(pre_cluster_stats_null)
        post_cluster_null_binned = bin_clusters(post_cluster_stats_null)

        edit_distance_df = pd.DataFrame(
            {"unique": tallyCounts(pre_cluster_binned, max_ed),
             "unique_null": tallyCounts(pre_cluster_null_binned, max_ed),
             options.method: tallyCounts(post_cluster_binned, max_ed),
             "%s_null" % options.method: tallyCounts(
                 post_cluster_null_binned, max_ed),
             "edit_distance": cluster_bins},
            columns=["unique", "unique_null", options.method,
                     "%s_null" % options.method, "edit_distance"])

        # TS - set lowest bin (-1) to "Single_UMI"
        edit_distance_df['edit_distance'][0] = "Single_UMI"

        edit_distance_df.to_csv(options.stats + "_edit_distance.tsv",
                                index=False, sep="\t")

    # write footer and output benchmark information.
    U.info("Reads: %s" % ", ".join(
        ["%s: %s" % (x[0], x[1])
         for x in bundle_iterator.read_events.most_common()]))

    U.info("Number of reads out: %i" % nOutput)

    if not options.ignore_umi:  # otherwise processor has not been used
        U.info("Total number of positions deduplicated: %i" %
               processor.UMIClusterer.positions)
        if processor.UMIClusterer.positions > 0:
            U.info("Mean number of unique UMIs per position: %.2f" %
                   (float(processor.UMIClusterer.total_umis_per_position) /
                    processor.UMIClusterer.positions))
            U.info("Max. number of unique UMIs per position: %i" %
                   processor.UMIClusterer.max_umis_per_position)
        else:
            U.warn("The BAM did not contain any valid "
                   "reads/read pairs for deduplication")

    U.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """
    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])
    parser.add_option("-p", "--bc-pattern", dest="pattern", type="string",
                      help="Barcode pattern")
    parser.add_option("--bc-pattern2", dest="pattern2", type="string",
                      help="Barcode pattern for paired reads")
    parser.add_option("--3prime", dest="prime3", action="store_true",
                      help="barcode is on 3' end of read.")
    parser.add_option("--read2-in", dest="read2_in", type="string",
                      help="file name for read pairs")
    parser.add_option("--extract-method",
                      dest="extract_method", type="choice",
                      choices=["string", "regex"],
                      help=("How to extract the umi +/- cell barcodes. "
                            "Choose from 'string' or 'regex'"))
    parser.add_option("--plot-prefix",
                      dest="plot_prefix", type="string",
                      help=("Prefix for plots to visualise the automated "
                            "detection of the number of 'true' cell "
                            "barcodes"))
    parser.add_option("--subset-reads",
                      dest="subset_reads", type="int",
                      help=("Use the first N reads to automatically identify "
                            "the true cell barcodes. If N is greater than "
                            "the number of reads, all reads will be used"))
    parser.add_option("--error-correct-threshold",
                      dest="error_correct_threshold", type="int",
                      help=("Hamming distance for correction of barcodes to "
                            "whitelist barcodes"))
    parser.add_option("--method",
                      dest="method", choices=["reads", "umis"],
                      help="Use reads or unique umi counts per cell")
    parser.add_option("--expect-cells",
                      dest="expect_cells", type="int",
                      help=("Prior expectation on the upper limit on the "
                            "number of cells sequenced"))
    parser.add_option("--set-cell-number",
                      dest="cell_number", type="int",
                      help="Specify the number of cell barcodes to accept")
    parser.set_defaults(method="reads",
                        extract_method="string",
                        filter_cell_barcodes=False,
                        whitelist_tsv=None,
                        blacklist_tsv=None,
                        error_correct_threshold=1,
                        pattern=None,
                        pattern2=None,
                        read2_in=None,
                        plot_prefix=None,
                        subset_reads=100000000,
                        expect_cells=False,
                        cell_number=False)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = U.Start(parser, argv=argv,
                              add_group_dedup_options=False,
                              add_sam_options=False)

    if options.expect_cells and options.cell_number:
        U.error("Cannot supply both --expect-cells and "
                "--set-cell-number options")

    if not options.pattern and not options.pattern2:
        if not options.read2_in:
            U.error("Must supply --bc-pattern for single-end")
        else:
            U.error("Must supply --bc-pattern and/or --bc-pattern2 "
                    "if paired-end")

    if options.pattern2:
        if not options.read2_in:
            U.error("must specify a paired fastq ``--read2-in``")

    if not options.pattern2:
        options.pattern2 = options.pattern

    extract_cell = False
    extract_umi = False

    # If the pattern is a regex we can compile the regex(es) prior to
    # ExtractFilterAndUpdate instantiation
    if options.extract_method == "regex":
        if options.pattern:
            try:
                options.pattern = regex.compile(options.pattern)
            except regex.error:
                U.error("barcode_regex '%s' is not a "
                        "valid regex" % options.pattern)
        if options.pattern2:
            try:
                options.pattern2 = regex.compile(options.pattern2)
            except regex.error:
                U.error("barcode_regex2 '%s' is not a "
                        "valid regex" % options.pattern2)

    # check whether the regex contains umi group(s) and cell group(s)
    if options.extract_method == "regex":
        if options.pattern:
            for group in options.pattern.groupindex:
                if group.startswith("cell_"):
                    extract_cell = True
                elif group.startswith("umi_"):
                    extract_umi = True
        if options.pattern2:
            for group in options.pattern2.groupindex:
                if group.startswith("cell_"):
                    extract_cell = True
                elif group.startswith("umi_"):
                    extract_umi = True

    # check whether the pattern string contains umi/cell bases
    elif options.extract_method == "string":
        if options.pattern:
            if "C" in options.pattern:
                extract_cell = True
            if "N" in options.pattern:
                extract_umi = True
        if options.pattern2:
            if "C" in options.pattern2:
                extract_cell = True
            if "N" in options.pattern2:
                extract_umi = True

    if not extract_umi:
        if options.extract_method == "string":
            U.error("barcode pattern(s) do not include any umi bases "
                    "(marked with 'Ns') %s, %s" % (
                        options.pattern, options.pattern2))
        elif options.extract_method == "regex":
            U.error("barcode regex(es) do not include any umi groups "
                    "(starting with 'umi_') %s, %s" % (
                        options.pattern, options.pattern2))

    if not extract_cell:
        if options.extract_method == "string":
            U.error("barcode pattern(s) do not include any cell bases "
                    "(marked with 'Cs') %s, %s" % (
                        options.pattern, options.pattern2))
        elif options.extract_method == "regex":
            U.error("barcode regex(es) do not include any cell groups "
                    "(starting with 'cell_') %s, %s" % (
                        options.pattern, options.pattern2))

    read1s = umi_methods.fastqIterate(options.stdin)

    # set up read extractor
    ReadExtractor = umi_methods.ExtractFilterAndUpdate(
        method=options.extract_method,
        pattern=options.pattern,
        pattern2=options.pattern2,
        prime3=options.prime3,
        extract_cell=extract_cell)

    cell_barcode_counts = collections.Counter()
    n_reads = 0
    n_cell_barcodes = 0

    # if using the umis method, need to keep a set of umis observed
    if options.method == "umis":
        cell_barcode_umis = collections.defaultdict(set)

    # variables for progress monitor
    displayMax = 100000
    U.info("Starting barcode extraction")

    if not options.read2_in:
        for read1 in read1s:
            # update display every 100kth iteration
            if n_reads % displayMax == 0:
                U.info("Parsed {} reads".format(n_reads))

            n_reads += 1
            barcode_values = ReadExtractor.getBarcodes(read1)

            if barcode_values is None:
                continue
            else:
                cell, umi, _, _, _, _, _ = barcode_values
                if options.method == "umis":
                    cell_barcode_umis[cell].add(umi)
                else:
                    cell_barcode_counts[cell] += 1
                n_cell_barcodes += 1

            if options.subset_reads:
                if n_cell_barcodes > options.subset_reads:
                    break
    else:
        read2s = umi_methods.fastqIterate(U.openFile(options.read2_in))

        for read1, read2 in izip(read1s, read2s):
            # update display every 100kth iteration
            if n_reads % displayMax == 0:
                U.info("Parsed {} reads".format(n_reads))

            n_reads += 1
            barcode_values = ReadExtractor.getBarcodes(read1, read2)

            if barcode_values is None:
                continue
            else:
                cell, umi, _, _, _, _, _ = barcode_values
                if options.method == "umis":
                    cell_barcode_umis[cell].add(umi)
                else:
                    cell_barcode_counts[cell] += 1
                n_cell_barcodes += 1

            if options.subset_reads:
                if n_reads > options.subset_reads:
                    break

    U.info("Starting - whitelist determination")

    if options.method == "umis":
        for cell in cell_barcode_umis:
            cell_barcode_counts[cell] = len(cell_barcode_umis[cell])

    if options.cell_number and options.cell_number > len(cell_barcode_counts):
        raise ValueError(
            "--set-cell-number option specifies more cell barcodes than the "
            "number of observed cell barcodes. This may be because "
            "--subset-reads was set to a value too low to capture reads from "
            "all cells. %s cell barcodes observed from %s parsed reads. "
            "Expected >= %s cell barcodes" % (
                len(cell_barcode_counts), options.subset_reads,
                options.cell_number))

    cell_whitelist, true_to_false_map = umi_methods.getCellWhitelist(
        cell_barcode_counts,
        options.expect_cells,
        options.cell_number,
        options.error_correct_threshold,
        options.plot_prefix)

    U.info("Writing out whitelist")
    for barcode in sorted(list(cell_whitelist)):

        if true_to_false_map:
            corrected_barcodes = ",".join(
                sorted(true_to_false_map[barcode]))
            corrected_barcode_counts = ",".join(
                map(str, [cell_barcode_counts[x] for x
                          in sorted(true_to_false_map[barcode])]))
        else:
            corrected_barcodes, corrected_barcode_counts = "", ""

        options.stdout.write("%s\t%s\t%s\t%s\n" % (
            barcode, corrected_barcodes, cell_barcode_counts[barcode],
            corrected_barcode_counts))

    U.info("Parsed %i reads" % n_reads)
    U.info("%i reads matched the barcode pattern" % n_cell_barcodes)
    U.info("Found %i unique cell barcodes" % len(cell_barcode_counts))

    U.Stop()
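# Example invocation (a sketch; the flags are those defined in main() above,
# but the file names and the 16bp-cell/10bp-UMI pattern are made up for
# illustration): estimate the cell barcode whitelist from read1, counting
# unique UMIs per cell and saving knee plots with a "whitelist" prefix; the
# four-column whitelist TSV written above goes to stdout:
#
#   umi_tools whitelist --bc-pattern=CCCCCCCCCCCCCCCCNNNNNNNNNN \
#       --stdin=read1.fastq.gz --method=umis \
#       --plot-prefix=whitelist > whitelist.tsv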