Code example #1
    def write_mates(self):
        '''Scan the current chromosome for matches to any of the reads stored
        in the read1s buffer'''
        if self.chrom is not None:
            U.debug("Dumping %i mates for contig %s" %
                    (len(self.read1s), self.chrom))

        for read in self.infile.fetch(reference=self.chrom,
                                      multiple_iterators=True):
            if any((read.is_unmapped, read.mate_is_unmapped, read.is_read1)):
                continue

            key = read.query_name, read.reference_name, read.reference_start
            if key in self.read1s:
                if self.read2tags is not None:
                    unique_id, umi = self.read2tags[key]
                    self.read2tags.pop(key)

                    read.tags += [('UG', unique_id)]
                    read.tags += [('FU', umi)]

                self.outfile.write(read)
                self.read1s.remove(key)

        U.debug("%i mates remaining" % len(self.read1s))
Code example #2
File: dedup.py Project: messersc/UMI-tools
    def write_mates(self):
        '''Scan the current chromosome for matches to any of the reads stored
        in the read1s buffer'''

        if self.chrom is not None:
            U.debug("Dumping %i mates for contig %s" % (
                len(self.read1s), self.infile.get_reference_name(self.chrom)))

        for read in self.infile.fetch(tid=self.chrom, multiple_iterators=True):
            if any((read.is_unmapped, read.mate_is_unmapped, read.is_read1)):
                continue

            key = read.query_name, read.reference_id, read.reference_start
            if key in self.read1s:
                self.outfile.write(read)
                self.read1s.remove(key)
        U.debug("%i mates remaining" % len(self.read1s))
Code example #3
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-i",
                      "--in-sam",
                      dest="in_sam",
                      action="store_true",
                      help="Input file is in sam format [default=%default]",
                      default=False)
    parser.add_option(
        "-o",
        "--out-sam",
        dest="out_sam",
        action="store_true",
        help="Output alignments in sam format [default=%default]",
        default=False)
    parser.add_option("--ignore-umi",
                      dest="ignore_umi",
                      action="store_true",
                      help="Ignore UMI and dedup"
                      " only on position",
                      default=False)
    parser.add_option("--umi-separator",
                      dest="umi_sep",
                      type="string",
                      help="separator between read id and UMI",
                      default="_")
    parser.add_option("--umi-tag",
                      dest="umi_tag",
                      type="string",
                      help="tag containing umi",
                      default='RX')
    parser.add_option("--extract-umi-method",
                      dest="get_umi_method",
                      type="choice",
                      choices=("read_id", "tag"),
                      default="read_id",
                      help="where is the read UMI encoded? [default=%default]")
    parser.add_option("--subset",
                      dest="subset",
                      type="float",
                      help="Use only a fraction of reads, specified by subset",
                      default=None)
    parser.add_option("--spliced-is-unique",
                      dest="spliced",
                      action="store_true",
                      help="Treat a spliced read as different to an unspliced"
                      " one [default=%default]",
                      default=False)
    parser.add_option("--soft-clip-threshold",
                      dest="soft",
                      type="float",
                      help="number of bases clipped from 5' end before"
                      "read is counted as spliced [default=%default]",
                      default=4)
    parser.add_option("--edit-distance-threshold",
                      dest="threshold",
                      type="int",
                      default=1,
                      help="Edit distance theshold at which to join two UMIs"
                      "when clustering. [default=%default]")
    parser.add_option("--chrom",
                      dest="chrom",
                      type="string",
                      help="Restrict to one chromosome",
                      default=None)
    parser.add_option("--paired",
                      dest="paired",
                      action="store_true",
                      default=False,
                      help="paired BAM. [default=%default]")
    parser.add_option("--method",
                      dest="method",
                      type="choice",
                      choices=("adjacency", "directional", "percentile",
                               "unique", "cluster"),
                      default="directional",
                      help="method to use for umi deduping [default=%default]")
    parser.add_option("--output-stats",
                      dest="stats",
                      type="string",
                      default=False,
                      help="Specify location to output stats")
    parser.add_option(
        "--whole-contig",
        dest="whole_contig",
        action="store_true",
        default=False,
        help=
        "Read whole contig before outputting bundles: guarantees that no reads"
        "are missed, but increases memory usage")
    parser.add_option("--multimapping-detection-method",
                      dest="detection_method",
                      type="choice",
                      choices=("NH", "X0", "XT"),
                      default=None,
                      help=("Some aligners identify multimapping using bam "
                            "tags. Setting this option to NH, X0 or XT will "
                            "use these tags when selecting the best read "
                            "amongst reads with the same position and umi "
                            "[default=%default]"))
    parser.add_option("--mapping-quality",
                      dest="mapping_quality",
                      type="int",
                      help="Minimum mapping quality for a read to be retained"
                      " [default=%default]",
                      default=0)
    parser.add_option(
        "--read-length",
        dest="read_length",
        action="store_true",
        default=False,
        help=("use read length in addition to position and UMI"
              "to identify possible duplicates [default=%default]"))
    parser.add_option("--per-contig",
                      dest="per_contig",
                      action="store_true",
                      default=False,
                      help=("dedup per contig (field 3 in BAM; RNAME),"
                            " e.g for transcriptome where contig = gene"))
    parser.add_option("--per-gene",
                      dest="per_gene",
                      action="store_true",
                      default=False,
                      help=("Deduplicate per gene,"
                            "e.g for transcriptome where contig = transcript"
                            "must also provide a transript to gene map with"
                            "--gene-transcript-map [default=%default]"))
    parser.add_option("--gene-transcript-map",
                      dest="gene_transcript_map",
                      type="string",
                      help="file mapping transcripts to genes (tab separated)",
                      default=None)
    parser.add_option("--gene-tag",
                      dest="gene_tag",
                      type="string",
                      help=("Deduplicate per gene where gene is"
                            "defined by this bam tag [default=%default]"),
                      default=None)
    parser.add_option(
        "--skip-tags-regex",
        dest="skip_regex",
        type="string",
        help=("Used with --gene-tag. "
              "Ignore reads where the gene-tag matches this regex"),
        default="^[__|Unassigned]")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = U.Start(parser, argv=argv)

    if options.random_seed:
        np.random.seed(options.random_seed)

    if options.stdin != sys.stdin:
        in_name = options.stdin.name
        options.stdin.close()
    else:
        raise ValueError("Input on standard in not currently supported")

    if options.stdout != sys.stdout:
        out_name = options.stdout.name
        options.stdout.close()
    else:
        out_name = "-"

    if options.in_sam:
        in_mode = "r"
    else:
        in_mode = "rb"

    if options.out_sam:
        out_mode = "wh"
    else:
        out_mode = "wb"

    if options.stats:
        if options.ignore_umi:
            raise ValueError("'--output-stats' and '--ignore-umi' options"
                             " cannot be used together")

    if options.per_gene:
        if not options.gene_transcript_map and not options.gene_tag:
            raise ValueError(
                "--per-gene option requires --gene-transcript-map "
                "or --gene-tag")
    try:
        re.compile(options.skip_regex)
    except re.error:
        raise ValueError("skip-regex '%s' is not a "
                         "valid regex" % options.skip_regex)

    infile = pysam.Samfile(in_name, in_mode)
    outfile = pysam.Samfile(out_name, out_mode, template=infile)

    if options.paired:
        outfile = umi_methods.TwoPassPairWriter(infile, outfile)

    nInput, nOutput = 0, 0

    if options.detection_method:
        bam_features = detect_bam_features(infile.filename)

        if not bam_features[options.detection_method]:
            if sum(bam_features.values()) == 0:
                raise ValueError(
                    "There are no bam tags available to detect multimapping. "
                    "Do not set --multimapping-detection-method")
            else:
                raise ValueError(
                    "The chosen method of detection for multimapping (%s) "
                    "will not work with this bam. Multimapping can be detected"
                    " for this bam using any of the following: %s" %
                    (options.detection_method, ",".join(
                        [x for x in bam_features if bam_features[x]])))

    # set the method with which to extract umis from reads
    if options.get_umi_method == "read_id":
        umi_getter = partial(umi_methods.get_umi_read_id, sep=options.umi_sep)
    elif options.get_umi_method == "tag":
        umi_getter = partial(umi_methods.get_umi_tag, tag=options.umi_tag)
    else:
        raise ValueError("Unknown umi extraction method")

    if options.stats:
        # set up arrays to hold stats data
        stats_pre_df_dict = {"UMI": [], "counts": []}
        stats_post_df_dict = {"UMI": [], "counts": []}
        pre_cluster_stats = []
        post_cluster_stats = []
        pre_cluster_stats_null = []
        post_cluster_stats_null = []
        topology_counts = collections.Counter()
        node_counts = collections.Counter()
        read_gn = umi_methods.random_read_generator(infile.filename,
                                                    chrom=options.chrom,
                                                    umi_getter=umi_getter)

    if options.chrom:
        inreads = infile.fetch(reference=options.chrom)
        gene_tag = options.gene_tag
    else:
        if options.per_gene and options.gene_transcript_map:
            metacontig2contig = umi_methods.getMetaContig2contig(
                infile, options.gene_transcript_map)
            metatag = "MC"
            inreads = umi_methods.metafetcher(infile, metacontig2contig,
                                              metatag)
            gene_tag = metatag

        else:
            inreads = infile.fetch()
            gene_tag = options.gene_tag

    for bundle, read_events, status in umi_methods.get_bundles(
            inreads,
            ignore_umi=options.ignore_umi,
            subset=options.subset,
            quality_threshold=options.mapping_quality,
            paired=options.paired,
            spliced=options.spliced,
            soft_clip_threshold=options.soft,
            per_contig=options.per_contig,
            gene_tag=gene_tag,
            skip_regex=options.skip_regex,
            whole_contig=options.whole_contig,
            read_length=options.read_length,
            detection_method=options.detection_method,
            umi_getter=umi_getter,
            all_reads=False,
            return_read2=False,
            return_unmapped=False):

        nInput += sum([bundle[umi]["count"] for umi in bundle])

        if nOutput % 10000 == 0:
            U.debug("Outputted %i" % nOutput)

        if nInput % 1000000 == 0:
            U.debug("Read %i input reads" % nInput)

        if options.stats:
            # generate pre-dedup stats
            average_distance = umi_methods.get_average_umi_distance(
                bundle.keys())
            pre_cluster_stats.append(average_distance)
            cluster_size = len(bundle)
            random_umis = read_gn.getUmis(cluster_size)
            average_distance_null = umi_methods.get_average_umi_distance(
                random_umis)
            pre_cluster_stats_null.append(average_distance_null)

        if options.ignore_umi:
            for umi in bundle:
                nOutput += 1
                outfile.write(bundle[umi]["read"])

        else:

            # set up ReadCluster functor with methods specific to
            # specified options.method
            processor = network.ReadDeduplicator(options.method)

            # dedup using umis and write out deduped bam
            reads, umis, umi_counts = processor(bundle=bundle,
                                                threshold=options.threshold)

            for read in reads:
                outfile.write(read)
                nOutput += 1

            if options.stats:

                # collect pre-dedup stats
                stats_pre_df_dict['UMI'].extend(bundle)
                stats_pre_df_dict['counts'].extend(
                    [bundle[UMI]['count'] for UMI in bundle])

                # collect post-dedup stats
                post_cluster_umis = [umi_getter(x) for x in reads]
                stats_post_df_dict['UMI'].extend(umis)
                stats_post_df_dict['counts'].extend(umi_counts)

                average_distance = umi_methods.get_average_umi_distance(
                    post_cluster_umis)
                post_cluster_stats.append(average_distance)

                cluster_size = len(post_cluster_umis)
                random_umis = read_gn.getUmis(cluster_size)
                average_distance_null = umi_methods.get_average_umi_distance(
                    random_umis)
                post_cluster_stats_null.append(average_distance_null)

    outfile.close()

    if options.stats:

        # generate the stats dataframe
        stats_pre_df = pd.DataFrame(stats_pre_df_dict)
        stats_post_df = pd.DataFrame(stats_post_df_dict)

        # tally the counts per umi per position
        pre_counts = collections.Counter(stats_pre_df["counts"])
        post_counts = collections.Counter(stats_post_df["counts"])
        counts_index = list(
            set(pre_counts.keys()).union(set(post_counts.keys())))
        counts_index.sort()
        with U.openFile(options.stats + "_per_umi_per_position.tsv",
                        "w") as outf:
            outf.write("counts\tinstances_pre\tinstances_post\n")
            for count in counts_index:
                values = (count, pre_counts[count], post_counts[count])
                outf.write("\t".join(map(str, values)) + "\n")

        # aggregate stats pre/post per UMI
        agg_pre_df = aggregateStatsDF(stats_pre_df)
        agg_post_df = aggregateStatsDF(stats_post_df)

        agg_df = pd.merge(agg_pre_df,
                          agg_post_df,
                          how='left',
                          left_index=True,
                          right_index=True,
                          sort=True,
                          suffixes=["_pre", "_post"])

        # TS - if count value not observed either pre/post-dedup,
        # merge will leave an empty cell and the column will be cast as a float
        # see http://pandas.pydata.org/pandas-docs/dev/missing_data.html
        # --> Missing data casting rules and indexing
        # so, back fill with zeros and convert back to int
        agg_df = agg_df.fillna(0).astype(int)

        agg_df.index = [x.decode() for x in agg_df.index]
        agg_df.index.name = 'UMI'
        agg_df.to_csv(options.stats + "_per_umi.tsv", sep="\t")

        # bin distances into integer bins
        max_ed = int(
            max(
                map(max, [
                    pre_cluster_stats, post_cluster_stats,
                    pre_cluster_stats_null, post_cluster_stats_null
                ])))

        cluster_bins = range(-1, int(max_ed) + 2)

        def bin_clusters(cluster_list, bins=cluster_bins):
            ''' take list of floats and return bins'''
            return np.digitize(cluster_list, bins, right=True)

        def tallyCounts(binned_cluster, max_edit_distance):
            ''' tally counts per bin '''
            return np.bincount(binned_cluster, minlength=max_edit_distance + 3)

        pre_cluster_binned = bin_clusters(pre_cluster_stats)
        post_cluster_binned = bin_clusters(post_cluster_stats)
        pre_cluster_null_binned = bin_clusters(pre_cluster_stats_null)
        post_cluster_null_binned = bin_clusters(post_cluster_stats_null)

        edit_distance_df = pd.DataFrame({
            "unique":
            tallyCounts(pre_cluster_binned, max_ed),
            "unique_null":
            tallyCounts(pre_cluster_null_binned, max_ed),
            options.method:
            tallyCounts(post_cluster_binned, max_ed),
            "%s_null" % options.method:
            tallyCounts(post_cluster_null_binned, max_ed),
            "edit_distance":
            cluster_bins
        })

        # TS - set lowest bin (-1) to "Single_UMI"
        edit_distance_df['edit_distance'][0] = "Single_UMI"

        edit_distance_df.to_csv(options.stats + "_edit_distance.tsv",
                                index=False,
                                sep="\t")

    # write footer and output benchmark information.

    U.info(
        "%s" %
        ", ".join(["%s: %s" % (x[0], x[1])
                   for x in read_events.most_common()]))
    U.info("Number of reads out: %i" % nOutput)

    U.Stop()
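The --method option selects the UMI clustering strategy; the default, directional, links UMI a to UMI b when they are within --edit-distance-threshold edits and a is sufficiently more abundant (counts[a] >= 2 * counts[b] - 1, as described in the UMI-tools publication). A toy re-implementation for intuition only, not the actual network.ReadDeduplicator code:

from collections import deque

def hamming(a, b):
    return sum(x != y for x, y in zip(a, b))

def directional_clusters(counts, threshold=1):
    """Grow clusters by BFS from the most abundant unassigned UMI,
    following edges a -> b where hamming(a, b) <= threshold and
    counts[a] >= 2 * counts[b] - 1."""
    umis = sorted(counts, key=counts.get, reverse=True)
    assigned, clusters = set(), []
    for seed in umis:
        if seed in assigned:
            continue
        cluster, queue = [], deque([seed])
        while queue:
            node = queue.popleft()
            if node in assigned:
                continue
            assigned.add(node)
            cluster.append(node)
            for other in umis:
                if (other not in assigned
                        and hamming(node, other) <= threshold
                        and counts[node] >= 2 * counts[other] - 1):
                    queue.append(other)
        clusters.append(cluster)
    return clusters

print(directional_clusters({"ATAT": 10, "ATAA": 4, "CCGG": 3}))
# [['ATAT', 'ATAA'], ['CCGG']]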
Code example #4
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-i",
                      "--in-sam",
                      dest="in_sam",
                      action="store_true",
                      help="Input file is in sam format [default=%default]",
                      default=False)
    parser.add_option(
        "-o",
        "--out-sam",
        dest="out_sam",
        action="store_true",
        help="Output alignments in sam format [default=%default]",
        default=False)
    parser.add_option("--umi-separator",
                      dest="umi_sep",
                      type="string",
                      help="separator between read id and UMI",
                      default="_")
    parser.add_option("--umi-tag",
                      dest="umi_tag",
                      type="string",
                      help="tag containing umi",
                      default='RX')
    parser.add_option("--umi-group-tag",
                      dest="umi_group_tag",
                      type="string",
                      help="tag for the outputted umi group",
                      default='BX')
    parser.add_option("--extract-umi-method",
                      dest="get_umi_method",
                      type="choice",
                      choices=("read_id", "tag"),
                      default="read_id",
                      help="where is the read UMI encoded? [default=%default]")
    parser.add_option("--subset",
                      dest="subset",
                      type="float",
                      help="Use only a fraction of reads, specified by subset",
                      default=None)
    parser.add_option("--spliced-is-unique",
                      dest="spliced",
                      action="store_true",
                      help="Treat a spliced read as different to an unspliced"
                      " one [default=%default]",
                      default=False)
    parser.add_option("--soft-clip-threshold",
                      dest="soft",
                      type="float",
                      help="number of bases clipped from 5' end before"
                      "read is counted as spliced [default=%default]",
                      default=4)
    parser.add_option("--edit-distance-threshold",
                      dest="threshold",
                      type="int",
                      default=1,
                      help="Edit distance theshold at which to join two UMIs"
                      "when clustering. [default=%default]")
    parser.add_option("--chrom",
                      dest="chrom",
                      type="string",
                      help="Restrict to one chromosome",
                      default=None)
    parser.add_option("--paired",
                      dest="paired",
                      action="store_true",
                      default=False,
                      help="paired BAM. [default=%default]")
    parser.add_option("--method",
                      dest="method",
                      type="choice",
                      choices=("adjacency", "directional", "unique",
                               "cluster"),
                      default="directional",
                      help="method to use for umi deduping [default=%default]")
    parser.add_option("--per-contig",
                      dest="per_contig",
                      action="store_true",
                      default=False,
                      help=("dedup per contig (field 3 in BAM; RNAME),"
                            " e.g for transcriptome where contig = gene"))
    parser.add_option("--per-gene",
                      dest="per_gene",
                      action="store_true",
                      default=False,
                      help=("Deduplicate per gene,"
                            "e.g for transcriptome where contig = transcript"
                            "must also provide a transript to gene map with"
                            "--gene-transcript-map [default=%default]"))
    parser.add_option("--gene-transcript-map",
                      dest="gene_transcript_map",
                      type="string",
                      help="file mapping transcripts to genes (tab separated)",
                      default=None)
    parser.add_option("--gene-tag",
                      dest="gene_tag",
                      type="string",
                      help=("Deduplicate per gene where gene is"
                            "defined by this bam tag [default=%default]"),
                      default=None)
    parser.add_option(
        "--read-length",
        dest="read_length",
        action="store_true",
        default=False,
        help=("use read length in addition to position and UMI"
              "to identify possible duplicates [default=%default]"))
    parser.add_option("--mapping-quality",
                      dest="mapping_quality",
                      type="int",
                      help="Minimum mapping quality for a read to be retained"
                      " [default=%default]",
                      default=0)
    parser.add_option(
        "--output-unmapped",
        dest="output_unmapped",
        action="store_true",
        default=False,
        help=("Retain all unmapped reads in output[default=%default]"))
    parser.add_option(
        "--group-out",
        dest="tsv",
        type="string",
        help="Outfile name for file mapping read id to read group",
        default=None)
    parser.add_option(
        "--output-bam",
        dest="output_bam",
        action="store_true",
        default=False,
        help=("output a bam file with read groups tagged using the UG tag"
              "[default=%default]"))
    parser.add_option(
        "--skip-tags-regex",
        dest="skip_regex",
        type="string",
        help=("Used with --gene-tag. "
              "Ignore reads where the gene-tag matches this regex"),
        default="^[__|Unassigned]")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = U.Start(parser, argv=argv)

    if options.stdin != sys.stdin:
        in_name = options.stdin.name
        options.stdin.close()
    else:
        raise ValueError("Input on standard in not currently supported")

    if options.stdout != sys.stdout:
        out_name = options.stdout.name
        options.stdout.close()
        assert options.output_bam, (
            "To output a bam you must include --output-bam option")
    else:
        out_name = "-"

    if options.in_sam:
        in_mode = "r"
    else:
        in_mode = "rb"

    if options.out_sam:
        out_mode = "wh"
    else:
        out_mode = "wb"

    if options.per_gene:
        if not options.gene_transcript_map:
            raise ValueError(
                "--per-gene option requires --gene-transcript-map")

    infile = pysam.Samfile(in_name, in_mode)

    if options.output_bam:
        outfile = pysam.Samfile(out_name, out_mode, template=infile)
    else:
        outfile = None

    if options.tsv:
        mapping_outfile = U.openFile(options.tsv, "w")
        mapping_outfile.write("%s\n" % "\t".join([
            "read_id", "contig", "position", "gene", "umi", "umi_count",
            "final_umi", "final_umi_count", "unique_id"
        ]))

    # set the method with which to extract umis from reads
    if options.get_umi_method == "read_id":
        umi_getter = partial(umi_methods.get_umi_read_id, sep=options.umi_sep)
    elif options.get_umi_method == "tag":
        umi_getter = partial(umi_methods.get_umi_tag, tag=options.umi_tag)
    else:
        raise ValueError("Unknown umi extraction method")

    nInput, nOutput, unique_id = 0, 0, 0

    if options.chrom:
        inreads = infile.fetch(reference=options.chrom)
        gene_tag = options.gene_tag
    else:
        if options.per_gene and options.gene_transcript_map:
            metacontig2contig = umi_methods.getMetaContig2contig(
                infile, options.gene_transcript_map)
            metatag = "MC"
            inreads = umi_methods.metafetcher(infile, metacontig2contig,
                                              metatag)
            gene_tag = metatag

        else:
            inreads = infile.fetch(until_eof=options.output_unmapped)
            gene_tag = options.gene_tag

    for bundle, read_events, status in umi_methods.get_bundles(
            inreads,
            ignore_umi=False,
            subset=options.subset,
            quality_threshold=options.mapping_quality,
            paired=options.paired,
            spliced=options.spliced,
            soft_clip_threshold=options.soft,
            per_contig=options.per_contig,
            gene_tag=gene_tag,
            skip_regex=options.skip_regex,
            read_length=options.read_length,
            umi_getter=umi_getter,
            all_reads=True,
            return_read2=True,
            return_unmapped=options.output_unmapped):

        # write out read2s and unmapped if option set
        if status == 'single_read':
            # bundle is just a single read here
            outfile.write(bundle)
            nInput += 1
            nOutput += 1
            continue

        umis = bundle.keys()
        counts = {umi: bundle[umi]["count"] for umi in umis}

        nInput += sum(counts.values())

        if nOutput % 10000 == 0:
            U.debug("Outputted %i" % nOutput)

        if nInput % 1000000 == 0:
            U.debug("Read %i input reads" % nInput)

        # set up UMIClusterer functor with methods specific to
        # specified options.method
        processor = network.UMIClusterer(options.method)

        # group the umis
        groups = processor(umis, counts, threshold=options.threshold)

        for umi_group in groups:
            top_umi = umi_group[0]

            group_count = sum(counts[umi] for umi in umi_group)

            for umi in umi_group:
                reads = bundle[umi]['read']
                for read in reads:
                    if outfile:
                        # Add the 'UG' tag to the read
                        read.tags += [('UG', unique_id)]
                        read.tags += [(options.umi_group_tag, top_umi)]
                        outfile.write(read)

                    if options.tsv:
                        if options.per_gene:
                            gene = read.get_tag(gene_tag)
                        else:
                            gene = "NA"
                        mapping_outfile.write("%s\n" % "\t".join(
                            map(str,
                                (read.query_name, read.reference_name,
                                 umi_methods.get_read_position(
                                     read, options.soft)[1],
                                 gene, umi.decode(), counts[umi],
                                 top_umi.decode(), group_count, unique_id))))

                    nOutput += 1

            unique_id += 1

    if outfile:
        outfile.close()

    if options.tsv:
        mapping_outfile.close()

    # write footer and output benchmark information.
    U.info(
        "Reads: %s" %
        ", ".join(["%s: %s" % (x[0], x[1])
                   for x in read_events.most_common()]))
    U.info("Number of reads out: %i, Number of groups: %i" %
           (nOutput, unique_id))
    U.Stop()
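The --group-out TSV written by this variant has one row per read with the nine columns listed above (read_id through unique_id). A quick way to inspect it downstream, assuming pandas and an output file named groups.tsv:

import pandas as pd

groups = pd.read_csv("groups.tsv", sep="\t")

# reads per error-corrected UMI group
print(groups.groupby("unique_id")["read_id"].count().describe())

# fraction of reads whose observed UMI was corrected to a different final UMI
print((groups["umi"] != groups["final_umi"]).mean())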
Code example #5
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-i",
                      "--in-sam",
                      dest="in_sam",
                      action="store_true",
                      help="Input file is in sam format [default=%default]",
                      default=False)
    parser.add_option(
        "-o",
        "--out-sam",
        dest="out_sam",
        action="store_true",
        help="Output alignments in sam format [default=%default]",
        default=False)
    parser.add_option("--umi-separator",
                      dest="umi_sep",
                      type="string",
                      help="separator between read id and UMI",
                      default="_")
    parser.add_option("--umi-tag",
                      dest="umi_tag",
                      type="string",
                      help="tag containing umi",
                      default='RX')
    parser.add_option("--umi-group-tag",
                      dest="umi_group_tag",
                      type="string",
                      help="tag for the outputted umi group",
                      default='BX')
    parser.add_option("--extract-umi-method",
                      dest="get_umi_method",
                      type="choice",
                      choices=("read_id", "tag"),
                      default="read_id",
                      help="where is the read UMI encoded? [default=%default]")
    parser.add_option("--subset",
                      dest="subset",
                      type="float",
                      help="Use only a fraction of reads, specified by subset",
                      default=None)
    parser.add_option("--spliced-is-unique",
                      dest="spliced",
                      action="store_true",
                      help="Treat a spliced read as different to an unspliced"
                      " one [default=%default]",
                      default=False)
    parser.add_option("--soft-clip-threshold",
                      dest="soft",
                      type="float",
                      help="number of bases clipped from 5' end before"
                      "read is counted as spliced [default=%default]",
                      default=4)
    parser.add_option("--edit-distance-threshold",
                      dest="threshold",
                      type="int",
                      default=1,
                      help="Edit distance theshold at which to join two UMIs"
                      "when clustering. [default=%default]")
    parser.add_option("--chrom",
                      dest="chrom",
                      type="string",
                      help="Restrict to one chromosome",
                      default=None)
    parser.add_option("--paired",
                      dest="paired",
                      action="store_true",
                      default=False,
                      help="paired BAM. [default=%default]")
    parser.add_option("--method",
                      dest="method",
                      type="choice",
                      choices=("adjacency", "directional", "unique",
                               "cluster"),
                      default="directional",
                      help="method to use for umi deduping [default=%default]")
    parser.add_option("--per-contig",
                      dest="per_contig",
                      action="store_true",
                      default=False,
                      help=("dedup per contig,"
                            " e.g for transcriptome where contig = gene"))
    parser.add_option(
        "--whole-contig",
        dest="whole_contig",
        action="store_true",
        default=False,
        help=
        "Read whole contig before outputting bundles: guarantees that no reads"
        "are missed, but increases memory usage")
    parser.add_option(
        "--read-length",
        dest="read_length",
        action="store_true",
        default=False,
        help=("use read length in addition to position and UMI"
              "to identify possible duplicates [default=%default]"))
    parser.add_option("--mapping-quality",
                      dest="mapping_quality",
                      type="int",
                      help="Minimum mapping quality for a read to be retained"
                      " [default=%default]",
                      default=0)
    parser.add_option(
        "--group-out",
        dest="tsv",
        type="string",
        help="Outfile name for file mapping read id to read group",
        default=None)
    parser.add_option(
        "--output-bam",
        dest="output_bam",
        action="store_true",
        default=False,
        help=("output a bam file with read groups tagged using the UG tag"
              "[default=%default]"))

    # add common options (-h/--help, ...) and parse command line
    (options, args) = U.Start(parser, argv=argv)

    if options.stdin != sys.stdin:
        in_name = options.stdin.name
        options.stdin.close()
    else:
        raise ValueError("Input on standard in not currently supported")

    if options.stdout != sys.stdout:
        out_name = options.stdout.name
        options.stdout.close()
        assert options.output_bam, (
            "To output a bam you must include --output-bam option")
    else:
        out_name = "-"

    if options.in_sam:
        in_mode = "r"
    else:
        in_mode = "rb"

    if options.out_sam:
        out_mode = "w"
    else:
        out_mode = "wb"

    infile = pysam.Samfile(in_name, in_mode)

    if options.output_bam:
        outfile = pysam.Samfile(out_name, out_mode, template=infile)
        if options.paired:
            outfile = umi_methods.TwoPassPairWriter(infile, outfile, tags=True)
    else:
        outfile = None

    if options.tsv:
        mapping_outfile = U.openFile(options.tsv, "w")
        mapping_outfile.write(
            "read_id\tcontig\tposition\tumi\tumi_count\tfinal_umi\tfinal_umi_count\tunique_id\n"
        )

    # set the method with which to extract umis from reads
    if options.get_umi_method == "read_id":
        umi_getter = partial(umi_methods.get_umi_read_id, sep=options.umi_sep)
    elif options.get_umi_method == "tag":
        umi_getter = partial(umi_methods.get_umi_tag, tag=options.umi_tag)
    else:
        raise ValueError("Unknown umi extraction method")

    nInput, nOutput, unique_id = 0, 0, 0

    read_events = collections.Counter()

    for bundle, read_events in umi_methods.get_bundles(
            infile,
            read_events,
            ignore_umi=False,
            subset=options.subset,
            quality_threshold=options.mapping_quality,
            paired=options.paired,
            chrom=options.chrom,
            spliced=options.spliced,
            soft_clip_threshold=options.soft,
            per_contig=options.per_contig,
            whole_contig=options.whole_contig,
            read_length=options.read_length,
            umi_getter=umi_getter,
            all_reads=True):

        nInput += sum([bundle[umi]["count"] for umi in bundle])

        if nOutput % 10000 == 0:
            U.debug("Outputted %i" % nOutput)

        if nInput % 1000000 == 0:
            U.debug("Read %i input reads" % nInput)

        # set up ReadCluster functor with methods specific to
        # specified options.method
        processor = network.ReadClusterer(options.method)

        bundle, groups, counts = processor(bundle=bundle,
                                           threshold=options.threshold,
                                           stats=True,
                                           deduplicate=False)

        for umi_group in groups:
            top_umi = umi_group[0]

            group_count = sum(counts[umi] for umi in umi_group)

            for umi in umi_group:
                reads = bundle[umi]['read']
                for read in reads:
                    if outfile:
                        if options.paired:
                            # if paired, we need to supply the tags to
                            # add to the paired read
                            outfile.write(read, unique_id, top_umi)

                        else:
                            # Add the 'UG' tag to the read
                            read.tags += [('UG', unique_id)]
                            read.tags += [(options.umi_group_tag, top_umi)]
                            outfile.write(read)

                    if options.tsv:
                        mapping_outfile.write("%s\n" % "\t".join(
                            map(str, (read.query_name, read.reference_name,
                                      umi_methods.get_read_position(
                                          read, options.soft)[1], umi.decode(),
                                      counts[umi], top_umi.decode(),
                                      group_count, unique_id))))

                    nOutput += 1

            unique_id += 1

    if outfile:
        outfile.close()

    if options.tsv:
        mapping_outfile.close()

    # write footer and output benchmark information.
    U.info(
        "Reads: %s" %
        ", ".join(["%s: %s" % (x[0], x[1])
                   for x in read_events.most_common()]))
    U.info("Number of reads out: %i, Number of groups: %i" %
           (nOutput, unique_id))
    U.Stop()
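With --output-bam, each written read carries a UG tag (integer group id) and the --umi-group-tag (BX by default) holding the group's representative UMI. A minimal sketch for consuming those tags downstream, assuming a grouped BAM named grouped.bam:

import collections

import pysam

group_sizes = collections.Counter()
with pysam.AlignmentFile("grouped.bam", "rb") as bam:
    for read in bam:
        if read.has_tag("UG"):
            group_sizes[read.get_tag("UG")] += 1

print("groups:", len(group_sizes))
print("largest group:", group_sizes.most_common(1))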
Code example #6
File: dedup.py Project: messersc/UMI-tools
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-i", "--in-sam", dest="in_sam", action="store_true",
                      help="Input file is in sam format", default=False)
    parser.add_option("-o", "--out-sam", dest="out_sam", action="store_true",
                      help="Output alignments in sam format", default=False)
    parser.add_option("--ignore-umi", dest="ignore_umi", action="store_true",
                      help="Ignore UMI and dedup only on position",
                      default=False)
    parser.add_option("--subset", dest="subset", type="string",
                      help="Use only a fraction of reads, specified by subset",
                      default=1.1)
    parser.add_option("--spliced-is-unique", dest="spliced",
                      action="store_true",
                      help="Treat a spliced read as different to an unspliced"
                           " one",
                      default=False)
    parser.add_option("--soft-clip-threshold", dest="soft",
                      type="float",
                      help="number of bases clipped from 5' end before"
                           "read os counted as spliced",
                      default=4)
    parser.add_option("--edit-distance-threshold", dest="threshold",
                      type="int",
                      help="Edit distance theshold at which to join two UMIs"
                           "when clustering", default=1)
    parser.add_option("--chrom", dest="chrom", type="string",
                      help="Restrict to one chromosome",
                      default=None)
    parser.add_option("--paired", dest="paired", action="store_true",
                      default=False,
                      help="Use second-in-pair position when deduping")
    parser.add_option("--method", dest="method", type="choice",
                      choices=("adjacency", "directional-adjacency",
                               "percentile", "unique", "cluster"),
                      default="directional-adjacency",
                      help="method to use for umi deduping")
    parser.add_option("--output-stats", dest="stats", type="string",
                      default=False,
                      help="Specify location to output stats")
    parser.add_option("--further-stats", dest="further_stats",
                      action="store_true", default=False,
                      help="Output further stats")
    parser.add_option("--per-contig", dest="per_contig", action="store_true",
                      default=False,
                      help=("dedup per contig,"
                            " e.g for transcriptome where contig = gene"))
    parser.add_option("--whole-contig", dest="whole_contig", action="store_true",
                      default=False,
                      help="Read whole contig before outputting bundles: guarantees that no reads"
                           "are missed, but increases memory usage")
    parser.add_option("--multimapping-detection-method",
                      dest="detection_method", type="choice",
                      choices=("NH", "X0", "XT"),
                      default=None,
                      help=("Some aligners identify multimapping using bam "
                            "tags. Setting this option to NH, X0 or XT will "
                            "use these tags when selecting the best read "
                            "amongst reads with the same position and umi"))
    parser.add_option("--mapping-quality", dest="mapping_quality",
                      type="int",
                      help="Minimum mapping quality for a read to be retained",
                      default=0)
    parser.add_option("--read-length", dest="read_length", action="store_true",
                      default=False,
                      help=("use read length in addition to position and UMI"
                            "to identify possible duplicates"))

    # add common options (-h/--help, ...) and parse command line
    (options, args) = U.Start(parser, argv=argv)

    if options.stdin != sys.stdin:
        in_name = options.stdin.name
        options.stdin.close()
    else:
        raise ValueError("Input on standard in not currently supported")

    if options.stdout != sys.stdout:
        out_name = options.stdout.name
        options.stdout.close()
    else:
        out_name = "-"

    if options.in_sam:
        in_mode = "r"
    else:
        in_mode = "rb"

    if options.out_sam:
        out_mode = "w"
    else:
        out_mode = "wb"

    if options.stats:
        if options.ignore_umi:
            raise ValueError("'--output-stats' and '--ignore-umi' options"
                             " cannot be used together")

    if options.further_stats:
        if not options.stats:
            raise ValueError("'--further-stats' options requires "
                             "'--output-stats' option")
        if options.method not in ["cluster", "adjacency"]:
            raise ValueError("'--further-stats' only enabled with 'cluster' "
                             "and 'adjacency' methods")

    infile = pysam.Samfile(in_name, in_mode)
    outfile = pysam.Samfile(out_name, out_mode,
                            template=infile)

    if options.paired:
        outfile = TwoPassPairWriter(infile, outfile)

    nInput, nOutput = 0, 0

    if options.detection_method:
        bam_features = detect_bam_features(infile.filename)

        if not bam_features[options.detection_method]:
            if sum(bam_features.values()) == 0:
                raise ValueError(
                    "There are no bam tags available to detect multimapping. "
                    "Do not set --multimapping-detection-method")
            else:
                raise ValueError(
                    "The chosen method of detection for multimapping (%s) "
                    "will not work with this bam. Multimapping can be detected"
                    " for this bam using any of the following: %s" % (
                        options.detection_method, ",".join(
                            [x for x in bam_features if bam_features[x]])))

    if options.stats:
        # set up arrays to hold stats data
        stats_pre_df_dict = {"UMI": [], "counts": []}
        stats_post_df_dict = {"UMI": [], "counts": []}
        pre_cluster_stats = []
        post_cluster_stats = []
        pre_cluster_stats_null = []
        post_cluster_stats_null = []
        topology_counts = collections.Counter()
        node_counts = collections.Counter()
        read_gn = random_read_generator(infile.filename, chrom=options.chrom)

    for bundle in get_bundles(infile,
                              ignore_umi=options.ignore_umi,
                              subset=float(options.subset),
                              quality_threshold=options.mapping_quality,
                              paired=options.paired,
                              chrom=options.chrom,
                              spliced=options.spliced,
                              soft_clip_threshold=options.soft,
                              per_contig=options.per_contig,
                              whole_contig=options.whole_contig,
                              read_length=options.read_length,
                              detection_method=options.detection_method):

        nInput += sum([bundle[umi]["count"] for umi in bundle])

        if nOutput % 10000 == 0:
            U.debug("Outputted %i" % nOutput)

        if nInput % 1000000 == 0:
            U.debug("Read %i input reads" % nInput)

        if options.stats:
            # generate pre-dedup stats
            average_distance = get_average_umi_distance(bundle.keys())
            pre_cluster_stats.append(average_distance)
            cluster_size = len(bundle)
            random_umis = read_gn.getUmis(cluster_size)
            average_distance_null = get_average_umi_distance(random_umis)
            pre_cluster_stats_null.append(average_distance_null)

        if options.ignore_umi:
            for umi in bundle:
                nOutput += 1
                outfile.write(bundle[umi]["read"])

        else:

            # set up ClusterAndReducer functor with methods specific to
            # specified options.method
            processor = ClusterAndReducer(options.method)

            # dedup using umis and write out deduped bam
            reads, umis, umi_counts, topologies, nodes = processor(
                bundle, options.threshold,
                options.stats, options.further_stats)

            for read in reads:
                outfile.write(read)
                nOutput += 1

            if options.stats:

                # collect pre-dedup stats
                stats_pre_df_dict['UMI'].extend(bundle)
                stats_pre_df_dict['counts'].extend(
                    [bundle[UMI]['count'] for UMI in bundle])

                # collect post-dedup stats
                post_cluster_umis = [x.qname.split("_")[-1] for x in reads]
                stats_post_df_dict['UMI'].extend(umis)
                stats_post_df_dict['counts'].extend(umi_counts)

                average_distance = get_average_umi_distance(post_cluster_umis)
                post_cluster_stats.append(average_distance)

                cluster_size = len(post_cluster_umis)
                random_umis = read_gn.getUmis(cluster_size)
                average_distance_null = get_average_umi_distance(random_umis)
                post_cluster_stats_null.append(average_distance_null)

                if options.further_stats:
                    for c_type, count in topologies.most_common():
                        topology_counts[c_type] += count
                    for c_type, count in nodes.most_common():
                        node_counts[c_type] += count

    outfile.close()

    if options.stats:

        stats_pre_df = pd.DataFrame(stats_pre_df_dict)
        stats_post_df = pd.DataFrame(stats_post_df_dict)

        # generate histograms of counts per UMI at each position
        UMI_counts_df_pre = pd.DataFrame(stats_pre_df.pivot_table(
            columns=stats_pre_df["counts"], values="counts", aggfunc=len))
        UMI_counts_df_post = pd.DataFrame(stats_post_df.pivot_table(
            columns=stats_post_df["counts"], values="counts", aggfunc=len))

        UMI_counts_df_pre.columns = ["instances"]
        UMI_counts_df_post.columns = ["instances"]

        UMI_counts_df = pd.merge(UMI_counts_df_pre, UMI_counts_df_post,
                                 how='left', left_index=True, right_index=True,
                                 sort=True, suffixes=["_pre", "_post"])

        # TS - if count value not observed either pre/post-dedup,
        # merge will leave an empty cell and the column will be cast as a float
        # see http://pandas.pydata.org/pandas-docs/dev/missing_data.html
        # --> Missing data casting rules and indexing
        # so, back fill with zeros and convert back to int
        UMI_counts_df = UMI_counts_df.fillna(0).astype(int)

        UMI_counts_df.to_csv(
            options.stats + "_per_umi_per_position.tsv", sep="\t")

        # aggregate stats pre/post per UMI
        agg_pre_df = aggregateStatsDF(stats_pre_df)
        agg_post_df = aggregateStatsDF(stats_post_df)

        agg_df = pd.merge(agg_pre_df, agg_post_df, how='left',
                          left_index=True, right_index=True,
                          sort=True, suffixes=["_pre", "_post"])

        # TS - see comment above regarding missing values
        agg_df = agg_df.fillna(0).astype(int)
        agg_df.to_csv(options.stats + "_per_umi.tsv", sep="\t")

        # bin distances into integer bins
        max_ed = int(max(map(max, [pre_cluster_stats,
                                   post_cluster_stats,
                                   pre_cluster_stats_null,
                                   post_cluster_stats_null])))

        cluster_bins = range(-1, int(max_ed) + 2)

        def bin_clusters(cluster_list, bins=cluster_bins):
            ''' take list of floats and return bins'''
            return np.digitize(cluster_list, bins, right=True)

        def tallyCounts(binned_cluster, max_edit_distance):
            ''' tally counts per bin '''
            return np.bincount(binned_cluster,
                               minlength=max_edit_distance + 3)

        pre_cluster_binned = bin_clusters(pre_cluster_stats)
        post_cluster_binned = bin_clusters(post_cluster_stats)
        pre_cluster_null_binned = bin_clusters(pre_cluster_stats_null)
        post_cluster_null_binned = bin_clusters(post_cluster_stats_null)

        edit_distance_df = pd.DataFrame({
            "unique": tallyCounts(pre_cluster_binned, max_ed),
            "unique_null": tallyCounts(pre_cluster_null_binned, max_ed),
            options.method: tallyCounts(post_cluster_binned, max_ed),
            "%s_null" % options.method: tallyCounts(post_cluster_null_binned, max_ed),
            "edit_distance": cluster_bins})

        # TS - set lowest bin (-1) to "Single_UMI"
        edit_distance_df['edit_distance'][0] = "Single_UMI"

        edit_distance_df.to_csv(options.stats + "_edit_distance.tsv",
                                index=False, sep="\t")

        if options.further_stats:
            with U.openFile(options.stats + "_topologies.tsv", "w") as outf:
                outf.write(
                    "\n".join(["\t".join((x, str(y)))
                               for x, y in topology_counts.most_common()]) + "\n")

            with U.openFile(options.stats + "_nodes.tsv", "w") as outf:
                outf.write(
                    "\n".join(["\t".join(map(str, (x, y))) for
                               x, y in node_counts.most_common()]) + "\n")

    # write footer and output benchmark information.
    
    U.info("Number of reads in: %i, Number of reads out: %i" %
           (nInput, nOutput))
    U.Stop()
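The stats block bins average edit distances with np.digitize and tallies them with np.bincount; the extra -1 bin catches positions with a single UMI (get_average_umi_distance records those as -1, later relabelled "Single_UMI"). A worked toy example of the same two helpers:

import numpy as np

max_ed = 4
cluster_bins = range(-1, max_ed + 2)      # bins -1, 0, 1, 2, 3, 4, 5

distances = [-1, 0.0, 1.5, 2.0, 4.2]      # toy average distances
binned = np.digitize(distances, cluster_bins, right=True)
tallied = np.bincount(binned, minlength=max_ed + 3)

print(binned)   # [0 1 3 3 6] -- bin index per observation
print(tallied)  # [1 1 0 2 0 0 1] -- counts per edit-distance bin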
Code example #7
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-i",
                      "--in-sam",
                      dest="in_sam",
                      action="store_true",
                      help="Input file is in sam format",
                      default=False)
    parser.add_option("-o",
                      "--out-sam",
                      dest="out_sam",
                      action="store_true",
                      help="Output alignments in sam format",
                      default=False)
    parser.add_option("--ignore-umi",
                      dest="ignore_umi",
                      action="store_true",
                      help="Ignore UMI and dedup only on position",
                      default=False)
    parser.add_option("--subset",
                      dest="subset",
                      type="string",
                      help="Use only a fraction of reads, specified by subset",
                      default=1.1)
    parser.add_option("--spliced-is-unique",
                      dest="spliced",
                      action="store_true",
                      help="Treat a spliced read as different to an unspliced"
                      " one",
                      default=False)
    parser.add_option("--soft-clip-threshold",
                      dest="soft",
                      type="float",
                      help="number of bases clipped from 5' end before"
                      "read os counted as spliced",
                      default=4)
    parser.add_option("--edit-distance-threshold",
                      dest="threshold",
                      type="int",
                      help="Edit distance theshold at which to join two UMIs"
                      "when clustering",
                      default=1)
    parser.add_option("--chrom",
                      dest="chrom",
                      type="string",
                      help="Restrict to one chromosome",
                      default=None)
    parser.add_option("--paired",
                      dest="paired",
                      action="store_true",
                      default=False,
                      help="Use second-in-pair position when deduping")
    parser.add_option("--method",
                      dest="method",
                      type="choice",
                      choices=("adjacency", "directional-adjacency",
                               "percentile", "unique", "cluster"),
                      default="directional-adjacency",
                      help="method to use for umi deduping")
    parser.add_option("--output-stats",
                      dest="stats",
                      type="string",
                      default=False,
                      help="Specify location to output stats")
    parser.add_option("--further-stats",
                      dest="further_stats",
                      action="store_true",
                      default=False,
                      help="Output further stats")
    parser.add_option("--per-contig",
                      dest="per_contig",
                      action="store_true",
                      default=False,
                      help=("dedup per contig,"
                            " e.g for transcriptome where contig = gene"))
    parser.add_option(
        "--whole-contig",
        dest="whole_contig",
        action="store_true",
        default=False,
        help="Read whole contig before outputting bundles: guarantees that "
        "no reads are missed, but increases memory usage")
    parser.add_option("--multimapping-detection-method",
                      dest="detection_method",
                      type="choice",
                      choices=("NH", "X0", "XT"),
                      default=None,
                      help=("Some aligners identify multimapping using bam "
                            "tags. Setting this option to NH, X0 or XT will "
                            "use these tags when selecting the best read "
                            "amongst reads with the same position and umi"))
    parser.add_option("--mapping-quality",
                      dest="mapping_quality",
                      type="int",
                      help="Minimum mapping quality for a read to be retained",
                      default=0)
    parser.add_option("--read-length",
                      dest="read_length",
                      action="store_true",
                      default=False,
                      help=("use read length in addition to position and UMI"
                            "to identify possible duplicates"))

    # add common options (-h/--help, ...) and parse command line
    (options, args) = U.Start(parser, argv=argv)

    if options.stdin != sys.stdin:
        in_name = options.stdin.name
        options.stdin.close()
    else:
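        # a named input file appears to be required because the BAM is
        # re-opened by filename elsewhere (e.g.
        # random_read_generator(infile.filename)), which a pipe cannot support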
        raise ValueError("Input on standard in not currently supported")

    if options.stdout != sys.stdout:
        out_name = options.stdout.name
        options.stdout.close()
    else:
        out_name = "-"

    if options.in_sam:
        in_mode = "r"
    else:
        in_mode = "rb"

    if options.out_sam:
        out_mode = "w"
    else:
        out_mode = "wb"

    if options.stats:
        if options.ignore_umi:
            raise ValueError("'--output-stats' and '--ignore-umi' options"
                             " cannot be used together")

    if options.further_stats:
        if not options.stats:
            raise ValueError("'--further-stats' options requires "
                             "'--output-stats' option")
        if options.method not in ["cluster", "adjacency"]:
            raise ValueError("'--further-stats' only enabled with 'cluster' "
                             "and 'adjacency' methods")

    infile = pysam.Samfile(in_name, in_mode)
    outfile = pysam.Samfile(out_name, out_mode, template=infile)
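    # pysam.Samfile is the legacy alias for what newer pysam releases call
    # pysam.AlignmentFile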

    if options.paired:
        outfile = TwoPassPairWriter(infile, outfile)

    nInput, nOutput = 0, 0

    if options.detection_method:
        bam_features = detect_bam_features(infile.filename)
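        # bam_features is expected to map each candidate tag (NH/X0/XT) to
        # whether it is present in this BAM, so we can fail early below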

        if not bam_features[options.detection_method]:
            if sum(bam_features.values()) == 0:
                raise ValueError(
                    "There are no bam tags available to detect multimapping. "
                    "Do not set --multimapping-detection-method")
            else:
                raise ValueError(
                    "The chosen method of detection for multimapping (%s) "
                    "will not work with this bam. Multimapping can be detected"
                    " for this bam using any of the following: %s" %
                    (options.detection_method, ",".join(
                        [x for x in bam_features if bam_features[x]])))

    if options.stats:
        # set up arrays to hold stats data
        stats_pre_df_dict = {"UMI": [], "counts": []}
        stats_post_df_dict = {"UMI": [], "counts": []}
        pre_cluster_stats = []
        post_cluster_stats = []
        pre_cluster_stats_null = []
        post_cluster_stats_null = []
        topology_counts = collections.Counter()
        node_counts = collections.Counter()
        read_gn = random_read_generator(infile.filename, chrom=options.chrom)
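        # read_gn samples UMIs at random from the input BAM; drawing the same
        # number of UMIs as each bundle gives the null expectation behind the
        # *_null edit-distance columns below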

    for bundle in get_bundles(infile,
                              ignore_umi=options.ignore_umi,
                              subset=float(options.subset),
                              quality_threshold=options.mapping_quality,
                              paired=options.paired,
                              chrom=options.chrom,
                              spliced=options.spliced,
                              soft_clip_threshold=options.soft,
                              per_contig=options.per_contig,
                              whole_contig=options.whole_contig,
                              read_length=options.read_length,
                              detection_method=options.detection_method):
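        # each bundle maps UMI -> {"read": representative read, "count": n}
        # for one dedup unit (a position, or a contig with --per-contig)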

        nInput += sum([bundle[umi]["count"] for umi in bundle])

        if nOutput % 10000 == 0:
            U.debug("Outputted %i" % nOutput)

        if nInput % 1000000 == 0:
            U.debug("Read %i input reads" % nInput)

        if options.stats:
            # generate pre-dedup stats
            average_distance = get_average_umi_distance(bundle.keys())
            pre_cluster_stats.append(average_distance)
            cluster_size = len(bundle)
            random_umis = read_gn.getUmis(cluster_size)
            average_distance_null = get_average_umi_distance(random_umis)
            pre_cluster_stats_null.append(average_distance_null)

        if options.ignore_umi:
            for umi in bundle:
                nOutput += 1
                outfile.write(bundle[umi]["read"])

        else:

            # set up ClusterAndReducer functor with methods specific to
            # specified options.method
            processor = ClusterAndReducer(options.method)

            # dedup using umis and write out deduped bam
            reads, umis, umi_counts, topologies, nodes = processor(
                bundle, options.threshold, options.stats,
                options.further_stats)

            for read in reads:
                outfile.write(read)
                nOutput += 1

            if options.stats:

                # collect pre-dedup stats
                stats_pre_df_dict['UMI'].extend(bundle)
                stats_pre_df_dict['counts'].extend(
                    [bundle[UMI]['count'] for UMI in bundle])

                # collect post-dedup stats
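                # the UMI is the final "_"-separated field of the read name
                # (the umi_tools read-naming convention)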
                post_cluster_umis = [x.qname.split("_")[-1] for x in reads]
                stats_post_df_dict['UMI'].extend(umis)
                stats_post_df_dict['counts'].extend(umi_counts)

                average_distance = get_average_umi_distance(post_cluster_umis)
                post_cluster_stats.append(average_distance)

                cluster_size = len(post_cluster_umis)
                random_umis = read_gn.getUmis(cluster_size)
                average_distance_null = get_average_umi_distance(random_umis)
                post_cluster_stats_null.append(average_distance_null)

                if options.further_stats:
                    for c_type, count in topologies.most_common():
                        topology_counts[c_type] += count
                    for c_type, count in nodes.most_common():
                        node_counts[c_type] += count

    outfile.close()

    if options.stats:

        stats_pre_df = pd.DataFrame(stats_pre_df_dict)
        stats_post_df = pd.DataFrame(stats_post_df_dict)

        # generate histograms of counts per UMI at each position
        UMI_counts_df_pre = pd.DataFrame(
            stats_pre_df.pivot_table(columns=stats_pre_df["counts"],
                                     values="counts",
                                     aggfunc=len))
        UMI_counts_df_post = pd.DataFrame(
            stats_post_df.pivot_table(columns=stats_post_df["counts"],
                                      values="counts",
                                      aggfunc=len))
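        # NB: passing a Series as `columns` with aggfunc=len is legacy pandas
        # API; each table holds, for every distinct per-UMI count, how many
        # (position, UMI) entries had that count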

        UMI_counts_df_pre.columns = ["instances"]
        UMI_counts_df_post.columns = ["instances"]

        UMI_counts_df = pd.merge(UMI_counts_df_pre,
                                 UMI_counts_df_post,
                                 how='left',
                                 left_index=True,
                                 right_index=True,
                                 sort=True,
                                 suffixes=["_pre", "_post"])

        # TS - if count value not observed either pre/post-dedup,
        # merge will leave an empty cell and the column will be cast as a float
        # see http://pandas.pydata.org/pandas-docs/dev/missing_data.html
        # --> Missing data casting rules and indexing
        # so, back fill with zeros and convert back to int
        UMI_counts_df = UMI_counts_df.fillna(0).astype(int)

        UMI_counts_df.to_csv(options.stats + "_per_umi_per_position.tsv",
                             sep="\t")

        # aggregate stats pre/post per UMI
        agg_pre_df = aggregateStatsDF(stats_pre_df)
        agg_post_df = aggregateStatsDF(stats_post_df)

        agg_df = pd.merge(agg_pre_df,
                          agg_post_df,
                          how='left',
                          left_index=True,
                          right_index=True,
                          sort=True,
                          suffixes=["_pre", "_post"])

        # TS - see comment above regarding missing values
        agg_df = agg_df.fillna(0).astype(int)
        agg_df.to_csv(options.stats + "_per_umi.tsv", sep="\t")

        # bin distances into integer bins
        max_ed = int(
            max(
                map(max, [
                    pre_cluster_stats, post_cluster_stats,
                    pre_cluster_stats_null, post_cluster_stats_null
                ])))

        cluster_bins = range(-1, int(max_ed) + 2)

        def bin_clusters(cluster_list, bins=cluster_bins):
            '''Take a list of float distances and return integer bin indices.'''
            return np.digitize(cluster_list, bins, right=True)

        def tallyCounts(binned_cluster, max_edit_distance):
            ''' tally counts per bin '''
            return np.bincount(binned_cluster, minlength=max_edit_distance + 3)

        pre_cluster_binned = bin_clusters(pre_cluster_stats)
        post_cluster_binned = bin_clusters(post_cluster_stats)
        pre_cluster_null_binned = bin_clusters(pre_cluster_stats_null)
        post_cluster_null_binned = bin_clusters(post_cluster_stats_null)

        edit_distance_df = pd.DataFrame({
            "unique": tallyCounts(pre_cluster_binned, max_ed),
            "unique_null": tallyCounts(pre_cluster_null_binned, max_ed),
            options.method: tallyCounts(post_cluster_binned, max_ed),
            "%s_null" % options.method: tallyCounts(post_cluster_null_binned,
                                                    max_ed),
            "edit_distance": cluster_bins})

        # TS - set lowest bin (-1) to "Single_UMI"
        edit_distance_df['edit_distance'][0] = "Single_UMI"

        edit_distance_df.to_csv(options.stats + "_edit_distance.tsv",
                                index=False,
                                sep="\t")

        if options.further_stats:
            with U.openFile(options.stats + "_topologies.tsv", "w") as outf:
                outf.write("\n".join([
                    "\t".join((x, str(y)))
                    for x, y in topology_counts.most_common()
                ]) + "\n")

            with U.openFile(options.stats + "_nodes.tsv", "w") as outf:
                outf.write("\n".join([
                    "\t".join(map(str, (x, y)))
                    for x, y in node_counts.most_common()
                ]) + "\n")

    # write footer and output benchmark information.

    U.info("Number of reads in: %i, Number of reads out: %i" %
           (nInput, nOutput))
    U.Stop()
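
For reference, a minimal invocation sketch for this variant of the script. The file names are hypothetical, and it assumes U.Start() wires up the standard -I/-S input/output options as in released umi_tools versions; treat it as an illustrative sketch, not part of the original example.

# hedged usage sketch (hypothetical paths, assumed -I/-S options): dedup a
# coordinate-sorted, indexed BAM and write the optional stats tables
if __name__ == "__main__":
    sys.exit(main([
        "dedup",
        "-I", "mapped.bam",            # input BAM (sorted and indexed)
        "-S", "deduped.bam",           # deduplicated output BAM
        "--method", "directional-adjacency",
        "--output-stats", "deduped",   # writes deduped_*.tsv tables
        "--paired",
    ]))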