Code Example #1
File: network.py  Project: zorrodong/UMI-tools
    def __call__(self, umis, counts):
        '''Counts is a dictionary that maps UMIs to their counts'''

        len_umis = [len(x) for x in umis]
        if not max(len_umis) == min(len_umis):
            U.warn("not all umis are the same length(!):  %d - %d" % (
                min(len_umis), max(len_umis)))

        adj_list = self.get_adj_list(umis, counts)

        clusters = self.get_connected_components(umis, adj_list, counts)

        final_umis = [list(x) for x in
                      self.get_groups(clusters, adj_list, counts)]

        return final_umis
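
Usage note (not part of the example above): the later examples drive this clustering through network.UMIClusterer, which is constructed with the method name and called with a {UMI bytes: count} dict plus an edit-distance threshold (see Code Example #4). Below is a minimal sketch under those assumptions, with umi_tools installed, "directional" as the chosen method, and illustrative counts:

from umi_tools.network import UMIClusterer

# Per-position UMI counts, keyed by UMI bytes.
counts = {b"ACGT": 12, b"ACGA": 3, b"TTGT": 5}

clusterer = UMIClusterer("directional")   # mirrors network.UMIClusterer(options.method)
groups = clusterer(counts, threshold=1)   # list of UMI groups, representative UMI first
print(groups)                             # e.g. [[b'ACGT', b'ACGA'], [b'TTGT']]
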
Code Example #2
File: network.py  Project: bdemaree/UMI-tools
    def __call__(self, umis, counts):
        '''Counts is a dictionary that maps UMIs to their counts'''

        len_umis = [len(x) for x in umis]
        if not max(len_umis) == min(len_umis):
            U.warn("not all umis are the same length(!):  %d - %d" % (
                min(len_umis), max(len_umis)))

        adj_list = self.get_adj_list(umis, counts)

        clusters = self.get_connected_components(umis, adj_list, counts)

        final_umis = [list(x) for x in
                      self.get_groups(clusters, adj_list, counts)]

        return final_umis
Code Example #3
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=usage,
                            description=globals()["__doc__"])
    group = U.OptionGroup(parser, "dedup-specific options")

    group.add_option("--output-stats",
                     dest="stats",
                     type="string",
                     default=False,
                     help="Specify location to output stats")

    parser.add_option_group(group)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = U.Start(parser, argv=argv)

    U.validateSamOptions(options, group=False)

    if options.random_seed:
        np.random.seed(options.random_seed)

    if options.stdin != sys.stdin:
        in_name = options.stdin.name
        options.stdin.close()
    else:
        raise ValueError("Input on standard in not currently supported")

    if options.stdout != sys.stdout:
        if options.no_sort_output:
            out_name = options.stdout.name
        else:
            out_name = U.getTempFilename(dir=options.tmpdir)
            sorted_out_name = options.stdout.name
        options.stdout.close()
    else:
        if options.no_sort_output:
            out_name = "-"
        else:
            out_name = U.getTempFilename(dir=options.tmpdir)
            sorted_out_name = "-"

    if not options.no_sort_output:  # need to determine the output format for sort
        if options.out_sam:
            sort_format = "sam"
        else:
            sort_format = "bam"

    if options.in_sam:
        in_mode = "r"
    else:
        in_mode = "rb"

    if options.out_sam:
        out_mode = "wh"
    else:
        out_mode = "wb"

    if options.stats and options.ignore_umi:
        raise ValueError("'--output-stats' and '--ignore-umi' options"
                         " cannot be used together")

    infile = pysam.Samfile(in_name, in_mode)
    outfile = pysam.Samfile(out_name, out_mode, template=infile)

    if options.paired:
        outfile = sam_methods.TwoPassPairWriter(infile, outfile)

    nInput, nOutput, input_reads, output_reads = 0, 0, 0, 0

    if options.detection_method:
        bam_features = detect_bam_features(infile.filename)

        if not bam_features[options.detection_method]:
            if sum(bam_features.values()) == 0:
                raise ValueError(
                    "There are no bam tags available to detect multimapping. "
                    "Do not set --multimapping-detection-method")
            else:
                raise ValueError(
                    "The chosen method of detection for multimapping (%s) "
                    "will not work with this bam. Multimapping can be detected"
                    " for this bam using any of the following: %s" %
                    (options.detection_method, ",".join(
                        [x for x in bam_features if bam_features[x]])))

    gene_tag = options.gene_tag
    metacontig2contig = None

    if options.chrom:
        inreads = infile.fetch(reference=options.chrom)

    else:
        if options.per_contig and options.gene_transcript_map:
            metacontig2contig = sam_methods.getMetaContig2contig(
                infile, options.gene_transcript_map)
            metatag = "MC"
            inreads = sam_methods.metafetcher(infile, metacontig2contig,
                                              metatag)
            gene_tag = metatag

        else:
            inreads = infile.fetch()

    # set up ReadCluster functor with methods specific to
    # specified options.method
    processor = network.ReadDeduplicator(options.method)

    bundle_iterator = sam_methods.get_bundles(
        options, metacontig_contig=metacontig2contig)

    if options.stats:
        # set up arrays to hold stats data
        stats_pre_df_dict = {"UMI": [], "counts": []}
        stats_post_df_dict = {"UMI": [], "counts": []}
        pre_cluster_stats = []
        post_cluster_stats = []
        pre_cluster_stats_null = []
        post_cluster_stats_null = []
        topology_counts = collections.Counter()
        node_counts = collections.Counter()
        read_gn = umi_methods.random_read_generator(
            infile.filename,
            chrom=options.chrom,
            barcode_getter=bundle_iterator.barcode_getter)

    for bundle, key, status in bundle_iterator(inreads):

        nInput += sum([bundle[umi]["count"] for umi in bundle])

        while nOutput >= output_reads + 100000:
            output_reads += 100000
            U.info("Written out %i reads" % output_reads)

        while nInput >= input_reads + 1000000:
            input_reads += 1000000
            U.info("Parsed %i input reads" % input_reads)

        if options.stats:
            # generate pre-dedup stats
            average_distance = umi_methods.get_average_umi_distance(
                bundle.keys())
            pre_cluster_stats.append(average_distance)
            cluster_size = len(bundle)
            random_umis = read_gn.getUmis(cluster_size)
            average_distance_null = umi_methods.get_average_umi_distance(
                random_umis)
            pre_cluster_stats_null.append(average_distance_null)

        if options.ignore_umi:
            for umi in bundle:
                nOutput += 1
                outfile.write(bundle[umi]["read"])

        else:

            # dedup using umis and write out deduped bam
            reads, umis, umi_counts = processor(bundle=bundle,
                                                threshold=options.threshold)

            for read in reads:
                outfile.write(read)
                nOutput += 1

            if options.stats:

                # collect pre-dedup stats
                stats_pre_df_dict['UMI'].extend(bundle)
                stats_pre_df_dict['counts'].extend(
                    [bundle[UMI]['count'] for UMI in bundle])

                # collect post-dedup stats
                post_cluster_umis = [
                    bundle_iterator.barcode_getter(x)[0] for x in reads
                ]
                stats_post_df_dict['UMI'].extend(umis)
                stats_post_df_dict['counts'].extend(umi_counts)

                average_distance = umi_methods.get_average_umi_distance(
                    post_cluster_umis)
                post_cluster_stats.append(average_distance)

                cluster_size = len(post_cluster_umis)
                random_umis = read_gn.getUmis(cluster_size)
                average_distance_null = umi_methods.get_average_umi_distance(
                    random_umis)
                post_cluster_stats_null.append(average_distance_null)

    outfile.close()

    if not options.no_sort_output:
        # sort the output
        pysam.sort("-o", sorted_out_name, "-O", sort_format, out_name)
        os.unlink(out_name)  # delete the tempfile

    if options.stats:

        # generate the stats dataframe
        stats_pre_df = pd.DataFrame(stats_pre_df_dict)
        stats_post_df = pd.DataFrame(stats_post_df_dict)

        # tally the counts per umi per position
        pre_counts = collections.Counter(stats_pre_df["counts"])
        post_counts = collections.Counter(stats_post_df["counts"])
        counts_index = list(
            set(pre_counts.keys()).union(set(post_counts.keys())))
        counts_index.sort()
        with U.openFile(options.stats + "_per_umi_per_position.tsv",
                        "w") as outf:
            outf.write("counts\tinstances_pre\tinstances_post\n")
            for count in counts_index:
                values = (count, pre_counts[count], post_counts[count])
                outf.write("\t".join(map(str, values)) + "\n")

        # aggregate stats pre/post per UMI
        agg_pre_df = aggregateStatsDF(stats_pre_df)
        agg_post_df = aggregateStatsDF(stats_post_df)

        agg_df = pd.merge(agg_pre_df,
                          agg_post_df,
                          how='left',
                          left_index=True,
                          right_index=True,
                          sort=True,
                          suffixes=["_pre", "_post"])

        # TS - if count value not observed either pre/post-dedup,
        # merge will leave an empty cell and the column will be cast as a float
        # see http://pandas.pydata.org/pandas-docs/dev/missing_data.html
        # --> Missing data casting rules and indexing
        # so, back fill with zeros and convert back to int
        agg_df = agg_df.fillna(0).astype(int)

        agg_df.index = [x.decode() for x in agg_df.index]
        agg_df.index.name = 'UMI'
        agg_df.to_csv(options.stats + "_per_umi.tsv", sep="\t")

        # bin distances into integer bins
        max_ed = int(
            max(
                map(max, [
                    pre_cluster_stats, post_cluster_stats,
                    pre_cluster_stats_null, post_cluster_stats_null
                ])))

        cluster_bins = range(-1, int(max_ed) + 2)

        def bin_clusters(cluster_list, bins=cluster_bins):
            ''' take list of floats and return bins'''
            return np.digitize(cluster_list, bins, right=True)

        def tallyCounts(binned_cluster, max_edit_distance):
            ''' tally counts per bin '''
            return np.bincount(binned_cluster, minlength=max_edit_distance + 3)

        pre_cluster_binned = bin_clusters(pre_cluster_stats)
        post_cluster_binned = bin_clusters(post_cluster_stats)
        pre_cluster_null_binned = bin_clusters(pre_cluster_stats_null)
        post_cluster_null_binned = bin_clusters(post_cluster_stats_null)

        edit_distance_df = pd.DataFrame(
            {
                "unique":
                tallyCounts(pre_cluster_binned, max_ed),
                "unique_null":
                tallyCounts(pre_cluster_null_binned, max_ed),
                options.method:
                tallyCounts(post_cluster_binned, max_ed),
                "%s_null" % options.method:
                tallyCounts(post_cluster_null_binned, max_ed),
                "edit_distance":
                cluster_bins
            },
            columns=[
                "unique", "unique_null", options.method,
                "%s_null" % options.method, "edit_distance"
            ])

        # TS - set lowest bin (-1) to "Single_UMI"
        edit_distance_df['edit_distance'][0] = "Single_UMI"

        edit_distance_df.to_csv(options.stats + "_edit_distance.tsv",
                                index=False,
                                sep="\t")

    # write footer and output benchmark information.
    U.info("Reads: %s" % ", ".join([
        "%s: %s" % (x[0], x[1])
        for x in bundle_iterator.read_events.most_common()
    ]))

    U.info("Number of reads out: %i" % nOutput)

    if not options.ignore_umi:  # otherwise processor has not been used
        U.info("Total number of positions deduplicated: %i" %
               processor.UMIClusterer.positions)
        if processor.UMIClusterer.positions > 0:
            U.info("Mean number of unique UMIs per position: %.2f" %
                   (float(processor.UMIClusterer.total_umis_per_position) /
                    processor.UMIClusterer.positions))
            U.info("Max. number of unique UMIs per position: %i" %
                   processor.UMIClusterer.max_umis_per_position)
        else:
            U.warn("The BAM did not contain any valid "
                   "reads/read pairs for deduplication")

    U.Stop()
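
A detail in the stats block above that is easy to misread: average UMI distances of -1 mark single-UMI positions, and bin_clusters/tallyCounts use np.digitize and np.bincount so that the tallies line up one-to-one with cluster_bins (which becomes the edit_distance column). A small, self-contained sketch with made-up distances (the real values come from umi_methods.get_average_umi_distance):

import numpy as np

# Made-up average edit distances; -1 is the sentinel for positions with a single UMI.
pre_cluster_stats = [2.5, 1.0, -1, 3.2]

max_ed = int(max(pre_cluster_stats))
cluster_bins = range(-1, max_ed + 2)                 # [-1, 0, 1, 2, 3, 4]

binned = np.digitize(pre_cluster_stats, cluster_bins, right=True)
tallies = np.bincount(binned, minlength=max_ed + 3)  # one tally per entry in cluster_bins

for edge, count in zip(cluster_bins, tallies):
    print(edge, count)
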
Code Example #4
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=usage,
                            description=globals()["__doc__"])

    group = U.OptionGroup(parser, "group-specific options")

    group.add_option(
        "--group-out",
        dest="tsv",
        type="string",
        help="Outfile name for file mapping read id to read group",
        default=None)

    group.add_option(
        "--output-bam",
        dest="output_bam",
        action="store_true",
        default=False,
        help=("output a bam file with read groups tagged using the UG tag "
              "[default=%default]"))

    parser.add_option("--umi-group-tag",
                      dest="umi_group_tag",
                      type="string",
                      help="tag for the outputted umi group",
                      default='BX')

    parser.add_option_group(group)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = U.Start(parser, argv=argv)

    U.validateSamOptions(options, group=True)

    if options.stdin != sys.stdin:
        in_name = options.stdin.name
        options.stdin.close()
    else:
        raise ValueError("Input on standard in not currently supported")

    if options.stdout != sys.stdout:
        if options.no_sort_output:
            out_name = options.stdout.name
        else:
            out_name = U.getTempFilename(dir=options.tmpdir)
            sorted_out_name = options.stdout.name
        options.stdout.close()
        assert options.output_bam, (
            "To output a bam you must include --output-bam option")
    else:
        if options.no_sort_output:
            out_name = "-"
        else:
            out_name = U.getTempFilename(dir=options.tmpdir)
            sorted_out_name = "-"

    if not options.no_sort_output:  # need to determine the output format for sort
        if options.out_sam:
            sort_format = "sam"
        else:
            sort_format = "bam"

    if options.in_sam:
        in_mode = "r"
    else:
        in_mode = "rb"

    if options.out_sam:
        out_mode = "wh"
    else:
        out_mode = "wb"

    infile = pysam.Samfile(in_name, in_mode)

    if options.output_bam:
        outfile = pysam.Samfile(out_name, out_mode, template=infile)
    else:
        outfile = None

    if options.tsv:
        mapping_outfile = U.openFile(options.tsv, "w")
        mapping_outfile.write("%s\n" % "\t".join([
            "read_id", "contig", "position", "gene", "umi", "umi_count",
            "final_umi", "final_umi_count", "unique_id"
        ]))

    nInput, nOutput, unique_id, input_reads, output_reads = 0, 0, 0, 0, 0

    gene_tag = options.gene_tag
    metacontig2contig = None

    if options.unmapped_reads in ["use", "output"]:
        output_unmapped = True
    else:
        output_unmapped = False

    if options.chrom:
        inreads = infile.fetch(reference=options.chrom)
    else:
        if options.per_gene and options.gene_transcript_map:
            metacontig2contig = sam_methods.getMetaContig2contig(
                infile, options.gene_transcript_map)
            metatag = "MC"
            inreads = sam_methods.metafetcher(infile, metacontig2contig,
                                              metatag)
            gene_tag = metatag

        else:
            inreads = infile.fetch(until_eof=output_unmapped)

    bundle_iterator = sam_methods.get_bundles(
        options,
        all_reads=True,
        return_read2=True,
        return_unmapped=output_unmapped,
        metacontig_contig=metacontig2contig)

    # set up UMIClusterer functor with methods specific to
    # specified options.method
    processor = network.UMIClusterer(options.method)

    for bundle, key, status in bundle_iterator(inreads):

        # write out read2s and unmapped/chimeric (if these options are set)
        if status == 'single_read':
            # bundle is just a single read here
            nInput += 1

            if outfile:
                outfile.write(bundle)

            nOutput += 1
            continue

        umis = bundle.keys()
        counts = {umi: bundle[umi]["count"] for umi in umis}

        nInput += sum(counts.values())

        while nOutput >= output_reads + 10000:
            output_reads += 10000
            U.info("Written out %i reads" % output_reads)

        while nInput >= input_reads + 1000000:
            input_reads += 1000000
            U.info("Parsed %i input reads" % input_reads)

        # group the umis
        groups = processor(counts, threshold=options.threshold)

        for umi_group in groups:
            top_umi = umi_group[0]

            group_count = sum(counts[umi] for umi in umi_group)

            for umi in umi_group:
                reads = bundle[umi]['read']
                for read in reads:
                    if outfile:
                        # Add the 'UG' tag to the read
                        read.set_tag('UG', unique_id)
                        read.set_tag(options.umi_group_tag, top_umi)
                        outfile.write(read)

                    if options.tsv:
                        if options.per_gene:
                            gene = read.get_tag(gene_tag)
                        else:
                            gene = "NA"
                        mapping_outfile.write("%s\n" % "\t".join(
                            map(str,
                                (read.query_name, read.reference_name,
                                 sam_methods.get_read_position(
                                     read, options.soft_clip_threshold)[1],
                                 gene, umi.decode(), counts[umi],
                                 top_umi.decode(), group_count, unique_id))))

                    nOutput += 1

            unique_id += 1

    if outfile:
        outfile.close()
        if not options.no_sort_output:
            # sort the output
            pysam.sort("-o", sorted_out_name, "-O", sort_format, "--no-PG",
                       out_name)
            os.unlink(out_name)  # delete the tempfile

    if options.tsv:
        mapping_outfile.close()

    # write footer and output benchmark information.
    U.info("Reads: %s" % ", ".join([
        "%s: %s" % (x[0], x[1])
        for x in bundle_iterator.read_events.most_common()
    ]))
    U.info("Number of reads out: %i, Number of groups: %i" %
           (nOutput, unique_id))

    U.info("Total number of positions deduplicated: %i" % processor.positions)
    if processor.positions > 0:
        U.info(
            "Mean number of unique UMIs per position: %.2f" %
            (float(processor.total_umis_per_position) / processor.positions))
        U.info("Max. number of unique UMIs per position: %i" %
               processor.max_umis_per_position)
    else:
        U.warn("The BAM did not contain any valid "
               "reads/read pairs for deduplication")

    U.Stop()
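
In the tagging loop above, every read of a group receives the integer group id as its UG tag and the group's representative UMI as the group tag (default BX). A minimal pysam sketch of just that tagging step on a synthetic read; the header and the values are illustrative, not taken from the example:

import pysam

# Build a minimal header and an empty read purely to demonstrate the tagging calls.
header = pysam.AlignmentHeader.from_dict(
    {"HD": {"VN": "1.6"}, "SQ": [{"SN": "chr1", "LN": 1000}]})
read = pysam.AlignedSegment(header)
read.query_name = "read1"

unique_id = 0        # group id, as assigned per UMI group in the loop above
top_umi = b"ACGT"    # representative UMI of the group (stored as bytes)

read.set_tag("UG", unique_id)
read.set_tag("BX", top_umi.decode())   # default --umi-group-tag is BX

print(read.get_tag("UG"), read.get_tag("BX"))
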
Code Example #5
File: sam_methods.py  Project: yh154/UMI-tools
    def __call__(self, inreads):

        for read in inreads:

            if read.is_read2:
                if self.return_read2:
                    if not read.is_unmapped or (read.is_unmapped
                                                and self.return_unmapped):
                        yield read, None, "single_read"
                continue
            else:
                self.read_events['Input Reads'] += 1

            # only ever dealing with read1s from here

            if self.options.paired:
                if read.is_paired:
                    self.read_events['Read pairs'] += 1
                else:
                    self.read_events['Unpaired reads'] += 1

                    # if paired end input and read1 is unpaired...

                    # skip, or
                    if self.options.unpaired_reads == "discard":
                        continue

                    # yield without grouping, or
                    elif self.options.unpaired_reads == "output":
                        yield read, None, "single_read"

                    # Use read pair; TLEN will be 0
                    elif self.options.unpaired_reads == "use":
                        pass

            if read.is_unmapped:
                if self.options.paired:
                    if read.mate_is_unmapped:
                        self.read_events['Both unmapped'] += 1
                    else:
                        self.read_events['Read 1 unmapped'] += 1
                else:
                    self.read_events['Single end unmapped'] += 1

                # if read1 is unmapped, yield immediately or skip read
                if self.return_unmapped:
                    self.read_events['Input Reads'] += 1
                    yield read, None, "single_read"
                continue

            if self.options.paired and read.mate_is_unmapped:
                if not read.is_unmapped:
                    self.read_events['Read 2 unmapped'] += 1

                # if paired end input and read2 is unmapped, skip unless
                # options.unmapped_reads == "use", in which case TLEN will be 0
                if self.options.unmapped_reads != "use":
                    if self.return_unmapped:
                        yield read, None, "single_read"
                        continue

            if read.is_paired and (read.reference_name !=
                                   read.next_reference_name):
                self.read_events['Chimeric read pair'] += 1

                # if paired end input and read2 is mapped to another contig...

                # skip, or
                if self.options.chimeric_pairs == "discard":
                    continue

                # yield without grouping, or
                elif self.options.chimeric_pairs == "output":
                    yield read, None, "single_read"
                    continue

                # Use read pair; TLEN will be 0
                elif self.options.chimeric_pairs == "use":
                    pass

            if self.options.subset:
                if random.random() >= self.options.subset:
                    self.read_events['Randomly excluded'] += 1
                    continue

            if self.options.mapping_quality:
                if read.mapq < self.options.mapping_quality:
                    self.read_events['< MAPQ threshold'] += 1
                    continue

            # get the umi +/- cell barcodes
            if self.options.ignore_umi:
                if self.options.per_cell:
                    umi, cell = self.barcode_getter(read)
                    umi = ""
                else:
                    umi, cell = "", ""
            else:
                try:
                    umi, cell = self.barcode_getter(read)
                except KeyError:
                    error_msg = "Read skipped, missing umi and/or cell tag"
                    if self.read_events[error_msg] == 0:

                        # pysam renamed .tostring -> to_string in 0.14
                        # .tostring requires access to the parent AlignmentFile
                        try:
                            formatted_read = read.to_string()
                        except AttributeError:
                            formatted_read = read.query_name

                        U.warn("At least one read is missing UMI and/or "
                               "cell tag(s): %s" % formatted_read)
                    self.read_events[error_msg] += 1
                    continue

            self.current_chr = read.reference_name

            if self.options.per_gene:

                if self.options.per_contig:

                    if self.metacontig_contig:
                        transcript = read.reference_name
                        gene = self.contig_metacontig[transcript]
                    else:
                        gene = read.reference_name

                elif self.options.gene_tag:

                    try:
                        assigned = read.get_tag(self.options.assigned_tag)
                        gene = read.get_tag(self.options.gene_tag)
                    except KeyError:
                        self.read_events['Read skipped, no tag'] += 1
                        continue

                    if gene == "":
                        if self.read_events[
                                'Read skipped - gene string is empty'] == 0:
                            U.warn("Assigned gene is empty string. First such "
                                   "read:\n%s" % read.to_string())
                        self.read_events[
                            'Read skipped - gene string is empty'] += 1
                        continue

                    if re.search(self.options.skip_regex, assigned):
                        self.read_events[
                            'Read skipped - assigned tag matches skip_regex'] += 1
                        continue

                pos = gene
                key = pos

                if self.last_chr:
                    do_output, out_keys = self.check_output()
                else:
                    do_output = False

                if do_output:
                    for p in out_keys:
                        for k in sorted(self.reads_dict[p].keys()):
                            yield self.reads_dict[p][k], k, "bundle"

                        del self.reads_dict[p]

                self.last_chr = self.current_chr
                self.last_pos = pos

            else:

                start, pos, is_spliced = get_read_position(
                    read, self.options.soft_clip_threshold)

                do_output, out_keys = self.check_output()

                if do_output:
                    for p in out_keys:
                        for k in sorted(self.reads_dict[p].keys()):
                            yield self.reads_dict[p][k], k, "bundle"

                        del self.reads_dict[p]
                        if p in self.read_counts:
                            del self.read_counts[p]

                self.last_pos = self.start
                self.last_chr = self.current_chr

                if self.options.read_length:
                    r_length = read.query_length
                else:
                    r_length = 0

                key = (read.is_reverse, self.options.spliced and is_spliced,
                       self.options.paired * read.tlen, r_length)

            # update dictionaries
            key = (key, cell)
            self.update_dicts(read, pos, key, umi)

            if self.metacontig_contig:
                # keep track of observed contigs for each gene
                self.observed_contigs[gene].add(transcript)

        # yield remaining bundles
        for p in sorted(self.reads_dict.keys()):
            for k in sorted(self.reads_dict[p].keys()):
                yield self.reads_dict[p][k], k, "bundle"
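
For orientation (not part of the example): this generator yields (bundle, key, status) triples. When status is "single_read" the first element is a single pysam read; otherwise it is a dict mapping each UMI to its count and read(s), which is how Code Examples #3 and #4 consume it. A tiny illustrative consumer with a hand-built bundle (placeholder strings stand in for pysam reads, and the key is simplified):

bundle = {
    b"ACGT": {"count": 12, "read": ["read_obj_1", "read_obj_2"]},
    b"ACGA": {"count": 3, "read": ["read_obj_3"]},
}
key, status = ("pos_key", "cell"), "bundle"

if status != "single_read":
    counts = {umi: bundle[umi]["count"] for umi in bundle}
    print(key, counts)
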
Code Example #6
File: prepare-for-rsem.py  Project: nf-core/rnaseq
def main(argv=None):

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=usage,
                            description=globals()["__doc__"])
    group = U.OptionGroup(parser, "RSEM preparation specific options")

    group.add_option(
        "--tags",
        dest="tags",
        type="string",
        default="UG,BX",
        help="Comma-separated list of tags to transfer from read1 to read2")
    group.add_option("--sam",
                     dest="sam",
                     action="store_true",
                     default=False,
                     help="input and output SAM rather than BAM")

    parser.add_option_group(group)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = U.Start(parser,
                              argv=argv,
                              add_group_dedup_options=False,
                              add_umi_grouping_options=False,
                              add_sam_options=False)

    skipped_stats = Counter()

    if options.stdin != sys.stdin:
        in_name = options.stdin.name
        options.stdin.close()
    else:
        in_name = "-"

    if options.sam:
        mode = ""
    else:
        mode = "b"

    inbam = pysam.AlignmentFile(in_name, "r" + mode)

    if options.stdout != sys.stdout:
        out_name = options.stdout.name
        options.stdout.close()
    else:
        out_name = "-"

    outbam = pysam.AlignmentFile(out_name, "w" + mode, template=inbam)

    options.tags = options.tags.split(",")

    for template in chunk_bam(inbam):

        assert len(set(r.query_name for r in template)) == 1
        current_template = {True: defaultdict(list), False: defaultdict(list)}

        for read in template:
            key = (read.reference_name, read.pos, not read.is_secondary)
            current_template[read.is_read1][key].append(read)

        output = set()

        for read in template:

            mate = None

            # if this read is a non-primary alignment, we first want to check whether it has a
            # mate with the non-primary alignment flag set.

            mate_key_primary = (True)
            mate_key_secondary = (read.next_reference_name,
                                  read.next_reference_start, False)

            # First look for a read that has the same primary/secondary status
            # as read (i.e. secondary mate for secondary read, and primary mate
            # for primary read)
            mate_key = (read.next_reference_name, read.next_reference_start,
                        read.is_secondary)
            mate = pick_mate(read, current_template, mate_key)

            # If none was found then look for the opposite (primary mate of secondary
            # read or secondary mate of primary read)
            if mate is None:
                mate_key = (read.next_reference_name,
                            read.next_reference_start, not read.is_secondary)
                mate = pick_mate(read, current_template, mate_key)

            # If we still don't have a mate, then there can't be one.
            if mate is None:
                skipped_stats["no_mate"] += 1
                U.warn("Alignment {} has no mate -- skipped".format("\t".join(
                    map(str, [
                        read.query_name, read.flag, read.reference_name,
                        int(read.pos)
                    ]))))
                continue

            # Because we might want to make changes to the read without having those changes reflected
            # when we need the read again, we copy the read. This is the only way I can find to do this.
            read = pysam.AlignedSegment().from_dict(read.to_dict(),
                                                    read.header)
            mate = pysam.AlignedSegment().from_dict(mate.to_dict(),
                                                    read.header)

            # Make it so that if our read is secondary, the mate is also secondary. We don't make the
            # mate primary if the read is primary because we would otherwise end up with multiple
            # primary alignments.
            if read.is_secondary:
                mate.is_secondary = True

            # In a situation where there is already one mate for each read, we will come across
            # each pair twice - once when we scan read1 and once when we scan read2. Thus we need
            # to make sure we don't output something already output.
            if read.is_read1:

                mate = copy_tags(options.tags, read, mate)
                output_key = str(read) + str(mate)

                if output_key not in output:
                    output.add(output_key)
                    outbam.write(read)
                    outbam.write(mate)
                    skipped_stats["pairs_output"] += 1

            elif read.is_read2:

                read = copy_tags(options.tags, mate, read)
                output_key = str(mate) + str(read)

                if output_key not in output:
                    output.add(output_key)
                    outbam.write(mate)
                    outbam.write(read)
                    skipped_stats["pairs_output"] += 1

            else:
                skipped_stats["skipped_not_read12"] += 1
                U.warn("Alignment {} is neither read1 nor read2 -- skipped".
                       format("\t".join(
                           map(str, [
                               read.query_name, read.flag, read.reference_name,
                               int(read.pos)
                           ]))))
                continue

    if not out_name == "-":
        outbam.close()

    U.info("Total pairs output: {}, Pairs skipped - no mates: {},"
           " Pairs skipped - not read1 or 2: {}".format(
               skipped_stats["pairs_output"], skipped_stats["no_mate"],
               skipped_stats["skipped_not_read12"]))
    U.Stop()
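
Code Example #6 depends on two helpers that are not shown here, chunk_bam and pick_mate. Their real implementations are not part of this snippet; the following is a rough, hypothetical sketch of what chunk_bam presumably does (yield consecutive alignments that share a query name, which assumes the input BAM is collated by read name):

from itertools import groupby

def chunk_bam_sketch(inbam):
    """Hypothetical stand-in for chunk_bam: group consecutive alignments by query name."""
    for _, template in groupby(inbam, key=lambda read: read.query_name):
        yield list(template)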