Python-Beispiele für batch_sort (chimerascan.lib.batch_sort.batch_sort)

Beispiel #1

0

Datei anzeigen

Datei: resolve_discordant_reads.py Projekt: BioXiao/chimerascan

def sort_read_stats_by_read_name(input_file, output_file, tmp_dir):
    """Sort a tab-delimited read stats file on its read name column.

    Delegates to batch_sort, reading `input_file`, writing `output_file`
    and passing `tmp_dir` as the only temporary directory. The sort key of
    each line is the field at index QNAME_COL.
    """
    def read_name_key(record):
        # split no further than needed to isolate the read name column
        columns = record.strip().split('\t', QNAME_COL + 1)
        return columns[QNAME_COL]

    scratch_dirs = [tmp_dir]
    batch_sort(input=input_file, output=output_file, key=read_name_key,
               buffer_size=32000, tempdirs=scratch_dirs)

Beispiel #2

0

Datei anzeigen

def sort_read_stats_by_read_name(input_file, output_file, tmp_dir):
    """Order the lines of a read stats file by read name.

    The key for every line is its QNAME_COL-th tab-separated field; the
    actual sorting is handed to batch_sort with `tmp_dir` as the single
    temporary directory.
    """
    # the split is capped at QNAME_COL + 1 so later columns stay unparsed
    batch_sort(
        input=input_file,
        output=output_file,
        key=lambda row: row.strip().split('\t', QNAME_COL + 1)[QNAME_COL],
        buffer_size=32000,
        tempdirs=[tmp_dir],
    )

Beispiel #3

0

Datei anzeigen

Datei: discordant_reads_to_bedpe.py Projekt: bioinfo-dirty-jobs/chimerascan-vrl

def sort_bedpe(input_file, output_file, tmp_dir):
    """Sort a BEDPE file by paired chromosome/position.

    Each line is keyed on its tab-separated columns 0, 3, 1 and 4, in that
    order. The key components are left as strings (no numeric conversion).
    Sorting is delegated to batch_sort using `tmp_dir` for temporary files.
    """
    def paired_position_key(record):
        cols = record.strip().split('\t')
        return (cols[0], cols[3], cols[1], cols[4])

    batch_sort(input=input_file, output=output_file,
               key=paired_position_key, buffer_size=32000,
               tempdirs=[tmp_dir])

Beispiel #4

0

Datei anzeigen

Datei: discordant_reads_to_bedpe_v1.py Projekt: marcopavoni/chimerascan

def sort_bedpe(input_file, output_file, tmp_dir):
    """Sort the BEDPE records of `input_file` into `output_file`.

    Records are keyed on the tuple (column 0, column 3, column 1,
    column 4) of each tab-separated line; all four components remain
    strings. batch_sort performs the sort with `tmp_dir` as its only
    temporary directory.
    """
    key_columns = (0, 3, 1, 4)

    def bedpe_key(row):
        parts = row.strip().split('\t')
        return tuple([parts[i] for i in key_columns])

    batch_sort(
        input=input_file,
        output=output_file,
        key=bedpe_key,
        buffer_size=32000,
        tempdirs=[tmp_dir],
    )

Beispiel #5

0

Datei anzeigen

Datei: chimeras_to_breakpoints_v3.py Projekt: marcopavoni/chimerascan

def chimeras_to_breakpoints(input_file, breakpoint_sorted_chimera_file,
                            breakpoint_map_file, breakpoint_fasta_file,
                            tmp_dir):
    """Sort chimeras by breakpoint name and write breakpoint FASTA/map files.

    `input_file` is sorted on the Chimera.BREAKPOINT_NAME_FIELD column into
    `breakpoint_sorted_chimera_file`. The sorted file is parsed with
    Chimera.parse and, for every run of consecutive chimeras sharing a
    breakpoint name, two records are written:

    * to `breakpoint_fasta_file`: ">name" followed by split_seq() of the
      5' + 3' breakpoint sequence of the first chimera in the run;
    * to `breakpoint_map_file`: name, sequence and the comma-joined chimera
      names, tab-separated.

    Chimera names are sorted for every group, including the last one, so
    the map file is deterministic.
    """
    # sort chimera file by breakpoint name
    def sortfunc(line):
        fields = line.strip().split('\t')
        return fields[Chimera.BREAKPOINT_NAME_FIELD]

    batch_sort(input=input_file,
               output=breakpoint_sorted_chimera_file,
               key=sortfunc,
               buffer_size=32000,
               tempdirs=[tmp_dir])

    def write_group(fastafh, mapfh, name, seq, names):
        # one FASTA record and one map line per breakpoint
        print >> fastafh, ">%s\n%s" % (name, split_seq(seq))
        print >> mapfh, "%s\t%s\t%s" % (name, seq, ",".join(sorted(names)))

    # parse and build breakpoint -> chimera map; the nested with-blocks
    # guarantee all three files are closed even if parsing fails
    with open(breakpoint_fasta_file, "w") as fastafh:
        with open(breakpoint_map_file, "w") as mapfh:
            with open(breakpoint_sorted_chimera_file) as chimerafh:
                prev_breakpoint_name = None
                prev_seq = None
                chimera_names = set()
                for c in Chimera.parse(chimerafh):
                    if c.breakpoint_name != prev_breakpoint_name:
                        if chimera_names:
                            write_group(fastafh, mapfh, prev_breakpoint_name,
                                        prev_seq, chimera_names)
                            chimera_names = set()
                        # remember the sequence of the first chimera of
                        # the new breakpoint group
                        prev_seq = c.breakpoint_seq_5p + c.breakpoint_seq_3p
                        prev_breakpoint_name = c.breakpoint_name
                    chimera_names.add(c.name)
                # flush the final group
                if chimera_names:
                    write_group(fastafh, mapfh, prev_breakpoint_name,
                                prev_seq, chimera_names)

Beispiel #6

0

Datei anzeigen

Datei: chimeras_to_breakpoints.py Projekt: bioinfo-dirty-jobs/chimerascan-vrl

def chimeras_to_breakpoints(input_file, breakpoint_sorted_chimera_file, 
                            breakpoint_map_file, breakpoint_fasta_file,
                            tmp_dir):
    """Write breakpoint FASTA and breakpoint -> chimera map files.

    The chimera file `input_file` is first sorted by the
    Chimera.BREAKPOINT_NAME_FIELD column into
    `breakpoint_sorted_chimera_file`. Consecutive chimeras with the same
    breakpoint name are then grouped; for each group a FASTA record
    (name, split_seq() of the concatenated 5'/3' breakpoint sequence of
    the group's first chimera) goes to `breakpoint_fasta_file` and a
    tab-separated line (name, sequence, comma-joined sorted chimera names)
    goes to `breakpoint_map_file`.

    The last group is written with sorted chimera names like all others,
    which keeps the map file output stable between runs.
    """
    # sort chimera file by breakpoint name
    def sortfunc(line):
        fields = line.strip().split('\t')
        return fields[Chimera.BREAKPOINT_NAME_FIELD]
    tempdirs = [tmp_dir]
    batch_sort(input=input_file,
               output=breakpoint_sorted_chimera_file,
               key=sortfunc,
               buffer_size=32000,
               tempdirs=tempdirs)

    def emit(fastafh, mapfh, breakpoint_name, seq, names):
        # write to fasta
        print >>fastafh, ">%s\n%s" % (breakpoint_name, split_seq(seq))
        # write to map file
        print >>mapfh, "%s\t%s\t%s" % (breakpoint_name, seq,
                                       ",".join(sorted(names)))

    # parse and build breakpoint -> chimera map
    fastafh = open(breakpoint_fasta_file, "w")
    try:
        mapfh = open(breakpoint_map_file, "w")
        try:
            chimerafh = open(breakpoint_sorted_chimera_file)
            try:
                prev_breakpoint_name = None
                prev_seq = None
                chimera_names = set()
                for c in Chimera.parse(chimerafh):
                    if c.breakpoint_name != prev_breakpoint_name:
                        if chimera_names:
                            emit(fastafh, mapfh, prev_breakpoint_name,
                                 prev_seq, chimera_names)
                            chimera_names = set()
                        prev_seq = c.breakpoint_seq_5p + c.breakpoint_seq_3p
                        prev_breakpoint_name = c.breakpoint_name
                    chimera_names.add(c.name)
                # the loop only flushes on a name change, so the final
                # group still has to be written here
                if chimera_names:
                    emit(fastafh, mapfh, prev_breakpoint_name,
                         prev_seq, chimera_names)
            finally:
                chimerafh.close()
        finally:
            mapfh.close()
    finally:
        fastafh.close()

Beispiel #7

0

Datei anzeigen

Datei: pair_clusters.py Projekt: BioXiao/chimerascan

def pair_discordant_clusters(discordant_bam_file, cluster_pair_file, tmp_dir):
    """Enumerate and group 5'/3' cluster pairs from an annotated BAM file.

    Steps, all using temporary files inside `tmp_dir`:

    1. sort `discordant_bam_file` by read name with pysam.sort;
    2. for every paired-end read group from parse_pe_reads, split reads by
       ORIENTATION_TAG and write one line (5' cluster id, 3' cluster id,
       read name) per 5'/3' read combination;
    3. sort those lines by the two cluster ids with batch_sort;
    4. write the groups produced by parse_and_group_cluster_pairs to
       `cluster_pair_file` as: running pair id, 5' id, 3' id, comma-joined
       read names.

    The temporary files are removed afterwards. Returns config.JOB_SUCCESS.
    """
    #
    # sort the BAM file that has cluster annotations by read name
    #
    logging.debug("Sorting newly annotated discordant BAM file by read name")
    # Only the base name is joined onto tmp_dir: os.path.join() would drop
    # tmp_dir for an absolute BAM path and would build a nested, possibly
    # non-existent directory for a relative path with directory components.
    bam_root = os.path.splitext(os.path.basename(discordant_bam_file))[0]
    qname_sorted_bam_prefix = os.path.join(tmp_dir, bam_root + ".byname")
    qname_sorted_bam_file = qname_sorted_bam_prefix + ".bam"
    pysam.sort("-n", "-m", str(int(1e9)), discordant_bam_file, qname_sorted_bam_prefix)
    #
    # iterate through named-sorted bam file write cluster pairs
    #
    logging.debug("Enumerating cluster pairs")
    tmp_cluster_file = os.path.join(tmp_dir, "tmp_clusters.txt")
    with open(tmp_cluster_file, 'w') as tmp_cluster_fh:
        bamfh = pysam.Samfile(qname_sorted_bam_file, "rb")
        try:
            for pe_reads in parse_pe_reads(bamfh):
                # group into 5' and 3' reads
                reads5p = []
                reads3p = []
                for reads in pe_reads:
                    for r in reads:
                        orientation = r.opt(ORIENTATION_TAG)
                        if orientation == ORIENTATION_5P:
                            reads5p.append(r)
                        else:
                            reads3p.append(r)
                # iterate through possible pairs
                for r5p in reads5p:
                    for r3p in reads3p:
                        id5p = r5p.opt(DISCORDANT_CLUSTER_TAG)
                        id3p = r3p.opt(DISCORDANT_CLUSTER_TAG)
                        print >>tmp_cluster_fh, '\t'.join(map(str, (id5p, id3p, r5p.qname)))
        finally:
            bamfh.close()
    #
    # sort cluster pairs
    #
    logging.debug("Sorting cluster pairs")
    tmp_sorted_cluster_file = os.path.join(tmp_dir, "tmp_clusters.srt.txt")
    def sortfunc(line):
        fields = line.strip().split('\t')
        return (fields[0], fields[1])
    batch_sort(input=tmp_cluster_file,
               output=tmp_sorted_cluster_file,
               key=sortfunc,
               buffer_size=32000,
               tempdirs=[tmp_dir])
    #
    # write cluster pairs
    #
    logging.debug("Grouping cluster pairs")
    with open(cluster_pair_file, "w") as outfh:
        with open(tmp_sorted_cluster_file) as sortedfh:
            grouped = parse_and_group_cluster_pairs(sortedfh)
            for pair_id, (id5p, id3p, qnames) in enumerate(grouped):
                print >>outfh, '\t'.join(map(str, [pair_id, id5p, id3p, ','.join(qnames)]))
    # remove temporary files
    for path in (qname_sorted_bam_file, tmp_cluster_file, tmp_sorted_cluster_file):
        if os.path.exists(path):
            os.remove(path)
    return config.JOB_SUCCESS

Beispiel #8

0

Datei anzeigen

Datei: nominate_spanning_reads.py Projekt: genome-vendor/chimerascan

def nominate_single_mapped_spanning_reads(chimera_file, single_mapped_bam_file, single_mapped_fastq_file, tmp_dir):
    """Write a FASTQ file of mate sequences that may span chimera breakpoints.

    Reads yielded by parse_sync_chimera_with_bam for the 5' orientation
    (against `chimera_file`) and for the 3' orientation (against a copy of
    `chimera_file` sorted on Chimera.TX_NAME_3P_FIELD) are recorded as
    (read name, read number, R2 tag, Q2 tag) lines. Those lines are sorted
    by (read name, read number) and every distinct pair is written exactly
    once to `single_mapped_fastq_file` through to_fastq.

    Intermediate files are created in `tmp_dir` and are not removed.
    Returns config.JOB_SUCCESS.
    """
    def write_mate_records(outfh, chimera_path, orientation):
        for _, reads in parse_sync_chimera_with_bam(chimera_path, single_mapped_bam_file, orientation):
            # TODO: test more specifically that read has a chance to cross breakpoint
            for r in reads:
                # reverse read number (presumably the mate's 0-based read
                # number, since R2/Q2 hold mate data -- TODO confirm)
                readnum = 1 if r.is_read1 else 0
                print >> outfh, "\t".join(map(str, [r.qname, readnum, r.opt("R2"), r.opt("Q2")]))

    def sort_by_3p_partner(line):
        fields = line.strip().split("\t", Chimera.TX_NAME_3P_FIELD + 1)
        return fields[Chimera.TX_NAME_3P_FIELD]

    def sort_by_qname(line):
        fields = line.strip().split("\t")
        return (fields[0], int(fields[1]))

    # find sequences that could cross a breakpoint
    tmp_seqs_to_remap = os.path.join(tmp_dir, "tmp_singlemap_seqs.txt")
    tmp_chimera_file_sorted_3p = os.path.join(tmp_dir, "tmp_chimeras.sorted3p.bedpe")
    with open(tmp_seqs_to_remap, "w") as f:
        # search for matches to 5' chimeras
        logging.debug("Matching single-mapped frags to 5' chimeras")
        write_mate_records(f, chimera_file, OrientationTags.FIVEPRIME)
        # sort chimeras by 3' partner
        logging.debug("Sorting chimeras by 3' transcript")
        batch_sort(
            input=chimera_file,
            output=tmp_chimera_file_sorted_3p,
            key=sort_by_3p_partner,
            buffer_size=32000,
            tempdirs=[tmp_dir],
        )
        # search for matches to 3' chimeras
        logging.debug("Matching single-mapped frags to 3' chimeras")
        write_mate_records(f, tmp_chimera_file_sorted_3p, OrientationTags.THREEPRIME)
    #
    # now sort the file of sequences by read name/number to
    # eliminate duplicates
    #
    tmp_sorted_seqs_to_remap = os.path.join(tmp_dir, "tmp_singlemap_seqs.sorted.txt")
    batch_sort(
        input=tmp_seqs_to_remap,
        output=tmp_sorted_seqs_to_remap,
        key=sort_by_qname,
        buffer_size=32000,
        tempdirs=[tmp_dir],
    )
    #
    # read file and write fastq, ignoring duplicates
    #
    with open(single_mapped_fastq_file, "w") as fqfh:
        with open(tmp_sorted_seqs_to_remap) as seqfh:
            prev = None
            for line in seqfh:
                fields = line.strip().split("\t")
                qname, readnum, seq, qual = fields[0], int(fields[1]), fields[2], fields[3]
                cur = (qname, readnum)
                if cur == prev:
                    # duplicates are adjacent after sorting; skip them
                    continue
                # write each record the first time its key is seen, so the
                # first record is kept and the last is not emitted twice
                print >> fqfh, to_fastq(qname, readnum, seq, qual)
                prev = cur
    # TODO: remove temporary files
    # os.remove(tmp_chimera_file_sorted_3p)
    # os.remove(tmp_seqs_to_remap)
    # os.remove(tmp_sorted_seqs_to_remap)
    return config.JOB_SUCCESS

Beispiel #9

0

Datei anzeigen

Datei: pair_clusters.py Projekt: marcopavoni/chimerascan

def pair_discordant_clusters(discordant_bam_file, cluster_pair_file, tmp_dir):
    """Pair 5' and 3' discordant clusters that share reads.

    The cluster-annotated `discordant_bam_file` is sorted by read name
    (pysam.sort) into `tmp_dir`. For each paired-end read group returned
    by parse_pe_reads, reads are split on ORIENTATION_TAG and every
    5'/3' read combination is written as (5' cluster id, 3' cluster id,
    read name). These lines are sorted by the two ids with batch_sort,
    grouped by parse_and_group_cluster_pairs and written to
    `cluster_pair_file` as tab-separated lines: sequential pair id,
    5' id, 3' id and comma-joined read names.

    Temporary files are deleted at the end. Returns config.JOB_SUCCESS.
    """
    #
    # sort the BAM file that has cluster annotations by read name
    #
    logging.debug("Sorting newly annotated discordant BAM file by read name")
    # Join only the file's base name onto tmp_dir. Joining the full path
    # would discard tmp_dir when the path is absolute, or point into a
    # sub-directory that may not exist when it is relative.
    bam_root = os.path.splitext(os.path.basename(discordant_bam_file))[0]
    qname_sorted_bam_prefix = os.path.join(tmp_dir, bam_root + ".byname")
    qname_sorted_bam_file = qname_sorted_bam_prefix + ".bam"
    pysam.sort("-n", "-m", str(int(1e9)), discordant_bam_file,
               qname_sorted_bam_prefix)
    #
    # iterate through named-sorted bam file write cluster pairs
    #
    logging.debug("Enumerating cluster pairs")
    tmp_cluster_file = os.path.join(tmp_dir, "tmp_clusters.txt")
    with open(tmp_cluster_file, 'w') as tmp_cluster_fh:
        bamfh = pysam.Samfile(qname_sorted_bam_file, "rb")
        try:
            for pe_reads in parse_pe_reads(bamfh):
                # group into 5' and 3' reads
                reads5p = []
                reads3p = []
                for reads in pe_reads:
                    for r in reads:
                        if r.opt(ORIENTATION_TAG) == ORIENTATION_5P:
                            reads5p.append(r)
                        else:
                            reads3p.append(r)
                # iterate through possible pairs
                for r5p in reads5p:
                    for r3p in reads3p:
                        id5p = r5p.opt(DISCORDANT_CLUSTER_TAG)
                        id3p = r3p.opt(DISCORDANT_CLUSTER_TAG)
                        print >> tmp_cluster_fh, '\t'.join(
                            map(str, (id5p, id3p, r5p.qname)))
        finally:
            # close the BAM handle even if enumeration fails
            bamfh.close()
    #
    # sort cluster pairs
    #
    logging.debug("Sorting cluster pairs")
    tmp_sorted_cluster_file = os.path.join(tmp_dir, "tmp_clusters.srt.txt")

    def sortfunc(line):
        fields = line.strip().split('\t')
        return (fields[0], fields[1])

    batch_sort(input=tmp_cluster_file,
               output=tmp_sorted_cluster_file,
               key=sortfunc,
               buffer_size=32000,
               tempdirs=[tmp_dir])
    #
    # write cluster pairs
    #
    logging.debug("Grouping cluster pairs")
    with open(cluster_pair_file, "w") as outfh:
        with open(tmp_sorted_cluster_file) as sortedfh:
            pair_id = 0
            for id5p, id3p, qnames in parse_and_group_cluster_pairs(sortedfh):
                print >> outfh, '\t'.join(
                    map(str, [pair_id, id5p, id3p, ','.join(qnames)]))
                pair_id += 1
    # remove temporary files
    for tmp_path in (qname_sorted_bam_file, tmp_cluster_file,
                     tmp_sorted_cluster_file):
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
    return config.JOB_SUCCESS

Beispiel #10

0

Datei anzeigen

def resolve_discordant_reads(input_file, output_file, isize_dist,
                             min_isize_prob, tmp_dir):
    """Rewrite chimeras keeping only each read's best-scoring alignments.

    Pipeline (intermediate files live in `tmp_dir` and are not removed):

    1. make_discordant_read_stats_file writes per-read stats for
       `input_file` using `isize_dist`;
    2. the stats are sorted by read name and, per read, only the entries
       with the greatest `score_tuple` are kept;
    3. the kept stats are re-sorted by chimera name and `input_file` is
       sorted by chimera name;
    4. chimeras and stats are walked in sync; each chimera's
       `encomp_frags` is reduced to the pairs whose
       (qname, tid5p, pos5p, tid3p, pos3p) appears among stats with
       `isize_prob` not below `min_isize_prob`, its `score` is reset to
       get_num_frags(), and the chimera is written to `output_file`.
    """
    #
    # parse chimeras and output reads to a file
    #
    logging.debug("Getting discordant read information")
    read_stats_file = os.path.join(tmp_dir, "read_stats.txt")
    make_discordant_read_stats_file(input_file, read_stats_file, isize_dist)
    #
    # now sort the read/chimera stats list
    #
    logging.debug("Sorting reads by read name")
    sorted_read_stats_file = os.path.join(tmp_dir,
                                          "read_stats.rname_sorted.txt")
    sort_read_stats_by_read_name(read_stats_file, sorted_read_stats_file,
                                 tmp_dir)
    #
    # parse reads by read name
    #
    logging.debug("Choosing best read groups")
    resolved_read_stats_file = os.path.join(
        tmp_dir, "read_stats.rname_sorted.resolved.txt")
    # both handles are closed by the with-blocks, including on error
    with open(resolved_read_stats_file, "w") as f:
        with open(sorted_read_stats_file) as statsfh:
            for rname, readstats in group_by_attr(
                    ChimeraStats.parse(statsfh), 'qname'):
                # build a dictionary of stats -> read/chimeras
                stats_dict = collections.defaultdict(list)
                for s in readstats:
                    stats_dict[s.score_tuple].append(s)
                # use only the best key
                for s in stats_dict[max(stats_dict)]:
                    # output read -> chimera relationships
                    print >> f, '\t'.join(map(str, s.to_list()))
    #
    # re-sort by chimera name
    #
    logging.debug("Resorting reads by chimera name")

    def sort_reads_by_chimera_name(line):
        return line.strip().split('\t', CHIMERA_NAME_COL + 1)[CHIMERA_NAME_COL]

    sorted_resolved_read_stats_file = os.path.join(
        tmp_dir, "read_stats.chimera_name_sorted.resolved.txt")
    batch_sort(input=resolved_read_stats_file,
               output=sorted_resolved_read_stats_file,
               key=sort_reads_by_chimera_name,
               buffer_size=32000,
               tempdirs=[tmp_dir])
    logging.debug("Resorting chimeras by name")

    def sort_chimeras_by_name(line):
        return line.strip().split('\t',
                                  Chimera.NAME_FIELD + 1)[Chimera.NAME_FIELD]

    sorted_chimera_file = os.path.join(tmp_dir,
                                       "spanning_chimeras.name_sorted.txt")
    batch_sort(input=input_file,
               output=sorted_chimera_file,
               key=sort_chimeras_by_name,
               buffer_size=32000,
               tempdirs=[tmp_dir])
    #
    # parse and rebuild chimeras based on best reads
    #
    logging.debug("Rewriting chimeras with lists of 'best' reads")
    with open(output_file, "w") as f:
        # need to sync chimeras with stats
        for c, stats in parse_sync_chimeras_read_stats(
                sorted_chimera_file, sorted_resolved_read_stats_file):
            # parse and make lookup set of the resolved alignments
            good_alignments = set()
            for s in stats:
                if s.isize_prob < min_isize_prob:
                    continue
                good_alignments.add(
                    (s.qname, s.tid5p, s.pos5p, s.tid3p, s.pos3p))
            # replace encompassing frags with resolved alignments
            c.encomp_frags = [
                dpair for dpair in c.encomp_frags
                if (dpair[0].qname, dpair[0].tid, dpair[0].pos,
                    dpair[1].tid, dpair[1].pos) in good_alignments
            ]
            c.score = c.get_num_frags()
            print >> f, '\t'.join(map(str, c.to_list()))

Beispiel #11

0

Datei anzeigen

Datei: resolve_discordant_reads.py Projekt: BioXiao/chimerascan

def resolve_discordant_reads(input_file, output_file, isize_dist, min_isize_prob,
                             tmp_dir):
    """Keep only the best-scoring alignments of each read in every chimera.

    Read stats for `input_file` are produced by
    make_discordant_read_stats_file (using `isize_dist`), sorted by read
    name, and reduced per read to the entries with the greatest
    `score_tuple`. The reduced stats and the chimeras are then both sorted
    by chimera name and traversed together: a chimera's `encomp_frags`
    keeps only pairs whose (qname, tid5p, pos5p, tid3p, pos3p) matches a
    stat with `isize_prob` not below `min_isize_prob`; `score` is set to
    get_num_frags() and the chimera is written to `output_file`.

    All intermediate files are created in `tmp_dir` and left in place.
    """
    #
    # parse chimeras and output reads to a file
    #
    logging.debug("Getting discordant read information")
    read_stats_file = os.path.join(tmp_dir, "read_stats.txt")
    make_discordant_read_stats_file(input_file, read_stats_file, isize_dist)
    #
    # now sort the read/chimera stats list
    #
    logging.debug("Sorting reads by read name")
    sorted_read_stats_file = os.path.join(tmp_dir, "read_stats.rname_sorted.txt")
    sort_read_stats_by_read_name(read_stats_file, sorted_read_stats_file, tmp_dir)
    #
    # parse reads by read name
    #
    logging.debug("Choosing best read groups")
    resolved_read_stats_file = os.path.join(tmp_dir, "read_stats.rname_sorted.resolved.txt")
    f = open(resolved_read_stats_file, "w")
    try:
        statsfh = open(sorted_read_stats_file)
        try:
            for rname,readstats in group_by_attr(ChimeraStats.parse(statsfh), 'qname'):
                # build a dictionary of stats -> read/chimeras
                stats_dict = collections.defaultdict(list)
                for s in readstats:
                    # add key/value pairs
                    stats_dict[s.score_tuple].append(s)
                # use only the best key
                best_key = max(stats_dict)
                for s in stats_dict[best_key]:
                    # output read -> chimera relationships
                    print >>f, '\t'.join(map(str, s.to_list()))
        finally:
            # the input handle used to be left for the garbage collector
            statsfh.close()
    finally:
        f.close()
    #
    # re-sort by chimera name
    #
    logging.debug("Resorting reads by chimera name")
    def sort_reads_by_chimera_name(line):
        return line.strip().split('\t',CHIMERA_NAME_COL+1)[CHIMERA_NAME_COL]
    sorted_resolved_read_stats_file = os.path.join(tmp_dir, "read_stats.chimera_name_sorted.resolved.txt")
    batch_sort(input=resolved_read_stats_file,
               output=sorted_resolved_read_stats_file,
               key=sort_reads_by_chimera_name,
               buffer_size=32000,
               tempdirs=[tmp_dir])
    logging.debug("Resorting chimeras by name")
    def sort_chimeras_by_name(line):
        return line.strip().split('\t',Chimera.NAME_FIELD+1)[Chimera.NAME_FIELD]
    sorted_chimera_file = os.path.join(tmp_dir, "spanning_chimeras.name_sorted.txt")
    batch_sort(input=input_file,
               output=sorted_chimera_file,
               key=sort_chimeras_by_name,
               buffer_size=32000,
               tempdirs=[tmp_dir])
    #
    # parse and rebuild chimeras based on best reads
    # 
    logging.debug("Rewriting chimeras with lists of 'best' reads")
    f = open(output_file, "w")
    try:
        # need to sync chimeras with stats
        for c,stats in parse_sync_chimeras_read_stats(sorted_chimera_file, sorted_resolved_read_stats_file):
            # parse and make lookup set of the resolved alignments
            good_alignments = set()
            for s in stats:
                if s.isize_prob < min_isize_prob:
                    continue
                good_alignments.add((s.qname, s.tid5p, s.pos5p, s.tid3p, s.pos3p))
            # replace encompassing frags with resolved alignments
            new_encomp_frags = []
            for dpair in c.encomp_frags:
                # get alignment tuple
                aln = (dpair[0].qname, dpair[0].tid, dpair[0].pos, dpair[1].tid, dpair[1].pos)
                if aln in good_alignments:
                    new_encomp_frags.append(dpair)
            c.encomp_frags = new_encomp_frags
            c.score = c.get_num_frags()
            print >>f, '\t'.join(map(str, c.to_list()))
    finally:
        f.close()

Beispiel #12

0

Datei anzeigen

def nominate_single_mapped_spanning_reads(chimera_file, single_mapped_bam_file,
                                          single_mapped_fastq_file, tmp_dir):
    """Collect candidate breakpoint-spanning mate sequences into a FASTQ file.

    For both orientations (5' against `chimera_file`, 3' against a copy
    sorted on Chimera.TX_NAME_3P_FIELD), every read yielded by
    parse_sync_chimera_with_bam contributes one line holding its read
    name, a read number and its R2/Q2 tag values. The lines are sorted by
    (read name, read number); each distinct pair is then written once to
    `single_mapped_fastq_file` via to_fastq.

    Temporary files are written to `tmp_dir` and are not deleted.
    Returns config.JOB_SUCCESS.
    """
    def sort_by_3p_partner(line):
        fields = line.strip().split('\t', Chimera.TX_NAME_3P_FIELD + 1)
        return fields[Chimera.TX_NAME_3P_FIELD]

    def sort_by_qname(line):
        fields = line.strip().split('\t')
        return (fields[0], int(fields[1]))

    def dump_reads(outfh, chimera_path, orientation):
        # shared body of the 5' and 3' matching passes
        for _, reads in parse_sync_chimera_with_bam(
                chimera_path, single_mapped_bam_file, orientation):
            # TODO: test more specifically that read has a chance to cross breakpoint
            for r in reads:
                # reverse read number (presumably the mate's 0-based read
                # number, as R2/Q2 carry mate data -- TODO confirm)
                readnum = 1 if r.is_read1 else 0
                print >> outfh, '\t'.join(
                    map(str,
                        [r.qname, readnum,
                         r.opt("R2"), r.opt("Q2")]))

    # find sequences that could cross a breakpoint
    tmp_seqs_to_remap = os.path.join(tmp_dir, "tmp_singlemap_seqs.txt")
    tmp_chimera_file_sorted_3p = os.path.join(tmp_dir,
                                              "tmp_chimeras.sorted3p.bedpe")
    with open(tmp_seqs_to_remap, "w") as f:
        # search for matches to 5' chimeras
        logging.debug("Matching single-mapped frags to 5' chimeras")
        dump_reads(f, chimera_file, OrientationTags.FIVEPRIME)
        # sort chimeras by 3' partner
        logging.debug("Sorting chimeras by 3' transcript")
        batch_sort(input=chimera_file,
                   output=tmp_chimera_file_sorted_3p,
                   key=sort_by_3p_partner,
                   buffer_size=32000,
                   tempdirs=[tmp_dir])
        # search for matches to 3' chimeras
        logging.debug("Matching single-mapped frags to 3' chimeras")
        dump_reads(f, tmp_chimera_file_sorted_3p, OrientationTags.THREEPRIME)

    #
    # now sort the file of sequences by read name/number to
    # eliminate duplicates
    #
    tmp_sorted_seqs_to_remap = os.path.join(tmp_dir,
                                            "tmp_singlemap_seqs.sorted.txt")
    batch_sort(input=tmp_seqs_to_remap,
               output=tmp_sorted_seqs_to_remap,
               key=sort_by_qname,
               buffer_size=32000,
               tempdirs=[tmp_dir])
    #
    # read file and write fastq, ignoring duplicates
    #
    with open(single_mapped_fastq_file, "w") as fqfh:
        with open(tmp_sorted_seqs_to_remap) as seqfh:
            prev = None
            for line in seqfh:
                fields = line.strip().split('\t')
                cur = (fields[0], int(fields[1]))
                if cur != prev:
                    # Emit a record when its key first appears. Deferring
                    # the write to the next key change would drop the
                    # first record and repeat the last one.
                    print >> fqfh, to_fastq(cur[0], cur[1],
                                            fields[2], fields[3])
                    prev = cur
    # TODO: remove temporary files
    #os.remove(tmp_chimera_file_sorted_3p)
    #os.remove(tmp_seqs_to_remap)
    #os.remove(tmp_sorted_seqs_to_remap)
    return config.JOB_SUCCESS