def filter_spanning_reads(chimeras, reads,
                          anchor_min,
                          anchor_length,
                          anchor_mismatches,
                          library_type):
    """Yield ``(chimera, discordant_read)`` pairs for reads spanning a breakpoint.

    Unmapped reads are skipped.  Every mapped read gets the ``HI``/``IH``/``NH``
    tags plus the discordant-type and orientation tags appended to its tag
    list, and is wrapped in a ``DiscordantRead`` flagged ``is_spanning``.  The
    read is then tested against every chimera with
    ``check_breakpoint_alignment``; each passing chimera produces one pair.

    Note that ``chimeras`` is iterated once per mapped read, so it has to be a
    re-iterable collection (not a one-shot iterator), and that the same
    ``DiscordantRead`` object is yielded for every chimera the read matches.

    :param chimeras: collection of chimera candidates to test each read against
    :param reads: iterable of alignment records (mutated: tags are appended)
    :param anchor_min: forwarded to ``check_breakpoint_alignment``
    :param anchor_length: forwarded to ``check_breakpoint_alignment``
    :param anchor_mismatches: forwarded to ``check_breakpoint_alignment``
    :param library_type: forwarded to ``get_gene_orientation``
    """
    for read in reads:
        if read.is_unmapped:
            continue
        # make a discordant read object
        # TODO: need to annotate reads elsewhere since they have already
        # been sorted here
        existing_tags = read.tags
        read.tags = existing_tags + [
            ("HI", 0),
            ("IH", 1),
            ("NH", 1),
            (DISCORDANT_TAG_NAME, DiscordantTags.DISCORDANT_GENE),
            (ORIENTATION_TAG_NAME, get_gene_orientation(read, library_type)),
        ]
        spanning_read = DiscordantRead.from_read(read)
        spanning_read.is_spanning = True
        # check read alignment against chimeras
        for chimera in chimeras:
            if not check_breakpoint_alignment(chimera, read,
                                              anchor_min,
                                              anchor_length,
                                              anchor_mismatches):
                continue
            # valid spanning read
            yield chimera, spanning_read
def _annotate_single_mapped_reads(bamfh, singlemap_bamfh, fqfh, library_type):
    """Route each read pair in `bamfh` to the FASTQ or the single-mapped BAM.

    Pairs where both mates are unmapped are written to `fqfh` as FASTQ
    records.  For all other pairs, every gene-reference alignment of the
    mapped mate is written to `singlemap_bamfh`, tagged with the other mate's
    sequence (``R2``), quality (``Q2``) and the gene orientation.
    """
    # get list of 'gene' references in bam file to compare with
    gene_tids = set(tid for tid, refname in enumerate(bamfh.references)
                    if refname.startswith(config.GENE_REF_PREFIX))
    for pe_reads in parse_pe_reads(bamfh):
        # find which of the original reads was unmapped
        r1_unmapped = any(r.is_unmapped for r in pe_reads[0])
        r2_unmapped = any(r.is_unmapped for r in pe_reads[1])
        # if both reads unmapped, then remap to breakpoints
        if r1_unmapped and r2_unmapped:
            for readnum in (0, 1):
                first = pe_reads[readnum][0]
                print >> fqfh, to_fastq(first.qname, readnum, first.seq, first.qual)
            continue
        # annotate the mapped reads with the seq/qual of the unmapped reads
        # NOTE(review): when neither mate is unmapped this treats read 1 as
        # the mapped mate and read 0 as the "unmapped" one; presumably the
        # input only holds pairs with at least one unmapped mate -- confirm
        # against the caller.
        mapped_readnum = 0 if r2_unmapped else 1
        unmapped_readnum = 1 if r2_unmapped else 0
        unmapped_seq = pe_reads[unmapped_readnum][0].seq
        unmapped_qual = pe_reads[unmapped_readnum][0].qual
        for r in pe_reads[mapped_readnum]:
            # only consider gene mappings
            if r.rname not in gene_tids:
                continue
            orientation = get_gene_orientation(r, library_type)
            # TODO: may need to REVERSE read here to get original
            r.tags = r.tags + [("R2", unmapped_seq), ("Q2", unmapped_qual), (ORIENTATION_TAG_NAME, orientation)]
            singlemap_bamfh.write(r)


def extract_single_mapped_reads(
    chimera_file, unmapped_bam_file, single_mapped_bam_file, unmapped_fastq_file, library_type, tmp_dir
):
    """Find reads that must be remapped to test whether they span a breakpoint.

    Reads `unmapped_bam_file` pair by pair.  Pairs with both mates unmapped
    are dumped to `unmapped_fastq_file`; for the remaining pairs the
    gene-mapped alignments are annotated with the other mate's sequence and
    quality and collected in a temporary BAM inside `tmp_dir`, which is then
    sorted, indexed and deleted.

    :param chimera_file: not used by this function (kept for the interface)
    :param unmapped_bam_file: path of the input BAM file
    :param single_mapped_bam_file: path where the sorted, indexed BAM of
        annotated single-mapped reads is expected
    :param unmapped_fastq_file: path of the FASTQ output (overwritten)
    :param library_type: forwarded to ``get_gene_orientation``
    :param tmp_dir: directory for the intermediate unsorted BAM
    :returns: ``config.JOB_SUCCESS``
    """
    unsorted_single_mapped_bam_file = os.path.join(tmp_dir, "unsorted_single_mapped_reads.bam")
    # All three handles are released even if parsing/writing fails part-way;
    # explicit try/finally is used instead of `with` so this also works with
    # pysam handle objects that are not context managers.
    fqfh = open(unmapped_fastq_file, "w")
    try:
        bamfh = pysam.Samfile(unmapped_bam_file, "rb")
        try:
            singlemap_bamfh = pysam.Samfile(unsorted_single_mapped_bam_file, "wb", template=bamfh)
            try:
                _annotate_single_mapped_reads(bamfh, singlemap_bamfh, fqfh, library_type)
            finally:
                singlemap_bamfh.close()
        finally:
            bamfh.close()
    finally:
        fqfh.close()
    # sort/index the annotated single-mapper unmapped reads by reference/position
    logging.debug("Sorting single-mapped mates by reference")
    # NOTE(review): the sort is given the output path minus its extension as a
    # prefix, so indexing `single_mapped_bam_file` afterwards presumably relies
    # on the sorter appending ".bam" -- confirm against the installed
    # pysam/samtools version.
    single_mapped_bam_prefix = os.path.splitext(single_mapped_bam_file)[0]
    pysam.sort("-m", str(int(1e9)), unsorted_single_mapped_bam_file, single_mapped_bam_prefix)
    pysam.index(single_mapped_bam_file)
    # remove unsorted file
    if os.path.exists(unsorted_single_mapped_bam_file):
        os.remove(unsorted_single_mapped_bam_file)
    return config.JOB_SUCCESS