コード例 #1
0
def _unmap_multihit_read(read, mate_filtered):
    """
    Return a copy of `read` flagged as unmapped.

    read: alignment to copy (via copy_read); the original is not modified
        by this helper
    mate_filtered: when true, the mate fields of the copy are also reset
        to the unmapped state
    """
    r = copy_read(read)
    r.is_unmapped = True
    r.is_proper_pair = False
    r.is_secondary = False
    r.rname = -1
    r.pos = 0
    if mate_filtered:
        r.mate_is_unmapped = True
        r.mrnm = -1
        r.mpos = 0
    return r


def filter_multihits(transcript_file,
                     input_bam_file,
                     output_bam_file,
                     max_multihits=1):
    """
    Annotate the paired-end reads of a BAM file with multihit information
    and write them to a new BAM file, replacing the alignments of any mate
    that has more than `max_multihits` hits with one unmapped record.

    transcript_file: path of the file parsed with TranscriptFeature.parse
    input_bam_file: path of the alignment file opened in "rb" mode
    output_bam_file: path of the alignment file written in "wb" mode, using
        the input file as header template
    max_multihits: a mate whose hit count (as returned by
        annotate_multihits) exceeds this value is written as unmapped

    Returns config.JOB_SUCCESS.
    """
    logging.debug("Reading transcript features")
    # close the transcript file as soon as it has been parsed
    with open(transcript_file) as transcript_fh:
        transcripts = list(TranscriptFeature.parse(transcript_fh))
    inbamfh = pysam.Samfile(input_bam_file, "rb")
    try:
        outbamfh = pysam.Samfile(output_bam_file, "wb", template=inbamfh)
        try:
            # build a transcript to genome coordinate map
            tid_tx_genome_map = build_tid_transcript_genome_map(outbamfh,
                                                                transcripts)
            num_frags = 0
            logging.debug("Annotating and filtering multihits")
            for pe_reads in parse_pe_reads(inbamfh):
                # annotate every mate first so that each mate knows
                # whether its partner is going to be filtered
                mate_num_hits = [annotate_multihits(reads, tid_tx_genome_map)
                                 for reads in pe_reads]
                filtered = [n > max_multihits for n in mate_num_hits]
                # NOTE(review): when only one mate is filtered, the mate
                # fields of the surviving mate's alignments are left as
                # they were -- confirm whether they should be reset too
                new_pe_reads = [[], []]
                for mate in (0, 1):
                    if filtered[mate]:
                        new_pe_reads[mate] = [
                            _unmap_multihit_read(pe_reads[mate][0],
                                                 filtered[1 - mate])]
                    else:
                        new_pe_reads[mate] = pe_reads[mate]
                # write the filtered reads (previously the unfiltered
                # pe_reads were written, so filtering had no effect)
                for reads in new_pe_reads:
                    for r in reads:
                        outbamfh.write(r)
                num_frags += 1
            logging.debug("Found %d fragments" % (num_frags))
        finally:
            outbamfh.close()
    finally:
        inbamfh.close()
    return config.JOB_SUCCESS
コード例 #2
0
def _unmap_multihit_read(read, mate_filtered):
    """
    Return a copy of `read` flagged as unmapped.

    read: alignment to copy (via copy_read); the original is not modified
        by this helper
    mate_filtered: when true, the mate fields of the copy are also reset
        to the unmapped state
    """
    r = copy_read(read)
    r.is_unmapped = True
    r.is_proper_pair = False
    r.is_secondary = False
    r.rname = -1
    r.pos = 0
    if mate_filtered:
        r.mate_is_unmapped = True
        r.mrnm = -1
        r.mpos = 0
    return r


def filter_multihits(transcript_file, input_bam_file, output_bam_file,
                     max_multihits=1):
    """
    Annotate the paired-end reads of a BAM file with multihit information
    and write them to a new BAM file, replacing the alignments of any mate
    that has more than `max_multihits` hits with one unmapped record.

    transcript_file: path of the file parsed with TranscriptFeature.parse
    input_bam_file: path of the alignment file opened in "rb" mode
    output_bam_file: path of the alignment file written in "wb" mode, using
        the input file as header template
    max_multihits: a mate whose hit count (as returned by
        annotate_multihits) exceeds this value is written as unmapped

    Returns config.JOB_SUCCESS.
    """
    logging.debug("Reading transcript features")
    # close the transcript file as soon as it has been parsed
    with open(transcript_file) as transcript_fh:
        transcripts = list(TranscriptFeature.parse(transcript_fh))
    inbamfh = pysam.Samfile(input_bam_file, "rb")
    try:
        outbamfh = pysam.Samfile(output_bam_file, "wb", template=inbamfh)
        try:
            # build a transcript to genome coordinate map
            tid_tx_genome_map = build_tid_transcript_genome_map(outbamfh,
                                                                transcripts)
            num_frags = 0
            logging.debug("Annotating and filtering multihits")
            for pe_reads in parse_pe_reads(inbamfh):
                # annotate every mate first so that each mate knows
                # whether its partner is going to be filtered
                mate_num_hits = [annotate_multihits(reads, tid_tx_genome_map)
                                 for reads in pe_reads]
                filtered = [n > max_multihits for n in mate_num_hits]
                # NOTE(review): when only one mate is filtered, the mate
                # fields of the surviving mate's alignments are left as
                # they were -- confirm whether they should be reset too
                new_pe_reads = [[], []]
                for mate in (0, 1):
                    if filtered[mate]:
                        new_pe_reads[mate] = [
                            _unmap_multihit_read(pe_reads[mate][0],
                                                 filtered[1 - mate])]
                    else:
                        new_pe_reads[mate] = pe_reads[mate]
                # write the filtered reads (previously the unfiltered
                # pe_reads were written, so filtering had no effect)
                for reads in new_pe_reads:
                    for r in reads:
                        outbamfh.write(r)
                num_frags += 1
            logging.debug("Found %d fragments" % (num_frags))
        finally:
            outbamfh.close()
    finally:
        inbamfh.close()
    return config.JOB_SUCCESS
コード例 #3
0
def find_discordant_pairs(pe_reads, library_type):
    """
    Build candidate discordant pairs from the read1 and read2 alignments.

    pe_reads: two-element sequence holding the read1 alignments and the
        read2 alignments
    library_type: passed through to classify_unpaired_reads

    Every 5' alignment of one mate is combined with every 3' alignment of
    the other mate. Both alignments are copied and the copies are handed
    to pair_reads(). Returns the list of (read1, read2) copies.
    """
    # split each mate's alignments into 5' and 3' gene hits
    mate1_5p, mate1_3p = classify_unpaired_reads(pe_reads[0], library_type)
    mate2_5p, mate2_3p = classify_unpaired_reads(pe_reads[1], library_type)
    # enumerate the 5'/3' combinations in both orientations
    candidates = [(a, b) for a in mate1_5p for b in mate2_3p]
    candidates.extend((a, b) for a in mate1_3p for b in mate2_5p)
    paired = []
    for first, second in candidates:
        first_copy = copy_read(first)
        second_copy = copy_read(second)
        pair_reads(first_copy, second_copy)
        paired.append((first_copy, second_copy))
    return paired
コード例 #4
0
def convert_read(r, transcript_tid_map, library_type):
    """
    Return a new alignment for `r` with its transcript alignment converted
    to the reference described by transcript_tid_map[r.tid].

    r: the alignment to convert; unmapped reads are returned as a copy
    transcript_tid_map: indexable by r.tid, yielding a
        (genome_tid, negstrand, exons) tuple
    library_type: passed to get_read_strand to compute the XS tag

    The XS and NH tags of the input are dropped and a new XS tag is
    appended. When `negstrand` is true the sequence is reverse
    complemented, the qualities are reversed, the is_reverse flag is
    flipped and the MD tag (if any) is reverse complemented.
    """
    if r.is_unmapped:
        # nothing to convert
        return copy_read(r)
    # ordered copy of the tags without the entries that get recomputed
    tags = collections.OrderedDict(r.tags)
    for stale in ('XS', 'NH'):
        tags.pop(stale, None)
    # look up the target reference for this transcript
    genome_tid, negstrand, exons = transcript_tid_map[r.tid]
    start, exon_index, exon_tstart, exon_offset = \
        convert_pos(r.pos, negstrand, exons)
    # the third return value of convert_cigar is not needed here
    genome_cigar, aligned_len, _ = convert_cigar(
        r.cigar, negstrand, exons, exon_index, exon_tstart, exon_offset)
    if not negstrand:
        reverse_flag = r.is_reverse
        bases = r.seq
        quals = r.qual
    else:
        # move the position to the left end of the alignment
        start = start - aligned_len + 1
        reverse_flag = (not r.is_reverse)
        bases = DNA_reverse_complement(r.seq)
        quals = r.qual
        if quals is not None:
            quals = quals[::-1]
        if 'MD' in tags:
            tags['MD'] = reverse_complement_MD_tag(tags['MD'])
    # recompute the strand tag
    tags['XS'] = get_read_strand(r.is_read2, reverse_flag,
                                 negstrand, library_type)
    # assemble the converted alignment (assignment order kept as-is)
    a = pysam.AlignedRead()
    a.qname = r.qname
    a.flag = r.flag
    a.seq = bases
    a.qual = quals
    a.is_reverse = reverse_flag
    a.tid = genome_tid
    a.pos = start
    a.cigar = genome_cigar
    a.mapq = r.mapq
    a.rnext = r.rnext
    a.pnext = r.pnext
    a.tlen = r.tlen
    a.tags = tuple(tags.iteritems())
    return a
コード例 #5
0
def convert_read(r, transcript_tid_map, library_type):
    """
    Return a new alignment for `r` with its transcript alignment converted
    to the reference described by transcript_tid_map[r.tid].

    r: the alignment to convert; unmapped reads are returned as a copy
    transcript_tid_map: indexable by r.tid, yielding a
        (genome_tid, negstrand, exons) tuple
    library_type: passed to get_read_strand to compute the XS tag

    The XS and NH tags of the input are dropped and a new XS tag is
    appended. When `negstrand` is true the sequence is reverse
    complemented, the qualities are reversed, the is_reverse flag is
    flipped and the MD tag (if any) is reverse complemented.
    """
    if r.is_unmapped:
        # nothing to convert
        return copy_read(r)
    # ordered copy of the tags without the entries that get recomputed
    tags = collections.OrderedDict(r.tags)
    tags.pop('XS', None)
    tags.pop('NH', None)
    # look up the target reference for this transcript
    genome_tid, negstrand, exons = transcript_tid_map[r.tid]
    start, exon_index, exon_tstart, exon_offset = \
        convert_pos(r.pos, negstrand, exons)
    # the third return value of convert_cigar is not needed here
    genome_cigar, aligned_len, _ = convert_cigar(
        r.cigar, negstrand, exons, exon_index, exon_tstart, exon_offset)
    if not negstrand:
        reverse_flag = r.is_reverse
        bases = r.seq
        quals = r.qual
    else:
        # move the position to the left end of the alignment
        start = start - aligned_len + 1
        reverse_flag = (not r.is_reverse)
        bases = DNA_reverse_complement(r.seq)
        quals = r.qual
        if quals is not None:
            quals = quals[::-1]
        if 'MD' in tags:
            tags['MD'] = reverse_complement_MD_tag(tags['MD'])
    # recompute the strand tag
    tags['XS'] = get_read_strand(r.is_read2, reverse_flag,
                                 negstrand, library_type)
    # assemble the converted alignment (assignment order kept as-is)
    a = pysam.AlignedRead()
    a.qname = r.qname
    a.flag = r.flag
    a.seq = bases
    a.qual = quals
    a.is_reverse = reverse_flag
    a.tid = genome_tid
    a.pos = start
    a.cigar = genome_cigar
    a.mapq = r.mapq
    a.rnext = r.rnext
    a.pnext = r.pnext
    a.tlen = r.tlen
    a.tags = tuple(tags.iteritems())
    return a
コード例 #6
0
def find_discordant_pairs(pe_reads, library_type):
    """
    Build candidate discordant pairs from the read1 and read2 alignments.

    pe_reads: two-element sequence holding the read1 alignments and the
        read2 alignments
    library_type: passed through to classify_unpaired_reads

    Every 5' alignment of one mate is combined with every 3' alignment of
    the other mate. Both alignments are copied and the copies are handed
    to pair_reads(). Returns the list of (read1, read2) copies.
    """
    # split each mate's alignments into 5' and 3' gene hits
    mate1_5p, mate1_3p = classify_unpaired_reads(pe_reads[0], library_type)
    mate2_5p, mate2_3p = classify_unpaired_reads(pe_reads[1], library_type)
    # enumerate the 5'/3' combinations in both orientations
    candidates = [(a, b) for a in mate1_5p for b in mate2_3p]
    candidates.extend((a, b) for a in mate1_3p for b in mate2_5p)
    paired = []
    for first, second in candidates:
        first_copy = copy_read(first)
        second_copy = copy_read(second)
        pair_reads(first_copy, second_copy)
        paired.append((first_copy, second_copy))
    return paired
コード例 #7
0
def find_discordant_pairs(pe_reads,
                          tid_genome_map,
                          library_type):
    """
    Build candidate discordant pairs from the read1 and read2 alignments.

    pe_reads: two-element sequence holding the read1 alignments and the
        read2 alignments
    tid_genome_map, library_type: passed through to
        classify_unpaired_reads

    Returns a 3-tuple of lists of (read1, read2) copies:
    1) 5'/3' gene alignment pairs
    2) genome/genome alignment pairs
    3) gene/genome pairs, only attempted when the first two lists are
       both empty and only for combinations accepted by cmp_orientation
    """
    def _pair_copies(first, second):
        # copy both alignments and hand the copies to pair_reads
        first_copy = copy_read(first)
        second_copy = copy_read(second)
        pair_reads(first_copy, second_copy)
        return (first_copy, second_copy)

    # split each mate's alignments into 5' gene, 3' gene and genome hits
    mate1_5p, mate1_3p, mate1_genome = \
        classify_unpaired_reads(pe_reads[0], tid_genome_map, library_type)
    mate2_5p, mate2_3p, mate2_genome = \
        classify_unpaired_reads(pe_reads[1], tid_genome_map, library_type)
    # 5' gene hits against 3' gene hits, in both orientations
    gene_pairs = []
    for lefts, rights in ((mate1_5p, mate2_3p), (mate1_3p, mate2_5p)):
        for left in lefts:
            for right in rights:
                gene_pairs.append(_pair_copies(left, right))
    # genome hits against genome hits
    genome_pairs = []
    for left in mate1_genome:
        for right in mate2_genome:
            genome_pairs.append(_pair_copies(left, right))
    if gene_pairs or genome_pairs:
        return gene_pairs, genome_pairs, []
    # nothing found so far: fall back to mixing gene and genome hits
    mixed_pairs = []
    fallback = ((mate1_5p, mate2_genome),
                (mate1_3p, mate2_genome),
                (mate1_genome, mate2_5p),
                (mate1_genome, mate2_3p))
    for lefts, rights in fallback:
        for left in lefts:
            for right in rights:
                # skip combinations with incompatible orientation tags
                if not cmp_orientation(left.opt(ORIENTATION_TAG_NAME),
                                       right.opt(ORIENTATION_TAG_NAME)):
                    continue
                mixed_pairs.append(_pair_copies(left, right))
    return [], [], mixed_pairs
コード例 #8
0
def classify_read_pairs(pe_reads, max_isize,
                        library_type, tid_genome_map,
                        tid_tx_cluster_map):
    """
    Look at every alignment of one fragment and work out how the read1
    and read2 alignments can be paired.

    Pairs built directly in this function are passed to pair_reads()
    together with a DISCORDANT_TAG_NAME tag holding a DiscordantTags
    value.

    Returns a 3-tuple:
    1) (r1,r2) pairs aligning to genes (pairs may be discordant)
    2) (r1,r2) pairs aligning to the genome (pairs may be discordant)
    3) the unpaired reads, filled only when no pairing could be made
    """
    def _tag_and_pair(read1, read2, tag_value, bucket):
        # copy both alignments, file the copies in `bucket`, then pair
        # the copies with the requested tag value
        mate1 = copy_read(read1)
        mate2 = copy_read(read2)
        bucket.append((mate1, mate2))
        pair_reads(mate1, mate2, [(DISCORDANT_TAG_NAME, tag_value)])

    concordant_tx_pairs, discordant_tx_pairs = [], []
    concordant_gene_pairs, discordant_gene_pairs = [], []
    concordant_genome_pairs, discordant_genome_pairs = [], []
    # the library type decides whether mates must share a strand
    same_strand = LibraryTypes.same_strand(library_type)
    refdict, clusterdict = map_reads_to_references(pe_reads,
                                                   tid_tx_cluster_map)
    #
    # pass 1: pair alignments that hit the same reference
    #
    found_pair = False
    for tid, tid_pe_reads in refdict.iteritems():
        # both mates need at least one alignment on this reference
        if len(tid_pe_reads[0]) == 0 or len(tid_pe_reads[1]) == 0:
            continue
        # references missing from tid_genome_map are handled as genomic
        is_genomic = (tid not in tid_genome_map)
        for r1 in tid_pe_reads[0]:
            for r2 in tid_pe_reads[1]:
                strand_match = (same_strand ==
                                (r1.is_reverse == r2.is_reverse))
                if is_genomic:
                    # genomic hits must also satisfy the insert size limit
                    isize = ((r1.aend - r2.pos) if r1.pos > r2.pos
                             else (r2.aend - r1.pos))
                    if not (isize <= max_isize):
                        continue
                    found_pair = True
                    if strand_match:
                        _tag_and_pair(r1, r2,
                                      DiscordantTags.CONCORDANT_GENOME,
                                      concordant_genome_pairs)
                    else:
                        _tag_and_pair(r1, r2,
                                      DiscordantTags.DISCORDANT_STRAND_GENOME,
                                      discordant_genome_pairs)
                else:
                    # both mates hit the same transcript
                    found_pair = True
                    if strand_match:
                        _tag_and_pair(r1, r2,
                                      DiscordantTags.CONCORDANT_TX,
                                      concordant_tx_pairs)
                    else:
                        # same transcript but wrong relative strand
                        _tag_and_pair(r1, r2,
                                      DiscordantTags.DISCORDANT_STRAND_TX,
                                      discordant_tx_pairs)
    #
    # pass 2: nothing paired yet, so pair within transcript clusters
    #
    if not found_pair:
        for cluster_id, cluster_pe_reads in clusterdict.iteritems():
            if (len(cluster_pe_reads[0]) == 0 or
                    len(cluster_pe_reads[1]) == 0):
                continue
            for r1 in cluster_pe_reads[0]:
                for r2 in cluster_pe_reads[1]:
                    strand_match = (same_strand ==
                                    (r1.is_reverse == r2.is_reverse))
                    if strand_match:
                        _tag_and_pair(r1, r2,
                                      DiscordantTags.CONCORDANT_GENE,
                                      concordant_gene_pairs)
                    else:
                        _tag_and_pair(r1, r2,
                                      DiscordantTags.DISCORDANT_STRAND_GENE,
                                      discordant_gene_pairs)
    # concordant pairs win outright; transcript-level pairs take
    # precedence over cluster-level pairs
    gene_pairs = concordant_tx_pairs or concordant_gene_pairs
    if gene_pairs or concordant_genome_pairs:
        return gene_pairs, concordant_genome_pairs, []
    # otherwise fall back to the strand-discordant pairs that are still
    # colocalized on the same gene/chromosome
    gene_pairs = discordant_tx_pairs or discordant_gene_pairs
    if gene_pairs or discordant_genome_pairs:
        return gene_pairs, discordant_genome_pairs, []
    #
    # no pairing found, so the fragment is treated as discordant.
    #
    # TODO: is it right to keep only the best mismatch stratum of each
    # mate before enumerating the putative discordant pairs?
    #
    best_reads = (select_best_mismatch_strata(pe_reads[0]),
                  select_best_mismatch_strata(pe_reads[1]))
    gene_pairs, genome_pairs, combo_pairs = \
        find_discordant_pairs(best_reads, tid_genome_map, library_type)
    if gene_pairs or genome_pairs:
        return gene_pairs, genome_pairs, []
    if combo_pairs:
        return combo_pairs, [], []
    # last resort: mappings that cannot be explained are handed back
    # unpaired for further investigation
    return [], [], best_reads
コード例 #9
0
def classify_read_pairs(pe_reads, max_isize, library_type, tid_tx_map):
    """
    Look at every alignment of one fragment and work out how the read1
    and read2 alignments can be paired.

    Pairs built directly in this function are passed to pair_reads()
    together with a DISCORDANT_TAG_NAME tag holding a DiscordantTags
    value.

    Returns a 3-tuple:
    1) (r1,r2) pairs found on the same transcript or transcript cluster
       (strand-concordant pairs if any, otherwise strand-discordant ones)
    2) the best scoring pairs produced by find_discordant_pairs
    3) the unpaired reads, filled only when no pairing could be made

    Note: max_isize is accepted but not used by this implementation.
    """
    def _tag_and_pair(read1, read2, tag_value, bucket):
        # copy both alignments, file the copies in `bucket`, then pair
        # the copies with the requested tag value
        mate1 = copy_read(read1)
        mate2 = copy_read(read2)
        bucket.append((mate1, mate2))
        pair_reads(mate1, mate2, [(DISCORDANT_TAG_NAME, tag_value)])

    concordant_tx_pairs, discordant_tx_pairs = [], []
    concordant_cluster_pairs, discordant_cluster_pairs = [], []
    # the library type decides whether mates must share a strand
    same_strand = LibraryTypes.same_strand(library_type)
    refdict, clusterdict = map_reads_to_references(pe_reads, tid_tx_map)
    #
    # pass 1: pair alignments that hit the same transcript
    #
    found_pair = False
    for tid, tid_pe_reads in refdict.iteritems():
        # both mates need at least one alignment on this reference
        if len(tid_pe_reads[0]) == 0 or len(tid_pe_reads[1]) == 0:
            continue
        for r1 in tid_pe_reads[0]:
            for r2 in tid_pe_reads[1]:
                strand_match = (same_strand ==
                                (r1.is_reverse == r2.is_reverse))
                found_pair = True
                if strand_match:
                    _tag_and_pair(r1, r2,
                                  DiscordantTags.CONCORDANT_TX,
                                  concordant_tx_pairs)
                else:
                    # same transcript but wrong relative strand
                    _tag_and_pair(r1, r2,
                                  DiscordantTags.DISCORDANT_STRAND_TX,
                                  discordant_tx_pairs)
    #
    # pass 2: nothing paired yet, so pair within transcript clusters
    #
    if not found_pair:
        for cluster_id, cluster_pe_reads in clusterdict.iteritems():
            if (len(cluster_pe_reads[0]) == 0 or
                    len(cluster_pe_reads[1]) == 0):
                continue
            for r1 in cluster_pe_reads[0]:
                for r2 in cluster_pe_reads[1]:
                    strand_match = (same_strand ==
                                    (r1.is_reverse == r2.is_reverse))
                    if strand_match:
                        _tag_and_pair(r1, r2,
                                      DiscordantTags.CONCORDANT_GENE,
                                      concordant_cluster_pairs)
                    else:
                        _tag_and_pair(r1, r2,
                                      DiscordantTags.DISCORDANT_STRAND_GENE,
                                      discordant_cluster_pairs)
    # concordant pairs win outright; transcript-level pairs take
    # precedence over cluster-level pairs
    gene_pairs = concordant_tx_pairs or concordant_cluster_pairs
    if gene_pairs:
        return gene_pairs, [], []
    # otherwise fall back to strand-discordant pairs that are still
    # colocalized on the same gene/chromosome
    gene_pairs = discordant_tx_pairs or discordant_cluster_pairs
    if gene_pairs:
        return gene_pairs, [], []
    #
    # no pairing found, so the fragment is treated as discordant:
    # enumerate the putative discordant read1/read2 combinations
    #
    pairs = find_discordant_pairs(pe_reads, library_type)
    if pairs:
        # keep only the best scoring candidates
        return [], select_best_scoring_pairs(pairs), []
    #
    # no valid pair at all: the alignments are either artifacts or not
    # supported by the current transcript annotations
    #
    return [], [], pe_reads
コード例 #10
0
def classify_read_pairs(pe_reads, max_isize,
                        library_type,
                        tid_tx_map):
    """
    Look at every alignment of one fragment and work out how the read1
    and read2 alignments can be paired.

    Pairs built directly in this function are passed to pair_reads()
    together with a DISCORDANT_TAG_NAME tag holding a DiscordantTags
    value.

    Returns a 3-tuple:
    1) (r1,r2) pairs found on the same transcript or transcript cluster
       (strand-concordant pairs if any, otherwise strand-discordant ones)
    2) the best scoring pairs produced by find_discordant_pairs
    3) the unpaired reads, filled only when no pairing could be made

    Note: max_isize is accepted but not used by this implementation.
    """
    def _tag_and_pair(read1, read2, tag_value, bucket):
        # copy both alignments, file the copies in `bucket`, then pair
        # the copies with the requested tag value
        mate1 = copy_read(read1)
        mate2 = copy_read(read2)
        bucket.append((mate1, mate2))
        pair_reads(mate1, mate2, [(DISCORDANT_TAG_NAME, tag_value)])

    concordant_tx_pairs, discordant_tx_pairs = [], []
    concordant_cluster_pairs, discordant_cluster_pairs = [], []
    # the library type decides whether mates must share a strand
    same_strand = LibraryTypes.same_strand(library_type)
    refdict, clusterdict = map_reads_to_references(pe_reads, tid_tx_map)
    #
    # pass 1: pair alignments that hit the same transcript
    #
    found_pair = False
    for tid, tid_pe_reads in refdict.iteritems():
        # both mates need at least one alignment on this reference
        if len(tid_pe_reads[0]) == 0 or len(tid_pe_reads[1]) == 0:
            continue
        for r1 in tid_pe_reads[0]:
            for r2 in tid_pe_reads[1]:
                strand_match = (same_strand ==
                                (r1.is_reverse == r2.is_reverse))
                found_pair = True
                if strand_match:
                    _tag_and_pair(r1, r2,
                                  DiscordantTags.CONCORDANT_TX,
                                  concordant_tx_pairs)
                else:
                    # same transcript but wrong relative strand
                    _tag_and_pair(r1, r2,
                                  DiscordantTags.DISCORDANT_STRAND_TX,
                                  discordant_tx_pairs)
    #
    # pass 2: nothing paired yet, so pair within transcript clusters
    #
    if not found_pair:
        for cluster_id, cluster_pe_reads in clusterdict.iteritems():
            if (len(cluster_pe_reads[0]) == 0 or
                    len(cluster_pe_reads[1]) == 0):
                continue
            for r1 in cluster_pe_reads[0]:
                for r2 in cluster_pe_reads[1]:
                    strand_match = (same_strand ==
                                    (r1.is_reverse == r2.is_reverse))
                    if strand_match:
                        _tag_and_pair(r1, r2,
                                      DiscordantTags.CONCORDANT_GENE,
                                      concordant_cluster_pairs)
                    else:
                        _tag_and_pair(r1, r2,
                                      DiscordantTags.DISCORDANT_STRAND_GENE,
                                      discordant_cluster_pairs)
    # concordant pairs win outright; transcript-level pairs take
    # precedence over cluster-level pairs
    gene_pairs = concordant_tx_pairs or concordant_cluster_pairs
    if gene_pairs:
        return gene_pairs, [], []
    # otherwise fall back to strand-discordant pairs that are still
    # colocalized on the same gene/chromosome
    gene_pairs = discordant_tx_pairs or discordant_cluster_pairs
    if gene_pairs:
        return gene_pairs, [], []
    #
    # no pairing found, so the fragment is treated as discordant:
    # enumerate the putative discordant read1/read2 combinations
    #
    pairs = find_discordant_pairs(pe_reads, library_type)
    if pairs:
        # keep only the best scoring candidates
        return [], select_best_scoring_pairs(pairs), []
    #
    # no valid pair at all: the alignments are either artifacts or not
    # supported by the current transcript annotations
    #
    return [], [], pe_reads