コード例 #1
0
def convert_read_pairs(pairs, transcript_tid_map, library_type):
    """
    convert read pairs, drop repeated alignments and annotate hit counts

    every (r1, r2) in 'pairs' is passed through convert_read() together
    with 'transcript_tid_map' and 'library_type', and the two converted
    reads are then linked with pair_reads().  converted pairs that share
    the same (tid, pos, aend) for both mates are treated as the same
    alignment and only the first one encountered is kept.

    this is a generator: it yields (r1, r2) tuples of converted reads in
    first-seen order, each read carrying an 'NH' tag equal to the number
    of distinct pairs that were retained.  all input pairs are consumed
    before the first pair is yielded, because the hit count is only known
    at that point.
    """
    # convert pairs, remembering the first pair seen for each alignment key
    pairs_dict = collections.OrderedDict()
    for r1, r2 in pairs:
        newr1 = convert_read(r1, transcript_tid_map, library_type)
        newr2 = convert_read(r2, transcript_tid_map, library_type)
        pair_reads(newr1, newr2)
        # key to identify independent alignments
        k = (newr1.tid, newr1.pos, newr1.aend,
             newr2.tid, newr2.pos, newr2.aend)
        if k not in pairs_dict:
            pairs_dict[k] = (newr1, newr2)
    # compute number of alignment hits
    num_hits = len(pairs_dict)
    # values() exists on both python 2 and python 3 dictionaries, whereas
    # itervalues() is python 2 only
    for r1, r2 in pairs_dict.values():
        for read in (r1, r2):
            # annotate multihits, preserving the order of the existing tags
            tagdict = collections.OrderedDict(read.tags)
            tagdict['NH'] = num_hits
            # materialize as a list: on python 3 items() is only a view
            read.tags = list(tagdict.items())
        yield r1, r2
コード例 #2
0
def convert_read_pairs(pairs, transcript_tid_map, library_type):
    """
    generator that converts read pairs and tags them with a hit count

    each incoming (r1, r2) is converted with convert_read() (which is
    given 'transcript_tid_map' and 'library_type') and the converted mates
    are joined with pair_reads().  a pair is identified by the tuple
    (tid, pos, aend) of both converted mates; when several pairs produce
    the same tuple only the first is retained.

    yields the retained (r1, r2) pairs in the order they were first seen,
    after setting the 'NH' tag of both mates to the number of retained
    pairs.  the whole of 'pairs' is read before anything is yielded.
    """
    def set_num_hits(read, num_hits):
        # rebuild the tag list with NH added or overwritten, keeping the
        # order of the tags that were already present
        tagdict = collections.OrderedDict(read.tags)
        tagdict['NH'] = num_hits
        # a real list is assigned because items() is a view on python 3
        read.tags = list(tagdict.items())

    # convert pairs
    unique_pairs = collections.OrderedDict()
    for r1, r2 in pairs:
        newr1 = convert_read(r1, transcript_tid_map, library_type)
        newr2 = convert_read(r2, transcript_tid_map, library_type)
        pair_reads(newr1, newr2)
        # key to identify independent alignments
        k = (newr1.tid, newr1.pos, newr1.aend, newr2.tid, newr2.pos,
             newr2.aend)
        # setdefault() leaves an existing entry alone, so the first pair
        # seen for a key wins
        unique_pairs.setdefault(k, (newr1, newr2))
    # compute number of alignment hits
    num_hits = len(unique_pairs)
    # values() is available on python 2 and 3; itervalues() was removed
    # in python 3
    for r1, r2 in unique_pairs.values():
        set_num_hits(r1, num_hits)
        set_num_hits(r2, num_hits)
        yield r1, r2
コード例 #3
0
def find_discordant_pairs(pe_reads, library_type):
    """
    build candidate discordant pairs from the alignments of both mates

    pe_reads[0] and pe_reads[1] are each split by classify_unpaired_reads()
    into a 5' group and a 3' group.  every 5' alignment of one mate is
    combined with every 3' alignment of the other mate (read1 5' with
    read2 3' first, then read1 3' with read2 5').

    returns a list of (read1, read2) tuples; the reads in it are copies
    made with copy_read() that have been linked with pair_reads()
    """
    # classify the reads as 5' or 3' gene alignments
    mate1_5p, mate1_3p = classify_unpaired_reads(pe_reads[0], library_type)
    mate2_5p, mate2_3p = classify_unpaired_reads(pe_reads[1], library_type)
    # a 5' hit may only be partnered with a 3' hit of the other mate
    partner_lists = ((mate1_5p, mate2_3p), (mate1_3p, mate2_5p))
    discordant = []
    for mate1_hits, mate2_hits in partner_lists:
        for hit1 in mate1_hits:
            for hit2 in mate2_hits:
                # work on copies so the input alignments are not the
                # objects handed to pair_reads()
                copy1 = copy_read(hit1)
                copy2 = copy_read(hit2)
                pair_reads(copy1, copy2)
                discordant.append((copy1, copy2))
    return discordant
コード例 #4
0
def find_discordant_pairs(pe_reads, library_type):
    """
    enumerate read1/read2 combinations that could form discordant pairs

    classify_unpaired_reads() is called once per mate and returns that
    mate's 5' and 3' gene alignments.  the 5' alignments of read1 are
    crossed with the 3' alignments of read2, followed by the 3' alignments
    of read1 crossed with the 5' alignments of read2.

    returns a list of (read1, read2) tuples made of copy_read() copies
    that were passed to pair_reads()
    """
    def make_pair(aln1, aln2):
        # copy both alignments, link the copies and hand them back
        first = copy_read(aln1)
        second = copy_read(aln2)
        pair_reads(first, second)
        return first, second

    # classify the reads as 5' or 3' gene alignments
    first_5p, first_3p = classify_unpaired_reads(pe_reads[0], library_type)
    second_5p, second_3p = classify_unpaired_reads(pe_reads[1], library_type)
    # pair 5' and 3' gene alignments
    results = []
    for first_hits, second_hits in [(first_5p, second_3p),
                                    (first_3p, second_5p)]:
        for aln1 in first_hits:
            for aln2 in second_hits:
                results.append(make_pair(aln1, aln2))
    return results
コード例 #5
0
def find_discordant_pairs(pe_reads,
                          tid_genome_map,
                          library_type):
    """
    enumerate read1/read2 combinations that could form discordant pairs

    each mate's alignments are split by classify_unpaired_reads() into
    5' gene hits, 3' gene hits and genome hits.  three kinds of pairing
    are attempted:

    - gene pairs: a 5' gene hit of one mate with a 3' gene hit of the other
    - genome pairs: a genome hit of read1 with a genome hit of read2
    - mixed pairs: a gene hit of one mate with a genome hit of the other,
      kept only when cmp_orientation() accepts the orientation tags of the
      two reads.  mixed pairs are attempted only if no gene pair and no
      genome pair was found.

    returns a 3-tuple (gene_pairs, genome_pairs, mixed_pairs).  either the
    first two lists are populated and the third is empty, or the first two
    are empty and the third holds the mixed pairs (possibly none).  all
    reads in the result are copy_read() copies linked with pair_reads().
    """
    def paired_copies(aln1, aln2):
        # copy both alignments and link the copies to each other
        mate1 = copy_read(aln1)
        mate2 = copy_read(aln2)
        pair_reads(mate1, mate2)
        return mate1, mate2

    # classify the reads as 5' or 3' gene alignments or genome alignments
    r1_5p, r1_3p, r1_genome = \
        classify_unpaired_reads(pe_reads[0], tid_genome_map, library_type)
    r2_5p, r2_3p, r2_genome = \
        classify_unpaired_reads(pe_reads[1], tid_genome_map, library_type)
    # pair 5' and 3' gene alignments
    gene_pairs = []
    for hits1, hits2 in ((r1_5p, r2_3p), (r1_3p, r2_5p)):
        for aln1 in hits1:
            for aln2 in hits2:
                gene_pairs.append(paired_copies(aln1, aln2))
    # pair genome alignments
    genome_pairs = [paired_copies(aln1, aln2)
                    for aln1 in r1_genome
                    for aln2 in r2_genome]
    if gene_pairs or genome_pairs:
        return gene_pairs, genome_pairs, []
    # if no pairs were found, then we can try to pair gene reads
    # with genome reads
    mixed_pairs = []
    mixed_combos = ((r1_5p, r2_genome),
                    (r1_3p, r2_genome),
                    (r1_genome, r2_5p),
                    (r1_genome, r2_3p))
    for hits1, hits2 in mixed_combos:
        for aln1 in hits1:
            for aln2 in hits2:
                # skip combinations with incompatible orientation
                if not cmp_orientation(aln1.opt(ORIENTATION_TAG_NAME),
                                       aln2.opt(ORIENTATION_TAG_NAME)):
                    continue
                mixed_pairs.append(paired_copies(aln1, aln2))
    return [], [], mixed_pairs
コード例 #6
0
def classify_read_pairs(pe_reads, max_isize,
                        library_type, tid_genome_map,
                        tid_tx_cluster_map):
    """
    examines all the alignments of a single fragment and tries to find ways
    to pair reads together.

    pairs produced directly by this function are passed to pair_reads()
    together with a (DISCORDANT_TAG_NAME, value) tag whose value is taken
    from the DiscordantTags class

    pairing is attempted in this order, stopping at the first stage that
    produces something to return:

    1) alignments of both mates to the same reference.  when the reference
       id is not a key of 'tid_genome_map' the pair is treated as a genome
       hit and is only accepted if the span covered by the two reads is at
       most 'max_isize'; otherwise it is treated as a hit to the same
       transcript.
    2) only if stage 1 paired nothing: alignments of both mates within the
       same transcript cluster.
    3) find_discordant_pairs() on the best mismatch strata of each mate.

    within stages 1 and 2, pairs whose strands agree with the library type
    are preferred; pairs with disagreeing strands are returned only when no
    agreeing pair exists.

    returns a tuple (gene_pairs, genome_pairs, unpaired):
    1) pairs (r1,r2) aligning to genes (pairs may be discordant).  mixed
       gene/genome pairs from find_discordant_pairs() are also returned
       in this position
    2) pairs (r1,r2) aligning to genome (pairs may be discordant)
    3) [] whenever any pair was found; otherwise the two-element tuple of
       best-strata alignments for read1 and read2
    """
    concordant_tx_pairs = []
    discordant_tx_pairs = []
    concordant_gene_pairs = []
    discordant_gene_pairs = []
    concordant_genome_pairs = []
    discordant_genome_pairs = []
    #
    # first, try to pair reads that map to the same transcript, or to the
    # genome within the insert size range
    #
    # to satisfy library type reads must either be on
    # same strand or opposite strands
    same_strand = LibraryTypes.same_strand(library_type)
    refdict, clusterdict = map_reads_to_references(pe_reads,
                                                   tid_tx_cluster_map)
    found_pair = False
    # items() works on python 2 and 3; iteritems() is python 2 only
    for tid, tid_pe_reads in refdict.items():
        # check if there are alignments involving both reads in a pair
        if len(tid_pe_reads[0]) == 0 or len(tid_pe_reads[1]) == 0:
            # no paired alignments exist at this reference
            continue
        # references missing from tid_genome_map are handled as genomic
        # hits; this does not depend on the individual reads
        is_genomic = tid not in tid_genome_map
        for r1 in tid_pe_reads[0]:
            for r2 in tid_pe_reads[1]:
                # read strands must agree with library type
                strand_match = (same_strand ==
                                (r1.is_reverse == r2.is_reverse))
                if is_genomic:
                    # this is a genomic hit so check insert size, measured
                    # from the start of the leftmost read to the end of
                    # the other read
                    if r1.pos > r2.pos:
                        isize = r1.aend - r2.pos
                    else:
                        isize = r2.aend - r1.pos
                    if isize > max_isize:
                        # too far apart to be paired here
                        continue
                    # reads are close to each other on same chromosome
                    # so check strand
                    if strand_match:
                        tag = DiscordantTags.CONCORDANT_GENOME
                        dest = concordant_genome_pairs
                    else:
                        tag = DiscordantTags.DISCORDANT_STRAND_GENOME
                        dest = discordant_genome_pairs
                elif strand_match:
                    # this is a hit to same transcript (gene)
                    tag = DiscordantTags.CONCORDANT_TX
                    dest = concordant_tx_pairs
                else:
                    # hit to same gene with wrong strand, which
                    # could happen in certain wacky cases
                    tag = DiscordantTags.DISCORDANT_STRAND_TX
                    dest = discordant_tx_pairs
                # these reads can be paired
                found_pair = True
                cr1 = copy_read(r1)
                cr2 = copy_read(r2)
                dest.append((cr1, cr2))
                pair_reads(cr1, cr2, [(DISCORDANT_TAG_NAME, tag)])
    # at this point, if we have not been able to find a suitable way
    # to pair the reads, then search within the transcript cluster
    if not found_pair:
        for cluster_pe_reads in clusterdict.values():
            # check if there are alignments involving both reads in a pair
            if (len(cluster_pe_reads[0]) == 0 or
                    len(cluster_pe_reads[1]) == 0):
                # no paired alignments in this transcript cluster
                continue
            for r1 in cluster_pe_reads[0]:
                for r2 in cluster_pe_reads[1]:
                    # check strand compatibility
                    strand_match = (same_strand ==
                                    (r1.is_reverse == r2.is_reverse))
                    if strand_match:
                        tag = DiscordantTags.CONCORDANT_GENE
                        dest = concordant_gene_pairs
                    else:
                        tag = DiscordantTags.DISCORDANT_STRAND_GENE
                        dest = discordant_gene_pairs
                    # these reads can be paired
                    cr1 = copy_read(r1)
                    cr2 = copy_read(r2)
                    dest.append((cr1, cr2))
                    pair_reads(cr1, cr2, [(DISCORDANT_TAG_NAME, tag)])
    # at this point, we have tried all combinations.  if any paired reads
    # are concordant then return them without considering discordant reads.
    # same-transcript pairs take precedence over same-cluster pairs.
    gene_pairs = concordant_tx_pairs or concordant_gene_pairs
    if gene_pairs or concordant_genome_pairs:
        return gene_pairs, concordant_genome_pairs, []
    # if no concordant reads in transcripts or genome, return any
    # discordant reads that may violate strand requirements but still
    # remain colocalized on the same gene/chromosome
    gene_pairs = discordant_tx_pairs or discordant_gene_pairs
    if gene_pairs or discordant_genome_pairs:
        return gene_pairs, discordant_genome_pairs, []
    #
    # at this point, no read pairings were found so the read is
    # assumed to be discordant.
    #
    # TODO: now that we know that the reads are discordant, no reason
    # to keep all the mappings hanging around if there is a small subset
    # with a small number of mismatches.  is this the right thing to do
    # here?
    #
    pe_reads = (select_best_mismatch_strata(pe_reads[0]),
                select_best_mismatch_strata(pe_reads[1]))
    #
    # now we can create all valid combinations of read1/read2 as putative
    # discordant read pairs
    #
    gene_pairs, genome_pairs, combo_pairs = \
        find_discordant_pairs(pe_reads, tid_genome_map, library_type)
    if gene_pairs or genome_pairs:
        return gene_pairs, genome_pairs, []
    if combo_pairs:
        # mixed gene/genome pairs are reported in the gene position
        return combo_pairs, [], []
    # last resort suggests that there are some complex read mappings that
    # don't make sense and cannot be explained, warranting further
    # investigation
    return [], [], pe_reads
コード例 #7
0
def classify_read_pairs(pe_reads, max_isize, library_type, tid_tx_map):
    """
    examines all the alignments of a single fragment and tries to find ways
    to pair reads together.

    pairs produced directly by this function are passed to pair_reads()
    together with a (DISCORDANT_TAG_NAME, value) tag whose value is taken
    from the DiscordantTags class

    pairing is attempted in this order:

    1) alignments of both mates to the same reference (transcript)
    2) only if stage 1 paired nothing: alignments of both mates within the
       same transcript cluster
    3) only if stages 1 and 2 paired nothing: find_discordant_pairs(),
       filtered through select_best_scoring_pairs()

    'max_isize' is accepted but not used by this function.

    returns a tuple containing 3 items, at most one of which is non-empty:
    1) (r1,r2) pairs found on the same transcript or transcript cluster.
       pairs whose strands agree with the library type are preferred, and
       same-transcript pairs are preferred over same-cluster pairs; pairs
       with disagreeing strands are returned here only when no agreeing
       pair exists
    2) discordant (r1,r2) pairs from find_discordant_pairs()
    3) 'pe_reads' itself when nothing could be paired, otherwise []
    """
    concordant_tx_pairs = []
    discordant_tx_pairs = []
    concordant_cluster_pairs = []
    discordant_cluster_pairs = []
    #
    # first, try to pair reads that map to the same transcript or
    # cluster or overlapping transcripts
    #
    # to satisfy library type reads must either be on
    # same strand or opposite strands
    same_strand = LibraryTypes.same_strand(library_type)
    refdict, clusterdict = map_reads_to_references(pe_reads, tid_tx_map)
    found_pair = False
    # only the grouped reads are needed, not the reference ids; values()
    # also works on python 3, where iteritems() does not exist
    for tid_pe_reads in refdict.values():
        # check if there are alignments involving both reads in a pair
        if len(tid_pe_reads[0]) == 0 or len(tid_pe_reads[1]) == 0:
            # no paired alignments exist at this reference
            continue
        for r1 in tid_pe_reads[0]:
            for r2 in tid_pe_reads[1]:
                # read strands must agree with library type
                strand_match = (same_strand ==
                                (r1.is_reverse == r2.is_reverse))
                # these reads can be paired
                found_pair = True
                cr1 = copy_read(r1)
                cr2 = copy_read(r2)
                # this is a hit to same transcript (gene)
                if strand_match:
                    tags = [(DISCORDANT_TAG_NAME,
                             DiscordantTags.CONCORDANT_TX)]
                    concordant_tx_pairs.append((cr1, cr2))
                else:
                    # hit to same gene with wrong strand, which
                    # could happen in certain wacky cases
                    tags = [(DISCORDANT_TAG_NAME,
                             DiscordantTags.DISCORDANT_STRAND_TX)]
                    discordant_tx_pairs.append((cr1, cr2))
                pair_reads(cr1, cr2, tags)
    # at this point, if we have not been able to find a suitable way
    # to pair the reads, then search within the transcript cluster
    if not found_pair:
        for cluster_pe_reads in clusterdict.values():
            # check if there are alignments involving both reads in a pair
            if (len(cluster_pe_reads[0]) == 0 or
                    len(cluster_pe_reads[1]) == 0):
                # no paired alignments in this transcript cluster
                continue
            for r1 in cluster_pe_reads[0]:
                for r2 in cluster_pe_reads[1]:
                    # check strand compatibility
                    strand_match = (same_strand ==
                                    (r1.is_reverse == r2.is_reverse))
                    cr1 = copy_read(r1)
                    cr2 = copy_read(r2)
                    if strand_match:
                        tags = [(DISCORDANT_TAG_NAME,
                                 DiscordantTags.CONCORDANT_GENE)]
                        concordant_cluster_pairs.append((cr1, cr2))
                    else:
                        tags = [(DISCORDANT_TAG_NAME,
                                 DiscordantTags.DISCORDANT_STRAND_GENE)]
                        discordant_cluster_pairs.append((cr1, cr2))
                    pair_reads(cr1, cr2, tags)
    # at this point, we have tried all combinations.  return the first
    # non-empty group in order of preference: concordant pairs before
    # strand-discordant ones, and same-transcript before same-cluster.
    # strand-discordant pairs still remain colocalized on the same gene.
    gene_pairs = (concordant_tx_pairs or concordant_cluster_pairs or
                  discordant_tx_pairs or discordant_cluster_pairs)
    if gene_pairs:
        return gene_pairs, [], []
    #
    # at this point, no read pairings were found so the read is
    # assumed to be discordant. now we can create all valid
    # combinations of read1/read2 as putative discordant read pairs
    #
    pairs = find_discordant_pairs(pe_reads, library_type)
    if pairs:
        # sort valid pairs by sum of alignment score and retain the best
        # scoring pairs
        pairs = select_best_scoring_pairs(pairs)
        return [], pairs, []
    #
    # no valid pairs could be found suggesting that these alignments are
    # either artifacts or that the current transcript annotations do not
    # support this pair
    #
    return [], [], pe_reads
コード例 #8
0
def classify_read_pairs(pe_reads, max_isize,
                        library_type,
                        tid_tx_map):
    """
    examines all the alignments of a single fragment and tries to find ways
    to pair reads together.

    pairs produced directly by this function are passed to pair_reads()
    together with a (DISCORDANT_TAG_NAME, value) tag whose value is taken
    from the DiscordantTags class

    the search proceeds in stages and stops at the first one that yields
    pairs: same reference (transcript), then same transcript cluster, then
    find_discordant_pairs() followed by select_best_scoring_pairs().

    'max_isize' is accepted but not used by this function.

    returns a tuple containing 3 items, at most one of which is non-empty:
    1) (r1,r2) pairs on the same transcript or transcript cluster.  the
       first non-empty group among concordant-transcript,
       concordant-cluster, strand-discordant-transcript and
       strand-discordant-cluster pairs is returned
    2) discordant (r1,r2) pairs from find_discordant_pairs()
    3) 'pe_reads' itself when nothing could be paired, otherwise []
    """
    # to satisfy library type reads must either be on
    # same strand or opposite strands
    same_strand = LibraryTypes.same_strand(library_type)
    refdict, clusterdict = map_reads_to_references(pe_reads, tid_tx_map)

    def pair_within(groups, concordant_tag, discordant_tag):
        # pair every read1 alignment with every read2 alignment inside
        # each group and split the pairs by strand agreement
        concordant = []
        discordant = []
        for group_reads in groups:
            # check if there are alignments involving both reads in a pair
            if len(group_reads[0]) == 0 or len(group_reads[1]) == 0:
                continue
            for r1 in group_reads[0]:
                for r2 in group_reads[1]:
                    # read strands must agree with library type
                    strand_match = (same_strand ==
                                    (r1.is_reverse == r2.is_reverse))
                    cr1 = copy_read(r1)
                    cr2 = copy_read(r2)
                    if strand_match:
                        tag, dest = concordant_tag, concordant
                    else:
                        # wrong strand, which could happen in certain
                        # wacky cases
                        tag, dest = discordant_tag, discordant
                    dest.append((cr1, cr2))
                    pair_reads(cr1, cr2, [(DISCORDANT_TAG_NAME, tag)])
        return concordant, discordant

    #
    # first, try to pair reads that map to the same transcript.  values()
    # is used because the reference ids are not needed, and because
    # iteritems() does not exist on python 3
    #
    concordant_tx_pairs, discordant_tx_pairs = pair_within(
        refdict.values(),
        DiscordantTags.CONCORDANT_TX,
        DiscordantTags.DISCORDANT_STRAND_TX)
    # if we have not been able to find a suitable way to pair the reads,
    # then search within the transcript cluster
    concordant_cluster_pairs = []
    discordant_cluster_pairs = []
    if not (concordant_tx_pairs or discordant_tx_pairs):
        concordant_cluster_pairs, discordant_cluster_pairs = pair_within(
            clusterdict.values(),
            DiscordantTags.CONCORDANT_GENE,
            DiscordantTags.DISCORDANT_STRAND_GENE)
    # at this point, we have tried all combinations.  if any paired reads
    # are concordant then return them without considering discordant
    # reads; otherwise return any discordant reads that may violate strand
    # requirements but still remain colocalized on the same gene
    for candidates in (concordant_tx_pairs, concordant_cluster_pairs,
                       discordant_tx_pairs, discordant_cluster_pairs):
        if candidates:
            return candidates, [], []
    #
    # at this point, no read pairings were found so the read is
    # assumed to be discordant. now we can create all valid
    # combinations of read1/read2 as putative discordant read pairs
    #
    pairs = find_discordant_pairs(pe_reads, library_type)
    if pairs:
        # sort valid pairs by sum of alignment score and retain the best
        # scoring pairs
        pairs = select_best_scoring_pairs(pairs)
        return [], pairs, []
    #
    # no valid pairs could be found suggesting that these alignments are
    # either artifacts or that the current transcript annotations do not
    # support this pair
    #
    return [], [], pe_reads