def convert_read_pairs(pairs, transcript_tid_map, library_type):
    """
    Convert paired alignments and yield each distinct pair once.

    Every (r1, r2) in `pairs` is run through convert_read() together with
    `transcript_tid_map` and `library_type`, and the two converted reads
    are handed to pair_reads().  Pairs whose (tid, pos, aend) values for
    both mates repeat an earlier pair are dropped.  Each surviving read
    gets its 'NH' tag set to the number of distinct pairs.

    Yields (r1, r2) tuples of converted reads in first-seen order.
    """
    # convert pairs, keeping only the first pair seen for each key
    pairs_dict = collections.OrderedDict()
    for r1, r2 in pairs:
        newr1 = convert_read(r1, transcript_tid_map, library_type)
        newr2 = convert_read(r2, transcript_tid_map, library_type)
        pair_reads(newr1, newr2)
        # key to identify independent alignments
        k = (newr1.tid, newr1.pos, newr1.aend,
             newr2.tid, newr2.pos, newr2.aend)
        if k not in pairs_dict:
            pairs_dict[k] = (newr1, newr2)
    # compute number of alignment hits
    num_hits = len(pairs_dict)
    # values() exists on both Python 2 and 3, unlike itervalues()
    for r1, r2 in pairs_dict.values():
        tagdict1 = collections.OrderedDict(r1.tags)
        tagdict2 = collections.OrderedDict(r2.tags)
        # annotate multihits
        tagdict1['NH'] = num_hits
        tagdict2['NH'] = num_hits
        # materialize the items so `tags` receives a real list rather than
        # a dict view on Python 3
        r1.tags = list(tagdict1.items())
        r2.tags = list(tagdict2.items())
        yield r1, r2
def convert_read_pairs(pairs, transcript_tid_map, library_type):
    """
    Convert read pairs, drop duplicate alignments and annotate multihits.

    Each (r1, r2) from `pairs` is converted with convert_read() (given
    `transcript_tid_map` and `library_type`) and the converted mates are
    passed to pair_reads().  A pair is kept only if no earlier pair had
    the same (tid, pos, aend) values for both mates.  The 'NH' tag of
    every kept read is set to the number of kept pairs.

    Yields the kept (r1, r2) tuples in the order they were first seen.
    """
    # convert pairs
    pairs_dict = collections.OrderedDict()
    for r1, r2 in pairs:
        newr1 = convert_read(r1, transcript_tid_map, library_type)
        newr2 = convert_read(r2, transcript_tid_map, library_type)
        pair_reads(newr1, newr2)
        # key to identify independent alignments
        k = (newr1.tid, newr1.pos, newr1.aend,
             newr2.tid, newr2.pos, newr2.aend)
        if k not in pairs_dict:
            pairs_dict[k] = (newr1, newr2)
    # compute number of alignment hits
    num_hits = len(pairs_dict)
    # itervalues() only exists on Python 2; values() works on 2 and 3
    for r1, r2 in pairs_dict.values():
        # snapshot both tag sets before either read is modified
        tagdict1 = collections.OrderedDict(r1.tags)
        tagdict2 = collections.OrderedDict(r2.tags)
        for read, tagdict in ((r1, tagdict1), (r2, tagdict2)):
            # annotate multihits
            tagdict['NH'] = num_hits
            # hand over a concrete list, not a Python 3 dict view
            read.tags = list(tagdict.items())
        yield r1, r2
def find_discordant_pairs(pe_reads, library_type):
    """
    build every read1/read2 combination in which one mate has a 5' gene
    alignment and the other mate has a 3' gene alignment

    pe_reads[0] and pe_reads[1] are each split by classify_unpaired_reads()
    into 5' and 3' hits.  returns a list of (read1, read2) tuples; the
    reads in each tuple are produced by copy_read() and then passed to
    pair_reads()
    """
    # split the alignments of each mate into 5' and 3' gene hits
    mate1_5p, mate1_3p = classify_unpaired_reads(pe_reads[0], library_type)
    mate2_5p, mate2_3p = classify_unpaired_reads(pe_reads[1], library_type)
    paired = []
    # a 5' hit of one mate is only combined with a 3' hit of the other mate
    for mate1_hits, mate2_hits in ((mate1_5p, mate2_3p),
                                   (mate1_3p, mate2_5p)):
        for aln1 in mate1_hits:
            for aln2 in mate2_hits:
                # pair copies so the input reads are not passed to
                # pair_reads() themselves
                copy1 = copy_read(aln1)
                copy2 = copy_read(aln2)
                pair_reads(copy1, copy2)
                paired.append((copy1, copy2))
    return paired
def find_discordant_pairs(pe_reads, library_type):
    """
    enumerate candidate discordant pairs for one fragment

    the alignments of read1 (pe_reads[0]) and read2 (pe_reads[1]) are
    classified by classify_unpaired_reads() into 5' and 3' gene hits, and
    every 5'/3' combination across the two mates is turned into a pair.

    returns a list of (read1, read2) tuples built from copy_read() copies
    that have been passed to pair_reads()
    """
    def make_pair(first, second):
        # copy both alignments (read1 first), then pair the copies
        first_copy = copy_read(first)
        second_copy = copy_read(second)
        pair_reads(first_copy, second_copy)
        return (first_copy, second_copy)

    # classify the alignments of each mate as 5' or 3' gene hits
    first_5p, first_3p = classify_unpaired_reads(pe_reads[0], library_type)
    second_5p, second_3p = classify_unpaired_reads(pe_reads[1], library_type)
    candidates = []
    # read1 5' with read2 3' first, then read1 3' with read2 5'
    for first_hits, second_hits in [(first_5p, second_3p),
                                    (first_3p, second_5p)]:
        for first in first_hits:
            for second in second_hits:
                candidates.append(make_pair(first, second))
    return candidates
def find_discordant_pairs(pe_reads, tid_genome_map, library_type):
    """
    iterate through combinations of read1/read2 to predict valid
    discordant read pairs

    classify_unpaired_reads() splits the alignments of each mate into 5'
    gene hits, 3' gene hits and genome hits.  returns a tuple of three
    lists of (read1, read2) tuples:
    1) gene pairs: a 5' gene hit of one mate with a 3' gene hit of the other
    2) genome pairs: a genome hit of read1 with a genome hit of read2
    3) mixed gene/genome pairs that pass cmp_orientation(); only built
       (and the first two lists returned empty) when neither gene nor
       genome pairs exist
    """
    def paired_copy(first, second):
        # copy both alignments (read1 first), then pair the copies
        first_copy = copy_read(first)
        second_copy = copy_read(second)
        pair_reads(first_copy, second_copy)
        return (first_copy, second_copy)

    # classify the reads as 5' or 3' gene alignments or genome alignments
    mate1_5p, mate1_3p, mate1_genome = classify_unpaired_reads(
        pe_reads[0], tid_genome_map, library_type)
    mate2_5p, mate2_3p, mate2_genome = classify_unpaired_reads(
        pe_reads[1], tid_genome_map, library_type)

    # pair 5' and 3' gene alignments
    gene_pairs = []
    for hits1, hits2 in ((mate1_5p, mate2_3p), (mate1_3p, mate2_5p)):
        for aln1 in hits1:
            for aln2 in hits2:
                gene_pairs.append(paired_copy(aln1, aln2))

    # pair genome alignments
    genome_pairs = []
    for aln1 in mate1_genome:
        for aln2 in mate2_genome:
            genome_pairs.append(paired_copy(aln1, aln2))

    if gene_pairs or genome_pairs:
        return gene_pairs, genome_pairs, []

    # nothing paired so far: fall back to gene/genome mixtures
    mixed_pairs = []
    mixed_combos = ((mate1_5p, mate2_genome),
                    (mate1_3p, mate2_genome),
                    (mate1_genome, mate2_5p),
                    (mate1_genome, mate2_3p))
    for hits1, hits2 in mixed_combos:
        for aln1 in hits1:
            for aln2 in hits2:
                # skip combinations whose orientation tags are incompatible
                if not cmp_orientation(aln1.opt(ORIENTATION_TAG_NAME),
                                       aln2.opt(ORIENTATION_TAG_NAME)):
                    continue
                mixed_pairs.append(paired_copy(aln1, aln2))
    return [], [], mixed_pairs
def classify_read_pairs(pe_reads, max_isize, library_type,
                        tid_genome_map, tid_tx_cluster_map):
    """
    examines all the alignments of a single fragment and tries to find ways
    to pair reads together.  annotates all read pairs with an integer tag
    corresponding to a value in the DiscordantTags class

    pe_reads: two-element sequence with the read1 and read2 alignments
    max_isize: largest span (inclusive) allowed between two alignments on
        the same genome reference for them to be paired
    library_type: passed to LibraryTypes.same_strand() to decide whether
        mates are expected on the same or on opposite strands
    tid_genome_map: a reference whose tid is a key of this mapping is
        handled as a transcript, any other reference as genome
    tid_tx_cluster_map: forwarded to map_reads_to_references()

    returns a tuple with the following items:
    1) pairs (r1,r2) aligning to genes (pairs may be discordant); mixed
       gene/genome pairs are also returned here when nothing else pairs
    2) pairs (r1,r2) aligning to genome (pairs may be discordant)
    3) unpaired reads, if any: the (read1, read2) alignments left after
       select_best_mismatch_strata() when no pairing was possible
    """
    # to satisfy library type reads must either be on
    # same strand or opposite strands
    concordant_tx_pairs = []
    discordant_tx_pairs = []
    concordant_gene_pairs = []
    discordant_gene_pairs = []
    concordant_genome_pairs = []
    discordant_genome_pairs = []
    #
    # first, try to pair reads that map to the same transcript, or to the
    # genome within the insert size range
    #
    same_strand = LibraryTypes.same_strand(library_type)
    refdict, clusterdict = map_reads_to_references(pe_reads,
                                                   tid_tx_cluster_map)
    found_pair = False
    # items() instead of the Python 2 only iteritems()
    for tid, tid_pe_reads in refdict.items():
        # check if there are alignments involving both reads in a pair
        if not tid_pe_reads[0] or not tid_pe_reads[1]:
            # no paired alignments exist at this reference
            continue
        for r1 in tid_pe_reads[0]:
            for r2 in tid_pe_reads[1]:
                # read strands must agree with library type
                strand_match = (same_strand ==
                                (r1.is_reverse == r2.is_reverse))
                # check to see if this tid is a gene or genomic
                if tid not in tid_genome_map:
                    # this is a genomic hit so check insert size
                    if r1.pos > r2.pos:
                        isize = r1.aend - r2.pos
                    else:
                        isize = r2.aend - r1.pos
                    if isize > max_isize:
                        # too far apart to be paired on the genome
                        continue
                    # reads are close to each other on same chromosome
                    # so check strand
                    if strand_match:
                        tag_value = DiscordantTags.CONCORDANT_GENOME
                        target = concordant_genome_pairs
                    else:
                        tag_value = DiscordantTags.DISCORDANT_STRAND_GENOME
                        target = discordant_genome_pairs
                else:
                    # this is a hit to same transcript (gene)
                    if strand_match:
                        tag_value = DiscordantTags.CONCORDANT_TX
                        target = concordant_tx_pairs
                    else:
                        # hit to same gene with wrong strand, which
                        # could happen in certain wacky cases
                        tag_value = DiscordantTags.DISCORDANT_STRAND_TX
                        target = discordant_tx_pairs
                # these reads can be paired
                found_pair = True
                cr1 = copy_read(r1)
                cr2 = copy_read(r2)
                target.append((cr1, cr2))
                pair_reads(cr1, cr2, [(DISCORDANT_TAG_NAME, tag_value)])
    # at this point, if we have not been able to find a suitable way
    # to pair the reads, then search within the transcript cluster
    if not found_pair:
        for cluster_pe_reads in clusterdict.values():
            # check if there are alignments involving both reads in a pair
            if not cluster_pe_reads[0] or not cluster_pe_reads[1]:
                # no paired alignments in this transcript cluster
                continue
            for r1 in cluster_pe_reads[0]:
                for r2 in cluster_pe_reads[1]:
                    # check strand compatibility
                    strand_match = (same_strand ==
                                    (r1.is_reverse == r2.is_reverse))
                    # these reads can be paired
                    cr1 = copy_read(r1)
                    cr2 = copy_read(r2)
                    if strand_match:
                        tags = [(DISCORDANT_TAG_NAME,
                                 DiscordantTags.CONCORDANT_GENE)]
                        concordant_gene_pairs.append((cr1, cr2))
                    else:
                        tags = [(DISCORDANT_TAG_NAME,
                                 DiscordantTags.DISCORDANT_STRAND_GENE)]
                        discordant_gene_pairs.append((cr1, cr2))
                    pair_reads(cr1, cr2, tags)
    # at this point, we have tried all combinations.  if any paired reads
    # are concordant then return them without considering discordant reads
    # (same-transcript pairs take precedence over same-cluster pairs)
    gene_pairs = concordant_tx_pairs or concordant_gene_pairs
    if gene_pairs or concordant_genome_pairs:
        return gene_pairs, concordant_genome_pairs, []
    # if no concordant reads in transcripts or genome, return any
    # discordant reads that may violate strand requirements but still
    # remain colocalized on the same gene/chromosome
    gene_pairs = discordant_tx_pairs or discordant_gene_pairs
    if gene_pairs or discordant_genome_pairs:
        return gene_pairs, discordant_genome_pairs, []
    #
    # at this point, no read pairings were found so the read is
    # assumed to be discordant.
    #
    # TODO: now that we know that the reads are discordant, no reason
    # to keep all the mappings hanging around if there is a small subset
    # with a small number of mismatches.  is this the right thing to do
    # here?
    #
    pe_reads = (select_best_mismatch_strata(pe_reads[0]),
                select_best_mismatch_strata(pe_reads[1]))
    #
    # now we can create all valid combinations of read1/read2 as putative
    # discordant read pairs
    #
    gene_pairs, genome_pairs, combo_pairs = \
        find_discordant_pairs(pe_reads, tid_genome_map, library_type)
    if gene_pairs or genome_pairs:
        return gene_pairs, genome_pairs, []
    elif combo_pairs:
        return combo_pairs, [], []
    # last resort suggests that there are some complex read mappings that
    # don't make sense and cannot be explained, warranting further
    # investigation
    return [], [], pe_reads
def classify_read_pairs(pe_reads, max_isize, library_type, tid_tx_map):
    """
    examines all the alignments of a single fragment and tries to find ways
    to pair reads together.  annotates all read pairs with an integer tag
    corresponding to a value in the DiscordantTags class

    pe_reads: two-element sequence with the read1 and read2 alignments
    max_isize: not used by this function; kept in the signature for callers
    library_type: passed to LibraryTypes.same_strand() to decide whether
        mates are expected on the same or on opposite strands
    tid_tx_map: forwarded to map_reads_to_references()

    returns a tuple containing 3 items:
    1) (r1,r2) pairs found on the same transcript or, failing that, in the
       same transcript cluster; concordant pairs when any exist, otherwise
       the strand-discordant ones
    2) (r1,r2) pairs from find_discordant_pairs(), filtered by
       select_best_scoring_pairs(); only filled when list 1 is empty
    3) unpaired reads: `pe_reads` itself when no pair could be formed
    """
    # to satisfy library type reads must either be on
    # same strand or opposite strands
    concordant_tx_pairs = []
    discordant_tx_pairs = []
    concordant_cluster_pairs = []
    discordant_cluster_pairs = []
    #
    # first, try to pair reads that map to the same transcript or
    # cluster or overlapping transcripts
    #
    same_strand = LibraryTypes.same_strand(library_type)
    refdict, clusterdict = map_reads_to_references(pe_reads, tid_tx_map)
    found_pair = False
    # values() instead of the Python 2 only iteritems(); the reference id
    # itself is not needed here
    for tid_pe_reads in refdict.values():
        # check if there are alignments involving both reads in a pair
        if not tid_pe_reads[0] or not tid_pe_reads[1]:
            # no paired alignments exist at this reference
            continue
        for r1 in tid_pe_reads[0]:
            for r2 in tid_pe_reads[1]:
                # read strands must agree with library type
                strand_match = (same_strand ==
                                (r1.is_reverse == r2.is_reverse))
                # these reads can be paired
                found_pair = True
                cr1 = copy_read(r1)
                cr2 = copy_read(r2)
                # this is a hit to same transcript (gene)
                # pair the reads if strand comparison is correct
                if strand_match:
                    tags = [(DISCORDANT_TAG_NAME,
                             DiscordantTags.CONCORDANT_TX)]
                    concordant_tx_pairs.append((cr1, cr2))
                else:
                    # hit to same gene with wrong strand, which
                    # could happen in certain wacky cases
                    tags = [(DISCORDANT_TAG_NAME,
                             DiscordantTags.DISCORDANT_STRAND_TX)]
                    discordant_tx_pairs.append((cr1, cr2))
                pair_reads(cr1, cr2, tags)
    # at this point, if we have not been able to find a suitable way
    # to pair the reads, then search within the transcript cluster
    if not found_pair:
        for cluster_pe_reads in clusterdict.values():
            # check if there are alignments involving both reads in a pair
            if not cluster_pe_reads[0] or not cluster_pe_reads[1]:
                # no paired alignments in this transcript cluster
                continue
            for r1 in cluster_pe_reads[0]:
                for r2 in cluster_pe_reads[1]:
                    # check strand compatibility
                    strand_match = (same_strand ==
                                    (r1.is_reverse == r2.is_reverse))
                    # these reads can be paired
                    cr1 = copy_read(r1)
                    cr2 = copy_read(r2)
                    if strand_match:
                        tags = [(DISCORDANT_TAG_NAME,
                                 DiscordantTags.CONCORDANT_GENE)]
                        concordant_cluster_pairs.append((cr1, cr2))
                    else:
                        tags = [(DISCORDANT_TAG_NAME,
                                 DiscordantTags.DISCORDANT_STRAND_GENE)]
                        discordant_cluster_pairs.append((cr1, cr2))
                    pair_reads(cr1, cr2, tags)
    # at this point, we have tried all combinations.  if any paired reads
    # are concordant then return them without considering discordant reads
    # (same-transcript pairs take precedence over same-cluster pairs)
    gene_pairs = concordant_tx_pairs or concordant_cluster_pairs
    if gene_pairs:
        return gene_pairs, [], []
    # if no concordant reads in transcripts, return any discordant reads
    # that may violate strand requirements but still remain colocalized
    # on the same gene/chromosome
    gene_pairs = discordant_tx_pairs or discordant_cluster_pairs
    if gene_pairs:
        return gene_pairs, [], []
    #
    # at this point, no read pairings were found so the read is
    # assumed to be discordant.  now we can create all valid
    # combinations of read1/read2 as putative discordant read pairs
    #
    pairs = find_discordant_pairs(pe_reads, library_type)
    if len(pairs) > 0:
        # sort valid pairs by sum of alignment score and retain the best
        # scoring pairs
        pairs = select_best_scoring_pairs(pairs)
        return [], pairs, []
    #
    # no valid pairs could be found suggesting that these alignments are
    # either artifacts or that the current transcript annotations do not
    # support this pair
    #
    return [], [], pe_reads
def classify_read_pairs(pe_reads, max_isize, library_type, tid_tx_map):
    """
    examines all the alignments of a single fragment and tries to find ways
    to pair reads together.  annotates all read pairs with an integer tag
    corresponding to a value in the DiscordantTags class

    pe_reads: two-element sequence with the read1 and read2 alignments
    max_isize: accepted for interface compatibility; not read in the body
    library_type: given to LibraryTypes.same_strand() to learn whether
        mates should lie on the same strand or on opposite strands
    tid_tx_map: handed to map_reads_to_references()

    returns a tuple containing 3 items:
    1) (r1,r2) pairs located on one transcript, or else in one transcript
       cluster; concordant pairs are preferred and strand-discordant pairs
       are returned only when no concordant pair exists
    2) (r1,r2) pairs produced by find_discordant_pairs() and reduced by
       select_best_scoring_pairs(); only filled when list 1 is empty
    3) unpaired reads: the unchanged `pe_reads` if nothing could be paired
    """
    # to satisfy library type reads must either be on
    # same strand or opposite strands
    concordant_tx_pairs = []
    discordant_tx_pairs = []
    concordant_cluster_pairs = []
    discordant_cluster_pairs = []
    same_strand = LibraryTypes.same_strand(library_type)
    refdict, clusterdict = map_reads_to_references(pe_reads, tid_tx_map)
    #
    # first, try to pair reads that map to the same transcript
    #
    # dict.values() replaces the Python 2 only iteritems(); the keys of
    # refdict are not used
    for tid_pe_reads in refdict.values():
        r1_alignments, r2_alignments = tid_pe_reads[0], tid_pe_reads[1]
        # check if there are alignments involving both reads in a pair
        if not r1_alignments or not r2_alignments:
            # no paired alignments exist at this reference
            continue
        for r1 in r1_alignments:
            for r2 in r2_alignments:
                # read strands must agree with library type
                strand_match = (same_strand ==
                                (r1.is_reverse == r2.is_reverse))
                cr1 = copy_read(r1)
                cr2 = copy_read(r2)
                # this is a hit to same transcript (gene)
                if strand_match:
                    tags = [(DISCORDANT_TAG_NAME,
                             DiscordantTags.CONCORDANT_TX)]
                    concordant_tx_pairs.append((cr1, cr2))
                else:
                    # hit to same gene with wrong strand, which
                    # could happen in certain wacky cases
                    tags = [(DISCORDANT_TAG_NAME,
                             DiscordantTags.DISCORDANT_STRAND_TX)]
                    discordant_tx_pairs.append((cr1, cr2))
                pair_reads(cr1, cr2, tags)
    # every same-transcript combination lands in one of the two tx lists,
    # so "a pair was found" is the same as "either list is non-empty"
    found_pair = bool(concordant_tx_pairs or discordant_tx_pairs)
    # at this point, if we have not been able to find a suitable way
    # to pair the reads, then search within the transcript cluster
    if not found_pair:
        for cluster_pe_reads in clusterdict.values():
            r1_alignments = cluster_pe_reads[0]
            r2_alignments = cluster_pe_reads[1]
            # check if there are alignments involving both reads in a pair
            if not r1_alignments or not r2_alignments:
                # no paired alignments in this transcript cluster
                continue
            for r1 in r1_alignments:
                for r2 in r2_alignments:
                    # check strand compatibility
                    strand_match = (same_strand ==
                                    (r1.is_reverse == r2.is_reverse))
                    cr1 = copy_read(r1)
                    cr2 = copy_read(r2)
                    if strand_match:
                        tags = [(DISCORDANT_TAG_NAME,
                                 DiscordantTags.CONCORDANT_GENE)]
                        concordant_cluster_pairs.append((cr1, cr2))
                    else:
                        tags = [(DISCORDANT_TAG_NAME,
                                 DiscordantTags.DISCORDANT_STRAND_GENE)]
                        discordant_cluster_pairs.append((cr1, cr2))
                    pair_reads(cr1, cr2, tags)
    # at this point, we have tried all combinations.  if any paired reads
    # are concordant then return them without considering discordant reads
    gene_pairs = concordant_tx_pairs or concordant_cluster_pairs
    if gene_pairs:
        return gene_pairs, [], []
    # if no concordant reads in transcripts, return any discordant reads
    # that may violate strand requirements but still remain colocalized
    # on the same gene/chromosome
    gene_pairs = discordant_tx_pairs or discordant_cluster_pairs
    if gene_pairs:
        return gene_pairs, [], []
    #
    # at this point, no read pairings were found so the read is
    # assumed to be discordant.  now we can create all valid
    # combinations of read1/read2 as putative discordant read pairs
    #
    pairs = find_discordant_pairs(pe_reads, library_type)
    if len(pairs) > 0:
        # sort valid pairs by sum of alignment score and retain the best
        # scoring pairs
        pairs = select_best_scoring_pairs(pairs)
        return [], pairs, []
    #
    # no valid pairs could be found suggesting that these alignments are
    # either artifacts or that the current transcript annotations do not
    # support this pair
    #
    return [], [], pe_reads