def classify_read_pairs(pe_reads, max_isize, library_type,
                        tid_genome_map, tid_tx_cluster_map):
    """
    Pair the read1/read2 alignments of a single fragment.

    Every (read1, read2) combination that can be paired is copied with
    copy_read(), tagged through pair_reads() with a DiscordantTags value
    stored under DISCORDANT_TAG_NAME, and collected.

    Pairing is attempted in this order:
      1) alignments sharing a reference id.  A reference id that is NOT
         in tid_genome_map is handled as a genomic hit and is paired
         only when the computed insert size is <= max_isize; any other
         reference id is handled as a hit to the same transcript.
      2) only if step 1 paired nothing: alignments grouped under the
         same transcript cluster by map_reads_to_references()
      3) only if nothing concordant or strand-discordant was found:
         the candidates produced by find_discordant_pairs() after each
         mate was reduced with select_best_mismatch_strata()

    Concordant pairs always win over strand-discordant pairs, and
    transcript pairs win over cluster pairs.

    Returns a 3-tuple:
      1) (r1, r2) pairs aligning to genes (pairs may be discordant)
      2) (r1, r2) pairs aligning to genome (pairs may be discordant)
      3) an empty list, or the strata-reduced pe_reads when no pairing
         of any kind could be made
    """
    same_strand = LibraryTypes.same_strand(library_type)
    refdict, clusterdict = map_reads_to_references(pe_reads,
                                                   tid_tx_cluster_map)
    # tagged copies of paired reads, grouped by where/how they paired
    tx_concordant, tx_discordant = [], []
    gene_concordant, gene_discordant = [], []
    genome_concordant, genome_discordant = [], []
    #
    # pass 1: mates aligned to the same reference
    #
    paired_any = False
    for tid, mates in refdict.iteritems():
        if len(mates[0]) == 0 or len(mates[1]) == 0:
            # only one mate aligns here, nothing to pair
            continue
        # per the original logic, ids missing from the map are genomic
        is_genomic = tid not in tid_genome_map
        for r1 in mates[0]:
            for r2 in mates[1]:
                # library type dictates whether strands must be equal
                strands_agree = (same_strand ==
                                 (r1.is_reverse == r2.is_reverse))
                if is_genomic:
                    # span from the leftmost start to the other mate's end
                    if r1.pos > r2.pos:
                        isize = r1.aend - r2.pos
                    else:
                        isize = r2.aend - r1.pos
                    if not (isize <= max_isize):
                        # too far apart to be one fragment
                        continue
                    if strands_agree:
                        tag_value = DiscordantTags.CONCORDANT_GENOME
                        bucket = genome_concordant
                    else:
                        tag_value = DiscordantTags.DISCORDANT_STRAND_GENOME
                        bucket = genome_discordant
                elif strands_agree:
                    tag_value = DiscordantTags.CONCORDANT_TX
                    bucket = tx_concordant
                else:
                    # same transcript but wrong strand, which could
                    # happen in certain wacky cases
                    tag_value = DiscordantTags.DISCORDANT_STRAND_TX
                    bucket = tx_discordant
                paired_any = True
                mate1 = copy_read(r1)
                mate2 = copy_read(r2)
                bucket.append((mate1, mate2))
                pair_reads(mate1, mate2,
                           [(DISCORDANT_TAG_NAME, tag_value)])
    #
    # pass 2: nothing paired on a shared reference, so fall back to
    # mates that land in the same transcript cluster
    #
    if not paired_any:
        for cluster_id, mates in clusterdict.iteritems():
            if len(mates[0]) == 0 or len(mates[1]) == 0:
                continue
            for r1 in mates[0]:
                for r2 in mates[1]:
                    if same_strand == (r1.is_reverse == r2.is_reverse):
                        tag_value = DiscordantTags.CONCORDANT_GENE
                        bucket = gene_concordant
                    else:
                        tag_value = DiscordantTags.DISCORDANT_STRAND_GENE
                        bucket = gene_discordant
                    mate1 = copy_read(r1)
                    mate2 = copy_read(r2)
                    bucket.append((mate1, mate2))
                    pair_reads(mate1, mate2,
                               [(DISCORDANT_TAG_NAME, tag_value)])
    # concordant pairs take priority: transcript first, then cluster
    gene_pairs = tx_concordant or gene_concordant
    if gene_pairs or genome_concordant:
        return gene_pairs, genome_concordant, []
    # otherwise report pairs that violate the strand requirement but
    # are still colocalized on the same gene/chromosome
    gene_pairs = tx_discordant or gene_discordant
    if gene_pairs or genome_discordant:
        return gene_pairs, genome_discordant, []
    #
    # no pairing was found, so the fragment is assumed discordant.
    #
    # TODO: now that we know that the reads are discordant, no reason
    # to keep all the mappings hanging around if there is a small subset
    # with a small number of mismatches.  is this the right thing to do
    # here?
    #
    pe_reads = (select_best_mismatch_strata(pe_reads[0]),
                select_best_mismatch_strata(pe_reads[1]))
    # enumerate read1/read2 combinations as putative discordant pairs
    gene_pairs, genome_pairs, combo_pairs = find_discordant_pairs(
        pe_reads, tid_genome_map, library_type)
    if len(gene_pairs) > 0 or len(genome_pairs) > 0:
        return gene_pairs, genome_pairs, []
    if len(combo_pairs) > 0:
        return combo_pairs, [], []
    # last resort: complex mappings that cannot be explained and that
    # warrant further investigation
    return [], [], pe_reads
def classify_read_pairs(pe_reads, max_isize, library_type, tid_tx_map):
    """
    Pair the read1/read2 alignments of a single fragment.

    Each (read1, read2) combination that can be paired is copied with
    copy_read(), tagged through pair_reads() with a DiscordantTags value
    stored under DISCORDANT_TAG_NAME, and collected.

    Pairing is attempted first among alignments that share a reference
    (transcript); only if that yields nothing, among alignments grouped
    in the same cluster by map_reads_to_references().  max_isize is
    accepted but is not consulted by this implementation.

    Returns a 3-tuple; at most one element is non-empty:
      1) pairs found on the same transcript/cluster.  Strand-concordant
         pairs are preferred; pairs violating the library strand
         requirement are returned here only when no concordant pair
         exists.
      2) the best scoring pairs from find_discordant_pairs(), used when
         nothing paired on a transcript or cluster
      3) the original pe_reads when no pairing of any kind was possible
    """
    same_strand = LibraryTypes.same_strand(library_type)
    refdict, clusterdict = map_reads_to_references(pe_reads, tid_tx_map)
    tx_concordant = []
    tx_discordant = []
    cluster_concordant = []
    cluster_discordant = []
    #
    # pass 1: both mates aligned to the same transcript
    #
    paired_any = False
    for tid, mates in refdict.iteritems():
        if len(mates[0]) == 0 or len(mates[1]) == 0:
            # only one mate aligns to this reference
            continue
        for r1 in mates[0]:
            for r2 in mates[1]:
                paired_any = True
                # read strands must agree with the library type
                if same_strand == (r1.is_reverse == r2.is_reverse):
                    tag_value = DiscordantTags.CONCORDANT_TX
                    bucket = tx_concordant
                else:
                    # hit to same gene with wrong strand, which
                    # could happen in certain wacky cases
                    tag_value = DiscordantTags.DISCORDANT_STRAND_TX
                    bucket = tx_discordant
                mate1 = copy_read(r1)
                mate2 = copy_read(r2)
                bucket.append((mate1, mate2))
                pair_reads(mate1, mate2,
                           [(DISCORDANT_TAG_NAME, tag_value)])
    #
    # pass 2: no shared transcript, so look within transcript clusters
    #
    if not paired_any:
        for cluster_id, mates in clusterdict.iteritems():
            if len(mates[0]) == 0 or len(mates[1]) == 0:
                continue
            for r1 in mates[0]:
                for r2 in mates[1]:
                    if same_strand == (r1.is_reverse == r2.is_reverse):
                        tag_value = DiscordantTags.CONCORDANT_GENE
                        bucket = cluster_concordant
                    else:
                        tag_value = DiscordantTags.DISCORDANT_STRAND_GENE
                        bucket = cluster_discordant
                    mate1 = copy_read(r1)
                    mate2 = copy_read(r2)
                    bucket.append((mate1, mate2))
                    pair_reads(mate1, mate2,
                               [(DISCORDANT_TAG_NAME, tag_value)])
    # concordant pairs win outright; transcript beats cluster
    gene_pairs = tx_concordant or cluster_concordant
    if gene_pairs:
        return gene_pairs, [], []
    # next best: colocalized pairs that break the strand requirement
    gene_pairs = tx_discordant or cluster_discordant
    if gene_pairs:
        return gene_pairs, [], []
    #
    # nothing paired, so the fragment is assumed discordant: build all
    # valid read1/read2 combinations as putative discordant pairs
    #
    candidates = find_discordant_pairs(pe_reads, library_type)
    if len(candidates) > 0:
        # keep only the pairs with the best summed alignment score
        return [], select_best_scoring_pairs(candidates), []
    #
    # no valid pairs: the alignments are either artifacts or are not
    # supported by the current transcript annotations
    #
    return [], [], pe_reads
def get_argument_parser():
    """
    Build the command-line parser for a chimerascan run.

    Positional arguments are index_dir, read1, read2 and output_dir.
    Optional arguments cover general run settings, trimming, fragment
    and segment lengths, and multihit limits; chimera filtering settings
    live in a separate 'Filtering options' group.  Defaults come from
    module-level DEFAULT_* constants and from the config module.

    Returns the configured argparse.ArgumentParser; nothing is parsed
    here.
    """
    parser = argparse.ArgumentParser(usage="%(prog)s [options] <index> "
                                     "<mate1.fq> <mate2.fq> <output_dir>")
    # required (positional) arguments
    parser.add_argument("index_dir", default=None,
                        help="Location of chimerascan index directory")
    parser.add_argument("read1", default=None,
                        help="Path to read1 FASTQ file")
    parser.add_argument("read2", default=None,
                        help="Path to read2 FASTQ file")
    parser.add_argument("output_dir", default=None,
                        help="Location of output files")
    # standard options
    parser.add_argument('--version', action='version',
                        version='%s' % __version__)
    parser.add_argument("--config-file", dest="config_file",
                        help="Load parameters from a XML file "
                        "generated during a previous run ",
                        default=None)
    parser.add_argument("-v", "--verbose", dest="verbose",
                        action="store_true", default=False,
                        help="enable verbose logging output "
                        "[default=%(default)s]")
    parser.add_argument("-p", "--processors", dest="num_processors",
                        type=int, default=DEFAULT_NUM_PROCESSORS,
                        help="Number of processor cores to allocate to "
                        "chimerascan [default=%(default)s]")
    # --keep-tmp and --rm-tmp write to the same destination, so they act
    # as an on/off switch for a single keep_tmp setting
    parser.add_argument("--keep-tmp", dest="keep_tmp",
                        action="store_true",
                        default=DEFAULT_KEEP_TMP,
                        help="DO NOT delete intermediate files after "
                        "run [default=%(default)s]")
    parser.add_argument("--rm-tmp", dest="keep_tmp",
                        action="store_false",
                        help="Delete intermediate files after run "
                        "[default=%s]" % str(not DEFAULT_KEEP_TMP))
    parser.add_argument("--quals", dest="quals",
                        choices=FASTQ_QUAL_FORMATS,
                        default=DEFAULT_FASTQ_QUAL_FORMAT, metavar="FMT",
                        help="FASTQ quality score format "
                        "[default=%(default)s]")
    parser.add_argument('--library-type', dest="library_type",
                        choices=LibraryTypes.choices(),
                        default=DEFAULT_LIBRARY_TYPE,
                        help="Library type [default=%(default)s]")
    parser.add_argument("--isize-mean", dest="isize_mean", type=int,
                        default=DEFAULT_ISIZE_MEAN, metavar="N",
                        help="Mean insert size to sample from when "
                        "insert size distribution cannot be determined "
                        "empirically [default=%(default)s]")
    parser.add_argument("--isize-stdev", dest="isize_stdev", type=float,
                        default=DEFAULT_ISIZE_STDEV, metavar="N",
                        help="Insert size standard deviation to sample "
                        "from when insert size distribution cannot be "
                        "determined empirically [default=%(default)s]")
    parser.add_argument("--trim5", type=int, dest="trim5",
                        default=DEFAULT_TRIM5, metavar="N",
                        help="Trim N bases from 5' end of read")
    parser.add_argument("--trim3", type=int, dest="trim3",
                        default=DEFAULT_TRIM3, metavar="N",
                        help="Trim N bases from 3' end of read")
    parser.add_argument("--min-fragment-length", type=int,
                        dest="min_fragment_length",
                        default=config.DEFAULT_MIN_FRAG_LENGTH,
                        help="Smallest expected fragment length "
                        "[default=%(default)s]")
    parser.add_argument("--max-fragment-length", type=int,
                        dest="max_fragment_length",
                        default=config.DEFAULT_MAX_FRAG_LENGTH,
                        help="Largest expected fragment length (reads "
                        "less than this fragment length are assumed to "
                        "be unspliced and contiguous) "
                        "[default=%(default)s]")
    parser.add_argument("--segment-length", type=int,
                        dest="segment_length",
                        default=DEFAULT_SEGMENT_LENGTH, metavar="N",
                        help="Override size of soft-clipped read "
                        "segments during discordant alignment phase "
                        "(determined empirically by default)")
    parser.add_argument("--multihits", type=int, dest="max_multihits",
                        default=config.DEFAULT_MAX_MULTIHITS, metavar="N",
                        help="Maximum alignments allowed for each "
                        "discordant read")
    # NOTE(review): this help text is identical to --multihits and looks
    # copy-pasted; confirm what distinguishes the "local" limit and
    # reword accordingly.
    parser.add_argument("--local-multihits", type=int,
                        dest="local_multihits",
                        default=config.DEFAULT_LOCAL_MULTIHITS,
                        metavar="N",
                        help="Maximum alignments allowed for each "
                        "discordant read")
    parser.add_argument("--local-anchor-length", type=int,
                        dest="local_anchor_length",
                        default=config.DEFAULT_LOCAL_ANCHOR_LENGTH,
                        metavar="N",
                        help="Number of bases that read must span "
                        "on each side of a chimera to be considered "
                        "a valid breakpoint read")
    # filtering options
    group = parser.add_argument_group('Filtering options')
    group.add_argument("--filter-num-frags", type=float,
                       default=config.DEFAULT_FILTER_FRAGS,
                       dest="filter_num_frags", metavar="N",
                       help="Filter chimeras with less than N "
                       "aligned fragments [default=%(default)s]")
    # the closing bracket after the default was missing in this help
    # text, leaving "[default=..." unbalanced in --help output
    group.add_argument("--filter-allele-fraction", type=float,
                       default=config.DEFAULT_FILTER_ALLELE_FRACTION,
                       dest="filter_allele_fraction", metavar="X",
                       help="Filter chimeras with expression less than "
                       "the specified fraction of the total expression "
                       "level [default=%(default)s]")
    group.add_argument("--mask-biotypes-file",
                       default="", dest="mask_biotypes_file",
                       help="File containing list of gene biotypes "
                       "to ignore (ex. pseudogenes, rRNA)")
    group.add_argument("--mask-rnames-file",
                       default="", dest="mask_rnames_file",
                       help="File containing list of reference names "
                       "to ignore (ex. MT or chrM)")
    return parser
def classify_read_pairs(pe_reads, max_isize, library_type, tid_tx_map):
    """
    Examine all alignments of one fragment and try to pair read1 with
    read2.

    Paired reads are copies made with copy_read() that pair_reads() has
    annotated with a DiscordantTags value under DISCORDANT_TAG_NAME.
    Alignments sharing a reference (transcript) are tried first; the
    cluster grouping returned by map_reads_to_references() is tried only
    when no transcript-level pair exists.  max_isize is part of the
    signature but is not used in this implementation.

    Returns a 3-tuple in which at most one element is non-empty:
      1) transcript/cluster pairs (strand-concordant pairs if any,
         otherwise pairs that violate the library strand requirement)
      2) best scoring putative discordant pairs taken from
         find_discordant_pairs()
      3) pe_reads unchanged, when no valid pair could be formed
    """
    same_strand = LibraryTypes.same_strand(library_type)
    refdict, clusterdict = map_reads_to_references(pe_reads, tid_tx_map)
    concordant_tx, discordant_tx = [], []
    concordant_cluster, discordant_cluster = [], []

    def _pair_all(mates, ok_tag, ok_pairs, bad_tag, bad_pairs):
        # Tag and collect every read1 x read2 combination in `mates`;
        # returns True when at least one pair was produced.
        if len(mates[0]) == 0 or len(mates[1]) == 0:
            # alignments exist for one mate only
            return False
        for r1 in mates[0]:
            for r2 in mates[1]:
                # read strands must agree with library type
                strand_ok = (same_strand ==
                             (r1.is_reverse == r2.is_reverse))
                first = copy_read(r1)
                second = copy_read(r2)
                if strand_ok:
                    ok_pairs.append((first, second))
                    tags = [(DISCORDANT_TAG_NAME, ok_tag)]
                else:
                    bad_pairs.append((first, second))
                    tags = [(DISCORDANT_TAG_NAME, bad_tag)]
                pair_reads(first, second, tags)
        return True

    #
    # first, pair reads that map to the same transcript; a wrong-strand
    # hit to the same gene could happen in certain wacky cases
    #
    found_pair = False
    for tid, tid_mates in refdict.iteritems():
        if _pair_all(tid_mates,
                     DiscordantTags.CONCORDANT_TX, concordant_tx,
                     DiscordantTags.DISCORDANT_STRAND_TX, discordant_tx):
            found_pair = True
    # no suitable pairing yet, so search within the transcript cluster
    if not found_pair:
        for cluster_id, cluster_mates in clusterdict.iteritems():
            _pair_all(cluster_mates,
                      DiscordantTags.CONCORDANT_GENE, concordant_cluster,
                      DiscordantTags.DISCORDANT_STRAND_GENE,
                      discordant_cluster)
    # all combinations tried: concordant pairs are returned without
    # considering discordant ones, then strand-violating pairs that
    # remain colocalized on the same gene/chromosome
    for tx_pairs, cluster_pairs in ((concordant_tx, concordant_cluster),
                                    (discordant_tx, discordant_cluster)):
        if tx_pairs:
            return tx_pairs, [], []
        if cluster_pairs:
            return cluster_pairs, [], []
    #
    # no read pairings were found so the fragment is assumed to be
    # discordant; create all valid read1/read2 combinations as putative
    # discordant read pairs
    #
    pairs = find_discordant_pairs(pe_reads, library_type)
    if len(pairs) == 0:
        # no valid pairs: either alignment artifacts or a pair that the
        # current transcript annotations do not support
        return [], [], pe_reads
    # retain the pairs with the best sum of alignment scores
    pairs = select_best_scoring_pairs(pairs)
    return [], pairs, []