def classify_mapped_reads_new(bamfile, settings=get_setting('CHIMERAS_SETTINGS'),
                              file_format='fastq',
                              mate_length_range=[2000, 4000],
                              out_format=SEQITEM):
    '''It classifies every group of mates found in the bam file.

    Each group of alignments is tagged as NON_CHIMERIC, CHIMERA or UNKNOWN
    using _mates_are_not_chimeric and _mates_are_chimeric, in that order.

    bamfile -- alignment file handed to _group_alignments_by_reads and to
               the classification helpers.
    settings -- mapping that must provide MAX_COINCIDENCES,
                MAX_MAPQ_DIFFERENCE, MAX_DISTANCE_TO_END, MAX_CLIPPING and
                MAX_PE_LEN.
    file_format -- forwarded to alignedread_to_seqitem when out_format is
                   SEQITEM.
    mate_length_range -- forwarded to _mates_are_not_chimeric.
    out_format -- SEQITEM to get the first alignment of every mate
                  converted with alignedread_to_seqitem, 'aligned_read' to
                  get the split alignments untouched.

    It yields [pair, kind] lists.
    It raises ValueError, when the generator is first advanced, if
    out_format is not one of the two supported values.
    '''
    # Without this check an unsupported out_format left `pair` unbound and
    # failed with an UnboundLocalError at the first yield.
    if out_format not in (SEQITEM, 'aligned_read'):
        raise ValueError('Unsupported out_format: %r' % (out_format,))
    as_seqitems = out_format == SEQITEM

    #settings. Include in function properties with default values
    max_coincidences = settings['MAX_COINCIDENCES']
    max_mapq_difference = settings['MAX_MAPQ_DIFFERENCE']
    limit = settings['MAX_DISTANCE_TO_END']
    max_clipping = settings['MAX_CLIPPING']
    max_pe_len = settings['MAX_PE_LEN']

    # The default is a list shared by every call; hand the helpers a private
    # copy so nothing downstream can alter it for later callers.
    mate_length_range = list(mate_length_range)

    #It tries to find out the kind of each pair of sequences
    for grouped_mates in _group_alignments_by_reads(bamfile):
        mates_alignments = _split_mates(grouped_mates)
        if _mates_are_not_chimeric(mates_alignments, max_clipping,
                                   mate_length_range, bamfile,
                                   max_coincidences, max_mapq_difference,
                                   limit):
            kind = NON_CHIMERIC
        elif _mates_are_chimeric(mates_alignments, bamfile, max_clipping,
                                 max_pe_len):
            kind = CHIMERA
        else:
            kind = UNKNOWN

        if as_seqitems:
            pair = [alignedread_to_seqitem(read[0], file_format)
                    for read in mates_alignments]
        else:
            pair = mates_alignments
        yield [pair, kind]
def classify_mapped_reads(bam_fhand, mate_distance,
                          settings=get_setting('CHIMERAS_SETTINGS')):
    '''It classifies sequences from bam file in chimeric, unknown and
    non chimeric, according to its distance and orientation in the reference
    sequence.

    bam_fhand -- file handle; only its name attribute is used, to open the
                 alignment file with Samfile.
    mate_distance -- centre of the accepted mate distance window; the window
                     is mate_distance +/- settings['MATE_DISTANCE_VARIATION'].
    settings -- mapping that must provide MAX_CLIPPING, MAX_PE_LEN and
                MATE_DISTANCE_VARIATION.

    It yields (pair, kind) tuples. pair holds the seqitems built from the
    primary alignment of every mate and kind is NON_CHIMERIC, CHIMERA or
    UNKNOWN. Groups for which any seqitem is None are skipped.
    '''
    bamfile = Samfile(bam_fhand.name)
    # The file is opened here, so it is also released here: the finally
    # clause runs when the generator is exhausted, closed or collected.
    try:
        # settings. Include in function properties with default values
        max_clipping = settings['MAX_CLIPPING']
        max_pe_len = settings['MAX_PE_LEN']
        variation = settings['MATE_DISTANCE_VARIATION']
        mate_length_range = [mate_distance - variation,
                             mate_distance + variation]
        reference_lengths = _get_ref_lengths(bamfile)

        # It tries to find out the kind of each pair of sequences
        for grouped_mates in _group_alignments_reads_by_qname(bamfile):
            mates_alignments = _split_mates(grouped_mates)

            if _mates_are_not_chimeric(mates_alignments, max_clipping,
                                       mate_length_range, bamfile,
                                       reference_lengths):
                kind = NON_CHIMERIC
            elif _mates_are_chimeric(mates_alignments, bamfile, max_clipping,
                                     max_pe_len, reference_lengths):
                kind = CHIMERA
            else:
                kind = UNKNOWN

            pair = [alignedread_to_seqitem(_get_primary_alignment(mates))
                    for mates in mates_alignments]

            # A None seqitem means one of the mates could not be converted;
            # such groups are not reported.
            if None not in pair:
                yield pair, kind
    finally:
        bamfile.close()
def classify_mapped_reads(bam_fhand, mate_distance,
                          settings=get_setting('CHIMERAS_SETTINGS')):
    '''It classifies sequences from bam file in chimeric, unknown and
    non chimeric, according to its distance and orientation in the reference
    sequence.

    bam_fhand -- file handle; only its name attribute is used, to open the
                 alignment file with AlignmentFile.
    mate_distance -- centre of the accepted mate distance window; the window
                     is mate_distance +/- settings['MATE_DISTANCE_VARIATION'].
    settings -- mapping that must provide MAX_CLIPPING, MAX_PE_LEN and
                MATE_DISTANCE_VARIATION.

    It yields (pair, kind) tuples. pair holds the seqitems built from the
    primary alignment of every mate and kind is NON_CHIMERIC, CHIMERA or
    UNKNOWN. Groups for which any seqitem is None are skipped.
    '''
    bamfile = AlignmentFile(bam_fhand.name)
    # The file is opened here, so it is also released here: the finally
    # clause runs when the generator is exhausted, closed or collected.
    try:
        # settings. Include in function properties with default values
        max_clipping = settings['MAX_CLIPPING']
        max_pe_len = settings['MAX_PE_LEN']
        variation = settings['MATE_DISTANCE_VARIATION']
        mate_length_range = [mate_distance - variation,
                             mate_distance + variation]
        reference_lengths = _get_ref_lengths(bamfile)

        # It tries to find out the kind of each pair of sequences
        for grouped_mates in _group_alignments_reads_by_qname(bamfile):
            mates_alignments = _split_mates(grouped_mates)

            if _mates_are_not_chimeric(mates_alignments, max_clipping,
                                       mate_length_range, bamfile,
                                       reference_lengths):
                kind = NON_CHIMERIC
            elif _mates_are_chimeric(mates_alignments, bamfile, max_clipping,
                                     max_pe_len, reference_lengths):
                kind = CHIMERA
            else:
                kind = UNKNOWN

            pair = [
                alignedread_to_seqitem(_get_primary_alignment(mates))
                for mates in mates_alignments
            ]

            # A None seqitem means one of the mates could not be converted;
            # such groups are not reported.
            if None not in pair:
                yield pair, kind
    finally:
        bamfile.close()
def _do_trim(self, aligned_reads):
    '''It returns the seqitem of the primary alignment, with a trim segment
    added when the longest 5' end alignment is not totally mapped.

    The segment runs from the end of that alignment on the read (taking the
    strand into account) up to the last position of the sequence, and it is
    registered with kind OTHER.
    '''
    clip_limit = self.max_clipping
    primary = _get_primary_alignment(aligned_reads)
    five_end_read = _get_longest_5end_alinged_read(aligned_reads, clip_limit)
    seqitem = alignedread_to_seqitem(primary)

    # Nothing to trim without a 5' end alignment
    if five_end_read is None:
        return seqitem
    # Nothing to trim when that alignment covers the whole read
    if _read_is_totally_mapped([five_end_read], clip_limit):
        return seqitem

    if five_end_read.is_reverse:
        trim_start = get_length(seqitem) - _get_qstart(five_end_read)
    else:
        trim_start = _get_qend(five_end_read)
    trim_end = get_length(seqitem) - 1

    _add_trim_segments([(trim_start, trim_end)], seqitem, kind=OTHER)
    return seqitem
def classify_mapped_reads(bamfile, settings=get_setting('CHIMERAS_SETTINGS'),
                          paired_result=True, file_format='fastq'):
    '''It classifies the reads of a bam file as chimeric, non chimeric or
    unknown and it prints some length statistics once all reads are done.

    bamfile -- alignment file handed to _group_alignments_by_reads and to
               the helpers that look at the reference ends.
    settings -- mapping that must provide MAX_COINCIDENCES,
                MAX_MAPQ_DIFFERENCE, MAX_DISTANCE_TO_END, MAX_CLIPPING,
                MAX_PE_LEN and MIN_MP_LEN.
    paired_result -- when true one [seqitems, kind] list is yielded per
                     group of mates; CHIMERA wins over UNKNOWN and UNKNOWN
                     over NON_CHIMERIC. When false one ([seqitem], kind)
                     tuple is yielded per read.
    file_format -- forwarded to alignedread_to_seqitem.

    The statistics are written to stdout after the last group.
    '''
    #settings. Include in function properties with default values
    max_coincidences = settings['MAX_COINCIDENCES']
    max_mapq_difference = settings['MAX_MAPQ_DIFFERENCE']
    limit = settings['MAX_DISTANCE_TO_END']
    max_clipping = settings['MAX_CLIPPING']
    max_pe_len = settings['MAX_PE_LEN']
    min_mp_len = settings['MIN_MP_LEN']

    #parameters for statistics
    len_mp_non_chimeric = 0
    n_non_chimeric_mates = 0
    len_pe_sum = 0
    len_mp_sum = 0
    typic_chimeras = 0
    pe_like_chimeras = 0

    #It tries to find out the kind of each pair of sequences
    for grouped_mates in _group_alignments_by_reads(bamfile):
        mates_alignments = _split_mates(grouped_mates)
        pair = []
        # _get_mate is given the 1-based position of the current read
        for index, alignments_group in enumerate(mates_alignments, 1):
            mate = _get_mate(index, mates_alignments)
            primary_mate = _get_primary_alignment(mate)
            primary_alignment = _get_primary_alignment(alignments_group)

            if _read_is_totally_mapped(alignments_group, max_clipping):
                if primary_alignment.mate_is_unmapped:
                    kind = UNKNOWN
                elif primary_alignment.rname != primary_alignment.rnext:
                    kind = NON_CHIMERIC
                else:
                    mates = [primary_mate, primary_alignment]
                    distance = _find_distance(mates)
                    if _mapped_read_is_chimeric(primary_alignment, mate,
                                                max_mapq_difference,
                                                max_pe_len):
                        kind = CHIMERA
                    elif _mates_are_outies(mates) and distance > min_mp_len:
                        len_mp_non_chimeric += abs(distance)
                        n_non_chimeric_mates += 1
                        kind = NON_CHIMERIC
                    else:
                        kind = UNKNOWN
            else:
                # NOTE(review): the mapq difference is fixed to 100 here
                # instead of using settings['MAX_MAPQ_DIFFERENCE'] -- confirm
                # that this is intended.
                fragment = _find_secondary_fragment(alignments_group,
                                                    max_coincidences,
                                                    max_mapq_difference=100)
                if fragment is None:
                    #Find PE-like chimeras in partially mapping reads
                    if primary_alignment.is_unmapped:
                        kind = UNKNOWN
                    elif not _3end_mapped(primary_alignment, max_clipping):
                        kind = UNKNOWN
                    elif (primary_alignment.rname == primary_alignment.rnext
                          and _mapped_read_is_chimeric(primary_alignment,
                                                       mate,
                                                       max_mapq_difference,
                                                       max_pe_len)):
                        pe_like_chimeras += 1
                        kind = CHIMERA
                    else:
                        kind = UNKNOWN
                else:
                    #For fragmented reads. likely to be chimeric
                    fragments = [primary_alignment, fragment]
                    if not _alignments_in_same_ref(fragments):
                        # This case used to leave kind unset, so the read
                        # inherited the kind of the previous one (or failed
                        # on the first read). It can not be decided here.
                        kind = UNKNOWN
                    elif _read_is_typical_chimera(fragments, max_pe_len,
                                                  min_mp_len, mate):
                        len_pe, len_mp = _find_distances_with_mate(fragments,
                                                                   mate)
                        len_pe_sum += len_pe
                        len_mp_sum += len_mp
                        typic_chimeras += 1
                        #typic chimera
                        kind = CHIMERA
                    elif (_alignment_at_ends_of_reference(fragments[0], limit,
                                                          bamfile) and
                          _alignment_at_ends_of_reference(fragments[1], limit,
                                                          bamfile)):
                        kind = _guess_kind_fragments_at_ends(fragments,
                                                             bamfile, limit)
                    else:
                        kind = CHIMERA

            seqitem = alignedread_to_seqitem(alignments_group[0], file_format)
            if paired_result:
                pair.append([seqitem, kind])
            else:
                yield [seqitem], kind

        if paired_result:
            kinds = [read[1] for read in pair]
            reads = [read[0] for read in pair]
            if CHIMERA in kinds:
                yield [reads, CHIMERA]
            elif UNKNOWN in kinds:
                yield [reads, UNKNOWN]
            else:
                yield [reads, NON_CHIMERIC]

    # Every figure is reported on its own: an empty category no longer hides
    # the rest of the statistics. The single formatted argument keeps the
    # output identical under the print statement and the print function
    # (hence the two spaces after the colon).
    if n_non_chimeric_mates:
        mean_mp_non_chimeric = (len_mp_non_chimeric /
                                float(n_non_chimeric_mates))
        print('MP mean length in non chimeric reads:  %s' %
              (mean_mp_non_chimeric,))
    print('Typical chimera number:  %s' % (typic_chimeras,))
    print('PE-like chimera number:  %s' % (pe_like_chimeras,))
    if typic_chimeras:
        mean_pe_len = len_pe_sum / float(typic_chimeras)
        mean_mp_len = len_mp_sum / float(typic_chimeras)
        print('PE mean length in chimeric reads:  %s' % (mean_pe_len,))
        print('MP mean length in chimeric reads:  %s' % (mean_mp_len,))
    if not n_non_chimeric_mates or not typic_chimeras:
        print('typic chimeras or non_chimeric_mates not found')