Ejemplo n.º 1
0
def classify_mapped_reads_new(bamfile,
                              settings=get_setting('CHIMERAS_SETTINGS'),
                              file_format='fastq',
                              mate_length_range=[2000, 4000],
                              out_format=SEQITEM):
    '''It classifies every pair of mates as non chimeric, chimeric or unknown.

    It is a generator. For each group of alignments returned by
    _group_alignments_by_reads it yields a two item list: [pair, kind].

    bamfile -- alignment file handed to _group_alignments_by_reads and to the
               classification helpers.
    settings -- mapping with the keys MAX_COINCIDENCES, MAX_MAPQ_DIFFERENCE,
                MAX_DISTANCE_TO_END, MAX_CLIPPING and MAX_PE_LEN.
    file_format -- format given to alignedread_to_seqitem. It is only used
                   when out_format is SEQITEM.
    mate_length_range -- passed untouched to _mates_are_not_chimeric. The
                         default list is shared between calls, but this
                         function never modifies it.
    out_format -- SEQITEM to get the first alignment of every mate converted
                  with alignedread_to_seqitem, 'aligned_read' to get the
                  split alignments as they are.

    kind is one of NON_CHIMERIC, CHIMERA or UNKNOWN.

    It raises a ValueError if out_format is not one of the supported values.
    The error shows up when the first item is requested, because this is a
    generator.
    '''
    # An unsupported format used to leave 'pair' unbound and the function
    # failed with an UnboundLocalError once the first pair was classified.
    if out_format not in (SEQITEM, 'aligned_read'):
        msg = 'Unknown out_format: %r' % (out_format,)
        raise ValueError(msg)
    convert_to_seqitems = out_format == SEQITEM

    #settings. Include in function properties with default values
    max_coincidences = settings['MAX_COINCIDENCES']
    max_mapq_difference = settings['MAX_MAPQ_DIFFERENCE']
    limit = settings['MAX_DISTANCE_TO_END']
    max_clipping = settings['MAX_CLIPPING']
    max_pe_len = settings['MAX_PE_LEN']

    #It tries to find out the kind of each pair of sequences
    for grouped_mates in _group_alignments_by_reads(bamfile):
        mates_alignments = _split_mates(grouped_mates)
        # The non chimeric test goes first, only the pairs that fail it are
        # checked for chimerism
        if _mates_are_not_chimeric(mates_alignments, max_clipping,
                                   mate_length_range, bamfile,
                                   max_coincidences, max_mapq_difference,
                                   limit):
            kind = NON_CHIMERIC
        elif _mates_are_chimeric(mates_alignments, bamfile, max_clipping,
                                 max_pe_len):
            kind = CHIMERA
        else:
            kind = UNKNOWN

        if convert_to_seqitems:
            # Only the first alignment of every mate is converted
            pair = [alignedread_to_seqitem(read[0], file_format)
                    for read in mates_alignments]
        else:
            pair = mates_alignments
        yield [pair, kind]
Ejemplo n.º 2
0
def classify_mapped_reads(bam_fhand, mate_distance,
                          settings=get_setting('CHIMERAS_SETTINGS')):
    '''It classifies sequences from bam file in chimeric, unknown and
    non chimeric, according to its distance and orientation in the reference
    sequence.

    It is a generator that yields (pair, kind) tuples. pair is a list with
    the primary alignment of every mate converted by alignedread_to_seqitem
    and kind is one of NON_CHIMERIC, CHIMERA or UNKNOWN. The pairs in which
    any of the items is None are not yielded.

    bam_fhand -- object with a name attribute, the path of the bam file.
    mate_distance -- center of the accepted mate length range. The range is
                     mate_distance -/+ settings['MATE_DISTANCE_VARIATION'].
    settings -- mapping with the keys MAX_CLIPPING, MAX_PE_LEN and
                MATE_DISTANCE_VARIATION.

    The bam file is closed when the generator is exhausted, closed or
    garbage collected.
    '''
    # settings. Include in function properties with default values
    # They are read before opening the file, so a missing key can not leave
    # an open file behind.
    max_clipping = settings['MAX_CLIPPING']
    max_pe_len = settings['MAX_PE_LEN']
    variation = settings['MATE_DISTANCE_VARIATION']
    mate_length_range = [mate_distance - variation, mate_distance + variation]

    bamfile = Samfile(bam_fhand.name)
    try:
        reference_lengths = _get_ref_lengths(bamfile)
        # It tries to find out the kind of each pair of sequences
        for grouped_mates in _group_alignments_reads_by_qname(bamfile):
            mates_alignments = _split_mates(grouped_mates)
            if _mates_are_not_chimeric(mates_alignments, max_clipping,
                                       mate_length_range, bamfile,
                                       reference_lengths):
                kind = NON_CHIMERIC
            elif _mates_are_chimeric(mates_alignments, bamfile, max_clipping,
                                     max_pe_len, reference_lengths):
                kind = CHIMERA
            else:
                kind = UNKNOWN

            pair = [alignedread_to_seqitem(_get_primary_alignment(mates))
                    for mates in mates_alignments]

            if None not in pair:
                yield pair, kind
    finally:
        # The file was opened here, so it is released here. It used to be
        # left open until the interpreter collected it.
        bamfile.close()
Ejemplo n.º 3
0
def classify_mapped_reads(bam_fhand,
                          mate_distance,
                          settings=get_setting('CHIMERAS_SETTINGS')):
    '''It classifies sequences from bam file in chimeric, unknown and
    non chimeric, according to its distance and orientation in the reference
    sequence.

    It is a generator that yields (pair, kind) tuples. pair is a list with
    the primary alignment of every mate converted by alignedread_to_seqitem
    and kind is one of NON_CHIMERIC, CHIMERA or UNKNOWN. The pairs in which
    any of the items is None are not yielded.

    bam_fhand -- object with a name attribute, the path of the bam file.
    mate_distance -- center of the accepted mate length range. The range is
                     mate_distance -/+ settings['MATE_DISTANCE_VARIATION'].
    settings -- mapping with the keys MAX_CLIPPING, MAX_PE_LEN and
                MATE_DISTANCE_VARIATION.

    The bam file is closed when the generator is exhausted, closed or
    garbage collected.
    '''
    # settings. Include in function properties with default values
    # They are read before opening the file, so a missing key can not leave
    # an open file behind.
    max_clipping = settings['MAX_CLIPPING']
    max_pe_len = settings['MAX_PE_LEN']
    variation = settings['MATE_DISTANCE_VARIATION']
    mate_length_range = [mate_distance - variation, mate_distance + variation]

    bamfile = AlignmentFile(bam_fhand.name)
    try:
        reference_lengths = _get_ref_lengths(bamfile)
        # It tries to find out the kind of each pair of sequences
        for grouped_mates in _group_alignments_reads_by_qname(bamfile):
            mates_alignments = _split_mates(grouped_mates)
            if _mates_are_not_chimeric(mates_alignments, max_clipping,
                                       mate_length_range, bamfile,
                                       reference_lengths):
                kind = NON_CHIMERIC
            elif _mates_are_chimeric(mates_alignments, bamfile, max_clipping,
                                     max_pe_len, reference_lengths):
                kind = CHIMERA
            else:
                kind = UNKNOWN

            pair = [
                alignedread_to_seqitem(_get_primary_alignment(mates))
                for mates in mates_alignments
            ]

            if None not in pair:
                yield pair, kind
    finally:
        # The file was opened here, so it is released here. It used to be
        # left open until the interpreter collected it.
        bamfile.close()
Ejemplo n.º 4
0
 def _do_trim(self, aligned_reads):
     '''It returns the primary alignment as a seq item, with the region to
     trim annotated when there is one.

     The alignment chosen by _get_longest_5end_alinged_read decides the
     trimming. When there is no such alignment, or when it is totally mapped
     (given self.max_clipping), the seq item is returned without segments.
     Otherwise a single segment of kind OTHER is added, it goes from the
     position calculated for that alignment to the last position of the
     read.
     '''
     clipping_limit = self.max_clipping
     primary = _get_primary_alignment(aligned_reads)
     five_end = _get_longest_5end_alinged_read(aligned_reads, clipping_limit)
     seq = alignedread_to_seqitem(primary)

     # Nothing to annotate in these two cases
     if five_end is None:
         return seq
     if _read_is_totally_mapped([five_end], clipping_limit):
         return seq

     # The start of the segment depends on the strand of the alignment
     if five_end.is_reverse:
         trim_start = get_length(seq) - _get_qstart(five_end)
     else:
         trim_start = _get_qend(five_end)
     trim_segments = [(trim_start, get_length(seq) - 1)]
     _add_trim_segments(trim_segments, seq, kind=OTHER)
     return seq
Ejemplo n.º 5
0
 def _do_trim(self, aligned_reads):
     '''It converts the primary alignment into a seq item and marks in it
     the segment that should be trimmed, if any.

     A segment of kind OTHER is added only when
     _get_longest_5end_alinged_read returns an alignment and that alignment
     is not totally mapped according to self.max_clipping. The segment ends
     in the last position of the read. The seq item is always returned.
     '''
     max_clip = self.max_clipping
     main_alignment = _get_primary_alignment(aligned_reads)
     anchor = _get_longest_5end_alinged_read(aligned_reads, max_clip)
     seq = alignedread_to_seqitem(main_alignment)

     must_trim = (anchor is not None and
                  not _read_is_totally_mapped([anchor], max_clip))
     if must_trim:
         # Reverse alignments are measured from the other end of the read
         if anchor.is_reverse:
             first_pos = get_length(seq) - _get_qstart(anchor)
         else:
             first_pos = _get_qend(anchor)
         last_pos = get_length(seq) - 1
         _add_trim_segments([(first_pos, last_pos)], seq, kind=OTHER)
     return seq
Ejemplo n.º 6
0
def _report_stat(label, value):
    '''It writes a statistic to stdout as "label value".

    The items are joined by one space, like the print statement of Python 2
    does, so the output does not depend on the Python version.
    '''
    print('%s %s' % (label, value))


def classify_mapped_reads(bamfile, settings=get_setting('CHIMERAS_SETTINGS'),
                          paired_result=True, file_format='fastq'):
    '''It classifies the mapped reads as chimeric, non chimeric or unknown.

    It is a generator. Once all the alignments have been consumed it prints
    some statistics to stdout.

    bamfile -- alignment file handed to _group_alignments_by_reads and to the
               helpers that look at the ends of the references.
    settings -- mapping with the keys MAX_COINCIDENCES, MAX_MAPQ_DIFFERENCE,
                MAX_DISTANCE_TO_END, MAX_CLIPPING, MAX_PE_LEN and MIN_MP_LEN.
    paired_result -- if it is true one [reads, kind] list is yielded per
                     pair. The kind of the pair is CHIMERA if any of its
                     reads is chimeric, UNKNOWN if any of them is unknown,
                     NON_CHIMERIC otherwise. If it is false one
                     ([read], kind) tuple is yielded per read.
    file_format -- format given to alignedread_to_seqitem.
    '''
    #settings. Include in function properties with default values
    max_coincidences = settings['MAX_COINCIDENCES']
    max_mapq_difference = settings['MAX_MAPQ_DIFFERENCE']
    limit = settings['MAX_DISTANCE_TO_END']
    max_clipping = settings['MAX_CLIPPING']
    max_pe_len = settings['MAX_PE_LEN']
    min_mp_len = settings['MIN_MP_LEN']

    #parameters for statistics
    len_mp_non_chimeric = 0
    n_non_chimeric_mates = 0
    len_pe_sum = 0
    len_mp_sum = 0
    typic_chimeras = 0
    pe_like_chimeras = 0
    #It tries to find out the kind of each pair of sequences
    for grouped_mates in _group_alignments_by_reads(bamfile):
        mates_alignments = _split_mates(grouped_mates)
        reads = []
        kinds = []
        # _get_mate is given the 1-based position of the read in the pair
        for i, alignments_group in enumerate(mates_alignments, 1):
            mate = _get_mate(i, mates_alignments)
            primary_mate = _get_primary_alignment(mate)
            primary_alignment = _get_primary_alignment(alignments_group)
            if _read_is_totally_mapped(alignments_group, max_clipping):
                if primary_alignment.mate_is_unmapped:
                    kind = UNKNOWN
                elif primary_alignment.rname != primary_alignment.rnext:
                    kind = NON_CHIMERIC
                else:
                    mates = [primary_mate, primary_alignment]
                    distance = _find_distance(mates)
                    if _mapped_read_is_chimeric(primary_alignment, mate,
                                                max_mapq_difference,
                                                max_pe_len):
                        kind = CHIMERA
                    elif _mates_are_outies(mates) and distance > min_mp_len:
                        len_mp_non_chimeric += abs(distance)
                        n_non_chimeric_mates += 1
                        kind = NON_CHIMERIC
                    else:
                        kind = UNKNOWN
            else:
                fragment = _find_secondary_fragment(alignments_group,
                                                    max_coincidences,
                                                    max_mapq_difference=100)
                #For fragmented reads. likely to be chimeric
                if fragment is not None:
                    fragments = [primary_alignment, fragment]
                    if _alignments_in_same_ref(fragments):
                        # The typical chimeras only matter for the
                        # statistics, the kind is CHIMERA in any case
                        if _read_is_typical_chimera(fragments, max_pe_len,
                                                    min_mp_len, mate):
                            len_pe, len_mp = _find_distances_with_mate(
                                                              fragments, mate)
                            len_pe_sum += len_pe
                            len_mp_sum += len_mp
                            typic_chimeras += 1
                        kind = CHIMERA
                    elif (_alignment_at_ends_of_reference(fragments[0],
                                                          limit, bamfile)
                          and _alignment_at_ends_of_reference(fragments[1],
                                                              limit, bamfile)):
                        kind = _guess_kind_fragments_at_ends(fragments,
                                                             bamfile, limit)
                    else:
                        kind = CHIMERA
                #Find PE-like chimeras in partially mapping reads
                elif (not primary_alignment.is_unmapped
                      and _3end_mapped(primary_alignment, max_clipping)
                      and primary_alignment.rname == primary_alignment.rnext
                      and _mapped_read_is_chimeric(primary_alignment, mate,
                                                   max_mapq_difference,
                                                   max_pe_len)):
                    pe_like_chimeras += 1
                    kind = CHIMERA
                else:
                    kind = UNKNOWN

            seq = alignedread_to_seqitem(alignments_group[0], file_format)
            # Any false value selects the per read output. It used to be
            # compared with False, so a None yielded nothing at all.
            if not paired_result:
                yield [seq], kind
            else:
                reads.append(seq)
                kinds.append(kind)
        if paired_result:
            if CHIMERA in kinds:
                yield [reads, CHIMERA]
            elif UNKNOWN in kinds:
                yield [reads, UNKNOWN]
            else:
                yield [reads, NON_CHIMERIC]

    # Every mean is guarded on its own. With a single try block one empty
    # category discarded the whole report, the counts included.
    if n_non_chimeric_mates:
        mean_mp_non_chimeric = (len_mp_non_chimeric /
                                float(n_non_chimeric_mates))
        _report_stat('MP mean length in non chimeric reads: ',
                     mean_mp_non_chimeric)
    _report_stat('Typical chimera number: ', typic_chimeras)
    _report_stat('PE-like chimera number: ', pe_like_chimeras)
    if typic_chimeras:
        mean_pe_len = len_pe_sum / float(typic_chimeras)
        mean_mp_len = len_mp_sum / float(typic_chimeras)
        _report_stat('PE mean length in chimeric reads: ', mean_pe_len)
        _report_stat('MP mean length in chimeric reads: ', mean_mp_len)
    if not n_non_chimeric_mates or not typic_chimeras:
        print('typic chimeras or non_chimeric_mates not found')