Ejemplo n.º 1
0
def is_fragmented_ref_fake_translocation(align1, align2, ref_lens):
    # Check whether translocation is caused by fragmented reference and thus should be marked Fake misassembly
    # Return inconsistency value if translocation is fake or None if translocation is real
    # !!! it is assumed that align1.ref != align2.ref
    # assert align1.ref != align2.ref, "Internal QUAST bug: is_fragmented_ref_fake_translocation() " \
    #                                 "should be called only if align1.ref != align2.ref"
    if align1.ref == align2.ref:
        return False

    if qconfig.check_for_fragmented_ref:
        if qconfig.is_combined_ref and not is_same_reference(align1.ref, align2.ref):
            return False
        if all([d <= qconfig.fragmented_max_indent for d in __get_border_gaps(align1, align2, ref_lens)]):
            return True
    return False
Ejemplo n.º 2
0
def get_score(score, aligns, ref_lens, is_cyclic, uncovered_len, seq, region_struct_variations, penalties):
    if len(aligns) > 1:
        align1, align2 = aligns[-2], aligns[-1]
        is_fake_translocation = is_fragmented_ref_fake_translocation(align1, align2, ref_lens)
        overlaped_len = max(0, align1.end() - align2.start() + 1)
        if len(aligns) > 2:  # does not affect score and uncovered but it is important for further checking on set correctness
            exclude_internal_overlaps(aligns[-3], align1)
        reduced_len = exclude_internal_overlaps(align1, align2)  # reduced_len is for align1 only
        # check whether the set is still correct, i.e both alignments are rather large
        if min(align1.len2, align2.len2) < max(qconfig.min_cluster, qconfig.min_alignment):
            return None, None

        added_len = get_added_len(aligns, aligns[-1])
        uncovered_len -= (added_len - reduced_len)
        score += score_single_align(align2, ctg_len=added_len) - score_single_align(align1, ctg_len=reduced_len)
        is_extensive_misassembly, aux_data = is_misassembly(align1, align2, seq, ref_lens, is_cyclic, region_struct_variations,
                                                            is_fake_translocation)
        if is_extensive_misassembly:
            misassembly_penalty = penalties['extensive']
            if align1.ref != align2.ref:
                if qconfig.is_combined_ref and not is_same_reference(align1.ref, align2.ref):
                    misassembly = Misassembly.INTERSPECTRANSLOCATION
                else:
                    misassembly = Misassembly.TRANSLOCATION
            elif abs(aux_data["inconsistency"]) > qconfig.extensive_misassembly_threshold:
                    misassembly = Misassembly.RELOCATION
                    score -= float(abs(aux_data["inconsistency"])) / ref_lens[align1.ref]
            else:
                    misassembly = Misassembly.INVERSION
            score -= misassembly - Misassembly.INVERSION
        elif aux_data['is_sv']:
            misassembly_penalty = 0
        elif abs(aux_data['inconsistency']) > qconfig.MAX_INDEL_LENGTH and not aux_data['is_scaffold_gap']:
            misassembly_penalty = penalties['local']
        elif aux_data['is_scaffold_gap']:
            misassembly_penalty = penalties['scaffold']
        else:
            misassembly_penalty = 0
        overlap_penalty = min(overlaped_len * penalties['overlap_multiplier'], misassembly_penalty)
        score -= (misassembly_penalty + overlap_penalty)
    else:
        score += score_single_align(aligns[-1])
        uncovered_len -= aligns[-1].len2
    return score, uncovered_len
Ejemplo n.º 3
0
def process_misassembled_contig(sorted_aligns, cyclic, aligned_lengths, region_misassemblies, ref_lens, ref_aligns,
                                ref_features, contig_seq, references_misassemblies, region_struct_variations,
                                misassemblies_matched_sv, ca_output, is_ambiguous=False):
    misassembly_internal_overlap = 0
    prev_align = sorted_aligns[0]
    cur_aligned_length = prev_align.len2
    is_misassembled = False
    contig_is_printed = False
    indels_info = IndelsInfo()
    contig_aligned_length = 0  # for internal debugging purposes

    for i in range(len(sorted_aligns) - 1):
        next_align = sorted_aligns[i + 1]

        is_fake_translocation = is_fragmented_ref_fake_translocation(prev_align, next_align, ref_lens)
        cur_aligned_length -= exclude_internal_overlaps(prev_align, next_align, i, ca_output)
        is_extensive_misassembly, aux_data = is_misassembly(prev_align, next_align, contig_seq, ref_lens,
                                                            cyclic, region_struct_variations, is_fake_translocation)
        inconsistency = aux_data["inconsistency"]
        distance_on_contig = aux_data["distance_on_contig"]
        misassembly_internal_overlap += aux_data["misassembly_internal_overlap"]
        cyclic_moment = aux_data["cyclic_moment"]
        ca_output.icarus_out_f.write(prev_align.icarus_report_str() + '\n')
        ca_output.stdout_f.write('\t\t\tReal Alignment %d: %s\n' % (i+1, str(prev_align)))

        ref_aligns.setdefault(prev_align.ref, []).append(prev_align)
        ca_output.coords_filtered_f.write(str(prev_align) + '\n')
        if aux_data["is_sv"]:
            ca_output.stdout_f.write('\t\t\t  Not a misassembly (structural variation of the genome) between these two alignments\n')
            ca_output.icarus_out_f.write('fake: not a misassembly (structural variation of the genome)\n')
            misassemblies_matched_sv += 1
        elif aux_data["is_scaffold_gap"] and abs(inconsistency) > qconfig.extensive_misassembly_threshold:
            ca_output.stdout_f.write('\t\t\t  Incorrectly estimated size of scaffold gap between these two alignments: ')
            ca_output.stdout_f.write('gap length difference = ' + str(inconsistency) + '\n')
            region_misassemblies.append(Misassembly.SCAFFOLD_GAP)
            ca_output.icarus_out_f.write('fake: scaffold gap size wrong estimation' + '\n')
        elif is_extensive_misassembly:
            is_misassembled = True
            aligned_lengths.append(cur_aligned_length)
            contig_aligned_length += cur_aligned_length
            cur_aligned_length = 0
            if not contig_is_printed:
                ca_output.misassembly_f.write(prev_align.contig + '\n')
                contig_is_printed = True
            ca_output.misassembly_f.write('Extensive misassembly (')
            ca_output.stdout_f.write('\t\t\t  Extensive misassembly (')
            if prev_align.ref != next_align.ref:  # it is not a Fake translocation, because is_extensive_misassembly is True
                if qconfig.is_combined_ref and \
                        not is_same_reference(prev_align.ref, next_align.ref):  # if chromosomes from different references
                        region_misassemblies.append(Misassembly.INTERSPECTRANSLOCATION)
                        ref1, ref2 = get_ref_by_chromosome(prev_align.ref), get_ref_by_chromosome(next_align.ref)
                        references_misassemblies[ref1][ref2] += 1
                        references_misassemblies[ref2][ref1] += 1
                        ca_output.stdout_f.write('interspecies translocation')
                        ca_output.misassembly_f.write('interspecies translocation')
                        ca_output.icarus_out_f.write('interspecies translocation')
                else:
                    region_misassemblies.append(Misassembly.TRANSLOCATION)
                    ca_output.stdout_f.write('translocation')
                    ca_output.misassembly_f.write('translocation')
                    ca_output.icarus_out_f.write('translocation')
            elif abs(inconsistency) > qconfig.extensive_misassembly_threshold:
                region_misassemblies.append(Misassembly.RELOCATION)
                msg = 'relocation, inconsistency = ' + str(inconsistency) + \
                      (' [linear representation of circular genome]' if cyclic_moment else '')
                ca_output.stdout_f.write(msg)
                ca_output.misassembly_f.write(msg)
                ca_output.icarus_out_f.write(msg)
            else: #if strand1 != strand2:
                region_misassemblies.append(Misassembly.INVERSION)
                ca_output.stdout_f.write('inversion')
                ca_output.misassembly_f.write('inversion')
                ca_output.icarus_out_f.write('inversion')
            ca_output.stdout_f.write(') between these two alignments\n')
            ca_output.misassembly_f.write(') between %s %s and %s %s' % (prev_align.s2, prev_align.e2, next_align.s2, next_align.e2) + '\n')
            ca_output.icarus_out_f.write('\n')
            ref_features.setdefault(prev_align.ref, {})[prev_align.e1] = 'M'
            ref_features.setdefault(next_align.ref, {})[next_align.e1] = 'M'
        else:
            reason_msg = "" + (" [linear representation of circular genome]" if cyclic_moment else "") + \
                         (" [fragmentation of reference genome]" if prev_align.ref != next_align.ref else "")
            if inconsistency == 0 and cyclic_moment:
                ca_output.stdout_f.write('\t\t\t  Not a misassembly' + reason_msg + ' between these two alignments\n')
                ca_output.icarus_out_f.write('fake: not a misassembly' + reason_msg + '\n')
            elif inconsistency == 0 and prev_align.ref != next_align.ref:  # is_fragmented_ref_fake_translocation is True, because is_extensive_misassembly is False
                ca_output.stdout_f.write('\t\t\t  Not a misassembly' + reason_msg + ' between these two alignments\n')
                region_misassemblies.append(Misassembly.FRAGMENTED)
                ca_output.icarus_out_f.write('fake: not a misassembly' + reason_msg + '\n')
            elif abs(inconsistency) <= qconfig.MAX_INDEL_LENGTH and \
                            count_ns_and_not_ns_between_aligns(contig_seq, prev_align, next_align)[1] <= qconfig.MAX_INDEL_LENGTH:
                ns_number, not_ns_number = count_ns_and_not_ns_between_aligns(contig_seq, prev_align, next_align)

                if inconsistency == 0:
                    ca_output.stdout_f.write(('\t\t\t  Stretch of %d mismatches between these two alignments (number of Ns: %d)' %
                                              (not_ns_number, ns_number)) + reason_msg + '\n')
                    indels_info.mismatches += not_ns_number
                    ca_output.icarus_out_f.write('indel: stretch of mismatches' + reason_msg + '\n')
                else:
                    indel_length = abs(inconsistency)
                    indel_class = 'Indel (<= 5bp)' if indel_length <= qconfig.SHORT_INDEL_THRESHOLD else 'Indel (> 5bp)'
                    indel_type = 'insertion' if inconsistency < 0 else 'deletion'
                    mismatches = max(0, not_ns_number - indel_length)
                    ca_output.stdout_f.write(('\t\t\t  %s between these two alignments: %s of length %d; %d mismatches (number of Ns: %d)')
                                                 % (indel_class, indel_type, indel_length, mismatches, ns_number) + reason_msg + '\n')
                    indels_info.indels_list.append(indel_length)
                    if indel_type == 'insertion':
                        indels_info.insertions += indel_length
                    else:
                        indels_info.deletions += indel_length
                    indels_info.mismatches += mismatches
                    ca_output.icarus_out_f.write('indel: ' + indel_class.lower() + reason_msg + '\n')
            else:
                if qconfig.strict_NA:
                    aligned_lengths.append(cur_aligned_length)
                    contig_aligned_length += cur_aligned_length
                    cur_aligned_length = 0

                if distance_on_contig < 0:
                    #There is an overlap between the two alignments, a local misassembly
                    ca_output.stdout_f.write('\t\t\t  Overlap between these two alignments (local misassembly).')
                elif distance_on_contig > 0:
                    #There is a small gap between the two alignments, a local misassembly
                    ca_output.stdout_f.write('\t\t\t  Gap between these two alignments (local misassembly).')
                elif inconsistency < 0:
                    ca_output.stdout_f.write('\t\t\t  Overlap between these two alignments (local misassembly).')
                else:
                    ca_output.stdout_f.write('\t\t\t  Gap between these two alignments (local misassembly).')
                ca_output.stdout_f.write(' Inconsistency = ' + str(inconsistency) + reason_msg + '\n')
                ca_output.icarus_out_f.write('local misassembly' + reason_msg + '\n')
                region_misassemblies.append(Misassembly.LOCAL)

        prev_align = next_align
        cur_aligned_length += prev_align.len2 - (-distance_on_contig if distance_on_contig < 0 else 0)

    #Record the very last alignment
    i = len(sorted_aligns) - 1
    ca_output.stdout_f.write('\t\t\tReal Alignment %d: %s' % (i + 1, str(next_align)) + '\n')
    ca_output.icarus_out_f.write(next_align.icarus_report_str() + '\n')
    ref_aligns.setdefault(next_align.ref, []).append(next_align)
    ca_output.coords_filtered_f.write(str(next_align) + '\n')
    aligned_lengths.append(cur_aligned_length)
    contig_aligned_length += cur_aligned_length

    assert contig_aligned_length <= len(contig_seq), "Internal QUAST bug: contig aligned length is greater than " \
                                                     "contig length (contig: %s, len: %d, aligned: %d)!" % \
                                                     (sorted_aligns[0].contig, contig_aligned_length, len(contig_seq))

    return is_misassembled, misassembly_internal_overlap, references_misassemblies, indels_info, misassemblies_matched_sv