Example #1
0
def process_misassembled_contig(sorted_aligns, is_cyclic, aligned_lengths, region_misassemblies, ref_lens, ref_aligns,
                                ref_features, contig_seq, misassemblies_by_ref, istranslocations_by_ref, region_struct_variations,
                                ca_output):
    """Classify and report the junction between every pair of adjacent alignments of one contig.

    The work is done in two passes over ``sorted_aligns``. The first pass calls
    ``is_misassembly`` for each adjacent pair and stores the preliminary verdict; when
    ``qconfig.large_genome`` is set, the collected verdicts are handed to
    ``detect_potential_mge``. The second pass writes the per-alignment and per-junction
    reports to the ``ca_output`` streams and updates the accumulators passed in by the caller.

    Args:
        sorted_aligns: non-empty sequence of alignment objects, processed in the given order.
            The attributes ``len2``, ``ref``, ``contig``, ``s1``, ``e1``, ``s2``, ``e2`` and the
            methods ``icarus_report_str()`` / ``coords_str()`` are used.
        is_cyclic: forwarded to ``is_misassembly``; also enables the final merge of the first
            and last aligned blocks.
        aligned_lengths: list extended in place with the aligned block lengths of this contig.
        region_misassemblies: list appended in place with ``Misassembly`` codes.
        ref_lens: mapping indexed by reference name; the value is compared against the aligned length.
        ref_aligns: dict mapping reference name to a list of alignments; appended in place.
        ref_features: dict mapping reference name to a ``{position: 'M'}`` dict; updated in place.
        contig_seq: the contig sequence; its ``len()`` is used and it is forwarded to helpers.
        misassemblies_by_ref: mapping indexed by the result of ``get_ref_by_chromosome``;
            its list values are appended in place.
        istranslocations_by_ref: nested mapping indexed as ``[ref][ref]``; both symmetric
            counters are incremented for every interspecies translocation.
        region_struct_variations: forwarded to ``is_misassembly``.
        ca_output: object exposing the writable streams ``stdout_f``, ``icarus_out_f``,
            ``coords_filtered_f`` and ``misassembly_f``.

    Returns:
        Tuple ``(is_misassembled, misassembly_internal_overlap, indels_info, cnt_misassemblies,
        contig_aligned_length)``.

    Raises:
        AssertionError: if the summed aligned length exceeds ``len(contig_seq)``.
    """
    misassembly_internal_overlap = 0
    prev_align = sorted_aligns[0]
    cur_aligned_length = prev_align.len2
    is_misassembled = False
    contig_is_printed = False
    indels_info = IndelsInfo()
    cnt_misassemblies = 0

    # First pass: classify every junction up front, so that detect_potential_mge can inspect
    # all junctions of the contig before anything is reported.
    misassemblies = []
    misassembly_info = []
    for i in range(len(sorted_aligns) - 1):
        next_align = sorted_aligns[i + 1]

        is_fake_translocation = is_fragmented_ref_fake_translocation(prev_align, next_align, ref_lens)
        internal_overlap, overlap_msg = exclude_internal_overlaps(prev_align, next_align, i)
        is_extensive_misassembly, aux_data = is_misassembly(prev_align, next_align, contig_seq, ref_lens,
                                                            is_cyclic, region_struct_variations, is_fake_translocation)
        misassembly_type = ''
        if is_extensive_misassembly: # it is not a Fake translocation, because is_extensive_misassembly is True
            prev_ref, next_ref = get_ref_by_chromosome(prev_align.ref), get_ref_by_chromosome(next_align.ref)
            if prev_align.ref != next_align.ref:  # if chromosomes from different references
                if qconfig.is_combined_ref and prev_ref != next_ref:
                    misassembly_type = 'interspecies translocation'
                else:
                    misassembly_type = 'translocation'
            elif abs(aux_data["inconsistency"]) > qconfig.extensive_misassembly_threshold:
                misassembly_type = 'relocation'
            else: #if strand1 != strand2:
                misassembly_type = 'inversion'
            # Order the two breakpoint coordinates so that start_in_ref <= end_in_ref.
            if next_align.s1 > prev_align.e1:
                start_in_ref, end_in_ref = prev_align.e1, next_align.s1
            else:
                start_in_ref, end_in_ref = next_align.s1, prev_align.e1
            misassemblies.append([(prev_align, start_in_ref, misassembly_type, next_align.len2), (next_align, end_in_ref, misassembly_type)])
        else:
            # Keep one entry per junction so that indices line up with the second pass.
            misassemblies.append([])
        misassembly_info.append((internal_overlap, overlap_msg, is_extensive_misassembly, aux_data, misassembly_type))
        prev_align = next_align
    is_potential_mge = None
    if qconfig.large_genome:
        is_potential_mge = detect_potential_mge(misassemblies)

    # Second pass: report each alignment followed by the verdict for the junction after it.
    prev_align = sorted_aligns[0]
    contig_aligned_lengths = []
    for i in range(len(sorted_aligns) - 1):
        next_align = sorted_aligns[i + 1]
        internal_overlap, overlap_msg, is_extensive_misassembly, aux_data, misassembly_type = misassembly_info[i]
        if overlap_msg:
            cur_aligned_length -= internal_overlap
            ca_output.stdout_f.write(overlap_msg)

        inconsistency = aux_data["inconsistency"]
        distance_on_contig = aux_data["distance_on_contig"]
        misassembly_internal_overlap += aux_data["misassembly_internal_overlap"]
        cyclic_moment = aux_data["cyclic_moment"]
        ca_output.icarus_out_f.write(prev_align.icarus_report_str() + '\n')
        ca_output.stdout_f.write('\t\t\tReal Alignment %d: %s\n' % (i+1, str(prev_align)))

        ref_aligns.setdefault(prev_align.ref, []).append(prev_align)
        ca_output.coords_filtered_f.write(prev_align.coords_str() + '\n')
        prev_ref, next_ref = get_ref_by_chromosome(prev_align.ref), get_ref_by_chromosome(next_align.ref)
        if aux_data["is_sv"]:
            ca_output.stdout_f.write('\t\t\t  Not a misassembly (structural variation of the genome) between these two alignments\n')
            ca_output.icarus_out_f.write('fake: not a misassembly (structural variation of the genome)\n')
            region_misassemblies.append(Misassembly.MATCHED_SV)
        elif aux_data["is_scaffold_gap"]:
            if abs(inconsistency) > qconfig.extensive_misassembly_threshold:
                scaff_gap_type = ' (extensive)'
                region_misassemblies.append(Misassembly.SCAFFOLD_GAP)
                misassemblies_by_ref[prev_ref].append(Misassembly.SCAFFOLD_GAP)
                ca_output.icarus_out_f.write('fake: scaffold gap size wrong estimation' + scaff_gap_type + '\n')
            else:
                scaff_gap_type = ' (local)'
                region_misassemblies.append(Misassembly.LOCAL_SCAFFOLD_GAP)
                misassemblies_by_ref[prev_ref].append(Misassembly.LOCAL_SCAFFOLD_GAP)
                ca_output.icarus_out_f.write('fake: scaffold gap size wrong estimation' + scaff_gap_type + '\n')
            ca_output.stdout_f.write('\t\t\t  Scaffold gap between these two alignments, ')
            ca_output.stdout_f.write('gap lengths difference (reference vs assembly) = ' + str(inconsistency) + scaff_gap_type + '\n')
        elif is_extensive_misassembly and is_potential_mge and is_potential_mge[i]:
            ca_output.stdout_f.write(
                '\t\t\t  Not a misassembly (possible transposable element) between these two alignments\n')
            ca_output.icarus_out_f.write('fake: not a misassembly (possible transposable element)\n')
            region_misassemblies.append(Misassembly.POTENTIAL_MGE)
        elif is_extensive_misassembly:
            is_misassembled = True
            cnt_misassemblies += 1
            # An extensive misassembly closes the current aligned block.
            contig_aligned_lengths.append(cur_aligned_length)
            cur_aligned_length = 0
            if not contig_is_printed:
                ca_output.misassembly_f.write(prev_align.contig + '\n')
                contig_is_printed = True
            ca_output.misassembly_f.write('Extensive misassembly (')
            ca_output.stdout_f.write('\t\t\t  Extensive misassembly (')
            msg = ''
            if misassembly_type == 'interspecies translocation':
                misassembly_id = Misassembly.INTERSPECTRANSLOCATION
                istranslocations_by_ref[prev_ref][next_ref] += 1
                istranslocations_by_ref[next_ref][prev_ref] += 1
            elif misassembly_type == 'translocation':
                misassembly_id = Misassembly.TRANSLOCATION
            elif misassembly_type == 'relocation':
                misassembly_id = Misassembly.RELOCATION
                msg = ', inconsistency = ' + str(inconsistency) + \
                      (' [linear representation of circular genome]' if cyclic_moment else '')
            else: #if strand1 != strand2:
                misassembly_id = Misassembly.INVERSION
            region_misassemblies.append(misassembly_id)
            misassemblies_by_ref[prev_ref].append(misassembly_id)
            if misassembly_id == Misassembly.INTERSPECTRANSLOCATION:  # special case
                misassemblies_by_ref[next_ref].append(misassembly_id)
            if is_gap_filled_ns(contig_seq, prev_align, next_align):
                misassembly_type += ', scaffold gap is present'
                # The scaffold variant of each code is addressed by a fixed offset from the plain code.
                region_misassemblies.append(misassembly_id + (Misassembly.SCF_INVERSION - Misassembly.INVERSION))
            ca_output.stdout_f.write(misassembly_type + msg)
            ca_output.misassembly_f.write(misassembly_type + msg)
            ca_output.icarus_out_f.write(misassembly_type + msg)
            ca_output.stdout_f.write(') between these two alignments\n')
            ca_output.misassembly_f.write(') between %s %s and %s %s' % (prev_align.s2, prev_align.e2, next_align.s2, next_align.e2) + '\n')
            ca_output.icarus_out_f.write('\n')
            ref_features.setdefault(prev_align.ref, {})[prev_align.e1] = 'M'
            ref_features.setdefault(next_align.ref, {})[next_align.e1] = 'M'
        else:
            reason_msg = "" + (" [linear representation of circular genome]" if cyclic_moment else "") + \
                         (" [fragmentation of reference genome]" if prev_align.ref != next_align.ref else "")
            if inconsistency == 0 and cyclic_moment:
                ca_output.stdout_f.write('\t\t\t  Not a misassembly' + reason_msg + ' between these two alignments\n')
                ca_output.icarus_out_f.write('fake: not a misassembly' + reason_msg + '\n')
            elif inconsistency == 0 and prev_align.ref != next_align.ref:  # is_fragmented_ref_fake_translocation is True, because is_extensive_misassembly is False
                ca_output.stdout_f.write('\t\t\t  Not a misassembly' + reason_msg + ' between these two alignments\n')
                region_misassemblies.append(Misassembly.FRAGMENTED)
                misassemblies_by_ref[prev_ref].append(Misassembly.FRAGMENTED)
                ca_output.icarus_out_f.write('fake: not a misassembly' + reason_msg + '\n')
            elif abs(inconsistency) <= qconfig.MAX_INDEL_LENGTH and \
                            count_ns_and_not_ns_between_aligns(contig_seq, prev_align, next_align)[1] <= max(qconfig.min_alignment, qconfig.MAX_INDEL_LENGTH):
                ns_number, not_ns_number = count_ns_and_not_ns_between_aligns(contig_seq, prev_align, next_align)

                if inconsistency == 0:
                    ca_output.stdout_f.write(('\t\t\t  Stretch of %d mismatches between these two alignments (number of Ns: %d)' %
                                              (not_ns_number, ns_number)) + reason_msg + '\n')
                    indels_info.mismatches += not_ns_number
                    ca_output.icarus_out_f.write('indel: stretch of mismatches' + reason_msg + '\n')
                else:
                    indel_length = abs(inconsistency)
                    indel_class = 'Indel (<= 5bp)' if indel_length <= qconfig.SHORT_INDEL_THRESHOLD else 'Indel (> 5bp)'
                    indel_type = 'insertion' if inconsistency < 0 else 'deletion'
                    mismatches = max(0, not_ns_number - indel_length)
                    ca_output.stdout_f.write(('\t\t\t  %s between these two alignments: %s of length %d; %d mismatches (number of Ns: %d)')
                                                 % (indel_class, indel_type, indel_length, mismatches, ns_number) + reason_msg + '\n')
                    indels_info.indels_list.append(indel_length)
                    if indel_type == 'insertion':
                        indels_info.insertions += indel_length
                    else:
                        indels_info.deletions += indel_length
                    indels_info.mismatches += mismatches
                    ca_output.icarus_out_f.write('indel: ' + indel_class.lower() + reason_msg + '\n')
            else:
                # A local misassembly closes the current aligned block only in strict mode.
                if qconfig.strict_NA:
                    contig_aligned_lengths.append(cur_aligned_length)
                    cur_aligned_length = 0

                if distance_on_contig < 0:
                    #There is an overlap between the two alignments, a local misassembly
                    ca_output.stdout_f.write('\t\t\t  Overlap between these two alignments (local misassembly).')
                elif distance_on_contig > 0:
                    #There is a small gap between the two alignments, a local misassembly
                    ca_output.stdout_f.write('\t\t\t  Gap between these two alignments (local misassembly).')
                elif inconsistency < 0:
                    ca_output.stdout_f.write('\t\t\t  Overlap between these two alignments (local misassembly).')
                else:
                    ca_output.stdout_f.write('\t\t\t  Gap between these two alignments (local misassembly).')
                ca_output.stdout_f.write(' Inconsistency = ' + str(inconsistency) + reason_msg + '\n')
                ca_output.icarus_out_f.write('local misassembly' + reason_msg + '\n')
                region_misassemblies.append(Misassembly.LOCAL)
                misassemblies_by_ref[prev_ref].append(Misassembly.LOCAL)

        prev_align = next_align
        # Add the next alignment's length; a negative contig distance is subtracted,
        # presumably so that overlapping bases are not counted twice.
        cur_aligned_length += prev_align.len2 - (-distance_on_contig if distance_on_contig < 0 else 0)

    #Record the very last alignment
    # Index sorted_aligns directly rather than reusing the loop variable, which is unbound
    # when the contig has a single alignment (both loops above then run zero times).
    last_align = sorted_aligns[-1]
    i = len(sorted_aligns) - 1
    ca_output.stdout_f.write('\t\t\tReal Alignment %d: %s' % (i + 1, str(last_align)) + '\n')
    ca_output.icarus_out_f.write(last_align.icarus_report_str() + '\n')
    ref_aligns.setdefault(last_align.ref, []).append(last_align)
    ca_output.coords_filtered_f.write(last_align.coords_str() + '\n')

    contig_aligned_lengths.append(cur_aligned_length)
    contig_aligned_length = sum(contig_aligned_lengths)

    # if contig covers more than 95% of cyclic chromosome/plasmid consider it as cyclic and do not split the first and the last aligned blocks
    if is_cyclic and len(contig_aligned_lengths) > 1 and sorted_aligns[-1].ref == sorted_aligns[0].ref and contig_aligned_length >= 0.95 * ref_lens[sorted_aligns[0].ref]:
        is_extensive_misassembly, aux_data = is_misassembly(sorted_aligns[-1], sorted_aligns[0], contig_seq, ref_lens,
                                                            is_cyclic, is_cyclic_contig=True, region_struct_variations=region_struct_variations)
        if not is_extensive_misassembly and not aux_data["is_scaffold_gap"] and not aux_data["is_sv"]:
            inconsistency = abs(aux_data["inconsistency"])
            if not qconfig.strict_NA or inconsistency <= qconfig.MAX_INDEL_LENGTH:
                # Merge the last aligned block into the first one.
                contig_aligned_lengths[0] += contig_aligned_lengths[-1]
                contig_aligned_lengths = contig_aligned_lengths[:-1]

    aligned_lengths.extend(contig_aligned_lengths)
    # Argument order follows the labels in the message: contig name, contig length, aligned length.
    assert contig_aligned_length <= len(contig_seq), "Internal QUAST bug: contig aligned length is greater than " \
                                                     "contig length (contig: %s, len: %d, aligned: %d)!" % \
                                                     (sorted_aligns[0].contig, len(contig_seq), contig_aligned_length)

    return is_misassembled, misassembly_internal_overlap, indels_info, cnt_misassemblies, contig_aligned_length
Example #2
0
def process_misassembled_contig(sorted_aligns, is_cyclic, aligned_lengths,
                                region_misassemblies, ref_lens, ref_aligns,
                                ref_features, contig_seq, misassemblies_by_ref,
                                istranslocations_by_ref,
                                region_struct_variations,
                                misassemblies_matched_sv, ca_output):
    """Classify and report the junction between every pair of adjacent alignments of one contig.

    For each adjacent pair in ``sorted_aligns`` the verdict of ``is_misassembly`` is turned
    into a report line on the ``ca_output`` streams and into updates of the accumulators
    passed in by the caller. Aligned block lengths are appended to ``aligned_lengths``
    whenever a block is closed.

    Args:
        sorted_aligns: non-empty sequence of alignment objects, processed in the given order.
            The attributes ``len2``, ``ref``, ``contig``, ``e1``, ``s2``, ``e2`` and the method
            ``icarus_report_str()`` are used.
        is_cyclic: forwarded to ``is_misassembly``.
        aligned_lengths: list appended in place with the aligned block lengths of this contig.
        region_misassemblies: list appended in place with ``Misassembly`` codes.
        ref_lens: forwarded to ``is_fragmented_ref_fake_translocation`` and ``is_misassembly``.
        ref_aligns: dict mapping reference name to a list of alignments; appended in place.
        ref_features: dict mapping reference name to a ``{position: 'M'}`` dict; updated in place.
        contig_seq: the contig sequence; its ``len()`` is used and it is forwarded to helpers.
        misassemblies_by_ref: mapping indexed by the result of ``get_ref_by_chromosome``;
            its list values are appended in place.
        istranslocations_by_ref: nested mapping indexed as ``[ref][ref]``; both symmetric
            counters are incremented for every interspecies translocation.
        region_struct_variations: forwarded to ``is_misassembly``.
        misassemblies_matched_sv: running count of junctions flagged ``is_sv``; incremented
            here and handed back in the return value.
        ca_output: object exposing the writable streams ``stdout_f``, ``icarus_out_f``,
            ``coords_filtered_f`` and ``misassembly_f``.

    Returns:
        Tuple ``(is_misassembled, misassembly_internal_overlap, indels_info,
        misassemblies_matched_sv, cnt_misassemblies, contig_aligned_length)``.

    Raises:
        AssertionError: if the summed aligned length exceeds ``len(contig_seq)``.
    """
    misassembly_internal_overlap = 0
    prev_align = sorted_aligns[0]
    cur_aligned_length = prev_align.len2
    is_misassembled = False
    contig_is_printed = False
    indels_info = IndelsInfo()
    contig_aligned_length = 0  # for internal debugging purposes
    cnt_misassemblies = 0

    for i in range(len(sorted_aligns) - 1):
        next_align = sorted_aligns[i + 1]

        is_fake_translocation = is_fragmented_ref_fake_translocation(
            prev_align, next_align, ref_lens)
        cur_aligned_length -= exclude_internal_overlaps(
            prev_align, next_align, i, ca_output)
        is_extensive_misassembly, aux_data = is_misassembly(
            prev_align, next_align, contig_seq, ref_lens, is_cyclic,
            region_struct_variations, is_fake_translocation)
        inconsistency = aux_data["inconsistency"]
        distance_on_contig = aux_data["distance_on_contig"]
        misassembly_internal_overlap += aux_data[
            "misassembly_internal_overlap"]
        cyclic_moment = aux_data["cyclic_moment"]
        ca_output.icarus_out_f.write(prev_align.icarus_report_str() + '\n')
        ca_output.stdout_f.write('\t\t\tReal Alignment %d: %s\n' %
                                 (i + 1, str(prev_align)))

        ref_aligns.setdefault(prev_align.ref, []).append(prev_align)
        ca_output.coords_filtered_f.write(str(prev_align) + '\n')
        prev_ref, next_ref = get_ref_by_chromosome(
            prev_align.ref), get_ref_by_chromosome(next_align.ref)
        if aux_data["is_sv"]:
            ca_output.stdout_f.write(
                '\t\t\t  Not a misassembly (structural variation of the genome) between these two alignments\n'
            )
            ca_output.icarus_out_f.write(
                'fake: not a misassembly (structural variation of the genome)\n'
            )
            misassemblies_matched_sv += 1
        elif aux_data["is_scaffold_gap"] and abs(
                inconsistency) > qconfig.extensive_misassembly_threshold:
            ca_output.stdout_f.write(
                '\t\t\t  Incorrectly estimated size of scaffold gap between these two alignments: '
            )
            ca_output.stdout_f.write('gap length difference = ' +
                                     str(inconsistency) + '\n')
            region_misassemblies.append(Misassembly.SCAFFOLD_GAP)
            misassemblies_by_ref[prev_ref].append(Misassembly.SCAFFOLD_GAP)
            ca_output.icarus_out_f.write(
                'fake: scaffold gap size wrong estimation' + '\n')
        elif is_extensive_misassembly:
            is_misassembled = True
            cnt_misassemblies += 1
            # An extensive misassembly closes the current aligned block.
            aligned_lengths.append(cur_aligned_length)
            contig_aligned_length += cur_aligned_length
            cur_aligned_length = 0
            if not contig_is_printed:
                ca_output.misassembly_f.write(prev_align.contig + '\n')
                contig_is_printed = True
            ca_output.misassembly_f.write('Extensive misassembly (')
            ca_output.stdout_f.write('\t\t\t  Extensive misassembly (')
            if prev_align.ref != next_align.ref:  # it is not a Fake translocation, because is_extensive_misassembly is True
                if qconfig.is_combined_ref and prev_ref != next_ref:  # if chromosomes from different references
                    region_misassemblies.append(
                        Misassembly.INTERSPECTRANSLOCATION)
                    istranslocations_by_ref[prev_ref][next_ref] += 1
                    istranslocations_by_ref[next_ref][prev_ref] += 1
                    misassemblies_by_ref[prev_ref].append(
                        Misassembly.INTERSPECTRANSLOCATION)
                    misassemblies_by_ref[next_ref].append(
                        Misassembly.INTERSPECTRANSLOCATION)
                    ca_output.stdout_f.write('interspecies translocation')
                    ca_output.misassembly_f.write('interspecies translocation')
                    ca_output.icarus_out_f.write('interspecies translocation')
                else:
                    region_misassemblies.append(Misassembly.TRANSLOCATION)
                    misassemblies_by_ref[prev_ref].append(
                        Misassembly.TRANSLOCATION)
                    ca_output.stdout_f.write('translocation')
                    ca_output.misassembly_f.write('translocation')
                    ca_output.icarus_out_f.write('translocation')
            elif abs(inconsistency) > qconfig.extensive_misassembly_threshold:
                region_misassemblies.append(Misassembly.RELOCATION)
                misassemblies_by_ref[prev_ref].append(Misassembly.RELOCATION)
                msg = 'relocation, inconsistency = ' + str(inconsistency) + \
                      (' [linear representation of circular genome]' if cyclic_moment else '')
                ca_output.stdout_f.write(msg)
                ca_output.misassembly_f.write(msg)
                ca_output.icarus_out_f.write(msg)
            else:  #if strand1 != strand2:
                region_misassemblies.append(Misassembly.INVERSION)
                misassemblies_by_ref[prev_ref].append(Misassembly.INVERSION)
                ca_output.stdout_f.write('inversion')
                ca_output.misassembly_f.write('inversion')
                ca_output.icarus_out_f.write('inversion')
            ca_output.stdout_f.write(') between these two alignments\n')
            ca_output.misassembly_f.write(
                ') between %s %s and %s %s' %
                (prev_align.s2, prev_align.e2, next_align.s2, next_align.e2) +
                '\n')
            ca_output.icarus_out_f.write('\n')
            ref_features.setdefault(prev_align.ref, {})[prev_align.e1] = 'M'
            ref_features.setdefault(next_align.ref, {})[next_align.e1] = 'M'
        else:
            reason_msg = "" + (" [linear representation of circular genome]" if cyclic_moment else "") + \
                         (" [fragmentation of reference genome]" if prev_align.ref != next_align.ref else "")
            if inconsistency == 0 and cyclic_moment:
                ca_output.stdout_f.write('\t\t\t  Not a misassembly' +
                                         reason_msg +
                                         ' between these two alignments\n')
                ca_output.icarus_out_f.write('fake: not a misassembly' +
                                             reason_msg + '\n')
            elif inconsistency == 0 and prev_align.ref != next_align.ref:  # is_fragmented_ref_fake_translocation is True, because is_extensive_misassembly is False
                ca_output.stdout_f.write('\t\t\t  Not a misassembly' +
                                         reason_msg +
                                         ' between these two alignments\n')
                region_misassemblies.append(Misassembly.FRAGMENTED)
                misassemblies_by_ref[prev_ref].append(Misassembly.FRAGMENTED)
                ca_output.icarus_out_f.write('fake: not a misassembly' +
                                             reason_msg + '\n')
            elif abs(inconsistency) <= qconfig.MAX_INDEL_LENGTH and \
                            count_ns_and_not_ns_between_aligns(contig_seq, prev_align, next_align)[1] <= qconfig.MAX_INDEL_LENGTH:
                ns_number, not_ns_number = count_ns_and_not_ns_between_aligns(
                    contig_seq, prev_align, next_align)

                if inconsistency == 0:
                    ca_output.stdout_f.write((
                        '\t\t\t  Stretch of %d mismatches between these two alignments (number of Ns: %d)'
                        % (not_ns_number, ns_number)) + reason_msg + '\n')
                    indels_info.mismatches += not_ns_number
                    ca_output.icarus_out_f.write(
                        'indel: stretch of mismatches' + reason_msg + '\n')
                else:
                    indel_length = abs(inconsistency)
                    indel_class = 'Indel (<= 5bp)' if indel_length <= qconfig.SHORT_INDEL_THRESHOLD else 'Indel (> 5bp)'
                    indel_type = 'insertion' if inconsistency < 0 else 'deletion'
                    mismatches = max(0, not_ns_number - indel_length)
                    ca_output.stdout_f.write((
                        '\t\t\t  %s between these two alignments: %s of length %d; %d mismatches (number of Ns: %d)'
                    ) % (indel_class, indel_type, indel_length, mismatches,
                         ns_number) + reason_msg + '\n')
                    indels_info.indels_list.append(indel_length)
                    if indel_type == 'insertion':
                        indels_info.insertions += indel_length
                    else:
                        indels_info.deletions += indel_length
                    indels_info.mismatches += mismatches
                    ca_output.icarus_out_f.write('indel: ' +
                                                 indel_class.lower() +
                                                 reason_msg + '\n')
            else:
                # A local misassembly closes the current aligned block only in strict mode.
                if qconfig.strict_NA:
                    aligned_lengths.append(cur_aligned_length)
                    contig_aligned_length += cur_aligned_length
                    cur_aligned_length = 0

                if distance_on_contig < 0:
                    #There is an overlap between the two alignments, a local misassembly
                    ca_output.stdout_f.write(
                        '\t\t\t  Overlap between these two alignments (local misassembly).'
                    )
                elif distance_on_contig > 0:
                    #There is a small gap between the two alignments, a local misassembly
                    ca_output.stdout_f.write(
                        '\t\t\t  Gap between these two alignments (local misassembly).'
                    )
                elif inconsistency < 0:
                    ca_output.stdout_f.write(
                        '\t\t\t  Overlap between these two alignments (local misassembly).'
                    )
                else:
                    ca_output.stdout_f.write(
                        '\t\t\t  Gap between these two alignments (local misassembly).'
                    )
                ca_output.stdout_f.write(' Inconsistency = ' +
                                         str(inconsistency) + reason_msg +
                                         '\n')
                ca_output.icarus_out_f.write('local misassembly' + reason_msg +
                                             '\n')
                region_misassemblies.append(Misassembly.LOCAL)
                misassemblies_by_ref[prev_ref].append(Misassembly.LOCAL)

        prev_align = next_align
        # Add the next alignment's length; a negative contig distance is subtracted,
        # presumably so that overlapping bases are not counted twice.
        cur_aligned_length += prev_align.len2 - (-distance_on_contig if
                                                 distance_on_contig < 0 else 0)

    #Record the very last alignment
    # Index sorted_aligns directly rather than reusing the loop variable, which is unbound
    # when the contig has a single alignment (the loop above then runs zero times).
    last_align = sorted_aligns[-1]
    i = len(sorted_aligns) - 1
    ca_output.stdout_f.write('\t\t\tReal Alignment %d: %s' %
                             (i + 1, str(last_align)) + '\n')
    ca_output.icarus_out_f.write(last_align.icarus_report_str() + '\n')
    ref_aligns.setdefault(last_align.ref, []).append(last_align)
    ca_output.coords_filtered_f.write(str(last_align) + '\n')
    aligned_lengths.append(cur_aligned_length)
    contig_aligned_length += cur_aligned_length

    # Argument order follows the labels in the message: contig name, contig length, aligned length.
    assert contig_aligned_length <= len(contig_seq), "Internal QUAST bug: contig aligned length is greater than " \
                                                     "contig length (contig: %s, len: %d, aligned: %d)!" % \
                                                     (sorted_aligns[0].contig, len(contig_seq), contig_aligned_length)

    return is_misassembled, misassembly_internal_overlap, indels_info, misassemblies_matched_sv, cnt_misassemblies, contig_aligned_length
def process_misassembled_contig(sorted_aligns, cyclic, aligned_lengths, region_misassemblies, ref_lens, ref_aligns,
                                ref_features, contig_seq, references_misassemblies, region_struct_variations,
                                misassemblies_matched_sv, ca_output, is_ambiguous=False):
    misassembly_internal_overlap = 0
    prev_align = sorted_aligns[0]
    cur_aligned_length = prev_align[5]
    is_misassembled = False
    contig_is_printed = False
    indels_info = IndelsInfo()
    contig_aligned_length = 0  # for internal debugging purposes

    for i in range(len(sorted_aligns) - 1):
        next_align = sorted_aligns[i + 1]

        cur_aligned_length -= exclude_internal_overlaps(prev_align, next_align, i, ca_output)
        is_extensive_misassembly, aux_data = is_misassembly(prev_align, next_align, contig_seq, ref_lens,
                                                            cyclic, region_struct_variations)
        inconsistency = aux_data["inconsistency"]
        distance_on_contig = aux_data["distance_on_contig"]
        misassembly_internal_overlap += aux_data["misassembly_internal_overlap"]
        cyclic_moment = aux_data["cyclic_moment"]
#        print >> ca_output.icarus_out_f, prev_align.icarus_report_str()
        print >> ca_output.icarus_out_f, icarus_report_str(prev_align)
        print >> ca_output.stdout_f, '\t\t\tReal Alignment %d: %s' % (i+1, str(prev_align))

        ref_aligns.setdefault(prev_align[7], []).append(prev_align)
        print >> ca_output.coords_filtered_f, str(prev_align)
        if aux_data["is_sv"]:
            print >> ca_output.stdout_f, '\t\t\t  Not a misassembly (structural variation of the genome) between these two alignments'
            print >> ca_output.icarus_out_f, 'fake: not a misassembly (structural variation of the genome)'
            misassemblies_matched_sv += 1
        elif aux_data["is_scaffold_gap"]:
            print >> ca_output.stdout_f, '\t\t\t  Incorrectly estimated size of scaffold gap between these two alignments:',
            print >> ca_output.stdout_f, 'gap length difference =', inconsistency
            region_misassemblies.append(Misassembly.SCAFFOLD_GAP)
            print >> ca_output.icarus_out_f, 'fake: scaffold gap size wrong estimation'
        elif is_extensive_misassembly:
            is_misassembled = True
            aligned_lengths.append(cur_aligned_length)
            contig_aligned_length += cur_aligned_length
            cur_aligned_length = 0
            if not contig_is_printed:
                print >> ca_output.misassembly_f, prev_align[8]
                contig_is_printed = True
            print >> ca_output.misassembly_f, 'Extensive misassembly (',
            print >> ca_output.stdout_f, '\t\t\t  Extensive misassembly (',
            if prev_align[7] != next_align[7]:  # it is not a Fake translocation, because is_extensive_misassembly is True
                if qconfig.is_combined_ref and \
                        not is_same_reference(prev_align[7], next_align[7]):  # if chromosomes from different references
                        region_misassemblies.append(Misassembly.INTERSPECTRANSLOCATION)
                        ref1, ref2 = get_ref_by_chromosome(prev_align[7]), get_ref_by_chromosome(next_align[7])
                        references_misassemblies[ref1][ref2] += 1
                        references_misassemblies[ref2][ref1] += 1
                        print >> ca_output.stdout_f, 'interspecies translocation',
                        print >> ca_output.misassembly_f, 'interspecies translocation',
                        print >> ca_output.icarus_out_f, 'interspecies translocation'
                else:
                    region_misassemblies.append(Misassembly.TRANSLOCATION)
                    print >> ca_output.stdout_f, 'translocation',
                    print >> ca_output.misassembly_f, 'translocation',
                    print >> ca_output.icarus_out_f, 'translocation'
            elif abs(inconsistency) > qconfig.extensive_misassembly_threshold:
                region_misassemblies.append(Misassembly.RELOCATION)
                print >> ca_output.stdout_f, 'relocation, inconsistency =', inconsistency,
                print >> ca_output.misassembly_f, 'relocation, inconsistency =', inconsistency,
                print >> ca_output.icarus_out_f, 'relocation, inconsistency =', inconsistency
            else: #if strand1 != strand2:
                region_misassemblies.append(Misassembly.INVERSION)
                print >> ca_output.stdout_f, 'inversion',
                print >> ca_output.misassembly_f, 'inversion',
                print >> ca_output.icarus_out_f, 'inversion'
            print >> ca_output.stdout_f, ') between these two alignments'
            print >> ca_output.misassembly_f, ') between %s %s and %s %s' % (prev_align[2], prev_align[3],
                                                                      next_align[2], next_align[3])
            ref_features.setdefault(prev_align[7], {})[prev_align[1]] = 'M'
            ref_features.setdefault(next_align[7], {})[next_align[1]] = 'M'
        else:
            reason_msg = "" + (" (linear representation of circular genome)" if cyclic_moment else "") + \
                         (" (fragmentation of reference genome)" if prev_align[7] != next_align[7] else "")
            if inconsistency == 0 and cyclic_moment:
                print >> ca_output.stdout_f, '\t\t\t  Not a misassembly' + reason_msg + ' between these two alignments'
                print >> ca_output.icarus_out_f, 'fake: not a misassembly' + reason_msg
            elif inconsistency == 0 and prev_align[7] != next_align[7]:  # is_fragmented_ref_fake_translocation is True, because is_extensive_misassembly is False
                print >> ca_output.stdout_f, '\t\t\t  Not a misassembly' + reason_msg + ' between these two alignments'
                region_misassemblies.append(Misassembly.FRAGMENTED)
                print >> ca_output.icarus_out_f, 'fake: not a misassembly' + reason_msg
            elif abs(inconsistency) <= qconfig.MAX_INDEL_LENGTH and \
                    count_ns_and_not_ns_between_aligns(contig_seq, prev_align, next_align)[1] <= qconfig.MAX_INDEL_LENGTH:
                ns_number, not_ns_number = count_ns_and_not_ns_between_aligns(contig_seq, prev_align, next_align)

                if inconsistency == 0:
                    print >> ca_output.stdout_f, ('\t\t\t  Short stretch of %d mismatches and %d Ns between these two alignments' % (not_ns_number, ns_number)) + reason_msg
                    indels_info.mismatches += not_ns_number
                    print >> ca_output.icarus_out_f, 'indel: stretch of mismatches and Ns' + reason_msg
                else:
                    indel_length = abs(inconsistency)
                    indel_class = 'Indel (<= 5bp)' if indel_length <= qconfig.SHORT_INDEL_THRESHOLD else 'Indel (> 5bp)'
                    indel_type = 'insertion' if inconsistency < 0 else 'deletion'
                    mismatches = max(0, not_ns_number - indel_length)
                    print >> ca_output.stdout_f, ('\t\t\t  %s between these two alignments: %s of length %d; %d mismatches'
                                                 % (indel_class, indel_type, indel_length, mismatches)) + reason_msg
                    indels_info.indels_list.append(indel_length)
                    if indel_type == 'insertion':
                        indels_info.insertions += indel_length
                    else:
                        indels_info.deletions += indel_length
                    indels_info.mismatches += mismatches
                    print >> ca_output.icarus_out_f, 'indel: ' + indel_class.lower() + reason_msg
            else:
                if qconfig.strict_NA:
                    aligned_lengths.append(cur_aligned_length)
                    contig_aligned_length += cur_aligned_length
                    cur_aligned_length = 0

                if inconsistency < 0:
                    #There is an overlap between the two alignments, a local misassembly
                    print >> ca_output.stdout_f, '\t\t\t  Overlap between these two alignments (local misassembly).',
                else:
                    #There is a small gap between the two alignments, a local misassembly
                    print >> ca_output.stdout_f, '\t\t\t  Gap between these two alignments (local misassembly).',
                    #print >> plantafile_out, 'Distance on contig =', distance_on_contig, ', distance on reference =', distance_on_reference
                print >> ca_output.stdout_f, 'Inconsistency = ' + str(inconsistency) + reason_msg
                print >> ca_output.icarus_out_f, 'local misassembly'
                region_misassemblies.append(Misassembly.LOCAL)

        prev_align = next_align
        cur_aligned_length += prev_align[5] - (-distance_on_contig if distance_on_contig < 0 else 0)

    #Record the very last alignment
    i = len(sorted_aligns) - 1
    print >> ca_output.stdout_f, '\t\t\tReal Alignment %d: %s' % (i + 1, str(next_align))
#    print >> ca_output.icarus_out_f, next_align.icarus_report_str()
    print >> ca_output.icarus_out_f, icarus_report_str(next_align)
    ref_aligns.setdefault(next_align[7], []).append(next_align)
    print >> ca_output.coords_filtered_f, str(next_align)
    aligned_lengths.append(cur_aligned_length)
    contig_aligned_length += cur_aligned_length

    assert contig_aligned_length <= len(contig_seq), "Internal QUAST bug: contig aligned length is greater than " \
                                                     "contig length (contig: %s, len: %d, aligned: %d)!" % \
                                                     (sorted_aligns[0][8], contig_aligned_length, len(contig_seq))

    return is_misassembled, misassembly_internal_overlap, references_misassemblies, indels_info, misassemblies_matched_sv
Example #4
0
def process_misassembled_contig(sorted_aligns, cyclic, aligned_lengths, region_misassemblies, ref_lens, ref_aligns,
                                ref_features, contig_seq, references_misassemblies, region_struct_variations,
                                misassemblies_matched_sv, ca_output, is_ambiguous=False):
    """Classify and report the junction between every pair of consecutive alignments of one contig.

    Each adjacent pair ``(sorted_aligns[i], sorted_aligns[i + 1])`` is passed to
    ``is_misassembly`` and then reported as exactly one of: structural variation,
    wrongly estimated scaffold gap, extensive misassembly (interspecies
    translocation / translocation / relocation / inversion), fake misassembly
    (circular genome or fragmented reference), short indel / mismatch stretch,
    or local misassembly.

    Args:
        sorted_aligns: non-empty sequence of alignment objects exposing ``ref``,
            ``contig``, ``s2``, ``e2``, ``e1``, ``len2`` and ``icarus_report_str()``.
            NOTE(review): presumably ordered along the contig - confirm with caller.
        cyclic: forwarded unchanged to ``is_misassembly``.
        aligned_lengths: list, extended in place with the length of every
            contiguous aligned block that gets closed.
        region_misassemblies: list, extended in place with ``Misassembly.*`` codes.
        ref_lens: forwarded to ``is_fragmented_ref_fake_translocation`` and ``is_misassembly``.
        ref_aligns: dict ``ref -> list of alignments``, extended in place.
        ref_features: dict ``ref -> {e1 coordinate: 'M'}``, updated in place for
            both sides of every extensive misassembly.
        contig_seq: contig sequence; only its ``len()`` is used directly here, the
            object itself is forwarded to helpers.
        references_misassemblies: nested mapping ``[ref1][ref2] -> count``,
            incremented symmetrically for interspecies translocations.
        region_struct_variations: forwarded unchanged to ``is_misassembly``.
        misassemblies_matched_sv: running counter; incremented for every junction
            explained by a structural variation and returned.
        ca_output: object exposing the writable handles ``stdout_f``,
            ``misassembly_f``, ``icarus_out_f`` and ``coords_filtered_f``.
        is_ambiguous: accepted for interface compatibility; not read in this function.

    Returns:
        Tuple ``(is_misassembled, misassembly_internal_overlap,
        references_misassemblies, indels_info, misassemblies_matched_sv)``.

    Raises:
        AssertionError: if the summed aligned length exceeds ``len(contig_seq)``.
        IndexError: if ``sorted_aligns`` is empty.
    """
    misassembly_internal_overlap = 0
    prev_align = sorted_aligns[0]
    cur_aligned_length = prev_align.len2
    is_misassembled = False
    contig_is_printed = False
    indels_info = IndelsInfo()
    contig_aligned_length = 0  # for internal debugging purposes

    for i in range(len(sorted_aligns) - 1):
        next_align = sorted_aligns[i + 1]

        is_fake_translocation = is_fragmented_ref_fake_translocation(prev_align, next_align, ref_lens)
        # The value returned by exclude_internal_overlaps is removed from the running block length.
        cur_aligned_length -= exclude_internal_overlaps(prev_align, next_align, i, ca_output)
        is_extensive_misassembly, aux_data = is_misassembly(prev_align, next_align, contig_seq, ref_lens,
                                                            cyclic, region_struct_variations, is_fake_translocation)
        inconsistency = aux_data["inconsistency"]
        distance_on_contig = aux_data["distance_on_contig"]
        misassembly_internal_overlap += aux_data["misassembly_internal_overlap"]
        cyclic_moment = aux_data["cyclic_moment"]

        # The left-hand alignment of the pair is reported before the junction verdict.
        ca_output.icarus_out_f.write(prev_align.icarus_report_str() + '\n')
        ca_output.stdout_f.write('\t\t\tReal Alignment %d: %s\n' % (i+1, str(prev_align)))

        ref_aligns.setdefault(prev_align.ref, []).append(prev_align)
        ca_output.coords_filtered_f.write(str(prev_align) + '\n')
        if aux_data["is_sv"]:
            ca_output.stdout_f.write('\t\t\t  Not a misassembly (structural variation of the genome) between these two alignments\n')
            ca_output.icarus_out_f.write('fake: not a misassembly (structural variation of the genome)\n')
            misassemblies_matched_sv += 1
        elif aux_data["is_scaffold_gap"] and abs(inconsistency) > qconfig.extensive_misassembly_threshold:
            ca_output.stdout_f.write('\t\t\t  Incorrectly estimated size of scaffold gap between these two alignments: ')
            ca_output.stdout_f.write('gap length difference = ' + str(inconsistency) + '\n')
            region_misassemblies.append(Misassembly.SCAFFOLD_GAP)
            ca_output.icarus_out_f.write('fake: scaffold gap size wrong estimation' + '\n')
        elif is_extensive_misassembly:
            is_misassembled = True
            # An extensive misassembly closes the current contiguous aligned block.
            aligned_lengths.append(cur_aligned_length)
            contig_aligned_length += cur_aligned_length
            cur_aligned_length = 0
            if not contig_is_printed:
                # The contig name is written to the misassembly report once, before its first entry.
                ca_output.misassembly_f.write(prev_align.contig + '\n')
                contig_is_printed = True
            ca_output.misassembly_f.write('Extensive misassembly (')
            ca_output.stdout_f.write('\t\t\t  Extensive misassembly (')
            if prev_align.ref != next_align.ref:  # it is not a Fake translocation, because is_extensive_misassembly is True
                if qconfig.is_combined_ref and \
                        not is_same_reference(prev_align.ref, next_align.ref):  # if chromosomes from different references
                    region_misassemblies.append(Misassembly.INTERSPECTRANSLOCATION)
                    ref1, ref2 = get_ref_by_chromosome(prev_align.ref), get_ref_by_chromosome(next_align.ref)
                    # Counted symmetrically so either reference can be used as the first key.
                    references_misassemblies[ref1][ref2] += 1
                    references_misassemblies[ref2][ref1] += 1
                    ca_output.stdout_f.write('interspecies translocation')
                    ca_output.misassembly_f.write('interspecies translocation')
                    ca_output.icarus_out_f.write('interspecies translocation')
                else:
                    region_misassemblies.append(Misassembly.TRANSLOCATION)
                    ca_output.stdout_f.write('translocation')
                    ca_output.misassembly_f.write('translocation')
                    ca_output.icarus_out_f.write('translocation')
            elif abs(inconsistency) > qconfig.extensive_misassembly_threshold:
                region_misassemblies.append(Misassembly.RELOCATION)
                msg = 'relocation, inconsistency = ' + str(inconsistency) + \
                      (' [linear representation of circular genome]' if cyclic_moment else '')
                ca_output.stdout_f.write(msg)
                ca_output.misassembly_f.write(msg)
                ca_output.icarus_out_f.write(msg)
            else: #if strand1 != strand2:
                region_misassemblies.append(Misassembly.INVERSION)
                ca_output.stdout_f.write('inversion')
                ca_output.misassembly_f.write('inversion')
                ca_output.icarus_out_f.write('inversion')
            ca_output.stdout_f.write(') between these two alignments\n')
            ca_output.misassembly_f.write(') between %s %s and %s %s' % (prev_align.s2, prev_align.e2, next_align.s2, next_align.e2) + '\n')
            ca_output.icarus_out_f.write('\n')
            ref_features.setdefault(prev_align.ref, {})[prev_align.e1] = 'M'
            ref_features.setdefault(next_align.ref, {})[next_align.e1] = 'M'
        else:
            reason_msg = "" + (" [linear representation of circular genome]" if cyclic_moment else "") + \
                         (" [fragmentation of reference genome]" if prev_align.ref != next_align.ref else "")
            if inconsistency == 0 and cyclic_moment:
                ca_output.stdout_f.write('\t\t\t  Not a misassembly' + reason_msg + ' between these two alignments\n')
                ca_output.icarus_out_f.write('fake: not a misassembly' + reason_msg + '\n')
            elif inconsistency == 0 and prev_align.ref != next_align.ref:  # is_fragmented_ref_fake_translocation is True, because is_extensive_misassembly is False
                ca_output.stdout_f.write('\t\t\t  Not a misassembly' + reason_msg + ' between these two alignments\n')
                region_misassemblies.append(Misassembly.FRAGMENTED)
                ca_output.icarus_out_f.write('fake: not a misassembly' + reason_msg + '\n')
            elif abs(inconsistency) <= qconfig.MAX_INDEL_LENGTH and \
                            count_ns_and_not_ns_between_aligns(contig_seq, prev_align, next_align)[1] <= qconfig.MAX_INDEL_LENGTH:
                ns_number, not_ns_number = count_ns_and_not_ns_between_aligns(contig_seq, prev_align, next_align)

                if inconsistency == 0:
                    ca_output.stdout_f.write(('\t\t\t  Stretch of %d mismatches between these two alignments (number of Ns: %d)' %
                                              (not_ns_number, ns_number)) + reason_msg + '\n')
                    indels_info.mismatches += not_ns_number
                    ca_output.icarus_out_f.write('indel: stretch of mismatches' + reason_msg + '\n')
                else:
                    indel_length = abs(inconsistency)
                    indel_class = 'Indel (<= 5bp)' if indel_length <= qconfig.SHORT_INDEL_THRESHOLD else 'Indel (> 5bp)'
                    indel_type = 'insertion' if inconsistency < 0 else 'deletion'
                    # Non-N bases beyond the indel length itself are counted as mismatches.
                    mismatches = max(0, not_ns_number - indel_length)
                    ca_output.stdout_f.write(('\t\t\t  %s between these two alignments: %s of length %d; %d mismatches (number of Ns: %d)')
                                                 % (indel_class, indel_type, indel_length, mismatches, ns_number) + reason_msg + '\n')
                    indels_info.indels_list.append(indel_length)
                    if indel_type == 'insertion':
                        indels_info.insertions += indel_length
                    else:
                        indels_info.deletions += indel_length
                    indels_info.mismatches += mismatches
                    ca_output.icarus_out_f.write('indel: ' + indel_class.lower() + reason_msg + '\n')
            else:
                if qconfig.strict_NA:
                    # With strict_NA a local misassembly also closes the current aligned block.
                    aligned_lengths.append(cur_aligned_length)
                    contig_aligned_length += cur_aligned_length
                    cur_aligned_length = 0

                if distance_on_contig < 0:
                    #There is an overlap between the two alignments, a local misassembly
                    ca_output.stdout_f.write('\t\t\t  Overlap between these two alignments (local misassembly).')
                elif distance_on_contig > 0:
                    #There is a small gap between the two alignments, a local misassembly
                    ca_output.stdout_f.write('\t\t\t  Gap between these two alignments (local misassembly).')
                elif inconsistency < 0:
                    ca_output.stdout_f.write('\t\t\t  Overlap between these two alignments (local misassembly).')
                else:
                    ca_output.stdout_f.write('\t\t\t  Gap between these two alignments (local misassembly).')
                ca_output.stdout_f.write(' Inconsistency = ' + str(inconsistency) + reason_msg + '\n')
                ca_output.icarus_out_f.write('local misassembly' + reason_msg + '\n')
                region_misassemblies.append(Misassembly.LOCAL)

        prev_align = next_align
        # Add the next alignment's length; a negative distance_on_contig (overlap) is subtracted.
        cur_aligned_length += prev_align.len2 - (-distance_on_contig if distance_on_contig < 0 else 0)

    #Record the very last alignment
    # prev_align always holds the final alignment here, including when the loop
    # never ran (single alignment), where next_align would be unbound.
    last_align = prev_align
    i = len(sorted_aligns) - 1
    ca_output.stdout_f.write('\t\t\tReal Alignment %d: %s' % (i + 1, str(last_align)) + '\n')
    ca_output.icarus_out_f.write(last_align.icarus_report_str() + '\n')
    ref_aligns.setdefault(last_align.ref, []).append(last_align)
    ca_output.coords_filtered_f.write(str(last_align) + '\n')
    aligned_lengths.append(cur_aligned_length)
    contig_aligned_length += cur_aligned_length

    # Argument order matches the "len" / "aligned" labels in the message.
    assert contig_aligned_length <= len(contig_seq), "Internal QUAST bug: contig aligned length is greater than " \
                                                     "contig length (contig: %s, len: %d, aligned: %d)!" % \
                                                     (sorted_aligns[0].contig, len(contig_seq), contig_aligned_length)

    return is_misassembled, misassembly_internal_overlap, references_misassemblies, indels_info, misassemblies_matched_sv
Example #5
0
def process_misassembled_contig(sorted_aligns, is_cyclic, aligned_lengths, region_misassemblies, ref_lens, ref_aligns,
                                ref_features, contig_seq, misassemblies_by_ref, istranslocations_by_ref, region_struct_variations,
                                ca_output):
    """Classify and report the junction between every pair of consecutive alignments of one contig.

    Works in two passes over the adjacent pairs of ``sorted_aligns``:

    1. Every pair is evaluated with ``is_misassembly`` and the verdicts are
       stored, so that ``detect_potential_mge`` (run only when
       ``qconfig.large_genome`` is set) can inspect all breakpoints of the
       contig at once.
    2. Every pair is reported as exactly one of: structural variation, scaffold
       gap (extensive or local), possible transposable element, extensive
       misassembly, fake misassembly, short indel / mismatch stretch, or local
       misassembly.

    Args:
        sorted_aligns: non-empty sequence of alignment objects exposing ``ref``,
            ``contig``, ``s1``, ``e1``, ``s2``, ``e2``, ``len2``,
            ``icarus_report_str()`` and ``coords_str()``.
            NOTE(review): presumably ordered along the contig - confirm with caller.
        is_cyclic: forwarded unchanged to ``is_misassembly``.
        aligned_lengths: list, extended in place with the length of every
            contiguous aligned block that gets closed.
        region_misassemblies: list, extended in place with ``Misassembly.*`` codes.
        ref_lens: forwarded to ``is_fragmented_ref_fake_translocation`` and ``is_misassembly``.
        ref_aligns: dict ``ref -> list of alignments``, extended in place.
        ref_features: dict ``ref -> {e1 coordinate: 'M'}``, updated in place for
            both sides of every extensive misassembly.
        contig_seq: contig sequence; only its ``len()`` is used directly here, the
            object itself is forwarded to helpers.
        misassemblies_by_ref: mapping ``reference -> list``, extended in place with
            ``Misassembly.*`` codes (keys come from ``get_ref_by_chromosome``).
        istranslocations_by_ref: nested mapping ``[ref1][ref2] -> count``,
            incremented symmetrically for interspecies translocations.
        region_struct_variations: forwarded unchanged to ``is_misassembly``.
        ca_output: object exposing the writable handles ``stdout_f``,
            ``misassembly_f``, ``icarus_out_f`` and ``coords_filtered_f``.

    Returns:
        Tuple ``(is_misassembled, misassembly_internal_overlap, indels_info,
        cnt_misassemblies, contig_aligned_length)``.

    Raises:
        AssertionError: if the summed aligned length exceeds ``len(contig_seq)``.
        IndexError: if ``sorted_aligns`` is empty.
    """
    misassembly_internal_overlap = 0
    prev_align = sorted_aligns[0]
    cur_aligned_length = prev_align.len2
    is_misassembled = False
    contig_is_printed = False
    indels_info = IndelsInfo()
    contig_aligned_length = 0  # for internal debugging purposes
    cnt_misassemblies = 0

    # Pass 1: evaluate every junction and remember the verdicts.
    misassemblies = []      # per junction: breakpoint descriptors, or [] when not extensive
    misassembly_info = []   # per junction: everything pass 2 needs for reporting
    for i in range(len(sorted_aligns) - 1):
        next_align = sorted_aligns[i + 1]

        is_fake_translocation = is_fragmented_ref_fake_translocation(prev_align, next_align, ref_lens)
        internal_overlap, overlap_msg = exclude_internal_overlaps(prev_align, next_align, i)
        is_extensive_misassembly, aux_data = is_misassembly(prev_align, next_align, contig_seq, ref_lens,
                                                            is_cyclic, region_struct_variations, is_fake_translocation)
        misassembly_type = ''
        if is_extensive_misassembly: # it is not a Fake translocation, because is_extensive_misassembly is True
            prev_ref, next_ref = get_ref_by_chromosome(prev_align.ref), get_ref_by_chromosome(next_align.ref)
            if prev_align.ref != next_align.ref:  # if chromosomes from different references
                if qconfig.is_combined_ref and prev_ref != next_ref:
                    misassembly_type = 'interspecies translocation'
                else:
                    misassembly_type = 'translocation'
            elif abs(aux_data["inconsistency"]) > qconfig.extensive_misassembly_threshold:
                misassembly_type = 'relocation'
            else: #if strand1 != strand2:
                misassembly_type = 'inversion'
            # Order the two reference coordinates so that start_in_ref <= end_in_ref.
            if next_align.s1 > prev_align.e1:
                start_in_ref, end_in_ref = prev_align.e1, next_align.s1
            else:
                start_in_ref, end_in_ref = next_align.s1, prev_align.e1
            misassemblies.append([(prev_align, start_in_ref, misassembly_type, next_align.len2), (next_align, end_in_ref, misassembly_type)])
        else:
            misassemblies.append([])
        misassembly_info.append((internal_overlap, overlap_msg, is_extensive_misassembly, aux_data, misassembly_type))
        prev_align = next_align
    is_potential_mge = None
    if qconfig.large_genome:
        # Indexed by junction number in pass 2.
        is_potential_mge = detect_potential_mge(misassemblies)

    # Pass 2: report every junction using the verdicts stored above.
    prev_align = sorted_aligns[0]
    for i in range(len(sorted_aligns) - 1):
        next_align = sorted_aligns[i + 1]
        internal_overlap, overlap_msg, is_extensive_misassembly, aux_data, misassembly_type = misassembly_info[i]
        if overlap_msg:
            cur_aligned_length -= internal_overlap
            ca_output.stdout_f.write(overlap_msg)

        inconsistency = aux_data["inconsistency"]
        distance_on_contig = aux_data["distance_on_contig"]
        misassembly_internal_overlap += aux_data["misassembly_internal_overlap"]
        cyclic_moment = aux_data["cyclic_moment"]

        # The left-hand alignment of the pair is reported before the junction verdict.
        ca_output.icarus_out_f.write(prev_align.icarus_report_str() + '\n')
        ca_output.stdout_f.write('\t\t\tReal Alignment %d: %s\n' % (i+1, str(prev_align)))

        ref_aligns.setdefault(prev_align.ref, []).append(prev_align)
        ca_output.coords_filtered_f.write(prev_align.coords_str() + '\n')
        prev_ref, next_ref = get_ref_by_chromosome(prev_align.ref), get_ref_by_chromosome(next_align.ref)
        if aux_data["is_sv"]:
            ca_output.stdout_f.write('\t\t\t  Not a misassembly (structural variation of the genome) between these two alignments\n')
            ca_output.icarus_out_f.write('fake: not a misassembly (structural variation of the genome)\n')
            region_misassemblies.append(Misassembly.MATCHED_SV)
        elif aux_data["is_scaffold_gap"]:
            if abs(inconsistency) > qconfig.extensive_misassembly_threshold:
                scaff_gap_type = ' (extensive)'
                region_misassemblies.append(Misassembly.SCAFFOLD_GAP)
                misassemblies_by_ref[prev_ref].append(Misassembly.SCAFFOLD_GAP)
                ca_output.icarus_out_f.write('fake: scaffold gap size wrong estimation' + scaff_gap_type + '\n')
            else:
                scaff_gap_type = ' (local)'
                region_misassemblies.append(Misassembly.LOCAL_SCAFFOLD_GAP)
                misassemblies_by_ref[prev_ref].append(Misassembly.LOCAL_SCAFFOLD_GAP)
                ca_output.icarus_out_f.write('fake: scaffold gap size wrong estimation' + scaff_gap_type + '\n')
            ca_output.stdout_f.write('\t\t\t  Scaffold gap between these two alignments, ')
            ca_output.stdout_f.write('gap lengths difference (reference vs assembly) = ' + str(inconsistency) + scaff_gap_type + '\n')
        elif is_extensive_misassembly and is_potential_mge and is_potential_mge[i]:
            ca_output.stdout_f.write(
                '\t\t\t  Not a misassembly (possible transposable element) between these two alignments\n')
            ca_output.icarus_out_f.write('fake: not a misassembly (possible transposable element)\n')
            region_misassemblies.append(Misassembly.POTENTIAL_MGE)
        elif is_extensive_misassembly:
            is_misassembled = True
            cnt_misassemblies += 1
            # An extensive misassembly closes the current contiguous aligned block.
            aligned_lengths.append(cur_aligned_length)
            contig_aligned_length += cur_aligned_length
            cur_aligned_length = 0
            if not contig_is_printed:
                # The contig name is written to the misassembly report once, before its first entry.
                ca_output.misassembly_f.write(prev_align.contig + '\n')
                contig_is_printed = True
            ca_output.misassembly_f.write('Extensive misassembly (')
            ca_output.stdout_f.write('\t\t\t  Extensive misassembly (')
            msg = ''
            if misassembly_type == 'interspecies translocation':
                misassembly_id = Misassembly.INTERSPECTRANSLOCATION
                # Counted symmetrically so either reference can be used as the first key.
                istranslocations_by_ref[prev_ref][next_ref] += 1
                istranslocations_by_ref[next_ref][prev_ref] += 1
            elif misassembly_type == 'translocation':
                misassembly_id = Misassembly.TRANSLOCATION
            elif misassembly_type == 'relocation':
                misassembly_id = Misassembly.RELOCATION
                msg = ', inconsistency = ' + str(inconsistency) + \
                      (' [linear representation of circular genome]' if cyclic_moment else '')
            else: #if strand1 != strand2:
                misassembly_id = Misassembly.INVERSION
            region_misassemblies.append(misassembly_id)
            misassemblies_by_ref[prev_ref].append(misassembly_id)
            if misassembly_id == Misassembly.INTERSPECTRANSLOCATION:  # special case
                misassemblies_by_ref[next_ref].append(misassembly_id)
            if is_gap_filled_ns(contig_seq, prev_align, next_align):
                misassembly_type += ', scaffold gap is present'
                # Second code appended at a fixed offset from misassembly_id
                # (SCF_INVERSION - INVERSION).
                # NOTE(review): presumably the SCF_* counterpart of the same type - confirm
                # against the Misassembly constant layout.
                region_misassemblies.append(misassembly_id + (Misassembly.SCF_INVERSION - Misassembly.INVERSION))
            ca_output.stdout_f.write(misassembly_type + msg)
            ca_output.misassembly_f.write(misassembly_type + msg)
            ca_output.icarus_out_f.write(misassembly_type + msg)
            ca_output.stdout_f.write(') between these two alignments\n')
            ca_output.misassembly_f.write(') between %s %s and %s %s' % (prev_align.s2, prev_align.e2, next_align.s2, next_align.e2) + '\n')
            ca_output.icarus_out_f.write('\n')
            ref_features.setdefault(prev_align.ref, {})[prev_align.e1] = 'M'
            ref_features.setdefault(next_align.ref, {})[next_align.e1] = 'M'
        else:
            reason_msg = "" + (" [linear representation of circular genome]" if cyclic_moment else "") + \
                         (" [fragmentation of reference genome]" if prev_align.ref != next_align.ref else "")
            if inconsistency == 0 and cyclic_moment:
                ca_output.stdout_f.write('\t\t\t  Not a misassembly' + reason_msg + ' between these two alignments\n')
                ca_output.icarus_out_f.write('fake: not a misassembly' + reason_msg + '\n')
            elif inconsistency == 0 and prev_align.ref != next_align.ref:  # is_fragmented_ref_fake_translocation is True, because is_extensive_misassembly is False
                ca_output.stdout_f.write('\t\t\t  Not a misassembly' + reason_msg + ' between these two alignments\n')
                region_misassemblies.append(Misassembly.FRAGMENTED)
                misassemblies_by_ref[prev_ref].append(Misassembly.FRAGMENTED)
                ca_output.icarus_out_f.write('fake: not a misassembly' + reason_msg + '\n')
            elif abs(inconsistency) <= qconfig.MAX_INDEL_LENGTH and \
                            count_ns_and_not_ns_between_aligns(contig_seq, prev_align, next_align)[1] <= max(qconfig.min_alignment, qconfig.MAX_INDEL_LENGTH):
                ns_number, not_ns_number = count_ns_and_not_ns_between_aligns(contig_seq, prev_align, next_align)

                if inconsistency == 0:
                    ca_output.stdout_f.write(('\t\t\t  Stretch of %d mismatches between these two alignments (number of Ns: %d)' %
                                              (not_ns_number, ns_number)) + reason_msg + '\n')
                    indels_info.mismatches += not_ns_number
                    ca_output.icarus_out_f.write('indel: stretch of mismatches' + reason_msg + '\n')
                else:
                    indel_length = abs(inconsistency)
                    indel_class = 'Indel (<= 5bp)' if indel_length <= qconfig.SHORT_INDEL_THRESHOLD else 'Indel (> 5bp)'
                    indel_type = 'insertion' if inconsistency < 0 else 'deletion'
                    # Non-N bases beyond the indel length itself are counted as mismatches.
                    mismatches = max(0, not_ns_number - indel_length)
                    ca_output.stdout_f.write(('\t\t\t  %s between these two alignments: %s of length %d; %d mismatches (number of Ns: %d)')
                                                 % (indel_class, indel_type, indel_length, mismatches, ns_number) + reason_msg + '\n')
                    indels_info.indels_list.append(indel_length)
                    if indel_type == 'insertion':
                        indels_info.insertions += indel_length
                    else:
                        indels_info.deletions += indel_length
                    indels_info.mismatches += mismatches
                    ca_output.icarus_out_f.write('indel: ' + indel_class.lower() + reason_msg + '\n')
            else:
                if qconfig.strict_NA:
                    # With strict_NA a local misassembly also closes the current aligned block.
                    aligned_lengths.append(cur_aligned_length)
                    contig_aligned_length += cur_aligned_length
                    cur_aligned_length = 0

                if distance_on_contig < 0:
                    #There is an overlap between the two alignments, a local misassembly
                    ca_output.stdout_f.write('\t\t\t  Overlap between these two alignments (local misassembly).')
                elif distance_on_contig > 0:
                    #There is a small gap between the two alignments, a local misassembly
                    ca_output.stdout_f.write('\t\t\t  Gap between these two alignments (local misassembly).')
                elif inconsistency < 0:
                    ca_output.stdout_f.write('\t\t\t  Overlap between these two alignments (local misassembly).')
                else:
                    ca_output.stdout_f.write('\t\t\t  Gap between these two alignments (local misassembly).')
                ca_output.stdout_f.write(' Inconsistency = ' + str(inconsistency) + reason_msg + '\n')
                ca_output.icarus_out_f.write('local misassembly' + reason_msg + '\n')
                region_misassemblies.append(Misassembly.LOCAL)
                misassemblies_by_ref[prev_ref].append(Misassembly.LOCAL)

        prev_align = next_align
        # Add the next alignment's length; a negative distance_on_contig (overlap) is subtracted.
        cur_aligned_length += prev_align.len2 - (-distance_on_contig if distance_on_contig < 0 else 0)

    #Record the very last alignment
    # prev_align always holds the final alignment here, including when the loops
    # never ran (single alignment), where next_align would be unbound.
    last_align = prev_align
    i = len(sorted_aligns) - 1
    ca_output.stdout_f.write('\t\t\tReal Alignment %d: %s' % (i + 1, str(last_align)) + '\n')
    ca_output.icarus_out_f.write(last_align.icarus_report_str() + '\n')
    ref_aligns.setdefault(last_align.ref, []).append(last_align)
    ca_output.coords_filtered_f.write(last_align.coords_str() + '\n')
    aligned_lengths.append(cur_aligned_length)
    contig_aligned_length += cur_aligned_length

    # Argument order matches the "len" / "aligned" labels in the message.
    assert contig_aligned_length <= len(contig_seq), "Internal QUAST bug: contig aligned length is greater than " \
                                                     "contig length (contig: %s, len: %d, aligned: %d)!" % \
                                                     (sorted_aligns[0].contig, len(contig_seq), contig_aligned_length)

    return is_misassembled, misassembly_internal_overlap, indels_info, cnt_misassemblies, contig_aligned_length