Ejemplo n.º 1
0
def analyze_coverage(ref_aligns, reference_chromosomes, ns_by_chromosomes, used_snps_fpath):
    """Count covered reference bases and tally SNP/indel statistics.

    For every alignment the operations yielded by ``parse_cs_tag(align.cigar)``
    are walked in order:

    * ``:<n>``      -- run of ``n`` bases, advances both positions;
    * ``*<r><c>``   -- substitution of reference base ``r`` by contig base ``c``;
    * ``+<seq>``    -- bases present only in the contig (insertion);
    * ``-<seq>``    -- bases present only in the reference (deletion);
    * anything else -- advances both positions by ``len(op) - 1``.

    Afterwards the reference positions ``s1..e1`` of the alignment are marked
    as covered, and positions listed in ``ns_by_chromosomes`` are unmarked.

    Args:
        ref_aligns: mapping of chromosome name -> list of alignments. Each
            alignment must provide ``s1``/``e1`` (reference coordinates),
            ``s2``/``e2`` (contig coordinates), ``ref``, ``contig`` and
            ``cigar``.
        reference_chromosomes: mapping of chromosome name -> length.
        ns_by_chromosomes: mapping of reference name -> positions that must
            never count as covered (presumably the N positions of the
            reference -- confirm against the caller).
        used_snps_fpath: path of the SNP/indel report. The file is always
            (re)created; rows are written only when ``qconfig.show_snps`` is
            truthy.

    Returns:
        Tuple ``(covered_bases, indels_info)``.
    """
    snp_row = '%s\t%s\t%d\t%s\t%s\t%d\n'
    indels_info = IndelsInfo()
    # One flag per position; index 0 is allocated but positions are 1-based.
    genome_mapping = dict(
        (chr_name, [0] * (chr_len + 1))
        for chr_name, chr_len in reference_chromosomes.items())

    with open(used_snps_fpath, 'w') as used_snps_f:
        for chr_name, aligns in ref_aligns.items():
            # References that received coverage from this group. Tracking them
            # explicitly avoids reading the loop variable `align` after the
            # loop, which is undefined (or stale) when `aligns` is empty.
            touched_refs = set()
            for align in aligns:
                ref_pos, ctg_pos = align.s1, align.s2
                # Contig coordinates decrease along a reverse-strand alignment.
                strand_direction = 1 if align.s2 < align.e2 else -1
                for op in parse_cs_tag(align.cigar):
                    if op.startswith(':'):
                        n_bases = int(op[1:])
                    else:
                        n_bases = len(op) - 1
                    if op.startswith('*'):
                        ref_nucl, ctg_nucl = op[1].upper(), op[2].upper()
                        # Substitutions involving N are not counted as mismatches.
                        if ctg_nucl != 'N' and ref_nucl != 'N':
                            indels_info.mismatches += 1
                            if qconfig.show_snps:
                                used_snps_f.write(snp_row % (chr_name, align.contig, ref_pos, ref_nucl, ctg_nucl, ctg_pos))
                        ref_pos += 1
                        ctg_pos += 1 * strand_direction
                    elif op.startswith('+'):
                        indels_info.indels_list.append(n_bases)
                        indels_info.insertions += n_bases
                        if qconfig.show_snps and n_bases < qconfig.MAX_INDEL_LENGTH:
                            ref_nucl, ctg_nucl = '.', op[1:].upper()
                            used_snps_f.write(snp_row % (chr_name, align.contig, ref_pos, ref_nucl, ctg_nucl, ctg_pos))
                        # An insertion consumes contig bases only.
                        ctg_pos += n_bases * strand_direction
                    elif op.startswith('-'):
                        indels_info.indels_list.append(n_bases)
                        indels_info.deletions += n_bases
                        if qconfig.show_snps and n_bases < qconfig.MAX_INDEL_LENGTH:
                            ref_nucl, ctg_nucl = op[1:].upper(), '.'
                            used_snps_f.write(snp_row % (chr_name, align.contig, ref_pos, ref_nucl, ctg_nucl, ctg_pos))
                        # A deletion consumes reference bases only.
                        ref_pos += n_bases
                    else:
                        ref_pos += n_bases
                        ctg_pos += n_bases * strand_direction

                ref_coverage = genome_mapping[align.ref]
                if align.s1 < align.e1:
                    for pos in range(align.s1, align.e1 + 1):
                        ref_coverage[pos] = 1
                else:
                    # s1 >= e1: mark s1..end and 1..e1, i.e. the alignment is
                    # treated as running past the end of the reference
                    # (presumably a circular chromosome -- confirm).
                    for pos in range(align.s1, len(ref_coverage)):
                        ref_coverage[pos] = 1
                    for pos in range(1, align.e1 + 1):
                        ref_coverage[pos] = 1
                touched_refs.add(align.ref)

            # Excluded positions never count as covered.
            for ref_name in touched_refs:
                ref_coverage = genome_mapping[ref_name]
                for i in ns_by_chromosomes[ref_name]:
                    ref_coverage[i] = 0

    covered_bases = sum(sum(flags) for flags in genome_mapping.values())
    return covered_bases, indels_info
Ejemplo n.º 2
0
def analyze_contigs(ca_output,
                    contigs_fpath,
                    unaligned_fpath,
                    unaligned_info_fpath,
                    aligns,
                    ref_features,
                    ref_lens,
                    is_cyclic=None):
    """Classify every contig by how it aligns and collect assembly statistics.

    Each contig read from ``contigs_fpath`` ends up with one of the types
    'unaligned', 'correct', 'ambiguous', 'correct_unaligned', 'mis_unaligned'
    or 'misassembled'. A human-readable trace goes to ``ca_output.stdout_f``,
    per-alignment records go to ``ca_output.icarus_out_f`` and the alignments
    that are kept go to ``ca_output.coords_filtered_f``.

    Args:
        ca_output: object exposing the writable streams ``stdout_f``,
            ``icarus_out_f`` and ``coords_filtered_f``; it is also passed to
            ``process_misassembled_contig``.
        contigs_fpath: FASTA file read through ``fastaparser.read_fasta``.
        unaligned_fpath: output file listing contigs without any alignment,
            one name per line.
        unaligned_info_fpath: output TSV (header written here, rows written
            by ``save_unaligned_info``).
        aligns: mapping of contig name -> list of alignments.
        ref_features: forwarded to ``process_misassembled_contig``.
        ref_lens: forwarded to ``get_best_aligns_sets`` and
            ``process_misassembled_contig``.
        is_cyclic: forwarded to the same two helpers.

    Returns:
        Tuple ``(result, ref_aligns, total_indels_info, aligned_lengths,
        misassembled_contigs, misassemblies_in_contigs,
        contigs_aligned_lengths)`` where ``result`` is a dict of counters.
    """
    # The best alignment "captures" a contig when it covers more than
    # `epsilon` of it or leaves fewer than `maxun` bases uncovered.
    maxun = 10
    epsilon = 0.99
    umt = 0.5  # threshold for misassembled contigs with aligned less than $umt * 100% (Unaligned Missassembled Threshold)

    unaligned = 0
    partially_unaligned = 0
    fully_unaligned_bases = 0
    partially_unaligned_bases = 0
    ambiguous_contigs = 0
    ambiguous_contigs_extra_bases = 0
    ambiguous_contigs_len = 0
    half_unaligned_with_misassembly = 0
    misassembly_internal_overlap = 0
    misassemblies_matched_sv = 0

    ref_aligns = dict()
    contigs_aligned_lengths = []
    aligned_lengths = []
    region_misassemblies = []
    misassembled_contigs = dict()
    misassemblies_in_contigs = []

    region_struct_variations = find_all_sv(qconfig.bed)

    istranslocations_by_ref = dict()
    misassemblies_by_ref = defaultdict(list)
    for ref in ref_labels_by_chromosomes.values():
        istranslocations_by_ref[ref] = dict(
            (key, 0) for key in ref_labels_by_chromosomes.values())

    # for counting SNPs and indels (both original (.all_snps) and corrected from Nucmer's local misassemblies)
    total_indels_info = IndelsInfo()

    # Context managers guarantee both reports are closed even when one of the
    # helpers called below raises.
    with open(unaligned_fpath, 'w') as unaligned_file, \
            open(unaligned_info_fpath, 'w') as unaligned_info_file:
        unaligned_info_file.write('\t'.join([
            'Contig', 'Total_length', 'Unaligned_length', 'Unaligned_type',
            'Unaligned_parts'
        ]) + '\n')
        for contig, seq in fastaparser.read_fasta(contigs_fpath):
            #Recording contig stats
            ctg_len = len(seq)
            ca_output.stdout_f.write('CONTIG: %s (%dbp)\n' % (contig, ctg_len))
            contig_type = 'unaligned'
            # Per-contig slots; overwritten below through index -1.
            misassemblies_in_contigs.append(0)
            contigs_aligned_lengths.append(0)

            #Check if this contig aligned to the reference
            if contig in aligns:
                contig_type = 'correct'
                #Pull all aligns for this contig
                num_aligns = len(aligns[contig])

                #Sort aligns by aligned_length * identity - unaligned_length (as we do in BSS)
                sorted_aligns = sorted(aligns[contig],
                                       key=lambda x:
                                       (score_single_align(x), x.len2),
                                       reverse=True)
                top_len = sorted_aligns[0].len2
                top_id = sorted_aligns[0].idy
                top_score = score_single_align(sorted_aligns[0])
                top_aligns = []
                ca_output.stdout_f.write(
                    'Best alignment score: %.1f (LEN: %d, IDY: %.2f)\n' %
                    (top_score, top_len, top_id))

                #Check that top hit captures most of the contig
                if top_len > ctg_len * epsilon or ctg_len - top_len < maxun:
                    #Reset top aligns: aligns that share the same value of longest and highest identity
                    top_aligns.append(sorted_aligns[0])
                    sorted_aligns = sorted_aligns[1:]

                    #Continue grabbing alignments whose score is within the ambiguity threshold of the best one
                    while sorted_aligns and (score_single_align(
                            sorted_aligns[0]) >=
                                             qconfig.ambiguity_score * top_score):
                        top_aligns.append(sorted_aligns[0])
                        sorted_aligns = sorted_aligns[1:]

                    #Mark other alignments as insignificant (former ambiguous)
                    if sorted_aligns:
                        ca_output.stdout_f.write(
                            '\t\tSkipping these alignments as insignificant (option --ambiguity-score is set to "%s"):\n'
                            % str(qconfig.ambiguity_score))
                        for align in sorted_aligns:
                            ca_output.stdout_f.write('\t\t\tSkipping alignment ' +
                                                     str(align) + '\n')

                    if len(top_aligns) == 1:
                        #There is only one top align, life is good
                        ca_output.stdout_f.write(
                            '\t\tOne align captures most of this contig: %s\n' %
                            str(top_aligns[0]))
                        ca_output.icarus_out_f.write(
                            top_aligns[0].icarus_report_str() + '\n')
                        ref_aligns.setdefault(top_aligns[0].ref,
                                              []).append(top_aligns[0])
                        ca_output.coords_filtered_f.write(
                            str(top_aligns[0]) + '\n')
                        aligned_lengths.append(top_aligns[0].len2)
                        contigs_aligned_lengths[-1] = top_aligns[0].len2
                    else:
                        #There is more than one top align
                        ca_output.stdout_f.write(
                            '\t\tThis contig has %d significant alignments. [An ambiguously mapped contig]\n'
                            % len(top_aligns))

                        #Increment count of ambiguously mapped contigs and bases in them
                        ambiguous_contigs += 1
                        # we count only extra bases, so we shouldn't include bases in the first alignment
                        # if --ambiguity-usage is 'none', the number of extra bases will be negative!
                        ambiguous_contigs_len += ctg_len

                        # Alex: skip all alignments or count them as normal (just different aligns of one repeat). Depend on --allow-ambiguity option
                        if qconfig.ambiguity_usage == "none":
                            ambiguous_contigs_extra_bases -= top_aligns[0].len2
                            ca_output.stdout_f.write(
                                '\t\tSkipping these alignments (option --ambiguity-usage is set to "none"):\n'
                            )
                            for align in top_aligns:
                                ca_output.stdout_f.write(
                                    '\t\t\tSkipping alignment ' + str(align) +
                                    '\n')
                        elif qconfig.ambiguity_usage == "one":
                            ambiguous_contigs_extra_bases += 0
                            ca_output.stdout_f.write(
                                '\t\tUsing only first of these alignment (option --ambiguity-usage is set to "one"):\n'
                            )
                            ca_output.stdout_f.write('\t\t\tAlignment: %s\n' %
                                                     str(top_aligns[0]))
                            ca_output.icarus_out_f.write(
                                top_aligns[0].icarus_report_str() + '\n')
                            ref_aligns.setdefault(top_aligns[0].ref,
                                                  []).append(top_aligns[0])
                            aligned_lengths.append(top_aligns[0].len2)
                            contigs_aligned_lengths[-1] = top_aligns[0].len2
                            ca_output.coords_filtered_f.write(
                                str(top_aligns[0]) + '\n')
                            top_aligns = top_aligns[1:]
                            for align in top_aligns:
                                ca_output.stdout_f.write(
                                    '\t\t\tSkipping alignment ' + str(align) +
                                    '\n')
                        elif qconfig.ambiguity_usage == "all":
                            ambiguous_contigs_extra_bases -= top_aligns[0].len2
                            ca_output.stdout_f.write(
                                '\t\tUsing all these alignments (option --ambiguity-usage is set to "all"):\n'
                            )
                            # we count only extra bases, so we shouldn't include bases in the first alignment
                            first_alignment = True
                            contig_type = 'ambiguous'
                            while len(top_aligns):
                                ca_output.stdout_f.write('\t\t\tAlignment: %s\n' %
                                                         str(top_aligns[0]))
                                ca_output.icarus_out_f.write(
                                    top_aligns[0].icarus_report_str(
                                        ambiguity=True) + '\n')
                                ref_aligns.setdefault(top_aligns[0].ref,
                                                      []).append(top_aligns[0])
                                if first_alignment:
                                    first_alignment = False
                                    aligned_lengths.append(top_aligns[0].len2)
                                    contigs_aligned_lengths[-1] = top_aligns[
                                        0].len2
                                ambiguous_contigs_extra_bases += top_aligns[0].len2
                                ca_output.coords_filtered_f.write(
                                    str(top_aligns[0]) + ' ambiguous\n')
                                top_aligns = top_aligns[1:]
                else:
                    # choose appropriate alignments (to maximize total size of contig alignment and reduce # misassemblies)
                    is_ambiguous, too_much_best_sets, sorted_aligns, best_sets = get_best_aligns_sets(
                        sorted_aligns, ctg_len, ca_output.stdout_f, seq, ref_lens,
                        is_cyclic, region_struct_variations)
                    the_best_set = best_sets[0]
                    used_indexes = list(
                        range(len(sorted_aligns))
                        if too_much_best_sets else get_used_indexes(best_sets))
                    if len(used_indexes) < len(sorted_aligns):
                        ca_output.stdout_f.write(
                            '\t\t\tSkipping redundant alignments after choosing the best set of alignments\n'
                        )
                        for idx in set([
                                idx for idx in range(len(sorted_aligns))
                                if idx not in used_indexes
                        ]):
                            ca_output.stdout_f.write(
                                '\t\tSkipping redundant alignment ' +
                                str(sorted_aligns[idx]) + '\n')

                    if is_ambiguous:
                        ca_output.stdout_f.write(
                            '\t\tThis contig has several significant sets of alignments. [An ambiguously mapped contig]\n'
                        )
                        # similar to regular ambiguous contigs, see above
                        ambiguous_contigs += 1
                        ambiguous_contigs_len += ctg_len

                        if qconfig.ambiguity_usage == "none":
                            ambiguous_contigs_extra_bases -= (
                                ctg_len - the_best_set.uncovered)
                            ca_output.stdout_f.write(
                                '\t\tSkipping all alignments in these sets (option --ambiguity-usage is set to "none"):\n'
                            )
                            for idx in used_indexes:
                                ca_output.stdout_f.write(
                                    '\t\t\tSkipping alignment ' +
                                    str(sorted_aligns[idx]) + '\n')
                            # Nothing else is recorded for this contig; this also
                            # skips the 'CONTIG' record written at the end of
                            # the loop body.
                            continue
                        elif qconfig.ambiguity_usage == "one":
                            ambiguous_contigs_extra_bases += 0
                            ca_output.stdout_f.write(
                                '\t\tUsing only the very best set (option --ambiguity-usage is set to "one").\n'
                            )
                            if len(the_best_set.indexes) < len(used_indexes):
                                ca_output.stdout_f.write(
                                    '\t\tSo, skipping alignments from other sets:\n'
                                )
                                for idx in used_indexes:
                                    if idx not in the_best_set.indexes:
                                        ca_output.stdout_f.write(
                                            '\t\t\tSkipping alignment ' +
                                            str(sorted_aligns[idx]) + '\n')
                        elif qconfig.ambiguity_usage == "all":
                            ca_output.stdout_f.write(
                                '\t\tUsing all alignments in these sets (option --ambiguity-usage is set to "all"):\n'
                            )
                            ca_output.stdout_f.write(
                                '\t\t\tThe very best set is shown in details below, the rest are:\n'
                            )
                            for idx, cur_set in enumerate(best_sets[1:]):
                                ca_output.stdout_f.write('\t\t\t\tGroup #%d. Score: %.1f, number of alignments: %d, unaligned bases: %d\n' % \
                                    (idx + 2, cur_set.score, len(cur_set.indexes), cur_set.uncovered))
                            if too_much_best_sets:
                                ca_output.stdout_f.write('\t\t\t\tetc...\n')
                            if len(the_best_set.indexes) < len(used_indexes):
                                ambiguous_contigs_extra_bases -= (
                                    ctg_len - the_best_set.uncovered)
                                ca_output.stdout_f.write(
                                    '\t\t\tList of alignments used in the sets above:\n'
                                )
                                for idx in used_indexes:
                                    align = sorted_aligns[idx]
                                    ca_output.stdout_f.write(
                                        '\t\tAlignment: %s\n' % str(align))
                                    ref_aligns.setdefault(align.ref,
                                                          []).append(align)
                                    ambiguous_contigs_extra_bases += align.len2
                                    ca_output.coords_filtered_f.write(
                                        str(align) + " ambiguous\n")
                                    if idx not in the_best_set.indexes:
                                        ca_output.icarus_out_f.write(
                                            align.icarus_report_str(
                                                is_best=False) + '\n')

                    ca_output.stdout_f.write('\t\t\tThe best set is below. Score: %.1f, number of alignments: %d, unaligned bases: %d\n' % \
                                                 (the_best_set.score, len(the_best_set.indexes), the_best_set.uncovered))
                    real_aligns = [sorted_aligns[i] for i in the_best_set.indexes]

                    # main processing part
                    if len(real_aligns) == 1:
                        the_only_align = real_aligns[0]

                        #There is only one alignment of this contig to the reference
                        ca_output.coords_filtered_f.write(
                            str(the_only_align) + '\n')
                        aligned_lengths.append(the_only_align.len2)
                        contigs_aligned_lengths[-1] = the_only_align.len2

                        begin, end = the_only_align.start(), the_only_align.end()
                        unaligned_bases = (begin - 1) + (ctg_len - end)
                        aligned_bases_in_contig = ctg_len - unaligned_bases
                        is_partially_unaligned = check_partially_unaligned(
                            real_aligns, ctg_len)
                        if is_partially_unaligned:
                            partially_unaligned += 1
                            partially_unaligned_bases += unaligned_bases
                            if aligned_bases_in_contig < umt * ctg_len:
                                contig_type = 'correct_unaligned'
                            ca_output.stdout_f.write(
                                '\t\tThis contig is partially unaligned. (Aligned %d out of %d bases)\n'
                                % (aligned_bases_in_contig, ctg_len))
                            save_unaligned_info(real_aligns, contig, ctg_len,
                                                unaligned_bases,
                                                unaligned_info_file)
                        ca_output.stdout_f.write('\t\tAlignment: %s\n' %
                                                 str(the_only_align))
                        ca_output.icarus_out_f.write(
                            the_only_align.icarus_report_str() + '\n')
                        if is_partially_unaligned:
                            if begin - 1:
                                ca_output.stdout_f.write(
                                    '\t\tUnaligned bases: 1 to %d (%d)\n' %
                                    (begin - 1, begin - 1))
                            if ctg_len - end:
                                ca_output.stdout_f.write(
                                    '\t\tUnaligned bases: %d to %d (%d)\n' %
                                    (end + 1, ctg_len, ctg_len - end))
                            if qconfig.is_combined_ref and aligned_bases_in_contig >= umt * ctg_len:
                                check_for_potential_translocation(
                                    seq, ctg_len, real_aligns,
                                    region_misassemblies, misassemblies_by_ref,
                                    ca_output.stdout_f)
                        ref_aligns.setdefault(the_only_align.ref,
                                              []).append(the_only_align)
                    else:
                        #Sort real alignments by position on the contig
                        sorted_aligns = sorted(real_aligns,
                                               key=lambda x: (x.end(), x.start()))

                        #There is more than one alignment of this contig to the reference
                        ca_output.stdout_f.write(
                            '\t\tThis contig is misassembled. %d total aligns.\n' %
                            num_aligns)
                        unaligned_bases = the_best_set.uncovered
                        aligned_bases_in_contig = ctg_len - unaligned_bases
                        is_partially_unaligned = check_partially_unaligned(
                            sorted_aligns, ctg_len)
                        if is_partially_unaligned:
                            partially_unaligned += 1
                            partially_unaligned_bases += unaligned_bases
                            if aligned_bases_in_contig >= umt * ctg_len:
                                ca_output.stdout_f.write(
                                    '\t\tThis contig is partially unaligned. (Aligned %d out of %d bases)\n'
                                    % (aligned_bases_in_contig, ctg_len))
                            save_unaligned_info(sorted_aligns, contig, ctg_len,
                                                unaligned_bases,
                                                unaligned_info_file)

                        if aligned_bases_in_contig < umt * ctg_len:
                            ca_output.stdout_f.write('\t\t\tWarning! This contig is more unaligned than misassembled. ' + \
                                'Contig length is %d and total length of all aligns is %d\n' % (ctg_len, aligned_bases_in_contig))
                            contigs_aligned_lengths[-1] = sum(
                                align.len2 for align in sorted_aligns)
                            for align in sorted_aligns:
                                ca_output.stdout_f.write('\t\tAlignment: %s\n' %
                                                         str(align))
                                ca_output.icarus_out_f.write(
                                    align.icarus_report_str() + '\n')
                                ca_output.icarus_out_f.write('unknown\n')
                                ca_output.coords_filtered_f.write(
                                    str(align) + '\n')
                                aligned_lengths.append(align.len2)
                                ref_aligns.setdefault(align.ref, []).append(align)

                            half_unaligned_with_misassembly += 1
                            ca_output.stdout_f.write('\t\tUnaligned bases: %d\n' %
                                                     unaligned_bases)
                            contig_type = 'mis_unaligned'
                            # The closing records are written here because the
                            # `continue` below bypasses the end of the loop body.
                            ca_output.icarus_out_f.write('\t'.join([
                                'CONTIG', contig,
                                str(ctg_len), contig_type + '\n'
                            ]))
                            ca_output.stdout_f.write('\n')
                            continue

                        ### processing misassemblies
                        is_misassembled, current_mio, indels_info, misassemblies_matched_sv, cnt_misassemblies, contig_aligned_length = \
                            process_misassembled_contig(sorted_aligns, is_cyclic, aligned_lengths, region_misassemblies,
                                                        ref_lens, ref_aligns, ref_features, seq, misassemblies_by_ref,
                                                        istranslocations_by_ref, region_struct_variations, misassemblies_matched_sv,
                                                        ca_output)
                        contigs_aligned_lengths[-1] = contig_aligned_length
                        misassembly_internal_overlap += current_mio
                        total_indels_info += indels_info
                        if is_misassembled:
                            misassembled_contigs[contig] = ctg_len
                            contig_type = 'misassembled'
                            misassemblies_in_contigs[-1] = cnt_misassemblies
                        if is_partially_unaligned:
                            ca_output.stdout_f.write('\t\tUnaligned bases: %d\n' %
                                                     unaligned_bases)
                            if qconfig.is_combined_ref:
                                check_for_potential_translocation(
                                    seq, ctg_len, sorted_aligns,
                                    region_misassemblies, misassemblies_by_ref,
                                    ca_output.stdout_f)
            else:
                #No aligns to this contig
                ca_output.stdout_f.write(
                    '\t\tThis contig is unaligned. (%d bp)\n' % ctg_len)
                # One name per line; without the separator consecutive names
                # would run together in the report.
                unaligned_file.write(contig + '\n')

                #Increment unaligned contig count and bases
                unaligned += 1
                fully_unaligned_bases += ctg_len
                ca_output.stdout_f.write('\t\tUnaligned bases: %d total: %d\n' %
                                         (ctg_len, fully_unaligned_bases))
                save_unaligned_info([], contig, ctg_len, ctg_len,
                                    unaligned_info_file)

            ca_output.icarus_out_f.write('\t'.join(
                ['CONTIG', contig, str(ctg_len), contig_type]) + '\n')
            ca_output.stdout_f.write('\n')

    misassembled_bases = sum(misassembled_contigs.values())

    result = {
        'region_misassemblies': region_misassemblies,
        'region_struct_variations': (region_struct_variations.get_count()
                                     if region_struct_variations else None),
        'misassemblies_matched_sv': misassemblies_matched_sv,
        'misassembled_contigs': misassembled_contigs,
        'misassembled_bases': misassembled_bases,
        'misassembly_internal_overlap': misassembly_internal_overlap,
        'unaligned': unaligned,
        'partially_unaligned': partially_unaligned,
        'partially_unaligned_bases': partially_unaligned_bases,
        'fully_unaligned_bases': fully_unaligned_bases,
        'ambiguous_contigs': ambiguous_contigs,
        'ambiguous_contigs_extra_bases': ambiguous_contigs_extra_bases,
        'ambiguous_contigs_len': ambiguous_contigs_len,
        'half_unaligned_with_misassembly': half_unaligned_with_misassembly,
        'misassemblies_by_ref': misassemblies_by_ref,
        'istranslocations_by_refs': istranslocations_by_ref,
    }

    return result, ref_aligns, total_indels_info, aligned_lengths, misassembled_contigs, misassemblies_in_contigs, contigs_aligned_lengths
Ejemplo n.º 3
0
def _mark_covered_positions(positions, covered, max_sizes, strict_max_sizes,
                            align_size, strict_align_size):
    """Flag reference positions as covered and raise their per-base maxima."""
    for pos in positions:
        covered[pos] = 1
        max_sizes[pos] = max(align_size, max_sizes[pos])
        strict_max_sizes[pos] = max(strict_align_size, strict_max_sizes[pos])


def _summarize_align_sizes(sizes_by_ref):
    """Flatten per-reference size tracks into (sorted sizes, EAx table, mean).

    The EAx table has 101 entries: entry ``x`` (0..99) is the value found
    ``x`` percent of the way through the descending list, and entry 100 is
    the smallest value.
    """
    sizes = sorted(
        (size for track in sizes_by_ref.values() for size in track),
        reverse=True)
    ea_x = [sizes[(len(sizes) * pct) // 100] for pct in range(100)]
    ea_x.append(sizes[-1])
    return sizes, ea_x, int(sum(sizes) / len(sizes))


def analyze_coverage(ref_aligns, reference_chromosomes, ns_by_chromosomes,
                     used_snps_fpath):
    """Compute reference coverage, SNP/indel counts and EAx statistics.

    Every alignment's ``cigar`` attribute is walked op by op (as produced by
    ``parse_cs_tag``): ``:<n>`` is a run of matches, ``*<ref><ctg>`` a
    substitution, ``+<seq>`` an insertion in the contig and ``-<seq>`` a
    deletion from the contig; any other op advances both coordinates by
    ``len(op) - 1``.  Substitutions and indels are accumulated in an
    ``IndelsInfo`` and, when ``qconfig.show_snps`` is set, written to
    ``used_snps_fpath`` as tab-separated lines.

    In parallel, each reference position touched by an alignment is flagged
    as covered and remembers the largest contig alignment size that spans it
    (both ``len2_excluding_local_misassemblies`` and the "strict"
    ``len2_including_local_misassemblies`` variant).  Positions listed in
    ``ns_by_chromosomes`` are reset to uncovered / size 0 afterwards.

    Args:
        ref_aligns: mapping of reference name to a list of alignment
            objects exposing ``s1``, ``e1``, ``s2``, ``e2``, ``ref``,
            ``contig``, ``cigar``, ``len2`` and the two ``len2_*`` lengths.
        reference_chromosomes: mapping of reference name to its length.
        ns_by_chromosomes: mapping of reference name to an iterable of
            positions to exclude from coverage.
        used_snps_fpath: path of the SNP report to (over)write.

    Returns:
        A 14-tuple ``(covered_bases, indels_info, ea_x_max,
        strict_ea_x_max, ea_mean_max, strict_ea_mean_max, p5k, p10k, p15k,
        p20k, strict_p5k, strict_p10k, strict_p15k, strict_p20k)``.
    """
    logger.info("    Enter analyze_coverage")
    indels_info = IndelsInfo()
    max_align_size_per_base = {}
    strict_max_align_size_per_base = {}
    genome_mapping = {}
    genome_length = 0
    for chr_name, chr_len in reference_chromosomes.items():
        # One extra slot so reference coordinates can be used as indexes
        # directly (the wrap-around branch below restarts at position 1).
        genome_mapping[chr_name] = [0] * (chr_len + 1)
        max_align_size_per_base[chr_name] = [0] * (chr_len + 1)
        strict_max_align_size_per_base[chr_name] = [0] * (chr_len + 1)
        genome_length += chr_len
    logger.info("      Genome length: " + str(genome_length))

    alignment_total_length = 0
    with open(used_snps_fpath, 'w') as used_snps_f:
        for chr_name, aligns in ref_aligns.items():
            # Reference whose N positions get masked once all alignments of
            # this group are processed; stays None for an empty group so we
            # never read a stale or unbound alignment.
            masked_ref = None
            for align in aligns:
                # Vars with 1 are on the reference, vars with 2 are on the contig
                ref_pos, ctg_pos = align.s1, align.s2
                strand_direction = 1 if align.s2 < align.e2 else -1
                for op in parse_cs_tag(align.cigar):
                    if op.startswith(':'):
                        n_bases = int(op[1:])
                    else:
                        n_bases = len(op) - 1
                    if op.startswith('*'):
                        ref_nucl, ctg_nucl = op[1].upper(), op[2].upper()
                        # Substitutions involving N are not counted.
                        if ctg_nucl != 'N' and ref_nucl != 'N':
                            indels_info.mismatches += 1
                            if qconfig.show_snps:
                                used_snps_f.write(
                                    '%s\t%s\t%d\t%s\t%s\t%d\n' %
                                    (chr_name, align.contig, ref_pos, ref_nucl,
                                     ctg_nucl, ctg_pos))
                        ref_pos += 1
                        ctg_pos += 1 * strand_direction
                    elif op.startswith('+'):
                        indels_info.indels_list.append(n_bases)
                        indels_info.insertions += n_bases
                        if qconfig.show_snps and n_bases < qconfig.MAX_INDEL_LENGTH:
                            ref_nucl, ctg_nucl = '.', op[1:].upper()
                            used_snps_f.write('%s\t%s\t%d\t%s\t%s\t%d\n' %
                                              (chr_name, align.contig, ref_pos,
                                               ref_nucl, ctg_nucl, ctg_pos))
                        # Inserted bases exist only on the contig.
                        ctg_pos += n_bases * strand_direction
                    elif op.startswith('-'):
                        indels_info.indels_list.append(n_bases)
                        indels_info.deletions += n_bases
                        if qconfig.show_snps and n_bases < qconfig.MAX_INDEL_LENGTH:
                            ref_nucl, ctg_nucl = op[1:].upper(), '.'
                            used_snps_f.write('%s\t%s\t%d\t%s\t%s\t%d\n' %
                                              (chr_name, align.contig, ref_pos,
                                               ref_nucl, ctg_nucl, ctg_pos))
                        # Deleted bases exist only on the reference.
                        ref_pos += n_bases
                    else:
                        ref_pos += n_bases
                        ctg_pos += n_bases * strand_direction

                # Use the same lengths that are used to compute NGAx
                # (regular and strict variants respectively).
                align_size = align.len2_excluding_local_misassemblies
                strict_align_size = align.len2_including_local_misassemblies
                alignment_total_length += align_size

                ref = align.ref
                covered = genome_mapping[ref]
                if align.s1 < align.e1:
                    spans = (range(align.s1, align.e1 + 1),)
                else:
                    # Start is not before end: treat the alignment as
                    # running to the end of the reference and continuing
                    # from position 1.
                    spans = (range(align.s1, len(covered)),
                             range(1, align.e1 + 1))
                for span in spans:
                    _mark_covered_positions(
                        span, covered, max_align_size_per_base[ref],
                        strict_max_align_size_per_base[ref], align_size,
                        strict_align_size)
                masked_ref = ref

            if masked_ref is not None:
                # Positions listed in ns_by_chromosomes count neither as
                # covered nor towards the per-base alignment sizes.  All
                # three tracks are reset at the N position itself.
                for n_pos in ns_by_chromosomes[masked_ref]:
                    genome_mapping[masked_ref][n_pos] = 0
                    max_align_size_per_base[masked_ref][n_pos] = 0
                    strict_max_align_size_per_base[masked_ref][n_pos] = 0

    covered_bases = sum(sum(track) for track in genome_mapping.values())

    sizes, ea_x_max, ea_mean_max = _summarize_align_sizes(
        max_align_size_per_base)
    p5k = P(sizes, 5000)
    p10k = P(sizes, 10000)
    p15k = P(sizes, 15000)
    p20k = P(sizes, 20000)

    strict_sizes, strict_ea_x_max, strict_ea_mean_max = _summarize_align_sizes(
        strict_max_align_size_per_base)
    strict_p5k = P(strict_sizes, 5000)
    strict_p10k = P(strict_sizes, 10000)
    strict_p15k = P(strict_sizes, 15000)
    strict_p20k = P(strict_sizes, 20000)

    # The ratio is only logged; report NaN instead of crashing when nothing
    # on the reference is covered.
    if covered_bases:
        duplication_ratio = alignment_total_length / covered_bases
    else:
        duplication_ratio = float('nan')
    logger.info("      Duplication ratio = %.2f = %d/%d" %
                (duplication_ratio, alignment_total_length, covered_bases))
    logger.info("      EA50max = {}".format(ea_x_max[50]))
    logger.info("      Strict EA50max = {}".format(strict_ea_x_max[50]))
    logger.info("      len2 NGA50 = {}".format(
        N50.NG50_and_LG50(
            [align.len2 for aligns in ref_aligns.values() for align in aligns],
            genome_length,
            need_sort=True)[0]))
    logger.info("      len2_excluding_local_misassemblies NGA50 = {}".format(
        N50.NG50_and_LG50([
            align.len2_excluding_local_misassemblies
            for aligns in ref_aligns.values() for align in aligns
        ],
                          genome_length,
                          need_sort=True)[0]))

    return covered_bases, indels_info, ea_x_max, strict_ea_x_max, ea_mean_max, strict_ea_mean_max, p5k, p10k, p15k, p20k, strict_p5k, strict_p10k, strict_p15k, strict_p20k
Ejemplo n.º 4
0
def analyze_contigs(ca_output,
                    contigs_fpath,
                    unaligned_fpath,
                    aligns,
                    ref_features,
                    ref_lens,
                    cyclic=None):
    """Classify every contig by how its alignments cover it.

    Each contig read from ``contigs_fpath`` is looked up in ``aligns`` and
    handled as one of: unaligned, covered by a single top alignment,
    ambiguously mapped, partially unaligned, or (potentially) misassembled.
    A textual log goes to ``ca_output.stdout_f``, per-alignment and
    per-contig records go to ``ca_output.icarus_out_f``, the alignments that
    are kept go to ``ca_output.coords_filtered_f``, and the names of contigs
    without any alignment go to ``unaligned_fpath``.

    Alignments are accessed positionally here: index 5 is used as the
    aligned length, index 6 as the identity, and index 7 as the key under
    which the alignment is stored in ``ref_aligns`` (presumably the
    reference name -- confirm against the alignment record definition).

    NOTE: this variant uses Python 2 constructs (``print >> file``,
    ``xrange``, ``dict.itervalues``).

    Args:
        ca_output: object exposing the ``stdout_f``, ``icarus_out_f`` and
            ``coords_filtered_f`` file handles; ``coords_filtered_f`` is
            closed before returning.
        contigs_fpath: FASTA file passed to ``fastaparser.read_fasta``.
        unaligned_fpath: path of the file that lists unaligned contig names.
        aligns: mapping of contig name to its list of alignments.
        ref_features: forwarded to ``process_misassembled_contig``.
        ref_lens: forwarded to ``get_best_aligns_sets`` and
            ``process_misassembled_contig``.
        cyclic: forwarded to ``get_best_aligns_sets`` and
            ``process_misassembled_contig``.

    Returns:
        ``(result, ref_aligns, total_indels_info, aligned_lengths,
        misassembled_contigs)`` where ``result`` is a dict of the counters
        collected below, ``ref_aligns`` maps alignment field 7 to the list of
        kept alignments, ``aligned_lengths`` is the list of kept aligned
        lengths and ``misassembled_contigs`` maps contig name to its length.
    """
    # A top alignment "captures most of the contig" when it leaves fewer than
    # maxun bases uncovered or spans more than epsilon of the contig length.
    maxun = 10
    epsilon = 0.99
    umt = 0.5  # threshold for misassembled contigs with aligned less than $umt * 100% (Unaligned Missassembled Threshold)

    unaligned = 0
    partially_unaligned = 0
    fully_unaligned_bases = 0
    partially_unaligned_bases = 0
    ambiguous_contigs = 0
    ambiguous_contigs_extra_bases = 0
    ambiguous_contigs_len = 0
    partially_unaligned_with_misassembly = 0
    partially_unaligned_with_significant_parts = 0
    misassembly_internal_overlap = 0
    contigs_with_istranslocations = 0
    misassemblies_matched_sv = 0

    ref_aligns = dict()
    aligned_lengths = []
    region_misassemblies = []
    misassembled_contigs = dict()

    region_struct_variations = find_all_sv(qconfig.bed)

    # Square table label -> label -> count, initialised to zero; it is handed
    # to process_misassembled_contig and returned as 'istranslocations_by_refs'.
    # ref_labels_by_chromosomes is not defined in this function (module scope).
    references_misassemblies = {}
    for ref in ref_labels_by_chromosomes.values():
        references_misassemblies[ref] = dict(
            (key, 0) for key in ref_labels_by_chromosomes.values())

    # for counting SNPs and indels (both original (.all_snps) and corrected from Nucmer's local misassemblies)
    total_indels_info = IndelsInfo()

    unaligned_file = open(unaligned_fpath, 'w')
    for contig, seq in fastaparser.read_fasta(contigs_fpath):
        #Recording contig stats
        ctg_len = len(seq)
        print >> ca_output.stdout_f, 'CONTIG: %s (%dbp)' % (contig, ctg_len)
        contig_type = 'unaligned'

        #Check if this contig aligned to the reference
        if contig in aligns:
            # NOTE(review): sub_seq / ns_pos computed in this loop are not
            # read anywhere else in this function.
            for align in aligns[contig]:
                #sub_seq = seq[align.start(): align.end()]
                sub_seq = seq[_start(align):_end(align)]
                if 'N' in sub_seq:
                    ns_pos = [
                        pos for pos in xrange(_start(align), _end(align))
                        if seq[pos] == 'N'
                    ]
#                    ns_pos = [pos for pos in xrange(align.start(), align.end()) if seq[pos] == 'N']
            contig_type = 'correct'
            #Pull all aligns for this contig
            num_aligns = len(aligns[contig])

            #Sort aligns by aligned_length * identity - unaligned_length (as we do in BSS)
            sorted_aligns = sorted(aligns[contig],
                                   key=lambda x: (score_single_align(x), x[5]),
                                   reverse=True)
            top_len = sorted_aligns[0][5]
            top_id = sorted_aligns[0][6]
            top_score = score_single_align(sorted_aligns[0])
            top_aligns = []
            print >> ca_output.stdout_f, 'Top Length: %d  Top ID: %.2f (Score: %.1f)' % (
                top_len, top_id, top_score)

            #Check that top hit captures most of the contig
            if top_len > ctg_len * epsilon or ctg_len - top_len < maxun:
                #Reset top aligns: aligns that share the same value of longest and highest identity
                top_aligns.append(sorted_aligns[0])
                sorted_aligns = sorted_aligns[1:]

                #Continue grabbing alignments whose score is at least
                #qconfig.ambiguity_score times the top score
                #while sorted_aligns and top_len == sorted_aligns[0][5] and top_id == sorted_aligns[0][6]:
                while sorted_aligns and (score_single_align(
                        sorted_aligns[0]) >=
                                         qconfig.ambiguity_score * top_score):
                    top_aligns.append(sorted_aligns[0])
                    sorted_aligns = sorted_aligns[1:]

                #Mark other alignments as insignificant (former ambiguous)
                if sorted_aligns:
                    print >> ca_output.stdout_f, '\t\tSkipping these alignments as insignificant (option --ambiguity-score is set to "%s"):' % str(
                        qconfig.ambiguity_score)
                    for align in sorted_aligns:
                        print >> ca_output.stdout_f, '\t\t\tSkipping alignment ', align

                if len(top_aligns) == 1:
                    #There is only one top align, life is good
                    print >> ca_output.stdout_f, '\t\tOne align captures most of this contig: %s' % str(
                        top_aligns[0])
                    #                    print >> ca_output.icarus_out_f, top_aligns[0].icarus_report_str()
                    print >> ca_output.icarus_out_f, icarus_report_str(
                        top_aligns[0])
                    ref_aligns.setdefault(top_aligns[0][7],
                                          []).append(top_aligns[0])
                    print >> ca_output.coords_filtered_f, str(top_aligns[0])
                    aligned_lengths.append(top_aligns[0][5])
                else:
                    #There is more than one top align
                    print >> ca_output.stdout_f, '\t\tThis contig has %d significant alignments. [An ambiguously mapped contig]' % len(
                        top_aligns)

                    #Increment count of ambiguously mapped contigs and bases in them
                    ambiguous_contigs += 1
                    # we count only extra bases, so we shouldn't include bases in the first alignment
                    # if --ambiguity-usage is 'none', the number of extra bases will be negative!
                    ambiguous_contigs_len += ctg_len

                    # Alex: skip all alignments or count them as normal (just different aligns of one repeat). Depend on --allow-ambiguity option
                    if qconfig.ambiguity_usage == "none":
                        ambiguous_contigs_extra_bases -= top_aligns[0][5]
                        print >> ca_output.stdout_f, '\t\tSkipping these alignments (option --ambiguity-usage is set to "none"):'
                        for align in top_aligns:
                            print >> ca_output.stdout_f, '\t\t\tSkipping alignment ', align
                    elif qconfig.ambiguity_usage == "one":
                        ambiguous_contigs_extra_bases += 0
                        print >> ca_output.stdout_f, '\t\tUsing only first of these alignment (option --ambiguity-usage is set to "one"):'
                        print >> ca_output.stdout_f, '\t\t\tAlignment: %s' % str(
                            top_aligns[0])
                        #                        print >> ca_output.icarus_out_f, top_aligns[0].icarus_report_str()
                        print >> ca_output.icarus_out_f, icarus_report_str(
                            top_aligns[0])
                        ref_aligns.setdefault(top_aligns[0][7],
                                              []).append(top_aligns[0])
                        aligned_lengths.append(top_aligns[0][5])
                        print >> ca_output.coords_filtered_f, str(
                            top_aligns[0])
                        top_aligns = top_aligns[1:]
                        for align in top_aligns:
                            print >> ca_output.stdout_f, '\t\t\tSkipping alignment ', align
                    elif qconfig.ambiguity_usage == "all":
                        # Subtract the first alignment up front; the loop
                        # below adds every alignment (including the first),
                        # so the net effect is "all but the first".
                        ambiguous_contigs_extra_bases -= top_aligns[0][5]
                        print >> ca_output.stdout_f, '\t\tUsing all these alignments (option --ambiguity-usage is set to "all"):'
                        # we count only extra bases, so we shouldn't include bases in the first alignment
                        first_alignment = True
                        while len(top_aligns):
                            print >> ca_output.stdout_f, '\t\t\tAlignment: %s' % str(
                                top_aligns[0])
                            #                            print >> ca_output.icarus_out_f, top_aligns[0].icarus_report_str(ambiguity=True)
                            print >> ca_output.icarus_out_f, icarus_report_str(
                                top_aligns[0], ambiguity=True)
                            ref_aligns.setdefault(top_aligns[0][7],
                                                  []).append(top_aligns[0])
                            # Only the first alignment contributes to aligned_lengths.
                            if first_alignment:
                                first_alignment = False
                                aligned_lengths.append(top_aligns[0][5])
                            ambiguous_contigs_extra_bases += top_aligns[0][5]
                            print >> ca_output.coords_filtered_f, str(
                                top_aligns[0]), "ambiguous"
                            top_aligns = top_aligns[1:]
            else:
                # choose appropriate alignments (to maximize total size of contig alignment and reduce # misassemblies)
                is_ambiguous, too_much_best_sets, sorted_aligns, best_sets = get_best_aligns_sets(
                    sorted_aligns, ctg_len, ca_output.stdout_f, seq, ref_lens,
                    cyclic, region_struct_variations)
                the_best_set = best_sets[0]
                # When there are too many best sets every alignment is
                # treated as used; otherwise only those referenced by the sets.
                used_indexes = range(
                    len(sorted_aligns)
                ) if too_much_best_sets else get_used_indexes(best_sets)
                if len(used_indexes) < len(sorted_aligns):
                    print >> ca_output.stdout_f, '\t\t\tSkipping redundant alignments after choosing the best set of alignments'
                    for idx in set(range(len(sorted_aligns))) - used_indexes:
                        print >> ca_output.stdout_f, '\t\tSkipping redundant alignment', sorted_aligns[
                            idx]

                if is_ambiguous:
                    print >> ca_output.stdout_f, '\t\tThis contig has several significant sets of alignments. [An ambiguously mapped contig]'
                    # similar to regular ambiguous contigs, see above
                    ambiguous_contigs += 1
                    ambiguous_contigs_len += ctg_len

                    if qconfig.ambiguity_usage == "none":
                        ambiguous_contigs_extra_bases -= (
                            ctg_len - the_best_set.uncovered)
                        print >> ca_output.stdout_f, '\t\tSkipping all alignments in these sets (option --ambiguity-usage is set to "none"):'
                        for idx in used_indexes:
                            print >> ca_output.stdout_f, '\t\t\tSkipping alignment ', sorted_aligns[
                                idx]
                        # NOTE(review): this jumps to the next contig, so the
                        # 'CONTIG' record written at the bottom of the loop
                        # is not emitted to icarus_out_f for this contig.
                        continue
                    elif qconfig.ambiguity_usage == "one":
                        ambiguous_contigs_extra_bases += 0
                        print >> ca_output.stdout_f, '\t\tUsing only the very best set (option --ambiguity-usage is set to "one").'
                        if len(the_best_set.indexes) < len(used_indexes):
                            print >> ca_output.stdout_f, '\t\tSo, skipping alignments from other sets:'
                            for idx in used_indexes:
                                if idx not in the_best_set.indexes:
                                    print >> ca_output.stdout_f, '\t\t\tSkipping alignment ', sorted_aligns[
                                        idx]
                    elif qconfig.ambiguity_usage == "all":
                        print >> ca_output.stdout_f, '\t\tUsing all alignments in these sets (option --ambiguity-usage is set to "all"):'
                        print >> ca_output.stdout_f, '\t\t\tThe very best set is shown in details below, the rest are:'
                        for idx, cur_set in enumerate(best_sets[1:]):
                            print >> ca_output.stdout_f, '\t\t\t\tGroup #%d. Score: %.1f, number of alignments: %d, unaligned bases: %d' % \
                                (idx + 2, cur_set.score, len(cur_set.indexes), cur_set.uncovered)
                        if too_much_best_sets:
                            print >> ca_output.stdout_f, '\t\t\t\tetc...'
                        if len(the_best_set.indexes) < len(used_indexes):
                            # Remove the best set's covered bases first; the
                            # loop below then adds the length of every used
                            # alignment.
                            ambiguous_contigs_extra_bases -= (
                                ctg_len - the_best_set.uncovered)
                            print >> ca_output.stdout_f, '\t\t\tList of alignments used in the sets above:'
                            for idx in used_indexes:
                                align = sorted_aligns[idx]
                                print >> ca_output.stdout_f, '\t\tAlignment: %s' % str(
                                    align)
                                ref_aligns.setdefault(align[7],
                                                      []).append(align)
                                ambiguous_contigs_extra_bases += align[5]
                                print >> ca_output.coords_filtered_f, str(
                                    align), "ambiguous"
                                if idx not in the_best_set.indexes:
                                    print >> ca_output.icarus_out_f, icarus_report_str(
                                        align, is_best=False)
#                                    print >> ca_output.icarus_out_f, align.icarus_report_str(is_best=False)

                print >> ca_output.stdout_f, '\t\t\tThe best set is below. Score: %.1f, number of alignments: %d, unaligned bases: %d' % \
                                             (the_best_set.score, len(the_best_set.indexes), the_best_set.uncovered)
                real_aligns = [sorted_aligns[i] for i in the_best_set.indexes]

                # main processing part
                if len(real_aligns) == 1:
                    the_only_align = real_aligns[0]

                    #There is only one alignment of this contig to the reference
                    print >> ca_output.coords_filtered_f, str(the_only_align)
                    aligned_lengths.append(the_only_align[5])

                    #                    begin, end = the_only_align.start(), the_only_align.end()
                    begin, end = _start(the_only_align), _end(the_only_align)
                    unaligned_bases = 0
                    # Unaligned bases are those before `begin` and after `end`
                    # (contig coordinates are treated as starting at 1 here).
                    if (begin - 1) or (ctg_len - end):
                        partially_unaligned += 1
                        unaligned_bases = (begin - 1) + (ctg_len - end)
                        partially_unaligned_bases += unaligned_bases
                        print >> ca_output.stdout_f, '\t\tThis contig is partially unaligned. (Aligned %d out of %d bases)' % (
                            top_len, ctg_len)
                    print >> ca_output.stdout_f, '\t\tAlignment: %s' % str(
                        the_only_align)
                    #                    print >> ca_output.icarus_out_f, the_only_align.icarus_report_str()
                    print >> ca_output.icarus_out_f, icarus_report_str(
                        the_only_align)
                    if begin - 1:
                        print >> ca_output.stdout_f, '\t\tUnaligned bases: 1 to %d (%d)' % (
                            begin - 1, begin - 1)
                    if ctg_len - end:
                        print >> ca_output.stdout_f, '\t\tUnaligned bases: %d to %d (%d)' % (
                            end + 1, ctg_len, ctg_len - end)
                    # check if both parts (aligned and unaligned) have significant length
                    if (unaligned_bases >= qconfig.significant_part_size) and (
                            ctg_len - unaligned_bases >=
                            qconfig.significant_part_size):
                        print >> ca_output.stdout_f, '\t\tThis contig has both significant aligned and unaligned parts ' \
                                                     '(of length >= %d)!' % (qconfig.significant_part_size)
                        partially_unaligned_with_significant_parts += 1
                        if qconfig.meta:
                            contigs_with_istranslocations += check_for_potential_translocation(
                                seq, ctg_len, real_aligns, ca_output.stdout_f)
                    ref_aligns.setdefault(the_only_align[7],
                                          []).append(the_only_align)
                else:
                    #Sort real alignments by position on the contig
                    sorted_aligns = sorted(real_aligns,
                                           key=lambda x: (_end(x), _start(x)))
                    #                    sorted_aligns = sorted(real_aligns, key=lambda x: (x.end(), x.start()))

                    #There is more than one alignment of this contig to the reference
                    print >> ca_output.stdout_f, '\t\tThis contig is misassembled. %d total aligns.' % num_aligns
                    aligned_bases_in_contig = ctg_len - the_best_set.uncovered

                    # Less than umt of the contig is aligned: count it as
                    # partially unaligned rather than running the
                    # misassembly analysis on it.
                    if aligned_bases_in_contig < umt * ctg_len:
                        print >> ca_output.stdout_f, '\t\t\tWarning! This contig is more unaligned than misassembled. ' + \
                            'Contig length is %d and total length of all aligns is %d' % (ctg_len, aligned_bases_in_contig)
                        for align in sorted_aligns:
                            print >> ca_output.stdout_f, '\t\tAlignment: %s' % str(
                                align)
                            #                            print >> ca_output.icarus_out_f, align.icarus_report_str()
                            print >> ca_output.icarus_out_f, icarus_report_str(
                                align)
                            print >> ca_output.coords_filtered_f, str(align)
                            aligned_lengths.append(align[5])
                            ref_aligns.setdefault(align[7], []).append(align)

                        partially_unaligned_with_misassembly += 1
                        partially_unaligned += 1
                        partially_unaligned_bases += ctg_len - aligned_bases_in_contig
                        print >> ca_output.stdout_f, '\t\tUnaligned bases: %d' % (
                            ctg_len - aligned_bases_in_contig)
                        # check if both parts (aligned and unaligned) have significant length
                        if (aligned_bases_in_contig >=
                                qconfig.significant_part_size) and (
                                    ctg_len - aligned_bases_in_contig >=
                                    qconfig.significant_part_size):
                            print >> ca_output.stdout_f, '\t\tThis contig has both significant aligned and unaligned parts ' \
                                                         '(of length >= %d)!' % (qconfig.significant_part_size)
                            partially_unaligned_with_significant_parts += 1
                            if qconfig.meta:
                                contigs_with_istranslocations += check_for_potential_translocation(
                                    seq, ctg_len, sorted_aligns,
                                    ca_output.stdout_f)
                        # The CONTIG record is written here because the
                        # `continue` below skips the common one at the end of
                        # the loop body.
                        contig_type = 'misassembled'
                        print >> ca_output.icarus_out_f, '\t'.join(
                            ['CONTIG', contig,
                             str(ctg_len), contig_type])
                        print >> ca_output.stdout_f
                        continue

                    ### processing misassemblies
                    is_misassembled, current_mio, references_misassemblies, indels_info, misassemblies_matched_sv = \
                        process_misassembled_contig(sorted_aligns, cyclic, aligned_lengths, region_misassemblies,
                                                    ref_lens, ref_aligns, ref_features, seq, references_misassemblies,
                                                    region_struct_variations, misassemblies_matched_sv, ca_output,
                                                    is_ambiguous)
                    misassembly_internal_overlap += current_mio
                    total_indels_info += indels_info
                    if is_misassembled:
                        misassembled_contigs[contig] = ctg_len
                        contig_type = 'misassembled'
                    if ctg_len - aligned_bases_in_contig >= qconfig.significant_part_size:
                        print >> ca_output.stdout_f, '\t\tThis contig has significant unaligned parts ' \
                                                     '(of length >= %d)!' % (qconfig.significant_part_size)
                        if qconfig.meta:
                            contigs_with_istranslocations += check_for_potential_translocation(
                                seq, ctg_len, sorted_aligns,
                                ca_output.stdout_f)
        else:
            #No aligns to this contig
            print >> ca_output.stdout_f, '\t\tThis contig is unaligned. (%d bp)' % ctg_len
            print >> unaligned_file, contig

            #Increment unaligned contig count and bases
            unaligned += 1
            fully_unaligned_bases += ctg_len
            print >> ca_output.stdout_f, '\t\tUnaligned bases: %d  total: %d' % (
                ctg_len, fully_unaligned_bases)

        # Common per-contig trailer: one CONTIG record plus a blank log line.
        print >> ca_output.icarus_out_f, '\t'.join(
            ['CONTIG', contig, str(ctg_len), contig_type])
        print >> ca_output.stdout_f

    ca_output.coords_filtered_f.close()
    unaligned_file.close()
    # Total length of all contigs flagged as misassembled.
    misassembled_bases = sum(misassembled_contigs.itervalues())

    result = {
        'region_misassemblies':
        region_misassemblies,
        'region_struct_variations':
        region_struct_variations.get_count()
        if region_struct_variations else None,
        'misassemblies_matched_sv':
        misassemblies_matched_sv,
        'misassembled_contigs':
        misassembled_contigs,
        'misassembled_bases':
        misassembled_bases,
        'misassembly_internal_overlap':
        misassembly_internal_overlap,
        'unaligned':
        unaligned,
        'partially_unaligned':
        partially_unaligned,
        'partially_unaligned_bases':
        partially_unaligned_bases,
        'fully_unaligned_bases':
        fully_unaligned_bases,
        'ambiguous_contigs':
        ambiguous_contigs,
        'ambiguous_contigs_extra_bases':
        ambiguous_contigs_extra_bases,
        'ambiguous_contigs_len':
        ambiguous_contigs_len,
        'partially_unaligned_with_misassembly':
        partially_unaligned_with_misassembly,
        'partially_unaligned_with_significant_parts':
        partially_unaligned_with_significant_parts,
        'contigs_with_istranslocations':
        contigs_with_istranslocations,
        'istranslocations_by_refs':
        references_misassemblies
    }

    return result, ref_aligns, total_indels_info, aligned_lengths, misassembled_contigs