Esempio n. 1
0
def check_for_potential_translocation(seq, ctg_len, sorted_aligns, log_out_f):
    """Tell whether the unaligned parts of a contig may hide an interspecies translocation.

    The contig stretches lying between consecutive alignments, and the stretch
    after the last alignment, are collected and their 'N' characters counted.

    seq           -- contig sequence (sliced and searched for 'N')
    ctg_len       -- length of the contig
    sorted_aligns -- alignments of the contig, visited in the given order
                     (presumably sorted by position on the contig -- TODO confirm)
    log_out_f     -- file-like object that receives the log message

    Returns 0 when at least 95% of the unaligned bases are 'N', when fewer than
    qconfig.significant_part_size non-N unaligned bases remain, or when there are
    no unaligned bases at all; otherwise logs a message and returns 1.
    """
    count_ns = 0
    unaligned_len = 0
    prev_end = 0  # end of the previously visited alignment (0 before the first one)
    for align in sorted_aligns:
        align_start = _start(align)
        if align_start > prev_end + 1:
            unaligned_part = seq[prev_end + 1:align_start]
            unaligned_len += len(unaligned_part)
            count_ns += unaligned_part.count('N')
        prev_end = _end(align)

    # Tail of the contig after the last alignment. After the loop prev_end holds the
    # end of the last alignment, so an empty alignment list no longer raises IndexError.
    if ctg_len > prev_end + 1:
        unaligned_part = seq[prev_end + 1:ctg_len]
        unaligned_len += len(unaligned_part)
        count_ns += unaligned_part.count('N')

    # Nothing unaligned: there is no room for a translocation, and the ratio below
    # would otherwise divide by zero.
    if unaligned_len == 0:
        return 0

    # if contig consists mostly of Ns, it cannot contain interspecies translocations
    mostly_ns = count_ns / float(unaligned_len) >= 0.95
    if mostly_ns or unaligned_len - count_ns < qconfig.significant_part_size:
        return 0

    print >> log_out_f, '\t\tIt can contain interspecies translocations.'
    return 1
Esempio n. 2
0
    def intersect_and_go_next(self, align, solids, min_unique_len):
        """Account for `align` overlapping this candidate and say whether to move on.

        If more than `min_unique_len` positions lie between the end of `align` and
        the current unique end, and the candidate is solid, its alignment is
        appended to `solids` and True is returned. Otherwise the unique end is
        pulled back to just before the start of `align`, and the result is True
        only if the candidate is no longer solid.
        """
        room_on_the_right = self.unique_end - _end(align)
        if room_on_the_right > min_unique_len and self.is_solid(min_unique_len):
            solids.append(self.align)
            return True

        # `align` eats into the unique region: cut it just before the start of `align`
        self.unique_end = min(self.unique_end, _start(align) - 1)
        still_solid = self.is_solid(min_unique_len)
        # if self is not solid anymore we can switch to the next PSA
        return not still_solid
Esempio n. 3
0
def get_added_len(set_aligns, cur_align):
    """Return how many positions `cur_align` adds on top of the earlier alignments of the set.

    `set_aligns[-2]` is taken as the alignment preceding `cur_align`, so the set
    must hold at least two items. The result is the part gained to the right of
    that alignment plus whatever is gained to the left while walking further back
    through the set.
    """
    idx = -2
    neighbour = set_aligns[idx]
    cur_start = _start(cur_align)
    total = len(set_aligns)

    gained_right = _end(cur_align) - max(cur_start - 1, _end(neighbour))
    gained_left = 0
    while cur_start < _start(neighbour):
        boundary = _start(neighbour)
        gained_left += boundary - cur_start
        idx -= 1
        if -idx > total:
            # ran out of earlier alignments
            break
        neighbour = set_aligns[idx]
        # in case of overlapping of old and new neighbour: do not count that part twice
        gained_left -= max(0, min(boundary, _end(neighbour)) - cur_start + 1)
    return gained_right + gained_left
Esempio n. 4
0
def is_misassembly(align1, align2, contig_seq, ref_lens, is_cyclic=False, region_struct_variations=None):
    """Classify the junction between two consecutive alignments of a contig.

    Returns a pair `(is_extensive, aux_data)`. `aux_data` is a dict with the keys
    "inconsistency", "distance_on_contig", "misassembly_internal_overlap",
    "cyclic_moment", "is_sv" and "is_scaffold_gap". The boolean is False for
    scaffold gaps, matched structural variations, fake translocations on a
    fragmented reference and regular local misassemblies.
    """
    # Gap (positive) or overlap (negative) between the alignments on the contig
    distance_on_contig = _start(align2) - _end(align1) - 1

    # Distance on the reference; the reference length is passed only for cyclic
    # references and only when both alignments refer to the same align[7].
    distance_args = [align1, align2, align1[2] < align1[3], align2[2] < align2[3]]
    if is_cyclic and ref_lens is not None and align1[7] == align2[7]:
        distance_args.append(ref_lens[align1[7]])
    distance_on_reference, cyclic_moment = distance_between_alignments(*distance_args)

    # Part of the contig overlap that is not explained by an overlap on the reference
    misassembly_internal_overlap = 0
    if distance_on_contig < 0:
        if distance_on_reference >= 0:
            misassembly_internal_overlap = -distance_on_contig
        elif distance_on_reference > distance_on_contig:
            misassembly_internal_overlap = distance_on_reference - distance_on_contig

    strand1 = align1[2] <= align1[3]
    strand2 = align2[2] <= align2[3]
    inconsistency = distance_on_reference - distance_on_contig
    aux_data = {
        "inconsistency": inconsistency,
        "distance_on_contig": distance_on_contig,
        "misassembly_internal_overlap": misassembly_internal_overlap,
        "cyclic_moment": cyclic_moment,
        "is_sv": False,
        "is_scaffold_gap": False,
    }

    if qconfig.scaffolds and contig_seq and check_is_scaffold_gap(inconsistency, contig_seq, align1, align2):
        aux_data["is_scaffold_gap"] = True
        return False, aux_data

    if region_struct_variations and check_sv(align1, align2, inconsistency, region_struct_variations):
        aux_data['is_sv'] = True
        return False, aux_data

    different_refs = align1[7] != align2[7]
    if different_refs and is_fragmented_ref_fake_translocation(align1, align2, ref_lens):
        aux_data["inconsistency"] = sum(__get_border_gaps(align1, align2, ref_lens))
        return False, aux_data

    # Anything left that is not extensive is a regular local misassembly
    is_extensive = (different_refs
                    or abs(inconsistency) > qconfig.extensive_misassembly_threshold
                    or strand1 != strand2)
    return is_extensive, aux_data
Esempio n. 5
0
def is_gap_filled_ns(contig_seq, align1, align2):
    """Tell whether the contig stretch between two alignments is (almost) entirely 'N'.

    contig_seq -- contig sequence (sliced and searched for 'N')
    align1     -- alignment whose end opens the gap
    align2     -- alignment whose start closes the gap

    Returns False when the gap is empty or shorter than
    qconfig.Ns_break_threshold; otherwise True only if more than 95% of the
    gap characters are 'N'.
    """
    gap_in_contig = contig_seq[_end(align1): _start(align2) - 1]
    gap_len = len(gap_in_contig)
    # An empty gap cannot be "filled with Ns"; the guard also rules out division by zero.
    if gap_len == 0 or gap_len < qconfig.Ns_break_threshold:
        return False
    # float() forces true division: with Python 2 integer division the ratio would
    # be truncated to 0 or 1 and the 0.95 threshold would degenerate to "all N".
    return gap_in_contig.count('N') / float(gap_len) > 0.95
Esempio n. 6
0
def count_ns_and_not_ns_between_aligns(contig_seq, align1, align2):
    """Return `(n_count, other_count)` for the contig slice between two alignments.

    The slice runs from the end of `align1` up to (not including) index
    `start of align2 - 1`; `n_count` is the number of 'N' characters in it and
    `other_count` the number of remaining characters.
    """
    left = _end(align1)
    right = _start(align2) - 1
    between = contig_seq[left:right]
    n_count = between.count('N')
    other_count = len(between) - n_count
    return n_count, other_count
Esempio n. 7
0
def exclude_internal_overlaps(align1, align2, i=None, ca_output=None):
    """Trim two consecutive alignments so that they no longer overlap on the contig.

    align1, align2 -- alignment records indexed by position. Fields 2/3 are the
                      coordinates that get trimmed, fields 0/1 are shifted by the
                      same amount, and fields 4/5 are the recomputed lengths
                      (presumably 0/1 are reference and 2/3 contig coordinates --
                      TODO confirm against the record definition).
    i              -- index of align1, used only in the log message
    ca_output      -- optional object whose `stdout_f` receives log messages

    Behaviour depends on qconfig.ambiguity_usage:
      'all'  -- nothing is trimmed;
      'one'  -- the overlap is removed from the shorter alignment (by field 5);
      'none' -- the overlap is removed from both alignments.

    Returns the size of the align1[5] decrease (or 0 if not changed). It is
    important for cur_aligned_len calculation.

    NOTE: the trimmed values are written back in place, which needs mutable
    records (lists). Immutable records (tuples) cannot be updated, so they stay
    untouched and the function returns 0 for them, as it always did before.
    """

    def _write_back(align, shifted):
        # Propagate the recomputed fields to the caller's record. Previously the
        # helpers only rebound a local name, so the trimming was silently lost.
        try:
            align[:] = shifted
        except TypeError:
            # Immutable record: nothing can be propagated; keep legacy behaviour.
            pass

    def __shift_start(align, new_start, indent=''):
        if ca_output is not None:
            print >> ca_output.stdout_f, indent + '%s' % short_str(align),
        shifted = list(align)
        if shifted[2] < shifted[3]:
            shifted[0] += (new_start - shifted[2])
            shifted[2] = new_start
            shifted[5] = shifted[3] - shifted[2] + 1
        else:
            shifted[1] -= (new_start - shifted[3])
            shifted[3] = new_start
            shifted[5] = shifted[2] - shifted[3] + 1
        shifted[4] = shifted[1] - shifted[0] + 1
        if ca_output is not None:
            print >> ca_output.stdout_f, '--> %s' % short_str(tuple(shifted))
        _write_back(align, shifted)

    def __shift_end(align, new_end, indent=''):
        if ca_output is not None:
            print >> ca_output.stdout_f, indent + '%s' % short_str(align),
        shifted = list(align)
        if shifted[2] < shifted[3]:
            shifted[1] -= (shifted[3] - new_end)
            shifted[3] = new_end
            shifted[5] = shifted[3] - shifted[2] + 1
        else:
            shifted[0] += (shifted[2] - new_end)
            shifted[2] = new_end
            shifted[5] = shifted[2] - shifted[3] + 1
        shifted[4] = shifted[1] - shifted[0] + 1
        if ca_output is not None:
            print >> ca_output.stdout_f, '--> %s' % short_str(tuple(shifted))
        _write_back(align, shifted)

    if qconfig.ambiguity_usage == 'all':
        return 0

    distance_on_contig = _start(align2) - _end(align1) - 1
    if distance_on_contig >= 0:  # no overlap
        return 0
    prev_len2 = align1[5]
    if ca_output is not None:
        print >> ca_output.stdout_f, '\t\t\tExcluding internal overlap of size %d between Alignment %d and %d: ' \
                                     % (-distance_on_contig, i+1, i+2),
    if qconfig.ambiguity_usage == 'one':  # left only one of two copies (remove overlap from shorter alignment)
        if align1[5] >= align2[5]:
            __shift_start(align2, _end(align1) + 1)
        else:
            __shift_end(align1, _start(align2) - 1)
    elif qconfig.ambiguity_usage == 'none':  # removing both copies
        if ca_output is not None:
            print >> ca_output.stdout_f
        # Both new borders are computed from the untrimmed alignments: new_end is
        # taken before align2 is shifted, and align2 is shifted before align1.
        new_end = _start(align2) - 1
        __shift_start(align2, _end(align1) + 1, '\t\t\t  ')
        __shift_end(align1, new_end, '\t\t\t  ')
    return prev_len2 - align1[5]
Esempio n. 8
0
def get_best_aligns_sets(sorted_aligns, ctg_len, planta_out_f, seq, ref_lens, is_cyclic=False, region_struct_variations=None):
    """Select the best-scoring set(s) of alignments for one contig.

    The alignments are re-sorted by (end, start). When there are more than 200
    of them, alignments lying inside "solid" regions are filtered out first.
    Stage 1 then builds scored sets with `get_score`, and Stage 2 walks a heap
    of candidate sets to collect every set whose score drop stays within the
    limit derived from qconfig.ambiguity_score.

    sorted_aligns            -- alignments of the contig
    ctg_len                  -- contig length
    planta_out_f             -- file-like object for log messages
    seq, ref_lens, is_cyclic, region_struct_variations
                             -- passed through to `get_score`

    Returns a 4-tuple `(is_ambiguous, too_much_best_sets, sorted_aligns, best_sets)`:
    `is_ambiguous` is False only when the very best set was found with no other
    candidate pending; `too_much_best_sets` is True when the search stopped at
    qconfig.BSS_MAX_SETS_NUMBER with candidates left; `sorted_aligns` is the
    (possibly filtered) re-sorted list that the indexes in `best_sets` refer to.

    NOTE(review): uses `dict.iteritems()` and the `print >>` statement, i.e.
    Python 2 only.
    """
    critical_number_of_aligns = 200  # use additional optimizations for large number of alignments

    # Penalties handed over to get_score; 'extensive' also defines min_unique_len below
    penalties = dict()
    penalties['extensive'] = max(50, int(round(min(qconfig.extensive_misassembly_threshold / 4.0, ctg_len * 0.05)))) - 1
    penalties['local'] = max(2, int(round(min(qconfig.MAX_INDEL_LENGTH / 2.0, ctg_len * 0.01)))) - 1
    penalties['scaffold'] = 5

    sorted_aligns = sorted(sorted_aligns, key=lambda x: (_end(x), _start(x)))

    # trying to optimise the algorithm if the number of possible alignments is large
    if len(sorted_aligns) > critical_number_of_aligns:
        print >> planta_out_f, '\t\t\tSkipping redundant alignments which can\'t be in the best set of alignments A PRIORI'

        # FIRST STEP: find solid aligns (which are present in the best selection for sure)
        # they should have unique (not covered by other aligns) region of length > 2 * extensive_penalty
        min_unique_len = 2 * penalties['extensive']

        possible_solids = [PSA(align) for align in sorted_aligns if align[5] > min_unique_len]
        solids = []
        try:
            # pop() takes candidates from the right end; running out of candidates
            # raises IndexError, which ends the whole search
            cur_PSA = possible_solids.pop()
            for align in reversed(sorted_aligns):
                if align != cur_PSA.align and cur_PSA.intersect_and_go_next(align, solids, min_unique_len):
                    next_PSA = possible_solids.pop()
                    while next_PSA.intersect_and_go_next(cur_PSA.align, solids, min_unique_len):
                        next_PSA = possible_solids.pop()
                    while align != next_PSA.align and next_PSA.intersect_and_go_next(align, solids, min_unique_len):
                        next_PSA = possible_solids.pop()
                    cur_PSA = next_PSA
        except IndexError:  # possible_solids is empty
            pass

        # SECOND STEP: remove all aligns which are inside solid ones
        if len(solids):
            solid_regions = []  # intersection of all solid aligns
            cur_region = SolidRegion(solids[0])
            for align in solids[1:]:
                if _end(align) + 1 < cur_region.start:
                    solid_regions.append(cur_region)
                    cur_region = SolidRegion(align)
                else:  # shift start of the current region
                    cur_region.start = _start(align)
            solid_regions.append(cur_region)

            # NOTE: this is an alias, not a copy -- the appends below also grow `solids`
            filtered_aligns = solids
            idx = 0
            try:
                cur_region = solid_regions.pop()
                for idx, align in enumerate(sorted_aligns):
                    # while/else: the else branch runs when cur_region includes the
                    # alignment, i.e. the loop ended without reaching `break`
                    while not cur_region.include(align):
                        if _start(align) > cur_region.end:
                            cur_region = solid_regions.pop()
                            continue
                        filtered_aligns.append(align)
                        break
                    else:
                        print >> planta_out_f, '\t\tSkipping redundant alignment %s' % (str(align))
            except IndexError:  # solid_regions is empty
                # no regions left: keep the current alignment and everything after it
                filtered_aligns += sorted_aligns[idx:]

            sorted_aligns = sorted(filtered_aligns, key=lambda x: (_end(x), _start(x)))

    # Stage 1: Dynamic programming for finding the best score
    # The list starts with an empty set (score 0, no indexes, ctg_len uncovered)
    all_scored_sets = [ScoredSet(0, [], ctg_len)]
    max_score = 0

    for idx, align in enumerate(sorted_aligns):
        local_max_score = 0
        new_scored_set = None
        for scored_set in all_scored_sets:
            # alignments are cloned before being handed to get_score
            cur_set_aligns = [_clone(sorted_aligns[i]) for i in scored_set.indexes] + [_clone(align)]
            score, uncovered = get_score(scored_set.score, cur_set_aligns, ref_lens, is_cyclic, scored_set.uncovered,
                                         seq, region_struct_variations, penalties)
            if score is None:  # incorrect set, i.e. internal overlap excluding resulted in incorrectly short alignment
                continue
            if score > local_max_score:
                local_max_score = score
                new_scored_set = ScoredSet(score, scored_set.indexes + [idx], uncovered)
        if new_scored_set:
            all_scored_sets.append(new_scored_set)
            if local_max_score > max_score:
                max_score = local_max_score

    # Stage 2: DFS for finding multiple best sets with almost equally good score
    max_allowed_score_drop = max_score - max_score * qconfig.ambiguity_score

    putative_sets = []  # TODO: use priority queue -- minimal score_drop first
    best_sets = []
    for scored_set in all_scored_sets:
        score_drop = max_score - scored_set.score
        if score_drop <= max_allowed_score_drop:
            # NOTE(review): the initial empty set has no indexes; if it ever passes the
            # test above (e.g. max_score == 0) `indexes[-1]` would presumably raise
            # IndexError -- confirm whether that case is reachable
            heappush(putative_sets, PutativeBestSet([scored_set.indexes[-1]], score_drop, scored_set.uncovered))

    ambiguity_check_is_needed = True
    too_much_best_sets = False
    while len(putative_sets):
        putative_set = heappop(putative_sets)
        # special case: no options to enlarge this set, already at the left most point
        if putative_set.indexes[0] == -1:
            # the leading -1 is only a marker and is dropped from the stored indexes
            best_sets.append(ScoredSet(max_score - putative_set.score_drop, putative_set.indexes[1:],
                                       putative_set.uncovered))
            # special case: we added the very best set and we need decide what to do next (based on ambiguity-usage)
            if ambiguity_check_is_needed and len(best_sets) == 1:
                if not putative_sets:  # no ambiguity at all, only one good set was there
                    return False, too_much_best_sets, sorted_aligns, best_sets
                elif not qconfig.ambiguity_usage == 'all':  # several good sets are present (the contig is ambiguous) but we need only the best one
                    return True, too_much_best_sets, sorted_aligns, best_sets
                ambiguity_check_is_needed = False
            if len(best_sets) >= qconfig.BSS_MAX_SETS_NUMBER:
                too_much_best_sets = (len(putative_sets) > 0)
                break
            continue
        # the main part: trying to enlarge the set to the left (use "earlier" alignments)
        align = sorted_aligns[putative_set.indexes[0]]
        local_max_score = 0
        local_uncovered = putative_set.uncovered
        putative_predecessors = {}
        for scored_set in all_scored_sets:
            # we can enlarge the set with "earlier" alignments only
            if scored_set.indexes and scored_set.indexes[-1] >= putative_set.indexes[0]:
                break
            cur_set_aligns = [_clone(sorted_aligns[i]) for i in scored_set.indexes] + [_clone(align)]
            score, uncovered = get_score(scored_set.score, cur_set_aligns, ref_lens, is_cyclic, scored_set.uncovered,
                                         seq, region_struct_variations, penalties)
            if score is not None:
                putative_predecessors[scored_set] = (score, uncovered)
                if score > local_max_score:
                    local_max_score = score
                    local_uncovered = uncovered
                elif score == local_max_score and uncovered < local_uncovered:
                    local_uncovered = uncovered
        for preceding_set, (score, uncovered) in putative_predecessors.iteritems():
            score_drop = local_max_score - score + putative_set.score_drop
            if score_drop > max_allowed_score_drop:
                continue
            # -1 marks "no predecessor": the set cannot be enlarged any further
            new_index = preceding_set.indexes[-1] if preceding_set.indexes else -1
            new_uncovered = uncovered + (putative_set.uncovered - local_uncovered)
            heappush(putative_sets, PutativeBestSet([new_index] + putative_set.indexes,
                                                    score_drop, new_uncovered))

    return True, too_much_best_sets, sorted_aligns, best_sets
Esempio n. 9
0
 def include(self, align):
     """Return True if `align` lies entirely within [self.start, self.end]."""
     if self.start > _start(align):
         # begins before the region
         return False
     return _end(align) <= self.end
Esempio n. 10
0
 def __init__(self, align):
     """Create a region spanning exactly the start..end range of `align`."""
     self.start, self.end = _start(align), _end(align)
Esempio n. 11
0
 def __init__(self, align):
     """Remember `align` and start with its whole start..end range as the unique part."""
     self.align = align
     # the unique range initially covers the full alignment
     self.unique_start, self.unique_end = _start(align), _end(align)
Esempio n. 12
0
def analyze_contigs(ca_output,
                    contigs_fpath,
                    unaligned_fpath,
                    aligns,
                    ref_features,
                    ref_lens,
                    cyclic=None):
    maxun = 10
    epsilon = 0.99
    umt = 0.5  # threshold for misassembled contigs with aligned less than $umt * 100% (Unaligned Missassembled Threshold)

    unaligned = 0
    partially_unaligned = 0
    fully_unaligned_bases = 0
    partially_unaligned_bases = 0
    ambiguous_contigs = 0
    ambiguous_contigs_extra_bases = 0
    ambiguous_contigs_len = 0
    partially_unaligned_with_misassembly = 0
    partially_unaligned_with_significant_parts = 0
    misassembly_internal_overlap = 0
    contigs_with_istranslocations = 0
    misassemblies_matched_sv = 0

    ref_aligns = dict()
    aligned_lengths = []
    region_misassemblies = []
    misassembled_contigs = dict()

    region_struct_variations = find_all_sv(qconfig.bed)

    references_misassemblies = {}
    for ref in ref_labels_by_chromosomes.values():
        references_misassemblies[ref] = dict(
            (key, 0) for key in ref_labels_by_chromosomes.values())

    # for counting SNPs and indels (both original (.all_snps) and corrected from Nucmer's local misassemblies)
    total_indels_info = IndelsInfo()

    unaligned_file = open(unaligned_fpath, 'w')
    for contig, seq in fastaparser.read_fasta(contigs_fpath):
        #Recording contig stats
        ctg_len = len(seq)
        print >> ca_output.stdout_f, 'CONTIG: %s (%dbp)' % (contig, ctg_len)
        contig_type = 'unaligned'

        #Check if this contig aligned to the reference
        if contig in aligns:
            for align in aligns[contig]:
                #sub_seq = seq[align.start(): align.end()]
                sub_seq = seq[_start(align):_end(align)]
                if 'N' in sub_seq:
                    ns_pos = [
                        pos for pos in xrange(_start(align), _end(align))
                        if seq[pos] == 'N'
                    ]
#                    ns_pos = [pos for pos in xrange(align.start(), align.end()) if seq[pos] == 'N']
            contig_type = 'correct'
            #Pull all aligns for this contig
            num_aligns = len(aligns[contig])

            #Sort aligns by aligned_length * identity - unaligned_length (as we do in BSS)
            sorted_aligns = sorted(aligns[contig],
                                   key=lambda x: (score_single_align(x), x[5]),
                                   reverse=True)
            top_len = sorted_aligns[0][5]
            top_id = sorted_aligns[0][6]
            top_score = score_single_align(sorted_aligns[0])
            top_aligns = []
            print >> ca_output.stdout_f, 'Top Length: %d  Top ID: %.2f (Score: %.1f)' % (
                top_len, top_id, top_score)

            #Check that top hit captures most of the contig
            if top_len > ctg_len * epsilon or ctg_len - top_len < maxun:
                #Reset top aligns: aligns that share the same value of longest and highest identity
                top_aligns.append(sorted_aligns[0])
                sorted_aligns = sorted_aligns[1:]

                #Continue grabbing alignments while length and identity are identical
                #while sorted_aligns and top_len == sorted_aligns[0][5] and top_id == sorted_aligns[0][6]:
                while sorted_aligns and (score_single_align(
                        sorted_aligns[0]) >=
                                         qconfig.ambiguity_score * top_score):
                    top_aligns.append(sorted_aligns[0])
                    sorted_aligns = sorted_aligns[1:]

                #Mark other alignments as insignificant (former ambiguous)
                if sorted_aligns:
                    print >> ca_output.stdout_f, '\t\tSkipping these alignments as insignificant (option --ambiguity-score is set to "%s"):' % str(
                        qconfig.ambiguity_score)
                    for align in sorted_aligns:
                        print >> ca_output.stdout_f, '\t\t\tSkipping alignment ', align

                if len(top_aligns) == 1:
                    #There is only one top align, life is good
                    print >> ca_output.stdout_f, '\t\tOne align captures most of this contig: %s' % str(
                        top_aligns[0])
                    #                    print >> ca_output.icarus_out_f, top_aligns[0].icarus_report_str()
                    print >> ca_output.icarus_out_f, icarus_report_str(
                        top_aligns[0])
                    ref_aligns.setdefault(top_aligns[0][7],
                                          []).append(top_aligns[0])
                    print >> ca_output.coords_filtered_f, str(top_aligns[0])
                    aligned_lengths.append(top_aligns[0][5])
                else:
                    #There is more than one top align
                    print >> ca_output.stdout_f, '\t\tThis contig has %d significant alignments. [An ambiguously mapped contig]' % len(
                        top_aligns)

                    #Increment count of ambiguously mapped contigs and bases in them
                    ambiguous_contigs += 1
                    # we count only extra bases, so we shouldn't include bases in the first alignment
                    # if --ambiguity-usage is 'none', the number of extra bases will be negative!
                    ambiguous_contigs_len += ctg_len

                    # Alex: skip all alignments or count them as normal (just different aligns of one repeat). Depend on --allow-ambiguity option
                    if qconfig.ambiguity_usage == "none":
                        ambiguous_contigs_extra_bases -= top_aligns[0][5]
                        print >> ca_output.stdout_f, '\t\tSkipping these alignments (option --ambiguity-usage is set to "none"):'
                        for align in top_aligns:
                            print >> ca_output.stdout_f, '\t\t\tSkipping alignment ', align
                    elif qconfig.ambiguity_usage == "one":
                        ambiguous_contigs_extra_bases += 0
                        print >> ca_output.stdout_f, '\t\tUsing only first of these alignment (option --ambiguity-usage is set to "one"):'
                        print >> ca_output.stdout_f, '\t\t\tAlignment: %s' % str(
                            top_aligns[0])
                        #                        print >> ca_output.icarus_out_f, top_aligns[0].icarus_report_str()
                        print >> ca_output.icarus_out_f, icarus_report_str(
                            top_aligns[0])
                        ref_aligns.setdefault(top_aligns[0][7],
                                              []).append(top_aligns[0])
                        aligned_lengths.append(top_aligns[0][5])
                        print >> ca_output.coords_filtered_f, str(
                            top_aligns[0])
                        top_aligns = top_aligns[1:]
                        for align in top_aligns:
                            print >> ca_output.stdout_f, '\t\t\tSkipping alignment ', align
                    elif qconfig.ambiguity_usage == "all":
                        ambiguous_contigs_extra_bases -= top_aligns[0][5]
                        print >> ca_output.stdout_f, '\t\tUsing all these alignments (option --ambiguity-usage is set to "all"):'
                        # we count only extra bases, so we shouldn't include bases in the first alignment
                        first_alignment = True
                        while len(top_aligns):
                            print >> ca_output.stdout_f, '\t\t\tAlignment: %s' % str(
                                top_aligns[0])
                            #                            print >> ca_output.icarus_out_f, top_aligns[0].icarus_report_str(ambiguity=True)
                            print >> ca_output.icarus_out_f, icarus_report_str(
                                top_aligns[0], ambiguity=True)
                            ref_aligns.setdefault(top_aligns[0][7],
                                                  []).append(top_aligns[0])
                            if first_alignment:
                                first_alignment = False
                                aligned_lengths.append(top_aligns[0][5])
                            ambiguous_contigs_extra_bases += top_aligns[0][5]
                            print >> ca_output.coords_filtered_f, str(
                                top_aligns[0]), "ambiguous"
                            top_aligns = top_aligns[1:]
            else:
                # choose appropriate alignments (to maximize total size of contig alignment and reduce # misassemblies)
                is_ambiguous, too_much_best_sets, sorted_aligns, best_sets = get_best_aligns_sets(
                    sorted_aligns, ctg_len, ca_output.stdout_f, seq, ref_lens,
                    cyclic, region_struct_variations)
                the_best_set = best_sets[0]
                used_indexes = range(
                    len(sorted_aligns)
                ) if too_much_best_sets else get_used_indexes(best_sets)
                if len(used_indexes) < len(sorted_aligns):
                    print >> ca_output.stdout_f, '\t\t\tSkipping redundant alignments after choosing the best set of alignments'
                    for idx in set(range(len(sorted_aligns))) - used_indexes:
                        print >> ca_output.stdout_f, '\t\tSkipping redundant alignment', sorted_aligns[
                            idx]

                if is_ambiguous:
                    print >> ca_output.stdout_f, '\t\tThis contig has several significant sets of alignments. [An ambiguously mapped contig]'
                    # similar to regular ambiguous contigs, see above
                    ambiguous_contigs += 1
                    ambiguous_contigs_len += ctg_len

                    if qconfig.ambiguity_usage == "none":
                        ambiguous_contigs_extra_bases -= (
                            ctg_len - the_best_set.uncovered)
                        print >> ca_output.stdout_f, '\t\tSkipping all alignments in these sets (option --ambiguity-usage is set to "none"):'
                        for idx in used_indexes:
                            print >> ca_output.stdout_f, '\t\t\tSkipping alignment ', sorted_aligns[
                                idx]
                        continue
                    elif qconfig.ambiguity_usage == "one":
                        ambiguous_contigs_extra_bases += 0
                        print >> ca_output.stdout_f, '\t\tUsing only the very best set (option --ambiguity-usage is set to "one").'
                        if len(the_best_set.indexes) < len(used_indexes):
                            print >> ca_output.stdout_f, '\t\tSo, skipping alignments from other sets:'
                            for idx in used_indexes:
                                if idx not in the_best_set.indexes:
                                    print >> ca_output.stdout_f, '\t\t\tSkipping alignment ', sorted_aligns[
                                        idx]
                    elif qconfig.ambiguity_usage == "all":
                        print >> ca_output.stdout_f, '\t\tUsing all alignments in these sets (option --ambiguity-usage is set to "all"):'
                        print >> ca_output.stdout_f, '\t\t\tThe very best set is shown in details below, the rest are:'
                        for idx, cur_set in enumerate(best_sets[1:]):
                            print >> ca_output.stdout_f, '\t\t\t\tGroup #%d. Score: %.1f, number of alignments: %d, unaligned bases: %d' % \
                                (idx + 2, cur_set.score, len(cur_set.indexes), cur_set.uncovered)
                        if too_much_best_sets:
                            print >> ca_output.stdout_f, '\t\t\t\tetc...'
                        if len(the_best_set.indexes) < len(used_indexes):
                            ambiguous_contigs_extra_bases -= (
                                ctg_len - the_best_set.uncovered)
                            print >> ca_output.stdout_f, '\t\t\tList of alignments used in the sets above:'
                            for idx in used_indexes:
                                align = sorted_aligns[idx]
                                print >> ca_output.stdout_f, '\t\tAlignment: %s' % str(
                                    align)
                                ref_aligns.setdefault(align[7],
                                                      []).append(align)
                                ambiguous_contigs_extra_bases += align[5]
                                print >> ca_output.coords_filtered_f, str(
                                    align), "ambiguous"
                                if idx not in the_best_set.indexes:
                                    print >> ca_output.icarus_out_f, icarus_report_str(
                                        align, is_best=False)
#                                    print >> ca_output.icarus_out_f, align.icarus_report_str(is_best=False)

                print >> ca_output.stdout_f, '\t\t\tThe best set is below. Score: %.1f, number of alignments: %d, unaligned bases: %d' % \
                                             (the_best_set.score, len(the_best_set.indexes), the_best_set.uncovered)
                real_aligns = [sorted_aligns[i] for i in the_best_set.indexes]

                # main processing part
                if len(real_aligns) == 1:
                    the_only_align = real_aligns[0]

                    #There is only one alignment of this contig to the reference
                    print >> ca_output.coords_filtered_f, str(the_only_align)
                    aligned_lengths.append(the_only_align[5])

                    #                    begin, end = the_only_align.start(), the_only_align.end()
                    begin, end = _start(the_only_align), _end(the_only_align)
                    unaligned_bases = 0
                    if (begin - 1) or (ctg_len - end):
                        partially_unaligned += 1
                        unaligned_bases = (begin - 1) + (ctg_len - end)
                        partially_unaligned_bases += unaligned_bases
                        print >> ca_output.stdout_f, '\t\tThis contig is partially unaligned. (Aligned %d out of %d bases)' % (
                            top_len, ctg_len)
                    print >> ca_output.stdout_f, '\t\tAlignment: %s' % str(
                        the_only_align)
                    #                    print >> ca_output.icarus_out_f, the_only_align.icarus_report_str()
                    print >> ca_output.icarus_out_f, icarus_report_str(
                        the_only_align)
                    if begin - 1:
                        print >> ca_output.stdout_f, '\t\tUnaligned bases: 1 to %d (%d)' % (
                            begin - 1, begin - 1)
                    if ctg_len - end:
                        print >> ca_output.stdout_f, '\t\tUnaligned bases: %d to %d (%d)' % (
                            end + 1, ctg_len, ctg_len - end)
                    # check if both parts (aligned and unaligned) have significant length
                    if (unaligned_bases >= qconfig.significant_part_size) and (
                            ctg_len - unaligned_bases >=
                            qconfig.significant_part_size):
                        print >> ca_output.stdout_f, '\t\tThis contig has both significant aligned and unaligned parts ' \
                                                     '(of length >= %d)!' % (qconfig.significant_part_size)
                        partially_unaligned_with_significant_parts += 1
                        if qconfig.meta:
                            contigs_with_istranslocations += check_for_potential_translocation(
                                seq, ctg_len, real_aligns, ca_output.stdout_f)
                    ref_aligns.setdefault(the_only_align[7],
                                          []).append(the_only_align)
                else:
                    #Sort real alignments by position on the contig
                    sorted_aligns = sorted(real_aligns,
                                           key=lambda x: (_end(x), _start(x)))
                    #                    sorted_aligns = sorted(real_aligns, key=lambda x: (x.end(), x.start()))

                    #There is more than one alignment of this contig to the reference
                    print >> ca_output.stdout_f, '\t\tThis contig is misassembled. %d total aligns.' % num_aligns
                    aligned_bases_in_contig = ctg_len - the_best_set.uncovered

                    if aligned_bases_in_contig < umt * ctg_len:
                        print >> ca_output.stdout_f, '\t\t\tWarning! This contig is more unaligned than misassembled. ' + \
                            'Contig length is %d and total length of all aligns is %d' % (ctg_len, aligned_bases_in_contig)
                        for align in sorted_aligns:
                            print >> ca_output.stdout_f, '\t\tAlignment: %s' % str(
                                align)
                            #                            print >> ca_output.icarus_out_f, align.icarus_report_str()
                            print >> ca_output.icarus_out_f, icarus_report_str(
                                align)
                            print >> ca_output.coords_filtered_f, str(align)
                            aligned_lengths.append(align[5])
                            ref_aligns.setdefault(align[7], []).append(align)

                        partially_unaligned_with_misassembly += 1
                        partially_unaligned += 1
                        partially_unaligned_bases += ctg_len - aligned_bases_in_contig
                        print >> ca_output.stdout_f, '\t\tUnaligned bases: %d' % (
                            ctg_len - aligned_bases_in_contig)
                        # check if both parts (aligned and unaligned) have significant length
                        if (aligned_bases_in_contig >=
                                qconfig.significant_part_size) and (
                                    ctg_len - aligned_bases_in_contig >=
                                    qconfig.significant_part_size):
                            print >> ca_output.stdout_f, '\t\tThis contig has both significant aligned and unaligned parts ' \
                                                         '(of length >= %d)!' % (qconfig.significant_part_size)
                            partially_unaligned_with_significant_parts += 1
                            if qconfig.meta:
                                contigs_with_istranslocations += check_for_potential_translocation(
                                    seq, ctg_len, sorted_aligns,
                                    ca_output.stdout_f)
                        contig_type = 'misassembled'
                        print >> ca_output.icarus_out_f, '\t'.join(
                            ['CONTIG', contig,
                             str(ctg_len), contig_type])
                        print >> ca_output.stdout_f
                        continue

                    ### processing misassemblies
                    is_misassembled, current_mio, references_misassemblies, indels_info, misassemblies_matched_sv = \
                        process_misassembled_contig(sorted_aligns, cyclic, aligned_lengths, region_misassemblies,
                                                    ref_lens, ref_aligns, ref_features, seq, references_misassemblies,
                                                    region_struct_variations, misassemblies_matched_sv, ca_output,
                                                    is_ambiguous)
                    misassembly_internal_overlap += current_mio
                    total_indels_info += indels_info
                    if is_misassembled:
                        misassembled_contigs[contig] = ctg_len
                        contig_type = 'misassembled'
                    if ctg_len - aligned_bases_in_contig >= qconfig.significant_part_size:
                        print >> ca_output.stdout_f, '\t\tThis contig has significant unaligned parts ' \
                                                     '(of length >= %d)!' % (qconfig.significant_part_size)
                        if qconfig.meta:
                            contigs_with_istranslocations += check_for_potential_translocation(
                                seq, ctg_len, sorted_aligns,
                                ca_output.stdout_f)
        else:
            #No aligns to this contig
            print >> ca_output.stdout_f, '\t\tThis contig is unaligned. (%d bp)' % ctg_len
            print >> unaligned_file, contig

            #Increment unaligned contig count and bases
            unaligned += 1
            fully_unaligned_bases += ctg_len
            print >> ca_output.stdout_f, '\t\tUnaligned bases: %d  total: %d' % (
                ctg_len, fully_unaligned_bases)

        print >> ca_output.icarus_out_f, '\t'.join(
            ['CONTIG', contig, str(ctg_len), contig_type])
        print >> ca_output.stdout_f

    ca_output.coords_filtered_f.close()
    unaligned_file.close()
    misassembled_bases = sum(misassembled_contigs.itervalues())

    result = {
        'region_misassemblies':
        region_misassemblies,
        'region_struct_variations':
        region_struct_variations.get_count()
        if region_struct_variations else None,
        'misassemblies_matched_sv':
        misassemblies_matched_sv,
        'misassembled_contigs':
        misassembled_contigs,
        'misassembled_bases':
        misassembled_bases,
        'misassembly_internal_overlap':
        misassembly_internal_overlap,
        'unaligned':
        unaligned,
        'partially_unaligned':
        partially_unaligned,
        'partially_unaligned_bases':
        partially_unaligned_bases,
        'fully_unaligned_bases':
        fully_unaligned_bases,
        'ambiguous_contigs':
        ambiguous_contigs,
        'ambiguous_contigs_extra_bases':
        ambiguous_contigs_extra_bases,
        'ambiguous_contigs_len':
        ambiguous_contigs_len,
        'partially_unaligned_with_misassembly':
        partially_unaligned_with_misassembly,
        'partially_unaligned_with_significant_parts':
        partially_unaligned_with_significant_parts,
        'contigs_with_istranslocations':
        contigs_with_istranslocations,
        'istranslocations_by_refs':
        references_misassemblies
    }

    return result, ref_aligns, total_indels_info, aligned_lengths, misassembled_contigs