def is_fragmented_ref_fake_translocation(align1, align2, ref_lens):
    """Tell whether a jump between two reference sequences is a fake translocation.

    A translocation is considered fake (caused by a fragmented reference) only
    when all of the following hold:

    * the two alignments hit different reference sequences;
    * ``qconfig.check_for_fragmented_ref`` is enabled;
    * if ``qconfig.is_combined_ref`` is set, ``is_same_reference`` accepts the
      pair of reference sequences;
    * every value returned by ``__get_border_gaps`` is at most
      ``qconfig.fragmented_max_indent``.

    Returns True in that case and False otherwise. The result is always a
    bool; no inconsistency value is returned. Calling this with
    ``align1.ref == align2.ref`` is tolerated and yields False.
    """
    if align1.ref == align2.ref:
        # Same reference sequence: there is no translocation to classify.
        return False
    if not qconfig.check_for_fragmented_ref:
        return False
    if qconfig.is_combined_ref and not is_same_reference(align1.ref, align2.ref):
        # Sequences of different references are never stitched together.
        return False
    border_gaps = __get_border_gaps(align1, align2, ref_lens)
    return all(gap <= qconfig.fragmented_max_indent for gap in border_gaps)
def get_score(score, aligns, ref_lens, is_cyclic, uncovered_len, seq, region_struct_variations, penalties):
    """Update a running score and uncovered length for the last alignment of `aligns`.

    With a single alignment, its score is added and its ``len2`` is removed
    from `uncovered_len`. With two or more, the last two alignments are
    compared: internal overlaps are excluded, the score contribution of both
    is recomputed, and a penalty is subtracted depending on how
    ``is_misassembly`` classifies the junction.

    `penalties` is indexed with the keys 'extensive', 'local', 'scaffold' and
    'overlap_multiplier'.

    Returns ``(score, uncovered_len)``, or ``(None, None)`` when, after
    excluding overlaps, the shorter of the two alignments is below
    ``max(qconfig.min_cluster, qconfig.min_alignment)``.
    """
    if len(aligns) <= 1:
        # Nothing to compare against: just account for the only alignment.
        only_aln = aligns[-1]
        score += score_single_align(only_aln)
        uncovered_len -= only_aln.len2
        return score, uncovered_len

    prev_aln, last_aln = aligns[-2], aligns[-1]

    # Both values are taken before the overlap exclusion below.
    # NOTE(review): exclude_internal_overlaps presumably modifies the alignments
    # in place, so this ordering matters -- confirm against its definition.
    fake_translocation = is_fragmented_ref_fake_translocation(prev_aln, last_aln, ref_lens)
    overlap_len = max(0, prev_aln.end() - last_aln.start() + 1)

    if len(aligns) > 2:
        # Does not affect score and uncovered length, but is important for
        # the set-correctness check below.
        exclude_internal_overlaps(aligns[-3], prev_aln)
    trimmed_len = exclude_internal_overlaps(prev_aln, last_aln)  # concerns prev_aln only

    # The set is still correct only if both alignments remain rather large.
    shortest_len = min(prev_aln.len2, last_aln.len2)
    if shortest_len < max(qconfig.min_cluster, qconfig.min_alignment):
        return None, None

    added_len = get_added_len(aligns, last_aln)
    uncovered_len -= (added_len - trimmed_len)
    last_aln_score = score_single_align(last_aln, ctg_len=added_len)
    trimmed_score = score_single_align(prev_aln, ctg_len=trimmed_len)
    score += last_aln_score - trimmed_score

    is_extensive, aux_data = is_misassembly(prev_aln, last_aln, seq, ref_lens, is_cyclic,
                                            region_struct_variations, fake_translocation)
    if is_extensive:
        penalty = penalties['extensive']
        if prev_aln.ref != last_aln.ref:
            between_references = qconfig.is_combined_ref and \
                not is_same_reference(prev_aln.ref, last_aln.ref)
            kind = Misassembly.INTERSPECTRANSLOCATION if between_references else Misassembly.TRANSLOCATION
        elif abs(aux_data["inconsistency"]) > qconfig.extensive_misassembly_threshold:
            kind = Misassembly.RELOCATION
            # Extra charge proportional to the inconsistency relative to the reference length.
            score -= float(abs(aux_data["inconsistency"])) / ref_lens[prev_aln.ref]
        else:
            kind = Misassembly.INVERSION
        # Charge the offset of this misassembly code from the INVERSION code.
        score -= kind - Misassembly.INVERSION
    elif aux_data['is_sv']:
        penalty = 0
    elif abs(aux_data['inconsistency']) > qconfig.MAX_INDEL_LENGTH and not aux_data['is_scaffold_gap']:
        penalty = penalties['local']
    elif aux_data['is_scaffold_gap']:
        penalty = penalties['scaffold']
    else:
        penalty = 0

    # The overlap penalty never exceeds the misassembly penalty itself.
    overlap_penalty = min(overlap_len * penalties['overlap_multiplier'], penalty)
    score -= (penalty + overlap_penalty)
    return score, uncovered_len
def process_misassembled_contig(sorted_aligns, cyclic, aligned_lengths, region_misassemblies, ref_lens, ref_aligns,
                                ref_features, contig_seq, references_misassemblies, region_struct_variations,
                                misassemblies_matched_sv, ca_output, is_ambiguous=False):
    """Classify every junction between consecutive alignments of one contig and report it.

    Each pair ``(sorted_aligns[i], sorted_aligns[i + 1])`` is passed to
    ``is_misassembly`` and then reported as one of: structural variation,
    wrongly estimated scaffold gap, extensive misassembly (interspecies
    translocation / translocation / relocation / inversion), fake misassembly
    (circular genome or fragmented reference), stretch of mismatches, indel,
    or local misassembly.

    Arguments updated in place:
      aligned_lengths          -- list; lengths of aligned blocks are appended
      region_misassemblies     -- list; ``Misassembly`` codes are appended
      ref_aligns               -- dict: reference name -> list of alignments
      ref_features             -- dict: reference name -> {position: 'M'}
      references_misassemblies -- nested mapping; counters for the two
                                  references of an interspecies translocation
                                  are incremented symmetrically
      ca_output                -- object whose ``stdout_f``, ``icarus_out_f``,
                                  ``misassembly_f`` and ``coords_filtered_f``
                                  streams are written to

    Other arguments:
      sorted_aligns            -- non-empty sequence of alignments of the contig
      cyclic, ref_lens, region_struct_variations -- forwarded to the helpers
      contig_seq               -- contig sequence (used for N counting and for
                                  the final length sanity check)
      misassemblies_matched_sv -- counter; incremented for every junction
                                  matched by a structural variation
      is_ambiguous             -- not used by this function

    Returns the tuple ``(is_misassembled, misassembly_internal_overlap,
    references_misassemblies, indels_info, misassemblies_matched_sv)``.

    Raises AssertionError if the total aligned length exceeds ``len(contig_seq)``.
    """
    misassembly_internal_overlap = 0
    prev_align = sorted_aligns[0]
    cur_aligned_length = prev_align.len2
    is_misassembled = False
    contig_is_printed = False
    indels_info = IndelsInfo()
    contig_aligned_length = 0  # for internal debugging purposes

    for i in range(len(sorted_aligns) - 1):
        next_align = sorted_aligns[i + 1]

        # The fake-translocation check runs before overlaps are excluded.
        is_fake_translocation = is_fragmented_ref_fake_translocation(prev_align, next_align, ref_lens)
        cur_aligned_length -= exclude_internal_overlaps(prev_align, next_align, i, ca_output)
        is_extensive_misassembly, aux_data = is_misassembly(prev_align, next_align, contig_seq, ref_lens,
                                                            cyclic, region_struct_variations, is_fake_translocation)
        inconsistency = aux_data["inconsistency"]
        distance_on_contig = aux_data["distance_on_contig"]
        misassembly_internal_overlap += aux_data["misassembly_internal_overlap"]
        cyclic_moment = aux_data["cyclic_moment"]

        # Report the left alignment of the pair; the last one is reported after the loop.
        ca_output.icarus_out_f.write(prev_align.icarus_report_str() + '\n')
        ca_output.stdout_f.write('\t\t\tReal Alignment %d: %s\n' % (i+1, str(prev_align)))
        ref_aligns.setdefault(prev_align.ref, []).append(prev_align)
        ca_output.coords_filtered_f.write(str(prev_align) + '\n')

        if aux_data["is_sv"]:
            ca_output.stdout_f.write('\t\t\t Not a misassembly (structural variation of the genome) between these two alignments\n')
            ca_output.icarus_out_f.write('fake: not a misassembly (structural variation of the genome)\n')
            misassemblies_matched_sv += 1
        elif aux_data["is_scaffold_gap"] and abs(inconsistency) > qconfig.extensive_misassembly_threshold:
            ca_output.stdout_f.write('\t\t\t Incorrectly estimated size of scaffold gap between these two alignments: ')
            ca_output.stdout_f.write('gap length difference = ' + str(inconsistency) + '\n')
            region_misassemblies.append(Misassembly.SCAFFOLD_GAP)
            ca_output.icarus_out_f.write('fake: scaffold gap size wrong estimation' + '\n')
        elif is_extensive_misassembly:
            is_misassembled = True
            # An extensive misassembly closes the current aligned block.
            aligned_lengths.append(cur_aligned_length)
            contig_aligned_length += cur_aligned_length
            cur_aligned_length = 0
            if not contig_is_printed:
                # The contig name is written once, before its first misassembly.
                ca_output.misassembly_f.write(prev_align.contig + '\n')
                contig_is_printed = True
            ca_output.misassembly_f.write('Extensive misassembly (')
            ca_output.stdout_f.write('\t\t\t Extensive misassembly (')
            if prev_align.ref != next_align.ref:
                # Not a fake translocation, because is_extensive_misassembly is True.
                if qconfig.is_combined_ref and \
                        not is_same_reference(prev_align.ref, next_align.ref):
                    # Chromosomes from different references.
                    region_misassemblies.append(Misassembly.INTERSPECTRANSLOCATION)
                    ref1, ref2 = get_ref_by_chromosome(prev_align.ref), get_ref_by_chromosome(next_align.ref)
                    references_misassemblies[ref1][ref2] += 1
                    references_misassemblies[ref2][ref1] += 1
                    ca_output.stdout_f.write('interspecies translocation')
                    ca_output.misassembly_f.write('interspecies translocation')
                    ca_output.icarus_out_f.write('interspecies translocation')
                else:
                    region_misassemblies.append(Misassembly.TRANSLOCATION)
                    ca_output.stdout_f.write('translocation')
                    ca_output.misassembly_f.write('translocation')
                    ca_output.icarus_out_f.write('translocation')
            elif abs(inconsistency) > qconfig.extensive_misassembly_threshold:
                region_misassemblies.append(Misassembly.RELOCATION)
                msg = 'relocation, inconsistency = ' + str(inconsistency) + \
                      (' [linear representation of circular genome]' if cyclic_moment else '')
                ca_output.stdout_f.write(msg)
                ca_output.misassembly_f.write(msg)
                ca_output.icarus_out_f.write(msg)
            else:
                # Remaining case of an extensive misassembly (strand1 != strand2).
                region_misassemblies.append(Misassembly.INVERSION)
                ca_output.stdout_f.write('inversion')
                ca_output.misassembly_f.write('inversion')
                ca_output.icarus_out_f.write('inversion')
            ca_output.stdout_f.write(') between these two alignments\n')
            ca_output.misassembly_f.write(') between %s %s and %s %s' %
                                          (prev_align.s2, prev_align.e2, next_align.s2, next_align.e2) + '\n')
            ca_output.icarus_out_f.write('\n')
            ref_features.setdefault(prev_align.ref, {})[prev_align.e1] = 'M'
            ref_features.setdefault(next_align.ref, {})[next_align.e1] = 'M'
        else:
            reason_msg = "" + (" [linear representation of circular genome]" if cyclic_moment else "") + \
                         (" [fragmentation of reference genome]" if prev_align.ref != next_align.ref else "")
            if inconsistency == 0 and cyclic_moment:
                ca_output.stdout_f.write('\t\t\t Not a misassembly' + reason_msg + ' between these two alignments\n')
                ca_output.icarus_out_f.write('fake: not a misassembly' + reason_msg + '\n')
            elif inconsistency == 0 and prev_align.ref != next_align.ref:
                # is_fragmented_ref_fake_translocation is True, because is_extensive_misassembly is False.
                ca_output.stdout_f.write('\t\t\t Not a misassembly' + reason_msg + ' between these two alignments\n')
                region_misassemblies.append(Misassembly.FRAGMENTED)
                ca_output.icarus_out_f.write('fake: not a misassembly' + reason_msg + '\n')
            elif abs(inconsistency) <= qconfig.MAX_INDEL_LENGTH and \
                    count_ns_and_not_ns_between_aligns(contig_seq, prev_align, next_align)[1] <= qconfig.MAX_INDEL_LENGTH:
                ns_number, not_ns_number = count_ns_and_not_ns_between_aligns(contig_seq, prev_align, next_align)
                if inconsistency == 0:
                    ca_output.stdout_f.write(('\t\t\t Stretch of %d mismatches between these two alignments (number of Ns: %d)' %
                                              (not_ns_number, ns_number)) + reason_msg + '\n')
                    indels_info.mismatches += not_ns_number
                    ca_output.icarus_out_f.write('indel: stretch of mismatches' + reason_msg + '\n')
                else:
                    indel_length = abs(inconsistency)
                    indel_class = 'Indel (<= 5bp)' if indel_length <= qconfig.SHORT_INDEL_THRESHOLD else 'Indel (> 5bp)'
                    indel_type = 'insertion' if inconsistency < 0 else 'deletion'
                    # Non-N bases beyond the indel length are counted as mismatches.
                    mismatches = max(0, not_ns_number - indel_length)
                    ca_output.stdout_f.write(('\t\t\t %s between these two alignments: %s of length %d; %d mismatches (number of Ns: %d)') %
                                             (indel_class, indel_type, indel_length, mismatches, ns_number) + reason_msg + '\n')
                    indels_info.indels_list.append(indel_length)
                    if indel_type == 'insertion':
                        indels_info.insertions += indel_length
                    else:
                        indels_info.deletions += indel_length
                    indels_info.mismatches += mismatches
                    ca_output.icarus_out_f.write('indel: ' + indel_class.lower() + reason_msg + '\n')
            else:
                if qconfig.strict_NA:
                    # In strict mode a local misassembly also closes the aligned block.
                    aligned_lengths.append(cur_aligned_length)
                    contig_aligned_length += cur_aligned_length
                    cur_aligned_length = 0

                if distance_on_contig < 0:
                    # There is an overlap between the two alignments, a local misassembly.
                    ca_output.stdout_f.write('\t\t\t Overlap between these two alignments (local misassembly).')
                elif distance_on_contig > 0:
                    # There is a small gap between the two alignments, a local misassembly.
                    ca_output.stdout_f.write('\t\t\t Gap between these two alignments (local misassembly).')
                elif inconsistency < 0:
                    ca_output.stdout_f.write('\t\t\t Overlap between these two alignments (local misassembly).')
                else:
                    ca_output.stdout_f.write('\t\t\t Gap between these two alignments (local misassembly).')
                ca_output.stdout_f.write(' Inconsistency = ' + str(inconsistency) + reason_msg + '\n')
                ca_output.icarus_out_f.write('local misassembly' + reason_msg + '\n')
                region_misassemblies.append(Misassembly.LOCAL)

        prev_align = next_align
        # A negative distance is an overlap on the contig; do not count it twice.
        cur_aligned_length += prev_align.len2 - (-distance_on_contig if distance_on_contig < 0 else 0)

    # Record the very last alignment. After the loop prev_align is the last
    # alignment; unlike next_align it is also bound when the contig has a
    # single alignment and the loop body never ran.
    last_align = prev_align
    i = len(sorted_aligns) - 1
    ca_output.stdout_f.write('\t\t\tReal Alignment %d: %s' % (i + 1, str(last_align)) + '\n')
    ca_output.icarus_out_f.write(last_align.icarus_report_str() + '\n')
    ref_aligns.setdefault(last_align.ref, []).append(last_align)
    ca_output.coords_filtered_f.write(str(last_align) + '\n')

    aligned_lengths.append(cur_aligned_length)
    contig_aligned_length += cur_aligned_length

    # Format arguments follow the placeholders: contig name, contig length, aligned length.
    assert contig_aligned_length <= len(contig_seq), "Internal QUAST bug: contig aligned length is greater than " \
                                                     "contig length (contig: %s, len: %d, aligned: %d)!" % \
                                                     (sorted_aligns[0].contig, len(contig_seq), contig_aligned_length)

    return is_misassembled, misassembly_internal_overlap, references_misassemblies, indels_info, misassemblies_matched_sv