def execute_alignment(self, query):
    """Align ``query`` against every reference and return the best hit.

    Builds an SSW aligner over the IUPAC-style alphabet stored in
    ``self.dna_alphabet`` (which this method (re)assigns on every call),
    aligns ``query`` to each entry of ``self.references`` and picks the
    alignment with the highest ``score``.  Ties are resolved in favour of
    the reference that appears first in ``self.references``.

    Returns:
        A tuple ``(winner.reference, winner)`` where ``winner`` is the
        top-scoring alignment object returned by ``aligner.align``.

    Raises:
        IndexError: if ``self.references`` is empty.
    """
    self.dna_alphabet = "AGTCNRYSWKMBDHV"
    matrix = ssw.DNA_ScoreMatrix(alphabet=self.dna_alphabet)
    aligner = ssw.Aligner(matrix=matrix)

    row = [aligner.align(query, reference) for reference in self.references]

    # ``key=`` replaces the former ``cmp=`` argument, which (together with
    # the ``cmp`` builtin) does not exist on Python 3.  The sort is stable,
    # so the ordering of equal scores is the same as before.
    row.sort(key=lambda alignment: alignment.score, reverse=True)
    winner = row[0]
    return (winner.reference, winner)
def get_ssw_alignments(best_edit_distances, querys, targets):
    """Re-align candidate query/target pairs with SSW.

    For every ``acc1`` in ``best_edit_distances`` and every ``acc2`` in
    ``best_edit_distances[acc1]``, the sequences ``querys[acc1]`` and
    ``targets[acc2]`` are aligned (forward strand only) with match=1,
    mismatch=-2, gap_open=2, gap_extend=1.

    Returns:
        A pair ``(best_edit_distances_ssw, best_cigars_ssw)`` of nested
        dicts keyed ``[acc1][acc2]``:

        * ``best_edit_distances_ssw[acc1][acc2]`` is the number of
          mismatch columns plus gap columns in the SSW match line.  It
          covers only the region SSW aligned, not any unaligned ends.
        * ``best_cigars_ssw[acc1][acc2]`` is the tuple
          ``(cigar, mismatches, indels, query_begin,
          len(query) - query_end - 1, reference_begin,
          len(target) - reference_end - 1)``.

        Every ``acc1`` gets an entry in both dicts even when it has no
        candidates.
    """
    score_matrix = ssw.DNA_ScoreMatrix(match=1, mismatch=-2)
    aligner = ssw.Aligner(gap_open=2, gap_extend=1, matrix=score_matrix)

    best_edit_distances_ssw = {}
    best_cigars_ssw = {}

    for acc1 in best_edit_distances:
        seq1 = querys[acc1]
        distances = best_edit_distances_ssw[acc1] = {}
        cigars = best_cigars_ssw[acc1] = {}

        for acc2 in best_edit_distances[acc1]:
            seq2 = targets[acc2]
            result = aligner.align(seq1, seq2, revcomp=False)

            # Only the middle (match) line is needed: the code counts "*"
            # as mismatch columns and " " as gap columns.
            _, match_line, _ = result.alignment
            mismatches = match_line.count("*")
            indels = match_line.count(" ")

            distances[acc2] = mismatches + indels
            cigars[acc2] = (
                result.cigar,
                mismatches,
                indels,
                result.query_begin,
                len(seq1) - result.query_end - 1,
                result.reference_begin,
                len(seq2) - result.reference_end - 1,
            )

    return best_edit_distances_ssw, best_cigars_ssw
def _ssw_fill_unaligned_flank(query_flank, ref_flank, bio_matrix, g_open, g_extend):
    """Align the sequence ends that the SSW local alignment left out.

    Returns ``(query_aln, ref_aln, matches, mismatches, indels)`` for the
    flank.  When both flanks are non-empty they are globally aligned with
    ``pairwise2.align.globalds`` and the first reported alignment is used;
    when only one side has residues, it is paired with an equally long run
    of "-" and counted entirely as indels.
    """
    if query_flank and ref_flank:
        alns = pairwise2.align.globalds(query_flank, ref_flank, bio_matrix, g_open, g_extend)
        top_aln = alns[0]
        query_aln, ref_aln = top_aln[0], top_aln[1]
        mismatches = sum(
            1 for q, r in zip(query_aln, ref_aln)
            if q != r and q != "-" and r != "-"
        )
        indels = query_aln.count("-") + ref_aln.count("-")
        matches = len(query_aln) - mismatches - indels
        return query_aln, ref_aln, matches, mismatches, indels

    if query_flank:
        return query_flank, "-" * len(query_flank), 0, 0, len(query_flank)

    return "-" * len(ref_flank), ref_flank, 0, 0, len(ref_flank)


def ssw_alignment(x_acc, y_acc, x, y, i, j, max_discrepancy=50):
    """
        Aligns two sequences with SSW and extends the alignment to the ends.

        x: query
        y: reference

        x_acc, y_acc: accepted for interface compatibility; not used here.
        i, j: only used to decide when to print a progress message
              (when i % 10 == 0 and j % 10000 == 0).
        max_discrepancy: largest number of unaligned positions tolerated at
              either end.  Ends with 0 < discrepancy <= max_discrepancy are
              aligned separately and glued onto the SSW alignment.

        Returns (y_alignment, x_alignment, stats).  stats is the tuple
        (matches, mismatches, indels, deletions, insertions), or None when
        the start or end discrepancy exceeds max_discrepancy (in that case
        the returned alignment strings may be extended at one end only).

        Raises AssertionError if the gap characters in the alignment strings
        do not add up to the indel count.
    """
    if i % 10 == 0 and j % 10000 == 0:
        print("processing alignments on all y's where read x_i is participating. i={0}".format(i+1))

    score_matrix = ssw.DNA_ScoreMatrix(match=1, mismatch=-1)
    aligner = ssw.Aligner(gap_open=2, gap_extend=1, matrix=score_matrix)

    # Scoring used for the ends that SSW leaves behind.
    # NOTE(review): blosum62 is a protein substitution matrix; confirm it is
    # the intended scoring for nucleotide flanks.
    bio_matrix = matlist.blosum62
    g_open = -1
    g_extend = -0.5

    # NOTE(review): with revcomp=True the reported coordinates may refer to
    # the reverse-complemented query; the flank slicing below uses x as
    # given -- confirm against the ssw library's documentation.
    result = aligner.align(x, y, revcomp=True)
    y_alignment, match_line, x_alignment = result.alignment

    matches, mismatches, indels = match_line.count("|"), match_line.count("*"), match_line.count(" ")
    deletions = x_alignment.count("-")
    insertions = y_alignment.count("-")
    assert deletions + insertions == indels

    # Number of positions before / after the aligned region on the longer side.
    start_discrepancy = max(result.query_begin, result.reference_begin)
    query_end_discrepancy = len(x) - result.query_end - 1
    ref_end_discrepancy = len(y) - result.reference_end - 1
    end_discrepancy = max(query_end_discrepancy, ref_end_discrepancy)

    if 0 < start_discrepancy <= max_discrepancy:
        query_part, ref_part, flank_matches, flank_mismatches, flank_indels = _ssw_fill_unaligned_flank(
            x[:result.query_begin], y[:result.reference_begin], bio_matrix, g_open, g_extend)
        matches += flank_matches
        mismatches += flank_mismatches
        indels += flank_indels
        y_alignment = ref_part + y_alignment
        x_alignment = query_part + x_alignment

    if 0 < end_discrepancy <= max_discrepancy:
        query_part, ref_part, flank_matches, flank_mismatches, flank_indels = _ssw_fill_unaligned_flank(
            x[result.query_end+1:], y[result.reference_end+1:], bio_matrix, g_open, g_extend)
        matches += flank_matches
        mismatches += flank_mismatches
        indels += flank_indels
        y_alignment = y_alignment + ref_part
        x_alignment = x_alignment + query_part

    # Recount on the (possibly extended) alignment strings.
    deletions = x_alignment.count("-")
    insertions = y_alignment.count("-")
    assert deletions + insertions == indels

    if start_discrepancy > max_discrepancy or end_discrepancy > max_discrepancy:
        return (y_alignment, x_alignment, None)
    return (y_alignment, x_alignment, (matches, mismatches, indels, deletions, insertions))
def _ssw_align_overhang(query_overhang, ref_overhang, bio_matrix, g_open, g_extend):
    """Align the overhanging ends that lie outside the SSW local alignment.

    Returns ``(query_aln, ref_aln, matches, mismatches, indels)``.  Two
    non-empty overhangs are globally aligned with
    ``pairwise2.align.globalds`` (first reported alignment is used); a
    one-sided overhang is paired with a run of "-" of the same length and
    counted entirely as indels.
    """
    if query_overhang and ref_overhang:
        alns = pairwise2.align.globalds(query_overhang, ref_overhang, bio_matrix, g_open, g_extend)
        top_aln = alns[0]
        query_aln, ref_aln = top_aln[0], top_aln[1]
        mismatches = sum(
            1 for q, r in zip(query_aln, ref_aln)
            if q != r and q != "-" and r != "-"
        )
        indels = query_aln.count("-") + ref_aln.count("-")
        matches = len(query_aln) - mismatches - indels
        return query_aln, ref_aln, matches, mismatches, indels

    if query_overhang:
        return query_overhang, "-" * len(query_overhang), 0, 0, len(query_overhang)

    return "-" * len(ref_overhang), ref_overhang, 0, 0, len(ref_overhang)


def ssw_alignment(x, y, ends_discrepancy_threshold = 250 ):
    """
        Aligns two sequences with SSW (forward strand only) and extends the
        local alignment to the sequence ends.

        x: query
        y: reference
        ends_discrepancy_threshold: largest number of unaligned positions at
            either end that is still filled in.  An end whose discrepancy is
            0 or exceeds the threshold is left as SSW reported it.

        Returns (x_alignment, y_alignment, matches, mismatches, indels,
        match_line).  The counts include the filled-in ends; match_line is
        the match line of the SSW core alignment only and is NOT extended,
        so it can be shorter than the alignment strings.
    """
    # Scoring strongly penalises mismatches and gap opening while gap
    # extension is free.
    score_matrix = ssw.DNA_ScoreMatrix(match=1, mismatch=-20)
    aligner = ssw.Aligner(gap_open=50, gap_extend=0, matrix=score_matrix)

    # Scoring used for the ends that SSW leaves behind.
    # NOTE(review): blosum62 is a protein substitution matrix; confirm it is
    # the intended scoring for nucleotide ends.
    bio_matrix = matlist.blosum62
    g_open = -1
    g_extend = -0.5

    result = aligner.align(x, y, revcomp=False)
    y_alignment, match_line, x_alignment = result.alignment
    matches, mismatches, indels = match_line.count("|"), match_line.count("*"), match_line.count(" ")

    # Number of positions before / after the aligned region on the longer side.
    start_discrepancy = max(result.query_begin, result.reference_begin)
    query_end_discrepancy = len(x) - result.query_end - 1
    ref_end_discrepancy = len(y) - result.reference_end - 1
    end_discrepancy = max(query_end_discrepancy, ref_end_discrepancy)

    if 0 < start_discrepancy <= ends_discrepancy_threshold:
        # Trace output kept as-is so the function's stdout does not change.
        print("HERE",start_discrepancy)
        query_part, ref_part, end_matches, end_mismatches, end_indels = _ssw_align_overhang(
            x[:result.query_begin], y[:result.reference_begin], bio_matrix, g_open, g_extend)
        matches += end_matches
        mismatches += end_mismatches
        indels += end_indels
        y_alignment = ref_part + y_alignment
        x_alignment = query_part + x_alignment

    if 0 < end_discrepancy <= ends_discrepancy_threshold:
        # Trace output kept as-is so the function's stdout does not change.
        print("HERE2", end_discrepancy)
        query_part, ref_part, end_matches, end_mismatches, end_indels = _ssw_align_overhang(
            x[result.query_end+1:], y[result.reference_end+1:], bio_matrix, g_open, g_extend)
        matches += end_matches
        mismatches += end_mismatches
        indels += end_indels
        y_alignment = y_alignment + ref_part
        x_alignment = x_alignment + query_part

    return x_alignment, y_alignment, matches, mismatches, indels, match_line
def create_isoform_graph(transcripts, min_exon):
    """Build a graph linking transcripts that differ only by large gaps.

    transcripts: mapping from accession to sequence.
    min_exon: minimum length a gap run must have for a pair to be linked.

    Every accession becomes a node (with attribute ``accession``).  Each
    unordered pair of transcripts is aligned once with ``ssw_alignment``
    (longer sequence as the first argument).  An edge is added when

      * no aligned column pairs two different non-gap characters, and
      * every run of "-" in either aligned sequence, ignoring runs at the
        very start or end, is at least ``min_exon`` long.

    The edge stores the two aligned strings in its ``alignment`` attribute,
    keyed by accession.  Progress, each pairwise alignment and a summary of
    the maximal cliques are printed to stdout.

    Returns the ``networkx`` graph.
    """
    G = nx.Graph()
    for acc in transcripts:
        G.add_node(acc, accession=acc)

    # Longest first.  Sorting once (instead of once per outer iteration) and
    # slicing visits exactly the pairs the former "skip already processed"
    # filter let through, in the same order.
    ordered = sorted(transcripts.items(), key=lambda item: len(item[1]), reverse=True)
    gap_run = re.compile(r"[-]+")

    for cntr, (acc1, seq1) in enumerate(ordered, start=1):
        if cntr % 5 == 0:
            print(cntr, "sequences processed")

        for acc2, seq2 in ordered[cntr:]:
            seq1_aln, seq2_aln, _matches, mismatches, indels, match_line = ssw_alignment(
                seq1, seq2, ends_discrepancy_threshold=2000)
            print(acc1, acc2, mismatches, indels)
            print(seq1_aln)
            print(seq2_aln)
            print(match_line)

            # By default each transcript is its own gene member.  Two
            # transcripts are classified as the same family only if the
            # alignment contains nothing but structural changes of at least
            # min_exon nucleotides and no other, smaller differences.
            has_substitution = any(
                n1 != n2 and n1 != "-" and n2 != "-"
                for n1, n2 in zip(seq1_aln, seq2_aln)
            )
            if has_substitution:
                continue

            # Length differences at the 5' and 3' ends are not counted, so
            # leading and trailing gaps are stripped before looking for runs.
            inner_gap_runs = (gap_run.findall(seq1_aln.strip("-"))
                              + gap_run.findall(seq2_aln.strip("-")))
            if all(len(run) >= min_exon for run in inner_gap_runs):
                G.add_edge(acc1, acc2, alignment={acc1: seq1_aln, acc2: seq2_aln})

    list_of_maximal_cliques = list(nx.find_cliques(G))
    print("Number of possible members:", len(list_of_maximal_cliques) )
    print("clique sizes", sorted((len(cl) for cl in list_of_maximal_cliques), reverse=True) )
    return G