def get_pairwise_hom(jxn1, jxn2, chim_dir, side):
    """Return the maximum normalized SW homology between two junctions.

    For every transcript of jxn1 vs every transcript of jxn2, align the
    sequence on the requested breakpoint side ("left" keeps the prefix up
    to the break, anything else keeps the suffix) and normalize the score
    by twice the shorter sequence length (match score is 2, so this maps
    homology into a 0-1 range). Returns the best value seen (>= 0).
    """
    def _fusion_fasta(jxn):
        # per-junction transcript FASTA produced upstream
        return os.path.join(
            chim_dir, 'transcripts-fusion-' + su.common.safe_jxn(jxn) + '.fa')

    def _breakpoint_side(header, seq):
        # breakpoint offset is encoded as the last '|'-separated header field
        brk = int(header.split('|')[-1])
        return seq[:brk] if side == "left" else seq[brk:]

    aligner = ssw.Aligner(gap_open=12, gap_extend=4)  # match=2, mismatch=2
    fa1_path = _fusion_fasta(jxn1)
    fa2_path = _fusion_fasta(jxn2)
    best = 0
    for head1, seq1 in su.common.fasta_iter(fa1_path):
        part1 = _breakpoint_side(head1, seq1)
        for head2, seq2 in su.common.fasta_iter(fa2_path):
            part2 = _breakpoint_side(head2, seq2)
            aln = aligner.align(reference=part1, query=part2)
            # normalize homology into the 0-1 range
            score = aln.score / (min(len(part1), len(part2)) * 2)
            best = max(best, score)
    return np.max([0, best])
def run_crosshom_ssw(reads_fq, trxleft_fa, trxright_fa):
    '''Get the cross-homology fraction between the left and right gene.

    For up to 500 reads, Smith-Waterman-align each read against every
    transcript of the left and of the right fusion partner. A read's
    score is the smaller of its two best per-side scores, normalized by
    read length * 2 (the match score). Returns the fraction of reads
    whose normalized score exceeds 0.5, rounded to three decimals, or 0
    when no reads were processed.
    '''
    aligner = ssw.Aligner(gap_open=12, gap_extend=4)  # match=2, mismatch=2
    rfq_gen = su.common.FastqParser(reads_fq)
    matches = []
    # iterate through reads and do SW against fusion partner transcripts
    for rfq in (x for _, x in zip(range(500), rfq_gen)):  # top 500 reads only, for speed
        l_max = 0
        r_max = 0
        if rfq.seq_len <= 20:
            # small sequences will always give positive hits, so skip them
            matches.append(0)
            # BUGFIX: without this `continue` the short read was also
            # aligned below and appended a second time (double-counted).
            continue
        trxl_gen = su.common.fasta_iter(trxleft_fa)
        trxr_gen = su.common.fasta_iter(trxright_fa)
        for trxl_id, trxl_seq in trxl_gen:
            rfql_align = aligner.align(reference=rfq.sequence, query=trxl_seq)
            l_max = max(rfql_align.score, l_max)
        for trxr_id, trxr_seq in trxr_gen:
            rfqr_align = aligner.align(reference=rfq.sequence, query=trxr_seq)
            r_max = max(rfqr_align.score, r_max)
        # pct identity matching with flexibility for SNVs
        read_norm = min(l_max, r_max) / (rfq.seq_len * 2)
        matches.append(read_norm)
    if len(matches) > 0:
        # builtin sum() is what np.sum falls back to on a generator anyway;
        # float() guards against integer division under Python 2
        frac = sum(i > .5 for i in matches) / float(len(matches))
        return float("{0:.3f}".format(frac))
    else:
        return 0
def realign(read, chrom, ref):
    """Re-align a read against a padded window of the reference.

    The window extends one read-length on each side of the read's current
    mapping (clamped at position 0). Returns a tuple of
    (new_reference_position, new_cigar_string).
    """
    pad = len(read.seq)
    window_start = max(read.reference_start - pad, 0)
    window_seq = ref.fetch(chrom, window_start, read.reference_end + pad)
    hit = ssw.Aligner().align(reference=window_seq, query=read.seq)
    # translate the window-relative hit back to reference coordinates
    return window_start + hit.reference_begin, hit.cigar
def test_mismatch(self):
    """A single substituted base yields one mismatch and a pure-match CIGAR."""
    ref = "GTGCGATGTGCGATGAGATC"
    mutated = ref[:9] + 'A' + ref[10:]
    al = ssw.Aligner().align(mutated, ref)
    counts = (al.match_count, al.mismatch_count,
              al.insertion_count, al.deletion_count)
    self.assertEqual(counts, (19, 1, 0, 0))
    self.assertEqual(al.cigar, "20M")
def test_rc_alignment(self):
    """The reverse complement of the reference still aligns perfectly."""
    ref = "GTGCGATGTGCGATGAGATC"
    revcomp = "GATCTCATCGCACATCGCAC"
    al = ssw.Aligner().align(revcomp, ref)
    counts = (al.match_count, al.mismatch_count,
              al.insertion_count, al.deletion_count)
    self.assertEqual(counts, (20, 0, 0, 0))
    self.assertEqual(al.cigar, "20M")
def test_perfect_alignment(self):
    """A sequence aligned against itself is all matches."""
    ref = "GTGCGATGTGCGATGAGATC"
    al = ssw.Aligner().align(ref, ref)
    self.assertEqual(al.match_count, len(ref))
    self.assertEqual(al.mismatch_count, 0)
    self.assertEqual(al.insertion_count, 0)
    self.assertEqual(al.deletion_count, 0)
    self.assertEqual(al.cigar, '20M')
def test_issue_1(self):
    """Regression test for https://github.com/vishnubob/ssw/issues/1"""
    ref = "CCC" + "AGCT" * 10
    qry = "AGGT" * 10
    r_line, m_line, q_line = ssw.Aligner().align(qry, ref).alignment
    self.assertEqual(r_line, "AGCTAGCTAGCTAGCTAGCTAGCTAGCTAGCTAGCTAG")
    self.assertEqual(m_line, "||*|||*|||*|||*|||*|||*|||*|||*|||*|||")
    self.assertEqual(q_line, "AGGTAGGTAGGTAGGTAGGTAGGTAGGTAGGTAGGTAG")
def getBestScoringAllelesForExon(genotypes, commonExon, secondary, consensus):
    """Score each genotype's exon against the consensus; keep the top ties.

    Returns the set of alleles whose SW score against the consensus ties
    for the best score on the given exon.
    """
    sw = ssw.Aligner()
    consensus_seq = str(consensus.seq)
    scores = {allele: sw.align(consensus_seq, secondary[allele][commonExon]).score
              for allele in genotypes}
    ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    return set(getBestScoringAlleles(ranked))
def test_deletion(self):
    """Dropping one reference base produces a single 1-base deletion."""
    ref = "GTGCGATGTGCGATGAGATC"
    with_deletion = ref[:10] + ref[11:]
    al = ssw.Aligner().align(with_deletion, ref)
    counts = (al.match_count, al.mismatch_count,
              al.insertion_count, al.deletion_count)
    self.assertEqual(counts, (19, 0, 0, 1))
    self.assertEqual(al.cigar, "10M1D9M")
def test_alignment_pickle(self):
    """An alignment survives a pickle round-trip with its state intact."""
    ref = "GTGCGATGTGCGATGAGATC"
    al = ssw.Aligner().align(ref, ref)
    before = dict(al.__dict__)
    clone = pickle.loads(pickle.dumps(al))
    after = dict(clone.__dict__)
    for key, value in before.items():
        self.assertIn(key, after)
        self.assertEqual(value, after[key])
def execute_alignment(self, query):
    """Align *query* against every stored reference and return the best hit.

    Builds a degenerate-DNA score matrix, aligns the query to each entry in
    self.references, and returns (reference, alignment) for the
    highest-scoring alignment. Raises IndexError when self.references is
    empty (unchanged behavior).
    """
    self.dna_alphabet = "AGTCNRYSWKMBDHV"
    matrix = ssw.DNA_ScoreMatrix(alphabet=self.dna_alphabet)
    aligner = ssw.Aligner(matrix=matrix)
    row = [aligner.align(query, reference) for reference in self.references]
    # BUGFIX: list.sort(cmp=...) and the builtin cmp() are Python-2-only;
    # a key function is equivalent here (stable sort, same tie order) and
    # works on both Python 2 and 3.
    row.sort(key=lambda alignment: alignment.score, reverse=True)
    winner = row[0]
    return (winner.reference, winner)
def test_degen_alignment(self):
    """IUPAC degenerate bases align to concrete bases as matches.

    XXX: note, this fails if the two sequences are switched; see
    https://github.com/mengyao/Complete-Striped-Smith-Waterman-Library/issues/63
    """
    concrete = "AGCGATCACGT"
    degenerate = "MRYSWKBDHVN"
    al = ssw.Aligner().align(concrete, degenerate)
    counts = (al.match_count, al.mismatch_count,
              al.insertion_count, al.deletion_count)
    self.assertEqual(counts, (len(concrete), 0, 0, 0))
    self.assertEqual(al.cigar, '%dM' % len(concrete))
def realign(initref, lastref, lastref_profile): """ Realign lastref to initref The purpose is to generate a better alignment for restoring initref position numbers in multi-alignment results. """ # compatible consideration; different versions of # iterative alignment represent deletion slightly varied cons_delpos = set( str_index_all(lastref, '-') + str_index_all(lastref_profile, '-')) lastref = ''.join('' if p in cons_delpos else n for p, n in enumerate(lastref)) aligner = ssw.Aligner() aln = aligner.align(reference=initref, query=lastref) if aln.query_begin > 0 or aln.reference_begin > 0: # Why? raise RuntimeError('Consensus misaligned') initref, _, lastref = aln.alignment ref_pos0 = -1 alnprofile = [] for refna, consna in zip(initref, lastref): if refna != '-': ref_pos0 += 1 # The re-alignment could place deletions in slightly different # place comparing to the original alignment. Four cases need # to be considered: # - +n+o: deletion presents in both alignments: # no extra handling needed # - +n-o: deletion only presents in new alignment: # new deletion should be added to the multi-alignment # - -n+o: deletion only presents in old alignment: # multi-alignment NAs mapped to the old deletion # should be removed to previous refpos (as insertion) # - -n-o: no deletion: # no extra handling needed newdel = consna == '-' olddel = len(alnprofile) in cons_delpos while not newdel and olddel: # -n+o # mismatched old deletions prev_refpos0, _ = alnprofile[-1] alnprofile.append((prev_refpos0, -1)) olddel = len(alnprofile) in cons_delpos else: if newdel: # +n+o/+n-o # new deletions prev_refpos0, count = alnprofile[-1] alnprofile[-1] = (prev_refpos0, count + 1) else: # -n-o # agree, non deletion alnprofile.append((ref_pos0, 0)) return alnprofile
def test_coverage(self):
    """A 5 bp slice covers the whole query but only part of the reference."""
    bplen = 5
    ref = "GTGCGATGTGCGATGAGATC"
    sub = ref[bplen:bplen * 2]
    al = ssw.Aligner().align(sub, ref)
    counts = (al.match_count, al.mismatch_count,
              al.insertion_count, al.deletion_count)
    self.assertEqual(counts, (bplen, 0, 0, 0))
    self.assertEqual(al.cigar, '%dM' % bplen)
    self.assertEqual(al.query_coverage, 1.0)
    self.assertEqual(al.reference_coverage, bplen / len(ref))
def selectGenotypesConsideringIntronsAndUTRs(genotypes, genomicRefs, consensus):
    """
    Now we are doing something similar as in getBestScoringAllelesForExon(),
    but scoring whole genomic sequences (introns and UTRs included).
    Returns the set of alleles tying for the best SW score.
    """
    sw = ssw.Aligner()
    consensus_seq = str(consensus.seq)
    scores = {allele: sw.align(consensus_seq, genomicRefs[allele]).score
              for allele in genotypes}
    ranked = sorted(scores.items(), key=operator.itemgetter(1), reverse=True)
    return set(getBestScoringAlleles(ranked))
def get_ssw_alignments(best_edit_distances, querys, targets):
    """Run SSW on every (query, target) candidate pair.

    For each query accession in best_edit_distances, align it against each
    of its candidate target accessions and record:
      - best_edit_distances_ssw[q][t]: mismatches + indels of the alignment
      - best_cigars_ssw[q][t]: (cigar, mismatches, indels, query_begin,
        query 3' clip, reference_begin, reference 3' clip)
    Returns (best_edit_distances_ssw, best_cigars_ssw).
    """
    score_matrix = ssw.DNA_ScoreMatrix(match=1, mismatch=-2)
    aligner = ssw.Aligner(gap_open=2, gap_extend=1, matrix=score_matrix)
    best_edit_distances_ssw = {}
    best_cigars_ssw = {}
    for acc1 in best_edit_distances:
        seq1 = querys[acc1]
        best_ed = len(seq1)
        best_edit_distances_ssw[acc1] = {}
        best_cigars_ssw[acc1] = {}
        for acc2 in best_edit_distances[acc1]:
            seq2 = targets[acc2]
            result = aligner.align(seq1, seq2, revcomp=False)
            seq2_aln, match_line, seq1_aln = result.alignment
            # '|' = match, '*' = mismatch, ' ' = indel column
            matches, mismatches, indels = match_line.count(
                "|"), match_line.count("*"), match_line.count(" ")
            insertion_count = seq2_aln.count("-")
            deletion_count = seq1_aln.count("-")
            sw_ed = mismatches + indels
            best_edit_distances_ssw[acc1][
                acc2] = sw_ed  # (deletion_count, insertion_count, mismatches )
            # NOTE(review): this re-unpacks the same alignment with the
            # opposite name order, overwriting seq1_aln/seq2_aln; the
            # swapped names are only used by the commented-out debug code.
            seq1_aln, match_line, seq2_aln = result.alignment
            best_cigars_ssw[acc1][acc2] = (result.cigar, mismatches, indels,
                                           result.query_begin,
                                           len(seq1) - result.query_end - 1,
                                           result.reference_begin,
                                           len(seq2) - result.reference_end - 1)
            # print(acc1,acc2)
            # print(result.query_begin, len(seq1) - result.query_end - 1, result.reference_begin, len(seq2) - result.reference_end -1, result.cigar, mismatches, indels)
            # print()
            # print(sw_ed, (deletion_count, insertion_count, mismatches ))
            # print(seq1_aln)
            # print(match_line)
            # print(seq2_aln)
            # edit_distance, locations, cigar = edlib_traceback(seq1, seq2, k =1000)
            # print(edit_distance, locations, cigar)
            # print()
    # for acc in best_cigars_ssw:
    #     if len(best_cigars_ssw[acc]) ==0:
    #         print("!!!!", acc)
    # print(len(best_cigars_ssw))
    # sys.exit()
    return best_edit_distances_ssw, best_cigars_ssw
def preSelectTypes(primary, consensus, locus):
    """
    For each primary exon (or exon pair) make an alignment for exon 2, and
    put the result into a dictionary as alignmentEx2['allele'] = score.
    (We could also collect [score, matches, mismatches, inserts, deletions].)
    If there is an exon 3 as well (Class-I loci), build alignmentEx3 the
    same way, sort both, keep the best of each, and return the intersection;
    otherwise sort and keep only the best exon-2 alignments.
    """
    print "Selecting best alleles using primary exons"
    # we are going to use https://github.com/vishnubob/ssw that is using
    # https://github.com/mengyao/Complete-Striped-Smith-Waterman-Library
    # for SW and SAM output
    # unfortunatelly it can not made multiprocess yet, but it is fast enough
    isClassI = locus in ["HLA-A", "HLA-B", "HLA-C"]
    alignmentEx2 = {}
    alignmentEx3 = {}
    sw = ssw.Aligner()
    for allele, exons in primary.items():
        alignment = sw.align(str(consensus.seq), exons[0])
        alignmentEx2[allele] = alignment.score
        # ditto for exon 3 when the locus IS Class-I
        # (the original comment said "if it is not a Class-I", which
        # contradicted the condition below; the code is taken as correct)
        if isClassI:
            alignment = sw.align(str(consensus.seq), exons[1])
            alignmentEx3[allele] = alignment.score
        # print allele + " scores: exon 2: " + str(alignmentEx2[allele]) + " exon 3 " + str(alignmentEx3[allele])
    # sort the dict by values and reverse the result
    bestEx2 = getBestScoringAlleles(
        sorted(alignmentEx2.items(), key=operator.itemgetter(1), reverse=True))
    if isClassI:
        bestEx3 = getBestScoringAlleles(
            sorted(alignmentEx3.items(), key=operator.itemgetter(1),
                   reverse=True))
    # return with the intersect of the two sets, leaving only entries that
    # are in the best matching set for both exon 2 and exon 3
    # or exon 2 only for other than Class-I alleles
    print "done"
    return list(set(bestEx2) & set(bestEx3)) if isClassI else bestEx2
def realign2(initref, lastref, lastref_profile):
    """
    Realign lastref to initref

    The purpose is to generate a better alignment for restoring initref
    position numbers in multi-alignment results. Returns a tuple of
    (merged_sequence, profile) where each profile character marks its
    column: '+' insertion vs initref, '-' deletion, '.' aligned column.
    """
    # compatible consideration; different versions of iterative alignment
    # represent deletions slightly differently, so collect '-' positions
    # from both the consensus and its profile
    cons_delpos = set(
        str_index_all(lastref, '-') + str_index_all(lastref_profile, '-'))
    lastref = ''.join('' if p in cons_delpos else n
                      for p, n in enumerate(lastref))
    aln = ssw.Aligner().align(reference=initref, query=lastref)
    clipped = (aln.query_begin > 0
               or aln.reference_begin > 0
               or aln.query_end + 1 < len(lastref)
               or aln.reference_end + 1 < len(initref))
    if clipped:
        # this should never happen since lastref is constructed from
        # initref, but just in case
        raise NotImplementedError('Partial alignment not supported yet')
    initref, _, lastref = aln.alignment
    resultseq = []
    alnprofile = []
    for refna, consna in zip(initref, lastref):
        if refna == '-':
            # insertion relative to the initial reference
            resultseq.append(consna)
            alnprofile.append('+')
        elif consna == '-':
            # deletion relative to the initial reference
            resultseq.append(refna)
            alnprofile.append('-')
        else:
            # aligned column (match or mismatch); keep the consensus base
            resultseq.append(consna)
            alnprofile.append('.')
    return ''.join(resultseq), ''.join(alnprofile)
def get_best_match(consensus_transcripts, reference_transcripts, outfolder,
                   transcript_abundances, transcript_copies, sampled_dict,
                   params):
    """Match each consensus transcript to its best reference and log results.

    Writes a TSV ("results.tsv" in outfolder): a header row with reference
    and consensus counts plus abundances, then one row per consensus with
    its best reference, error count, identity and (mismatches, insertions,
    deletions). References never chosen as a best match are reported as FN.

    NOTE(review): the file handle is never closed — rows may be lost unless
    the interpreter flushes on exit; confirm and consider `with open(...)`.
    """
    out_file = open(os.path.join(outfolder, "results.tsv"), "w")
    aligner = ssw.Aligner(gap_open=2, gap_extend=1)  # do SW
    nr_unique_refs = len(reference_transcripts)
    errors_container = {}
    identity_container = {}
    error_types_container = {}
    best_match_container = {}
    not_FN = set()  # references hit by at least one consensus
    # print(consensus_transcripts)
    if len(consensus_transcripts) == 0:
        # nothing inferred: write only the header row and bail out
        out_file.write("{0}\t{1}\t{2}\n".format(
            nr_unique_refs, len(consensus_transcripts),
            ",".join([str(a) for a in transcript_abundances.values()])))
        return
    sorted_lengths = sorted([
        (len(q_seq), q_acc) for q_acc, q_seq in consensus_transcripts.items()
    ])
    # for l in sorted_lengths:
    #     print(l)
    print("REF LENGHTS")
    sorted_lengths = sorted(
        [len(r_seq) for r_acc, r_seq in reference_transcripts.items()])
    # for l in sorted_lengths:
    #     print(l)
    # pre check exact matches:
    if params.only_exact:
        ref_seq_to_acc = {
            seq: acc for acc, seq in reference_transcripts.items()
        }
        ref_seqs = set(reference_transcripts.values())
        exact_matches = set()
        for q_acc, q_seq in consensus_transcripts.items():
            if q_seq in ref_seqs:
                exact_matches.add(q_acc)
                ref_acc = ref_seq_to_acc[q_seq]  # .split("copy")[0]
                print("Exact", q_acc, "to transcript with copy number:",
                      transcript_copies[ref_acc])
                errors_container[q_acc] = 0
                best_match_container[q_acc] = ref_acc
                identity_container[q_acc] = 1.0
                error_types_container[q_acc] = (0, 0, 0)
                not_FN.add(ref_acc)
        print(len(ref_seqs))
        print(len(consensus_transcripts))
        print("EXACT MATCHES:", len(exact_matches))
    else:
        print("Start1")
        best_edit_distances = get_minimizers_2set_simple(
            consensus_transcripts, reference_transcripts)
        minimizer_graph_c_to_t = get_ssw_alignments(best_edit_distances,
                                                    consensus_transcripts,
                                                    reference_transcripts)
        for i, (q_acc, q_seq) in enumerate(minimizer_graph_c_to_t.items()):
            # track the reference with the fewest total errors
            best_ed = 200000
            r_acc_max_id = "NONE"
            fewest_errors = len(q_seq)
            best_mismatches, best_insertions, best_deletions = len(q_seq), len(
                q_seq), len(q_seq)
            for j, (r_acc, r_seq) in enumerate(
                    minimizer_graph_c_to_t[q_acc].items()):
                deletions, insertions, mismatches = minimizer_graph_c_to_t[
                    q_acc][r_acc]
                edit_distance = deletions + insertions + mismatches
                if edit_distance < best_ed:
                    best_ed = edit_distance
                    r_acc_max_id = r_acc
                    fewest_errors = edit_distance
                    best_mismatches, best_insertions, best_deletions = mismatches, insertions, deletions
            errors_container[q_acc] = fewest_errors
            best_match_container[q_acc] = r_acc_max_id
            # identity = 1 - ed / longer-of-the-two-lengths
            identity_container[q_acc] = 1.0 - (best_ed / float(
                max(len(q_seq), len(reference_transcripts[r_acc_max_id]))))
            error_types_container[q_acc] = (best_mismatches, best_insertions,
                                            best_deletions)
            not_FN.add(r_acc_max_id)
        print("Stop1!")
    # references (or sampled references) that no consensus matched
    if sampled_dict:
        FN = set(sampled_dict.keys()).difference(not_FN)
    else:
        FN = set(reference_transcripts.keys()).difference(not_FN)
    for ref in FN:
        print("FN:", ref, len(reference_transcripts[ref]))
    # current logging:
    # first row displays number of unique reference transcripts and number
    # of inferred transcripts
    if sampled_dict:
        out_file.write("{0}\t{1}\t{2}\n".format(
            len(sampled_dict), len(consensus_transcripts),
            ",".join([str(a) for a in transcript_abundances.values()])))
    else:
        out_file.write("{0}\t{1}\t{2}\n".format(
            nr_unique_refs, len(consensus_transcripts),
            ",".join([str(a) for a in transcript_abundances.values()])))
    # total discoveries, total perfect matches (1.0 identity), errors for each consensus
    # print("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\n".format(nr_unique_refs, q_acc, best_match_container[q_acc], errors_container[q_acc], identity_container[q_acc], *error_types_container[q_acc]))
    for q_acc in errors_container:
        # each row displays values for a consensus transcript
        if identity_container[q_acc] > params.sim_cutoff:
            # NOTE(review): ssw_stats is only used by the commented-out
            # debug print below
            ssw_stats = "{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\n".format(
                q_acc, best_match_container[q_acc], errors_container[q_acc],
                identity_container[q_acc], *error_types_container[q_acc])
            # print(ssw_stats, minimizer_graph_c_to_t[q_acc])
            # print()
            # NOTE(review): indentation reconstructed from a mangled source;
            # the write below is taken to apply only above the similarity
            # cutoff — confirm against project history.
            out_file.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\n".format(
                q_acc, best_match_container[q_acc], errors_container[q_acc],
                identity_container[q_acc], *error_types_container[q_acc]))
import ssw
import sys

# Filter a BED-like file, keeping only loci whose repeat expansion matches
# the reference sequence exactly. Usage: python remove_messy.py in.bed out.bed
# (Python 2 script: note the statement form of print below.)
if len(sys.argv) < 3:
    print "Usage: python remove_messy.py in.bed out.bed"
    sys.exit()
in_bed = sys.argv[1]
out_bed = sys.argv[2]
aligner = ssw.Aligner()
allowed_percent = 0.0  # NOTE(review): only used by the commented-out SW path
with open(in_bed, 'r') as refin:
    with open(out_bed, 'w') as refout:
        messy = 0
        clean = 0
        for line in refin:
            rec = line.strip().split("\t")
            ref = rec[5]
            # reconstruct the expected sequence: repeat unit (col 4)
            # repeated to fill the interval (cols 1-3 give span and unit length)
            query = rec[4] * int((int(rec[2]) - int(rec[1]) + 1) / int(rec[3]))
            #alignment = aligner.align(ref, query)
            #if alignment.mismatch_count + alignment.deletion_count + alignment.insertion_count > allowed_percent / 100.0 * len(ref):
            # exact string comparison replaces the fuzzy SW check above
            if (ref != query):
                #print (alignment.alignment_report())
                messy = messy + 1
            else:
                clean = clean + 1
                refout.write('\t'.join(rec) + '\n')
print('Deleted ' + str(messy) + ' loci')
print(str(clean) + ' loci survived')
def ssw_alignment( x_acc, y_acc, x, y, i,j, max_discrepancy = 50):
    """
    Aligns two sequences with SSW
    x: query
    y: reference

    SSW is local, so unaligned sequence ends are patched back on with a
    global Biopython alignment (or pure gap columns when only one side has
    leftover sequence). Returns (y_alignment, x_alignment, stats) where
    stats is (matches, mismatches, indels, deletions, insertions), or
    (y_alignment, x_alignment, None) when either end discrepancy exceeds
    max_discrepancy.
    """
    if i % 10 == 0 and j % 10000 == 0:
        print("processing alignments on all y's where read x_i is participating. i={0}".format(i+1))
    score_matrix = ssw.DNA_ScoreMatrix(match=1, mismatch=-1)
    aligner = ssw.Aligner(gap_open=2, gap_extend=1, matrix=score_matrix)
    # for the ends that SSW leaves behind
    bio_matrix = matlist.blosum62
    g_open = -1
    g_extend = -0.5
    ######################################
    result = aligner.align(x, y, revcomp=False)
    y_alignment, match_line, x_alignment = result.alignment
    # '|' = match, '*' = mismatch, ' ' = indel column
    matches, mismatches, indels = match_line.count("|"), match_line.count("*"), match_line.count(" ")
    deletions = x_alignment.count("-")
    insertions = y_alignment.count("-")
    assert deletions + insertions == indels
    # alignment_length = len(match_line)
    start_discrepancy = max(result.query_begin, result.reference_begin)  # 0-indexed
    query_end_discrepancy = len(x) - result.query_end - 1
    ref_end_discrepancy = len(y) - result.reference_end - 1
    end_discrepancy = max(query_end_discrepancy, ref_end_discrepancy)
    # print(query_end_discrepancy, ref_end_discrepancy)
    # NOTE(review): tot_discrepancy is computed but never used
    tot_discrepancy = start_discrepancy + end_discrepancy
    if 0 < start_discrepancy <= max_discrepancy:
        # patch the unaligned 5' ends back onto the alignment
        # print("HERE")
        matches_snippet = 0
        mismatches_snippet = 0
        if result.query_begin and result.reference_begin:
            # both sides have leftover prefix: global-align the snippets
            query_start_snippet = x[:result.query_begin]
            ref_start_snippet = y[:result.reference_begin]
            alns = pairwise2.align.globalds(query_start_snippet, ref_start_snippet, bio_matrix, g_open, g_extend)
            top_aln = alns[0]
            # print(alns)
            mismatches_snippet = len(list(filter(lambda x: x[0] != x[1] and x[0] != '-' and x[1] != "-", zip(top_aln[0],top_aln[1]))))
            indels_snippet = top_aln[0].count("-") + top_aln[1].count("-")
            matches_snippet = len(top_aln[0]) - mismatches_snippet - indels_snippet
            # print(matches_snippet, mismatches_snippet, indels_snippet)
            query_start_alignment_snippet = top_aln[0]
            ref_start_alignment_snippet = top_aln[1]
        elif result.query_begin:
            # only the query has a leftover prefix: pad reference with gaps
            query_start_alignment_snippet = x[:result.query_begin]
            ref_start_alignment_snippet = "-"*len(query_start_alignment_snippet)
            indels_snippet = len(ref_start_alignment_snippet)
        elif result.reference_begin:
            # only the reference has a leftover prefix: pad query with gaps
            ref_start_alignment_snippet = y[:result.reference_begin]
            query_start_alignment_snippet = "-"*len(ref_start_alignment_snippet)
            indels_snippet = len(query_start_alignment_snippet)
        else:
            print("BUG")
            sys.exit()
        matches, mismatches, indels = matches + matches_snippet, mismatches + mismatches_snippet, indels + indels_snippet
        # print(ref_start_alignment_snippet)
        # print(query_start_alignment_snippet)
        y_alignment = ref_start_alignment_snippet + y_alignment
        x_alignment = query_start_alignment_snippet + x_alignment
    if 0 < end_discrepancy <= max_discrepancy:
        # patch the unaligned 3' ends (mirror of the block above)
        # print("HERE2", query_end_discrepancy, ref_end_discrepancy)
        # print(y_alignment)
        # print(y)
        # print(match_line)
        # print(x_alignment)
        # print(x)
        # print(matches, len(x_alignment))
        matches_snippet = 0
        mismatches_snippet = 0
        if query_end_discrepancy and ref_end_discrepancy:
            query_end_snippet = x[result.query_end+1:]
            ref_end_snippet = y[result.reference_end+1:]
            alns = pairwise2.align.globalds(query_end_snippet, ref_end_snippet, bio_matrix, g_open, g_extend)
            top_aln = alns[0]
            mismatches_snippet = len(list(filter(lambda x: x[0] != x[1] and x[0] != '-' and x[1] != "-", zip(top_aln[0],top_aln[1]))))
            indels_snippet = top_aln[0].count("-") + top_aln[1].count("-")
            matches_snippet = len(top_aln[0]) - mismatches_snippet - indels_snippet
            query_end_alignment_snippet = top_aln[0]
            ref_end_alignment_snippet = top_aln[1]
        elif query_end_discrepancy:
            query_end_alignment_snippet = x[result.query_end+1:]
            ref_end_alignment_snippet = "-"*len(query_end_alignment_snippet)
            indels_snippet = len(ref_end_alignment_snippet)
        elif ref_end_discrepancy:
            ref_end_alignment_snippet = y[result.reference_end+1:]
            query_end_alignment_snippet = "-"*len(ref_end_alignment_snippet)
            indels_snippet = len(query_end_alignment_snippet)
        else:
            print("BUG")
            sys.exit()
        matches, mismatches, indels = matches + matches_snippet, mismatches + mismatches_snippet, indels + indels_snippet
        y_alignment = y_alignment + ref_end_alignment_snippet
        x_alignment = x_alignment + query_end_alignment_snippet
    # matches, mismatches, indels = match_line.count("|"), match_line.count("*"), match_line.count(" ")
    # recount gaps after the end-patches were appended
    deletions = x_alignment.count("-")
    insertions = y_alignment.count("-")
    assert deletions + insertions == indels
    if start_discrepancy > max_discrepancy or end_discrepancy > max_discrepancy:
        # print("REMOVING", start_discrepancy, end_discrepancy)
        return (y_alignment, x_alignment, None)
    else:
        return (y_alignment, x_alignment, (matches, mismatches, indels, deletions, insertions))
def reference_similarity(reference_transcripts, outfolder, params):
    """
    Stats about reference transcripts

    Parses per-transcript copy numbers from accessions of the form
    "<acc>copy<n>", deduplicates identical sequences (mutating the input
    dict in place — sorted() snapshots the items, so deleting while
    iterating is safe here), and computes an all-vs-all edit-distance
    matrix over the representative sequences.
    Returns (transcript_abundances, transcript_copies,
    reference_similarities).
    """
    seqs_seen = set()
    transcript_abundances = {}
    transcript_copies = Counter()
    transcript_sequences = {}
    for acc, seq in sorted(reference_transcripts.items(),
                           key=lambda x: len(x[1])):
        try:
            tr_acc, copy_number_str = acc.split("copy")
        except ValueError:
            tr_acc, copy_number_str = acc, "1"  # viral data not simulated
        transcript_copies[tr_acc] += 1
        try:
            copy_number = int(copy_number_str)
        except ValueError:
            copy_number = 1
        # keep the highest copy number seen and its sequence as representative
        if tr_acc not in transcript_abundances:
            transcript_abundances[tr_acc] = copy_number
            transcript_sequences[tr_acc] = seq
        elif tr_acc in transcript_abundances and copy_number > transcript_abundances[tr_acc]:
            transcript_abundances[tr_acc] = copy_number
            transcript_sequences[tr_acc] = seq
        if seq in seqs_seen:
            # duplicate sequence: drop it from the input dict (side effect!)
            # print("HERE!", len(seq))
            del reference_transcripts[acc]
        else:
            seqs_seen.add(seq)
    print("Number of unique references:", len(reference_transcripts))
    for t_acc, copy_nr in transcript_copies.items():
        print(t_acc, copy_nr)
    print("abundances:", transcript_abundances)
    print("calculating reference similarities")
    # NOTE(review): the aligner and sorted tuples below are unused — the
    # SSW path was replaced by edlib (see commented block)
    aligner = ssw.Aligner(gap_open=2, gap_extend=1)  # do SW
    sorted_reference_tuples = sorted(reference_transcripts.items(),
                                     key = lambda x: len(x[1]))
    reference_similarities = {}
    for q_acc, q_seq in transcript_sequences.items():
        reference_similarities[q_acc] = {}
        for r_acc, r_seq in transcript_sequences.items():
            # r_aligned, q_aligned, stats = ssw_alignment( q_acc, r_acc, q_seq, r_seq, 0,0, max_discrepancy = 10000 )
            # if stats:
            #     matches, mismatches, indels, deletions, insertions = stats
            #     errors = mismatches + indels
            #     identity = errors/ float(errors + matches)
            #     reference_similarities[q_acc][r_acc] = errors
            # else:
            #     reference_similarities[q_acc][r_acc] = min(len(q_seq), len(r_seq))
            ed = edlib_ed(q_seq, r_seq, mode="NW", task="distance", k=10000)
            reference_similarities[q_acc][r_acc] = ed
    return transcript_abundances, transcript_copies, reference_similarities
def reference_similarity(reference_transcripts, outfolder, params):
    """
    Stats about reference transcripts

    Parses copy numbers from "<acc>copy<n>" accessions, deduplicates
    identical sequences (mutating the input dict in place; sorted()
    snapshots the items so the deletes are safe), and — unless
    params.no_ref_sim is set — plots a relative-abundance heatmap and an
    all-vs-all edit-distance heatmap.
    Returns (transcript_abundances, transcript_copies,
    reference_similarities); the similarity matrix is empty when
    params.no_ref_sim is set.
    """
    seqs_seen = set()
    transcript_abundances = {}
    transcript_copies = Counter()
    for acc, seq in sorted(reference_transcripts.items(),
                           key=lambda x: len(x[1])):
        try:
            tr_acc, copy_number_str = acc.split("copy")
        except ValueError:
            tr_acc, copy_number_str = acc, "1"  # viral data not simulated
        transcript_copies[tr_acc] += 1
        try:
            copy_number = int(copy_number_str)
        except ValueError:
            copy_number = 1
        # keep the highest copy number observed per transcript
        if tr_acc not in transcript_abundances:
            transcript_abundances[tr_acc] = copy_number
        elif tr_acc in transcript_abundances and copy_number > transcript_abundances[
                tr_acc]:
            transcript_abundances[tr_acc] = copy_number
        if seq in seqs_seen:
            # duplicate sequence: drop it from the input dict (side effect!)
            # print("HERE!", len(seq))
            del reference_transcripts[acc]
        else:
            seqs_seen.add(seq)
    print("Number of unique references:", len(reference_transcripts))
    for t_acc, copy_nr in transcript_copies.items():
        print(t_acc, copy_nr)
    # print("abundances:", transcript_abundances)
    # BUGFIX: previously this name was only bound inside the branch below,
    # so the params.no_ref_sim path hit a NameError (or returned None) at
    # the final return. Default to an empty matrix.
    reference_similarities = []
    if not params.no_ref_sim:
        # relative_abundance_matrix = {}
        # annotation_matrix = {}
        sorted_reference_tuples = sorted(reference_transcripts.items(),
                                         key=lambda x: len(x[1]))
        reference_abundances = [0] * len(sorted_reference_tuples)
        for i, (acc1, seq1) in enumerate(sorted_reference_tuples):
            reference_abundances[i] = [0] * len(sorted_reference_tuples)
            # relative_abundance_matrix[acc1] = {}
            # annotation_matrix[acc1] = {}
            for j, (acc2, seq2) in enumerate(sorted_reference_tuples):
                copy_nr_1 = transcript_abundances[acc1.split("copy")[0]]
                copy_nr_2 = transcript_abundances[acc2.split("copy")[0]]
                reference_abundances[i][j] = float(copy_nr_1) / copy_nr_2
                # relative_abundance_matrix[acc1][acc2] = float(copy_nr_1)/copy_nr_2
                # annotation_matrix[acc1][acc2] = str(fractions.Fraction(copy_nr_1, copy_nr_2))
        # print(relative_abundance_matrix)
        # print(annotation_matrix)
        relative_abundance_matrix_data_frame = pd.DataFrame(
            reference_abundances)
        # msk = relative_abundance_matrix_data_frame > 99
        # relative_abundance_matrix_data_frame_masked = relative_abundance_matrix_data_frame.mask(msk)
        plot_heatmap("relative_abundance", relative_abundance_matrix_data_frame)
        # plot_heatmap("relative_abundance", relative_abundance_matrix_data_frame, annotation=True)
        print("calculating reference similarities")
        # NOTE(review): the aligner below is unused — the SSW path was
        # replaced by edlib (see commented block)
        aligner = ssw.Aligner(gap_open=2, gap_extend=1)  # do SW
        sorted_reference_tuples = sorted(reference_transcripts.items(),
                                         key=lambda x: len(x[1]))
        reference_similarities = [0] * len(sorted_reference_tuples)
        for i, (q_acc, q_seq) in enumerate(sorted_reference_tuples):
            reference_similarities[i] = [0] * len(sorted_reference_tuples)
            # print("ref", i)
            for j, (r_acc, r_seq) in enumerate(sorted_reference_tuples):
                # if len(q_seq) - len(r_seq) > 99 or len(q_seq) - len(r_seq) < 99:
                #     reference_similarities[q_acc][r_acc] = min(len(q_seq), len(r_seq))
                #     continue
                ed = edlib_ed(q_seq, r_seq, mode="NW", task="distance",
                              k=2 * max(len(q_seq), len(r_seq)))
                reference_similarities[i][j] = ed
                # r_aligned, q_aligned, stats = ssw_alignment( q_acc, r_acc, q_seq, r_seq, i,j, max_discrepancy = 10000 )
                # if stats:
                #     matches, mismatches, indels, deletions, insertions = stats
                #     errors = mismatches + indels
                #     identity = errors/ float(errors + matches)
                #     reference_similarities[i][j] = errors
                # else:
                #     reference_similarities[i][j] = min(len(q_seq), len(r_seq))
        ref_sim_data_frame = pd.DataFrame(reference_similarities)
        # mask very dissimilar pairs so the heatmap's scale stays readable
        msk = ref_sim_data_frame > 99
        ref_sim_data_frame_masked = ref_sim_data_frame.mask(msk)
        plot_heatmap("similarities", ref_sim_data_frame_masked)
    return transcript_abundances, transcript_copies, reference_similarities
def create_isoform_graph(transcripts, min_exon):
    """Build a graph linking transcripts that differ only by large indels.

    Nodes are transcript accessions. Two transcripts get an edge when their
    pairwise alignment has zero mismatches and every internal deletion run
    is at least min_exon long (i.e. they plausibly differ only by whole
    exons, not by small errors). Maximal cliques are reported as candidate
    gene families. Returns the networkx Graph.
    """
    G = nx.Graph()
    for acc in transcripts.keys():
        G.add_node(acc, accession = acc)
    # NOTE(review): this aligner is unused here — the sibling
    # ssw_alignment() helper builds its own; confirm before removing.
    score_matrix = ssw.DNA_ScoreMatrix(match=1, mismatch=-2)
    aligner = ssw.Aligner(gap_open=2, gap_extend=0, matrix=score_matrix)
    cntr = 0
    processed = set()  # ensures each unordered pair is aligned only once
    # already_assigned = set()
    for acc1, seq1 in sorted(transcripts.items(), key= lambda x: len(x[1]), reverse=True):
        cntr += 1
        processed.add(acc1)
        # print("length t:", len(seq1), acc1)
        if cntr % 5 == 0:
            print(cntr, "sequences processed")
        # print(acc1)
        # if acc1 in already_assigned:
        #     # print("allready assigned to larger sequence!")
        #     continue
        for acc2, seq2 in sorted(transcripts.items(), key= lambda x: len(x[1]), reverse=True):
            if acc2 in processed:
                continue
            # if acc1 == acc2:
            #     continue
            # if seq1 == seq2:
            #     continue
            # result = aligner.align(seq1, seq2, revcomp=False)
            # seq2_aln, match_line, seq1_aln = result.alignment
            # print(seq1)
            # print(seq2)
            seq1_aln, seq2_aln, matches, mismatches, indels, match_line = ssw_alignment(seq1, seq2, ends_discrepancy_threshold = 2000 )
            print(acc1, acc2, mismatches, indels)
            print(seq1_aln)
            print(seq2_aln)
            print(match_line)
            # remove differences in 3' and 5' ends
            tmp_seq1_aln = seq1_aln
            tmp_seq1_aln = tmp_seq1_aln.lstrip("-")
            tmp_seq1_aln = tmp_seq1_aln.rstrip("-")
            tmp_seq2_aln = seq2_aln
            tmp_seq2_aln = tmp_seq2_aln.lstrip("-")
            tmp_seq2_aln = tmp_seq2_aln.rstrip("-")
            # runs of internal gaps after trimming the ends
            del_seq1 = re.findall(r"[-]+",tmp_seq1_aln)
            del_seq2 = re.findall(r"[-]+",tmp_seq2_aln)
            # recount mismatches over the full (untrimmed) alignment
            mismatches = len([ 1 for n1, n2 in zip(seq1_aln,seq2_aln) if n1 != n2 and n1 != "-" and n2 != "-" ])
            ## do not count length discrepancies in ends
            inner_del_seq1 = re.findall(r"[AGCT][-]+[AGCT]",seq1_aln)
            inner_del_seq2 = re.findall(r"[AGCT][-]+[AGCT]",seq2_aln)
            # print(inner_del_seq1)
            # print(inner_del_seq2)
            # NOTE(review): total_inner is computed but never used below
            total_inner = sum([len(d) - 2 for d in inner_del_seq1]) + sum([len(d) - 2 for d in inner_del_seq2])
            # print(indels, total_inner)
            # by default (since all transcripts are distinct if we end up
            # here), each transcript is its own gene member; if we find an
            # alignment that contains only structural changes of >= min_exon
            # nucleotides and no other smaller differences, we classify the
            # pair as the same family
            if mismatches == 0:
                del_lengths1 = [len(del_) for del_ in del_seq1]
                del_lengths2 = [len(del_) for del_ in del_seq2]
                no_small_del_in_seq1 = ((len(del_lengths1) > 0 and min(del_lengths1) >= min_exon) or len(del_lengths1) == 0)
                no_small_del_in_seq2 = ((len(del_lengths2) > 0 and min(del_lengths2) >= min_exon) or len(del_lengths2) == 0)
                # print(no_small_del_in_seq1, no_small_del_in_seq2)
                # print((len(del_seq1) > 0 and min(del_seq1) >= 3), len(del_seq1) == 0)
                # if acc1[0][:14] == "transcript_460" and acc2[0][:14] == "transcript_467" :
                #     print("we are here", no_small_del_in_seq1, no_small_del_in_seq2, mismatches)
                #     sys.exit()
                if no_small_del_in_seq1 and no_small_del_in_seq2:
                    G.add_edge(acc1, acc2, alignment={ acc1 : seq1_aln, acc2 : seq2_aln })
                else:
                    pass
                    # print("Different only by small indel!!")
    list_of_maximal_cliques = list(nx.find_cliques(G))
    print("Number of possible members:", len(list_of_maximal_cliques) )
    print("clique sizes", [ len(cl) for cl in sorted(list_of_maximal_cliques, key= lambda x: len(x), reverse=True)] )
    return G
def ssw_alignment(x, y, ends_discrepancy_threshold = 250 ):
    """
        Aligns two sequences with SSW, then patches the unaligned ends.

        SSW produces a *local* alignment, so sequence may be left unaligned
        at either end. If the leftover length at an end is between 1 and
        ``ends_discrepancy_threshold``, that snippet is re-aligned globally
        with Biopython's pairwise2 and spliced onto the SSW alignment, and
        the match/mismatch/indel counts are updated accordingly.

        x: query sequence (str)
        y: reference sequence (str)
        ends_discrepancy_threshold: maximum unaligned end length that will
            still be patched in via global re-alignment.

        Returns:
            (x_alignment, y_alignment, matches, mismatches, indels, match_line)

        NOTE(review): ``match_line`` is the raw SSW match string and is NOT
        extended when end snippets are spliced on, so it may be shorter than
        the returned alignment strings; the returned counts DO include the
        snippets.
    """
    # Harsh scoring: substitutions/gaps are heavily penalized, so the core
    # SSW alignment is near-exact and any differences surface as unaligned
    # ends (to be handled below).
    score_matrix = ssw.DNA_ScoreMatrix(match=1, mismatch=-20)
    aligner = ssw.Aligner(gap_open=50, gap_extend=0, matrix=score_matrix)

    # Global-alignment parameters (Biopython pairwise2) for the ends that
    # SSW leaves behind.
    bio_matrix = matlist.blosum62
    g_open = -1
    g_extend = -0.5
    ######################################

    result = aligner.align(x, y, revcomp=False)
    y_alignment, match_line, x_alignment = result.alignment
    # "|" = match, "*" = mismatch, " " = indel in the SSW match line.
    matches, mismatches, indels = match_line.count("|"), match_line.count("*"), match_line.count(" ")

    # Unaligned lengths at each end (0-indexed coordinates from SSW).
    start_discrepancy = max(result.query_begin, result.reference_begin) # 0-indexed
    query_end_discrepancy = len(x) - result.query_end - 1
    ref_end_discrepancy = len(y) - result.reference_end - 1
    end_discrepancy = max(query_end_discrepancy, ref_end_discrepancy)
    tot_discrepancy = start_discrepancy + end_discrepancy  # currently unused

    # --- Patch the 5' (start) end, if both short enough and non-empty. ---
    if 0 < start_discrepancy <= ends_discrepancy_threshold:
        print("HERE",start_discrepancy)
        matches_snippet = 0
        mismatches_snippet = 0
        if result.query_begin and result.reference_begin:
            # Both sequences have leftover prefix: globally re-align them.
            query_start_snippet = x[:result.query_begin]
            ref_start_snippet = y[:result.reference_begin]
            alns = pairwise2.align.globalds(query_start_snippet, ref_start_snippet, bio_matrix, g_open, g_extend)
            top_aln = alns[0]
            # Count columns where both characters are residues but differ.
            mismatches_snippet = len(list(filter(lambda x: x[0] != x[1] and x[0] != '-' and x[1] != "-", zip(top_aln[0],top_aln[1]))))
            indels_snippet = top_aln[0].count("-") + top_aln[1].count("-")
            matches_snippet = len(top_aln[0]) - mismatches_snippet - indels_snippet
            query_start_alignment_snippet = top_aln[0]
            ref_start_alignment_snippet = top_aln[1]
        elif result.query_begin:
            # Only the query has a leftover prefix: pad the reference with gaps.
            query_start_alignment_snippet = x[:result.query_begin]
            ref_start_alignment_snippet = "-"*len(query_start_alignment_snippet)
            indels_snippet = len(ref_start_alignment_snippet)
        elif result.reference_begin:
            # Only the reference has a leftover prefix: pad the query with gaps.
            ref_start_alignment_snippet = y[:result.reference_begin]
            query_start_alignment_snippet = "-"*len(ref_start_alignment_snippet)
            indels_snippet = len(query_start_alignment_snippet)
        else:
            # Unreachable: start_discrepancy > 0 implies at least one begin > 0.
            print("BUG")
            sys.exit()
        matches, mismatches, indels = matches + matches_snippet, mismatches + mismatches_snippet, indels + indels_snippet

        # Prepend the patched prefix to the SSW alignment.
        y_alignment = ref_start_alignment_snippet + y_alignment
        x_alignment = query_start_alignment_snippet + x_alignment

    # --- Patch the 3' (trailing) end, symmetric to the start case. ---
    if 0 < end_discrepancy <= ends_discrepancy_threshold:
        print("HERE2", end_discrepancy)
        matches_snippet = 0
        mismatches_snippet = 0
        if query_end_discrepancy and ref_end_discrepancy:
            # Both sequences have leftover suffix: globally re-align them.
            query_end_snippet = x[result.query_end+1:]
            ref_end_snippet = y[result.reference_end+1:]
            alns = pairwise2.align.globalds(query_end_snippet, ref_end_snippet, bio_matrix, g_open, g_extend)
            top_aln = alns[0]
            mismatches_snippet = len(list(filter(lambda x: x[0] != x[1] and x[0] != '-' and x[1] != "-", zip(top_aln[0],top_aln[1]))))
            indels_snippet = top_aln[0].count("-") + top_aln[1].count("-")
            matches_snippet = len(top_aln[0]) - mismatches_snippet - indels_snippet
            query_end_alignment_snippet = top_aln[0]
            ref_end_alignment_snippet = top_aln[1]
        elif query_end_discrepancy:
            # Only the query has a leftover suffix: pad the reference with gaps.
            query_end_alignment_snippet = x[result.query_end+1:]
            ref_end_alignment_snippet = "-"*len(query_end_alignment_snippet)
            indels_snippet = len(ref_end_alignment_snippet)
        elif ref_end_discrepancy:
            # Only the reference has a leftover suffix: pad the query with gaps.
            ref_end_alignment_snippet = y[result.reference_end+1:]
            query_end_alignment_snippet = "-"*len(ref_end_alignment_snippet)
            indels_snippet = len(query_end_alignment_snippet)
        else:
            # Unreachable: end_discrepancy > 0 implies at least one leftover suffix.
            print("BUG")
            sys.exit()
        matches, mismatches, indels = matches + matches_snippet, mismatches + mismatches_snippet, indels + indels_snippet

        # Append the patched suffix to the SSW alignment.
        y_alignment = y_alignment + ref_end_alignment_snippet
        x_alignment = x_alignment + query_end_alignment_snippet

    return x_alignment, y_alignment, matches, mismatches, indels, match_line