def score_phonetic_alignment(srcw, tgtw, slang, tlang, sim_matrix_path, gap_start_p=-1.0, gap_extend_p=-1.0):
    """Score the global alignment of a source word against a target word.

    Each word is first turned into the plain string handed to the alignment
    library: if its language is listed in ``langinfo.SCRIPT_RANGES`` the word
    goes through ``make_ascii``; otherwise every character is passed through
    ``str()``. The two strings are globally aligned with ``nw.global_align``
    and that alignment is scored with ``nw.score_alignment``; both calls use
    the same matrix and gap penalties.

    Args:
        srcw: Source word.
        tgtw: Target word.
        slang: Language of ``srcw``.
        tlang: Language of ``tgtw``.
        sim_matrix_path: Value passed as ``matrix`` to both nw calls.
        gap_start_p: Value passed as ``gap_open``.
        gap_extend_p: Value passed as ``gap_extend``.

    Returns:
        The value returned by ``nw.score_alignment``.
    """

    def _to_alignable(word, lang):
        # convert to ascii required by align library
        if lang in langinfo.SCRIPT_RANGES:
            pieces = make_ascii(word, lang)
        else:
            pieces = [str(ch) for ch in word]
        return ''.join(pieces)

    nsrcw = _to_alignable(srcw, slang)
    ntgtw = _to_alignable(tgtw, tlang)

    # One shared set of scoring options keeps alignment and scoring in step.
    scoring = dict(matrix=sim_matrix_path,
                   gap_open=gap_start_p,
                   gap_extend=gap_extend_p)

    # use global alignment
    src_aln, tgt_aln = nw.global_align(nsrcw, ntgtw, **scoring)
    return nw.score_alignment(src_aln, tgt_aln, **scoring)
def sound_seq_distance_str(self, seq1_str, seq2_str):
    """Return a length-normalised alignment distance between two sequences.

    Both inputs are converted with ``np.asanyarray`` and serialised with
    ``tostring()``. The serialised forms are globally aligned using the cost
    matrix at '/tmp/som.costs' (gap_open=0, gap_extend=-5), the alignment is
    scored with the same settings, and the negated score is divided by the
    combined length of the two serialised sequences.

    Args:
        seq1_str: First sequence (anything ``np.asanyarray`` accepts).
        seq2_str: Second sequence.

    Returns:
        ``-score / (len1 + len2)`` as a float.
    """
    cost_matrix = '/tmp/som.costs'

    # Serialise each sequence once and reuse it for alignment and length.
    raw1 = np.asanyarray(seq1_str).tostring()
    raw2 = np.asanyarray(seq2_str).tostring()

    aligned = nw.global_align(raw1, raw2,
                              gap_open=0, gap_extend=-5, matrix=cost_matrix)
    total = nw.score_alignment(*aligned,
                               gap_open=0, gap_extend=-5, matrix=cost_matrix)

    # Adding 0.0 forces float division regardless of the score's type.
    return (-total) / (len(raw1) + len(raw2) + 0.0)
def score_phonetic_alignment(srcw,tgtw,slang,tlang,sim_matrix_path,gap_start_p=-1.0,gap_extend_p=-1.0):
    """Align two words globally and return the score of that alignment.

    A word whose language appears in ``langinfo.SCRIPT_RANGES`` is converted
    with ``make_ascii``; any other word is rebuilt from ``str()`` of each of
    its characters. The resulting strings are aligned with
    ``nw.global_align`` and the aligned pair is scored with
    ``nw.score_alignment`` under the same matrix and gap penalties.

    Args:
        srcw: Source word.
        tgtw: Target word.
        slang: Language of the source word.
        tlang: Language of the target word.
        sim_matrix_path: Passed as ``matrix`` to the nw calls.
        gap_start_p: Passed as ``gap_open``.
        gap_extend_p: Passed as ``gap_extend``.

    Returns:
        The result of ``nw.score_alignment`` for the aligned pair.
    """
    # convert to ascii required by align library
    if slang in langinfo.SCRIPT_RANGES:
        src_chars = make_ascii(srcw, slang)
    else:
        src_chars = [str(ch) for ch in srcw]
    nsrcw = ''.join(src_chars)

    if tlang in langinfo.SCRIPT_RANGES:
        tgt_chars = make_ascii(tgtw, tlang)
    else:
        tgt_chars = [str(ch) for ch in tgtw]
    ntgtw = ''.join(tgt_chars)

    # use global alignment
    aligned_pair = nw.global_align(
        nsrcw,
        ntgtw,
        matrix=sim_matrix_path,
        gap_open=gap_start_p,
        gap_extend=gap_extend_p,
    )
    src_aln, tgt_aln = aligned_pair

    alignment_score = nw.score_alignment(
        src_aln,
        tgt_aln,
        matrix=sim_matrix_path,
        gap_open=gap_start_p,
        gap_extend=gap_extend_p,
    )
    return alignment_score
def testNeedleman(N):
    """Compare ``myNeedleman`` against nwalign on N random string pairs.

    For each round a string A of 1..SIZEMAX uppercase letters and a string B
    of sizeA // 2 .. 2 * sizeA letters are drawn. Both implementations align
    them with the 'atiam-fpa_alpha.dist' matrix and gap costs of -3. A round
    is "validated" when the scores match and has the "same path" when the
    aligned strings match as well. A ``RuntimeError`` from ``myNeedleman``
    prints the offending pair and the round counts for neither total.

    Args:
        N: Number of random rounds; also the denominator of the percentages.
    """
    alpha = list('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
    top = len(alpha) - 1
    dist_file = 'atiam-fpa_alpha.dist'

    Validated = 0
    SamePath = 0
    for _round in tqdm(range(N)):
        sizeA = random.randint(1, SIZEMAX)
        sizeB = random.randint(sizeA // 2, 2 * sizeA)

        # All letters of A are drawn before any letter of B.
        A = "".join([alpha[random.randint(0, top)] for _i in range(sizeA)])
        B = "".join([alpha[random.randint(0, top)] for _j in range(sizeB)])

        reference = nw.global_align(A, B, matrix=dist_file,
                                    gap_open=-3, gap_extend=-3)
        score = nw.score_alignment(reference[0], reference[1],
                                   gap_open=-3, gap_extend=-3,
                                   matrix=dist_file)
        expected = (reference[0], reference[1], score)

        try:
            candidate = myNeedleman(A, B, matrix=dist_file,
                                    gap_open=-3, gap_extend=-3)
            (a, b, s) = candidate
            Validated += 1 if s == score else 0
            SamePath += 1 if expected == (a, b, s) else 0
        except RuntimeError:
            print(A, B)

    print(str(100 * Validated / N) + "% are validated.")
    print(str(100 * SamePath / N) + "% have the exact same path.")
def _score_alignment(self):
    """Score the stored alignment of ``aligned_query`` and ``aligned_target``.

    The scoring matrix file is chosen in this order:
      1. scoring-specific ``_score_match`` / ``_score_mismatch`` when both
         are set,
      2. the explicit ``_matrix`` when it is set,
      3. the general ``_match`` / ``_mismatch`` values otherwise.
    Gap penalties use the scoring-specific values when they are set and the
    general ``_gap_open`` / ``_gap_extend`` otherwise.

    Returns:
        The value returned by ``nw.score_alignment``.
    """
    has_score_match = self._score_match is not None
    has_score_mismatch = self._score_mismatch is not None

    if has_score_match and has_score_mismatch:
        matrix = self._get_matrix_file(match=self._score_match,
                                       mismatch=self._score_mismatch)
    elif self._matrix is not None:
        matrix = self._get_matrix_file(matrix=self._matrix)
    else:
        matrix = self._get_matrix_file(match=self._match,
                                       mismatch=self._mismatch)

    if self._score_gap_open is not None:
        gap_open = self._score_gap_open
    else:
        gap_open = self._gap_open

    if self._score_gap_extend is not None:
        gap_extend = self._score_gap_extend
    else:
        gap_extend = self._gap_extend

    return nw.score_alignment(self.aligned_query,
                              self.aligned_target,
                              gap_open=gap_open,
                              gap_extend=gap_extend,
                              matrix=matrix)
def _score_alignment(self):
    """Return the nwalign score of the already-aligned query/target pair.

    Matrix selection: when both ``_score_match`` and ``_score_mismatch`` are
    set they define the matrix; failing that an explicit ``_matrix`` is
    used; failing that the general ``_match`` / ``_mismatch`` pair is used.
    Each gap penalty falls back from its ``_score_*`` value to the general
    one when the former is ``None``.

    Returns:
        Whatever ``nw.score_alignment`` returns for the stored alignment.
    """
    scoring_pair = (self._score_match, self._score_mismatch)

    # Work out which keyword arguments describe the matrix, then resolve it.
    if all(value is not None for value in scoring_pair):
        matrix_spec = {'match': self._score_match,
                       'mismatch': self._score_mismatch}
    elif self._matrix is not None:
        matrix_spec = {'matrix': self._matrix}
    else:
        matrix_spec = {'match': self._match,
                       'mismatch': self._mismatch}
    matrix = self._get_matrix_file(**matrix_spec)

    gap_open = self._gap_open if self._score_gap_open is None else self._score_gap_open
    gap_extend = self._gap_extend if self._score_gap_extend is None else self._score_gap_extend

    aln = nw.score_alignment(
        self.aligned_query,
        self.aligned_target,
        gap_open=gap_open,
        gap_extend=gap_extend,
        matrix=matrix,
    )
    return aln
# Reference code for testing import nwalign as nw print("myNeedleman") print( myNeedleman("CEELECANTH", "PELICAN", matrix='atiam-fpa_alpha.dist', gap_open=-1, gap_extend=-1)) print("Nwalign") aligned = nw.global_align("CEELECANTH", "PELICAN", matrix='atiam-fpa_alpha.dist') score = nw.score_alignment(aligned[0], aligned[1], gap_open=-1, gap_extend=-1, matrix='atiam-fpa_alpha.dist') print('Results for basic gap costs (linear)') print(aligned[0]) print(aligned[1]) print('Score : ' + str(score)) print("myNeedleman") print( myNeedleman("CEELECANTH", "PELICAN", matrix='atiam-fpa_alpha.dist', gap_open=-5, gap_extend=-2)) print("Nwalign")
def score_alignment(self, seq1, seq2, gp_e=-2, gp_o=-5):
    """Score two already-aligned sequences against 'BLOSUM62.txt'.

    Args:
        seq1: First aligned sequence.
        seq2: Second aligned sequence.
        gp_e: Gap-extension penalty, passed as ``gap_extend``.
        gp_o: Gap-opening penalty, passed as ``gap_open``.

    Returns:
        The value returned by ``nw.score_alignment``.
    """
    gap_costs = {'gap_extend': gp_e, 'gap_open': gp_o}
    return nw.score_alignment(seq1, seq2, matrix='BLOSUM62.txt', **gap_costs)
def compare_datapoints(p1, p2):
    """Compare two gesture datapoints and return the negated best score.

    Elements ``[2:-2]`` of each datapoint are treated as the gesture parts
    and aligned pairwise with nwalign using 'alignment.matrix' (located next
    to this file). The last two elements of each datapoint are read as
    integers and their absolute differences are subtracted from the score.

    A 'flipped' (mirrored) version of ``p1`` is scored as well, to allow
    comparison between left-handed and right-handed gestures (although it's
    not super accurate). Whichever of the two scores is higher is used.

    Args:
        p1: First datapoint (indexable sequence).
        p2: Second datapoint (indexable sequence).

    Returns:
        ``-score`` for the better of the straight and flipped comparison.
    """
    gist_1 = p1[2:len(p1) - 2]
    gist_2 = p2[2:len(p2) - 2]
    gist_1_flipped = list(gist_1)

    # Floor division keeps the loop bound and the partner index integral on
    # Python 3 as well as Python 2 (plain `/` yields a float on Python 3,
    # which range() and list indexing reject).
    half = len(gist_1_flipped) // 2
    # Need to take into account not to flip spine (so we drop the last 3 parts)
    for i in range(0, (len(gist_1_flipped) - 3) // 2):
        partner = half + i - 3 + 1
        tmp = GestureComparison._flip_quadrants(gist_1_flipped[i])
        gist_1_flipped[i] = GestureComparison._flip_quadrants(
            gist_1_flipped[partner])
        gist_1_flipped[partner] = tmp

    # The matrix path does not change between iterations; resolve it once.
    matrix_path = os.path.dirname(os.path.realpath(__file__)) + '/alignment.matrix'

    def _weighted_score(part_a, part_b, index):
        # Align one pair of parts and score it; the last three parts count double.
        res = nwalign.global_align(part_a, part_b, matrix=matrix_path)
        value = nwalign.score_alignment(
            res[0], res[1], gap_open=0, gap_extend=-5, matrix=matrix_path)
        if index >= len(gist_1) - 3:
            value *= 2
        return value

    score = 0.0
    score_flipped = 0.0
    for i in range(0, len(gist_1)):
        # NOTE(review): gist_1[i] is compared with 0 while gist_2[i] is
        # compared with ''; the 0 may have been meant as '' - confirm.
        if gist_1[i] != 0 and gist_2[i] != '':
            score += _weighted_score(gist_1[i], gist_2[i], i)
            score_flipped += _weighted_score(gist_1_flipped[i], gist_2[i], i)

    if score_flipped > score:
        # For the mirrored comparison the two trailing values are crossed over.
        score_flipped -= abs(int(p1[-2]) - int(p2[-1])) + abs(int(p1[-1]) - int(p2[-2]))
        return -score_flipped
    else:
        score -= abs(int(p1[-2]) - int(p2[-2])) + abs(int(p1[-1]) - int(p2[-1]))
        return -score
def RemapReadsSingle(self, count=None):
    """Realign reads from ``self.sam_in`` and write them to ``self.sam_out``.

    Every read visited is written to the output. Unmapped reads, and (when
    ``self.only_gapped`` is True) reads whose CIGAR has no indel operation,
    are written unchanged. All other reads are globally re-aligned against
    the reference window they cover (including soft-clipped ends), get a new
    CIGAR, optionally a new 'AS' score, and 'OC'/'OP' tags.

    The output file is closed when the method finishes, including when an
    exception interrupts the loop.

    Args:
        count: Optional limit on the number of reads to realign.

    Raises:
        ValueError: If scores are being replaced (an 'AS' tag was seen on an
            early read) and a read to be realigned has no 'AS' tag.
    """
    counter = 0
    has_score = False
    write_mode = 'wb'
    if self.binary_mode is False:
        write_mode = 'wh'
    self._out = pysam.Samfile(self.sam_out, mode=write_mode,
                              referencenames=self.sam_in.references,
                              referencelengths=self.sam_in.lengths,
                              header=self._MakeHeader(self.sam_in.header))
    try:
        for read in self.sam_in.fetch():
            # Optional setting of count, to only realign count reads.
            # NOTE(review): the loop stops only once counter > count, so up
            # to count + 1 reads are realigned; confirm whether >= was meant.
            if count is not None and counter > count:
                break
            if read.is_unmapped is True:
                self._out.write(read)
                continue
            if counter == 0:
                # Check if an alignment score is already present.
                # If it is then record this in the has_score flag
                # (the flag is only ever switched on, never back off).
                if any(tag[0] == 'AS' for tag in read.tags):
                    has_score = True
            if self.only_gapped is True:
                # CIGAR operations 1 and 2 mark an indel in the read.
                has_indel = any(op[0] == 1 or op[0] == 2 for op in read.cigar)
                if not has_indel:
                    # If the read has no indel then don't realign it.
                    print(read.qname + ', ' + str(read.cigar) + " does not have an indel")
                    self._out.write(read)
                    continue

            # CIGAR operation 4 at either end is a soft clip; widen the
            # reference window so the clipped bases are realigned too.
            fivep_soft_clip = 0
            threep_soft_clip = 0
            cigar_last = len(read.cigar) - 1
            if read.cigar[0][0] == 4:
                fivep_soft_clip = read.cigar[0][1]
            if read.cigar[cigar_last][0] == 4:
                threep_soft_clip = read.cigar[cigar_last][1]
            ref = self.ref.fetch(reference=self.refnames[read.tid],
                                 start=read.aend - read.alen - fivep_soft_clip,
                                 end=read.aend + threep_soft_clip)

            # Realign sense strand reads
            if self.reverse_sense is True and read.is_reverse is False:
                query = self.ReverseSeq(read.seq)
                subject = self.ReverseSeq(ref.upper())
            else:
                query = read.seq
                subject = ref.upper()

            aln = nw.global_align(query, subject,
                                  gap_open=self.gap_open,
                                  gap_extend=self.gap_extend,
                                  matrix=self.matrix)

            if self.compute_scores is True:
                score = nw.score_alignment(aln[0], aln[1],
                                           gap_open=self.gap_open,
                                           gap_extend=self.gap_extend,
                                           matrix=self.matrix)
                if has_score is True:
                    # Replace the existing 'AS' tag (the last one, if several).
                    as_index = None
                    tags = read.tags
                    for i, tag in enumerate(tags):
                        if tag[0] == 'AS':
                            as_index = i
                    if as_index is None:
                        raise ValueError("Read " + read.qname + " is missing an alignment score.")
                    tags[as_index] = ('AS', score)
                    read.tags = tags
                else:
                    read.tags = [('AS', score)] + read.tags

            bam_cigar = self._MakeBamCigar(aln, read)
            if self.reverse_sense is True and read.is_reverse is False:
                # The alignment was made on reversed sequences; restore order.
                bam_cigar.reverse()
            if self.verbose is True:
                self.PrettyPrint(read, aln, bam_cigar)

            # New read
            read.cigar = bam_cigar
            # NOTE(review): 'OC' is built from read.cigar after it has been
            # replaced above, so it records the new CIGAR; confirm intent.
            read.tags = read.tags + [('OC', self._MakeSamCigar(read.cigar)), ('OP', read.pos)]
            self._out.write(read)
            counter += 1
    finally:
        # The original referenced `self._out.close` without calling it, so
        # the output was never explicitly closed.
        self._out.close()
def _RealignRead(self, read):
    """Realign a single read against the reference and return it.

    Unmapped reads, and (when ``self.only_gapped`` is True) reads whose
    CIGAR has no indel operation, are returned untouched. Otherwise the read
    is globally re-aligned against the reference window it covers (including
    soft-clipped ends), its CIGAR is replaced, its 'AS' tag is added or
    updated when ``self.compute_scores`` is True, and 'OC'/'OP' tags are
    appended.

    Args:
        read: The aligned read to process; it is modified in place.

    Returns:
        The same ``read`` object.

    Raises:
        ValueError: If an 'AS' tag was detected but cannot be found again
            when the score is written back.
    """
    if read.is_unmapped is True:
        return read

    # If any of the read tags are AS, then remember the read has an
    # existing score.
    has_score = any(tag[0] == 'AS' for tag in read.tags)

    if self.only_gapped is True:
        # CIGAR operations 1 and 2 mark an indel in the read.
        has_indel = any(op[0] == 1 or op[0] == 2 for op in read.cigar)
        if not has_indel:
            # If the read has no indel then don't realign it.
            print(read.qname + ', ' + str(read.cigar) + " does not have an indel")
            return read

    # CIGAR operation 4 at either end is a soft clip; widen the reference
    # window so the clipped bases are realigned too.
    fivep_soft_clip = 0
    threep_soft_clip = 0
    cigar_last = len(read.cigar) - 1
    if read.cigar[0][0] == 4:
        fivep_soft_clip = read.cigar[0][1]
    if read.cigar[cigar_last][0] == 4:
        threep_soft_clip = read.cigar[cigar_last][1]
    ref = self.ref.fetch(reference=self.refnames[read.tid],
                         start=read.aend - read.alen - fivep_soft_clip,
                         end=read.aend + threep_soft_clip)

    # Realign sense strand reads
    if self.reverse_sense is True and read.is_reverse is False:
        query = self.ReverseSeq(read.seq)
        subject = self.ReverseSeq(ref.upper())
    else:
        query = read.seq
        subject = ref.upper()

    # Formatted explicitly so the same "query subject" line is printed under
    # both Python 2 and Python 3 (the bare `print a, b` statement is a
    # syntax error on Python 3).
    print('%s %s' % (query, subject))

    aln = nw.global_align(query, subject,
                          gap_open=self.gap_open,
                          gap_extend=self.gap_extend,
                          matrix=self.matrix)

    if self.compute_scores is True:
        score = nw.score_alignment(aln[0], aln[1],
                                   gap_open=self.gap_open,
                                   gap_extend=self.gap_extend,
                                   matrix=self.matrix)
        if has_score is True:
            # Replace the existing 'AS' tag (the last one, if several).
            as_index = None
            tags = read.tags
            for i, tag in enumerate(tags):
                if tag[0] == 'AS':
                    as_index = i
            if as_index is None:
                raise ValueError("Read " + read.qname + " is missing an alignment score.")
            tags[as_index] = ('AS', score)
            read.tags = tags
        else:
            read.tags = [('AS', score)] + read.tags

    bam_cigar = self._MakeBamCigar(aln, read)
    if self.reverse_sense is True and read.is_reverse is False:
        # The alignment was made on reversed sequences; restore order.
        bam_cigar.reverse()
    if self.verbose is True:
        self.PrettyPrint(read, aln, bam_cigar)

    # New read
    read.cigar = bam_cigar
    # NOTE(review): 'OC' is built from read.cigar after it has been replaced
    # above, so it records the new CIGAR; confirm intent.
    read.tags = read.tags + [('OC', self._MakeSamCigar(read.cigar)), ('OP', read.pos)]
    return read
print 'Performing alignement on CEELECANTH and PELICAN...' ali = myNeedleman('CEELECANTH', 'PELICAN') print ali[0] print ali[1] print '' # Reference code for testing import nwalign as nw aligned = nw.global_align("CEELECANTH", "PELICAN", matrix='atiam-fpa_alpha.dist') score = nw.score_alignment('CEELECANTH', '-PELICAN--', gap_open=-5, gap_extend=-2, matrix='atiam-fpa_alpha.dist') print('Results for basic gap costs (linear)') print(aligned[0]) print(aligned[1]) print('Score : ' + str(score)) aligned = nw.global_align("CEELECANTH", "PELICAN", matrix='atiam-fpa_alpha.dist', gap_open=-5, gap_extend=-2) score = nw.score_alignment('CEELECANTH', '-PELICAN--', gap_open=-5, gap_extend=-2,
singreps=line.split("#") rep_count.append(len(singreps)-1) poslist=[] for singrep in singreps: if singrep != "\n": repinfo=singrep.split(":") poslist.append(int(repinfo[1])) rep_pos.append(poslist) locus_num.append(i) locus.append(curloc) fullscore=0 array=() score=0 for k in range(len(locus)): print "Processing repeat", k, "of", locus[k] fullscore=nw.score_alignment(rep[k], rep[k], gap_open=-5,\ gap_extend=-2, matrix='/home/CT/server/pybin/BLOSUM62') print >> fileout, ">", k, locus[k], rep[k], rep_count[k], rep_pos[k] for j in range(len(rep)): array=nw.global_align(rep[k], rep[j], gap_open=-5,\ gap_extend=-2, matrix='/home/CT/server/pybin/BLOSUM62') score=nw.score_alignment(array[0], array[1], gap_open=-5,\ gap_extend=-2, matrix='/home/CT/server/pybin/BLOSUM62') if score>0 and score/float(fullscore)>=threshold and j!=k: print >> fileout, j, locus[j], rep[j], rep_count[j], rep_pos[j] filein.close() fileout.close()
# Build a local-alignment helper from the scoring matrix file.
scoring = sw.ScoringMatrix('scoring_matrix.txt')
# NOTE: this rebinds the name `sw` to the LocalAlignment instance, so whatever
# `sw` referred to before is no longer reachable under that name.
sw = sw.LocalAlignment(scoring)
match = 2
n = 0
# Fill similarity_matrix and dist_matrix for every ordered pair of entries in
# unique_sequences (x indexes rows, y indexes columns).
for x, seq1 in enumerate(unique_sequences):
    for y, seq2 in enumerate(unique_sequences):
        # The alignment itself is computed without an explicit matrix or gap
        # penalties; only the scoring step below uses 'scoring_matrix.txt'.
        alignment = nw.global_align(allsequences[seq1], allsequences[seq2])
        score = float(
            nw.score_alignment(alignment[0], alignment[1], gap_open=-5, gap_extend=-2, matrix='scoring_matrix.txt'))
        # n is the aligned length times `match`; it caps the score magnitude
        # and normalises the distance entry.
        n = float(len(alignment[0]) * match)
        if abs(score) > n:
            score = 0
        similarity_matrix[x, y] = int(score)
        # NOTE(review): n is 0.0 when the aligned string is empty, which would
        # make this division fail - confirm inputs are never empty.
        dist_matrix[x, y] = float(score / n)
    # NOTE(review): the original layout was lost; the progress line is assumed
    # to run once per outer iteration (it reports one row of alignments).
    print '%d/%d Calculated %d alignments in %f seconds' % (
        x, len(unique_sequences), len(unique_sequences), time.clock() - t0)
    t0 = time.clock()