def structure_score(dom, pep, models, model_seqs, mat):
    """Return the best normalised contact score of (dom, pep) over all models.

    For every model in ``model_seqs`` the domain and peptide are globally
    aligned (gap_open=-100, gap_extend=-1, matrix ``mat``) against the
    model's "D" and "P" sequences and each alignment is passed to
    ``contact_map_helper``. Every contact ``(d, p)`` in ``models[model]`` whose
    ``d`` is in the domain matches and whose ``p`` is in the peptide matches
    contributes its value. The per-model score is the contact sum divided by
    the model's reference maximum and by the number of contacts used; a model
    with no usable contact scores 0.

    Raises ValueError (from ``max``) when ``model_seqs`` is empty.
    """
    model_max = {"1SSH": 49.900, "1ZUK": 61.800, "1N5Z": 48.100,
                 "2KYM": 47.100, "2RQW": 56.800, "2VKN": 51.100}
    per_model = []
    for name in model_seqs:
        template = model_seqs[name]
        dom_matches = contact_map_helper(
            nw.global_align(dom, "".join(template["D"]),
                            gap_open=-100, gap_extend=-1, matrix=mat))
        pep_matches = contact_map_helper(
            nw.global_align(pep, "".join(template["P"]),
                            gap_open=-100, gap_extend=-1, matrix=mat))
        contacts = models[name]
        used = [contacts[pair] for pair in contacts
                if pair[0] in dom_matches and pair[1] in pep_matches]
        if not used:
            per_model.append(0)
            continue
        per_model.append((sum(used) / float(model_max[name])) / len(used))
    return max(per_model)
def worker(seq1, seq2):
    """Return the best similarity of seq1 vs seq2 over both orientations.

    ``seq1`` is aligned against ``seq2`` and against ``rev_compliment(seq2)``
    using the 'deps/nuc44' matrix; each alignment is scored with ``sim`` and
    the larger value is divided by the length of the shorter input.
    """
    forward = nwalign.global_align(seq1, seq2, matrix='deps/nuc44')
    reverse = nwalign.global_align(seq1, rev_compliment(seq2),
                                   gap_open=-1, gap_extend=-1,
                                   matrix='deps/nuc44')
    forward_sim = sim(forward[0], forward[1])
    reverse_sim = sim(reverse[0], reverse[1])
    shorter = min(len(seq1), len(seq2))
    return float(max(forward_sim, reverse_sim)) / float(shorter)
def score_phonetic_alignment(srcw, tgtw, slang, tlang, sim_matrix_path,
                             gap_start_p=-1.0, gap_extend_p=-1.0):
    """Globally align two words and return the score of that alignment.

    A word whose language is in ``langinfo.SCRIPT_RANGES`` is first mapped
    with ``make_ascii`` (the aligner needs ASCII input); otherwise its
    characters are stringified as they are. The same matrix and gap
    penalties are used for aligning and for scoring.
    """
    def _as_align_input(word, lang):
        if lang in langinfo.SCRIPT_RANGES:
            return ''.join(make_ascii(word, lang))
        return ''.join([str(c) for c in word])

    scoring = dict(matrix=sim_matrix_path,
                   gap_open=gap_start_p,
                   gap_extend=gap_extend_p)
    nsrcw = _as_align_input(srcw, slang)
    ntgtw = _as_align_input(tgtw, tlang)
    src_aln, tgt_aln = nw.global_align(nsrcw, ntgtw, **scoring)
    return nw.score_alignment(src_aln, tgt_aln, **scoring)
def sound_seq_distance_str(self, seq1_str, seq2_str):
    """Return a length-normalised alignment distance between two sequences.

    Both inputs are converted to arrays and serialised to their raw bytes;
    the byte strings are globally aligned with the '/tmp/som.costs' matrix
    (gap_open=0, gap_extend=-5). The negated alignment score is divided by
    the combined byte length of the two inputs.

    Raises ZeroDivisionError when both inputs serialise to zero bytes.
    """
    # tobytes() replaces the deprecated ndarray.tostring() alias; serialise
    # each array once and reuse the result for aligning and normalising.
    raw1 = np.asanyarray(seq1_str).tobytes()
    raw2 = np.asanyarray(seq2_str).tobytes()
    scoring = dict(gap_open=0, gap_extend=-5, matrix='/tmp/som.costs')
    align = nw.global_align(raw1, raw2, **scoring)
    return (-nw.score_alignment(*align, **scoring)) / (len(raw1) + len(raw2) + 0.0)
def score_phonetic_alignment(srcw, tgtw, slang, tlang, sim_matrix_path,
                             gap_start_p=-1.0, gap_extend_p=-1.0):
    """Return the global-alignment score between a source and a target word.

    Words in a language listed in ``langinfo.SCRIPT_RANGES`` go through
    ``make_ascii`` before alignment (the align library needs ASCII); other
    words are used character by character via ``str``.
    """
    # Source word -> aligner input.
    if slang in langinfo.SCRIPT_RANGES:
        src_symbols = make_ascii(srcw, slang)
    else:
        src_symbols = [str(c) for c in srcw]
    nsrcw = ''.join(src_symbols)

    # Target word -> aligner input.
    if tlang in langinfo.SCRIPT_RANGES:
        tgt_symbols = make_ascii(tgtw, tlang)
    else:
        tgt_symbols = [str(c) for c in tgtw]
    ntgtw = ''.join(tgt_symbols)

    # Global alignment, then score it with the very same parameters.
    aligned = nw.global_align(nsrcw, ntgtw, matrix=sim_matrix_path,
                              gap_open=gap_start_p, gap_extend=gap_extend_p)
    src_aln, tgt_aln = aligned
    return nw.score_alignment(src_aln, tgt_aln, matrix=sim_matrix_path,
                              gap_open=gap_start_p, gap_extend=gap_extend_p)
def _align(self):
    """Globally align the query sequence against the target sequence.

    The scoring matrix comes from ``self._get_matrix_file``, called with the
    instance's match, mismatch and matrix settings; gap penalties come from
    ``self._gap_open`` and ``self._gap_extend``. Returns whatever
    ``nw.global_align`` returns.
    """
    matrix_file = self._get_matrix_file(match=self._match,
                                        mismatch=self._mismatch,
                                        matrix=self._matrix)
    return nw.global_align(self.query.sequence,
                           self.target.sequence,
                           gap_open=self._gap_open,
                           gap_extend=self._gap_extend,
                           matrix=matrix_file)
def alignMotifs(junctionSeqStart, junctionSeqEnd):
    """Align the 'side1' and 'side2' sequences of two motifs.

    For each side whose two sequences differ in length, both are globally
    aligned (gap_open=-10) and written back in place with alignment gaps
    rendered as '_'. Sides that already have equal length are left alone.
    Returns the two (mutated) inputs.
    """
    for side in ('side1', 'side2'):
        start_seq = junctionSeqStart.loc[side]
        end_seq = junctionSeqEnd.loc[side]
        if len(start_seq) == len(end_seq):
            continue
        aligned_start, aligned_end = nw.global_align(start_seq, end_seq,
                                                     gap_open=-10)
        junctionSeqStart.loc[side] = aligned_start.replace('-', '_')
        junctionSeqEnd.loc[side] = aligned_end.replace('-', '_')
    return junctionSeqStart, junctionSeqEnd
def calc_distance(x, y):
    """Return the fraction of mismatching columns among ungapped columns.

    When the module-level ``aln_fn`` is the empty string the inputs are
    first globally aligned (gap_open=-5, gap_extend=-1, match=1); otherwise
    they are compared as given. Columns where either side is '-' are
    ignored. Raises ZeroDivisionError if no column is comparable.
    """
    if aln_fn == '':
        x, y = nw.global_align(x, y, gap_open=-5, gap_extend=-1, match=1)
    compared = 0
    mismatched = 0
    for cx, cy in zip(x, y):
        if cx == '-' or cy == '-':
            continue
        compared += 1
        if cx != cy:
            mismatched += 1
    return 1. * mismatched / compared
def calc_distance(x, y):
    """Return the mismatch rate over the columns where neither side is a gap.

    The inputs are globally aligned first (gap_open=-5, gap_extend=-1,
    match=1) when the module-level ``aln_fn`` equals ''. Raises
    ZeroDivisionError when every column contains a gap or the input is empty.
    """
    if aln_fn == '':
        x, y = nw.global_align(x, y, gap_open=-5, gap_extend=-1, match=1)
    # Keep only the columns in which both sequences carry a residue.
    columns = [(a, b) for a, b in zip(x, y) if a != '-' and b != '-']
    differing = sum(1 for a, b in columns if a != b)
    return 1. * differing / len(columns)
def getPathsFromSequence(seqInit, seqEnd): a, b = nw.global_align(seqInit, seqEnd, gap_open=-10) # add bases to vecInit while len(vecInit) < len(vecEnd): vecInit += '_' while len(vecInit) > len(vecEnd): vecEnd += '_' steps = getSteps(vecInit, vecEnd, numSteps) graph = getGraphFromSteps(steps) paths = np.array(list(dfs_paths(graph, vecInit, vecEnd))) paths_parsed = formatPaths(paths, numSteps / 2)
def getPathsFromSequence(seqInit, seqEnd): a, b = nw.global_align(seqInit, seqEnd, gap_open=-10) # add bases to vecInit while len(vecInit) < len(vecEnd): vecInit += '_' while len(vecInit) > len(vecEnd): vecEnd += '_' steps = getSteps(vecInit, vecEnd, numSteps) graph = getGraphFromSteps(steps) paths = np.array(list(dfs_paths(graph, vecInit, vecEnd))) paths_parsed = formatPaths(paths, numSteps/2)
def testNeedleman(N):
    """Compare ``myNeedleman`` against nwalign on N random uppercase pairs.

    Each round draws a length in [1, SIZEMAX] for A and a length in
    [len(A) // 2, 2 * len(A)] for B, fills both with random letters A-Z and
    aligns them with both implementations (matrix 'atiam-fpa_alpha.dist',
    gap_open=-3, gap_extend=-3). Prints the percentage of rounds with an
    equal score and the percentage with an identical alignment and score.
    Inputs on which ``myNeedleman`` raises RuntimeError are printed and
    counted as failures.
    """
    alpha = [chr(code) for code in range(ord('A'), ord('Z') + 1)]
    scoring = dict(matrix='atiam-fpa_alpha.dist', gap_open=-3, gap_extend=-3)
    last = len(alpha) - 1
    validated = 0
    same_path = 0
    for epoch in tqdm(range(N)):
        sizeA = random.randint(1, SIZEMAX)
        sizeB = random.randint(sizeA // 2, 2 * sizeA)
        # Letters are drawn one by one, all of A first and then all of B.
        A = "".join(alpha[random.randint(0, last)] for _ in range(sizeA))
        B = "".join(alpha[random.randint(0, last)] for _ in range(sizeB))
        aligned = nw.global_align(A, B, **scoring)
        score = nw.score_alignment(aligned[0], aligned[1], **scoring)
        reference = (aligned[0], aligned[1], score)
        try:
            (a, b, s) = myNeedleman(A, B, **scoring)
            if s == score:
                validated += 1
            if reference == (a, b, s):
                same_path += 1
        except RuntimeError:
            print(A, B)
    print(str(100 * validated / N) + "% are validated.")
    print(str(100 * same_path / N) + "% have the exact same path.")
def sim(a, b):
    """Return the fraction of identical columns between two aligned sequences.

    The inputs are expected to be aligned already; if their lengths differ
    they are realigned with ``nw.global_align`` and a notice is printed.
    Columns where both sequences have a gap ('-') are skipped. A column with
    a gap on only one side counts as compared but not as a match.

    Returns 0.0 when no column can be compared (empty input or only
    gap/gap columns) instead of dividing by zero.
    """
    # This data _has_ been aligned, but sometimes the lengths still differ;
    # in that case send it through Needleman-Wunsch.
    if len(a) != len(b):
        a, b = nw.global_align(a, b)
        print("Length mismatch, realigned using nw")
    matches = 0
    compared = 0
    for char_a, char_b in zip(a, b):
        if char_a == '-' and char_b == '-':
            continue
        compared += 1
        if char_a == char_b:
            matches += 1
    if compared == 0:
        return 0.0
    return float(matches) / float(compared)
def merge_overlaps(seq1, qual1, seq2, qual2):
    """Merge two sequences that overlap at the ends.

    This assumes both sequences are given in forward orientation. The second
    sequence is passed through ``quick_revcom`` (its quality string is
    reversed to match), then the two sequences are globally aligned
    (gap_open=-50, gap_extend=-2) to find the overlap. For every alignment
    column:

    * a gap on one side takes the base from the other side;
    * an 'N' on one side takes the base from the other side;
    * identical bases are kept;
    * on a mismatch the base with the higher quality character (compared by
      ASCII value, ties go to seq1) is kept.

    >>> ord('B')
    66

    The alignment is done using the nwalign package which implements the
    Needleman-Wunsch algorithm in C via Cython.

    Returns the merged sequence as a string.
    """
    seq1 = seq1.upper()  # to make sure
    seq2 = quick_revcom(seq2)
    qual2 = qual2[::-1]

    aligned1, aligned2 = nw.global_align(seq1, seq2, gap_open=-50,
                                         gap_extend=-2)

    merged = []
    # Cursors into the *unaligned* quality strings. They advance only when
    # the corresponding sequence consumes a base, so they stay in step with
    # the alignment across gaps.
    qpos1 = 0
    qpos2 = 0
    for base1, base2 in zip(aligned1, aligned2):
        # Gaps are checked before 'N' so that an N facing a gap yields 'N'
        # rather than writing a '-' into the merged sequence.
        if base1 == '-':
            merged.append(base2)
        elif base2 == '-':
            merged.append(base1)
        elif base1 == 'N':
            merged.append(base2)
        elif base2 == 'N':
            merged.append(base1)
        elif base1 == base2:
            merged.append(base1)
        elif ord(qual1[qpos1]) >= ord(qual2[qpos2]):
            # mismatch: consensus from the quality scores
            merged.append(base1)
        else:
            merged.append(base2)
        if base1 != '-':
            qpos1 += 1
        if base2 != '-':
            qpos2 += 1
    return ''.join(merged)
def align_transliterations(src_wordlist, tgt_wordlist, lang):
    """Yield an aligned (source, target) pair for each pair of input words.

    Words are paired positionally. Each word is mapped with ``make_ascii``
    (the align library needs ASCII), the pair is globally aligned, and both
    aligned strings are mapped back with ``restore_from_ascii``.
    """
    for srcw, tgtw in itertools.izip(src_wordlist, tgt_wordlist):
        ascii_src = ''.join(make_ascii(srcw, lang))
        ascii_tgt = ''.join(make_ascii(tgtw, lang))
        aligned_src, aligned_tgt = nw.global_align(ascii_src, ascii_tgt)
        readable_src = restore_from_ascii(aligned_src, lang)
        readable_tgt = restore_from_ascii(aligned_tgt, lang)
        yield (readable_src, readable_tgt)
def gs_align(left, right):
    """Align two annotated strings and pad their annotation lists to match.

    ``left`` and ``right`` are 4-tuples ``(string, duration, start, end)``.
    The strings are globally aligned; wherever an aligned string has a gap,
    ``None`` is inserted at that index into the matching duration, start and
    end lists (the lists are mutated in place).

    Returns ``(left_aligned, right_aligned, align_ix)`` where ``align_ix``
    lists the column indices at which both aligned strings hold the same
    character.
    """
    lstring, lduration, lstart, lend = left
    rstring, rduration, rstart, rend = right
    lalign, ralign = nw.global_align(lstring, rstring)

    def pad_tracks(aligned, *tracks):
        # One None per gap column, in every annotation track.
        for pos, ch in enumerate(aligned):
            if ch != '-':
                continue
            for track in tracks:
                track.insert(pos, None)

    pad_tracks(lalign, lduration, lstart, lend)
    pad_tracks(ralign, rduration, rstart, rend)

    align_ix = [pos for pos, (lch, rch) in enumerate(zip(lalign, ralign))
                if lch == rch]
    return lalign, ralign, align_ix
def align_transliterations(src_wordlist, tgt_wordlist, lang):
    """Generate globally aligned (source, target) word pairs.

    For every positional pair of words: convert both to the ASCII form the
    align library requires (``make_ascii``), align them, then convert the
    aligned strings back to readable form (``restore_from_ascii``).
    """
    word_pairs = itertools.izip(src_wordlist, tgt_wordlist)
    for srcw, tgtw in word_pairs:
        # ASCII forms, joined into plain strings for the aligner.
        nsrcw = ''.join(make_ascii(srcw, lang))
        ntgtw = ''.join(make_ascii(tgtw, lang))
        alignment = nw.global_align(nsrcw, ntgtw)
        src_aln, tgt_aln = alignment
        # Make it readable again.
        yield (restore_from_ascii(src_aln, lang),
               restore_from_ascii(tgt_aln, lang))
def cl2string(recorded, target):
    """Align two (time, label) sequences by their labels.

    Every distinct label is assigned one character from
    ``string.ascii_letters``; both label sequences are encoded as strings and
    globally aligned. The time lists get a ``None`` at every index where the
    corresponding aligned string has a gap.

    Returns ``(recalign, tgtalign, rectimes, tgttimes)``.
    """
    all_labels = list(set([l for _, l in recorded] + [l for _, l in target]))
    l2char = {label: string.ascii_letters[idx]
              for idx, label in enumerate(all_labels)}

    def encode(pairs):
        return "".join(l2char[l] for _, l in pairs)

    recalign, tgtalign = nw.global_align(encode(recorded), encode(target))

    rectimes = [t for t, _ in recorded]
    tgttimes = [t for t, _ in target]
    columns = list(zip(recalign, tgtalign))
    for pos, (_, tgtchar) in enumerate(columns):
        if tgtchar == '-':
            tgttimes.insert(pos, None)
    for pos, (recchar, _) in enumerate(columns):
        if recchar == '-':
            rectimes.insert(pos, None)
    return recalign, tgtalign, rectimes, tgttimes
def cl2string(recorded, target):
    """Encode two (time, label) sequences as strings and align them.

    Each distinct label is mapped to a character of ``string.ascii_letters``;
    the encoded strings are globally aligned, and ``None`` placeholders are
    inserted into the time lists wherever the matching aligned string has a
    gap ('-').

    Returns ``(recalign, tgtalign, rectimes, tgttimes)``.
    """
    all_labels = list(set([l for _, l in recorded] + [l for _, l in target]))
    l2char = {}
    for position, label in enumerate(all_labels):
        l2char[label] = string.ascii_letters[position]

    rec_s = "".join([l2char[l] for _, l in recorded])
    tgt_s = "".join([l2char[l] for _, l in target])
    recalign, tgtalign = nw.global_align(rec_s, tgt_s)

    rectimes = [entry[0] for entry in recorded]
    tgttimes = [entry[0] for entry in target]
    column = 0
    for recchar, tgtchar in zip(recalign, tgtalign):
        if tgtchar == '-':
            tgttimes.insert(column, None)
        if recchar == '-':
            rectimes.insert(column, None)
        column += 1
    return recalign, tgtalign, rectimes, tgttimes
def align():
    """Align hg19 against YRI per chromosome and write the differing columns.

    Reads 'hg19.fa' and 'YRIref.fasta', takes the first 10000 bases of each
    chromosome key of hg19 (visited in reverse-sorted order), globally aligns
    them (gap=-2, match=1, mismatch=-1) and writes one comma-separated line
    per differing alignment column to 'hg19_YRI_diff.bed':
    chrom, column index, column index + 1, hg19 char, YRI char.

    Progress messages are printed to stdout. The output file is closed even
    if an alignment fails.
    """
    hg19 = Fasta('hg19.fa')
    print(hg19.keys())
    hg19Chr = sorted(hg19.keys(), reverse=True)
    YRI = Fasta('YRIref.fasta')
    print(YRI.keys())
    YRIChr = sorted(YRI.keys())
    # The 20-base previews are printed twice, as in the original script.
    for _ in range(2):
        print(hg19[hg19Chr[0]][:20])
        print(YRI[YRIChr[0]][:20])

    # `with` guarantees the file is closed on any exit path.
    with open('hg19_YRI_diff.bed', 'w') as fhout:
        fhout.write('chrom, chromStart, chromEnd, hg19, YRI \n')
        for each in hg19Chr:
            seq1 = hg19[each][:10000]
            seq2 = YRI[each][:10000]
            print('reached 1')
            # Two spaces: the literal's trailing space plus the separator the
            # Python 2 print statement inserted between its two arguments.
            print('doing alignment for  %s' % (each,))
            alignment = nw.global_align(seq1, seq2, gap=-2, matrix=None,
                                        match=1, mismatch=-1)
            print('reached 2')
            aligned_hg19 = alignment[0]
            aligned_yri = alignment[1]
            for i in range(max(len(aligned_hg19), len(aligned_yri))):
                if aligned_hg19[i] != aligned_yri[i]:
                    fhout.write(each + ',' + str(i) + ',' + str(i + 1) + ','
                                + aligned_hg19[i] + ',' + aligned_yri[i]
                                + '\n')
def pairwise_from_seq(self, in_seq):
    """Map this object's pairwise values onto the positions of ``in_seq``.

    ``self.pairwise`` is computed on demand if it is None. ``self.seq`` and
    ``in_seq`` are globally aligned; walking the alignment, a running index
    is kept for each sequence (advanced on non-gap characters) and, once
    both indices have started, the pair (in_seq index, self index) is
    recorded for every column. For every ordered pair of recorded entries
    with different self indices, the result maps
    ``(in_index_1, in_index_2)`` to ``self.pairwise[(self_index_1,
    self_index_2)]``.
    """
    if self.pairwise is None:
        self.calculate_pairwise()
    s_seq, i_seq = global_align(self.seq, in_seq)

    self_pos = -1
    in_pos = -1
    index_pairs = []
    for s_char, i_char in zip(s_seq, i_seq):
        if s_char != '-':
            self_pos += 1
        if i_char != '-':
            in_pos += 1
        if self_pos == -1 or in_pos == -1:
            continue
        index_pairs.append((in_pos, self_pos))

    pdict = {}
    for (in_a, self_a), (in_b, self_b) in product(index_pairs, repeat=2):
        if self_a == self_b:
            continue
        pdict[(in_a, in_b)] = self.pairwise[(self_a, self_b)]
    return pdict
def alignSequences(targetsite_sequence, window_sequence, max_errors=7):
    """Find the best fuzzy match of a target site inside a window sequence.

    The window (upper-cased) and its reverse complement are each searched
    with the 'standard' and the 'gapped' regex from ``regexFromSequence``,
    using ``regex.BESTMATCH``. Candidates are ranked by
    ``substitutions + 3 * (insertions + deletions)``; the lowest score below
    100 wins, earlier candidates winning ties.

    Returns a 7-item list
    ``[match_sequence, distance, length, strand, start, end, target]`` where
    ``distance`` is the sum of the fuzzy counts. If the chosen match contains
    insertions or deletions, the match and the target are realigned with
    ``nw.global_align`` (NUC_SIMPLE matrix next to this file) and the
    realigned strings are returned in their place. Without any match the
    result is six empty strings followed by 'none'.
    """
    window_sequence = window_sequence.upper()

    # Try both strands with both query forms.
    query_regex_standard, query_regex_gap = regexFromSequence(
        targetsite_sequence, errors=max_errors)
    alignments = [
        ('+', 'standard',
         regex.search(query_regex_standard, window_sequence,
                      regex.BESTMATCH)),
        ('-', 'standard',
         regex.search(query_regex_standard,
                      reverseComplement(window_sequence), regex.BESTMATCH)),
        ('+', 'gapped',
         regex.search(query_regex_gap, window_sequence, regex.BESTMATCH)),
        ('-', 'gapped',
         regex.search(query_regex_gap,
                      reverseComplement(window_sequence), regex.BESTMATCH)),
    ]

    lowest_distance_score = 100
    chosen_alignment = None
    chosen_alignment_strand = None
    for strand, alignment_type, match in alignments:
        if match is None:
            continue
        substitutions, insertions, deletions = match.fuzzy_counts
        # Indels are penalised three times as heavily as substitutions.
        distance_score = substitutions + (insertions + deletions) * 3
        if distance_score < lowest_distance_score:
            chosen_alignment = match
            chosen_alignment_strand = strand
            lowest_distance_score = distance_score

    if not chosen_alignment:
        return [''] * 6 + ['none']

    match_sequence = chosen_alignment.group()
    _, match_insertions, match_deletions = chosen_alignment.fuzzy_counts
    distance = sum(chosen_alignment.fuzzy_counts)
    length = len(match_sequence)
    start = chosen_alignment.start()
    end = chosen_alignment.end()

    if match_insertions or match_deletions:
        # Only the gapped case needs the matrix that sits next to this file.
        path = os.path.dirname(os.path.abspath(__file__))
        realigned_match_sequence, realigned_target = nw.global_align(
            match_sequence, targetsite_sequence, gap_open=-10,
            gap_extend=-100, matrix='{0}/NUC_SIMPLE'.format(path))
        return [realigned_match_sequence, distance, length,
                chosen_alignment_strand, start, end, realigned_target]
    return [match_sequence, distance, length, chosen_alignment_strand,
            start, end, targetsite_sequence]
def align_sequences(self, seq1, seq2):
    """Return the global alignment of ``seq1`` and ``seq2``.

    The alignment is scored with the matrix file 'BLOSUM62.txt'.
    """
    return nw.global_align(seq1, seq2, matrix='BLOSUM62.txt')
kw[k] = int(kwargs[2 * i + 1]) return a, b, kw while True: client, address = server.accept() data = True while data: try: data = client.recv(CHUNK).strip() if data == "EXIT": client.close() server.close() print "EXITING service" sys.exit(0) a, b, kwargs = get_args(data) r = global_align(a, b, **kwargs) client.send(" ".join(r)) except Exception, e: try: client.send("ERROR:" + str(e)) except socket.error: # they already closed... client.close() break client.close() atexit.register(server.close)
# Serve alignment requests: accept one client at a time and answer each
# request it sends until it disconnects or asks the service to exit.
# NOTE(review): the nesting below is reconstructed from a whitespace-collapsed
# source line -- confirm it against the original file.
while True:
    client, address = server.accept()
    data = True
    # Keep reading from this client until recv() yields an empty payload.
    while data:
        try:
            data = client.recv(CHUNK).strip()
            if data == "EXIT":
                # Shut the whole service down. sys.exit raises SystemExit,
                # which the `except Exception` clause below does not catch.
                client.close()
                server.close()
                print "EXITING service"
                sys.exit(0)
            # Parse the request into the two sequences plus keyword options,
            # align, and send both aligned strings back separated by a space.
            a,b, kwargs = get_args(data)
            r = global_align(a, b, **kwargs)
            client.send(" ".join(r))
        except Exception, e:
            # Report the failure to the client; if even that fails, the
            # client has gone away, so drop the connection.
            try:
                client.send("ERROR:" + str(e))
            except socket.error:
                # they already closed...
                client.close()
                break
    client.close()
# NOTE(review): this line follows a `while True` loop that the visible code
# only leaves through sys.exit or an uncaught exception.
atexit.register(server.close)
def compare_datapoints(p1, p2):
    """Return a distance between two gesture datapoints (lower is closer).

    The "gist" of a datapoint is everything except its first two and last
    two fields. Each gist part of ``p1`` is aligned against the same part of
    ``p2`` and the alignment scores are summed, with the last three parts
    counted twice. The same is done for a mirrored copy of ``p1``'s gist, to
    allow comparing left-handed with right-handed gestures (not very
    accurate). The higher of the two totals is reduced by the absolute
    differences of the last two fields (cross-wise for the mirrored variant)
    and returned negated.
    """
    matrix_path = (os.path.dirname(os.path.realpath(__file__))
                   + '/alignment.matrix')

    def part_score(part_a, part_b, doubled):
        # NOTE(review): the alignment uses default gap penalties while the
        # scoring uses gap_open=0 / gap_extend=-5, as in the original.
        res = nwalign.global_align(part_a, part_b, matrix=matrix_path)
        value = nwalign.score_alignment(res[0], res[1], gap_open=0,
                                        gap_extend=-5, matrix=matrix_path)
        if doubled:
            value *= 2
        return value

    gist_1 = p1[2:len(p1) - 2]
    gist_2 = p2[2:len(p2) - 2]

    # Mirrored copy: swap (and flip) parts between the two halves, leaving
    # the last 3 parts (spine) alone. Floor division keeps the range bound
    # and the indices integral on Python 3 as well as Python 2.
    gist_1_flipped = list(gist_1)
    half = len(gist_1_flipped) // 2
    for i in range(0, (len(gist_1_flipped) - 3) // 2):
        partner = half + i - 3 + 1
        tmp = GestureComparison._flip_quadrants(gist_1_flipped[i])
        gist_1_flipped[i] = GestureComparison._flip_quadrants(
            gist_1_flipped[partner])
        gist_1_flipped[partner] = tmp

    score = 0.0
    score_flipped = 0.0
    spine_start = len(gist_1) - 3
    for i in range(0, len(gist_1)):
        # NOTE(review): gist_1 is compared with 0 while gist_2 is compared
        # with '' -- possibly a typo for '' in the original; kept as is.
        if gist_1[i] != 0 and gist_2[i] != '':
            doubled = i >= spine_start
            score += part_score(gist_1[i], gist_2[i], doubled)
            score_flipped += part_score(gist_1_flipped[i], gist_2[i], doubled)

    if score_flipped > score:
        score_flipped -= (abs(int(p1[-2]) - int(p2[-1]))
                          + abs(int(p1[-1]) - int(p2[-2])))
        return -score_flipped
    score -= abs(int(p1[-2]) - int(p2[-2])) + abs(int(p1[-1]) - int(p2[-1]))
    return -score
singreps=line.split("#") rep_count.append(len(singreps)-1) poslist=[] for singrep in singreps: if singrep != "\n": repinfo=singrep.split(":") poslist.append(int(repinfo[1])) rep_pos.append(poslist) locus_num.append(i) locus.append(curloc) fullscore=0 array=() score=0 for k in range(len(locus)): print "Processing repeat", k, "of", locus[k] fullscore=nw.score_alignment(rep[k], rep[k], gap_open=-5,\ gap_extend=-2, matrix='/home/CT/server/pybin/BLOSUM62') print >> fileout, ">", k, locus[k], rep[k], rep_count[k], rep_pos[k] for j in range(len(rep)): array=nw.global_align(rep[k], rep[j], gap_open=-5,\ gap_extend=-2, matrix='/home/CT/server/pybin/BLOSUM62') score=nw.score_alignment(array[0], array[1], gap_open=-5,\ gap_extend=-2, matrix='/home/CT/server/pybin/BLOSUM62') if score>0 and score/float(fullscore)>=threshold and j!=k: print >> fileout, j, locus[j], rep[j], rep_count[j], rep_pos[j] filein.close() fileout.close()
def run_align_nw(data):
    """Globally align the first two items of every entry in ``data``.

    The alignment results are discarded; nothing is returned.
    """
    for entry in data:
        first, second = entry[0], entry[1]
        nw.global_align(first, second)
0:MAX_SEQUENCES] # only do the first 50 for speed.. similarity_matrix = np.zeros((len(unique_sequences), len(unique_sequences))) dist_matrix = np.zeros((len(unique_sequences), len(unique_sequences))) scoring = sw.ScoringMatrix('scoring_matrix.txt') sw = sw.LocalAlignment(scoring) match = 2 n = 0 for x, seq1 in enumerate(unique_sequences): for y, seq2 in enumerate(unique_sequences): alignment = nw.global_align(allsequences[seq1], allsequences[seq2]) score = float( nw.score_alignment(alignment[0], alignment[1], gap_open=-5, gap_extend=-2, matrix='scoring_matrix.txt')) n = float(len(alignment[0]) * match) if abs(score) > n: score = 0 similarity_matrix[x, y] = int(score) dist_matrix[x, y] = float(score / n)
'''Lee un fasta y devuelve la secuencia que contiene''' F = open(fastafile) #Definimos la variable "secuencia" secuencia = '' #Iniciamos el bucle de lectura for linea in F: #Asociamos a la variable "linea" cada línea del archivo, pero sin el salto de línea linea = linea.strip('\n') #Comprobamos que la línea no está vacía, ni empieza por '>' (1ª linea de info) if linea != '' and linea[0] != '>' : #Añadimos a la variable "secuencia" cada iteración de linea secuencia = secuencia + linea return secuencia D = fasta2sec('Q9V429.fasta') H = fasta2sec('P10599.fasta') acounter = 0 bcounter = 0 a, b = nw.global_align(D, H, matrix='BLOSUM62.txt') for aa in a: if aa != '-': acounter += 1 for aa in b: if aa != '-': bcounter += 1 #print a #print b print ((acounter*100./len(a))+(bcounter*100./len(b))) / 2
matrix='atiam-fpa_alpha.dist', gap_open=-5, gap_extend=-2) # Reference code for testing import nwalign as nw print("myNeedleman") print( myNeedleman("CEELECANTH", "PELICAN", matrix='atiam-fpa_alpha.dist', gap_open=-1, gap_extend=-1)) print("Nwalign") aligned = nw.global_align("CEELECANTH", "PELICAN", matrix='atiam-fpa_alpha.dist') score = nw.score_alignment(aligned[0], aligned[1], gap_open=-1, gap_extend=-1, matrix='atiam-fpa_alpha.dist') print('Results for basic gap costs (linear)') print(aligned[0]) print(aligned[1]) print('Score : ' + str(score)) print("myNeedleman") print( myNeedleman("CEELECANTH", "PELICAN",
def _RealignRead(self, read):
    """Realign one read against its reference window and return it.

    Unmapped reads are returned untouched, and so are reads without an indel
    when ``self.only_gapped`` is set. Otherwise the reference stretch covered
    by the read (widened by leading/trailing soft clips) is fetched, the read
    is globally aligned to it, the 'AS' tag is set when
    ``self.compute_scores`` is on, and the read's cigar and tags are updated
    in place.

    NOTE(review): the nesting below is reconstructed from a
    whitespace-collapsed source -- confirm it against the original file.
    """
    has_score = False
    if read.is_unmapped is True:
        return read
    tags = read.tags
    '''If any of the read tags are AS, then remember the read has an existing score.'''
    for i in range(0, len(tags)):
        if tags[i][0] == 'AS':
            has_score = True
            continue
    if self.only_gapped is True:
        has_indel = False
        for c in read.cigar:
            if c[0] == 1 or c[0] == 2:  # read has an indel
                has_indel = True
                break
        if has_indel == False:  # Read must not have an indel
            ''' If the read is a perfect match then don't realign '''
            print(read.qname + ', ' + str(read.cigar) + " does not have an indel")
            return read
    # Soft-clip lengths at either end of the cigar (operation code 4); they
    # widen the reference window fetched below.
    fivep_soft_clip = 0
    threep_soft_clip = 0
    cigar_last = len(read.cigar) - 1
    if read.cigar[0][0] == 4:
        fivep_soft_clip = read.cigar[0][1]
    if read.cigar[cigar_last][0] == 4:
        threep_soft_clip = read.cigar[cigar_last][1]
    ref = self.ref.fetch(reference=self.refnames[read.tid], start=read.aend - read.alen - fivep_soft_clip, end=read.aend + threep_soft_clip)
    # Realign sense strand reads: with reverse_sense on, forward-strand reads
    # and their reference are passed through ReverseSeq before aligning.
    query = ''
    subject = ''
    if self.reverse_sense is True and read.is_reverse is False:
        query = self.ReverseSeq(read.seq)
        subject = self.ReverseSeq(ref.upper())
    else:
        query = read.seq
        subject = ref.upper()
    print query, subject
    aln = nw.global_align(query, subject, gap_open=self.gap_open, gap_extend=self.gap_extend, matrix=self.matrix)
    if self.compute_scores is True:
        score = nw.score_alignment(aln[0], aln[1], gap_open=self.gap_open, gap_extend=self.gap_extend, matrix=self.matrix)
        if has_score is True:
            # Overwrite the existing AS tag (the last one if several exist).
            as_index = None
            tags = read.tags
            for i in range(0, len(tags)):
                if tags[i][0] == 'AS':
                    as_index = i
            if as_index is None:
                raise ValueError("Read " + read.qname + " is missing an alignment score.")
            tags[as_index] = ('AS', score)
            read.tags = tags
        else:
            read.tags = [('AS', score)] + read.tags
    bam_cigar = self._MakeBamCigar(aln, read)
    # The alignment was made on reversed sequences, so reverse the cigar too.
    if self.reverse_sense is True and read.is_reverse is False:
        bam_cigar.reverse()
    if self.verbose is True:
        self.PrettyPrint(read, aln, bam_cigar)
    # New read
    read.cigar = bam_cigar
    # NOTE(review): read.cigar was replaced on the previous line, so the 'OC'
    # tag is built from the NEW cigar here -- confirm that is intended.
    read.tags = read.tags + [('OC', self._MakeSamCigar(read.cigar)), ('OP', read.pos)]
    return read
type = str, help = "Which allele to align against" ) if __name__ == '__main__': args = parser.parse_args() print args with open(args.input_filename,'r') as f: d = parse_fasta(f.read()) allele = args.reference_allele if allele in d: refseq = d[allele] else: refseq = d[allele.replace("*", "")] result = {} for k,v in d.iteritems(): x, y = nw.global_align(refseq, v, gap_open=-40, gap_extend=-20, matrix='BLOSUM50') good_positions = [i for i,xi in enumerate(x) if xi != "-"] x_subset = "".join(x[i] for i in good_positions) y_subset = "".join(y[i] for i in good_positions) result[k] = y_subset if args.output_filename: with open(args.output_filename, 'w') as f: for k,v in result.iteritems(): f.write(">%s\n%s\n" % (k, v))
def RemapReadsSingle(self, count=None):
    """Realign the reads of ``self.sam_in`` and write them to ``self.sam_out``.

    Unmapped reads are copied through unchanged, and so are reads without an
    indel when ``self.only_gapped`` is set. Every other read is globally
    aligned against the reference stretch it covers (widened by its leading
    and trailing soft clips); its cigar is replaced, 'OC' and 'OP' tags are
    appended and, when ``self.compute_scores`` is on, the 'AS' tag is set or
    overwritten.

    count: optional cap on the number of reads to realign; processing stops
        once that many reads have been realigned.

    Raises ValueError when an 'AS' tag was seen on an earlier read but the
    current read has none. The output file is always closed, also on error.
    """
    counter = 0
    has_score = False
    write_mode = 'wb'
    if self.binary_mode is False:
        write_mode = 'wh'
    self._out = pysam.Samfile(self.sam_out, mode=write_mode,
                              referencenames=self.sam_in.references,
                              referencelengths=self.sam_in.lengths,
                              header=self._MakeHeader(self.sam_in.header))
    try:
        for read in self.sam_in.fetch():
            # Optional cap: stop once `count` reads have been realigned.
            if count is not None and counter >= count:
                break
            if read.is_unmapped is True:
                self._out.write(read)
                continue
            if counter == 0:
                # Until the first read has been realigned, remember whether
                # the input already carries alignment scores.
                if any(tag[0] == 'AS' for tag in read.tags):
                    has_score = True
            if self.only_gapped is True:
                # Cigar operations 1 and 2 mark an indel.
                has_indel = any(c[0] == 1 or c[0] == 2 for c in read.cigar)
                if not has_indel:
                    print(read.qname + ', ' + str(read.cigar) + " does not have an indel")
                    self._out.write(read)
                    continue

            # Soft clips (operation 4) at either end widen the reference window.
            fivep_soft_clip = 0
            threep_soft_clip = 0
            cigar_last = len(read.cigar) - 1
            if read.cigar[0][0] == 4:
                fivep_soft_clip = read.cigar[0][1]
            if read.cigar[cigar_last][0] == 4:
                threep_soft_clip = read.cigar[cigar_last][1]
            ref = self.ref.fetch(reference=self.refnames[read.tid],
                                 start=read.aend - read.alen - fivep_soft_clip,
                                 end=read.aend + threep_soft_clip)

            # Realign sense strand reads on reversed sequences if requested.
            if self.reverse_sense is True and read.is_reverse is False:
                query = self.ReverseSeq(read.seq)
                subject = self.ReverseSeq(ref.upper())
            else:
                query = read.seq
                subject = ref.upper()
            aln = nw.global_align(query, subject, gap_open=self.gap_open,
                                  gap_extend=self.gap_extend,
                                  matrix=self.matrix)

            if self.compute_scores is True:
                score = nw.score_alignment(aln[0], aln[1],
                                           gap_open=self.gap_open,
                                           gap_extend=self.gap_extend,
                                           matrix=self.matrix)
                if has_score is True:
                    # Overwrite the existing AS tag (the last one if several).
                    as_index = None
                    tags = read.tags
                    for i in range(0, len(tags)):
                        if tags[i][0] == 'AS':
                            as_index = i
                    if as_index is None:
                        raise ValueError("Read " + read.qname + " is missing an alignment score.")
                    tags[as_index] = ('AS', score)
                    read.tags = tags
                else:
                    read.tags = [('AS', score)] + read.tags

            bam_cigar = self._MakeBamCigar(aln, read)
            if self.reverse_sense is True and read.is_reverse is False:
                bam_cigar.reverse()
            if self.verbose is True:
                self.PrettyPrint(read, aln, bam_cigar)

            # New read
            read.cigar = bam_cigar
            # NOTE(review): read.cigar was replaced just above, so 'OC' is
            # built from the new cigar -- confirm that is intended.
            read.tags = read.tags + [('OC', self._MakeSamCigar(read.cigar)),
                                     ('OP', read.pos)]
            self._out.write(read)
            counter += 1
    finally:
        # The original referenced `self._out.close` without calling it, so
        # the output was never explicitly closed.
        self._out.close()
def nwalign(s1, s2):
    """Return the global alignment of ``s1`` and ``s2`` with default scoring."""
    aligned = nw.global_align(s1, s2)
    return aligned
def get_rmsd_rotation_and_translations(file1, file2):
    """Superpose the matching residues of two PDB files.

    The sequences of both files are globally aligned. Residues that sit in
    the same alignment column and are identical are selected, their atoms
    are paired by atom name, and ``pyqcprot.CalcRMSDRotationalMatrix`` is run
    on the paired coordinates.

    Returns ``(rotation, trans1, trans2)``: the 3x3 rotation matrix and, for
    each structure, the change of its first selected atom's coordinates
    across the pyqcprot call.

    Exits the program (``sys.exit``) if two matched residues do not have the
    same set of atom names.
    """
    nres1 = get_total_residue_number(file1)
    nres2 = get_total_residue_number(file2)

    seq1, ind1 = get_sequence_from_PDB(file1)
    seq2, ind2 = get_sequence_from_PDB(file2)

    aligned1, aligned2 = nw.global_align(seq1, seq2)

    # Walk the ALIGNED strings (the original indexed the raw sequences, which
    # contain no gaps, so the alignment was effectively ignored). idx1/idx2
    # track the position in each ungapped sequence.
    ind1new = []
    ind2new = []
    idx1, idx2 = 0, 0
    for res1, res2 in zip(aligned1, aligned2):
        if res1 == res2 and res1 != '-':
            ind1new.append(ind1[idx1])
            ind2new.append(ind2[idx2])
        if res1 != '-':
            idx1 += 1
        if res2 != '-':
            idx2 += 1
    ind1 = ind1new
    ind2 = ind2new

    #TODO: add a threshold for the number of residues considered
    # (percentage of each structure's residues that were matched; unused yet)
    frac1 = len(ind1) * 100.0 / nres1
    frac2 = len(ind2) * 100.0 / nres2

    # get coordinates of specific residues
    coords1, ind1 = get_residues_coordinates(file1, ind1)
    coords2, ind2 = get_residues_coordinates(file2, ind2)

    new_coords1 = []
    new_coords2 = []
    for idx in range(len(coords1)):
        coords1_res = coords1[idx]
        coords2_res = coords2[idx]
        # check if there is consistency in atom names
        atomnames1 = [item[0] for item in coords1_res]
        atomnames2 = [item[0] for item in coords2_res]
        if set(atomnames1) != set(atomnames2):
            sys.exit(
                "Inconsistency found in residue %s in file %s and residue %s in file %s! Missing atom suspected..."
                % (ind1[idx], file1, ind2[idx], file2))
        # Pair atoms by name so both arrays list atoms in the same order.
        for an1, x1, y1, z1 in coords1_res:
            for an2, x2, y2, z2 in coords2_res:
                if an1 == an2:
                    new_coords1.append([x1, y1, z1])
                    new_coords2.append([x2, y2, z2])
                    break

    new_coords1 = np.array(new_coords1).T
    new_coords2 = np.array(new_coords2).T

    rotation = np.zeros(9)
    # Remember the first atom before the call and add its value after it.
    # NOTE(review): presumably pyqcprot recentres both arrays in place, which
    # would make trans1/trans2 the applied translations -- TODO confirm.
    trans1 = -new_coords1[:, 0]
    trans2 = -new_coords2[:, 0]
    pyqcprot.CalcRMSDRotationalMatrix(new_coords1, new_coords2, rotation, None)
    rotation = rotation.reshape((3, 3))
    trans1 += new_coords1[:, 0]
    trans2 += new_coords2[:, 0]
    return rotation, trans1, trans2
idB = record.id seqB = str(record.seq) bSeqs[idB] = seqB handle.close() handle = open(fileNameToAlign, "rU") for line in handle: line = line.rstrip() genesList = line.split("\t") geneA = genesList[0] geneB = genesList[1] seqTxtA = aSeqs[geneA].upper() seqTxtB = bSeqs[geneB].upper() alignment = nwalign.global_align(seqTxtA, seqTxtB, matrix='NUC.4.4',gap_open=-11,gap_extend=-1) lenOfSeqs = len(alignment[0]) matched = 0 for i in range(len(alignment[0])): currLetterA = alignment[0][i] currLetterB = alignment[1][i] print currLetterA print currLetterB if currLetterA == currLetterB: matched = matched + 1; print str(matched) print str(lenOfSeqs)
norm = float(d[i-1][j-1]/((len(s1)+len(s2))/2.0)) return norm # 1 if sys.argv[1] == "1": print "Pregunta 1..." sec_ini = generar_secuencia() # d, d_matrix, sec_mut = [], [], [] d, sec_mut = [], [] sec_mut = sec_ini tmp1 = "".join(sec_ini) for i in range(m+1): sec_mut = mutacion(sec_mut,i) tmp2 = "".join(sec_mut) align = nw.global_align(tmp1,tmp2) distance =distancia(align[0],align[1]) # matrix,distance =distancia(align[0],align[1]) d.append(distance) # d_matrix.append(matrix) tmp2 = "" p.show() plot1 = p.plot(m_list,d) title(u'Distancia vs Número de mutaciones') grid(True) p.axis([0,m,0,max(d)+1],0.01) xlabel(u'Número de mutaciones') ylabel('Distancia') p.show()