def main():
    """Concatenate every FASTA alignment found in a user-supplied directory.

    Prompts for the directory, folds each alignment file into a running
    concatenation through ``concat`` (always with the 'LG' model), writes the
    result with ``newAlignment.write('concat')`` and finally hands
    'concat_alignment.fasta' to ``makephy``.
    """
    output_name = 'concat_alignment.fasta'
    print('##############################################################')
    print(
        'This script must be in a directory with the folder called "Alignment" and the fastas you want to concatenate.'
    )
    print('The fastas must have the extension ".fasta" or it wont work.')
    print(
        'The output files are the concatenated seqs as a fasta and as a phylip for raxml-ing and a logfile.'
    )
    print(
        'The log file tells the length of the added sequence and the order in which they are added, so you can figure out your breakpoints. '
    )
    print('##############################################################')
    path2alignments = (input("path to the purged alignments: ")).strip() + "/"

    # The original opened this file for writing and never closed it.  Keep the
    # create/truncate side effect but release the handle immediately.
    with open(output_name, 'w'):
        pass

    newAlignment = Alignment('')
    # Sorted so that the concatenation order (recorded in the log file) is
    # reproducible; os.listdir() returns entries in arbitrary order.
    for f in sorted(os.listdir(path2alignments)):
        # Never feed the (just truncated) output file back in as an input.
        if f == output_name:
            continue
        # Accept ".fasta" as the banner promises, as well as the ".fas"
        # extension the original check required.
        if f.endswith(('.fas', '.fasta')):
            newAlignment = concat(newAlignment, path2alignments + f, 'LG')
    newAlignment.write('concat')
    time.sleep(20)
    makephy(output_name)
def concat(newAlignment, f, model):
    """Append the alignment stored in file *f* to *newAlignment*.

    An ``Alignment`` is built from *f*, given *model* and populated;
    ``fixFastas`` is then called on both alignments.  For every pair of
    records with equal ids the two sequences are joined into a new
    ``SeqRecord``.  One line is written to the log via ``writelog``.

    Returns a new ``Alignment`` holding the joined records, with ``idlist``,
    ``numseq`` and ``numchar`` filled in (``numchar`` is 0 when no ids match).
    """
    # Single-argument print() behaves the same on Python 2 and Python 3.
    print(f)
    a = Alignment(f)  # make an alignment instance from the file f
    a.setmodel(model)  # set the model for the alignment
    a.populate()
    fixFastas(newAlignment, a)
    tempAlignment = Alignment('')
    writelog(f + ':' + str(a.numchar) + ' ' + model)
    for seq in newAlignment.seqs:
        for aseq in a.seqs:
            if seq.id == aseq.id:
                newseq = SeqRecord(seq=seq.seq + aseq.seq, id=seq.id)
                tempAlignment.seqs.append(newseq)
                tempAlignment.idlist.append(newseq.id)
    tempAlignment.numseq = len(tempAlignment.seqs)
    # Guard against an empty result instead of failing with IndexError.
    if tempAlignment.seqs:
        tempAlignment.numchar = len(tempAlignment.seqs[0].seq)
    else:
        tempAlignment.numchar = 0
    return tempAlignment
def open_alignment(self, event):  # wxGlade: MeerK40t.<event_handler>
    """Close any previous "alignment" window, then create, show and register a new one."""
    window_key = "alignment"
    project.close_old_window(window_key)
    from Alignment import Alignment

    alignment_window = Alignment(None, wx.ID_ANY, "")
    alignment_window.set_project(project)
    alignment_window.Show()
    # Remember the window so a later call can close it again.
    project.windows[window_key] = alignment_window
def needleman_wunsch_matrix(self, seq1, seq2):
    """Fill the Needleman-Wunsch dynamic-programming tables for two sequences.

    Each cell's match score comes from a fresh ``Alignment`` run on the pair
    of elements ``seq1[i-1]`` and ``seq2[j-1]``; gaps cost -1.

    Returns ``(s, ptr)``: the score matrix and the pointer matrix, both of
    shape ``(len(seq1) + 1, len(seq2) + 1)``.
    """
    gap = -1  # indel penalty
    rows = len(seq1)
    cols = len(seq2)
    s = np.zeros((rows + 1, cols + 1))  # DP matrix
    ptr = np.zeros((rows + 1, cols + 1), dtype=int)  # matrix of pointers

    # Base case: leading gaps accumulate the indel penalty.
    s[1:, 0] = gap * np.arange(1, rows + 1)
    s[0, 1:] = gap * np.arange(1, cols + 1)

    # First row points LEFT and first column points UP (initial "-"s).
    ptr[0, 1:] = self.LEFT
    ptr[1:, 0] = self.UP

    for i in range(1, rows + 1):
        for j in range(1, cols + 1):
            # Diagonal move: score the two elements against each other.
            pair = Alignment()
            pair.needleman_wunsch(seq1[i - 1], seq2[j - 1])
            s[i, j] = s[i - 1, j - 1] + pair.score

            # Gap moves replace the current best only when strictly better.
            from_up = s[i - 1, j] + gap
            if from_up > s[i, j]:
                s[i, j] = from_up
                ptr[i, j] = self.UP
            from_left = s[i, j - 1] + gap
            if from_left > s[i, j]:
                s[i, j] = from_left
                ptr[i, j] = self.LEFT
    return s, ptr
def needleman_wunsch_matrix(self, seq1, seq2):
    """Build the Needleman-Wunsch score and traceback matrices.

    The substitution score of each cell is obtained by running a new
    ``Alignment`` on ``seq1[i-1]`` and ``seq2[j-1]``; the indel penalty is -1.

    Returns the tuple ``(scores, pointers)``.
    """
    indel = -1  # indel penalty
    len1, len2 = len(seq1), len(seq2)
    shape = (len1 + 1, len2 + 1)
    scores = np.zeros(shape)  # DP matrix
    pointers = np.zeros(shape, dtype=int)  # matrix of pointers

    # Scoring base case: first column and first row are pure gap runs.
    for row in range(1, len1 + 1):
        scores[row, 0] = indel * row
    for col in range(1, len2 + 1):
        scores[0, col] = indel * col

    # Traceback base case: first row tagged LEFT, first column tagged UP.
    pointers[0, 1:] = self.LEFT
    pointers[1:, 0] = self.UP

    for row in range(1, len1 + 1):
        for col in range(1, len2 + 1):
            cell = Alignment()
            cell.needleman_wunsch(seq1[row - 1], seq2[col - 1])
            # Start from the diagonal (match/mismatch) candidate.
            scores[row, col] = scores[row - 1, col - 1] + cell.score
            # Vertical gap.
            if scores[row - 1, col] + indel > scores[row, col]:
                scores[row, col] = scores[row - 1, col] + indel
                pointers[row, col] = self.UP
            # Horizontal gap.
            if scores[row, col - 1] + indel > scores[row, col]:
                scores[row, col] = scores[row, col - 1] + indel
                pointers[row, col] = self.LEFT
    return scores, pointers
def gen_alignment_list(msa_xml_file):
    """Build ``Alignment`` objects from the first record of a BLAST XML file.

    For every alignment of the first record parsed by ``NCBIXML.parse`` the
    HSPs are visited in order.  Processing of an alignment stops at the first
    HSP whose query range equals its hit range (treated as the native hit).

    Returns the list of ``Alignment`` objects created.
    """
    alignment_obj_list = []
    # Use a context manager so the XML file is closed even if parsing fails;
    # the original never closed the handle.
    with open(msa_xml_file, "r") as result:
        records = NCBIXML.parse(result)
        item = next(records)
        for alignment in tqdm(item.alignments, desc="Parsing MSA XML"):
            for hsp in alignment.hsps:
                hit_id = alignment.accession[0:4]
                chain_id = alignment.accession[5]
                query_range = (hsp.query_start, hsp.query_end)
                hit_range = (hsp.sbjct_start, hsp.sbjct_end)
                # dont want native
                if hit_range == query_range:
                    break
                alignment_obj_list.append(
                    Alignment(hit_id, chain_id, query_range, hit_range,
                              hsp.query, hsp.sbjct, hsp.match))
    return alignment_obj_list
def command_setup(args):
    """Print the chosen run parameters and initialise the module-level settings.

    Reads ``q``, ``i``, ``c``, ``m``, ``ip``, ``np``, ``k`` and ``s`` from
    *args*.  Sets the globals ``m``, ``indel_penalty``,
    ``non_assoc_vertex_penalty`` and ``score_method`` and appends ``args.k``
    placeholder alignments (score ``-inf``) to the global ``top_alignments``.
    """
    global m
    global indel_penalty
    global non_assoc_vertex_penalty
    global score_method
    global top_alignments

    # Every print below passes exactly one argument, so the output is the same
    # under Python 2 (print statement) and Python 3 (print function).
    print("Running GraphMatch...\n")
    print("Parameters chosen: ")
    print("Query Graph: %s\nReference Graph: %s\nCorrespondences File: %s" % (args.q, args.i, args.c))
    print("Number of indels allowed: %d\nIndel Penalty: %.2f\nNon-associated query vertex penalty: %.2f\nNumber of top results: %d" % (args.m, args.ip, args.np, args.k))
    if args.s:
        print("Scoring Method: negative log(E-value)")
    else:
        print("Scoring Method: Raw match score")
        print("WARNING: due to scores being positive or negative for this approach, no branch and bound available.")
    print("")  # blank separator line

    # Setting up the parameters
    m = args.m
    indel_penalty = args.ip
    non_assoc_vertex_penalty = args.np
    score_method = args.s

    # Instantiate the top alignments with low scoring Alignments (so they will
    # get replaced)
    for _ in range(0, args.k):
        top_alignments.append(Alignment(float("-inf"), [], [], [], []))
def realign(self, outbase):
    """Log the realignment step and construct ``Alignment`` for it.

    The input FASTQ and output SAM paths are derived from *outbase* by
    appending fixed suffixes.
    """
    input_fq = outbase + "_input-homolog.fq"
    output_sam = outbase + "_input-homolog_realigned.sam"
    message = "".join((
        "Aligning ",
        input_fq,
        " to the genome (output at ",
        output_sam,
        ")...",
    ))
    self.l.log(message)
    # Per the original docstring, constructing Alignment triggers the Bowtie2
    # realignment; the instance itself is not kept.
    Alignment(self.args, input_fq, output_sam)
def __init__(self):
    """Create the three tool tabs (index, align, viewer) of the notebook."""
    super(Tools, self).__init__()

    # (widget class, config.LOC key for the tab label), in tab order.
    tab_specs = (
        (Index, "tools_index"),
        (Alignment, "tools_align"),
        (Viewer, "tools_viewer"),
    )

    # Build each page: a VBox plus a gtk.Alignment wrapper holding the widget.
    tabs = []
    for widget_class, label_key in tab_specs:
        box = gtk.VBox(False, 5)
        wrapper = gtk.Alignment(0.5, 0.25, 0, 0)
        wrapper.add(widget_class())
        tabs.append((box, wrapper, label_key))

    for box, wrapper, _ in tabs:
        box.pack_start(wrapper)

    self.set_tab_pos(gtk.POS_TOP)

    for box, _, label_key in tabs:
        self.append_page(box)
        self.set_tab_label_text(box, config.LOC[label_key])
def alignment(self):
    """Make self into an alignment, and return it.

    If all the sequences are the same length and type, then self, a
    SequenceList, could be an Alignment.  This method generates an
    Alignment instance, runs the Alignment method
    checkLengthsAndTypes(), and returns the Alignment.

    If you feed p4 a fasta sequence, it makes a SequenceList object, and
    runs this method on it.  If it works then p4 puts the Alignment
    object in var.alignments, and if not it puts the SequenceList
    object in var.sequenceLists.

    It is possible that p4 might think that some short sequences are
    DNA when they are really protein.  In that case it will fail to
    make an alignment, because it will fail the types check.  So what
    you can do is something like this::

        sl = var.sequenceLists[0]
        for s in sl.sequences:
            s.dataType = 'protein'
        a = sl.alignment()

    """
    import copy
    from Alignment import Alignment

    a = Alignment()
    # The original assigned fName twice with the same value; once is enough.
    a.fName = self.fName
    # Deep copy so the new Alignment does not share Sequence objects with
    # self (the original comment notes that self will be deleted).
    a.sequences = copy.deepcopy(self.sequences)
    a.checkLengthsAndTypes()
    return a
def extract_bisents(file1, lang1, file2, lang2, alignment_file):
    """Return sentence pairs from two texts according to an alignment file.

    Both language codes must be two-character strings.  The texts are loaded
    with ``Text.from_file`` and paired via ``Alignment.as_pairs`` on their
    flat sentence lists.
    """
    for code in (lang1, lang2):
        assert isinstance(code, (unicode, str)) and len(code)==2
    first_text = Text.from_file(file1, lang1)
    second_text = Text.from_file(file2, lang2)
    sentence_alignment = Alignment.from_file(alignment_file)
    return sentence_alignment.as_pairs(
        first_text.as_sentences_flat(),
        second_text.as_sentences_flat(),
    )
def get_alignment(self, langs, backend=None):
    """Return an ``Alignment`` whose columns follow the order of *langs*.

    Only the first two characters of each entry in *langs* are used to
    identify the language.  Depending on how many distinct languages result:

    * one: a straight alignment is created from the sentence count;
    * two: an alignment file named ``<l1>-<l2>.<backend>`` is looked up via
      ``self._p``, trying every candidate backend and then the reversed
      language order;
    * otherwise (assumed to be three): the 'pl'/'cu'/'el' pairwise
      alignments are fetched recursively and merged.

    Raises ``IOError`` when no file can be loaded for a two-language request.
    """
    assert len(langs) >= 2
    assert not backend or backend in possible_backends
    real_langs = list(set(lang[:2] for lang in langs))
    if len(real_langs) == 1:
        # NOTE(review): ``basename`` is not a parameter or local here; it must
        # come from an enclosing/module scope -- confirm it is defined.
        text_len = len(fetch_sentences(basename, real_langs[0]))
        return Alignment.create_straight(text_len, len(langs))
    elif len(real_langs) == 2:
        a = None
        # Two passes: languages in their current order, then reversed.
        for i in range(2):
            # A specific backend restricts the search to that one backend.
            for b in ([backend] if backend else possible_backends):
                try:
                    langs_string = '-'.join(str(l) for l in real_langs)
                    a = Alignment.from_file(self._p(langs_string + '.' + b))
                    break
                except IOError:
                    continue
            if a:
                break
            # Not found in this order; flip it.  After two failed passes the
            # list is back in its starting order.
            real_langs.reverse()
        if not a:
            raise IOError
    else: # len(real_langs) == 3 :(
        a1 = self.get_alignment(['pl', 'cu'], backend).as_ladder()
        a2 = self.get_alignment(['cu', 'el'], backend).as_ladder()
        a3 = self.get_alignment(['pl', 'el'], backend).as_ladder()
        # a3 = [(b, a) for (a, b) in a3] # reversed
        a = merge_3_alignments(a1, a2, a3)
        real_langs = ['pl', 'cu', 'el'] # needed later
    columns = _transpose(a.data)
    # Map each language code to its column of the alignment data.
    columns_map = { real_langs[i] : columns[i] for i in range(len(real_langs)) }
    # common part for 2 and 3
    chosen_columns = [columns_map[lang[:2]] for lang in langs]
    # The column at index 2 is always appended after the per-language columns.
    # NOTE(review): its meaning is not visible here -- confirm against the
    # Alignment data layout.
    chosen_columns.append(columns[2])
    return Alignment(_transpose(chosen_columns))
def calcUnconstrainedLogLikelihood1(self):
    """Calculate likelihood under the multinomial model.

    This calculates the unconstrained (multinomial) log like
    without regard to character partitions.  The result is placed
    in the data variable unconstrainedLogLikelihood.  If there is
    more than one partition, it makes a new temporary alignment
    and puts all the sequences in one part in that alignment.  So
    it ultimately only works on one data partition.  If there is
    more than one alignment, there is possibly more than one
    datatype, and so this method will refuse to do it.  Note that
    the unconstrained log like of the combined data is not the sum
    of the unconstrained log likes of the separate partitions.

    See also calcUnconstrainedLogLikelihood2

    Raises Glitch if there is more than one alignment.
    """
    if len(self.alignments) > 1:
        gm = ["Data.calcUnconstrainedLogLikelihood()"]
        gm.append("This method is not implemented for more than one alignment.")
        # Call form of raise: equivalent to the Python 2 "raise Glitch, gm"
        # and also valid on Python 3.
        raise Glitch(gm)
    if self.nParts == 1:  # no problem
        self.unconstrainedLogLikelihood = pf.getUnconstrainedLogLike(self.parts[0].cPart)
    else:
        import copy

        a = self.alignments[0]
        # Temporary single-part alignment carrying copies of the sequences.
        newAlig = Alignment()
        newAlig.dataType = a.dataType
        newAlig.symbols = a.symbols
        newAlig.dim = a.dim
        newAlig.equates = a.equates
        newAlig.taxNames = a.taxNames
        for s in a.sequences:
            newAlig.sequences.append(copy.deepcopy(s))
        newAlig.checkLengthsAndTypes()
        newAlig._initParts()
        self.unconstrainedLogLikelihood = pf.getUnconstrainedLogLike(newAlig.parts[0].cPart)
        del newAlig
def fetch_alignment(basename, langs, backend='hunalign'):
    """Load the alignment for *langs* from files under *basename*.

    Languages are identified by the first two characters of each entry.
    One distinct language yields a straight alignment, two load
    ``<basename>/<l1>-<l2>.<backend>`` (retrying with the languages swapped on
    ``IOError``), and any other count merges the pl-cu, cu-el and pl-el files.
    """
    assert langs
    real_langs = list(set(lang[:2] for lang in langs))
    distinct = len(real_langs)

    if distinct == 1:
        text_len = len(fetch_sentences(basename, real_langs[0]))
        return Alignment.create_straight(text_len, len(langs))

    if distinct == 2:
        def load_pair():
            # Reads real_langs at call time, so it sees the reversed order.
            return Alignment.from_file(
                "%s/%s-%s.%s" % (basename, real_langs[0], real_langs[1], backend))

        try:
            a = load_pair()
        except IOError:
            real_langs.reverse()
            a = load_pair()
    else:  # len(real_langs) == 3 :(
        ladders = [
            Alignment.from_file('%s/%s.%s' % (basename, pair, backend)).as_ladder()
            for pair in ('pl-cu', 'cu-el', 'pl-el')
        ]
        # The pl-el ladder is merged with each pair swapped (reversed).
        swapped = [(second, first) for (first, second) in ladders[2]]
        a = merge_3_alignments(ladders[0], ladders[1], swapped)
        real_langs = ['pl', 'cu', 'el']  # needed later

    # common part for 2 and 3
    columns = _transpose(a.data)
    columns_map = {code: columns[pos] for pos, code in enumerate(real_langs)}
    chosen_columns = [columns_map[lang[:2]] for lang in langs] + [columns[2]]
    return Alignment(_transpose(chosen_columns))
def extend_hit(self,hits):
    """Extend each chained hit into a full alignment and return the best ones.

    Each element of *hits* is a sequence of segments indexed as
    ``seg[0]``, ``seg[1]``, ``seg[2]``.
    NOTE(review): from the way they are used below these look like
    (query offset, database offset, segment length) -- confirm with the caller.

    For every hit the alignment is assembled from: a head extension, each
    exact segment scored with ``self.matrix``, the gap between consecutive
    segments aligned with ``self.needle``, the last segment, and a tail
    extension.  Alignments whose ``E`` is below ``self.evalue_limit`` are
    kept, sorted by descending score and cut to ``self.LIM`` entries.
    """
    alignments = []
    idx = 0  # counts processed hits; only used by the commented-out debug prints
    db_length = self.seq_pos[-1]
    #print(db_length)
    for hit in hits:
        # Locate the sequence / species whose start position precedes the hit.
        name = self.seq_names[bisect_right(self.seq_pos,hit[0][1])-1]
        species = self.species_names[bisect_right(self.species_idx,hit[0][1])-1]
        # Extend to the left of the first segment.
        score, align_s, align_q = self.extend_head(self.query[:hit[0][0]], hit[0][1])
        #print(idx, score, align_s, align_q)
        for i in range(len(hit)-1):
            # Segment i itself: score position by position via the matrix.
            tmp_s = list(self.extract_seq(hit[i][1], hit[i][2]))
            tmp_q = list(self.query[hit[i][0]:hit[i][0]+hit[i][2]])
            tmp_score = 0
            for j in range(hit[i][2]):
                tmp_score += self.matrix[tmp_s[j]+tmp_q[j]]
            #print(idx, tmp_score, tmp_s, tmp_q)
            align_s += tmp_s
            align_q += tmp_q
            score += tmp_score
            # Region between segment i and segment i+1: align with needle.
            tmp_s = self.extract_seq(hit[i][1]+hit[i][2], hit[i+1][1]-hit[i][1]-hit[i][2])
            tmp_q = self.query[hit[i][0]+hit[i][2]:hit[i+1][0]]
            # NOTE(review): needle is called with (subject, query) but its
            # result is unpacked as (score, query, subject) -- confirm the
            # return order of self.needle.
            tmp_score, tmp_q, tmp_s = self.needle(tmp_s,tmp_q)
            #print(idx, tmp_score, tmp_s, tmp_q)
            align_s += tmp_s
            align_q += tmp_q
            score += tmp_score
        # Last segment, scored the same way as the others.
        tmp_s = list(self.extract_seq(hit[-1][1], hit[-1][2]))
        tmp_q = list(self.query[hit[-1][0]:hit[-1][0]+hit[-1][2]])
        tmp_score = 0
        for j in range(hit[-1][2]):
            tmp_score += self.matrix[tmp_s[j]+tmp_q[j]]
        align_s += tmp_s
        align_q += tmp_q
        score += tmp_score
        # Extend to the right of the last segment.
        tmp_score, tmp_s, tmp_q = self.extend_tail(self.query[hit[-1][0]+hit[-1][2]:], hit[-1][1]+hit[-1][2])
        align_s += tmp_s
        align_q += tmp_q
        score += tmp_score
        idx += 1
        alignment = Alignment(species, name, score, align_q, align_s, self.matrix, len(self.query), db_length)
        # Keep only alignments below the E-value limit.
        if alignment.E < self.evalue_limit:
            alignments.append(alignment)
    # Best score first, then truncate to the configured limit.
    alignments.sort(key=lambda alignments: alignments.score, reverse=True)
    alignments = alignments[:self.LIM]
    return alignments
def ScoreAlignment(W_prime, G, G0):
    """Score the candidate correspondence set *W_prime* and wrap it in an Alignment.

    Every node of *G0* without a corresponding vertex costs
    ``non_assoc_vertex_penalty``.  Matched nodes contribute
    ``-log(corresp.score)`` (score floored at 1e-200) when ``score_method`` is
    truthy, otherwise ``corresp.score`` is subtracted.  For each pair of
    matched nodes joined by an edge in *G0*, the shortest path in *G* between
    their corresponding vertices is found; its interior vertices count as
    indels, each costing ``indel_penalty``.

    Returns ``Alignment(score, copy of W_prime, indel vertices, query edges,
    reference paths)``.
    """
    floor = 1e-200
    total = 0
    matched = []
    inserted = []
    q_edges = []
    r_paths = []
    use_neg_log = True if score_method else False

    for vertex in G0.node:
        hit = getCorrespondingVertex(vertex, W_prime)
        if hit is None:
            total = total - non_assoc_vertex_penalty
            continue
        matched.append(vertex)
        if use_neg_log:
            # Use -log(corresp score) as match value, flooring tiny scores.
            clamped = floor if hit.score <= floor else hit.score
            total += -1.0*log(clamped)
        else:
            # Raw correspondence score.
            total -= hit.score

    # Indel penalties
    for pos, vi in enumerate(matched):
        for vk in matched[pos + 1:]:
            if not G0.has_edge(vi, vk):
                continue
            q_edges.append((vi, vk))
            vij = getCorrespondingVertex(vi, W_prime)
            vkl = getCorrespondingVertex(vk, W_prime)
            path_G = nx.shortest_path(G, vij.name, vkl.name)
            interior_count = len(path_G) - 2
            # Interior vertices of the reference path are the indels.
            inserted.extend(path_G[1:-1])
            total = total - interior_count*indel_penalty
            r_paths.append(path_G)

    return Alignment(total, copy.copy(W_prime), inserted, q_edges, r_paths)
def run(self):
    """Detect landmarks and align every image in each configured directory.

    After each directory a ``count/total`` progress line is printed, where
    ``count`` is the zero-based index of the directory just processed.  A
    blank line is printed at the end.
    """
    total_dirs = len(self.img_dir_names)
    for done, dir_name in enumerate(self.img_dir_names):
        # NOTE(review): the listing uses self.img_path while the processing
        # below uses the module-level IMG_PATH -- confirm they agree.
        listing_dir = os.path.join(self.img_path, dir_name)
        for file_name in File.get_img_file_names(listing_dir):
            detector = LandmarksDetector(
                land_path=LAND_PATH,
                img_path=os.path.join(IMG_PATH, dir_name),
                face_pose_predictor=self.face_pose_predictor,
                face_detector=self.face_detector,
                file_name=file_name,
            )
            detector.run(True)
            aligner = Alignment(
                land_path=LAND_PATH,
                img_path=os.path.join(IMG_PATH, dir_name),
                deformed_img_path=os.path.join(DEFORMED_IMG_PATH, dir_name),
                ideal_mask=self.ideal_mask,
                file_name=file_name,
            )
            aligner.run()
        print('{}/{}'.format(done, total_dirs))
    print()
def __init__(self,readname,chr,start,end,strand,score,readcount,readsequence,cigar,qualstring):
    """Initialise the base ``Alignment`` fields, then store CIGAR and quality strings."""
    # NOTE(review): the ``score`` argument is not forwarded -- the base class
    # receives ``readcount`` as its score.  Confirm this is intentional.
    Alignment.__init__(
        self,
        readname,
        chr,
        start,
        end,
        strand,
        score=readcount,
        readcount=readcount,
        readsequence=readsequence
    )
    self.cigar = cigar
    self.qual = qualstring
from Alignment import Alignment

# Align the two example sequences once, then show the alignment and its
# distance.  Single-argument print() calls behave the same on Python 2 and 3.
a = Alignment("GCTGATATAGCT", "GGGTGATTAGCT")
print(a)
print(a.getDistance())
def calcUnconstrainedLogLikelihood1(self):
    """Calculate likelihood under the multinomial model.

    This calculates the unconstrained (multinomial) log like
    without regard to character partitions.  The result is placed
    in the data variable unconstrainedLogLikelihood.  If there is
    more than one partition, it makes a new temporary alignment
    and puts all the sequences in one part in that alignment.  So
    it ultimately only works on one data partition.  If there is
    more than one alignment, there is possibly more than one
    datatype, and so this method will refuse to do it.  Note that
    the unconstrained log like of the combined data is not the sum
    of the unconstrained log likes of the separate partitions.

    See also calcUnconstrainedLogLikelihood2

    Raises Glitch if there is more than one alignment.
    """
    if len(self.alignments) > 1:
        gm = ["Data.calcUnconstrainedLogLikelihood()"]
        gm.append(
            "This method is not implemented for more than one alignment.")
        # "raise Glitch, gm" is Python 2-only syntax; the call form is
        # equivalent there and also valid on Python 3.
        raise Glitch(gm)

    if self.nParts == 1:  # no problem
        self.unconstrainedLogLikelihood = pf.getUnconstrainedLogLike(
            self.parts[0].cPart)
        return

    import copy

    a = self.alignments[0]
    # Build a temporary alignment with a single part holding copies of all
    # sequences, and compute the likelihood on that.
    newAlig = Alignment()
    newAlig.dataType = a.dataType
    newAlig.symbols = a.symbols
    newAlig.dim = a.dim
    newAlig.equates = a.equates
    newAlig.taxNames = a.taxNames
    for s in a.sequences:
        newAlig.sequences.append(copy.deepcopy(s))
    newAlig.checkLengthsAndTypes()
    newAlig._initParts()
    self.unconstrainedLogLikelihood = pf.getUnconstrainedLogLike(
        newAlig.parts[0].cPart)
    del newAlig
def getTemplates(self):
    """BLAST ``self.seq`` against PDB and record templates and alignments.

    For every BLAST alignment a ``Template`` is stored in ``self.templates``
    and its FASTA in ``self.fastas``; for every HSP an ``Alignment`` is built,
    padded to span the whole target sequence and appended to both the
    template and ``self.alignments``.  FASTA and JSON alignment files are
    written to ``self.fastasfolder`` / ``self.alignmentsfolder`` unless they
    already exist.

    Returns ``self.templates.keys()``.
    """
    if self.debug:
        print(self.seq)
    result_handle = NCBIWWW.qblast("blastp", "pdb", str(self.seq), expect=0.01)
    blast_records = NCBIXML.parse(result_handle)
    if self.debug:
        print("BLAST Request Finished")
        print()

    for record in blast_records:
        for alignment in record.alignments:
            template_id = alignment.accession
            fasta = self.getFastaFromId(template_id)
            title = alignment.title
            length = alignment.length
            self.templates[template_id] = Template(
                id=template_id, fasta=fasta, sequence=title, length=length,
                alignments=[])
            self.fastas[template_id] = fasta
            for hsp in alignment.hsps:
                a = Alignment(id=template_id,
                              title=title,
                              expect=hsp.expect,
                              score=hsp.score,
                              identities=hsp.identities,
                              similarity=(100 * hsp.identities / len(self.seq)),
                              target=hsp.query,
                              targetstart=hsp.query_start,
                              match=hsp.match,
                              template=hsp.sbjct,
                              templatestart=hsp.sbjct_start,
                              length=length)
                # Pad the aligned target with the unaligned ends of self.seq.
                targetfront = str(self.seq[:a.targetstart - 1])
                targetend = str(self.seq[(a.targetstart + a.length):])
                a.target = ''.join(targetfront) + a.target + ''.join(targetend)
                a.length = len(a.target)
                # Pad the template with gaps.
                # NOTE(review): a.length was just reassigned to the padded
                # target length, so the trailing pad below is computed from
                # the new value -- confirm this is the intended arithmetic.
                templatefront = ['-'] * (a.targetstart - 1)
                templateend = ['-'] * (len(self.seq) - (a.targetstart + a.length))
                a.template = ''.join(templatefront) + a.template + ''.join(templateend)
                self.templates[template_id].alignments.append(a)
                self.alignments.append(a)

    # Context managers make sure the files are closed even if a write fails.
    for fasta_id, fasta in self.fastas.items():
        fname = '%s/%s.fasta' % (self.fastasfolder, fasta_id)
        if not os.path.exists(fname):
            with open(fname, 'w') as f:
                SeqIO.write(fasta, f, 'fasta')

    for i, a in enumerate(self.alignments):
        fname = '%s/%s-%s.alignment' % (self.alignmentsfolder, a.id, str(i))
        if not os.path.exists(fname):
            with open(fname, 'w') as f:
                json.dump(a.toJSON(), f)

    return self.templates.keys()
for linear gap cost use -l or -linear\n\ for affine gap cost use -a or -affine\n\ - b, a: parameters for gap cost function\n\ b -> constant gap cost or slope when performing linear/affine gap constant (extension penalty)\n\ a -> instersect for affine gap cost (opening gap penalty)\n\ - -o: output alignment. if missing then outputs optimal score") arguments = GetArguments(sys.argv) # parsing arguments # print(arguments.seq2) # print(arguments.score_matrix) sequences = [arguments.seq1, arguments.seq2] substitution_matrix = arguments.score_matrix gap_params = arguments.gap_params alignmentType = arguments.alignment_type my_alignment = Alignment(sequences, substitution_matrix, alignmentType, gap_params) my_alignment.align() if arguments.output: print(">seq1") print(my_alignment.a) print(">seq2") print(my_alignment.b) else: print(my_alignment.score) # # Score matrix print # for row in range(len(T)): # print(T[row])
#!/usr/bin/env python3
# Usage: ./msa_exact.py score_matrix 5 test.fa
# Requires Alignment.py in the directory
from Alignment import Alignment
from Alignment import GetArguments
import sys

# Parse the command line and set up the aligner.
arguments = GetArguments(sys.argv)
test = Alignment(arguments.seqs, arguments.score_matrix, arguments.gapcost)

# Compute the score first, then backtrack to recover the alignment rows.
score = test.sp_exact_3()
alignm = test.backtrack_msa_exact()

# Print every aligned row under its header, FASTA style.
for i, row in enumerate(alignm):
    print(">", arguments.heads[i])
    print(test.num_to_sequence(row), "\n")
def main(argv):
    """Command-line driver for AnnotateTreeCmd (Python 2 syntax).

    ``argv`` is expected to hold either ``-t`` (run ``conduct_tests``) or the
    six arguments seqnumfile, seqfile, treefile, cdrfile, tag and wd.  The
    inputs are validated and parsed, ``Dnaml.run_dnaml`` is run, and on
    success the CDR analysis, rendered trees and a sequence logo are written
    into the working directory.  Every failure path prints a message and
    exits with status 0.
    """
    print "AnnotateTreeCmd v1.0"
    if len(argv) == 2 and argv[1] == '-t':
        conduct_tests()
        exit(0)
    elif len(argv) != 7:
        print 'usage python AnnotateTreeCmd.py seqnumfile seqfile treefile cdrfile tag wd.'
        sys.exit(0)

    # The first three arguments are input files.
    for file in argv[1:4]:
        check_file(file)

    (seqnumfile, seqfile, treefile, cdrfile, tag, wdir) = argv[1:7]

    # An empty cdrfile argument means "no CDR file".
    if len(cdrfile) > 0:
        check_file(cdrfile)
    else:
        cdrfile = None

    try:
        if not os.path.exists(wdir):
            os.makedirs(wdir)
    except:
        print "Error creating directory %s." % wdir
        sys.exit(0)

    try:
        msa = Alignment()
        msa.read_nt(seqfile)

        # Check that the sequence comprises a valid set of codons
        for seq in msa:
            if '*' in seq:
                print "Stop codon found in sequence %s." % seq.id
                # NOTE: the bare except below also catches the SystemExit
                # raised here, so the "Error parsing" message is printed too.
                sys.exit(0)
    except:
        print "Error parsing %s: %s." % (seqfile, sys.exc_info()[1])
        sys.exit(0)

    try:
        seq_pos = msa.read_position_numbers(seqnumfile)
    except:
        print "Error parsing %s: %s." % (seqnumfile, sys.exc_info()[1])
        sys.exit(0)

    # Constructed here only to surface CDR-file errors before dnaml runs;
    # acdr is rebuilt from the dnaml output further down.
    if cdrfile is not None:
        try:
            acdr = AnalyseCDR(msa, file_name=cdrfile)
        except:
            print "Error parsing %s: %s." % (cdrfile, sys.exc_info()[1])
            sys.exit(0)

    # Try FASTA first, then fall back to PHYLIP.
    try:
        seq_align = AlignIO.read(seqfile, "fasta")
    except:
        try:
            seq_align = AlignIO.read(seqfile, "phylip")
        except:
            print "Error parsing %s: %s." % (seqfile, sys.exc_info()[1])
            sys.exit(0)

    try:
        tree = Phylo.read(treefile, "newick")
    except:
        print "Error parsing %s: %s." % (treefile, sys.exc_info()[1])
        sys.exit(0)

    dnaml = Dnaml()
    # ``report`` is not defined in this function; it is taken from an outer scope.
    int_aas = dnaml.run_dnaml(seq_align, tree, seq_pos, cdrfile, wdir, report, tag)

    if int_aas is not None:
        # CDR analysis is best effort: a failure only produces a warning.
        try:
            if cdrfile is not None:
                acdr = AnalyseCDR(int_aas, file_name=cdrfile)
                cdr_output = acdr.analyse()
                fo = open(wdir + "/" + tag + "cdr_analysis.html", "w")
                fo.write(cdr_output)
                fo.close()
        except:
            print "Warning: CDRs were not analysed: " + str(sys.exc_info()[1])

        # Render each tree file as PNG and SVG, collecting garbage between renders.
        try:
            gc.collect()
            RenderTree.render_annotate(wdir + "/" + tag + "annotated_treefile.new", wdir + "/" + tag + "annotated_treefile.png")
            gc.collect()
            RenderTree.render_annotate(wdir + "/" + tag + "annotated_treefile.new", wdir + "/" + tag + "annotated_treefile.svg")
            gc.collect()
            if cdrfile is not None:
                RenderTree.render_annotate(wdir + "/" + tag + "annotated_treefile_sum.new", wdir + "/" + tag + "annotated_treefile_sum.png")
                gc.collect()
                RenderTree.render_annotate(wdir + "/" + tag + "annotated_treefile_sum.new", wdir + "/" + tag + "annotated_treefile_sum.svg")
                gc.collect()
            RenderTree.render_annotate(wdir + "/" + tag + "annotated_treefile_tot.new", wdir + "/" + tag + "annotated_treefile_tot.png")
            gc.collect()
            RenderTree.render_annotate(wdir + "/" + tag + "annotated_treefile_tot.new", wdir + "/" + tag + "annotated_treefile_tot.svg")
            gc.collect()
            RenderTree.render_annotate(wdir + "/" + tag + "intermediates_treefile.new", wdir + "/" + tag + "intermediates_treefile.png")
            gc.collect()
            RenderTree.render_annotate(wdir + "/" + tag + "intermediates_treefile.new", wdir + "/" + tag + "intermediates_treefile.svg")
            gc.collect()
        except:
            print "Error rendering trees: " + str(sys.exc_info()[1])

        # Build the logo input: skip the first record and all inferred
        # "node_" records.
        first = True
        orig_recs = []
        for rec in SeqIO.parse(wdir + "/" + tag + "aa_alignment.fa", "fasta"):
            if not first and "node_" not in rec.id:
                orig_recs.append(rec)
            first = False

        # logo_alignment_file is assigned but the path is rebuilt inline below.
        logo_alignment_file = wdir + "/" + tag + "alignment_for_logo.fa"
        SeqIO.write(orig_recs, wdir + "/" + tag + "alignment_for_logo.fa", "fasta")

        # Run seqlogo, falling back to seqlogo.pl when the return code is 1;
        # tool output goes to the status file.
        with open(wdir + "/" + tag + "weblogo_status.txt", "w") as fo:
            retcode = subprocess.call("seqlogo -f %salignment_for_logo.fa -F PNG -o aa_logo -h 2 -w 20 -acS" % tag, cwd=wdir, shell=True, stdout=fo, stderr=subprocess.STDOUT)
            if retcode == 1:
                fo.write("Trying seqlogo.pl instead.\n")
                retcode = subprocess.call("seqlogo.pl -f %salignment_for_logo.fa -F PNG -o aa_logo -h 2 -w 20 -acS" % tag, cwd=wdir, shell=True, stdout=fo, stderr=subprocess.STDOUT)
                if retcode == 1:
                    print "Weblogo not installed: logo plot will not be generated."
def newFunction(*other, **args):
    """Call ``f`` after coercing the first positional argument to an Alignment."""
    head = other[0]
    if not isinstance(head, Alignment):
        # Rebuild the positional tuple with the first element wrapped.
        other = (Alignment(head),) + tuple(other[1:])
    return f(*other, **args)
def run_dnaml(self, seq_align, ptree, seqpattern, cdrfile, wdir, rep, tag=""):
    """Run dnaml and write annotated trees and alignments into *wdir*.

    Arguments are:
    seq_align: the input nt sequences (MultipleSequenceAlignment)
    ptree: phylogenetic tree (Bio.Phylo)
    seqpattern: A list of sequence number directives, in the format accepted by Alignment.set_position_numbers
    wdir: the name of a directory that run_paml should use. This must exist already.
    rep: a function that takes a string as an argument. This will be called should an error or warning be
         necessary (may be called multiple times in one invocation).
    tag: an optional tag to prefix filenames with

    Sequences in seq_align must be the same length, must start on a codon boundary, and be an integral number
    of codons in length. The first sequence must be the ancestral sequence or outgroup. Exactly the same sequence
    names must occur in the alignment and the tree. Sequence name format is pretty flexible (sequences are
    mapped to names acceptable to PAML and remapped after PAML has run).

    Returns the amino-acid Alignment (inputs plus inferred intermediates), or
    None after calling *rep* when dnaml produced no usable output.  Raises
    ValueError when the trees cannot be rooted.
    """
    root_id = seq_align[0].id  # not used again in this method

    # Translate clade names to something safe
    # (note: this renames the records of seq_align in place)
    namedict = {}
    serial = 1
    for seq in seq_align:
        namedict[seq.id] = "N%09d" % serial
        seq.id = namedict[seq.id]
        serial += 1

    # qtree is a renamed working copy; ptree keeps the caller's names.
    qtree = copy.deepcopy(ptree)
    for clade in qtree.get_terminals():
        if clade.name and clade.name in namedict:
            clade.name = namedict[clade.name]

    # Root the tree on the first record
    first = "N%09d" % 1

    try:
        qtree.root_with_outgroup(
            qtree.find_clades(name=re.escape(first)).next())
    except:
        raise ValueError("Error: root sequence not found in tree.")

    try:
        inv_dict = {v: k for k, v in namedict.items()}
        # NOTE(review): unlike the call above, the find_clades() result is
        # passed here without .next() -- confirm this is intended.
        ptree.root_with_outgroup(
            ptree.find_clades(name=re.escape(inv_dict[first])))
        Phylo.write(ptree,
                    wdir + "/" + "input_treefile.new",
                    "newick",
                    plain=False)
    except:
        raise ValueError(
            "Error rooting trees: check for corrupt tree file or duplicated sequences."
        )

    # Write the sequences, in PHYLIP format (real PHYLIP format, as used by PHYLIP!)
    with open(wdir + "/" + "infile", "w") as f:
        f.write(" %d %d\n" % (len(seq_align), len(seq_align[0])))
        for seq in seq_align:
            f.write("%10s%s\n" % (seq.id, seq.seq.upper()))

    # Write the tree file
    Phylo.write(qtree, wdir + "/" + "intree", "newick")

    # Remove output of any previous run so stale files are not mistaken for
    # fresh results.
    if os.path.exists(wdir + "/" + "outfile"):
        os.remove(wdir + "/" + "outfile")
    if os.path.exists(wdir + "/" + "outtree"):
        os.remove(wdir + "/" + "outtree")

    # The path to the module may reference either a .py or a .pyc file...
    ctlfile = os.path.abspath(__file__).replace(".pyc", ".ctl") if ".pyc" in os.path.abspath(__file__) \
        else os.path.abspath(__file__).replace(".py", ".ctl")

    # Check for dnaml in the current directory
    dnamlfile = os.path.abspath(__file__).replace("Dnaml.pyc", "dnaml") if ".pyc" in os.path.abspath(__file__) \
        else os.path.abspath(__file__).replace("Dnaml.py", "dnaml")
    if not os.path.exists(dnamlfile):
        dnamlfile = "dnaml"  # must be on the path somewhere

    # The control file is fed to dnaml on stdin; its stdout goes to dnaml.txt.
    with open(wdir + "/" + "dnaml.txt", "w") as o, open(ctlfile, "r") as i:
        subprocess.call(dnamlfile, cwd=wdir, stdin=i, stdout=o)

    if not os.path.isfile(wdir + "/" + "outfile"):
        rep("No output returned by dnaml: please check the logs for the issue."
            )
        return None

    if os.path.isfile(wdir + "/" + "outfile.txt"):
        os.remove(wdir + "/" + "outfile.txt")
    os.rename(wdir + "/" + "outfile", wdir + "/" + "outfile.txt")

    # intseqs is used as a seekable file-like object below.
    intseqs = self.__parse_outfile(wdir + "/" + "outfile.txt")
    if not intseqs:
        rep("Unexpected output returned by dnaml: please check the logs for the issue."
            )
        return None

    # Custom sort function to put the root record first, then others supplied by the user, then intermediate nodes
    def key_ids(rec):
        if rec.id == "N%09d" % 1:
            return 'a__' + rec.id
        elif 'node_' in rec.id:
            return 'z__' + "%04d" % (int)(rec.id.split("_")[1])
        else:
            return 'l__' + rec.id

    labelled_tree = Phylo.read(wdir + "/" + "outtree", "newick")

    # Read the same data twice: once as nucleotides, once via read_nt().
    intseqs.seek(0)
    int_seqs = Alignment(file_name=intseqs, format="fasta")
    int_seqs.sort(key=key_ids)
    intseqs.seek(0)
    int_aas = Alignment()
    int_aas.read_nt(intseqs, "fasta")
    int_aas.sort(key=key_ids)
    int_aas.set_position_numbers(position_numbers=seqpattern)

    # Put back the original names in all our collections
    for seq in int_seqs:
        if seq.id in inv_dict:
            seq.id = inv_dict[seq.id]
            seq.name = ""
            seq.description = ""

    for seq in int_aas:
        if seq.id in inv_dict:
            seq.id = inv_dict[seq.id]
            seq.name = ""
            seq.description = ""

    nodeid = 1
    for clade in labelled_tree.find_clades(order="preorder"):
        if clade.name is None:
            clade.name = "node_%d" % nodeid
            # This relies on our traversal using the same order as dnaml
            nodeid += 1
        else:
            if clade.name in inv_dict:
                clade.name = inv_dict[clade.name]

    # Now we need to map the labelling of the nodes in the labelled tree to the nodes in the original tree
    self.__map_names(ptree, labelled_tree)
    Phylo.write(ptree,
                wdir + "/" + tag + "intermediates_treefile.new",
                "newick",
                plain=False)

    # Keep only the records whose id names a clade of the (relabelled) tree.
    cladenames = []
    new_int_aas = Alignment()
    for clade in ptree.find_clades():
        if clade.name is not None:
            cladenames.append(clade.name)

    for rec in int_aas:
        if rec.id in cladenames:
            new_int_aas.append(rec)

    int_aas = new_int_aas
    int_aas.set_position_numbers(position_numbers=seqpattern)

    # Snapshot of the tree before labels are rewritten; each annotated
    # variant below starts from a copy of it.
    copy_tree = copy.deepcopy(ptree)

    # Calculate AA diffs between each node and its parent, and write to the tree
    labels = {}

    def diffkey(diff):
        # Sort key: the diff string without its first and last characters.
        return int_aas.index_of(diff[1:-1])

    for clade in ptree.find_clades():
        if clade.name is not None:
            parent = self.__get_parent(ptree, clade)
            if parent is None:
                # No parent found: for a clade directly under the root, use
                # the root sequence's clade as the parent instead.
                path = ptree.get_path(clade)
                if len(path) == 1 and clade.name != first:
                    fname = inv_dict[first]
                    parent = ptree.find_clades(
                        name=re.escape(fname)).next()

            if parent is not None and parent.name is not None:
                diffs = list(int_aas.seqdiff(clade.name, parent.name))
                diffs.sort(key=diffkey)
                diffs = "+".join(diffs)
                if "node_" in clade.name:
                    labels[clade.name] = diffs
                else:
                    labels[clade.name] = str(clade.name) + " " + diffs

    # Renaming happens in a second pass so the lookups above see the
    # original clade names.
    for clade in ptree.find_clades():
        if clade.name is not None and clade.name in labels:
            clade.name = labels[clade.name]

    Phylo.write(ptree,
                wdir + "/" + tag + "annotated_treefile.new",
                "newick",
                plain=False)

    # Now write a tree with summary CDR/FR total changes
    if cdrfile is not None:
        ptree = copy.deepcopy(copy_tree)
        acdr = AnalyseCDR(int_aas, file_name=cdrfile)
        labels = {}
        for clade in ptree.find_clades():
            if clade.name is not None:
                parent = self.__get_parent(ptree, clade)
                if parent is None:
                    path = ptree.get_path(clade)
                    if len(path) == 1 and clade.name != first:
                        fname = inv_dict[first]
                        parent = ptree.find_clades(
                            name=re.escape(fname)).next()

                if parent is not None and parent.name is not None:
                    diffs = acdr.category_diff(clade.name, parent.name)
                    if "node_" in clade.name:
                        labels[clade.name] = diffs
                    else:
                        labels[clade.name] = str(clade.name) + " " + diffs

        for clade in ptree.find_clades():
            if clade.name is not None and clade.name in labels:
                clade.name = labels[clade.name]

        Phylo.write(ptree,
                    wdir + "/" + tag + "annotated_treefile_sum.new",
                    "newick",
                    plain=False)

    # And write a tree with counts of total AA changes
    ptree = copy.deepcopy(copy_tree)
    labels = {}
    for clade in ptree.find_clades():
        if clade.name is not None:
            parent = self.__get_parent(ptree, clade)
            if parent is None:
                path = ptree.get_path(clade)
                if len(path) == 1 and clade.name != first:
                    fname = inv_dict[first]
                    parent = ptree.find_clades(
                        name=re.escape(fname)).next()

            if parent is not None and parent.name is not None:
                diffs = list(int_aas.seqdiff(clade.name, parent.name))
                # A zero count is shown as an empty label rather than "0".
                if "node_" in clade.name:
                    labels[clade.name] = str(
                        len(diffs)) if len(diffs) > 0 else ""
                else:
                    labels[clade.name] = str(clade.name) + (
                        " " + str(len(diffs)) if len(diffs) > 0 else "")

    for clade in ptree.find_clades():
        if clade.name is not None and clade.name in labels:
            clade.name = labels[clade.name]

    Phylo.write(ptree,
                wdir + "/" + tag + "annotated_treefile_tot.new",
                "newick",
                plain=False)

    # Text reports of both alignments.
    f = open(wdir + "/" + tag + "aa_alignment.txt", "w")
    f.write(int_aas.report(100))
    f.close()

    f = open(wdir + "/" + tag + "nt_alignment.txt", "w")
    f.write(int_seqs.report(100))
    f.close()

    # Clear descriptions before the amino-acid records are written as FASTA.
    for rec in int_aas:
        rec.description = ""

    AlignIO.write(int_aas, wdir + "/" + tag + "aa_alignment.fa", "fasta")
    AlignIO.write(int_seqs, wdir + "/" + tag + "nt_alignment.fa", "fasta")
    return int_aas
def run_dnaml(self, seq_align, ptree, seqpattern, cdrfile, wdir, rep, tag=""):
    """Run dnaml on an alignment and tree, then write annotated trees and alignments.

    Arguments are:
    seq_align:  the input nt sequences (MultipleSequenceAlignment). The record
                ids are rewritten in place to temporary names of the form
                N000000001, N000000002, ...
    ptree:      phylogenetic tree (Bio.Phylo). It is re-rooted and its clade
                names are overwritten in place with the annotated labels.
    seqpattern: A list of sequence number directives, in the format accepted by
                Alignment.set_position_numbers
    cdrfile:    file name handed to AnalyseCDR, or None to skip the CDR/FR
                summary tree
    wdir:       the name of a directory that run_dnaml should use. This must
                exist already.
    rep:        a function that takes a string as an argument. This will be
                called should an error or warning be necessary (may be called
                multiple times in one invocation).
    tag:        an optional tag to prefix filenames with

    Returns the amino-acid Alignment restricted to the records whose ids are
    clade names in the tree, or None (after calling rep) when dnaml produced
    no output file or the output could not be parsed.

    Raises ValueError when the first sequence cannot be found in the tree or
    the trees cannot be rooted and written.

    Sequences in seq_align must be the same length, must start on a codon
    boundary, and be an integral number of codons in length. The first sequence
    must be the ancestral sequence or outgroup. Exactly the same sequence names
    must occur in the alignment and the tree. Sequence name format is pretty
    flexible (sequences are mapped to safe temporary names before dnaml is run
    and mapped back afterwards).
    """

    def out_path(name):
        # Same path construction as before: plain "/" concatenation onto wdir.
        return wdir + "/" + name

    # Translate sequence ids to something safe
    namedict = {}
    for serial, seq in enumerate(seq_align, 1):
        namedict[seq.id] = "N%09d" % serial
        seq.id = namedict[seq.id]
    inv_dict = {v: k for k, v in namedict.items()}

    qtree = copy.deepcopy(ptree)
    for clade in qtree.get_terminals():
        if clade.name and clade.name in namedict:
            clade.name = namedict[clade.name]

    # Root the tree on the first record
    first = "N%09d" % 1
    try:
        qtree.root_with_outgroup(next(qtree.find_clades(name=re.escape(first))))
    except Exception:
        raise ValueError("Error: root sequence not found in tree.")

    try:
        ptree.root_with_outgroup(ptree.find_clades(name=re.escape(inv_dict[first])))
        Phylo.write(ptree, out_path("input_treefile.new"), "newick", plain=False)
    except Exception:
        raise ValueError("Error rooting trees: check for corrupt tree file or duplicated sequences.")

    # Write the sequences, in PHYLIP format (real PHYLIP format, as used by PHYLIP!)
    with open(out_path("infile"), "w") as f:
        f.write(" %d %d\n" % (len(seq_align), len(seq_align[0])))
        for seq in seq_align:
            f.write("%10s%s\n" % (seq.id, seq.seq.upper()))

    # Write the tree file
    Phylo.write(qtree, out_path("intree"), "newick")

    # Remove results of any previous run so stale output is never picked up
    for stale in ("outfile", "outtree"):
        if os.path.exists(out_path(stale)):
            os.remove(out_path(stale))

    # The control file sits next to this module and shares its base name.
    # splitext/dirname are used instead of substring replacement so that a
    # ".py" elsewhere in the path cannot corrupt the result.
    module_path = os.path.abspath(__file__)
    ctlfile = os.path.splitext(module_path)[0] + ".ctl"

    # Check for dnaml in the module's directory
    dnamlfile = os.path.join(os.path.dirname(module_path), "dnaml")
    if not os.path.exists(dnamlfile):
        dnamlfile = "dnaml"  # must be on the path somewhere

    with open(out_path("dnaml.txt"), "w") as log, open(ctlfile, "r") as ctl:
        subprocess.call(dnamlfile, cwd=wdir, stdin=ctl, stdout=log)

    if not os.path.isfile(out_path("outfile")):
        rep("No output returned by dnaml: please check the logs for the issue.")
        return None

    if os.path.isfile(out_path("outfile.txt")):
        os.remove(out_path("outfile.txt"))
    os.rename(out_path("outfile"), out_path("outfile.txt"))

    intseqs = self.__parse_outfile(out_path("outfile.txt"))
    if not intseqs:
        rep("Unexpected output returned by dnaml: please check the logs for the issue.")
        return None

    # Custom sort function to put the root record first, then others supplied
    # by the user, then intermediate nodes
    def key_ids(rec):
        if rec.id == first:
            return 'a__' + rec.id
        elif 'node_' in rec.id:
            return 'z__' + "%04d" % int(rec.id.split("_")[1])
        else:
            return 'l__' + rec.id

    labelled_tree = Phylo.read(out_path("outtree"), "newick")

    intseqs.seek(0)
    int_seqs = Alignment(file_name=intseqs, format="fasta")
    int_seqs.sort(key=key_ids)

    intseqs.seek(0)
    int_aas = Alignment()
    int_aas.read_nt(intseqs, "fasta")
    int_aas.sort(key=key_ids)
    int_aas.set_position_numbers(position_numbers=seqpattern)

    # Put back the original names in all our collections
    for collection in (int_seqs, int_aas):
        for seq in collection:
            if seq.id in inv_dict:
                seq.id = inv_dict[seq.id]
                seq.name = ""
                seq.description = ""

    nodeid = 1
    for clade in labelled_tree.find_clades(order="preorder"):
        if clade.name is None:
            # This relies on our traversal using the same order as dnaml
            clade.name = "node_%d" % nodeid
            nodeid += 1
        elif clade.name in inv_dict:
            clade.name = inv_dict[clade.name]

    # Now we need to map the labelling of the nodes in the labelled tree to
    # the nodes in the original tree
    self.__map_names(ptree, labelled_tree)
    Phylo.write(ptree, out_path(tag + "intermediates_treefile.new"), "newick", plain=False)

    # Keep only the records that correspond to a named clade in the tree
    cladenames = set(clade.name for clade in ptree.find_clades() if clade.name is not None)
    new_int_aas = Alignment()
    for rec in int_aas:
        if rec.id in cladenames:
            new_int_aas.append(rec)
    int_aas = new_int_aas
    int_aas.set_position_numbers(position_numbers=seqpattern)

    # Pristine copy: each later annotation pass relabels its own deep copy
    copy_tree = copy.deepcopy(ptree)
    root_name = inv_dict[first]

    def find_parent(tree, clade):
        # Parent of a clade; a clade hanging directly off the root falls back
        # to the clade carrying the root record's name.
        parent = self.__get_parent(tree, clade)
        if parent is None:
            # NOTE(review): tree names appear to be the original names here
            # while `first` is the temporary name, so the second test looks
            # like it is always true - confirm before tightening.
            if len(tree.get_path(clade)) == 1 and clade.name != first:
                parent = next(tree.find_clades(name=re.escape(root_name)))
        return parent

    def annotate(tree, make_label, filename):
        # Relabel every named clade with make_label(name, parent name) and
        # write the tree. Labels are collected first so lookups by name keep
        # working while the tree is being traversed.
        labels = {}
        for clade in tree.find_clades():
            if clade.name is None:
                continue
            parent = find_parent(tree, clade)
            if parent is not None and parent.name is not None:
                labels[clade.name] = make_label(clade.name, parent.name)
        for clade in tree.find_clades():
            if clade.name is not None and clade.name in labels:
                clade.name = labels[clade.name]
        Phylo.write(tree, out_path(tag + filename), "newick", plain=False)

    def with_name(name, text):
        # Intermediate nodes show only the annotation; others keep their name.
        if "node_" in name:
            return text
        return str(name) + " " + text

    # Calculate AA diffs between each node and its parent, and write to the tree
    def diffkey(diff):
        return int_aas.index_of(diff[1:-1])

    def diff_label(name, parent_name):
        diffs = list(int_aas.seqdiff(name, parent_name))
        diffs.sort(key=diffkey)
        return with_name(name, "+".join(diffs))

    annotate(ptree, diff_label, "annotated_treefile.new")

    # Now write a tree with summary CDR/FR total changes
    if cdrfile is not None:
        acdr = AnalyseCDR(int_aas, file_name=cdrfile)

        def category_label(name, parent_name):
            return with_name(name, acdr.category_diff(name, parent_name))

        annotate(copy.deepcopy(copy_tree), category_label, "annotated_treefile_sum.new")

    # And write a tree with counts of total AA changes
    def count_label(name, parent_name):
        count = len(list(int_aas.seqdiff(name, parent_name)))
        if "node_" in name:
            return str(count) if count > 0 else ""
        return str(name) + (" " + str(count) if count > 0 else "")

    annotate(copy.deepcopy(copy_tree), count_label, "annotated_treefile_tot.new")

    with open(out_path(tag + "aa_alignment.txt"), "w") as f:
        f.write(int_aas.report(100))
    with open(out_path(tag + "nt_alignment.txt"), "w") as f:
        f.write(int_seqs.report(100))

    for rec in int_aas:
        rec.description = ""

    AlignIO.write(int_aas, out_path(tag + "aa_alignment.fa"), "fasta")
    AlignIO.write(int_seqs, out_path(tag + "nt_alignment.fa"), "fasta")
    return int_aas
def main(argv):
    """Command-line entry point: annotate a tree with dnaml-inferred changes.

    argv is the full argument vector (program name first). Accepted forms:

        prog -t
            run conduct_tests() and exit.
        prog seqnumfile seqfile treefile cdrfile tag wd
            run the analysis, writing results into directory wd (created if
            missing). An empty cdrfile argument disables the CDR analysis.

    Every failure prints a message and exits via sys.exit(0).
    NOTE(review): the exit status is 0 on errors as well; kept unchanged in
    case wrappers rely on it.
    """
    def bail(message):
        # Report a fatal problem and stop.
        print(message)
        sys.exit(0)

    print("AnnotateTreeCmd v1.0")

    if len(argv) == 2 and argv[1] == '-t':
        conduct_tests()
        sys.exit(0)
    elif len(argv) != 7:
        bail('usage python AnnotateTreeCmd.py seqnumfile seqfile treefile cdrfile tag wd.')

    for filename in argv[1:4]:
        check_file(filename)

    (seqnumfile, seqfile, treefile, cdrfile, tag, wdir) = argv[1:7]

    if len(cdrfile) > 0:
        check_file(cdrfile)
    else:
        cdrfile = None

    try:
        if not os.path.exists(wdir):
            os.makedirs(wdir)
    except Exception:
        bail("Error creating directory %s." % wdir)

    # `except Exception` (not a bare except) so that the SystemExit raised for
    # a stop codon is not caught and re-reported as a parse error.
    try:
        msa = Alignment()
        msa.read_nt(seqfile)  # Check that the sequence comprises a valid set of codons
        for seq in msa:
            if '*' in seq:
                bail("Stop codon found in sequence %s." % seq.id)
    except Exception as err:
        bail("Error parsing %s: %s." % (seqfile, err))

    try:
        seq_pos = msa.read_position_numbers(seqnumfile)
    except Exception as err:
        bail("Error parsing %s: %s." % (seqnumfile, err))

    if cdrfile is not None:
        # Constructed only to validate the CDR file up front.
        try:
            AnalyseCDR(msa, file_name=cdrfile)
        except Exception as err:
            bail("Error parsing %s: %s." % (cdrfile, err))

    # The sequence file may be FASTA or PHYLIP; try them in that order.
    try:
        seq_align = AlignIO.read(seqfile, "fasta")
    except Exception:
        try:
            seq_align = AlignIO.read(seqfile, "phylip")
        except Exception as err:
            bail("Error parsing %s: %s." % (seqfile, err))

    try:
        tree = Phylo.read(treefile, "newick")
    except Exception as err:
        bail("Error parsing %s: %s." % (treefile, err))

    dnaml = Dnaml()
    int_aas = dnaml.run_dnaml(seq_align, tree, seq_pos, cdrfile, wdir, report, tag)
    if int_aas is None:
        return

    prefix = wdir + "/" + tag

    try:
        if cdrfile is not None:
            acdr = AnalyseCDR(int_aas, file_name=cdrfile)
            cdr_output = acdr.analyse()
            with open(prefix + "cdr_analysis.html", "w") as fo:
                fo.write(cdr_output)
    except Exception as err:
        print("Warning: CDRs were not analysed: " + str(err))

    # Render each tree as PNG then SVG, collecting garbage between renders.
    stems = ["annotated_treefile"]
    if cdrfile is not None:
        stems.append("annotated_treefile_sum")
    stems.extend(["annotated_treefile_tot", "intermediates_treefile"])
    try:
        gc.collect()
        for stem in stems:
            for ext in (".png", ".svg"):
                RenderTree.render_annotate(prefix + stem + ".new", prefix + stem + ext)
                gc.collect()
    except Exception as err:
        print("Error rendering trees: " + str(err))

    # The logo uses only the supplied sequences: drop the first record and
    # the intermediate nodes.
    orig_recs = [
        rec
        for index, rec in enumerate(SeqIO.parse(prefix + "aa_alignment.fa", "fasta"))
        if index > 0 and "node_" not in rec.id
    ]
    logo_alignment_file = prefix + "alignment_for_logo.fa"
    SeqIO.write(orig_recs, logo_alignment_file, "fasta")

    # NOTE(review): tag is interpolated into a shell command line; it comes
    # from this program's own arguments.
    with open(prefix + "weblogo_status.txt", "w") as fo:
        retcode = subprocess.call(
            "seqlogo -f %salignment_for_logo.fa -F PNG -o aa_logo -h 2 -w 20 -acS" % tag,
            cwd=wdir, shell=True, stdout=fo, stderr=subprocess.STDOUT)
        if retcode == 1:
            fo.write("Trying seqlogo.pl instead.\n")
            # Flush so our note lands before the child's output in the file.
            fo.flush()
            retcode = subprocess.call(
                "seqlogo.pl -f %salignment_for_logo.fa -F PNG -o aa_logo -h 2 -w 20 -acS" % tag,
                cwd=wdir, shell=True, stdout=fo, stderr=subprocess.STDOUT)
            if retcode == 1:
                print("Weblogo not installed: logo plot will not be generated.")
from Alignment import Alignment
from Role import Role
from Roles.Medic import Medic
from Roles.Vigi import Vigi
from Roles.Cop import Cop
from Roles.Smart_Hunter_Ven import Smart_Hunter_Ven
from Roles.Innocent_Child import Innocent_Child
from Roles.Possum import Possum
import Strategy
from Strategies.VigiStrats import VigiStratSimpleActivation
from Strategies.VigiStrats import VigiStratSmartActivation

# Module-level Alignment instances. Each one records a name, a win condition
# and a night action; the two Mafia variants differ only in night_action.
ALIGNMENT_TOWN = Alignment(name="Town", win_con="Last Faction Standing", night_action="None")
ALIGNMENT_MAFIA_2KP2 = Alignment(name="Mafia", win_con="Parity", night_action="2 kp till 2")
ALIGNMENT_MAFIA = Alignment(name="Mafia", win_con="Parity", night_action="1 kp")

# Module-level role instances.
MEDIC = Medic(name="Medic", active_n0=True, heal_self=False)
# The two one-shot vigis share a name and differ only in active_n0.
ONESHOTVIGI = Vigi(name="One Shot Vigi", active_n0=True, charges=1)
ONESHOTVIGINON0 = Vigi(name="One Shot Vigi", active_n0=False, charges=1)
COP = Cop(name="Full Alignment Cop", active_n0=True, target_self=False)
SMART_HUNTER_VEN = Smart_Hunter_Ven(name="Smart Hunter")
INNOCENT_CHILD = Innocent_Child(name="Inno Child")
if args.hand: hand_alignment = tfolder.get_alignment([args.lang1, args.lang2], backend='hand') forced_rungs = hand_alignment.as_ladder() print >> sys.stderr, "%d hand-aligned pairs found." % len(forced_rungs) # prealign if args.prealign: pre_alignment = list(find_matches(t1, t2, threshold=0.5, pair_count=100)) forced_rungs.extend(pre_alignment) print >> sys.stderr, "%d sentence pairs matched." % len(pre_alignment) forced_rungs = sorted(set(forced_rungs)) try: a = None a = make_composed_alignment(t1, t2, forced_rungs) a = Alignment(a) finally: output_filename = '%s/%s-%s.my' % (args.folder, args.lang1, args.lang2) if not a: raise SystemExit with open(output_filename, 'w') as f: for i, j, c in a.data: f.write("%d\t%d\t%.2f\n" % (i, j, c)) print >> sys.stderr, "Wrote alignment to %s." % output_filename c = a.summed_cost() print >> sys.stderr, "Total cost", c, "avg", c/len(a.data) if args.plot: import plot plot.plot_cost_matrix(cost, a.as_ladder(),
primer_range = Primer_range(BED_FILE) # [+strand alignment, -strand alignment] alignment_bucket = [None, None] out_buffer1 = [] out_buffer2 = [] cnt = 0 current_read = 'default' # Current read name for sam_line in sys.stdin: # Skip header if sam_line.startswith('@'): continue alignment = Alignment(sam_line.rstrip()) # Initialize alignment backet if current_read != alignment.read_name: alignment_bucket = [None, None] current_read = alignment.read_name # Skip non-primary and supplemental alignments if alignment.flag & (256 + 2048): continue if alignment.strand == '+': alignment_bucket[0] = alignment elif alignment.strand == '-': alignment_bucket[1] = alignment
from Alignment import Alignment

# Resolved from two nested, unresolved merge conflicts that left the file
# unparseable. Kept: the incoming greeting messages and the enabled
# Alignment.test() call (the side of the innermost conflict that was not
# commented out).
# NOTE(review): confirm this is the intended resolution; HEAD printed only
# "welcome" before calling Alignment.test().
msg = "welcome to my world, MK"
print(msg)
msg = "Hurry up...No time to lose!!!"
print(msg)
Alignment.test()
def getTemplates(self):
    """BLAST self.seq against PDB and record the resulting templates.

    For every BLAST alignment a Template is stored in self.templates (keyed by
    accession) and its FASTA in self.fastas. For every HSP an Alignment is
    appended both to that template's alignments and to self.alignments. The
    target and template strings of each Alignment are extended to span the
    whole of self.seq: the target with the unaligned residues of self.seq,
    the template with '-' padding of the same length.

    FASTA files and JSON-serialised alignments are then written to
    self.fastasfolder / self.alignmentsfolder unless the file already exists.

    Returns the keys of self.templates.
    """
    # http://biopython.org/DIST/docs/api/Bio.Blast.NCBIWWW-module.html
    if self.debug:
        print(self.seq)

    full_seq = str(self.seq)
    seq_len = len(self.seq)

    # Send BLAST request to server
    # Use blastp (protein) for the method
    # Use pdb as the database
    result_handle = NCBIWWW.qblast("blastp", "pdb", full_seq, expect=0.01)
    try:
        # Parse the results into blast records
        blast_records = NCBIXML.parse(result_handle)
        if self.debug:
            print("BLAST Request Finished")

        # Read through each blast record
        for record in blast_records:
            # Grab the alignments from each record
            for alignment in record.alignments:
                # Use the alignment accession as the template key
                template_id = alignment.accession
                fasta = self.getFastaFromId(template_id)
                title = alignment.title
                length = alignment.length

                # Set up and store the template object for this id
                self.templates[template_id] = Template(
                    id=template_id, fasta=fasta, sequence=title,
                    length=length, alignments=[]
                )
                # Store fasta in dict
                self.fastas[template_id] = fasta

                # Get all alignments for this template
                for hsp in alignment.hsps:
                    similarity = 100 * hsp.identities / seq_len
                    # Create an alignment object
                    a = Alignment(
                        id=template_id, title=title, expect=hsp.expect, score=hsp.score,
                        identities=hsp.identities, similarity=similarity,
                        target=hsp.query, targetstart=hsp.query_start, match=hsp.match,
                        template=hsp.sbjct, templatestart=hsp.sbjct_start, length=length
                    )

                    # Alignment isn't necessarily the same size as the sequence:
                    # pad both strings out to cover all of self.seq. The aligned
                    # region covers query positions query_start..query_end
                    # (1-based, inclusive), so the tail starts at query_end.
                    # Both offsets are taken before a.length is overwritten;
                    # previously the template tail was computed afterwards and
                    # was therefore never padded.
                    lead = a.targetstart - 1
                    tail = max(seq_len - hsp.query_end, 0)
                    a.target = full_seq[:lead] + a.target + full_seq[hsp.query_end:]
                    a.template = '-' * lead + a.template + '-' * tail
                    a.length = len(a.target)
                    if self.debug:
                        print("Seq vs Target Length:", len(self.seq), a.length)

                    # Append the alignment to the template's alignments
                    self.templates[template_id].alignments.append(a)
                    self.alignments.append(a)

                    if self.debug:
                        print()
                        print('****ALIGNMENT***')
                        print('id:', template_id)
                        print('sequence:', title)
                        print('length:', length)
                        print('e value:', hsp.expect)
                        print('score:', hsp.score)
                        print('identities:', similarity)
                        # need to print percentage of similarities
                        print("Target :" + hsp.query[0:75] + '...')
                        print("Match :" + hsp.match[0:75] + '...')
                        print("Template:" + hsp.sbjct[0:75] + '...')
                        print()
    finally:
        result_handle.close()

    # Save off the fasta files
    for template_id, fasta in self.fastas.items():
        fname = '%s/%s.fasta' % (self.fastasfolder, template_id)
        if not os.path.exists(fname):
            with open(fname, 'w') as f:
                SeqIO.write(fasta, f, 'fasta')

    # Save off the alignments
    for i, a in enumerate(self.alignments):
        fname = '%s/%s-%s.alignment' % (self.alignmentsfolder, a.id, str(i))
        if not os.path.exists(fname):
            with open(fname, 'w') as f:
                json.dump(a.toJSON(), f)

    return self.templates.keys()
#!/usr/bin/env python3
# Usage: ./msa_approx.py score_matrix 5 test.fa
# Requires Alignment.py in the directory
from Alignment import Alignment
from Alignment import GetArguments
import sys

# Parse the command line, build the aligner and compute the multiple alignment.
arguments = GetArguments(sys.argv)
test = Alignment(arguments.seqs, arguments.score_matrix, arguments.gapcost)
alignm = test.multiple_align()

# Print each aligned row FASTA-style; test.seqOrder maps the row position
# to the index of its header in arguments.heads.
for row_index, row in enumerate(alignm):
    header = arguments.heads[test.seqOrder[row_index]]
    print(">", header)
    print(test.num_to_sequence(row), "\n")
def gen(): yield (0, 0, 0) prev_i2 = 0 for (i1, i2) in al12: for _i2 in range(prev_i2+1, i2+1): try: i3s = map23[_i2] for i3 in i3s: # if map31[i3] == i1: yield (i1, _i2, i3) except KeyError: pass prev_i2 = i2 return Alignment(list(gen()), no_costs=True) if __name__ == '__main__': import sys name = sys.argv[1] a1 = Alignment.from_file(name + '/pl-cu.my').as_ladder() a2 = Alignment.from_file(name + '/cu-el.my').as_ladder() a3 = Alignment.from_file(name + '/pl-el.my').as_ladder() a3 = [(b, a) for (a, b) in a3] ma = merge_3_alignments(a1, a2, a3) ma.pretty_print(Text.from_file(name + '/pl.txt', lang='pl').as_sentences_flat(), Text.from_file(name + '/cu.txt', lang='cu').as_sentences_flat(), Text.from_file(name + '/el.txt', lang='el').as_sentences_flat())