def GetExec(self, optList, frame):
    """Respond to the "embossn" type command.

    Stores ``frame`` on the instance, records the output file path and
    output type, and assembles a ``NeedleCommandline`` from the values held
    in the frame's parameter boxes and in this object's own controls.

    ``optList`` is accepted but not used by this method.

    Returns the assembled command line rendered as a string.
    """
    self.frame = frame
    self.outtype = "fasta"
    self.outfile = r"C:\Users\francis\Documents\Monguis\BioGui\plugins\needle.txt"
    needle_path = r"C:/mEMBOSS/needle.exe"

    # The two input sequences come from parameter boxes 1 and 3 of the frame.
    first_sequence = str(self.frame.paramBoxes[1].GetValue())
    second_sequence = str(self.frame.paramBoxes[3].GetValue())
    cline = NeedleCommandline(
        needle_path,
        asequence=first_sequence,
        bsequence=second_sequence,
    )

    cline.outfile = self.outfile
    cline.gapopen = self.param[7].GetValue()
    cline.gapextend = self.param[9].GetValue()
    cline.similarity = bool(self.param[10].GetValue())

    # Only the three known alphabets set the sequence-type switches; any
    # other value leaves both switches untouched.
    alphabet = self.frame.abet
    if alphabet == "AA":
        cline.snucleotide, cline.sprotein = False, True
    elif alphabet in ("DNA", "RNA"):
        cline.snucleotide, cline.sprotein = True, False

    if self.frame.options:
        datafile = self.boxList[3].GetValue()
        if datafile != '':
            cline.datafile = str(datafile)

    return str(cline)
def out():
    """Run needle on the two chosen files and display the alignment.

    Reads the two input file names and the destination file name from the
    module-level entry widgets ``e1``, ``e2`` and ``e3``, runs needle with
    the module-level ``gapopen`` / ``gapextend`` values, copies the raw
    needle report (prefixed with two newlines) to the destination file and
    shows the same text in a new scrollable Tk window.  After that window's
    main loop ends, the module-level ``win`` window is destroyed.
    """
    filename1 = e1.get()
    filename2 = e2.get()
    outfile = e3.get()

    needle_cline = NeedleCommandline()
    needle_cline.asequence = filename1
    needle_cline.bsequence = filename2
    needle_cline.gapopen = int(gapopen)
    needle_cline.gapextend = int(gapextend)
    needle_cline.outfile = "needle.txt"
    print(needle_cline)
    print(needle_cline.outfile)
    stdout, stderr = needle_cline()
    print(stdout + stderr)

    # The parsed alignment itself is not used; the call is kept so that an
    # unparsable report still fails at this point.
    # NOTE(review): confirm this validation is actually wanted.
    AlignIO.read("needle.txt", "emboss")

    # Read the report through a context manager so the handle is closed
    # (the previous version left it open).
    with open("needle.txt", "r") as report:
        view = "\n\n%s" % report.read()
    with open(outfile, "w") as f:
        f.write(view)

    root = Tk()
    scrollbar = Scrollbar(root)
    text = Text(root, height=50, width=500)
    scrollbar.pack(side=RIGHT, fill=Y)
    text.pack(side=LEFT, fill=Y)
    # One vertical scrollbar: wire it to the y-view only.  Previously a second
    # pair of config calls re-pointed it at the x-view, so dragging the
    # vertical bar scrolled horizontally and the slider received both x and y
    # position updates.
    scrollbar.config(command=text.yview)
    text.config(yscrollcommand=scrollbar.set)
    text.insert(END, view, 'color')
    mainloop()
    win.destroy()
def GetExec(self, optList, frame):
    """Respond to the "embossn" type command.

    Remembers ``frame`` on the instance, sets the output file path and output
    type, then builds a ``NeedleCommandline`` from the frame's parameter
    boxes and this object's controls.

    ``optList`` is part of the signature but is not read here.

    Returns the command line as a string.
    """
    self.frame = frame
    executable = r"C:/mEMBOSS/needle.exe"
    self.outfile = r"C:\Users\francis\Documents\Monguis\BioGui\plugins\needle.txt"
    self.outtype = "fasta"

    boxes = self.frame.paramBoxes
    cline = NeedleCommandline(
        executable,
        asequence=str(boxes[1].GetValue()),
        bsequence=str(boxes[3].GetValue()),
    )
    cline.outfile = self.outfile
    cline.gapopen = self.param[7].GetValue()
    cline.gapextend = self.param[9].GetValue()
    # Normalise whatever the control returns to a plain True/False switch.
    cline.similarity = True if self.param[10].GetValue() else False

    abet = self.frame.abet
    if abet == "AA":
        cline.snucleotide = False
        cline.sprotein = True
    elif abet in ("DNA", "RNA"):
        cline.snucleotide = True
        cline.sprotein = False

    # A data file is only passed when options are enabled and the box is
    # not empty.
    if self.frame.options:
        chosen = self.boxList[3].GetValue()
        if chosen != '':
            cline.datafile = str(chosen)
    return str(cline)
def each_seq_align(each_id=0, record_list=(), pair_aln_dir=''):
    """Align two records with needle and reduce the report to one line.

    The first two entries of ``record_list`` are written to temporary FASTA
    files inside ``pair_aln_dir``, needle is run on them (gap open 10, gap
    extend 0.5), and the temporary FASTA files are removed.  The needle
    report ``<each_id>.txt`` is then replaced by a single tab-separated line
    ``<each_id>\\t<identity>\\n`` taken from the last ``Identity`` line of the
    report, or by an empty file when no such line is found.

    :param each_id: identifier used to name the temporary and result files
    :param record_list: sequence whose first two items are the records to align
    :param pair_aln_dir: directory that receives the temporary and result files
    :return: None; the outcome is the rewritten result file
    """
    file_stem = str(each_id)
    # prepare pairwise sequence alignment files
    tmp_a_seq = os.path.join(pair_aln_dir, '{0}_a.fasta'.format(file_stem))
    SeqIO.write(record_list[0], tmp_a_seq, 'fasta')
    tmp_b_seq = os.path.join(pair_aln_dir, '{0}_b.fasta'.format(file_stem))
    SeqIO.write(record_list[1], tmp_b_seq, 'fasta')
    result_file = os.path.join(pair_aln_dir, "{0}.txt".format(file_stem))

    needle_cline = NeedleCommandline()
    needle_cline.asequence = tmp_a_seq
    needle_cline.bsequence = tmp_b_seq
    needle_cline.gapopen = 10
    needle_cline.gapextend = 0.5
    needle_cline.outfile = result_file

    # Discard needle's console output; the context manager closes the
    # devnull handle, which was previously leaked on every call.
    # NOTE(review): with shell=True a missing executable normally shows up as
    # a non-zero return code rather than OSError - confirm whether the return
    # code should be checked as well.
    with open(os.devnull, 'w') as devnull:
        try:
            subprocess.call(str(needle_cline), shell=True,
                            stdout=devnull, stderr=devnull)
        except OSError:
            sys.exit(1)
    os.remove(tmp_a_seq)
    os.remove(tmp_b_seq)

    in_pattern = re.compile(r'Identity.*\((\d+\.\d+)%\)')
    gene_alignment_result = ''
    with open(result_file, 'r') as f1:
        for a_line in f1:
            if 'Identity' not in a_line:
                continue
            m = in_pattern.search(a_line.strip())
            if m is None:
                # Mentions "Identity" but carries no percentage: skip it
                # instead of failing on m.group().
                continue
            gene_alignment_result = '{0}\t{1}\n'.format(file_stem, m.group(1))

    # Opening with 'w' truncates the report, so no separate remove is needed.
    with open(result_file, 'w') as f2:
        f2.write(gene_alignment_result)
def needleAlign(seq1, seq2, gapopen, gapextend):
    """Run needle on ``seq1`` and ``seq2`` and print its standard output.

    The alignment itself is written by needle to ``needle.txt``; this
    function returns nothing.
    """
    command = NeedleCommandline(
        asequence=seq1,
        bsequence=seq2,
        gapopen=gapopen,
        gapextend=gapextend,
        outfile="needle.txt",
    )
    captured_out, _captured_err = command()
    print(captured_out)
def build_target_matrix(self):
    """Fill ``self.assignments`` with the best needle hits per subject.

    If the subject file contains more ``>`` characters than the target file,
    the two files (and their names) are swapped so the subject file is the
    smaller one.  When ``<outdir>/matrix`` already exists it is unpickled
    into ``self.assignments`` and nothing else is done.  Otherwise every
    subject sequence is aligned with needle against the target file, the
    ``(id, gap percentage, score)`` tuples parsed from needle's standard
    output are sorted by score (highest first), the first ``self.assign`` of
    them are appended to ``self.assignments[subject.id]``, and the result is
    pickled to ``<outdir>/matrix``.
    """
    subject_headers = self.mysubject_file.read().count('>')
    target_headers = self.mytarget_file.read().count('>')
    if subject_headers > target_headers:
        # Ensures the subject file is the smaller one
        (self.mysubject_file, self.mytarget_file) = (self.mytarget_file,
                                                     self.mysubject_file)
        (self.subjectname, self.targetname) = (self.targetname,
                                               self.subjectname)
    # Both files were read to the end above; rewind the subject file so it
    # can be counted again below.
    self.mysubject_file.seek(0)

    matrixfile = self.outdir + '/matrix'
    if os.path.exists(matrixfile):
        # NOTE(review): pickle.load executes whatever the file encodes; only
        # safe while the output directory is trusted.
        # Binary mode plus a context manager: the handle is closed and the
        # pickle stream is not subject to newline translation.
        with open(matrixfile, 'rb') as cached:
            self.assignments = pickle.load(cached)
        return

    count = self.mysubject_file.read().count('>')
    results = re.compile(
        r'# 2: (\w+).+?# Gaps:\s+\d+/\d+ \((\d+\.\d+)%\).+?# Score: (\d+\.\d+)',
        re.DOTALL)

    needle = NeedleCommandline()
    needle.gapopen = self.gapopen
    needle.gapextend = self.gapextend
    needle.outfile = 'stdout'
    needle.bsequence = self.mytarget_file.name

    # The dictionary is not used further in this method; the call is kept
    # because it still parses the whole target file.
    # NOTE(review): confirm whether it can be dropped.
    mytargets = SeqIO.parse(self.mytarget_file.name, 'fasta')
    mytargets = SeqIO.to_dict(mytargets)
    mysubjects = SeqIO.parse(self.mysubject_file.name, 'fasta')

    # Parenthesised single-argument form prints identically whether print is
    # a statement or a function.
    print("Creating alignment matrix. Please wait...")
    pbar = ProgressBar(widgets=[Percentage(), Bar()], maxval=count).start()
    for done, subject in enumerate(mysubjects, 1):
        # Pass the subject sequence inline instead of through a file.
        needle.asequence = 'asis:%s' % str(subject.seq)
        (stdout, stderr) = needle()
        hits = results.findall(stdout)
        # Each hit is (target id, gap percentage, score); best score first.
        hits.sort(key=lambda hit: float(hit[2]), reverse=True)
        self.assignments.setdefault(subject.id, []).extend(hits[0:self.assign])
        pbar.update(done)
    #pbar.finish()

    # A disabled ggsearch36-based alternative that was kept here as a bare
    # string literal has been removed; it was never executed.
    with open(matrixfile, 'wb') as cache:
        pickle.dump(self.assignments, cache)
    return
if __name__ == "__main__":
    # Solve the Rosalind "NEED" exercise: download the listed nucleotide
    # records, align the first two with needle and print the alignment score.
    with open(os.path.join('data', 'rosalind_need.txt')) as dataset:
        ids = dataset.read().split()

    handle = Entrez.efetch(db='nucleotide', id=ids, rettype="fasta")
    try:
        records = list(SeqIO.parse(handle, 'fasta'))
    finally:
        # The download handle was previously never closed.
        handle.close()

    # Each record is saved to a file named after its id so needle can read it.
    for i, r in enumerate(records):
        with open(ids[i], 'w') as f:
            SeqIO.write(r, f, 'fasta')

    needle_cline = NeedleCommandline()
    needle_cline.asequence = ids[0]
    needle_cline.bsequence = ids[1]
    needle_cline.outfile = "need.txt"
    needle_cline.gapopen = 11
    needle_cline.gapextend = 1
    needle_cline.endopen = 11
    needle_cline.endextend = 1
    needle_cline.endweight = True
    needle_cline()

    with open('need.txt') as f:
        output = f.readlines()
    for line in output:
        if 'Score:' in line:
            # strip() already removes the newline; slicing off the last
            # character first would eat a digit on an unterminated last line.
            print(int(float(line.split(':')[-1].strip())))
def each_needle_run(pair_gene_dir, tmp_gene_converted_dir,
                    pair_gene_alignment_dir, og_id, strain_dict):
    """
    Call the Needle program to align the first two genes of one
    orthologous group and summarise the result.

    The records in ``<pair_gene_dir>/<og_id>.fasta`` are re-labelled with the
    strain label found in ``strain_dict``; the first two are written to
    ``a.fasta`` / ``b.fasta`` in ``tmp_gene_converted_dir``, aligned with
    Needle (gap open 10, gap extend 0.5) and the two temporary FASTA files
    are removed again.  The Needle report is left in
    ``pair_gene_alignment_dir``.

    :param pair_gene_dir: each homologous gene directory
    :param tmp_gene_converted_dir: directory that receives the two temporary
                                   FASTA files (removed after the run)
    :param pair_gene_alignment_dir: each orthologous gene pair-wised
                                    alignment directory
    :param og_id: each orthologous gene id
    :param strain_dict: mapping from strain id to a sequence whose first item
                        is used as the record id
    :return: one tab-separated line
             ``og_id, [gene_a|gene_b], identity, annotation`` built from the
             last ``Identity`` line of the report, or an empty string when no
             such line is found.  The process exits with status 1 when
             ``pair_gene_dir`` is missing or calling Needle raises OSError.
    """
    if not os.path.exists(pair_gene_dir):
        logger.error("There is no directory contains gene file, please check.")
        logger.error(last_exception())
        sys.exit(1)

    tmp_gene_fasta = os.path.join(pair_gene_dir, og_id + '.fasta')
    converted_records = []
    re_pattern = re.compile(r'fig\|(\d+\.\d+)\.peg\.(\d+)\s(.*)')
    in_pattern = re.compile(r'Identity.*\((\d+\.\d+)%\)')
    annotation = ''
    og_list = []
    for record in SeqIO.parse(tmp_gene_fasta, 'fasta'):
        m = re_pattern.search(record.description)
        strain_id = m.group(1)
        gene_id = '{0}.peg.{1}'.format(strain_id, m.group(2))
        og_list.append(gene_id)
        # The annotation of the last record read is the one reported.
        annotation = m.group(3)
        record.id = strain_dict[strain_id][0]
        final_record = SeqRecord(record.seq, record.id, description='')
        converted_records.append(final_record)

    the_strain_fasta = os.path.join(tmp_gene_converted_dir, 'a.fasta')
    other_strain_fasta = os.path.join(tmp_gene_converted_dir, 'b.fasta')
    SeqIO.write(converted_records[0], the_strain_fasta, 'fasta')
    SeqIO.write(converted_records[1], other_strain_fasta, 'fasta')
    result_file = os.path.join(pair_gene_alignment_dir,
                               "{0}.txt".format(og_id))

    needle_cline = NeedleCommandline()
    needle_cline.asequence = the_strain_fasta
    needle_cline.bsequence = other_strain_fasta
    needle_cline.gapopen = 10
    needle_cline.gapextend = 0.5
    needle_cline.outfile = result_file

    # Discard Needle's console output; the context manager closes the
    # devnull handle, which was previously leaked on every call.
    # NOTE(review): with shell=True a missing executable normally shows up as
    # a non-zero return code rather than OSError - confirm whether the return
    # code should be checked as well.
    with open(os.devnull, 'w') as devnull:
        try:
            subprocess.call(str(needle_cline), shell=True,
                            stdout=devnull, stderr=devnull)
        except OSError:
            logger.info(
                'Try to call Needle program failed, please check if Needle has been installed successfully.'
            )
            logger.error(last_exception())
            sys.exit(1)
    os.remove(the_strain_fasta)
    os.remove(other_strain_fasta)

    gene_alignment_result = ''
    with open(result_file, 'r') as f:
        for a_line in f:
            if 'Identity' not in a_line:
                continue
            m = in_pattern.search(a_line.strip())
            if m is None:
                # Mentions "Identity" but carries no percentage: skip it
                # instead of failing on m.group().
                continue
            identity = m.group(1)
            gene_alignment_result = '{0}\t[{1}|{2}]\t{3}\t{4}\n'.format(
                og_id, og_list[0], og_list[1], identity, annotation)
    return gene_alignment_result
Entrez.email = "*****@*****.**"

if __name__ == "__main__":
    # Solve the Rosalind "NEED" exercise: download the listed nucleotide
    # records, align the first two with needle and print the alignment score.
    with open('rosalind_need.txt') as dataset:
        ids = dataset.read().split()

    handle = Entrez.efetch(db='nucleotide', id=ids, rettype="fasta")
    try:
        records = list(SeqIO.parse(handle, 'fasta'))
    finally:
        # The download handle was previously never closed.
        handle.close()

    # Each record is saved to a file named after its id so needle can read it.
    for i, r in enumerate(records):
        with open(ids[i], 'w') as f:
            SeqIO.write(r, f, 'fasta')

    needle_cline = NeedleCommandline()
    needle_cline.asequence = ids[0]
    needle_cline.bsequence = ids[1]
    needle_cline.outfile = "rosalind_need_output.txt"
    needle_cline.gapopen = 10
    needle_cline.gapextend = 1
    needle_cline.endopen = 10
    needle_cline.endextend = 1
    needle_cline.endweight = True
    needle_cline()

    with open('rosalind_need_output.txt') as f:
        output = f.readlines()
    for line in output:
        if 'Score:' in line:
            # strip() already removes the newline; slicing off the last
            # character first would eat a digit on an unterminated last line.
            print(int(float(line.split(':')[-1].strip())))
# Download every record listed in ``ids`` (defined earlier in the script).
handle = Entrez.efetch(db='nucleotide', id=ids, rettype='fasta')
try:
    records = list(SeqIO.parse(handle, 'fasta'))
finally:
    # The download handle was previously never closed.
    handle.close()

# Each record is saved to a file named after its id so needle can read it.
for i, record in enumerate(records):
    with open(ids[i], 'w') as f:
        SeqIO.write(record, f, 'fasta')

# Following step can be done in command line
# using 'needle'
needle_cl = NeedleCommandline()
needle_cl.asequence = ids[0]
needle_cl.bsequence = ids[1]
needle_cl.outfile = 'need_output.txt'
needle_cl.gapopen = 10
needle_cl.gapextend = 1
needle_cl.endopen = 10
needle_cl.endextend = 1
needle_cl.endweight = True
needle_cl()

# Score: print the last whitespace-separated field of the first line that
# mentions "Score", then stop.
with open('need_output.txt', 'r') as f:
    content = f.readlines()
for line in content:
    if 'Score' in line:
        score = float(line.split()[-1])
        print(score)
        break