def mafft_wrapper(work_msa):
    """Align ``work_msa.input_fasta`` with MAFFT and save the result.

    MAFFT is asked for Clustal-format output (``clustalout=True``); the
    text it prints on stdout is written to ``work_msa.output_aln`` via
    ``write_text``. Captured stderr is discarded.
    """
    mafft_cls = Applications.MafftCommandline
    run_mafft = mafft_cls(
        input=work_msa.input_fasta,
        clustalout=True,
    )
    alignment_text, _ = run_mafft()
    work_msa.output_aln.write_text(alignment_text)
def probcons_wrapper(work_msa):
    """Align ``work_msa.input_fasta`` with ProbCons and save the result.

    The command is built with ``clustalw=True``; whatever it prints on
    stdout is written to ``work_msa.output_aln`` via ``write_text``.
    Captured stderr is discarded.
    """
    run_probcons = Applications.ProbconsCommandline(
        input=work_msa.input_fasta,
        clustalw=True,
    )
    captured = run_probcons()
    alignment_text = captured[0]
    work_msa.output_aln.write_text(alignment_text)
def tcoffee_wrapper(work_msa):
    """Run T-Coffee on ``work_msa.input_fasta``.

    The tool itself writes the alignment to ``work_msa.output_aln``
    (passed as ``outfile``) with ``output='clustalw'``; the captured
    stdout/stderr returned by the command object are ignored.
    """
    tcoffee_cls = Applications.TCoffeeCommandline
    run_tcoffee = tcoffee_cls(
        infile=work_msa.input_fasta,
        outfile=work_msa.output_aln,
        output='clustalw',
    )
    run_tcoffee()
def msaprobs_wrapper(work_msa):
    """Run MSAProbs on ``work_msa.input_fasta``.

    The tool writes to ``work_msa.output_aln`` (passed as ``outfile``)
    and is invoked with ``clustalw=True``. Nothing is returned and the
    captured stdout/stderr are ignored.
    """
    run_msaprobs = Applications.MSAProbsCommandline(
        infile=work_msa.input_fasta,
        outfile=work_msa.output_aln,
        clustalw=True,
    )
    run_msaprobs()
def muscle_wrapper(work_msa):
    """Run MUSCLE on ``work_msa.input_fasta``.

    The tool writes to ``work_msa.output_aln`` (passed as ``out``) and is
    invoked with ``clw=True``. Nothing is returned and the captured
    stdout/stderr are ignored.
    """
    muscle_cls = Applications.MuscleCommandline
    run_muscle = muscle_cls(
        input=work_msa.input_fasta,
        out=work_msa.output_aln,
        clw=True,
    )
    run_muscle()
def clustalo_wrapper(work_msa):
    """Run Clustal Omega on ``work_msa.input_fasta``.

    The tool writes to ``work_msa.output_aln`` (passed as ``outfile``)
    with ``outfmt='clustal'``; ``verbose`` and ``auto`` are both switched
    on. Nothing is returned and the captured stdout/stderr are ignored.
    """
    run_clustalo = Applications.ClustalOmegaCommandline(
        infile=work_msa.input_fasta,
        outfile=work_msa.output_aln,
        outfmt='clustal',
        verbose=True,
        auto=True,
    )
    run_clustalo()
def dialign2_wrapper(work_msa):
    """Placeholder for a DIALIGN2 wrapper; not implemented.

    Args:
        work_msa: accepted for signature parity with the other wrappers;
            currently unused.

    Raises:
        NotImplementedError: always.
    """
    # TODO: the abandoned attempt built
    #   Applications.DialignCommandline('dialign', input=work_msa.input_fasta,
    #                                   fn=work_msa.output_aln.stem)
    # and ran it, but the resulting alignment could not be located. It sat
    # after the raise and was unreachable, so it is kept only as this note.
    raise NotImplementedError(
        "I can't figure out how to get dialign to output the MSA it calculates..."
    )
def __call__(self):
    """Run the configured alignment method and write its output.

    For ``self.method == 'Mafft'`` a ``MafftCommandline`` is built from
    ``self.cmd``, ``self.inpath`` and ``self.kwargs`` and executed; the
    captured stdout is written to ``self.outpath``. ``'Generic'`` is a
    placeholder that does nothing yet. Any other method value is
    silently ignored.

    Raises:
        ApplicationError: re-raised, after a short message is printed,
            when the MAFFT subprocess exits with a non-zero status. The
            output file is not created or truncated in that case.
    """
    # Either delegate call to BioPython or run internal method
    if self.method == 'Mafft':
        cmdline = Applications.MafftCommandline(self.cmd, input=self.inpath, **self.kwargs)
        try:
            stdout, _stderr = cmdline()  # Need to log stderr eventually
        except ApplicationError:  # Raised if subprocess return code != 0
            print("Failed to run MAFFT")  # Should process better eventually
            # Without this re-raise `stdout` would be unbound below and the
            # real failure would be masked by an UnboundLocalError.
            raise
        with open(self.outpath, 'w') as o:
            o.write(stdout)
    elif self.method == 'Generic':
        pass  # To be implemented
def renumber_pdb(config, path, pdb_name, sequences, dummy_dir):
    '''
    Renumber the chains of a PDB file according to reference sequences.

    For every chain id in ``sequences`` the chain's own sequence and the
    reference sequence are written to a temporary FASTA file, aligned with
    clustalw2, and the alignment is handed to ``create_mapping`` to obtain
    the new (number, version) of each residue.

    config      Configuration object; ``config.get('Paths', 'clustal_path')``
                must give the folder that contains ``clustalw2``
    path        Folder where the PDB file is located
    pdb_name    PDB file name
    sequences   Dictionary of sequences (objects providing
                ``get_identifier()`` and ``get_sequence()``) that define the
                residue numbering; the chain identifier is the key
    dummy_dir   Directory in which the temporary files are created

    Returns the renumbered PDB object. If running or parsing the alignment
    fails, the error is written to stderr and the exception object itself
    is RETURNED (not raised); the temporary files are left in place then.
    '''
    # Initialize
    from SBI.structure.chain import Chain
    from SBI.structure import PDB
    from Bio import AlignIO
    from Bio.Align import Applications

    clustal_exe = os.path.join(config.get('Paths', 'clustal_path'), 'clustalw2')
    name_pdb = ".".join(pdb_name.split('/')[-1].split('.')[:-1])
    new_pdb = PDB()
    pdb = PDB(os.path.join(path, pdb_name))
    pdb.clean()

    # items() works on both Python 2 and Python 3 dictionaries
    for chain_id, chain_seq in sequences.items():
        name_chain = name_pdb + "_" + chain_id
        name_seq = chain_seq.get_identifier()
        pdb_chain = pdb.get_chain_by_id(chain_id)
        new_chain = Chain(name_pdb, chain_id)

        # define/create files
        tmp_prefix = dummy_dir + "/tmp_" + name_chain + "_" + name_seq
        infile = tmp_prefix + ".fa"
        outfile = tmp_prefix + ".aln"
        dndfile = tmp_prefix + ".dnd"
        with open(infile, "w") as fd:
            fd.write(">{0:s}\n{1:s}\n".format(name_chain, pdb_chain.protein_sequence))
            fd.write(">{0:s}\n{1:s}\n".format(name_seq, chain_seq.get_sequence()))

        try:
            # run clustalw2
            msa_cline = Applications.ClustalwCommandline(clustal_exe, infile=infile, outfile=outfile)
            # The original passed shell="/bin/bash"; any truthy value only
            # means shell=True (the default shell is used), so say so.
            child = subprocess.Popen(str(msa_cline), stdout=subprocess.PIPE,
                                     stderr=subprocess.PIPE, shell=True)
            child.communicate()
            # store alignment: first row is the structure, second the reference
            alignment = AlignIO.read(outfile, 'clustal')
            structure = alignment[0].seq
            reference = alignment[1].seq
        except Exception as e:
            sys.stderr.write("ERROR: %s\n" % e)
            return e

        # remove temporary fasta and alignment files
        remove_files([infile, outfile, dndfile])

        # mapping of residues to the original sequence
        mapping = create_mapping(pdb_chain.protein_idx.split(";"), structure, reference)

        # fill the new chain with the correct numbering of residues
        for residue in pdb_chain.aminoacids:
            pair = (str(residue.number), residue.version)
            # NOTE(review): a residue missing from the mapping makes get()
            # return None and the unpacking fail - confirm create_mapping
            # always covers every residue.
            number, version = mapping.get(pair)
            residue.number = number
            residue.version = version
            new_chain.add_residue(residue)

        # fill the new pdb
        new_pdb.add_chain(new_chain)

    return new_pdb
def _report_started():
    """Print the start timestamp and return it for elapsed-time reporting."""
    started = datetime.now()
    print("Starting at: %s" % started.strftime("%Y-%m-%d %H:%M:%S"))
    return started


def _report_finished(started):
    """Print the finish timestamp and the time elapsed since *started*."""
    finished = datetime.now()
    print("Finished at: %s" % finished.strftime("%Y-%m-%d %H:%M:%S"))
    print("Total elapsed time: %s" % str(finished - started))


def _run_alignment_command(cmd, started, verbose):
    """Run *cmd* through a subprocess, then print timing and, if verbose, its output."""
    child = subprocess.Popen(
        str(cmd),
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        universal_newlines=True,
        shell=(sys.platform != "win32"),
    )
    # communicate() drains both pipes while waiting. wait() with PIPE can
    # deadlock once the tool fills an OS pipe buffer.
    stdout, stderr = child.communicate()
    _report_finished(started)
    if verbose:
        if stdout:
            print("\nStandard out is: %s\n" % stdout)
        else:
            print("\nStandard out is empty!\n")
        if stderr:
            print("Standard error is: %s" % stderr)
        else:
            print("Standard error is empty")


def __align_sequences__(args=None, seqs=None):
    """Align sequences with the tool named by ``args.tool`` and print the result.

    Supported tools:

    * ``"clustalo"`` - runs ``/usr/local/bin/clustalo`` on ``args.input_file``
      (using the module-level ``nproc`` for the thread count), writing
      ``<input>.aln.clustalo`` and ``<input>.dnd.clustalo``.
    * ``"muscle"`` - runs MUSCLE; ``args.file_output_format`` may be empty/None
      (FASTA, ``<input>.aln.muscle``), ``"clustal"`` or ``"clustal-strict"``.
    * ``"emboss"`` - not implemented.
    * ``"blast"`` - queries NCBI BLAST over the network (or, when
      ``args.file_input_format`` is ``"xml"``, parses ``args.input_file`` as
      saved BLAST XML), writes ``blast.xml`` and prints every HSP.
    * anything else - falls back to a ``pairwise2`` global alignment.

    Args:
        args: parsed command-line namespace; required despite the ``None``
            default (attributes are read from it immediately).
        seqs: currently unused.

    Raises:
        AssertionError: if a required argument is missing.
        ValueError: for an unsupported MUSCLE output format, or if the
            BLAST input file cannot be read as one or more records.
        NotImplementedError: for the ``"emboss"`` tool.
    """
    if args.input_file is not None:
        assert args.file_input_format is not None, "Missed the file input format at __retrieve_data__(args=None)"
    if args.verbose:
        print("\nStarting sequences alignment process...\n\n")

    from Bio.Align import Applications

    if args.tool == "clustalo":
        started = _report_started()
        aligned_path = "%s.aln.clustalo" % args.input_file
        cmd = Applications.ClustalOmegaCommandline(
            r"/usr/local/bin/clustalo",
            infile=args.input_file,
            outfile=aligned_path,
            verbose=args.verbose,
            force=True,
            threads=nproc,
            guidetree_out="%s.dnd.clustalo" % args.input_file,
        )
        _run_alignment_command(cmd, started, args.verbose)

        from Bio import AlignIO
        print(AlignIO.read(aligned_path, "fasta"))

    elif args.tool == "muscle":
        started = _report_started()
        # requested format -> (MUSCLE switch, output path template, AlignIO format)
        muscle_formats = {
            None: (None, "%s.aln.muscle", "fasta"),
            "clustal": ("clw", "%s.aln.muscle.clustalwfmt", "clustal"),
            "clustal-strict": ("clwstrict", "%s.aln.muscle.clustalwstrictfmt", "clustal"),
        }
        # An empty format string means "default", exactly like None.
        requested = args.file_output_format or None
        if requested not in muscle_formats:
            # Previously no command was built and the literal string "None"
            # was handed to the shell.
            raise ValueError("Unsupported MUSCLE output format: %r" % (requested,))
        switch, path_template, read_format = muscle_formats[requested]
        aligned_path = path_template % args.input_file
        options = {"input": args.input_file, "out": aligned_path}
        if switch is not None:
            options[switch] = True
        cmd = Applications.MuscleCommandline(**options)
        _run_alignment_command(cmd, started, args.verbose)

        from Bio import AlignIO
        print(AlignIO.read(aligned_path, read_format))

    elif args.tool == "emboss":
        # TODO: an unfinished needle/water (args.emboss_algorithm) invocation
        # with hard-coded a/b sequence files used to follow this raise; it
        # was unreachable and has been dropped until real inputs are wired in.
        raise NotImplementedError(
            "Not implemented yet! Fix the a and b sequence files")

    elif args.tool == "blast":
        assert args.blast_app is not None, "Missed the -bap|--blast-app arg"
        assert args.blast_database is not None, "Missed the -bdb|--blast-database arg"
        started = _report_started()

        from Bio.Blast import NCBIWWW
        from Bio.Blast import NCBIXML

        if args.file_input_format.lower() != "xml":
            try:
                record = SeqIO.read(args.input_file, args.file_input_format)
            except ValueError as e:
                if "more than one record found in handle" not in str(e).lower():
                    # Unknown read failure: surface it instead of carrying
                    # on with no result handle.
                    raise
                # Several records: query with one identifier per line.
                records = SeqIO.parse(args.input_file, args.file_input_format)
                query = "".join(rec.id + "\n" for rec in records)
                result_handle = NCBIWWW.qblast(args.blast_app,
                                               args.blast_database, query)
            else:
                result_handle = NCBIWWW.qblast(args.blast_app,
                                               args.blast_database, record.seq)
            with open("blast.xml", 'w') as out_handle:
                out_handle.write(result_handle.getvalue())
        else:
            # The input already is saved BLAST XML output.
            result_handle = open(args.input_file)

        try:
            for blast_record in NCBIXML.parse(result_handle):
                for alignment in blast_record.alignments:
                    for hsp in alignment.hsps:
                        print("\nALIGNMENT\n")
                        print("Sequence: ", alignment.title)
                        print("Length: ", alignment.length)
                        print("e value: ", hsp.expect)
                        print(hsp.query[0:75] + "...")
                        print(hsp.match[0:75] + "...")
                        print(hsp.sbjct[0:75] + "...")
        finally:
            result_handle.close()
        _report_finished(started)

    else:
        # NOTE(review): seq1/seq2 are not defined in this function, so they
        # must exist at module level for this branch to work - presumably
        # they were meant to come from `seqs`; confirm with the caller.
        alignments = pairwise2.align.globalxx(seq1, seq2)
        for alignment in alignments:
            print(pairwise2.format_alignment(*alignment))