def test_simple_clustal_strict(self):
    """Run MUSCLE with strict Clustal output and compare with the input.

    The alignment is read from the child's stdout, sorted, and checked
    record by record against the (id-sorted) input FASTA records.
    """
    fasta_path = "Fasta/f002"
    self.assertTrue(os.path.isfile(fasta_path))
    expected = sorted(SeqIO.parse(fasta_path, "fasta"),
                      key=lambda rec: rec.id)

    # Build the command: input file plus the -clwstrict switch, which
    # yields Clustal output with a CLUSTAL header.  Switches default to
    # None, which is treated as False.
    cmdline = MuscleCommandline(muscle_exe)
    cmdline.set_parameter("in", fasta_path)
    cmdline.set_parameter("clwstrict", True)
    expected_cmd = _escape_filename(muscle_exe) + " -in Fasta/f002 -clwstrict"
    self.assertEqual(str(cmdline).rstrip(), expected_cmd)
    # The repr of the wrapper must evaluate back to an equivalent command.
    self.assertEqual(str(eval(repr(cmdline))), str(cmdline))

    use_shell = sys.platform != "win32"
    child = subprocess.Popen(str(cmdline),
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE,
                             universal_newlines=True,
                             shell=use_shell)
    alignment = AlignIO.read(child.stdout, "clustal")
    alignment.sort()
    # -quiet was not used, so progress reports are expected on stderr.
    progress = child.stderr.read().strip()
    self.assertTrue(progress.startswith("MUSCLE"))

    self.assertEqual(len(expected), len(alignment))
    for original, aligned in zip(expected, alignment):
        self.assertEqual(original.id, aligned.id)
        self.assertEqual(str(aligned.seq).replace("-", ""),
                         str(original.seq))

    self.assertEqual(child.wait(), 0)
    child.stdout.close()
    child.stderr.close()
    del child
def muscleProcess(threadID, filebase, outbase, treebase):
    """Align one FASTA file with MUSCLE and write its first-iteration tree.

    Each of ``filebase``, ``outbase`` and ``treebase`` is a %-format
    template that is filled in with ``threadID`` to obtain the input FASTA
    path, the alignment output path and the tree output path.

    The MUSCLE executable is taken from the module-level name ``muscle``.
    """
    fasta_path = filebase % threadID
    aligned_path = outbase % threadID
    tree_path = treebase % threadID

    print("Building NJ tree from %s" % fasta_path)

    # One iteration only, neighbour-joining clustering, tree saved to -tree1.
    command = MuscleCommandline(
        cmd=muscle,
        input=fasta_path,
        out=aligned_path,
        tree1=tree_path,
        cluster1="neighborjoining",
        maxiters=1,
    )
    # The (stdout, stderr) pair returned by the call is deliberately dropped.
    command()
def align_alleles(self):
    """Align the extracted alleles with MUSCLE.

    Reads the FASTA file at ``self.unaligned_alleles`` and writes the
    alignment to ``self.aligned_alleles``.
    """
    logging.info('Aligning extracted alleles with MUSCLE')
    muscle_command = MuscleCommandline(
        input=self.unaligned_alleles,
        out=self.aligned_alleles,
    )
    muscle_command()
def alignment():
    """Align ``file_ali_in`` with MUSCLE and print the Clustal alignment.

    Uses the module-level names ``muscle_loc`` (optional path to the
    MUSCLE executable, see http://www.drive5.com/muscle/), ``file_ali_in``
    and ``file_ali_out``.  The strict-Clustal alignment is written to
    ``file_ali_out`` and then echoed to stdout.
    """
    # Pass an explicit executable only when one is configured; otherwise
    # let MuscleCommandline fall back to its own default command.
    exe_args = (muscle_loc,) if muscle_loc else ()
    muscle_cline = MuscleCommandline(*exe_args,
                                     input=file_ali_in,
                                     out=file_ali_out,
                                     clwstrict=True)
    muscle_cline()

    # The context manager closes the file even if reading fails.
    with open(file_ali_out, "r") as handle:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(handle.read())
def muscle(path_to_seq,
           muscle_exe="/afs/andrew.cmu.edu/usr23/lleung/muscle/muscle",
           out_path="aligned.fasta"):
    """Align a FASTA file with MUSCLE and save the alignment to a file.

    Args:
        path_to_seq: Path of the FASTA file to align.
        muscle_exe: Path of the MUSCLE executable (defaults to the
            location that was previously hard-coded).
        out_path: File the alignment (MUSCLE's stdout) is written to;
            defaults to the previously hard-coded ``aligned.fasta``.
    """
    muscle_cline = MuscleCommandline(muscle_exe, input=path_to_seq)
    stdout, _stderr = muscle_cline()
    # The context manager closes the file even if the write fails.
    with open(out_path, "w") as handle:
        handle.write(stdout)
def precentage_identity_readP_anntP(bm_ids, out_prec_iden, orfs_reads, annt_prot):
    """Write percent-identity statistics for read/protein pairs.

    For every line of ``bm_ids`` the first field is taken as a read id and
    the second field (up to the first ``|``) as a transcript id.  When both
    are known, the two sequences are aligned with MUSCLE (strict Clustal
    output) and one tab-separated row is appended to
    ``out_prec_iden + ".txt"``.  Ids that cannot be resolved are appended to
    ``out_prec_iden + "_transcpNoFound.txt"`` or
    ``out_prec_iden + "_readsNoFound.txt"``.

    Args:
        bm_ids: Path of the whitespace-separated id mapping file.
        out_prec_iden: Prefix of the output files.
        orfs_reads: FASTA file indexed by read id.
        annt_prot: FASTA file of annotated proteins.
    """
    header = ("read", "txt", "per_identity", "len_alig", "match", "mismatch")
    with open(out_prec_iden + ".txt", "w") as pIdentity:
        pIdentity.write("\t".join(header) + "\n")
        indx_orfs_reads = SeqIO.index(orfs_reads, "fasta")
        indx_annt_protn = SeqIO.index(annt_prot, "fasta")
        txt_dic = get_txt_from_anntProtein(indx_annt_protn)
        in_both = 0
        no_read = 0
        no_txt = 0
        # Context manager so the mapping file handle is not leaked.
        with open(bm_ids) as mapping:
            for map_ids in mapping:
                df = map_ids.split()
                if len(df) < 2:
                    # Blank or malformed line: nothing to look up.
                    continue
                txt = str(df[1]).split("|")[0]
                read = str(df[0])
                has_read = read in indx_orfs_reads
                has_txt = txt in txt_dic
                if has_read and has_txt:
                    # Best effort per pair: a failing alignment is reported
                    # and the remaining pairs are still processed.
                    try:
                        in_both += 1
                        records = (indx_orfs_reads[read],
                                   indx_annt_protn[txt_dic[txt]])
                        handle = StringIO()
                        SeqIO.write(records, handle, "fasta")
                        muscle_cline = MuscleCommandline(clwstrict=True)
                        stdout, stderr = muscle_cline(stdin=handle.getvalue())
                        align = AlignIO.read(StringIO(stdout), "clustal")
                        target = str(align[0].seq)
                        query = str(align[1].seq)
                        # Count identical and differing aligned columns.
                        match = sum(1 for t, q in zip(target, query) if t == q)
                        mismatch = min(len(target), len(query)) - match
                        row = (read, txt,
                               str((match * 100 / len(target))),
                               str(len(target)), str(match), str(mismatch))
                        pIdentity.write("\t".join(row) + "\n")
                    except Exception as e:
                        print(e)
                elif has_read:
                    no_txt = no_txt + 1
                    with open(out_prec_iden + "_transcpNoFound.txt", "a+") as tnf:
                        tnf.write(str(txt) + "\n")
                elif has_txt:
                    no_read = no_read + 1
                    with open(out_prec_iden + "_readsNoFound.txt", "a+") as rnf:
                        rnf.write(str(read) + "\n")
        print("both: ", in_both, "read_no_found: ", no_read,
              "txt_no_found: ", no_txt)
def align_fasta(infname, outfname, debug=False):
    """
    Align a fasta file with MUSCLE, then trim the alignment with an R script.

    Args:
        infname (str): Path to fasta to be aligned.
        outfname (str): Path to output fasta to be written. The trim
            script receives this path as both of its arguments.
        debug (bool): When true, both child processes are started with
            ``subprocess.CREATE_NEW_CONSOLE``.

    Raises:
        KeyError: If ``PLATFORM`` is not Windows, Linux or Darwin.
        ChildProcessError: If the output file is empty after both steps.
        RScriptFailedError: If the R trim script exits with a non-zero code.
    """
    bundled_binaries = {
        "Windows": "niclassify/bin/muscle3.8.31_i86win32.exe",
        "Linux": "niclassify/bin/muscle3.8.31_i86linux64",
        "Darwin": "niclassify/bin/muscle3.8.31_i86darwin64"
    }
    muscle_exec = bundled_binaries[PLATFORM]
    muscle_path = os.path.realpath(os.path.join(MAIN_PATH, muscle_exec))

    alignment_call = MuscleCommandline(muscle_path,
                                       input=os.path.realpath(infname),
                                       out=os.path.realpath(outfname))
    command = str(alignment_call)
    print(command)

    # Extra Popen options shared by both child processes in debug mode.
    console_opts = (
        {"creationflags": subprocess.CREATE_NEW_CONSOLE} if debug else {}
    )

    subprocess.run(command, shell=True, **console_opts)

    r_script = os.path.realpath(
        os.path.join(MAIN_PATH, "niclassify/core/scripts/trim_alignment.R"))
    trim_call = [R_LOC, r_script, outfname, outfname]
    proc = subprocess.run(trim_call, env=os.environ.copy(), **console_opts)

    if os.stat(outfname).st_size == 0:
        raise ChildProcessError("Sequence Alignment Failed")
    if proc.returncode != 0:
        raise RScriptFailedError("R TrimAlignment failed")
def run_muscle(self, sequences_to_align, output_file_name, muscle_mode):
    """
    Run the local MUSCLE executable on the given sequences.

    ``muscle_mode`` selects the option preset and must be one of
    "highest_accuracy", "large_datasets" or "fastest"; any other value
    raises ``KeyError``.
    """
    # TODO: to insert the following options:
    #     - guide tree from:
    #         - none
    #         - first iteration
    #         - second iteration
    self.pymod.build_sequence_file(sequences_to_align, output_file_name,
                                   unique_indices_headers=True)

    alignments_dir = self.pymod.alignments_dirpath
    # Input FASTA for MUSCLE.
    infasta = os.path.join(alignments_dir, output_file_name + ".fasta")
    # Output FASTA from MUSCLE, in tree order.
    outfasta_tree = os.path.join(alignments_dir, output_file_name + ".out_fasta")
    # Output ALN.
    outaln = os.path.join(alignments_dir, output_file_name + ".aln")

    muscle_exec = self.tool["exe_file_path"].get_value()

    # Options that differ between the presets; the file arguments are shared.
    if muscle_mode == "highest_accuracy":
        preset = {}
    elif muscle_mode == "large_datasets":
        preset = {"maxiters": 2}
    elif muscle_mode == "fastest":
        preset = {"maxiters": 1, "diags": True, "sv": True,
                  "distance1": "kbit20_3"}
    else:
        raise KeyError(muscle_mode)

    cline = MuscleCommandline(muscle_exec, input=infasta, out=outfasta_tree,
                              clwout=outaln, **preset)
    self.pymod.execute_subprocess(str(cline))
def muscle_align(fasta_in, outname):
    """Align ``fasta_in`` with MUSCLE and return the parsed alignment.

    MUSCLE writes its alignment to ``outname``; that file is then read
    back as a FASTA alignment and returned.
    """
    muscle_command = MuscleCommandline(input=fasta_in, out=outname)
    muscle_command()
    return AlignIO.read(outname, 'fasta')
def multiple_sequence_alignment(
    records,
    output_fn="/var/www/html/dl/alignment.fasta",
    format="clustal",
    id_prefix="",
    index=None,
):
    """Align sequences with MUSCLE and return the alignment as a DataFrame.

    Then go to https://www.ncbi.nlm.nih.gov/projects/msaviewer/
    https://soerendip.com/dl/alignment.fasta

    Args:
        records: Sequence records, or plain sequence strings (detected from
            the first element) which are wrapped in ``SeqRecord`` objects.
        output_fn: Path the FASTA alignment is written to; ``None`` selects
            a file in the system temp directory.
        format: Accepted for compatibility; not used by this function.
        id_prefix: Prefix of the generated ids when ``records`` holds
            strings and ``index`` is ``None``.
        index: Optional ids to pair with string ``records``.

    Returns:
        A ``pandas.DataFrame`` with one row per aligned sequence (row label
        is the sequence id) and one column per alignment position, sorted
        by row label.
    """
    if isinstance(records[0], str):
        if index is None:
            records = [
                SeqRecord(Seq(r), id=f"{id_prefix}-{i:03.0f}")
                for i, r in enumerate(records)
            ]
        else:
            records = [
                SeqRecord(Seq(r), id=f"{_id}") for r, _id in zip(records, index)
            ]

    path = tempfile.gettempdir()
    job_id = "msa-" + str(uuid())
    tmp_inputs_fn = os.path.join(path, job_id + ".faa")
    if output_fn is None:
        output_fn = os.path.join(path, job_id + ".fasta")
    tmp_log = os.path.join(path, job_id + ".log")

    try:
        SeqIO.write(records, tmp_inputs_fn, "fasta")
        msa = MuscleCommandline(input=tmp_inputs_fn,
                                out=output_fn,
                                diags=True,
                                maxiters=1,
                                log=tmp_log)
        msa()
    finally:
        # The temp input is only needed while MUSCLE runs; the log file is
        # kept for diagnostics.
        if os.path.exists(tmp_inputs_fn):
            os.remove(tmp_inputs_fn)

    with open(output_fn, "r") as file:
        align = AlignIO.read(file, "fasta")

    # Parse "<id> <sequence>" rows out of the Stockholm rendering, skipping
    # blank lines, "#" annotation lines and the "//" terminator.
    result = []
    row_ids = []
    for line in align.format("stockholm").split("\n"):
        if not line or line.startswith("//") or line.startswith("#"):
            continue
        fields = line.split(" ")
        row_ids.append(fields[0])
        result.append(list(fields[1]))
    return pd.DataFrame(np.array(result), index=row_ids).sort_index()
def generaAln(self):
    """Close ``self.file`` and align ``alinear.fasta`` with MUSCLE.

    The alignment is written in Clustal format (``-clw``) to
    ``arbol.aln``.  The command is run through the shell.
    """
    self.file.close()
    exe_path = r"C:\Users\Gerson\Downloads\muscle.exe"
    muscle_command = MuscleCommandline(
        exe_path,
        input="alinear.fasta",
        out="arbol.aln",
        clw=True,
    )
    subprocess.call(str(muscle_command), shell=True)
def align_with_muscle(input_fasta):
    """Align ``input_fasta`` with the bundled MUSCLE binary.

    Returns the alignment parsed from MUSCLE's standard output, which is
    read as FASTA.
    """
    exe_path = Path("../bin/muscle3.8.31_i86linux64")
    run_muscle = MuscleCommandline(exe_path, input=input_fasta)
    # The call returns (stdout, stderr): the alignment text and whatever
    # MUSCLE wrote to standard error.
    alignment_text, _messages = run_muscle()
    # StringIO lets AlignIO treat the captured text as a file.
    return AlignIO.read(StringIO(alignment_text), "fasta")
def muscle_largeinput(file):
    """Align ``file`` with MUSCLE, streaming the output, and print it.

    The alignment is parsed directly from the child's stdout pipe rather
    than being collected into one string first.
    """
    muscle_cline = MuscleCommandline(input=file)
    # The command is a single string, so it needs a shell everywhere except
    # Windows.  The previous test against "linux2" never matches on
    # Python 3 and disabled the shell on Python 2 Linux.
    child = subprocess.Popen(str(muscle_cline),
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE,
                             universal_newlines=True,
                             shell=(sys.platform != "win32"))
    try:
        muscle_align = AlignIO.read(child.stdout, "fasta")
    finally:
        # Release the pipes and reap the child even if parsing fails.
        child.stdout.close()
        child.stderr.close()
        child.wait()
    print(muscle_align)
def quickAlign(refseq, testseq, maxiters=None, diags=None, gapopen=None):
    """Align two sequences with MUSCLE and return them keyed by id.

    Args:
        refseq: Reference sequence, a string or an object with a ``seq``
            attribute (e.g. a SeqRecord).  Existing gaps are removed.
        testseq: Test sequence, same accepted types as ``refseq``.
        maxiters: Optional value for MUSCLE's ``-maxiters``.
        diags: Optional value for MUSCLE's ``-diags`` switch.
        gapopen: Optional value for MUSCLE's ``-gapopen``.

    Returns:
        dict mapping ``"ref"`` and ``"test"`` to the aligned sequences.

    Exits the interpreter (``sys.exit``) if an input is neither a string
    nor an object with a ``seq`` attribute.  The MUSCLE executable is taken
    from the module-level name ``muscle``.
    """
    refseq = _quick_align_ungap(refseq)
    testseq = _quick_align_ungap(testseq)

    data = ">ref\n%s\n>test\n%s\n" % (refseq, testseq)

    muscle_cline = MuscleCommandline(cmd=muscle, quiet=True)
    if maxiters is not None:
        muscle_cline.maxiters = maxiters
    if diags is not None:
        # Was ``diag`` (undefined name), which raised NameError.
        muscle_cline.diags = diags
    if gapopen is not None:
        muscle_cline.gapopen = gapopen

    stdout, stderr = muscle_cline(stdin=data)

    return {p.id: str(p.seq) for p in SeqIO.parse(StringIO(stdout), "fasta")}


def _quick_align_ungap(seq):
    """Return ``seq`` as a gap-free string, exiting on unsupported types."""
    try:
        return re.sub("-", "", seq)
    except TypeError:
        # Not a string, probably a SeqRecord.
        pass
    try:
        return re.sub("-", "", str(seq.seq))
    except AttributeError:
        sys.exit(
            "quickAlign() requires inputs to be either strings or SeqRecord objects"
        )
def test_with_multiple_output_formats(self):
    """Run MUSCLE once and check its stdout, HTML and strict-Clustal outputs."""
    fasta_path = "Fasta/f002"
    html_path = "temp_f002.html"
    clw_path = "temp_f002.clw"
    self.assertTrue(os.path.isfile(fasta_path))
    expected = sorted(SeqIO.parse(fasta_path, "fasta"),
                      key=lambda rec: rec.id)

    # Clustal output (with a MUSCLE header) goes to stdout; the HTML and
    # strict Clustal variants go to their own files.
    cmdline = MuscleCommandline(
        muscle_exe,
        input=fasta_path,
        clw=True,
        htmlout=html_path,
        clwstrictout=clw_path,
    )
    expected_cmd = (
        _escape_filename(muscle_exe)
        + " -in Fasta/f002 -clw -htmlout temp_f002.html"
        + " -clwstrictout temp_f002.clw"
    )
    self.assertEqual(str(cmdline).rstrip(), expected_cmd)
    self.assertEqual(str(eval(repr(cmdline))), str(cmdline))

    child = subprocess.Popen(
        str(cmdline),
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        universal_newlines=True,
        shell=(sys.platform != "win32"),
    )
    # Clustalw on stdout:
    alignment = AlignIO.read(child.stdout, "clustal")
    alignment.sort()
    # -quiet was not used, so progress reports are expected on stderr.
    progress = child.stderr.read().strip()
    self.assertTrue(progress.startswith("MUSCLE"))
    self.assertEqual(child.wait(), 0)
    self.assertEqual(len(expected), len(alignment))
    for original, aligned in zip(expected, alignment):
        self.assertEqual(original.id, aligned.id)
    child.stdout.close()
    child.stderr.close()
    del child

    # HTML output: only the outer tags are checked.
    with open(html_path) as handle:
        html = handle.read().strip().upper()
    self.assertTrue(html.startswith("<HTML"))
    self.assertTrue(html.endswith("</HTML>"))

    # ClustalW strict:
    alignment = AlignIO.read(clw_path, "clustal")
    alignment.sort()
    self.assertEqual(len(expected), len(alignment))
    for original, aligned in zip(expected, alignment):
        self.assertEqual(original.id, aligned.id)

    os.remove(html_path)
    os.remove(clw_path)
def find_sqce_consensus(list_of_sequences,
                        sqce_type=Constants.SEQUENCE_TYPE_DNA,
                        threshold=Constants.DEFAULT_SQCE_CONSENSUS_AMBIG_THRESHOLD,
                        fasta_end_name=''):
    """Return the gap-aware consensus of several sequences, aligned with MUSCLE.

    Args:
        list_of_sequences: Iterable of sequence strings.
        sqce_type: ``Constants.SEQUENCE_TYPE_DNA`` or
            ``Constants.SEQUENCE_TYPE_PROT``; selects the alphabet and the
            ambiguous symbol used in the consensus.
        threshold: Threshold passed to ``gap_consensus``.
        fasta_end_name: Suffix added to the temporary file names.

    Returns:
        The consensus as a string.

    Raises:
        DenCellORFException: If ``sqce_type`` is not one of the two
            supported sequence types.
    """
    if sqce_type == Constants.SEQUENCE_TYPE_DNA:
        alphabet = generic_dna
        ambiguous = Constants.SEQUENCE_AMBIGUOUS_DNA_BASE
    elif sqce_type == Constants.SEQUENCE_TYPE_PROT:
        alphabet = generic_protein
        ambiguous = Constants.SEQUENCE_AMBIGUOUS_PROT_AA
    else:
        raise DenCellORFException(
            'MergeStrategy.find_sqce_consensus(): The type of sequence provided'
            + ' has to be ' + Constants.SEQUENCE_TYPE_DNA + ' or '
            + Constants.SEQUENCE_TYPE_PROT + ' (provided type: '
            + str(sqce_type) + ').')

    # Store the input sequences in a fasta file in order to run Muscle
    input_sequences = (SeqRecord(Seq(s, alphabet)) for s in list_of_sequences)

    tmp_dir = DefaultTemporaryFolder.TEMPORARY_FOLDER
    if not os.path.exists(tmp_dir):
        os.makedirs(tmp_dir)

    input_sequences_file = os.path.join(
        tmp_dir, 'input_sequences' + fasta_end_name + '.fasta')
    aligned_sequences_file = os.path.join(
        tmp_dir, 'aligned_sequences' + fasta_end_name + '.fasta')

    try:
        SeqIO.write(input_sequences, input_sequences_file, 'fasta')

        # Perform the multiple sequences alignment and
        # store the output in a fasta file
        muscle_cline = MuscleCommandline(cmd='/bin/muscle',
                                         input=input_sequences_file,
                                         out=aligned_sequences_file)
        muscle_cline()

        # Read the fasta file containing aligned sequences
        align = AlignIO.read(aligned_sequences_file, 'fasta')
        summary_align = AlignInfo.SummaryInfo(align)

        # Compute the consensus
        consensus = summary_align.gap_consensus(threshold=threshold,
                                                ambiguous=ambiguous)
    finally:
        # Remove the temporary fasta files, also when a step above failed.
        for tmp_file in (input_sequences_file, aligned_sequences_file):
            if os.path.exists(tmp_file):
                os.remove(tmp_file)

    return str(consensus)
def muscle_alignment(seqs):
    """Align sequences with MUSCLE via temporary files in the working dir.

    The records are written to ``temp.faa``, aligned into ``temp.txt`` and
    the FASTA alignment read from that file is returned.
    """
    from Bio.Align.Applications import MuscleCommandline

    fasta_path = 'temp.faa'
    aligned_path = os.path.splitext(fasta_path)[0] + '.txt'

    SeqIO.write(seqs, fasta_path, "fasta")
    run_muscle = MuscleCommandline(input=fasta_path, out=aligned_path)
    run_muscle()
    return AlignIO.read(aligned_path, 'fasta')
def _perform_ma(self, data):
    """Align ``data`` (passed to MUSCLE on stdin) and return MUSCLE's stdout.

    ``maxiters`` is fixed at 7; ``diags`` is enabled only when
    ``self.diags`` is exactly ``True`` and ``maxhours`` is passed only when
    ``self.maxhours`` is not ``None``.
    """
    options = {'maxiters': 7}
    if self.diags is True:
        options['diags'] = True
    if self.maxhours is not None:
        options['maxhours'] = self.maxhours

    aligned_text, _messages = MuscleCommandline(**options)(stdin=data)
    return aligned_text
def align_ks_domains(reference_alignment, ks_names, ks_seqs, data_dir):
    """Align query KS domain sequences to the reference KS alignment.

    The query sequences are first aligned to each other with MUSCLE and
    the result is then profile-aligned against ``reference_alignment``.
    All files are created in the current working directory.

    Args:
        reference_alignment: Path of the reference alignment.
        ks_names: Header lines of the query sequences.
        ks_seqs: Query sequences, parallel to ``ks_names``.
        data_dir: Not used by this function.

    Returns:
        Path of the reformatted combined alignment (``aligned.fasta``).

    Exits with status 1 if either MUSCLE run reports a non-zero exit code.
    """
    # Set file names and write query domains to temp input file
    work_dir = os.getcwd()
    in_temp = os.path.join(work_dir, "in_seq.fasta")
    in_temp_aligned = os.path.join(work_dir, "in_seq_aligned.fasta")
    out_temp = os.path.join(work_dir, "out_seq.fasta")
    alignment_file = os.path.join(work_dir, "aligned.fasta")

    try:
        with open(in_temp, "w") as tmp_input:
            # NOTE(review): names are written verbatim, so they presumably
            # already carry the ">" FASTA prefix - confirm against caller.
            for name, seq in zip(ks_names, ks_seqs):
                tmp_input.write("%s\n%s\n" % (name, seq))

        # Generate alignment of query sequences
        muscle_cmd = str(MuscleCommandline(input=in_temp, out=in_temp_aligned))
        out, err, retcode = utils.execute(muscle_cmd.split(" "))
        # Any non-zero exit code is a failure, not only 1.
        if retcode != 0:
            logging.error(
                "Alignment of query KS sequences with Muscle failed. Check if Muscle is installed appropriately."
            )
            sys.exit(1)

        # Align the query alignment to the reference alignment using
        # muscle --profile
        muscle_cmd = str(
            MuscleCommandline(profile=True,
                              in1=reference_alignment,
                              in2=in_temp_aligned,
                              out=out_temp))
        out, err, retcode = utils.execute(muscle_cmd.split(" "))
        if retcode != 0:
            logging.error(
                "Alignment of query+reference KS sequences with Muscle failed. Check if Muscle is installed appropriately."
            )
            sys.exit(1)

        with open(out_temp, 'r') as combined:
            f_temp_input = combined.read()
        reformat(input=f_temp_input, out_filename=alignment_file)
    finally:
        # Remove temporary files, including the intermediate query
        # alignment, whether or not the steps above succeeded.
        for f in (in_temp, in_temp_aligned, out_temp):
            if os.path.exists(f):
                os.remove(f)
    return alignment_file
def muscle_align(input, output):
    """Align ``input`` with MUSCLE into ``output``, reporting any failure.

    Failures are not propagated: a message naming the input file is
    printed instead and the function returns ``None``.
    """
    in_file = r'{0}'.format(input)
    out_file = r'{0}'.format(output)
    try:
        muscle_cline = MuscleCommandline(input=in_file, out=out_file)
        muscle_cline()
    except Exception:
        # The handler used to reference ``query``, a name this function
        # never defines; the file that could not be aligned is ``in_file``.
        print('Imposible alinear el archivo ' + in_file + ':'
              '¿Ha comprobado sus valores de coverage e identity?')
def build_profile_hmm_for_repeats(repeats, error_rate):
    """Align the repeats with MUSCLE and build a profile HMM from them.

    The repeats are sent to MUSCLE on stdin as FASTA entries named by their
    position; the strict-Clustal alignment is parsed and its rows are
    handed, together with ``error_rate``, to
    ``build_profile_hmm_pseudocounts_for_alignment``.
    """
    muscle_cline = MuscleCommandline('muscle', clwstrict=True)

    fasta_entries = []
    for position, repeat in enumerate(repeats):
        fasta_entries.append('>%s\n' % str(position) + repeat)
    fasta_text = '\n'.join(fasta_entries)

    clustal_text, _messages = muscle_cline(stdin=fasta_text)
    alignment = AlignIO.read(StringIO(clustal_text), "clustal")
    aligned_repeats = []
    for row in alignment:
        aligned_repeats.append(str(row.seq))
    return build_profile_hmm_pseudocounts_for_alignment(
        error_rate, aligned_repeats)
def codon_align(self, alignment_tool="mafft", prune=True, verbose=0):
    '''
    takes a nucleotide alignment, translates it, aligns the amino acids, pads the gaps
    note that this suppresses any compensated frameshift mutations

    Parameters:
    - alignment_tool: ['mafft', 'muscle'] the commandline tool to use
    - prune: when true, sequences with a premature stop codon are discarded
    - verbose: when true, report each discarded sequence

    The work happens inside self.run_dir; the previous working directory is
    always restored and the run directory removed afterwards.
    '''
    from Bio import AlignIO, SeqIO
    from Bio.SeqRecord import SeqRecord

    make_dir(self.run_dir)
    start_dir = os.getcwd()
    os.chdir(self.run_dir)
    try:
        # translate
        aa_seqs = {}
        bad_seq = 0
        for seq in self.seqs.values():
            tempseq = seq.seq.translate()
            # use only sequences that translate without trouble
            if '*' not in str(tempseq)[:-1] or prune == False:
                aa_seqs[seq.id] = SeqRecord(tempseq, id=seq.id)
                aa_seqs[seq.id].attributes = seq.attributes
            else:
                if verbose:
                    print(seq.id, "has premature stops, discarding")
                # This branch is only reached when a premature stop exists.
                bad_seq += 1
        print('Number of sequences with stops:', bad_seq,
              'out of total', len(self.seqs))

        tmpfname = 'temp_in.fasta'
        SeqIO.write(aa_seqs.values(), tmpfname, 'fasta')

        if alignment_tool == 'muscle':
            from Bio.Align.Applications import MuscleCommandline
            aligned_fname = tmpfname[:-5] + 'aligned.fasta'
            cline = MuscleCommandline(input=tmpfname, out=aligned_fname)
            cline()
            aln_aa = AlignIO.read(aligned_fname, "fasta")
        elif alignment_tool == 'mafft':
            from Bio.Align.Applications import MafftCommandline
            try:
                from StringIO import StringIO  # Python 2
            except ImportError:
                from io import StringIO  # Python 3
            mafft_cline = MafftCommandline(input=tmpfname)
            stdout, stderr = mafft_cline()
            aln_aa = AlignIO.read(StringIO(stdout), "fasta")
        else:
            print('Alignment tool not supported:', alignment_tool)
            return

        # generate nucleotide alignment
        self.aln = pad_nucleotide_sequences(aln_aa, self.seqs)
        self.sequence_lookup = {seq.id: seq for seq in self.aln}
        # add attributes to alignment
        for seq in self.seqs.values():
            if seq.id in self.sequence_lookup:
                self.sequence_lookup[seq.id].attributes = seq.attributes
    finally:
        # Restore the caller's working directory (not just '..', which is
        # wrong for nested run dirs) and clean up, also on early return or
        # failure.
        os.chdir(start_dir)
        remove_dir(self.run_dir)
def test_Muscle_simple(self):
    """Round-trip through the wrapper with only an input and output file."""
    cmdline = MuscleCommandline(muscle_exe,
                                input=self.infile1,
                                out=self.outfile1)
    expected_cmd = (_escape_filename(muscle_exe)
                    + ' -in Fasta/f002 -out "Fasta/temp align out1.fa"')
    self.assertEqual(str(cmdline), expected_cmd)
    self.assertEqual(str(eval(repr(cmdline))), str(cmdline))

    # With -out given nothing should arrive on stdout.
    stdout_text, stderr_text = cmdline()
    self.assertEqual(stdout_text, "")
    self.assertTrue("ERROR" not in stderr_text)
def pool_write_microalignment(mblocknum, targetdata, extendedsourcedata,
                              nbinitialsource, all_ids, msamethod):
    """Align the segments of one block and return them keyed by sequence id.

    Args:
        mblocknum: Pair ``(block number, block)`` where ``block`` maps a
            sequence id to its ``(start, end)`` coordinates.
        targetdata: Iterable of ``(gene id, gene sequence)`` pairs.
        extendedsourcedata: Sequence of 4-item CDS entries whose first two
            items are the CDS id and sequence.
        nbinitialsource: Number of leading ``extendedsourcedata`` entries
            to consider.
        all_ids: Ids that must appear in the result; ids without an aligned
            segment get an all-gap string of the alignment length.
        msamethod: ``"muscle"``; any other value selects MAFFT.

    Returns:
        dict mapping each id to its aligned segment.

    Temporary FASTA files named after the block number are created in the
    working directory and removed before returning.
    """
    aln = {}
    i = mblocknum[0]
    mblock = mblocknum[1]
    input_muscle_file = "input_muscle.fasta" + str(i)
    output_muscle_file = "output_muscle.fasta" + str(i)

    nbseq = 0
    # Context manager so the handle is closed even if a write fails.
    with open(input_muscle_file, "w") as input_muscle:
        for geneid, geneseq in targetdata:
            # Only non-empty segments are written.
            if geneid in mblock and mblock[geneid][1] > mblock[geneid][0]:
                start, end = mblock[geneid][0], mblock[geneid][1]
                input_muscle.write(">" + geneid + "\n" + geneseq[start:end] + "\n")
                nbseq += 1

        for j in range(nbinitialsource):
            cdsid, cdsseq, _cdsgeneid, _unused = extendedsourcedata[j]
            if cdsid in mblock and mblock[cdsid][1] > mblock[cdsid][0]:
                start, end = mblock[cdsid][0], mblock[cdsid][1]
                input_muscle.write(">" + cdsid + "\n" + cdsseq[start:end] + "\n")
                nbseq += 1

    msa = []
    if nbseq > 0:
        if msamethod == "muscle":
            # MUSCLE writes the alignment file itself.
            muscle_cline = MuscleCommandline(input=input_muscle_file,
                                             out=output_muscle_file,
                                             gapopen=-800.0)
            muscle_cline()
        else:  # msamethod == "mafft"
            # MAFFT prints the alignment, so it is saved by hand.
            mafft_cline = MafftCommandline(input=input_muscle_file)
            stdout, _stderr = mafft_cline()
            with open(output_muscle_file, "w") as handle:
                handle.write(stdout)
        msa = AlignIO.read(output_muscle_file, "fasta")
    else:
        # Create an empty output file so the cleanup below always succeeds.
        open(output_muscle_file, "w").close()

    length = 0
    for record in msa:
        aln[record.id] = record.seq
        length = len(record.seq)

    # Pad every id that has no aligned segment with gaps.
    for seq_id in all_ids:
        if seq_id not in aln:
            aln[seq_id] = '-' * length

    os.remove(input_muscle_file)
    os.remove(output_muscle_file)

    return aln
def align(self):
    """Align ``self.pair_pep_file`` with the configured aligner.

    With ``self.align_software == 'mafft'`` the alignment is taken from
    MAFFT's stdout and written as FASTA to ``self.prot_align_file``.  With
    ``'muscle'`` MUSCLE writes strict Clustal output to that file itself.
    Any other value does nothing.
    """
    if self.align_software == 'mafft':
        run_mafft = MafftCommandline(cmd=self.mafft_path,
                                     input=self.pair_pep_file,
                                     auto=True)
        fasta_text, _messages = run_mafft()
        parsed = AlignIO.read(StringIO(fasta_text), "fasta")
        AlignIO.write(parsed, self.prot_align_file, "fasta")

    if self.align_software == 'muscle':
        run_muscle = MuscleCommandline(cmd=self.muscle_path,
                                       input=self.pair_pep_file,
                                       out=self.prot_align_file,
                                       seqtype="protein",
                                       clwstrict=True)
        run_muscle()
def multialign_genomic_templates(fastafile):
    """Uses MUSCLE to return the multialigned genomic data.

    ``fastafile`` is the FASTA file to align; the alignment is parsed from
    MUSCLE's standard output and returned.
    """
    from Bio.Align.Applications import MuscleCommandline
    from Bio import AlignIO
    try:
        from StringIO import StringIO  # Python 2
    except ImportError:
        from io import StringIO  # Python 3: the StringIO module is gone

    muscle_cline = MuscleCommandline(input=fastafile)
    stdout, _stderr = muscle_cline()
    return AlignIO.read(StringIO(stdout), "fasta")
def runMuscle(filePath):
    """Align ``filePath`` with MUSCLE and return the Clustal alignment.

    The alignment is written next to the input, with the input's extension
    replaced by ``.muscle.aln``, and then read back.
    """
    base_name, _extension = os.path.splitext(filePath)
    alnFilePath = base_name + ".muscle.aln"

    print("[INFO] Running muscle on {}".format(filePath))
    run_muscle = MuscleCommandline(input=filePath, out=alnFilePath, clw=True)
    run_muscle()

    print("[INFO] Creating alignment from {}".format(alnFilePath))
    return AlignIO.read(alnFilePath, "clustal")
def muscleAlignment(seqs, muscle_exe="muscle"):
    '''
    Align sequences with MUSCLE entirely in memory.

    seqs: records writable by SeqIO; they are sent to MUSCLE as FASTA on
          stdin.
    muscle_exe: MUSCLE executable to run.

    Returns the aligned records, parsed from MUSCLE's FASTA output, as a
    list.
    '''
    buffer = io.StringIO()
    SeqIO.write(seqs, buffer, 'fasta')
    run_muscle = MuscleCommandline(muscle_exe)
    aligned_text, _messages = run_muscle(stdin=buffer.getvalue())
    aligned_records = SeqIO.parse(io.StringIO(aligned_text), 'fasta')
    return list(aligned_records)
def muscleAlign(seq_records):
    """Align records by piping them through a MUSCLE child process.

    The records are written as FASTA to the child's stdin and the FASTA
    alignment is parsed from its stdout; MUSCLE's stderr is discarded.
    """
    muscle_cmd_line = MuscleCommandline()
    child_process = subprocess.Popen(str(muscle_cmd_line),
                                     stdin=subprocess.PIPE,
                                     stdout=subprocess.PIPE,
                                     stderr=subprocess.DEVNULL,
                                     universal_newlines=True)
    # The context manager closes the pipes and waits for the child, so no
    # zombie process or open pipe is left behind, even if parsing fails.
    with child_process:
        SeqIO.write(seq_records, child_process.stdin, 'fasta')
        # Closing stdin signals end of input so MUSCLE starts aligning.
        child_process.stdin.close()
        return AlignIO.read(child_process.stdout, 'fasta')
def muscleAlign(a, b):
    """Concatenate ``a.fasta`` and ``b.fasta`` and align them with MUSCLE.

    The combined input is written to ``<a>-<b>.fasta`` and the alignment to
    ``<a>-<b>_aligned.fasta``.  The command line is printed before it is
    run through ``os.system``.
    """
    combined_path = a + "-" + b + ".fasta"
    aligned_path = a + '-' + b + '_aligned.fasta'

    with open(combined_path, 'w') as outfile:
        for fname in (a + ".fasta", b + ".fasta"):
            with open(fname) as infile:
                for line in infile:
                    outfile.write(line)

    cline = MuscleCommandline(input=combined_path, out=aligned_path)
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print(cline)
    os.system(str(cline))
def generated_paired_alignment():
    """Align the hard-coded interacting-pairs FASTA file and print the result.

    The alignment is parsed as FASTA from MUSCLE's standard output.
    """
    from Bio.Align.Applications import MuscleCommandline
    from Bio import AlignIO
    try:
        from StringIO import StringIO  # Python 2
    except ImportError:
        from io import StringIO  # Python 3: the StringIO module is gone

    # Single-argument print(...) behaves the same on Python 2 and 3.
    print("Generating paired alignments in ClustalW format using MUSCLE")
    muscle_cline = MuscleCommandline(
        input=
        "F:\\KINEV\\fasta_files\\1FFW_A_1FFW_B_P0AE67_P07363.sffamily_interactingpairs.fasta"
    )
    stdout, _stderr = muscle_cline()
    align = AlignIO.read(StringIO(stdout), "fasta")
    print(align)
def quickAlign(refseq, testseq, maxiters=None, diags=None, gapopen=None):
    """Align two sequences with MUSCLE and return them keyed by id.

    Both inputs are converted with ``str`` and stripped of existing gaps
    before alignment.  ``maxiters``, ``diags`` and ``gapopen`` are passed
    on to MUSCLE only when they are not ``None``.

    Returns a dict mapping ``"ref"`` and ``"test"`` to the aligned
    sequences.  The MUSCLE executable is taken from the module-level name
    ``muscle``.
    """
    # sanity check: drop any gaps already present
    refseq = re.sub("-", "", str(refseq))
    testseq = re.sub("-", "", str(testseq))

    data = ">ref\n%s\n>test\n%s\n" % (refseq, testseq)

    muscle_cline = MuscleCommandline(cmd=muscle, quiet=True)
    if maxiters is not None:
        muscle_cline.maxiters = maxiters
    if diags is not None:
        # Was ``diag`` (undefined name), which raised NameError.
        muscle_cline.diags = diags
    if gapopen is not None:
        muscle_cline.gapopen = gapopen

    stdout, stderr = muscle_cline(stdin=data)

    return {p.id: str(p.seq) for p in SeqIO.parse(StringIO(stdout), "fasta")}
def align_muscle(infile_name, outfile_name, log_file):
    """Make external call to Muscle aligner.

    Args:
        infile_name: FASTA file to align.
        outfile_name: File the Clustal-format alignment is written to.
        log_file: File MUSCLE appends its log to (``-loga``).

    Returns:
        dict with the child's captured ``'output'`` (stdout) and
        ``'error'`` (stderr).
    """
    cline = MuscleCommandline(input=infile_name, out=outfile_name, clw=True,
                              loga=log_file, quiet='y')
    # stderr must be piped as well; otherwise communicate() always returns
    # None for it and report['error'] carries no information.
    child = subprocess.Popen(str(cline),
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE,
                             shell=True)
    output, error = child.communicate()
    report = {'output': output, 'error': error}
    # TODO: should set up something to parse MUSCLE errors
    return report
def allign_fasta(filename="filename", extension_in=".fasta", extension_out=".aln"):
    """
    Align a multi-record FASTA file with MUSCLE.

    This function requires MUSCLE from http://www.drive5.com/muscle.
    It reads ``filename + extension_in`` and saves the alignment to
    ``filename + extension_out``.

    @param filename: Base name (without extension) of the FASTA file to align.
    @param extension_in: Extension of the FASTA file, e.g. ".fasta" or ".fa".
    @param extension_out: Extension of the alignment file, e.g. ".aln".
    @return: False when ``filename`` is None or the input file does not
             exist, True once the MUSCLE command has been issued.
    """
    if filename is None:
        return False
    if not os.path.exists(filename + extension_in):
        return False

    # Imported only once there is something to align, so the guards above
    # work even where Biopython is not installed.
    from Bio.Align.Applications import MuscleCommandline

    cline = MuscleCommandline(input=filename + extension_in,
                              out=filename + extension_out)
    os.system(str(cline))
    return True
def test_Muscle_profile_simple(self):
    """Round-trip through the wrapper for a profile-profile alignment."""
    cmdline = MuscleCommandline(muscle_exe)
    for option, value in (("out", self.outfile3),
                          ("profile", True),
                          ("in1", self.infile2),
                          ("in2", self.infile3)):
        cmdline.set_parameter(option, value)

    expected_cmd = (_escape_filename(muscle_exe)
                    + " -out Fasta/temp_align_out3.fa"
                    + " -profile -in1 Fasta/fa01 -in2 Fasta/f001")
    self.assertEqual(str(cmdline), expected_cmd)
    self.assertEqual(str(eval(repr(cmdline))), str(cmdline))

    # The alignment goes to the -out file, so stdout stays empty while
    # stderr carries the MUSCLE banner.
    stdout_text, stderr_text = cmdline()
    self.assertEqual(stdout_text, "")
    self.assertTrue("ERROR" not in stderr_text)
    self.assertTrue(stderr_text.strip().startswith("MUSCLE"), stdout_text)
def test_Muscle_profile_simple(self):
    """Round-trip a profile alignment through the wrapper via generic_run."""
    cmdline = MuscleCommandline(muscle_exe)
    for option, value in (("out", self.outfile3),
                          ("profile", True),
                          ("in1", self.infile2),
                          ("in2", self.infile3)):
        cmdline.set_parameter(option, value)

    expected_cmd = (muscle_exe
                    + " -out Fasta/temp_align_out3.fa"
                    + " -profile -in1 Fasta/fa01 -in2 Fasta/f001")
    self.assertEqual(str(cmdline), expected_cmd)
    self.assertEqual(str(eval(repr(cmdline))), str(cmdline))

    # generic_run returns a result object followed by the two output handles.
    result, out_handle, err_handle = generic_run(cmdline)
    self.assertEqual(result.return_code, 0)
    self.assertEqual(out_handle.read(), "")
    self.assertTrue("ERROR" not in err_handle.read())
    self.assertEqual(str(result._cl), str(cmdline))
def test_Muscle_with_options(self):
    """Round-trip through the wrapper with a switch and a valued option."""
    cmdline = MuscleCommandline(muscle_exe)
    # "input" is an alias for "in".
    cmdline.set_parameter("input", self.infile1)
    cmdline.set_parameter("out", self.outfile2)
    # Options can also be set through properties:
    cmdline.objscore = "sp"
    cmdline.noanchors = True

    expected_cmd = (_escape_filename(muscle_exe)
                    + " -in Fasta/f002"
                    + " -out Fasta/temp_align_out2.fa"
                    + " -objscore sp -noanchors")
    self.assertEqual(str(cmdline), expected_cmd)
    self.assertEqual(str(eval(repr(cmdline))), str(cmdline))

    # The alignment goes to the -out file, so stdout stays empty while
    # stderr carries the MUSCLE banner.
    stdout_text, stderr_text = cmdline()
    self.assertEqual(stdout_text, "")
    self.assertTrue("ERROR" not in stderr_text)
    self.assertTrue(stderr_text.strip().startswith("MUSCLE"), stdout_text)
def test_Muscle_with_options(self):
    """Round-trip a switch and a valued option through generic_run."""
    cmdline = MuscleCommandline(muscle_exe)
    # "input" is an alias for "in".
    cmdline.set_parameter("input", self.infile1)
    cmdline.set_parameter("out", self.outfile2)
    # Options can also be set through properties:
    cmdline.objscore = "sp"
    cmdline.noanchors = True

    expected_cmd = (muscle_exe
                    + " -in Fasta/f002"
                    + " -out Fasta/temp_align_out2.fa"
                    + " -objscore sp -noanchors")
    self.assertEqual(str(cmdline), expected_cmd)
    self.assertEqual(str(eval(repr(cmdline))), str(cmdline))

    # generic_run returns a result object followed by the two output handles.
    result, out_handle, err_handle = generic_run(cmdline)
    self.assertEqual(result.return_code, 0)
    self.assertEqual(out_handle.read(), "")
    self.assertTrue("ERROR" not in err_handle.read())
    self.assertEqual(str(result._cl), str(cmdline))
def test_Muscle_profile_simple(self):
    """Round-trip a profile alignment through the wrapper via subprocess."""
    cmdline = MuscleCommandline(muscle_exe)
    for option, value in (("out", self.outfile3),
                          ("profile", True),
                          ("in1", self.infile2),
                          ("in2", self.infile3)):
        cmdline.set_parameter(option, value)

    expected_cmd = (muscle_exe
                    + " -out Fasta/temp_align_out3.fa"
                    + " -profile -in1 Fasta/fa01 -in2 Fasta/f001")
    self.assertEqual(str(cmdline), expected_cmd)
    self.assertEqual(str(eval(repr(cmdline))), str(cmdline))

    use_shell = sys.platform != "win32"
    child = subprocess.Popen(str(cmdline),
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE,
                             shell=use_shell)
    stdout_data, stderr_data = child.communicate()
    self.assertEqual(child.returncode, 0)
    self.assertEqual(stdout_data, "")
    self.assertTrue("ERROR" not in stderr_data)
    del child
def test_Muscle_with_options(self):
    """Round-trip a switch and a valued option through subprocess."""
    cmdline = MuscleCommandline(muscle_exe)
    # "input" is an alias for "in".
    cmdline.set_parameter("input", self.infile1)
    cmdline.set_parameter("out", self.outfile2)
    # Options can also be set through properties:
    cmdline.objscore = "sp"
    cmdline.noanchors = True

    expected_cmd = (muscle_exe
                    + " -in Fasta/f002"
                    + " -out Fasta/temp_align_out2.fa"
                    + " -objscore sp -noanchors")
    self.assertEqual(str(cmdline), expected_cmd)
    self.assertEqual(str(eval(repr(cmdline))), str(cmdline))

    use_shell = sys.platform != "win32"
    child = subprocess.Popen(str(cmdline),
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE,
                             shell=use_shell)
    stdout_data, stderr_data = child.communicate()
    self.assertEqual(child.returncode, 0)
    self.assertEqual(stdout_data, "")
    self.assertTrue("ERROR" not in stderr_data)
    del child
def test_simple_clustal_strict(self):
    """Simple muscle call using strict Clustal output.

    Input order is preserved with -stable, so the aligned records can be
    compared with the input records position by position.
    """
    input_file = "Fasta/f002"
    self.assertTrue(os.path.isfile(input_file))
    # Context manager so the input handle is closed rather than leaked.
    with open(input_file) as handle:
        records = list(SeqIO.parse(handle, "fasta"))

    # Prepare the command...
    cmdline = MuscleCommandline(muscle_exe)
    cmdline.set_parameter("in", input_file)
    # Preserve input record order (makes checking output easier);
    # switches default to None, which is treated as False.
    cmdline.set_parameter("stable", True)
    # Use clustal output (with a CLUSTAL header)
    cmdline.set_parameter("clwstrict", True)
    self.assertEqual(str(cmdline).rstrip(),
                     muscle_exe + " -in Fasta/f002 -clwstrict -stable")
    self.assertEqual(str(eval(repr(cmdline))), str(cmdline))

    result, out_handle, err_handle = generic_run(cmdline)
    align = AlignIO.read(out_handle, "clustal")
    self.assertEqual(len(records), len(align))
    for old, new in zip(records, align):
        self.assertEqual(old.id, new.id)
        self.assertEqual(str(new.seq).replace("-", ""), str(old.seq))
    # Didn't use -quiet so there should be progress reports on stderr,
    self.assertTrue(err_handle.read().strip().startswith("MUSCLE"))
def test_long(self):
    """Simple muscle call using long file.

    Forty records of an NBRF/PIR example file are converted to FASTA,
    aligned with fast options and compared with the input, in order.
    """
    # Create a large input file by converting some of another example file
    temp_large_fasta_file = "temp_cw_prot.fasta"
    # Context managers so neither handle is leaked if a step fails.
    with open("NBRF/Cw_prot.pir", "rU") as pir_handle:
        records = list(SeqIO.parse(pir_handle, "pir"))[:40]
    with open(temp_large_fasta_file, "w") as handle:
        SeqIO.write(records, handle, "fasta")

    # Prepare the command...
    cmdline = MuscleCommandline(muscle_exe)
    cmdline.set_parameter("in", temp_large_fasta_file)
    # Preserve input record order (switches default to None, i.e. False)
    cmdline.set_parameter("stable", True)
    # Use fast options
    cmdline.set_parameter("maxiters", 1)
    cmdline.set_parameter("diags", True)
    # Use clustal output
    cmdline.set_parameter("clwstrict", True)
    # Shouldn't need this, but just to make sure it is accepted
    cmdline.set_parameter("maxhours", 0.1)
    # No progress reports to stderr
    cmdline.set_parameter("quiet", True)
    # TODO - Fix the trailing space!
    self.assertEqual(str(cmdline).rstrip(),
                     muscle_exe
                     + " -in temp_cw_prot.fasta -diags -maxhours 0.1"
                     + " -maxiters 1 -clwstrict -stable -quiet")
    self.assertEqual(str(eval(repr(cmdline))), str(cmdline))

    result, out_handle, err_handle = generic_run(cmdline)
    align = AlignIO.read(out_handle, "clustal")
    self.assertEqual(len(records), len(align))
    for old, new in zip(records, align):
        self.assertEqual(old.id, new.id)
        self.assertEqual(str(new.seq).replace("-", ""), str(old.seq))
    os.remove(temp_large_fasta_file)
    # See if quiet worked:
    self.assertEqual("", err_handle.read().strip())
def test_long(self):
    """Simple muscle call using long file

    Converts the first 40 records of "NBRF/Cw_prot.pir" to a temporary
    FASTA file, runs MUSCLE on it through subprocess with fast options
    and -quiet, and compares the (sorted) Clustal output with the
    (sorted) input records.  The child's pipes and the temporary file
    are released even when an assertion fails.
    """
    #Create a large input file by converting some of another example file
    temp_large_fasta_file = "temp_cw_prot.fasta"
    records = list(SeqIO.parse("NBRF/Cw_prot.pir", "pir"))[:40]
    SeqIO.write(records, temp_large_fasta_file, "fasta")
    try:
        #Prepare the command...
        cmdline = MuscleCommandline(muscle_exe)
        cmdline.set_parameter("in", temp_large_fasta_file)
        #Use fast options
        cmdline.set_parameter("maxiters", 1)
        cmdline.set_parameter("diags", True) # Default None treated as False!
        #Use clustal output
        cmdline.set_parameter("clwstrict", True) # Default None treated as False!
        #Shouldn't need this, but just to make sure it is accepted
        cmdline.set_parameter("maxhours", 0.1)
        #No progress reports to stderr
        cmdline.set_parameter("quiet", True) # Default None treated as False!
        self.assertEqual(str(cmdline).rstrip(),
                         _escape_filename(muscle_exe)
                         + " -in temp_cw_prot.fasta -diags -maxhours 0.1"
                         + " -maxiters 1 -clwstrict -quiet")
        # The repr of the command line object must round-trip through eval.
        self.assertEqual(str(eval(repr(cmdline))), str(cmdline))
        child = subprocess.Popen(str(cmdline),
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE,
                                 universal_newlines=True,
                                 shell=(sys.platform != "win32"))
        try:
            align = AlignIO.read(child.stdout, "clustal")
            # No -stable here, so sort both sides by id before comparing.
            align.sort()
            records.sort(key=lambda rec: rec.id)
            self.assertEqual(len(records), len(align))
            for old, new in zip(records, align):
                self.assertEqual(old.id, new.id)
                self.assertEqual(str(new.seq).replace("-", ""), str(old.seq))
            #See if quiet worked:
            self.assertEqual("", child.stderr.read().strip())
            return_code = child.wait()
            self.assertEqual(return_code, 0)
        finally:
            # Always release the pipes, including when an assertion fails.
            child.stdout.close()
            child.stderr.close()
    finally:
        # Always remove the temporary input file.
        os.remove(temp_large_fasta_file)
def run(self):
    """Run MUSCLE on ``self.fasta`` and write the alignment to ``self.output``.

    The command is configured with ``maxiters = 1``,
    ``cluster1 = "neighborjoining"`` and ``tree1 = self.tree`` before it
    is executed.  Nothing is returned.
    """
    muscle_cline = MuscleCommandline(input=self.fasta, out=self.output)
    # The three options below are independent of each other.
    muscle_cline.maxiters = 1
    muscle_cline.cluster1 = "neighborjoining"
    muscle_cline.tree1 = self.tree
    # Bind the call's return value to a throwaway name, as the original did.
    _captured_output = muscle_cline()
def GetExec(self, optList, frame):
    """Respond to the "muscle" command by building its command line.

    Stores ``frame`` on the instance, sets ``self.outfile`` and
    ``self.outtype``, configures a ``MuscleCommandline`` from the widgets
    in ``frame.paramBoxes`` (and, when ``frame.options`` is truthy, from
    ``self.boxList``), and returns the command line as a string.

    ``optList`` is accepted for interface compatibility but is not used.

    NOTE(review): this block was reconstructed from a whitespace-mangled
    source, so the nesting of the diagonal / gap-penalty branches below
    is a best guess -- confirm against the original layout.
    """
    self.frame = frame
    plugin_exe = r"C:/Program Files (x86)/py27/Lib/site-packages/Muscle.exe"
    self.outfile = r".\plugins\muscle.txt"
    self.outtype = "fasta"
    cline = MuscleCommandline(plugin_exe, out=self.outfile)

    # The same hard-coded sequence file is used for every input option.
    seq_file = r"C:\Users\francis\Documents\Monguis\BioGui\plugins\my_seq.fasta"
    boxes = self.frame.paramBoxes

    if '1ProfileCheck' in boxes:
        if boxes['1ProfileCheck'].GetValue():
            cline.profile = True
            cline.in1 = seq_file
            cline.in2 = seq_file
        else:
            cline.input = seq_file
        if '1DiagCheck' in boxes:
            if boxes['1DiagCheck'].GetValue():
                cline.diags = True
                # Read each widget's value with GetValue(); the original
                # passed the widget object itself to int().
                if "DiagLenSpin" in boxes:
                    cline.diaglength = int(boxes["DiagLenSpin"].GetValue())
                # Each spin box now sets its own option; the original
                # assigned all three values to ``diaglength``.
                if "DiagMargSpin" in boxes:
                    cline.diagmargin = int(boxes["DiagMargSpin"].GetValue())
                if "DiagBreakSpin" in boxes:
                    cline.diagbreak = int(boxes["DiagBreakSpin"].GetValue())
                elif "GapPenSpin" in boxes:
                    cline.gapopen = float(boxes["GapPenSpin"].GetValue())
    else:
        cline.input = seq_file

    # Map the frame's alphabet onto MUSCLE's sequence type.
    if self.frame.abet == "AA":
        cline.seqtype = "protein"
    elif self.frame.abet in ("DNA", "RNA"):
        cline.seqtype = "nucleo"
    else:
        cline.seqtype = "auto"

    if self.frame.options:
        # Advanced options are read from fixed positions in self.boxList.
        cline.objscore = str(self.boxList[9].GetValue())
        cline.weight1 = str(self.boxList[13].GetValue())
        cline.weight2 = str(self.boxList[15].GetValue())
        cline.anchorspacing = int(self.boxList[17].GetValue())
        cline.center = float(self.boxList[19].GetValue())
        cline.hydro = int(self.boxList[21].GetValue())
        cline.hydrofactor = float(self.boxList[23].GetValue())
        cline.maxhours = float(self.boxList[25].GetValue())
        cline.maxiters = int(self.boxList[27].GetValue())
        cline.maxtrees = int(self.boxList[29].GetValue())
        cline.minbestcolscore = float(self.boxList[31].GetValue())
        cline.minsmoothscore = float(self.boxList[33].GetValue())
        cline.smoothscoreceil = float(self.boxList[35].GetValue())
        cline.smoothwindow = int(self.boxList[37].GetValue())
        cline.sueff = float(self.boxList[39].GetValue())
    return str(cline)
def main():
    """Align (optionally), run DNAML, and restore the original sequence names.

    Steps, all driven by module-level settings (``prj_tree``, ``prj_name``,
    ``force``, ``doAlign``, ``germ_seq``, ``natives``, ``DNAML``):

    1. Refuse to run (or, with ``force``, delete) leftover ``infile`` /
       ``outtree`` / ``outfile`` in ``prj_tree.phylo``.
    2. If ``doAlign``: append the germline and native sequences to a copy
       of the collected FASTA file, align it with MUSCLE and point the
       global ``inFile`` at the result.
    3. Read the alignment, replace every id with a 10-digit index (the
       original ids are kept in the global ``lookup``), and write it as
       PHYLIP to ``infile``.
    4. Write the DNAML answer script, run DNAML from ``prj_tree.phylo``.
    5. Rewrite the 10-digit ids in DNAML's tree and output files through
       ``revertName`` and remove DNAML's working files.

    Exits via ``sys.exit`` if old files exist without ``force`` or if a
    custom input cannot be read as PHYLIP.
    """
    global inFile, lookup

    oldFiles = (glob.glob("%s/infile" % prj_tree.phylo)
                + glob.glob("%s/outtree" % prj_tree.phylo)
                + glob.glob("%s/outfile" % prj_tree.phylo))
    if oldFiles:
        if force:
            for f in oldFiles:
                os.remove(f)
        else:
            sys.exit("Old files exist! Please use the -f flag to force overwrite.")

    if doAlign:
        # first create a working file to align and add the germline and natives
        to_align = "%s/%s_to_align.fa" % (prj_tree.phylo, prj_name)
        shutil.copyfile("%s/%s-collected.fa" % (prj_tree.nt, prj_name), to_align)
        # with-block so the handle is closed even if a write fails
        with open(to_align, "a") as handle:
            handle.write(">%s\n%s\n" % (germ_seq.id, germ_seq.seq))
            for n in natives.values():
                handle.write(">%s\n%s\n" % (n.id, n.seq))

        # now run muscle
        run_muscle = MuscleCommandline(
            input=to_align,
            out="%s/%s_aligned.afa" % (prj_tree.phylo, prj_name))
        run_muscle.maxiters = 2
        run_muscle.diags = True
        run_muscle.gapopen = -5000.0  # code requires a float
        # Parenthesised form prints the same on Python 2 and is valid on Python 3.
        print(run_muscle)
        run_muscle()

        # change inFile variable so that remaining code is the same for both cases
        # It's probably really bad form to handle this in this way
        inFile = "%s/%s_aligned.afa" % (prj_tree.phylo, prj_name)

    # open the alignment to rename everything and find germline sequence
    # rename is to avoid possible errors with DNAML from sequence ids that are too long
    # NOTE(review): the "rU" modes are kept for Python 2 universal-newline
    # handling; Python 3.11+ rejects the "U" flag, so use "r" when porting.
    germ_pos = 1
    with open(inFile, "rU") as handle:
        if doAlign:
            aln = AlignIO.read(handle, "fasta")
        else:
            try:
                aln = AlignIO.read(handle, "phylip")
            except Exception:
                # Narrowed from a bare except so Ctrl-C is not reported
                # as a formatting problem.
                sys.exit("Please make sure custom input is aligned and in PHYLIP format")

    lookup = []
    for seq in aln:
        lookup.append(seq.id)
        # Remember the 1-based position of the last id that looks like a germline gene.
        if re.search("(IG|VH|VK|VL|HV|KV|LV)", seq.id) is not None:
            germ_pos = len(lookup)
        seq.id = "%010d" % len(lookup)

    with open("%s/infile" % prj_tree.phylo, "w") as output:
        AlignIO.write(aln, output, "phylip")

    # now generate script for DNAML
    # J is "jumble" followed by random seed and number of times to repeat
    # O is outgroup root, followed by position of the germline in the alignment
    # 5 tells DNAML to do the ancestor inference
    # Y starts the run
    # randint() needs integer bounds; the float 1e10 is rejected by newer Pythons.
    seed = random.randint(0, 10 ** 10) * 2 + 1  # seed must be odd
    with open("%s/dnaml.in" % prj_tree.phylo, "w") as handle:
        handle.write("J\n%d\n3\nO\n%d\n5\nY\n" % (seed, germ_pos))

    # change to work directory so DNAML finds "infile" and puts the output where we expect
    os.chdir(prj_tree.phylo)
    with open("%s/dnaml.in" % prj_tree.phylo, "rU") as pipe:
        subprocess.call([DNAML], stdin=pipe)

    # revert names in tree
    with open("%s/outtree" % prj_tree.phylo, "rU") as intree:
        mytree = intree.read()
    # raw string so "\d" is not treated as a string escape
    fixedtree = re.sub(r"\d{10}", revertName, mytree)
    with open("%s/%s.tree" % (prj_tree.out, prj_name), "w") as outtree:
        outtree.write(fixedtree)

    # revert names in out file
    with open("%s/outfile" % prj_tree.phylo, "rU") as instuff:
        mystuff = instuff.read()
    fixedstuff = re.sub(r"\d{10}", revertName, mystuff)
    with open("%s/%s.dnaml.out" % (prj_tree.logs, prj_name), "w") as outstuff:
        outstuff.write(fixedstuff)

    # clean up (relative names: we are in prj_tree.phylo after the chdir above)
    os.remove("infile")
    os.remove("outfile")
    os.remove("outtree")
def buildGSSP( vgene ):
    """Build one or more substitution profiles for a single V gene.

    For each profile, the germline sequence(s) in ``germList[vgene]`` plus
    either all of ``masterList[vgene]`` (when ``--profiles`` is 0) or a
    random subset of ``--numSequences`` of them are aligned with MUSCLE.
    Columns that are gaps in the first germline row found are removed, a
    position-specific score matrix is computed, and the germline residues
    are zeroed out of it.

    Returns a list of rows, one per alignment position per profile:
    ``[vgene, profile number, position, germline residue(s), overall
    frequency or "None", then one frequency per entry of aa_list]``.
    Returns an empty list if there are too few sequences or the gene is
    not in ``germList``.  Profiles whose alignment fails are skipped.
    """
    results = []

    if len(masterList[vgene]) < arguments["--numSequences"]:
        print( "Skipping %s, not enough sequences (%d)..." % ( vgene, len(masterList[vgene]) ) )
        return []

    if vgene not in germList:
        print( "Skipping %s, it's not in the germline database..." %vgene )
        return []

    # Take random overlapping subsets to generate multiple profiles
    # need to add back a sanity check for capping the number of subsets if there's not enough raw data.
    numProfiles = arguments['--profiles']
    if arguments["--profiles"] == 0:
        numProfiles = 1

    # Loop-invariant: ids of the germline records, for fast row lookup.
    germ_ids = set( g.id for g in germList[vgene] )

    success = 0

    for i in range(numProfiles):
        seqs = [] + germList[vgene] #force a copy rather than an alias
        if arguments["--profiles"] == 0:
            seqs += list(masterList[vgene])
        else:
            #get our sequence subset, add the germlines, and write them
            # to a temporary file for alignment
            seqs += list(numpy.random.choice(masterList[vgene], size=arguments["--numSequences"], replace=False))

        tempFile = "%s/work/mGSSP/%s_profileBuilder" % (prj_tree.home, vgene)
        with open("%s.fa"%tempFile, "w") as temp:
            SeqIO.write(seqs,temp,"fasta")

        muscle_cline = MuscleCommandline(cmd=muscle, input="%s.fa"%tempFile, out="%s.aln"%tempFile)

        #try to speed up the process a little bit for large datasets
        #still going to max out at ~50k seqs per profile (probably)
        muscle_cline.maxiters = 2
        muscle_cline.diags = True

        try:
            muscle_cline()
        except Exception:
            # Narrowed from a bare except so Ctrl-C / SystemExit still
            # stop the run instead of being logged as alignment errors.
            print( "Error in alignment #%d for %s (skipping)" % (i+1, vgene) )
            for f in glob.glob("%s.*"%tempFile):
                os.remove(f)
            continue

        alignment = AlignIO.read("%s.aln"%tempFile, "fasta")#"clustal")
        success += 1

        #Input order is not maintained, so we need a little
        # kludge to find a germline sequences. Use the
        # first one to remove any insertions from the alignment
        germRow = 0
        for n, rec in enumerate(alignment):
            if rec.id in germ_ids:
                germRow = n
                break

        #look for gaps one at a time so we don't get tripped up by shifting indices
        gap = re.search( "-+", str(alignment[germRow].seq) )
        while (gap):
            alignment = alignment[:, 0:gap.start()] + alignment[:, gap.end():]
            gap = re.search( "-+", str(alignment[germRow].seq) )

        #Now we get BioPython to make a PSSM for us. To convert that into
        # a mutability profile, we will delete the germline residue[s]
        # at each position (but save what they were)
        germRes = defaultdict(Counter)
        summary_align = AlignInfo.SummaryInfo(alignment)
        pssm = summary_align.pos_specific_score_matrix(chars_to_ignore=['-','X'])

        #get number of datapoints at each position (might be different than the number of sequences in the profile if there are gaps or missing data
        # do this by using sum(pos.values()) after ignoring missing data (previous line) but before dumping germline residues.
        numGerm = len(germList[vgene])
        denominator = [ sum(pos.values()) - numGerm for pos in pssm ]

        for germ in germList[vgene]:
            for pos, residue in enumerate(germ):
                if residue == "X":
                    continue
                germRes[pos][residue] += 1
                pssm[pos][residue] = 0

        #normalize and save
        for p, pos in enumerate(pssm):
            germAA = ",".join([ x[0] for x in germRes[p].most_common() ])
            # Column total after the germline residues were zeroed above;
            # computed once instead of once per residue.
            total = sum(pos.values())
            if p < mask[vgene] or denominator[p] < arguments["--numSequences"]:
                overall = "None"
            else:
                overall = "%.5f"%(total/denominator[p])
            results.append( [ vgene, i+1, p+1, germAA, overall ]
                            + [ "%.5f"%(pos.get(r,0)/total) if total > 0 else "0.00" for r in aa_list ] )

        #clean up
        for f in glob.glob("%s.*"%tempFile):
            os.remove(f)

    print( "Successfully built %d/%d profiles for %s using %d sequences!" % ( success, numProfiles, vgene, len(seqs)-len(germList[vgene]) ) )
    return results