def get_alignment_commands(fastafile_name, outdir, aligner, threads):
    """Build the command-line object that aligns one gene FASTA file.

    Parameters
    ----------
    fastafile_name : str
        Path to the unaligned FASTA file. The gene name is the text after
        the last '/' and before the first '.'.
    outdir : str
        Output directory prefix. It is concatenated as-is, so it must end
        with a path separator. Only used for the Clustal Omega output path.
    aligner : str
        One of "prank", "mafft" or "clustal".
    threads : int
        Thread budget. MAFFT and Clustal Omega only receive a thread option
        when this is 3 or fewer; PRANK never receives one.

    Returns
    -------
    tuple
        ``(command, fastafile_name)``.

    Raises
    ------
    ValueError
        If ``aligner`` is not one of the supported names.
    """
    geneName = fastafile_name.split('/')[-1].split('.')[0]

    if aligner == "prank":
        command = PrankCommandline(d=fastafile_name,
                                   o=geneName,
                                   f=8,
                                   codon=True)
    elif aligner == "mafft":
        options = dict(input=fastafile_name, auto=True, nuc=True)
        # NOTE(review): above 3 threads no thread option is passed --
        # presumably the caller parallelises across genes instead; confirm.
        if threads <= 3:
            options["thread"] = threads
        command = MafftCommandline(**options)
    elif aligner == "clustal":
        options = dict(
            infile=fastafile_name,
            outfile=outdir + "aligned_gene_sequences/" + geneName + ".aln.fas",
            seqtype="DNA")
        if threads <= 3:
            options["threads"] = threads
        command = ClustalOmegaCommandline(**options)
    else:
        # Fail with a clear message instead of an UnboundLocalError below.
        raise ValueError("Unsupported aligner: %r" % (aligner,))

    return (command, fastafile_name)
def align_seq_file(fasta_filename, output_aligned_fasta_filename, context):
    """Align a nucleotide FASTA file with MAFFT and optionally filter the MSA.

    :param fasta_filename: path of the unaligned FASTA input.
    :param output_aligned_fasta_filename: path the MAFFT stdout is written to.
    :param context: object exposing ``UserFlags_dict``; the keys read here are
        'MAFFT_maxiterate', 'PairwiseAlignmentMethod', 'FilterMSA_Method' and,
        for Trimal, 'Trimal_CutOff'.

    When the pairwise method is one of 'mafft_globalpair', 'mafft_localpair'
    or 'mafft_genafpair', the matching MAFFT switch is set together with
    ``maxiterate``; any other value runs MAFFT without either option.
    """
    # parameters for Alignment:
    mafft_iter = context.UserFlags_dict['MAFFT_maxiterate']
    mafft_pairMethod = context.UserFlags_dict['PairwiseAlignmentMethod']

    pair_switches = {
        'mafft_globalpair': 'globalpair',
        'mafft_localpair': 'localpair',
        'mafft_genafpair': 'genafpair',
    }
    mafft_options = dict(input=fasta_filename, nuc=True,
                         adjustdirection=True, quiet=True)
    pair_switch = pair_switches.get(mafft_pairMethod)
    if pair_switch is not None:
        # maxiterate is only meaningful with an explicit pairwise strategy.
        mafft_options['maxiterate'] = int(mafft_iter)
        mafft_options[pair_switch] = True
    mafft_cline = MafftCommandline(**mafft_options)

    logger.info("Building alignment. Executing the following command - %s",
                mafft_cline)
    stdout, stderr = mafft_cline()
    # Context manager so the handle is closed even if the write fails.
    with open(output_aligned_fasta_filename, "w") as handle:
        handle.write(stdout)

    # Run Filter method to avoid large msa files:
    if context.UserFlags_dict['FilterMSA_Method'] == 'Trimal':
        logger.debug("MSA Filter method: Trimal")
        Trimal_cf = context.UserFlags_dict['Trimal_CutOff']
        runTrimal(output_aligned_fasta_filename, Trimal_cf)
        logger.info("Files after Trimal at: %s", output_aligned_fasta_filename)
    elif context.UserFlags_dict['FilterMSA_Method'] == 'Gblocks':
        logger.debug("MSA Filter method: Gblocks")
        runGblocks(output_aligned_fasta_filename, context.UserFlags_dict)
def run_mafft(mafftdir, outputDir, fastaFileList, outputFileNameList, startNum, endNum, algorithm):
    """Run MAFFT on ``fastaFileList[startNum:endNum]`` and save each alignment.

    :param mafftdir: directory containing the MAFFT launcher ('mafft.bat' on
        Windows, 'mafft' elsewhere).
    :param outputDir: not used by this function.
    :param fastaFileList: input FASTA paths, indexed by ``i``.
    :param outputFileNameList: output paths, indexed in parallel with
        ``fastaFileList``.
    :param startNum: first index to process (inclusive).
    :param endNum: last index to process (exclusive).
    :param algorithm: None, or one of 'genafpair', 'localpair', 'globalpair'
        (case-insensitive); any other value leaves the MAFFT defaults.
    :raises Exception: if MAFFT produces no usable output; the message
        carries MAFFT's stderr.
    """
    import platform

    launcher = 'mafft.bat' if platform.system() == 'Windows' else 'mafft'
    mafft_exe = os.path.join(mafftdir, launcher)
    pair_switch = algorithm.lower() if algorithm is not None else None
    # Trailing lines to drop; the batch launcher sometimes appends the prompt.
    junk_lines = ('', 'Terminate batch job (Y/N)?')

    for i in range(startNum, endNum):
        # Run MAFFT
        mafft_cline = MafftCommandline(mafft_exe, input=fastaFileList[i])
        if pair_switch in ('genafpair', 'localpair', 'globalpair'):
            setattr(mafft_cline, pair_switch, True)
        stdout, stderr = mafft_cline()
        if stdout == '':
            raise Exception('MAFFT error text below' + str(stderr))

        # Process MAFFT output. Lines from split('\n') carry no newline, so
        # compare stripped text, and stop once nothing is left.
        lines = stdout.split('\n')
        while lines and lines[-1].strip() in junk_lines:
            del lines[-1]
        if not lines:
            raise Exception('MAFFT error text below' + str(stderr))

        # Create output alignment files
        with open(outputFileNameList[i], 'w') as fileOut:
            fileOut.write('\n'.join(lines))
def align_sequences(fasta_temp_dir, alignment_temp_dir, wd):
    """Run MAFFT on every ``*.fasta`` file found in ``fasta_temp_dir``.

    The process changes its working directory to ``fasta_temp_dir`` and does
    not change back. Each alignment is written to
    ``<alignment_temp_dir>/<sample>.alignment.fasta``; files whose alignment
    already exists are skipped. ``wd`` is accepted but not used here.
    """
    os.chdir(fasta_temp_dir)
    print('aligning each sample sequence to reference genome')

    for counter, fasta_name in enumerate(glob.glob('*.fasta'), start=1):
        print(counter)
        sample = fasta_name.split('.fasta')[0]
        target_path = os.path.join(alignment_temp_dir,
                                   '%s.alignment.fasta' % sample)
        if os.path.isfile(target_path):
            # Alignment already produced on an earlier run.
            continue

        cline = MafftCommandline(input=fasta_name)
        print(cline)
        aligned_text, _mafft_err = cline()
        with open(target_path, 'w') as sink:
            sink.write(aligned_text)
def test_Mafft_with_complex_command_line(self):
    """Round-trip with complex command line."""
    cmdline = MafftCommandline(mafft_exe)
    # Parameter names are deliberately given both with and without the
    # leading "--"; the expected command string below covers both spellings.
    parameters = [
        ("input", self.infile1),
        ("--localpair", True),
        ("--weighti", 4.2),
        ("retree", 5),
        ("maxiterate", 200),
        ("--nofft", True),
        ("op", 2.04),
        ("--ep", 0.51),
        ("--lop", 0.233),
        ("lep", 0.2),
        ("--reorder", True),
        ("--treeout", True),
        ("nuc", True),
    ]
    for name, value in parameters:
        cmdline.set_parameter(name, value)
    self.assertEqual(str(eval(repr(cmdline))), str(cmdline))
    self.assertEqual(str(cmdline), mafft_exe +
                     " --localpair --weighti 4.2 --retree 5 " +
                     "--maxiterate 200 --nofft --op 2.04 --ep 0.51" +
                     " --lop 0.233 --lep 0.2 --reorder --treeout" +
                     " --nuc Fasta/f002")
    stdoutdata, stderrdata = cmdline()
    self.assertTrue(stdoutdata.startswith(">gi|1348912|gb|G26680|G26680"))
    # assertNotIn reports the offending text when it fails.
    self.assertNotIn("$#=0", stderrdata)
def align_cluster(self, cluster_file):
    """
    Worker function for align_clusters

    Inputs a FASTA file containing an unaligned sequence cluster.
    Uses MAFFT (--auto, --adjustdirection) to align the cluster.

    The alignment is written under "alignments": everything from the first
    "/" of ``cluster_file`` onwards is kept, or the whole name when it has no
    "/". The path is returned even if the alignment could not be produced;
    in that case an error message is printed instead.
    """
    mafft_cline = MafftCommandline(input=cluster_file)
    mafft_cline.set_parameter("--auto", True)
    mafft_cline.set_parameter("--adjustdirection", True)
    color = Color()
    print(color.red + str(mafft_cline) + color.done)
    sys.stdout.flush()

    if "/" in cluster_file:
        alignment_file = "alignments" + cluster_file[cluster_file.index("/"):]
    else:
        alignment_file = "alignments/" + cluster_file

    try:
        stdout, stderr = mafft_cline()
        with open(alignment_file, "w") as handle:
            handle.write(stdout)
    except Exception:
        # Best-effort: report and carry on, but let KeyboardInterrupt and
        # SystemExit propagate.
        print(
            color.red +
            "Error: alignment file not generated. Please check your MAFFT installation."
            + color.done)
    return alignment_file
def mafft_align(file):
    """Align ``file`` with MAFFT in ``auto`` mode and save the result.

    The alignment goes to the input path with its extension replaced by
    ``.fasta.mafft``.
    """
    cline = MafftCommandline(input=file, auto=True)
    aligned_text, _mafft_err = cline()
    stem = os.path.splitext(file)[0]
    with open(f"{stem}.fasta.mafft", "w") as aligned:
        aligned.write(aligned_text)
def call_mafft(genefile):
    """Align a FASTA file with MAFFT and save the alignment.

    The output path is ``genefile`` with "_prot.fasta" replaced by
    "_aligned.fasta".

    Parameters
    ----------
    genefile : str
        Name/path of the FASTA file to align.

    Returns
    -------
    bool
        True if successful; False if anything raised (the exception is
        printed).
    """
    # NOTE(review): when genefile does not contain "_prot.fasta" the output
    # path equals the input path -- confirm callers always pass such names.
    options = dict(
        input=genefile,
        adjustdirection=True,
        treeout=True,
        thread=1,
        retree=1,
        maxiterate=0,
    )
    try:
        aligned_text, _mafft_err = MafftCommandline(**options)()
        target = genefile.replace("_prot.fasta", "_aligned.fasta")
        with open(target, "w") as sink:
            sink.write(aligned_text)
    except Exception as err:
        print(err)
        return False
    return True
def mafft_alignment(mafft_cmd, *args):
    """Align the given sequences with MAFFT and return the aligned strings.

    :param mafft_cmd: MAFFT executable handed to ``MafftCommandline``.
    :param args: sequences to align; each is written to a scratch FASTA file
        under a numeric header equal to its position.
    :return: list of upper-cased aligned sequences, ordered by that numeric
        header (i.e. in input order).

    Scratch files get unique names (under /dev/shm when that directory
    exists, otherwise the default temp directory) and are removed even when
    MAFFT or the parser fails.
    """
    import tempfile

    scratch_dir = '/dev/shm' if os.path.isdir('/dev/shm') else None
    fa_fd, fa_fpath = tempfile.mkstemp(suffix='.fa', dir=scratch_dir)
    mafft_fd, mafft_fpath = tempfile.mkstemp(suffix='.mafft', dir=scratch_dir)
    os.close(mafft_fd)
    try:
        # Write seqs to fasta file
        with os.fdopen(fa_fd, 'w') as out:
            for i, s in enumerate(args):
                out.write('>{}\n'.format(i))
                out.write('{}\n'.format(s))

        # Align
        mf_cline = MafftCommandline(mafft_cmd, input=fa_fpath)
        stdout, stderr = mf_cline()
        with open(mafft_fpath, 'w') as out:
            out.write(stdout)

        # Read and order output. Sort on the numeric header written above
        # so the result follows input order whatever order MAFFT emits.
        with open(mafft_fpath) as aligned:
            alignment = [(int(rec.id), str(rec.seq))
                         for rec in SeqIO.parse(aligned, 'fasta')]
        return [s.upper() for _, s in sorted(alignment)]
    finally:
        # Delete files
        os.remove(fa_fpath)
        os.remove(mafft_fpath)
def pool_write_microalignment(mblocknum, targetdata, extendedsourcedata, nbinitialsource, all_ids, msamethod):
    """Align the segments of one block and return them keyed by sequence id.

    :param mblocknum: pair ``(i, mblock)``; ``i`` suffixes the scratch file
        names and ``mblock`` maps a sequence id to ``(start, end)`` offsets.
    :param targetdata: iterable of ``(geneid, geneseq)`` pairs.
    :param extendedsourcedata: sequence of 4-tuples
        ``(cdsid, cdsseq, cdsgeneid, _)``; only the first
        ``nbinitialsource`` entries are read.
    :param nbinitialsource: number of leading entries of
        ``extendedsourcedata`` to consider.
    :param all_ids: ids that must appear in the result.
    :param msamethod: "muscle" runs MUSCLE; any other value runs MAFFT.
    :return: dict id -> aligned sequence. Ids absent from the alignment map
        to a run of '-' as long as the last aligned record (empty when
        nothing was aligned).

    Scratch files are created in the current directory and removed before
    returning, including when the aligner or parser raises.
    """
    i, mblock = mblocknum[0], mblocknum[1]
    input_muscle_file = "input_muscle.fasta" + str(i)
    output_muscle_file = "output_muscle.fasta" + str(i)

    def _has_segment(seq_id):
        # Only non-empty (end > start) segments take part in the alignment.
        return seq_id in mblock and mblock[seq_id][1] > mblock[seq_id][0]

    def _segment_entry(seq_id, seq):
        start, end = mblock[seq_id][0], mblock[seq_id][1]
        return ">" + seq_id + "\n" + seq[start:end] + "\n"

    aln = {}
    try:
        nbseq = 0
        with open(input_muscle_file, "w") as input_muscle:
            for geneid, geneseq in targetdata:
                if _has_segment(geneid):
                    input_muscle.write(_segment_entry(geneid, geneseq))
                    nbseq += 1
            for j in range(nbinitialsource):
                cdsid, cdsseq, cdsgeneid, null = extendedsourcedata[j]
                if _has_segment(cdsid):
                    input_muscle.write(_segment_entry(cdsid, cdsseq))
                    nbseq += 1

        msa = []
        if nbseq > 0:
            if msamethod == "muscle":
                muscle_cline = MuscleCommandline(input=input_muscle_file,
                                                 out=output_muscle_file,
                                                 gapopen=-800.0)
                stdout, stderr = muscle_cline()
            else:  # msamethod == "mafft"
                mafft_cline = MafftCommandline(input=input_muscle_file)
                stdout, stderr = mafft_cline()
                with open(output_muscle_file, "w") as handle:
                    handle.write(stdout)
            msa = AlignIO.read(output_muscle_file, "fasta")

        length = 0
        for record in msa:
            aln[record.id] = record.seq
            length = len(record.seq)

        # Pad every id that did not take part in the alignment.
        for seq_id in all_ids:
            if seq_id not in aln:
                aln[seq_id] = '-' * length
    finally:
        for scratch in (input_muscle_file, output_muscle_file):
            if os.path.exists(scratch):
                os.remove(scratch)
    return aln
def executeMafft(mafft_exe, directory='', gap_penalty=10.0):
    """Align every ``*.fasta`` file in ``<directory>sequences/`` with MAFFT.

    :param mafft_exe: MAFFT executable; an empty string writes a message to
        stderr and exits the process with status -1.
    :param directory: base directory; a trailing '/' is added if missing.
    :param gap_penalty: value passed to MAFFT's ``--op`` option.

    Alignments go to ``<directory>aligned_contigs/<transcript>_aligned.fasta``
    where ``transcript`` is the file name up to its first '.'. The output
    directory is created if it does not exist.
    """
    import os
    import sys

    if directory and not directory.endswith('/'):
        directory += '/'
    if len(mafft_exe) == 0:
        sys.stderr.write('Install mafft before execution.')
        sys.exit(-1)

    # Imported only once we know MAFFT will actually be run.
    from Bio.Align.Applications import MafftCommandline

    after = directory + 'aligned_contigs/'
    if not os.path.exists(after):
        os.mkdir(after)

    seq_dir = directory + 'sequences/'
    for seqfile in os.listdir(seq_dir):
        if not seqfile.endswith('.fasta'):
            continue
        transcript = seqfile[:seqfile.find('.')]
        mafft_cline = MafftCommandline(mafft_exe, input=seq_dir + seqfile)
        mafft_cline.set_parameter('--op', gap_penalty)
        # Run first, then open the output, so a MAFFT failure does not
        # leave an empty alignment file or an open handle behind.
        stdout, stderr = mafft_cline()
        with open(after + transcript + '_aligned.fasta', 'w') as writefile:
            writefile.write(stdout)
def do_alignment(fasta, threads):
    """Run MAFFT (retree=1, maxiterate=0) on ``fasta`` and parse the result.

    ``threads`` is converted with ``int`` and passed as MAFFT's thread
    option. Returns the alignment read from MAFFT's FASTA output.
    """
    options = {
        "input": fasta,
        "retree": 1,
        "maxiterate": 0,
        "thread": int(threads),
    }
    mafft_out, _mafft_err = MafftCommandline(**options)()
    return AlignIO.read(io.StringIO(mafft_out), "fasta")
def align(cls, seq_records, outfile=None):
    '''Align given sequences

    @param seq_records: a list of SeqRecords objects
    @param outfile: a filename for the output alignment or None
    @return: if the outfile is none, return an AlignmentExt object;
    otherwise return True on success. Return None when the MAFFT run itself
    reports failure, and False when it ran but the output file is missing
    or empty.

    Sequences are written to a temporary FASTA file first. Fewer than 10000
    records use MAFFT's auto mode; otherwise parttree with partsize 1000.
    The temporary files are removed even if reading the alignment raises.'''
    remove_out = not outfile
    if remove_out:
        outfile = mktmp_name('.aln.fasta')
    msafile = mktmp_fasta(seq_records)
    try:
        args = dict(thread=-1, input=msafile)
        if len(seq_records) < 10000:
            args['auto'] = True
        else:
            args['parttree'] = True
            args['partsize'] = 1000

        ali = None
        if run_cline(MafftCommandline(**args), stdout=outfile):
            if os.path.isfile(outfile) and os.path.getsize(outfile) > 0:
                if remove_out:
                    ali = AlignmentExt.from_msa(AlignIO.read(outfile, 'fasta'))
                else:
                    ali = True
            else:
                ali = False
        return ali
    finally:
        # Only delete the alignment when we created its temp name ourselves.
        if remove_out:
            safe_unlink(outfile)
        safe_unlink(msafile)
def MakeAlignments(seqs, name, path):
    """Align exported data with MAFFT unless the alignment already exists.

    :param seqs: path of the FASTA file to align.
    :param name: base name of the output file.
    :param path: output prefix, concatenated as-is with ``name`` (so it must
        end with a path separator when it is a directory).

    The alignment (MAFFT auto mode, reordered) is written to
    ``path + name + '_aligned.txt'``. Nothing is done when that file is
    already present.
    """
    aligned_path = path + name + '_aligned.txt'
    if os.path.isfile(aligned_path):
        return

    mafft_cline = MafftCommandline(input=seqs, auto=True, reorder=True)
    stdout, stderr = mafft_cline()
    # Context manager so the handle is closed even if the write fails.
    with open(aligned_path, 'w') as handle:
        handle.write(stdout)
def align(fasta):
    """Align ``fasta`` with MAFFT and return its first two aligned sequences.

    No executable path is supplied, so MAFFT needs to be in the path. The
    input is passed to MAFFT as a path relative to the current directory.
    Returns ``[sequence1, sequence2]`` as plain strings.
    """
    relative_input = os.path.relpath(fasta)
    aligned_text, _mafft_err = MafftCommandline(input=relative_input)()
    msa = AlignIO.read(StringIO(aligned_text), "fasta")
    first = str(msa[0].seq)
    second = str(msa[1].seq)
    return [first, second]
def mafft_align(query_seq, target_seq, query_name, target_name, align_method="local", directory="./", quiet=False):
    """Pairwise-align a query against a target with MAFFT.

    :param query_seq: query nucleotide sequence.
    :param target_seq: target nucleotide sequence.
    :param query_name: FASTA header for the query; also part of the temp
        file name.
    :param target_name: FASTA header for the target.
    :param align_method: "local" uses MAFFT's localpair; any other value
        uses globalpair. Both run with maxiterate=1000.
    :param directory: directory in which the temporary FASTA is created.
    :param quiet: passed through to MAFFT's quiet switch.
    :return: ``(target id, target seq, query id, query seq)`` taken from the
        first and second records of the alignment.

    The temporary FASTA file is removed even when MAFFT fails.
    """
    # A timestamp such as '20160809_144522_' makes the file name unique per
    # second and query name.
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S_")
    file_name = os.path.join(directory, stamp + query_name + ".fasta")
    pair_switch = "localpair" if align_method == "local" else "globalpair"

    try:
        with open(file_name, 'w') as data_out:
            data_out.write(">{}\n{}\n>{}\n{}".format(
                target_name, target_seq, query_name, query_seq))
        # file_name already includes the directory; do not prefix it again.
        mafft_cline = MafftCommandline(input=file_name, nuc=True,
                                       maxiterate=1000, quiet=quiet,
                                       **{pair_switch: True})
        out, _ = mafft_cline()
    finally:
        if os.path.exists(file_name):
            os.remove(file_name)

    my_list = list(AlignIO.read(StringIO(out), "fasta"))
    # target name, target seq, query name, query seq
    return my_list[0].id, my_list[0].seq, my_list[1].id, my_list[1].seq
def mafft(infile):
    """Align ``infile`` with the ``mafft`` executable and save it as Clustal.

    The command line is printed before it runs. The output path is
    ``infile`` with '.fasta' replaced by '_mafft.aln'.
    """
    from io import StringIO
    from Bio import AlignIO
    from Bio.Align.Applications import MafftCommandline

    cline = MafftCommandline("mafft", input=infile)
    print(cline)
    fasta_text, _mafft_err = cline()
    msa = AlignIO.read(StringIO(fasta_text), "fasta")
    target = infile.replace('.fasta', '_mafft.aln')
    AlignIO.write(msa, target, "clustal")
def create_msa(fasta_infile, msa_fasta, msa_phy):
    """Create a multiple sequence alignment with MAFFT in PHYLIP format.

    MAFFT's output is saved as FASTA in ``msa_fasta`` and then converted to
    relaxed PHYLIP in ``msa_phy``.
    """
    # Run MAFFT and keep only its stdout (the FASTA alignment).
    aligned_text, _mafft_err = MafftCommandline(input=fasta_infile)()
    with open(msa_fasta, 'w') as fasta_out:
        fasta_out.write(aligned_text)
    # Convert the saved FASTA alignment to relaxed PHYLIP.
    AlignIO.convert(msa_fasta, "fasta", msa_phy, "phylip-relaxed")
def test_Mafft_with_options(self):
    """Round-trip through the app with an input file plus options (stdout result)."""
    mafft_cmd = MafftCommandline(mafft_exe)
    mafft_cmd.set_parameter("input", self.infile1)
    mafft_cmd.set_parameter("maxiterate", 100)
    mafft_cmd.set_parameter("--localpair", True)
    # repr() must evaluate back to an equivalent command line.
    rebuilt = eval(repr(mafft_cmd))
    self.assertEqual(str(rebuilt), str(mafft_cmd))
    out_text, err_text = mafft_cmd()
    self.assertTrue(out_text.startswith(">gi|1348912|gb|G26680|G26680"))
    self.assertNotIn("$#=0", err_text)
def codon_align(self, alignment_tool="mafft", prune=True, verbose=0):
    '''
    takes a nucleotide alignment, translates it, aligns the amino acids,
    pads the gaps. note that this suppresses any compensated frameshift
    mutations

    Parameters:
    - alignment_tool: ['mafft', 'muscle'] the commandline tool to use;
      any other value prints a message and returns without aligning
    - prune: when true, sequences whose translation has a stop codon
      before the last position are discarded
    - verbose: when true, each discarded sequence id is printed

    Work happens inside self.run_dir. The previous working directory is
    restored and the run directory removed on every exit path.
    '''
    from Bio import AlignIO, SeqIO
    from Bio.SeqRecord import SeqRecord

    start_dir = os.getcwd()
    make_dir(self.run_dir)
    os.chdir(self.run_dir)
    try:
        # translate
        aa_seqs = {}
        bad_seq = 0
        for seq in self.seqs.values():
            tempseq = seq.seq.translate()
            has_premature_stop = '*' in str(tempseq)[:-1]
            # use only sequences that translate without trouble
            if not has_premature_stop or prune == False:
                aa_seqs[seq.id] = SeqRecord(tempseq, id=seq.id)
                aa_seqs[seq.id].attributes = seq.attributes
            else:
                if verbose:
                    print(seq.id, "has premature stops, discarding")
                bad_seq += 1

        print('Number of sequences with stops:', bad_seq, 'out of total',
              len(self.seqs))

        tmpfname = 'temp_in.fasta'
        SeqIO.write(aa_seqs.values(), tmpfname, 'fasta')

        if alignment_tool == 'muscle':
            from Bio.Align.Applications import MuscleCommandline
            aligned_fname = tmpfname[:-5] + 'aligned.fasta'
            cline = MuscleCommandline(input=tmpfname, out=aligned_fname)
            cline()
            aln_aa = AlignIO.read(aligned_fname, "fasta")
        elif alignment_tool == 'mafft':
            from Bio.Align.Applications import MafftCommandline
            try:
                from StringIO import StringIO  # Python 2
            except ImportError:
                from io import StringIO  # Python 3
            mafft_cline = MafftCommandline(input=tmpfname)
            stdout, stderr = mafft_cline()
            aln_aa = AlignIO.read(StringIO(stdout), "fasta")
        else:
            print('Alignment tool not supported:', alignment_tool)
            return

        # generate nucleotide alignment
        self.aln = pad_nucleotide_sequences(aln_aa, self.seqs)
        self.sequence_lookup = {seq.id: seq for seq in self.aln}
        # add attributes to alignment
        for seq in self.seqs.values():
            if seq.id in self.sequence_lookup:
                self.sequence_lookup[seq.id].attributes = seq.attributes
    finally:
        # Return to the caller's directory rather than '..', so nested or
        # absolute run_dir values are handled too.
        os.chdir(start_dir)
        remove_dir(self.run_dir)
def test_Mafft_simple(self):
    """Simple round-trip through app with infile.

    Result passed to stdout.
    """
    # Use a keyword argument at init,
    cmdline = MafftCommandline(mafft_exe, input=self.infile1)
    self.assertEqual(str(eval(repr(cmdline))), str(cmdline))
    stdoutdata, stderrdata = cmdline()
    self.assertTrue(stdoutdata.startswith(">gi|1348912|gb|G26680|G26680"))
    # assertIn/assertNotIn include both operands in the failure message.
    self.assertIn("Progressive alignment ...", stderrdata)
    self.assertNotIn("$#=0", stderrdata)
def align_with_mafft(filepath, localpair=False, maxiterate=1000):
    """
    Align a file with the given filepath using MAFFT

    :param filepath: The file to align
    :param localpair: Should we use the l-insi method
    :param maxiterate: Value passed as MAFFT's maxiterate option
    :return: The MAFFT alignment
    """
    options = dict(input=filepath, localpair=localpair, maxiterate=maxiterate)
    raw_alignment, _mafft_err = MafftCommandline(**options)()
    return AlignIO.read(io.StringIO(raw_alignment), "fasta")
def align(self):
    """Align ``self.pair_pep_file`` with the tool named by ``self.align_software``.

    'mafft' runs in auto mode and its stdout is re-written as FASTA to
    ``self.prot_align_file``. 'muscle' writes ``self.prot_align_file``
    itself (protein, strict ClustalW output). Other values do nothing.
    """
    if self.align_software == 'mafft':
        mafft_job = MafftCommandline(
            cmd=self.mafft_path, input=self.pair_pep_file, auto=True)
        mafft_out, _mafft_err = mafft_job()
        parsed = AlignIO.read(StringIO(mafft_out), "fasta")
        AlignIO.write(parsed, self.prot_align_file, "fasta")

    if self.align_software == 'muscle':
        muscle_job = MuscleCommandline(
            cmd=self.muscle_path,
            input=self.pair_pep_file,
            out=self.prot_align_file,
            seqtype="protein",
            clwstrict=True)
        muscle_job()
def align_fasta(in_file_loc):
    """Align ``in_file_loc`` with MAFFT and write the result to ``<base>.aln``.

    ``<base>`` is everything in ``in_file_loc`` before the first ".fa".
    """
    aligned_path = "{}.aln".format(in_file_loc.split(".fa")[0])
    aligned_text, _mafft_err = MafftCommandline(input=in_file_loc)()
    with open(aligned_path, "w") as sink:
        sink.write(aligned_text)
def test_Mafft_with_Clustalw_output(self):
    """Simple round-trip through app with clustal output"""
    cmdline = MafftCommandline(mafft_exe)
    # Use some properties:
    cmdline.input = self.infile1
    cmdline.clustalout = True
    self.assertEqual(str(eval(repr(cmdline))), str(cmdline))
    stdoutdata, stderrdata = cmdline()
    # e.g. "CLUSTAL format alignment by MAFFT ..."
    # or "CLUSTAL (-like) formatted alignment by MAFFT FFT-NS-2 (v6.240)"
    self.assertTrue(stdoutdata.startswith("CLUSTAL"), stdoutdata)
    # assertNotIn reports the offending text when it fails.
    self.assertNotIn("$#=0", stderrdata)
def run_msa(fasta_path, out_dir, bubble_num):
    """Run MAFFT on ``fasta_path`` and save the MSA for one bubble.

    :param fasta_path: FASTA file to align.
    :param out_dir: directory receiving the alignment.
    :param bubble_num: bubble identifier (string or number); it names the
        output file ``msa-<bubble_num>.fasta``.
    """
    # str() so a numeric bubble_num does not fail after MAFFT has run.
    msa_path = os.path.join(out_dir, 'msa-' + str(bubble_num) + '.fasta')
    mafft_cline = MafftCommandline(input=fasta_path)
    print('Performing MSA on bubble number', bubble_num)

    # run MAFFT
    stdout, stderr = mafft_cline()

    # write the MSA to a file
    with open(msa_path, 'w') as fh:
        fh.write(stdout)
def call_mafft_0(in_file, out_file):
    """Align ``in_file`` with MAFFT and write the alignment to ``out_file``.

    No executable path is given to ``MafftCommandline``. The command line is
    printed before it runs. Returns ``out_file``.
    """
    cline = MafftCommandline(input=in_file)
    print(cline)
    alignment_text, _mafft_err = cline()
    with open(out_file, "w") as sink:
        sink.write(alignment_text)
    return out_file
def test_Mafft_with_PHYLIP_namelength(self):
    """Check PHYLIP with --namelength"""
    cmdline = MafftCommandline(mafft_exe, input=self.infile1,
                               phylipout=True, namelength=50)
    self.assertEqual(str(eval(repr(cmdline))), str(cmdline))
    stdoutdata, stderrdata = cmdline()
    # e.g. " 3 706\n" or " 3 681" but allow some variation in the column count
    self.assertTrue(stdoutdata.startswith((" 3 68", " 3 69", " 3 70")),
                    stdoutdata)
    # With namelength=50 the full identifier must survive in the output.
    self.assertIn("gi|1348912|gb|G26680|G26680", stdoutdata)
    self.assertNotIn("$#=0", stderrdata)
def test_Mafft_with_PHYLIP_output(self):
    """Simple round-trip through app with PHYLIP output"""
    cmdline = MafftCommandline(mafft_exe, input=self.infile1,
                               phylipout=True)
    self.assertEqual(str(eval(repr(cmdline))), str(cmdline))
    stdoutdata, stderrdata = cmdline()
    # e.g. " 3 706\n" but allow some variation in the column count
    self.assertTrue(stdoutdata.startswith(" 3 70"), stdoutdata)
    # Without --namelength the identifier is expected in truncated form only.
    self.assertIn("gi|1348912 ", stdoutdata)
    self.assertNotIn("gi|1348912|gb|G26680|G26680", stdoutdata)
    self.assertNotIn("$#=0", stderrdata)
def call_mafft(path_to_save, genefile):
    """Align ``genefile`` with MAFFT and write the result to ``path_to_save``.

    :param path_to_save: output path for the alignment.
    :param genefile: FASTA file to align; its base name is printed as a
        progress message.
    :return: True on success; False if anything raised (the exception is
        printed).
    """
    try:
        # print() with a single argument behaves the same on Python 2 and 3.
        print("maffting " + os.path.basename(genefile))
        mafft_cline = MafftCommandline(input=genefile)
        stdout, stderr = mafft_cline()
        with open(path_to_save, "w") as handle:
            handle.write(stdout)
        return True
    except Exception as e:
        print(e)
        return False